# ignore
from typing import Callable, Optional, Type, Tuple, Any
from typing import Dict, Union, Set, List

def current_repo() -> Optional[str]:
    """
    Return the root of the git repository enclosing the current directory.

    Starting at `os.getcwd()`, walk up the directory tree until a directory
    holding a `.git` entry is found, and return its normalized path.
    Return `None` if the top of the tree is reached without finding one.
    """
    path = os.getcwd()
    while True:
        if os.path.exists(os.path.join(path, '.git')):
            return os.path.normpath(path)

        # Go one level up
        parent = os.path.normpath(os.path.join(path, '..'))
        if parent == path:
            # Going up no longer changes the path: we are at the top
            return None

        path = parent

current_repo()

'/Users/zeller/Projects/debuggingbook'

# Mine the local checkout found by `current_repo()`.
# `to=` is passed to PyDriller's `Repository` as the upper date bound.
book_miner = Repository(current_repo(), to=datetime(2020, 10, 1))

# URL of the public repository; used as a fallback below
DEBUGGINGBOOK_REMOTE_REPO = 'https://github.com/uds-se/debuggingbook.git'
# book_miner = Repository(DEBUGGINGBOOK_REMOTE_REPO)

# ignore
if 'CI' in os.environ:
    # The CI git clone is shallow, so access full repo remotely
    book_miner = Repository(DEBUGGINGBOOK_REMOTE_REPO,
                            to=datetime(2020, 10, 1))

# `traverse_commits()` yields commits one at a time; fetch the first one
book_commits = book_miner.traverse_commits()
book_first_commit = next(book_commits)

[attr for attr in dir(book_first_commit) if not attr.startswith('_')]

['author',
 'author_date',
 'author_timezone',
 'branches',
 'co_authors',
 'committer',
 'committer_date',
 'committer_timezone',
 'deletions',
 'dmm_unit_complexity',
 'dmm_unit_interfacing',
 'dmm_unit_size',
 'files',
 'hash',
 'in_main_branch',
 'insertions',
 'lines',
 'merge',
 'modified_files',
 'msg',
 'parents',
 'project_name',
 'project_path']

book_first_commit.msg

'first commit'

[attr for attr in dir(book_first_commit.author) if not attr.startswith('_')]

['email', 'name']

book_first_commit.author.name, book_first_commit.author.email

('Andreas Zeller', 'zeller@cispa.saarland')

book_first_commit.modified_files

[<pydriller.domain.commit.ModifiedFile at 0x11d5398e0>]

[attr for attr in dir(book_first_commit.modified_files[0]) if not attr.startswith('_')]

['added_lines',
 'change_type',
 'changed_methods',
 'complexity',
 'content',
 'content_before',
 'deleted_lines',
 'diff',
 'diff_parsed',
 'filename',
 'language_supported',
 'methods',
 'methods_before',
 'new_path',
 'nloc',
 'old_path',
 'source_code',
 'source_code_before',
 'token_count']

book_first_commit.modified_files[0].new_path

'README.md'

print(book_first_commit.modified_files[0].content)

b'# debuggingbook\n'

print(book_first_commit.modified_files[0].content_before)

None

book_second_commit = next(book_commits)

[m.new_path for m in book_second_commit.modified_files]

['Chapters.makefile',
 'LICENSE.md',
 'Makefile',
 'README.md',
 'debuggingbook.bib',
 'ipypublish',
 'ipypublish_plugins',
 'notebooks/.ipynb_checkpoints/index-checkpoint.ipynb',
 'notebooks/index.ipynb',
 'utils']

readme_modification = [m for m in book_second_commit.modified_files if m.new_path == 'README.md'][0]

print(str(readme_modification.content_before, 'utf8'))

# debuggingbook

print(str(readme_modification.content[:400], 'utf8'))

# About this Book

__Welcome to "The Debugging Book"!__ 

Software has bugs, and finding bugs can involve lots of effort.  This book addresses this problem by _automating_ software debugging, specifically by _locating errors and their causes automatically_.  Recent years have seen the development of novel techniques that lead to dramatic improvements in test generation and software testing.  They

print(readme_modification.diff[:100])

@@ -1 +1,157 @@
-# debuggingbook
+
+# About this Book
+
+__Welcome to "The Debugging Book"!__ 
+
+So

readme_modification.diff_parsed['added'][:10]

[(1, ''),
 (2, '# About this Book'),
 (3, ''),
 (4, '__Welcome to "The Debugging Book"!__'),
 (5, ''),
 (6,
  'Software has bugs, and finding bugs can involve lots of effort.  This book addresses this problem by _automating_ software debugging, specifically by _locating errors and their causes automatically_.  Recent years have seen the development of novel techniques that lead to dramatic improvements in test generation and software testing.  They now are mature enough to be assembled in a book – even with executable code.'),
 (7, ''),
 (8, ''),
 (9, ''),
 (10, '## A Textbook for Paper, Screen, and Keyboard')]

# ignore
del book_miner  # Save a bit of memory

tuple('debuggingbook/notebooks/ChangeCounter.ipynb'.split('/'))

('debuggingbook', 'notebooks', 'ChangeCounter.ipynb')

Node = Tuple

class ChangeCounter:
    """Count the number of changes for a repository."""

    def __init__(self, repo: str, *,
                 filter: Optional[Callable[[ModifiedFile], bool]] = None,
                 log: bool = False,
                 **kwargs: Any) -> None:
        """
        Constructor.
        `repo` is a git repository (as URL or directory).
        `filter` is a predicate that takes a modification and returns True
        if it should be considered (default: consider all).
        `log` turns on logging if set.
        `kwargs` are passed to the `Repository()` constructor.

        The constructor ends by calling `self.mine(**kwargs)`, so the
        counters below are filled as part of object creation.
        """
        self.repo = repo
        self.log = log

        if filter is None:
            # Default predicate: accept every modification
            def filter(m: ModifiedFile) -> bool:
                return True
        # Cannot fail at this point (presumably kept for static type checkers)
        assert filter is not None

        self.filter = filter

        # A node is a tuple (f_1, f_2, f_3, ..., f_n) denoting
        # a folder f_1 holding a folder f_2 ... holding a file f_n.

        # Mapping node -> #of changes
        self.changes: Dict[Node, int] = defaultdict(int)

        # Mapping node -> list of commit messages
        self.messages: Dict[Node, List[str]] = defaultdict(list)

        # Mapping node -> last size seen
        self.sizes: Dict[Node, Union[int, float]] = {}

        self.mine(**kwargs)

class ChangeCounter(ChangeCounter):
    def mine(self, **kwargs: Any) -> None:
        """
        Traverse all commits of `self.repo` and feed each to `mine_commit()`.
        `kwargs` are handed on to the `Repository()` constructor.
        A failing git command only produces a warning; a `ValueError` or
        `TypeError` produces a warning and is then re-raised.
        To be extended in subclasses.
        """
        repository = Repository(self.repo, **kwargs)

        for commit in repository.traverse_commits():
            try:
                self.mine_commit(commit)
            except (GitCommandError, ValueError, TypeError) as err:
                warnings.warn("Cannot mine commit " + repr(commit.hash) + '\n' + str(err))
                if not isinstance(err, GitCommandError):
                    # Only failing git commands are tolerated
                    raise err

    def mine_commit(self, commit: Commit) -> None:
        """
        Process all files modified by `commit`.
        Each modification is tagged with the commit's committer,
        committer date, and message before it is checked with `include()`
        and, if accepted, passed to `update_stats()`.
        """
        for modification in commit.modified_files:
            # Copy commit-level data onto the modification itself
            for attr in ('committer', 'committer_date', 'msg'):
                setattr(modification, attr, getattr(commit, attr))

            if not self.include(modification):
                continue

            self.update_stats(modification)

class ChangeCounter(ChangeCounter):
    def include(self, m: ModifiedFile) -> bool:
        """
        Decide whether the modification `m` is to be counted.
        By default, this is the verdict of the `filter` predicate
        stored by the constructor. Subclasses may overload this.
        """
        predicate = self.filter
        return predicate(m)

class ChangeCounter(ChangeCounter):
    def update_stats(self, m: ModifiedFile) -> None:
        """
        Record the modification `m` in the size, change, and element counters.
        Modifications without a new path are ignored
        (presumably deleted files; verify against PyDriller).
        Can be extended in subclasses.
        """
        path = m.new_path
        if not path:
            return

        # ('dir', 'subdir', 'file') -- one component per path element
        node = tuple(path.split('/'))
        size = len(m.content) if m.content else 0

        self.update_size(node, size)
        self.update_changes(node, m.msg)
        self.update_elems(node, m)

class ChangeCounter(ChangeCounter):
    def update_size(self, node: Tuple, size: int) -> None:
        """
        Store `size` as the size of `node`,
        replacing whatever size was recorded before.
        Subclasses may extend this.
        """
        self.sizes[node] = size

class ChangeCounter(ChangeCounter):
    def update_changes(self, node: Tuple, commit_msg: str) -> None:
        """
        Register one more change of `node`: remember `commit_msg`
        and increase the change count by one.
        Subclasses may extend this.
        """
        self.messages[node].append(commit_msg)
        self.changes[node] += 1

class ChangeCounter(ChangeCounter):
    def update_elems(self, node: Tuple, m: ModifiedFile) -> None:
        """
        Hook for counting changes to the elements within `node`,
        given the modification `m`.
        Does nothing here; subclasses are expected to define it.
        """

# Root of the enclosing git checkout as found by `current_repo()`;
# `None` if the current directory is not inside a git repository
DEBUGGINGBOOK_REPO = current_repo()

DEBUGGINGBOOK_REPO

'/Users/zeller/Projects/debuggingbook'

# Default `start_date` of `debuggingbook_change_counter()`
DEBUGGINGBOOK_START_DATE: datetime = datetime(2021, 3, 1)

NUM_WORKERS = 4  # Number of threads to be run in parallel

def debuggingbook_change_counter(
        cls: Type,
        start_date: datetime = DEBUGGINGBOOK_START_DATE) -> Any:
    """
    Instantiate a ChangeCounter (sub)class `cls` with the debuggingbook repo.
    Only mines changes after `start_date` (default: DEBUGGINGBOOK_START_DATE)

    `cls` is called as `cls(DEBUGGINGBOOK_REPO, filter=..., since=start_date,
    num_workers=NUM_WORKERS)`; the resulting object is returned.
    """

    # Path prefixes whose files are not to be counted
    excluded_prefixes = ('docs/', 'notebooks/shared/')

    def filter(m: ModifiedFile) -> bool:
        """
        Do not include
        * modifications without a new path
        * the `docs/` directory; it only holds generated Web pages
        * the `notebooks/shared/` package; this is infrastructure
        * the `synopsis` pictures; these are all generated
        """
        path = m.new_path
        if not path:
            # Return a proper bool rather than leaking `None` or `''`
            return False

        return not (path.startswith(excluded_prefixes) or
                    '-synopsis-' in path)

    return cls(DEBUGGINGBOOK_REPO,
               filter=filter,
               since=start_date,
               num_workers=NUM_WORKERS)

with Timer() as t:
    change_counter = debuggingbook_change_counter(ChangeCounter)

t.elapsed_time()

137.38987245899625

list(change_counter.changes.keys())[:10]

[('notebooks', 'Intro_Debugging.ipynb'),
 ('html', 'custom.css'),
 ('Chapters.makefile',),
 ('notebooks', '02_Observing.ipynb'),
 ('notebooks', '04_Reducing.ipynb'),
 ('notebooks', 'PerformanceDebugger.ipynb'),
 ('notebooks', 'ThreadDebugger.ipynb'),
 ('notebooks', 'Debugger.ipynb'),
 ('notebooks', 'Tracer.ipynb'),
 ('binder', 'postBuild')]

change_counter.changes.get(('Chapters.makefile',), None)

33

change_counter.messages.get(('Chapters.makefile',), None)

['New: (Incomplete) chapters on performance and concurrency debugging',
 'New: moved StackInspector in its own module',
 "New: mark (pretty much) all chapters as 'ready'",
 'Doc fix',
 'New: release first chapters',
 "New: have a 'shared' directory for material shared between fuzzingbook and debuggingbook; avoid cross-project links",
 "New: 'make shared' syncs the 'shared' folder",
 'New: can now run notebooks and check HTML as part of CI',
 'New: Assertions is ready',
 "Fix: Mark 'Assertions' as new, too",
 'New: publish Slicer',
 'Fix: DEPENDENCIES_PART was missing in PUBLIC_CHAPTERS',
 'New: publish DeltaDebugger',
 'New release: ChangeDebugger',
 'New: publish StatisticalDebugger',
 'Made StatisticalDebugger visible',
 'New: public chapter on dynamic invariants',
 'New: publish DDSetDebugger',
 'New: PerformanceDebugger goes live',
 'New: Publish repair chapter',
 'New chapter: Tracking',
 'New: ChangeCounter is public',
 'New: do not check types in fuzzingbook (yet)',
 'New: checking types is now the default',
 'New: illustrated code',
 'New: Illustrated Code',
 'New: efficient timeout handling',
 'Fix: added default target',
 'Fix: bad default target',
 'No new chapters',
 'Moved announcements from Twitter to Mastodon',
 'New: first sketch of Alhazen chapter',
 'Moved Alhazen out of beta']

for node in change_counter.changes:
    assert len(change_counter.messages[node]) == change_counter.changes[node]

change_counter.sizes.get(('Chapters.makefile',), None)

3801

class ChangeCounter(ChangeCounter):
    def map_node_sizes(self, scale: str = 'log') -> \
            Dict[Node, Union[int, float]]:
        """
        Return a mapping of nodes to the sizes used in the map.
        `scale` selects how the recorded sizes are transformed:
        * 'log' (default): `log(size + 1)`
        * 'sqrt': square root of the size
        * 'abs': the recorded sizes, untransformed
          (this returns the `self.sizes` dictionary itself, not a copy)
        Any other `scale` raises a `ValueError`.
        Can be overloaded in subclasses.
        """
        if scale == 'abs':
            return self.sizes

        if scale == 'log':
            def transform(size: Union[int, float]) -> float:
                return math.log(size+1)
        elif scale == 'sqrt':
            transform = math.sqrt
        else:
            raise ValueError(f"Unknown scale: {scale}; "
                             f"use one of [log, sqrt, abs]")

        return {node: transform(size) for node, size in self.sizes.items()}

class ChangeCounter(ChangeCounter):
    def map_node_color(self, node: Node) -> Optional[int]:
        """
        Return the number that determines the color of `node`:
        its change count, or `None` if no change was recorded for it.
        Can be overloaded in subclasses.
        """
        change_count = self.changes.get(node)
        return change_count

class ChangeCounter(ChangeCounter):
    def map_node_text(self, node: Node) -> Optional[str]:
        """
        Return the text displayed for `node`: its change count as a string,
        or `None` if no change was recorded for it.
        Can be overloaded in subclasses.
        """
        count = self.changes.get(node)
        if count is None:
            return None
        return str(count)

class ChangeCounter(ChangeCounter):
    def map_hoverinfo(self) -> str:
        """
        Return the `hoverinfo` setting handed to the tree map,
        i.e. what is shown when hovering over a node.
        To be overloaded in subclasses.
        """
        hoverinfo = 'label+text'
        return hoverinfo

    def map_colorscale(self) -> str:
        """
        Return the name of the color scale handed to the tree map.
        To be overloaded in subclasses.
        """
        colorscale = 'YlOrRd'
        return colorscale

class ChangeCounter(ChangeCounter):
    def map(self) -> go.Figure:
        """
        Produce an interactive tree map of the repository.
        Sizes, texts, colors, hover info, and color scale are taken
        from the respective `map_...()` methods.
        """
        node_sizes = self.map_node_sizes()
        options = {
            'text': self.map_node_text,
            'hoverinfo': self.map_hoverinfo(),
            'marker_colors': self.map_node_color,
            'marker_colorscale': self.map_colorscale(),
            'root_label': self.repo,
            'branchvalues': 'total',
        }
        treemap = ep.Treemap(node_sizes, **options)

        figure = go.Figure(treemap)
        figure.update_layout(margin={'l': 0, 'r': 0, 't': 30, 'b': 0})

        return figure

change_counter = debuggingbook_change_counter(ChangeCounter)

change_counter.map()

sorted(change_counter.changes.items(), key=lambda kv: kv[1], reverse=True)[:4]

[(('notebooks', 'Slicer.ipynb'), 51),
 (('notebooks', 'ChangeCounter.ipynb'), 45),
 (('requirements.txt',), 38),
 (('Chapters.makefile',), 33)]

# ignore
# All top-level notebooks (nodes of the form (folder, 'X.ipynb')),
# sorted by number of changes, most changed first
all_notebooks = [node for node in change_counter.changes.keys()
                 if len(node) == 2 and node[1].endswith('.ipynb')]
all_notebooks.sort(key=lambda node: change_counter.changes[node],
                   reverse=True)

# Options are the notebook names without extension, listed in the order
# rank 0, 3, 1, 2. The answer expression evaluates to [1, 3.0] --
# presumably 1-based option numbers, i.e. the two top-ranked notebooks.
quiz("Which two notebooks have seen the most changes over time?",
     [
         f"`{all_notebooks[i][1].split('.')[0]}`"
         for i in [0, 3, 1, 2]
         if i < len(all_notebooks)
     ]
     , '[1234 % 3, 3702 / 1234]')

[notebook[1].split('.')[0] for notebook in all_notebooks[:2]]

['Slicer', 'ChangeCounter']

class FixCounter(ChangeCounter):
    """
    Count the fixes for files in the repository.
    Fixes are all commits whose message starts with 'Fix:'
    """

    def include(self, m: ModifiedFile) -> bool:
        """
        Include all modifications whose commit messages start with 'Fix:'
        (and which the inherited `include()` accepts).
        """
        # Convert to bool: the `and` chain could otherwise hand back
        # a non-boolean operand such as `None` or `''`
        return bool(super().include(m) and m and m.msg.startswith("Fix:"))

class FixCounter(FixCounter):
    def map_node_text(self, node: Node) -> str:
        """
        Return the commit messages recorded for `node`,
        joined by HTML line breaks (empty if there are none).
        """
        fix_messages = self.messages.get(node, [])
        return "<br>".join(fix_messages)

    def map_hoverinfo(self) -> str:
        """Return the `hoverinfo` setting for the tree map: label only."""
        hoverinfo = 'label'
        return hoverinfo

fix_counter = debuggingbook_change_counter(FixCounter)

fix_counter.map()

# fix_counter = debuggingbook_change_counter(
#     FixCounter,
#     start_date=datetime(1999, 1, 1)
# )
# fix_counter.map()

magic.from_buffer('''
#include <stdio.h>

int main(int argc, char *argv[]) {
    printf("Hello, world!\n")
}
''')

'C source, ASCII text'

magic.from_buffer('''
def foo():
    print("Hello, world!")
''')

'Python script, ASCII text executable'

magic.from_buffer(open(os.path.join(current_repo(),   # type: ignore
                                    'notebooks',
                                    'Assertions.ipynb')).read())

'JSON text data'

# ignore
from typing import Pattern

# Table of element delimiters, one entry per family of file types.
# Each entry is a triple of compiled patterns:
#   1. matched against the lowercased file type reported by `magic`
#   2. matched against a line; a match starts an element,
#      whose name is the group `name`
#   3. matched against a line; a match ends the current element
# `rxdelim()` returns patterns 2 and 3 of the first entry whose
# pattern 1 matches.
DELIMITERS: List[Tuple[Pattern, Pattern, Pattern]] = [
    (
        # Python
        re.compile(r'^python.*'),

        # Beginning of element: `def` or `class` (optionally `async`)
        # at the very start of a line, followed by the name
        re.compile(r'^(async\s+)?(def|class)\s+(?P<name>\w+)\W.*'),

        # End of element: a line starting with anything
        # but `#` or whitespace
        re.compile(r'^[^#\s]')
    ),
    (
        # Jupyter Notebooks
        # File type starts with 'json', 'exported sgml', or 'jupyter'
        re.compile(r'^(json|exported sgml|jupyter).*'),
        # Beginning: as for Python, but preceded by whitespace and a `"`
        # (presumably a source line stored as a JSON string)
        re.compile(r'^\s+"(async\s+)?(def|class)\s+(?P<name>\w+)\W'),
        # End: a quoted line starting with anything but `#`, whitespace,
        # or a backslash -- or a line starting with whitespace and `]`
        re.compile(r'^(\s+"[^#\s\\]|\s+\])')
    ),
    (
        # C source code (actually, any { }-delimited language)
        re.compile(r'^(c |c\+\+|c#|java|perl|php).*'),
        # Beginning: an unindented line with a name followed by `(` or `{`
        re.compile(r'^[^\s].*\s+(?P<name>\w+)\s*[({].*'),
        # End: a line starting with `}`
        re.compile(r'^[}]')
    )
]

def rxdelim(content: str) -> Tuple[Optional[Pattern], Optional[Pattern]]:
    """
    Determine the file type of `content` via `magic` and return the
    (begin, end) delimiter patterns of the first entry in `DELIMITERS`
    whose type pattern matches the lowercased type.
    If no entry matches, return `None, None`.
    """
    file_type = magic.from_buffer(content).lower()

    matching_delimiters = (
        (rxbegin, rxend)
        for rxtp, rxbegin, rxend in DELIMITERS
        if rxtp.match(file_type)
    )
    return next(matching_delimiters, (None, None))

Mapping = List[Optional[str]]

def elem_mapping(content: str, log: bool = False) -> Mapping:
    """
    Return a list that maps each line number of `content` to the name
    of the element the line belongs to, or `None` if it belongs to none.
    Line numbers start at 1; index 0 always holds `None`.
    If no delimiters are known for `content`, return an empty list.
    If `log` is set, print each line with its number and element.
    """
    rxbegin, rxend = rxdelim(content)
    if rxbegin is None or rxend is None:
        return []

    # Placeholder for index 0, as lines are numbered from 1
    mapping: List[Optional[str]] = [None]
    current_elem = None

    for lineno, line in enumerate(content.split('\n'), start=1):
        begin = rxbegin.match(line)
        if begin:
            # This line opens a new element
            current_elem = begin.group('name')
        elif rxend.match(line):
            # This line closes the current element
            current_elem = None

        mapping.append(current_elem)

        if log:
            print(f"{lineno:3} {str(current_elem):15} {line}")

    return mapping

some_c_source = """
#include <stdio.h>

int foo(int x) {
    return x;
}

struct bar {
    int x, y;
}

int main(int argc, char *argv[]) {
    return foo(argc);
}

"""
some_c_mapping = elem_mapping(some_c_source, log=True)

  1 None            
  2 None            #include <stdio.h>
  3 None            
  4 foo             int foo(int x) {
  5 foo                 return x;
  6 None            }
  7 None            
  8 bar             struct bar {
  9 bar                 int x, y;
 10 None            }
 11 None            
 12 main            int main(int argc, char *argv[]) {
 13 main                return foo(argc);
 14 None            }
 15 None            
 16 None

some_c_mapping[1], some_c_mapping[8]

(None, 'bar')

some_python_source = """
def foo(x):
    return x

class bar(blue):
    x = 25
    def f(x):
        return 26

def main(argc):
    return foo(argc)

"""
some_python_mapping = elem_mapping(some_python_source, log=True)

  1 None            
  2 foo             def foo(x):
  3 foo                 return x
  4 foo             
  5 bar             class bar(blue):
  6 bar                 x = 25
  7 bar                 def f(x):
  8 bar                     return 26
  9 bar             
 10 main            def main(argc):
 11 main                return foo(argc)
 12 main            
 13 main

# some_jupyter_source = open("Debugger.ipynb").read()
# some_jupyter_mapping = elem_mapping(some_jupyter_source, log=True)

def changed_elems_by_mapping(mapping: Mapping, start: int, length: int = 0) -> Set[str]:
    """
    Return the set of elements that `mapping` assigns to the lines
    `start` ... `start + length` (both inclusive).
    Lines beyond the end of `mapping` and lines that belong to
    no element contribute nothing.
    """
    last_line = start + length
    entries = (mapping[lineno]
               for lineno in range(start, last_line + 1)
               if lineno < len(mapping))

    return {elem for elem in entries if elem}

changed_elems_by_mapping(some_python_mapping, start=2, length=4)

{'bar', 'foo'}

def elem_size(elem: str, source: str) -> int:
    """
    Return the number of characters in `source` that belong to `elem`:
    the total length of all lines mapped to `elem` by `elem_mapping()`,
    counting one newline per line.
    """
    # Prepend a dummy line so that indices agree with the 1-based mapping
    source_lines = [''] + source.split('\n')
    mapping = elem_mapping(source)

    return sum(len(line) + 1
               for owner, line in zip(mapping, source_lines)
               if owner == elem)

elem_size('foo', some_python_source)

26

assert sum(elem_size(name, some_python_source) 
           for name in ['foo', 'bar', 'main']) == len(some_python_source)

def changed_elems(old_source: str, new_source: str) -> Set[str]:
    """
    Determine the elements affected by the change from `old_source` to `new_source`.

    Computes the patches between both sources with `diff()`, and collects
    the names of all elements (as given by `elem_mapping()`) on lines
    touched by an insertion or a deletion, in either version.
    """
    patches = diff(old_source, new_source)

    # Line number -> element name, for the old and the new version
    old_mapping = elem_mapping(old_source)
    new_mapping = elem_mapping(new_source)

    elems = set()

    for patch in patches:
        # Mappings are 1-based, hence the + 1
        # (assumes `start1`/`start2` are 0-based line offsets -- TODO confirm
        # against the `diff()` helper)
        old_start_line = patch.start1 + 1
        new_start_line = patch.start2 + 1

        for (op, data) in patch.diffs:
            # Number of lines covered by this diff chunk
            length = data.count('\n')

            if op == diff_match_patch.DIFF_INSERT:
                # Inserted text: one line in the old version,
                # `length` additional lines in the new version
                elems |= changed_elems_by_mapping(old_mapping, old_start_line)
                elems |= changed_elems_by_mapping(new_mapping, new_start_line, length)
            elif op == diff_match_patch.DIFF_DELETE:
                # Deleted text: `length` additional lines in the old version,
                # one line in the new version
                elems |= changed_elems_by_mapping(old_mapping, old_start_line, length)
                elems |= changed_elems_by_mapping(new_mapping, new_start_line)

            # Both counters advance by `length` after every chunk,
            # whatever its operation.
            # NOTE(review): an insertion adds lines only to the new version,
            # and a deletion removes lines only from the old one; confirm
            # that advancing both sides is intended.
            old_start_line += length
            new_start_line += length

    return elems

some_new_python_source = """
def foo(y):
    return y

class qux(blue):
    x = 25
    def f(x):
        return 26

def main(argc):
    return foo(argc)

"""

changed_elems(some_python_source, some_new_python_source)

{'bar', 'foo', 'qux'}

class FineChangeCounter(ChangeCounter):
    """Count the changes for files in the repository and their elements"""

    def update_elems(self, node: Node, m: ModifiedFile) -> None:
        """
        Update size and change counters for every element of `node`
        affected by the modification `m`.
        Each element `e` is recorded under the node `node + (e,)`;
        its size is taken from the new version of the file.
        """
        before = m.content_before or bytes()
        after = m.content or bytes()

        # Content comes as bytes instead of strings;
        # decoding as latin1 works for any byte sequence
        old_source = before if isinstance(before, str) else str(before, 'latin1')
        new_source = after if isinstance(after, str) else str(after, 'latin1')

        for elem in changed_elems(old_source, new_source):
            elem_node = node + (elem,)

            self.update_size(elem_node, elem_size(elem, new_source))
            self.update_changes(elem_node, m.msg)

with Timer() as t:
    fine_change_counter = debuggingbook_change_counter(FineChangeCounter)

t.elapsed_time()

128.7874694170314

fine_change_counter.map()

elem_nodes = [node for node in fine_change_counter.changes.keys()
              if len(node) == 3 and node[1].endswith('.ipynb')]
elem_nodes.sort(key=lambda node: fine_change_counter.changes[node],
                reverse=True)
[(node, fine_change_counter.changes[node]) for node in elem_nodes[:1]]

[(('notebooks', 'Slicer.ipynb', 'Slicer'), 3)]

# Options name the element and its notebook, listed in the order
# rank 3, 1, 2, 0 of `elem_nodes`. The answer expression evaluates to 2 --
# presumably the 1-based option number, i.e. the element ranked second.
quiz("Which is the _second_ most changed element?",
     [
        f"`{elem_nodes[i][2]}` in `{elem_nodes[i][1].split('.ipynb')[0]}`"
        for i in [3, 1, 2, 0]
        if i < len(elem_nodes)
     ], '1975308642 // 987654321')

[(node, fine_change_counter.changes[node]) for node in elem_nodes[:5]]

[(('notebooks', 'Slicer.ipynb', 'Slicer'), 3),
 (('notebooks', 'StackInspector.ipynb', 'StackInspector'), 2),
 (('notebooks', 'Assertions.ipynb', 'ManagedMemory'), 2),
 (('notebooks', 'ChangeCounter.ipynb', 'ChangeCounter'), 2),
 (('notebooks', 'Repairer.ipynb', 'middle_tree'), 2)]

change_counter = ChangeCounter(repository)

change_counter.changes.get(('README.md',), None)

20

change_counter.messages.get(('README.md',), None)

['Doc update',
 'Doc update',
 'Doc update',
 'Doc update',
 'Fix: corrected rule for rendered notebooks (#24)\nNew: strip out any <iframe> tags\nNew: when rendering .md files, replace videos by proper image',
 'Doc update',
 'Doc update',
 'New: show badges at top of GitHub project page',
 'More badges',
 'Fix: bad links in CI badges',
 'New: prefer Unicode arrows over LaTeX ones',
 'Updated README.md',
 'Update',
 'Doc update',
 'Doc update',
 'Doc update',
 'Doc update',
 'Updated README',
 'Doc update',
 'Doc update']

change_counter.sizes.get(('README.md',), None)

4728

fine_change_counter.map()

# ignore
from ClassDiagram import display_class_hierarchy

# ignore
display_class_hierarchy([FineChangeCounter, FixCounter],
                        public_methods=[
                            ChangeCounter.__init__,
                            ChangeCounter.map
                        ],
                        project='debuggingbook')

Where the Bugs are

Mining Change Histories

Mining with PyDriller

Counting Changes

Visualizing Past Changes

Quiz

Counting Past Fixes

Counting Fine-Grained Changes

Mapping Elements to Locations

Determining Changed Elements

Putting it all Together

Quiz

Synopsis

Lessons Learned

Background

Exercises

Exercise 1: Fine-Grained Fixes