Merge pull request #40 from regisb/regisb/multiline-annotations

[BD-21] Implement multi-line code annotations
openedx · Jul 22, 2020 · a1bad02 · a1bad02
2 parents 4870b3e + 4cfcf96
commit a1bad02
Show file tree

Hide file tree

Showing 11 changed files with 210 additions and 71 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -16,6 +16,11 @@ Unreleased
 
 *
 
+[0.4.0] - 2020-07-22
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multi-line code annotations
+
 [0.3.4] - 2020-05-06
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.3.4'
+__version__ = '0.4.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
@@ -4,7 +4,7 @@
 import re
 from abc import ABCMeta, abstractmethod
 
-from code_annotations.helpers import clean_abs_path, get_annotation_regex
+from code_annotations.helpers import clean_abs_path, clean_annotation, get_annotation_regex
 
 
 class AnnotationExtension(object, metaclass=ABCMeta):
@@ -56,24 +56,9 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     Returns a 2-tuple of:
      - ("Comment text", None) in the case of a multi-line comment OR
      - (None, "Comment text") in the case of a single-line comment
-
-    TODO: Make this handle multi-line annotation comments again.
     """
     comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
 
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
-
-    [\s\S]*? - Strip out any characters between the start of the comment and the annotation
-    ({})     - {} is a Python format string that will be replaced with a regex escaped and
-               then or-joined to make a list of the annotation tokens we're looking for
-               Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)     - and capture all characters until the end of the line
-
-    Returns a 2-tuple of found annotation token and annotation comment
-
-    TODO: Make multi line annotation comments work again.
-    """
     def __init__(self, config, echo):
         """
         Set up the extension and create the regexes used to do searches.
@@ -88,14 +73,16 @@ def __init__(self, config, echo):
             raise ValueError('Subclasses of SimpleRegexAnnotationExtension must define lang_comment_definition!')
 
         # pylint: disable=not-a-mapping
-        self.comment_regex = self.comment_regex_fmt.format(**self.lang_comment_definition)
+        self.comment_regex = re.compile(
+            self.comment_regex_fmt.format(**self.lang_comment_definition)
+        )
 
         # Parent class will allow this class to populate self.strings_to_search via
         # calls to _add_annotation_token or _add_annotation_group for each configured
         # annotation.
         self.query = get_annotation_regex(self.config.annotation_regexes)
 
-        self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query))
+        self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query.pattern))
 
     def search(self, file_handle):
         """
@@ -115,34 +102,32 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
-            for match in re.finditer(self.comment_regex, txt):
+            for match in self.comment_regex.finditer(txt):
                 # Should only be one match
                 comment_content = [item for item in match.groups() if item is not None][0]
-                for inner_match in re.finditer(self.query, comment_content):
+                for inner_match in self.query.finditer(comment_content):
                     # Get the line number by counting newlines + 1 (for the first line).
                     # Note that this is the line number of the beginning of the comment, not the
                     # annotation token itself.
                     line = txt.count('\n', 0, match.start()) + 1
 
-                    # No matter how long the regex is, there should only be 2 non-None items,
-                    # with the first being the annotation token and the 2nd being the comment.
-                    cleaned_groups = [item for item in inner_match.groups() if item is not None]
-
-                    if len(cleaned_groups) != 2:  # pragma: no cover
-                        raise Exception('{}::{}: Number of found items in the list is not 2. Found: {}'.format(
+                    try:
+                        annotation_token = inner_match.group('token')
+                        annotation_data = inner_match.group('data')
+                    except IndexError:
+                        # pragma: no cover
+                        raise ValueError('{}::{}: Could not find "data" or "token" groups. Found: {}'.format(
                             fname,
                             line,
-                            cleaned_groups
+                            inner_match.groupdict()
                         ))
-
-                    annotation, comment = cleaned_groups
-
+                    annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
                     found_annotations.append({
                         'found_by': self.extension_name,
                         'filename': fname,
                         'line_number': line,
-                        'annotation_token': annotation.strip(),
-                        'annotation_data': comment.strip()
+                        'annotation_token': annotation_token,
+                        'annotation_data': annotation_data,
                     })
 
         return found_annotations
diff --git a/code_annotations/find_django.py b/code_annotations/find_django.py
@@ -4,7 +4,6 @@
 import inspect
 import os
 import pprint
-import re
 import sys
 
 import django
@@ -13,7 +12,7 @@
 from django.db import models
 
 from code_annotations.base import BaseSearch
-from code_annotations.helpers import fail, get_annotation_regex
+from code_annotations.helpers import clean_annotation, fail, get_annotation_regex
 
 DEFAULT_SAFELIST_FILE_PATH = '.annotation_safe_list.yml'
 
@@ -109,33 +108,30 @@ def _append_model_annotations(self, model_type, model_id, query, model_annotatio
         with open(filename, 'r') as file_handle:
             txt = file_handle.read()
 
-        for inner_match in re.finditer(query, model_type.__doc__):
-            # TODO: This is duplicated code with extensions/base.py
-            # No matter how long the regex is, there should only be 2 non-None items,
-            # with the first being the annotation token and the 2nd being the comment.
-            cleaned_groups = [item for item in inner_match.groups() if item is not None]
-
-            if len(cleaned_groups) != 2:  # pragma: no cover
-                raise Exception('{}: Number of found items in the list is not 2. Found: {}'.format(
+        # Get the line number by counting newlines + 1 (for the first line).
+        # Note that this is the line number of the beginning of the comment, not the
+        # annotation token itself. We find based on the entire code content of the model
+        # as that seems to be the only way to be sure we're getting the correct line number.
+        # It is slow and should be replaced if we can find a better way that is accurate.
+        line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
+
+        for inner_match in query.finditer(model_type.__doc__):
+            try:
+                annotation_token = inner_match.group('token')
+                annotation_data = inner_match.group('data')
+            except IndexError:
+                # pragma: no cover
+                raise ValueError('{}: Could not find "data" or "token" groups. Found: {}'.format(
                     self.get_model_id(model_type),
-                    cleaned_groups
+                    inner_match.groupdict()
                 ))
-
-            annotation, comment = cleaned_groups
-
-            # Get the line number by counting newlines + 1 (for the first line).
-            # Note that this is the line number of the beginning of the comment, not the
-            # annotation token itself. We find based on the entire code content of the model
-            # as that seems to be the only way to be sure we're getting the correct line number.
-            # It is slow and should be replaced if we can find a better way that is accurate.
-            line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
-
+            annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
             model_annotations.append({
                 'found_by': "django",
                 'filename': filename,
                 'line_number': line,
-                'annotation_token': annotation.strip(),
-                'annotation_data': comment.strip(),
+                'annotation_token': annotation_token,
+                'annotation_data': annotation_data,
                 'extra': {
                     'object_id': model_id,
                     'full_comment': model_type.__doc__.strip()

diff --git a/code_annotations/helpers.py b/code_annotations/helpers.py
@@ -2,6 +2,7 @@
 Helpers for code_annotations scripts.
 """
 import os
+import re
 import sys
 
 import click
@@ -113,25 +114,58 @@ def get_annotation_regex(annotation_regexes):
     """
     Return the full regex to search inside comments for configured annotations.
 
+    A successful match against the regex will return two groups of interest: 'token'
+    and 'data'.
+
+    This regular expression supports annotation data that spans multiple lines. To do
+    so, indent each line after the first by at least two spaces more than the token line. E.g.:
+
+        .. pii: First line
+          second line
+
+    Unfortunately, the indenting spaces will find their way to the content of the "data" group.
+
     Args:
         annotation_regexes: List of re.escaped annotation tokens to search for.
 
     Returns:
         Regex ready for searching comments for annotations.
     """
-    # pylint: disable=pointless-string-statement
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
+    annotation_regex = r"""
+    (?P<space>[\ \t]*)               # Leading empty spaces
+    (?P<token>{tokens})              # Python format string that will be replaced with a
+                                     # regex, escaped and then or-joined to make a list
+                                     # of the annotation tokens we're looking for
+                                     # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
+    (?P<data>                        # Captured annotation data
+        (?:                          # non-capture mode
+            .                        # any non-newline character
+            |                        # or new line of multi-line annotation data
+            (?:                      # non-capture mode
+                \n{{1,}}             # at least one newline,
+                (?P=space)           # followed by as much space as the prefix,
+                (?P<indent>\ {{2,}}) # at least two spaces,
+                (?=[^\ ])            # and a non-space character (look-ahead)
+                (?!{tokens})         # that does not match any of the token regexes
+            )                        #
+        )*                           # any number of times
+    )
+    """
+    annotation_regex = annotation_regex.format(tokens='|'.join(annotation_regexes))
+    return re.compile(annotation_regex, flags=re.VERBOSE)
 
-    [\s\S]*? - Strip out any characters between the start of the comment and the annotation
-    ({})     - {} is a Python format string that will be replaced with a regex escaped and
-               then or-joined to make a list of the annotation tokens we're looking for
-               Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)     - and capture all characters until the end of the line
 
-    Returns a 2-tuple of found annotation token and annotation comment
+def clean_annotation(token, data):
+    """
+    Clean annotation token and data by stripping leading and trailing whitespace.
+
+    Args:
+        token (str)
+        data (str)
 
-    TODO: Make multi line annotation comments work again.
+    Returns:
+        (str, str): Tuple of cleaned token, data
     """
-    annotation_regex = r'[\s\S]*?({})(.*)'
-    return annotation_regex.format('|'.join(annotation_regexes))
+    token = token.strip()
+    data = data.strip()
+    return token, data
diff --git a/docs/writing_annotations.rst b/docs/writing_annotations.rst
@@ -13,8 +13,10 @@ comments into two parts- the annotation token, and the annotation data.
 
 - Annotation data
     Annotation data can either be a simple free text comment that is on the same line as the token, or a choice list.
-    The choices in a choice list are configured in the configuration file and can be separated by spaces or commas when
-    used in comments. As such, the choices themselves should not contain spaces or commas.
+    Free text annotations can span multiple lines, provided every line after the first is
+    indented at least two spaces deeper than the token line. The choices in a choice list are configured in
+    the configuration file and can be separated by spaces or commas when used in
+    comments. As such, the choices themselves should not contain spaces or commas.
 
 The information below applies to both the Static Search and Django Model Search tools, with the exception that the
 Django Model Search only looks in model docstrings.
@@ -26,7 +28,7 @@ Configuration for a "fun fact" annotation type, denoted by the annotation token
 .. code-block:: yaml
 
     annotations:
-        ".. fun_fact::":
+        ".. fun_fact:":
 
 There are no choices given, so this is a free form comment type of annotation. Note the trailing colon! It would be used
 in Python like this:
@@ -51,6 +53,30 @@ When a report is run against this code an entry like this will be generated in t
 
 *Note that the rest of the comment is ignored in the report.*
 
+An annotation can also span multiple lines. For instance:
+
+.. code-block:: python
+
+    """
+        This function handles setting the price on an item in the database.
+
+        .. fun_fact: This code is the only remaining piece of our first commit!
+          To write long texts, prepend at least two additional spaces at the start
+          of every line after the first.
+    """
+
+This code would result in the following report:
+
+.. code-block:: yaml
+
+    - annotation_data: "This code is the only remaining piece of our first commit!\n \
+        \     To write long texts, prepend at least two additional spaces at the start\n\
+        \      of every line after the first."
+      annotation_token: '.. fun_fact:'
+      filename: foo/bar/something.py
+      found_by: python
+      line_number: 1
+
 Configuration for an "async" annotation type, denoted by the annotation token ``.. async:`` and choices denoting the
 types of asynchronous processors hooked up to it:
 

diff --git a/tests/extensions/python_test_files/multiline_empty_first_line.pyt b/tests/extensions/python_test_files/multiline_empty_first_line.pyt
@@ -0,0 +1,7 @@
+"""
+.. pii:
+  This is an annotation that
+  spans multiple lines and allows developers to
+  write more extensive docs.
+Comment after annotation and being annotated
+"""
diff --git a/tests/extensions/python_test_files/multiline_indented.pyt b/tests/extensions/python_test_files/multiline_indented.pyt
@@ -0,0 +1,6 @@
+"""
+    .. pii: A long description that
+        spans multiple indented
+        lines
+    .. pii_types: id, name
+"""
diff --git a/tests/extensions/python_test_files/multiline_paragraphs.pyt b/tests/extensions/python_test_files/multiline_paragraphs.pyt
@@ -0,0 +1,15 @@
+"""
+.. pii: This is an annotation that
+  spans multiple paragraphs.
+
+  This allows developers to write even more
+  extensive docs.
+Comment after annotation and being annotated
+"""
+
+"""
+Docstring
+.. pii: Annotation 1 with:
+
+     Multi-line and multi-paragraph.
+"""
diff --git a/tests/extensions/python_test_files/multiline_simple.pyt b/tests/extensions/python_test_files/multiline_simple.pyt
@@ -0,0 +1,8 @@
+"""
+Docstring
+
+.. pii: A long description that
+  spans multiple
+  lines
+.. pii_types: id, name
+"""