Skip to content

Commit

Permalink
Merge pull request #40 from regisb/regisb/multiline-annotations
Browse files Browse the repository at this point in the history
[BD-21] Implement multi-line code annotations
  • Loading branch information
robrap authored Jul 22, 2020
2 parents 4870b3e + 4cfcf96 commit a1bad02
Show file tree
Hide file tree
Showing 11 changed files with 210 additions and 71 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ Unreleased

*

[0.4.0] - 2020-07-22
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Add support for multi-line code annotations

[0.3.4] - 2020-05-06
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion code_annotations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
Extensible tools for parsing annotations in codebases.
"""

__version__ = '0.3.4'
__version__ = '0.4.0'
49 changes: 17 additions & 32 deletions code_annotations/extensions/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from abc import ABCMeta, abstractmethod

from code_annotations.helpers import clean_abs_path, get_annotation_regex
from code_annotations.helpers import clean_abs_path, clean_annotation, get_annotation_regex


class AnnotationExtension(object, metaclass=ABCMeta):
Expand Down Expand Up @@ -56,24 +56,9 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
Returns a 2-tuple of:
- ("Comment text", None) in the case of a multi-line comment OR
- (None, "Comment text") in the case of a single-line comment
TODO: Make this handle multi-line annotation comments again.
"""
comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'

r"""
This format string/regex finds our annotation token and choices / comments inside a comment:
[\s\S]*? - Strip out any characters between the start of the comment and the annotation
({}) - {} is a Python format string that will be replaced with a regex escaped and
then or-joined to make a list of the annotation tokens we're looking for
Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
(.*) - and capture all characters until the end of the line
Returns a 2-tuple of found annotation token and annotation comment
TODO: Make multi line annotation comments work again.
"""
def __init__(self, config, echo):
"""
Set up the extension and create the regexes used to do searches.
Expand All @@ -88,14 +73,16 @@ def __init__(self, config, echo):
raise ValueError('Subclasses of SimpleRegexAnnotationExtension must define lang_comment_definition!')

# pylint: disable=not-a-mapping
self.comment_regex = self.comment_regex_fmt.format(**self.lang_comment_definition)
self.comment_regex = re.compile(
self.comment_regex_fmt.format(**self.lang_comment_definition)
)

# Parent class will allow this class to populate self.strings_to_search via
# calls to _add_annotation_token or _add_annotation_group for each configured
# annotation.
self.query = get_annotation_regex(self.config.annotation_regexes)

self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query))
self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query.pattern))

def search(self, file_handle):
"""
Expand All @@ -115,34 +102,32 @@ def search(self, file_handle):
if any(anno in txt for anno in self.config.annotation_tokens):
fname = clean_abs_path(file_handle.name, self.config.source_path)

for match in re.finditer(self.comment_regex, txt):
for match in self.comment_regex.finditer(txt):
# Should only be one match
comment_content = [item for item in match.groups() if item is not None][0]
for inner_match in re.finditer(self.query, comment_content):
for inner_match in self.query.finditer(comment_content):
# Get the line number by counting newlines + 1 (for the first line).
# Note that this is the line number of the beginning of the comment, not the
# annotation token itself.
line = txt.count('\n', 0, match.start()) + 1

# No matter how long the regex is, there should only be 2 non-None items,
# with the first being the annotation token and the 2nd being the comment.
cleaned_groups = [item for item in inner_match.groups() if item is not None]

if len(cleaned_groups) != 2: # pragma: no cover
raise Exception('{}::{}: Number of found items in the list is not 2. Found: {}'.format(
try:
annotation_token = inner_match.group('token')
annotation_data = inner_match.group('data')
except IndexError:
# pragma: no cover
raise ValueError('{}::{}: Could not find "data" or "token" groups. Found: {}'.format(
fname,
line,
cleaned_groups
inner_match.groupdict()
))

annotation, comment = cleaned_groups

annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
found_annotations.append({
'found_by': self.extension_name,
'filename': fname,
'line_number': line,
'annotation_token': annotation.strip(),
'annotation_data': comment.strip()
'annotation_token': annotation_token,
'annotation_data': annotation_data,
})

return found_annotations
42 changes: 19 additions & 23 deletions code_annotations/find_django.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import inspect
import os
import pprint
import re
import sys

import django
Expand All @@ -13,7 +12,7 @@
from django.db import models

from code_annotations.base import BaseSearch
from code_annotations.helpers import fail, get_annotation_regex
from code_annotations.helpers import clean_annotation, fail, get_annotation_regex

DEFAULT_SAFELIST_FILE_PATH = '.annotation_safe_list.yml'

Expand Down Expand Up @@ -109,33 +108,30 @@ def _append_model_annotations(self, model_type, model_id, query, model_annotatio
with open(filename, 'r') as file_handle:
txt = file_handle.read()

for inner_match in re.finditer(query, model_type.__doc__):
# TODO: This is duplicated code with extensions/base.py
# No matter how long the regex is, there should only be 2 non-None items,
# with the first being the annotation token and the 2nd being the comment.
cleaned_groups = [item for item in inner_match.groups() if item is not None]

if len(cleaned_groups) != 2: # pragma: no cover
raise Exception('{}: Number of found items in the list is not 2. Found: {}'.format(
# Get the line number by counting newlines + 1 (for the first line).
# Note that this is the line number of the beginning of the comment, not the
# annotation token itself. We find based on the entire code content of the model
# as that seems to be the only way to be sure we're getting the correct line number.
# It is slow and should be replaced if we can find a better way that is accurate.
line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1

for inner_match in query.finditer(model_type.__doc__):
try:
annotation_token = inner_match.group('token')
annotation_data = inner_match.group('data')
except IndexError:
# pragma: no cover
raise ValueError('{}: Could not find "data" or "token" groups. Found: {}'.format(
self.get_model_id(model_type),
cleaned_groups
inner_match.groupdict()
))

annotation, comment = cleaned_groups

# Get the line number by counting newlines + 1 (for the first line).
# Note that this is the line number of the beginning of the comment, not the
# annotation token itself. We find based on the entire code content of the model
# as that seems to be the only way to be sure we're getting the correct line number.
# It is slow and should be replaced if we can find a better way that is accurate.
line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1

annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
model_annotations.append({
'found_by': "django",
'filename': filename,
'line_number': line,
'annotation_token': annotation.strip(),
'annotation_data': comment.strip(),
'annotation_token': annotation_token,
'annotation_data': annotation_data,
'extra': {
'object_id': model_id,
'full_comment': model_type.__doc__.strip()
Expand Down
58 changes: 46 additions & 12 deletions code_annotations/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Helpers for code_annotations scripts.
"""
import os
import re
import sys

import click
Expand Down Expand Up @@ -113,25 +114,58 @@ def get_annotation_regex(annotation_regexes):
"""
Return the full regex to search inside comments for configured annotations.
A successful match against the regex will return two groups of interest: 'token'
and 'data'.
This regular expression supports annotations whose data spans multiple lines. To do
so, prefix each line after the first with at least two leading spaces. E.g.:
.. pii: First line
second line
Unfortunately, the indenting spaces will find their way into the content of the "data" group.
Args:
annotation_regexes: List of re.escaped annotation tokens to search for.
Returns:
Regex ready for searching comments for annotations.
"""
# pylint: disable=pointless-string-statement
r"""
This format string/regex finds our annotation token and choices / comments inside a comment:
annotation_regex = r"""
(?P<space>[\ \t]*) # Leading empty spaces
(?P<token>{tokens}) # Python format string that will be replaced with a
# regex, escaped and then or-joined to make a list
# of the annotation tokens we're looking for
# Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
(?P<data> # Captured annotation data
(?: # non-capture mode
. # any non-newline character
| # or new line of multi-line annotation data
(?: # non-capture mode
\n{{1,}} # at least one newline,
(?P=space) # followed by as much space as the prefix,
(?P<indent>\ {{2,}}) # at least two spaces,
(?=[^\ ]) # and a non-space character (look-ahead)
(?!{tokens}) # that does not match any of the token regexes
) #
)* # any number of times
)
"""
annotation_regex = annotation_regex.format(tokens='|'.join(annotation_regexes))
return re.compile(annotation_regex, flags=re.VERBOSE)

[\s\S]*? - Strip out any characters between the start of the comment and the annotation
({}) - {} is a Python format string that will be replaced with a regex escaped and
then or-joined to make a list of the annotation tokens we're looking for
Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
(.*) - and capture all characters until the end of the line

Returns a 2-tuple of found annotation token and annotation comment
def clean_annotation(token, data):
"""
Clean annotation token and data by stripping all leading and trailing whitespace.
Args:
token (str)
data (str)
TODO: Make multi line annotation comments work again.
Returns:
(str, str): Tuple of cleaned token, data
"""
annotation_regex = r'[\s\S]*?({})(.*)'
return annotation_regex.format('|'.join(annotation_regexes))
token = token.strip()
data = data.strip()
return token, data
32 changes: 29 additions & 3 deletions docs/writing_annotations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ comments into two parts- the annotation token, and the annotation data.

- Annotation data
Annotation data can either be a simple free text comment that is on the same line as the token, or a choice list.
The choices in a choice list are configured in the configuration file and can be separated by spaces or commas when
used in comments. As such, the choices themselves should not contain spaces or commas.
Free text annotations can span multiple lines, provided all lines after the first
are indented by at least two spaces. The choices in a choice list are configured in
the configuration file and can be separated by spaces or commas when used in
comments. As such, the choices themselves should not contain spaces or commas.

The information below applies to both the Static Search and Django Model Search tools, with the exception that the
Django Model Search only looks in model docstrings.
Expand All @@ -26,7 +28,7 @@ Configuration for a "fun fact" annotation type, denoted by the annotation token
.. code-block:: yaml
annotations:
".. fun_fact::":
".. fun_fact:":
There are no choices given, so this is a free form comment type of annotation. Note the trailing colon! It would be used
in Python like this:
Expand All @@ -51,6 +53,30 @@ When a report is run against this code an entry like this will be generated in t
*Note that the rest of the comment is ignored in the report.*

An annotation can also span multiple lines. For instance:

.. code-block:: python
"""
This function handles setting the price on an item in the database.
.. fun_fact: This code is the only remaining piece of our first commit!
To write long texts, prepend at least two additional spaces at the start
of every line after the first.
"""
This code would result in the following report:

.. code-block:: yaml
- annotation_data: "This code is the only remaining piece of our first commit!\n \
\ To write long texts, prepend at least two additional spaces at the start\n\
\ of every line after the first."
annotation_token: '.. fun_fact:'
filename: foo/bar/something.py
found_by: python
line_number: 1
Configuration for an "async" annotation type, denoted by the annotation token ``.. async:`` and choices denoting the
types of asynchronous processors hooked up to it:

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
.. pii:
This is an annotation that
spans multiple lines and allows developers to
write more extensive docs.
Comment after annotation and being annotated
"""
6 changes: 6 additions & 0 deletions tests/extensions/python_test_files/multiline_indented.pyt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
.. pii: A long description that
spans multiple indented
lines
.. pii_types: id, name
"""
15 changes: 15 additions & 0 deletions tests/extensions/python_test_files/multiline_paragraphs.pyt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
.. pii: This is an annotation that
spans multiple paragraphs.
This allows developers to write even more
extensive docs.
Comment after annotation and being annotated
"""

"""
Docstring
.. pii: Annotation 1 with:
Multi-line and multi-paragraph.
"""
8 changes: 8 additions & 0 deletions tests/extensions/python_test_files/multiline_simple.pyt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""
Docstring
.. pii: A long description that
spans multiple
lines
.. pii_types: id, name
"""
Loading

0 comments on commit a1bad02

Please sign in to comment.