Merge pull request #3 from chmeliik/update-from-upstream

Update from upstream
containerbuildsystem · Jul 27, 2023 · 40b200f · 40b200f
2 parents a2f0f40 + 3e71d7c
commit 40b200f
Show file tree

Hide file tree

Showing 12 changed files with 239 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -4,3 +4,20 @@ This repository contains Gemfile.lock parser vendored from [ScanCode
 toolkit](https://github.com/nexB/scancode-toolkit). One of the main goals is to
 make sure that RubyGems dependencies are parsed without executing arbitrary Ruby
 code.
+
+## Updating
+
+To update the repository based on upstream changes, run `hack/update-from-upstream.sh`.
+
+* for `gemfile_lock.py` and its tests, the script updates everything below the import
+  block
+* for the supporting files, the script updates the existing top-level definitions
+  * functions, classes, constants, module docstrings
+
+Inspect the changes carefully and make adjustments as necessary. Make sure the unit
+tests are passing.
+
+Port the relevant unit test changes from [`test_analysis.py`][test_analysis.py]
+manually; the update script is not smart enough to do that.
+
+[test_analysis.py]: https://github.com/nexB/scancode-toolkit/blob/develop/tests/textcode/test_analysis.py
diff --git a/gemlock_parser/analysis.py b/gemlock_parser/analysis.py
@@ -1,6 +1,6 @@
 # This file is a modified version of analysis.py from Scancode, based on work
 # of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/textcode/analysis.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/textcode/analysis.py
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # ScanCode is a trademark of nexB Inc.
@@ -26,6 +26,7 @@
 def remove_null_bytes(s):
     """
     Return a string replacing by a space all null bytes.
+
     There are some rare cases where we can have binary strings that are not
     caught early when detecting a file type, but only late at the line level.
     This help catch most of these cases.
@@ -39,6 +40,7 @@ def as_unicode(line):
     Try to decode line as Unicode. Try first some default encodings,
     then attempt Unicode trans-literation and finally
     fall-back to ASCII strings extraction.
+
     TODO: Add file/magic detection, unicodedmanit/BS3/4
     """
     if isinstance(line, str):
@@ -74,18 +76,25 @@ def remove_verbatim_cr_lf_tab_chars(s):
     Return a string replacing by a space any verbatim but escaped line endings
     and tabs (such as a literal \n or \r \t).
     """
-    if not s:
-        return s
     return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')
 
 
-def unicode_text_lines(location):
+def unicode_text_lines(location, decrlf=False):
     """
-    Return an iterable over unicode text lines from a file at `location` if it
-    contains text. Open the file as binary with universal new lines then try to
-    decode each line as Unicode.
+    Return an iterator over the unicode text lines of the file at
+    ``location``.
+
+    The file is opened as binary and each line is decoded as Unicode.
+    When ``decrlf`` is True, verbatim escaped CR, LF and tab sequences found
+    in each line are replaced by a space.
     """
+    decoded_lines = _unicode_text_lines(location)
+    if not decrlf:
+        return decoded_lines
+    return map(remove_verbatim_cr_lf_tab_chars, decoded_lines)
+
+
+def _unicode_text_lines(location):
+    """
+    Yield each line of the file at ``location`` decoded with ``as_unicode``.
+
+    The file is opened in binary mode, read in one go and split into lines
+    with their line endings kept.
+    """
     with open(location, 'rb') as f:
         for line in f.read().splitlines(True):
-            yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))
-
+            yield as_unicode(line)
diff --git a/gemlock_parser/gemfile_lock.py b/gemlock_parser/gemfile_lock.py
@@ -1,6 +1,6 @@
 # This file is a modified version of gemfile_lock.py from Scancode, based on
 # work of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/packagedcode/gemfile_lock.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/packagedcode/gemfile_lock.py
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # ScanCode is a trademark of nexB Inc.
@@ -107,6 +107,7 @@ def logger_debug(*args):
 SVN = 'SVN'
 GEM = 'GEM'
 PLATFORMS = 'PLATFORMS'
+BUNDLED = 'BUNDLED WITH'
 DEPENDENCIES = 'DEPENDENCIES'
 SPECS = '  specs:'
 
@@ -338,6 +339,7 @@ def get_option(s):
     '$' % locals()).match
 
 PLATS = re.compile('^  (?P<platform>.*)$').match
+# Matches the indented version line of a "BUNDLED WITH" section, e.g. "   2.0.1".
+# The dot between version segments is escaped so only a literal "." separates
+# them, and the pattern is a raw string so "\s" and "\d" are not treated as
+# (invalid) Python string escapes.
+BUNDLED_WITH = re.compile(r'^\s+(?P<version>(?:\d+\.)+\d+)\s*$').match
 
 
 class GemfileLockParser:
@@ -356,6 +358,7 @@ def __init__(self, lockfile):
         self.STATES = {
             DEPENDENCIES: self.parse_dependency,
             PLATFORMS: self.parse_platform,
+            BUNDLED: self.parse_bundler_version,
             GIT: self.parse_options,
             PATH: self.parse_options,
             SVN: self.parse_options,
@@ -366,11 +369,16 @@ def __init__(self, lockfile):
         # the final tree of dependencies, keyed by name
         self.dependency_tree = {}
 
+        # the package that the gemfile.lock is for
+        self.primary_gem = None
+
         # a flat dict of all gems, keyed by name
         self.all_gems = {}
 
         self.platforms = []
 
+        self.bundled_with = None
+
         # init parsing state
         self.reset_state()
 
@@ -397,6 +405,9 @@ def __init__(self, lockfile):
         # finally refine the collected data
         self.refine()
 
+        # set primary gem
+        self.set_primary_gem()
+
     def reset_state (self):
         self.state = None
         self.current_options = {}
@@ -407,6 +418,13 @@ def refine(self):
         for gem in self.all_gems.values():
             gem.refine()
 
+    def set_primary_gem(self):
+        """
+        Set ``primary_gem`` to the first gem of ``all_gems`` whose type is
+        PATH. Leave ``primary_gem`` untouched when there is no such gem.
+        """
+        path_gems = (
+            candidate
+            for candidate in self.all_gems.values()
+            if candidate.type == PATH
+        )
+        first_path_gem = next(path_gems, None)
+        if first_path_gem is not None:
+            self.primary_gem = first_path_gem
+
+
     def get_or_create(self, name, version=None, platform=None):
         """
         Return an existing gem if it exists or creates a new one.
@@ -521,6 +539,16 @@ def parse_platform(self, line):
         plat = plat.group('platform')
         self.platforms.append(plat.strip())
 
+    def parse_bundler_version(self, line):
+        """
+        Store in ``bundled_with`` the version captured by BUNDLED_WITH from
+        ``line``. Do nothing when the line does not match, apart from a debug
+        trace when TRACE is enabled.
+        """
+        matched = BUNDLED_WITH(line)
+        if matched:
+            self.bundled_with = matched.group('version')
+            return
+        if TRACE:
+            logger_debug('ERROR: parse_bundler_version: '
+                  'line not matched: %(line)r' % locals())
+
     def flatten(self):
         """
         Return the Gems dependency_tree as a sorted list of unique
@@ -531,4 +559,3 @@ def flatten(self):
             flattened.append((None, direct,))
             flattened.extend(direct.flatten())
         return sorted(set(flattened))
-
diff --git a/gemlock_parser/strings.py b/gemlock_parser/strings.py
@@ -1,6 +1,6 @@
 # This file is a modified version of strings.py from Scancode, based on
 # work of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/textcode/strings.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/textcode/strings.py
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # ScanCode is a trademark of nexB Inc.

diff --git a/gemlock_parser/tokenize.py b/gemlock_parser/tokenize.py
@@ -2,7 +2,7 @@
 #
 # This file is a modified version of tokenize.py from Scancode, based on
 # work of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/licensedcode/tokenize.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/licensedcode/tokenize.py
 #
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.

diff --git a/hack/_update_pyfiles.py b/hack/_update_pyfiles.py
@@ -0,0 +1,138 @@
+import ast
+from pathlib import Path
+from typing import Callable, NamedTuple
+
+
+class ParsedFile(NamedTuple):
+    """A Python file's source text together with the AST parsed from it."""
+
+    # full text of the file, as read from disk
+    source: str
+    # result of ast.parse() on that text
+    parsed: ast.Module
+
+
+def update_pyfile(
+    our_path: Path,
+    their_path: Path,
+    how_to_update: Callable[[ParsedFile, ParsedFile], str],
+) -> None:
+    """
+    Rewrite the file at ``our_path`` using the upstream file at ``their_path``.
+
+    Both files are read and parsed, then ``how_to_update`` is called with
+    (ours, theirs) and must return the new source text for our file. The
+    result is written back to ``our_path`` with a trailing newline ensured.
+
+    Raises SyntaxError (from ``ast.parse``) if either file is not valid Python.
+    """
+    # Read and write as UTF-8 explicitly: the default for read_text/write_text
+    # is the locale encoding, which is not necessarily UTF-8.
+    our_source = our_path.read_text(encoding="utf-8")
+    their_source = their_path.read_text(encoding="utf-8")
+
+    our_module = ast.parse(our_source, str(our_path))
+    their_module = ast.parse(their_source, str(their_path))
+
+    updated_source = how_to_update(
+        ParsedFile(our_source, our_module),
+        ParsedFile(their_source, their_module),
+    )
+
+    if not updated_source.endswith("\n"):
+        updated_source += "\n"
+
+    our_path.write_text(updated_source, encoding="utf-8")
+
+
+def update_toplevel(our_file: ParsedFile, their_file: ParsedFile) -> str:
+    """
+    Return our source with each of our top-level definitions replaced by the
+    upstream definition of the same name.
+
+    Definitions are collected and named by ``_get_top_level_defs``. A
+    definition of ours that has no upstream counterpart is kept as is and
+    reported on stderr. Definitions that exist only upstream are not added.
+
+    Raises ValueError if the source text of a definition cannot be extracted.
+    """
+    import sys
+
+    our_defs = _get_top_level_defs(our_file.parsed)
+    their_defs = _get_top_level_defs(their_file.parsed)
+
+    updated_source = our_file.source
+
+    for def_name, our_def in our_defs.items():
+        their_def = their_defs.get(def_name)
+        if their_def is None:
+            # Local-only definition: keep ours. Report on stderr only, because
+            # main() prints the updated file paths on stdout.
+            print(
+                f"warning: {def_name!r} not found upstream, keeping ours",
+                file=sys.stderr,
+            )
+            continue
+
+        our_src_segment = ast.get_source_segment(our_file.source, our_def)
+        their_src_segment = ast.get_source_segment(their_file.source, their_def)
+        if not our_src_segment or not their_src_segment:
+            raise ValueError(f"cannot extract the source text of {def_name!r}")
+
+        # NOTE: this replaces the first textual occurrence of our segment in
+        # the progressively updated source.
+        updated_source = updated_source.replace(
+            our_src_segment, their_src_segment, 1
+        )
+
+    return updated_source
+
+
+def _get_top_level_defs(module: ast.Module) -> dict[str, ast.stmt]:
+    """
+    Map a name to each top-level definition found in ``module``.
+
+    Functions and classes are keyed by their name, assignments by the name of
+    their first target (when that target is a plain name), and bare string
+    expressions by ``__docstring_<n>__`` where n counts them from 1 in order
+    of appearance. A later definition with the same key replaces an earlier
+    one.
+    """
+    found: dict[str, ast.stmt] = {}
+    docstring_count = 0
+
+    for stmt in module.body:
+        if isinstance(stmt, (ast.FunctionDef, ast.ClassDef)):
+            found[stmt.name] = stmt
+        elif isinstance(stmt, ast.Assign):
+            # top-level variables (constants); only a plain-name first target counts
+            first_target = stmt.targets[0]
+            if isinstance(first_target, ast.Name):
+                found[first_target.id] = stmt
+        elif isinstance(stmt, ast.Expr):
+            value = stmt.value
+            if isinstance(value, ast.Constant) and isinstance(value.value, str):
+                docstring_count += 1
+                found[f"__docstring_{docstring_count}__"] = stmt
+
+    return found
+
+
+def update_all_but_imports(our_file: ParsedFile, their_file: ParsedFile) -> str:
+    """
+    Return our file up to and including its last top-level import, followed
+    by everything that comes after the last top-level import upstream.
+
+    Lines are joined with "\\n" and the result carries no trailing newline.
+
+    Raises ValueError if either file has no top-level import statement, since
+    there is then no import block to split on.
+    """
+
+    def last_import_endline(module: ast.Module) -> int:
+        # 1-based number of the last line of the last top-level import
+        imports = [
+            node
+            for node in module.body
+            if isinstance(node, (ast.Import, ast.ImportFrom))
+        ]
+        if not imports:
+            raise ValueError(
+                "no top-level import statement found, "
+                "cannot tell where the import block ends"
+            )
+        last_import = imports[-1]
+        return last_import.end_lineno or last_import.lineno
+
+    our_end = last_import_endline(our_file.parsed)
+    their_end = last_import_endline(their_file.parsed)
+
+    our_imports = our_file.source.splitlines()[:our_end]
+    their_code = their_file.source.splitlines()[their_end:]
+
+    return "\n".join(our_imports + their_code)
+
+
+# One entry of the update matrix:
+# (path of our file, path of the upstream file, update strategy).
+FileToUpdate = tuple[str, str, Callable[[ParsedFile, ParsedFile], str]]
+
+
+# Files to refresh from upstream and how to refresh each of them. Paths are
+# relative; main() passes them to Path() unchanged, so they resolve against
+# the current working directory.
+update_matrix: list[FileToUpdate] = [
+    (
+        "gemlock_parser/analysis.py",
+        "scancode-toolkit/src/textcode/analysis.py",
+        update_toplevel,
+    ),
+    (
+        "gemlock_parser/strings.py",
+        "scancode-toolkit/src/textcode/strings.py",
+        update_toplevel,
+    ),
+    (
+        "gemlock_parser/tokenize.py",
+        "scancode-toolkit/src/licensedcode/tokenize.py",
+        update_toplevel,
+    ),
+    (
+        "gemlock_parser/gemfile_lock.py",
+        "scancode-toolkit/src/packagedcode/gemfile_lock.py",
+        update_all_but_imports,
+    ),
+    (
+        "tests/test_gemfile_lock.py",
+        "scancode-toolkit/tests/packagedcode/test_gemfile_lock.py",
+        update_all_but_imports,
+    ),
+    (
+        "tests/scancode_config.py",
+        "scancode-toolkit/src/scancode_config.py",
+        update_toplevel,
+    ),
+]
+
+
+def main() -> None:
+    """
+    Update every file listed in ``update_matrix`` and print the path of each
+    one on stdout once it has been rewritten.
+    """
+    for entry in update_matrix:
+        ours, theirs, strategy = entry
+        update_pyfile(Path(ours), Path(theirs), strategy)
+        print(ours)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hack/update-from-upstream.sh b/hack/update-from-upstream.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Refresh the vendored ScanCode files from a local clone of scancode-toolkit.
+set -o errexit -o nounset -o pipefail
+
+# Every path below is relative to the repository root (the parent of hack/),
+# so move there first; the script then works from any starting directory.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+# Get the upstream sources: clone on first use, otherwise pull develop.
+if [[ ! -e scancode-toolkit ]]; then
+    git clone https://github.com/nexB/scancode-toolkit --depth 1 --single-branch
+else
+    git -C scancode-toolkit pull origin develop
+fi
+
+revision=$(git -C scancode-toolkit rev-parse HEAD)
+
+# _update_pyfiles.py prints the path of each file it rewrote, one per line;
+# point the "original file" URL in each of them at the revision just used.
+python hack/_update_pyfiles.py | while read -r updated_file; do
+    sed "s|scancode-toolkit/blob/[a-f0-9]*|scancode-toolkit/blob/$revision|" \
+        --in-place "$updated_file"
+done
+
+cp -r scancode-toolkit/tests/packagedcode/data/gemfile_lock tests/data/
diff --git a/tests/data/gemfile_lock/bundled/Gemfile.lock b/tests/data/gemfile_lock/bundled/Gemfile.lock
@@ -0,0 +1,2 @@
+BUNDLED WITH
+   2.0.1
diff --git a/tests/data/gemfile_lock/bundled/Gemfile.lock.expected.json b/tests/data/gemfile_lock/bundled/Gemfile.lock.expected.json
@@ -0,0 +1 @@
+[]
diff --git a/tests/scancode_config.py b/tests/scancode_config.py
@@ -1,6 +1,6 @@
 # This file is a modified version of scancode_config.py from Scancode, based on work
 # of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/scancode_config.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/scancode_config.py
 #
 # 
 # Copyright (c) nexB Inc. and others. All rights reserved.

diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -1,6 +1,6 @@
 # This file is a modified version of test_analysis.py from Scancode, based on work
 # of nexB Inc. and others. See original file at
-# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/tests/textcode/test_analysis.py
+# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/tests/textcode/test_analysis.py
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # ScanCode is a trademark of nexB Inc.