Skip to content

Commit

Permalink
Merge pull request #3 from chmeliik/update-from-upstream
Browse files Browse the repository at this point in the history
Update from upstream
  • Loading branch information
chmeliik authored Jul 27, 2023
2 parents a2f0f40 + 3e71d7c commit 40b200f
Show file tree
Hide file tree
Showing 12 changed files with 239 additions and 17 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,20 @@ This repository contains Gemfile.lock parser vendored from [ScanCode
toolkit](https://github.com/nexB/scancode-toolkit). One of the main goals is to
make sure that RubyGems dependencies are parsed without executing arbitrary Ruby
code.

## Updating

To update the repository based on upstream changes, run `hack/update-from-upstream.sh`.

* for `gemfile_lock.py` and its tests, the script updates everything below the import
block
* for the supporting files, the script updates the existing top-level definitions
  * functions, classes, constants, module docstrings

Inspect the changes carefully and make adjustments as necessary. Make sure the unit
tests are passing.

Port the relevant unit test changes from [`test_analysis.py`][test_analysis.py]
manually; the update script is not smart enough to do that.

[test_analysis.py]: https://github.com/nexB/scancode-toolkit/blob/develop/tests/textcode/test_analysis.py
27 changes: 18 additions & 9 deletions gemlock_parser/analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is a modified version of analysis.py from Scancode, based on work
# of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/textcode/analysis.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/textcode/analysis.py
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
Expand All @@ -26,6 +26,7 @@
def remove_null_bytes(s):
"""
Return a string replacing by a space all null bytes.
There are some rare cases where we can have binary strings that are not
caught early when detecting a file type, but only late at the line level.
This help catch most of these cases.
Expand All @@ -39,6 +40,7 @@ def as_unicode(line):
Try to decode line as Unicode. Try first some default encodings,
then attempt Unicode trans-literation and finally
fall-back to ASCII strings extraction.
TODO: Add file/magic detection, unicodedmanit/BS3/4
"""
if isinstance(line, str):
Expand Down Expand Up @@ -74,18 +76,25 @@ def remove_verbatim_cr_lf_tab_chars(s):
Return a string replacing by a space any verbatim but escaped line endings
and tabs (such as a literal \n or \r \t).
"""
if not s:
return s
return s.replace('\\r', ' ').replace('\\n', ' ').replace('\\t', ' ')


def unicode_text_lines(location):
def unicode_text_lines(location, decrlf=False):
"""
Return an iterable over unicode text lines from a file at `location` if it
contains text. Open the file as binary with universal new lines then try to
decode each line as Unicode.
Yield unicode text lines from a file at ``location`` if it
contains text.
Open the file as binary then try to decode each line as Unicode.
Remove verbatim, escaped CR, LF and tabs if ``decrlf`` is True.
"""
lines = _unicode_text_lines(location)
if decrlf:
return map(remove_verbatim_cr_lf_tab_chars, lines)
else:
return lines


def _unicode_text_lines(location):
with open(location, 'rb') as f:
for line in f.read().splitlines(True):
yield remove_verbatim_cr_lf_tab_chars(as_unicode(line))

yield as_unicode(line)
31 changes: 29 additions & 2 deletions gemlock_parser/gemfile_lock.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is a modified version of gemfile_lock.py from Scancode, based on
# work of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/packagedcode/gemfile_lock.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/packagedcode/gemfile_lock.py
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
Expand Down Expand Up @@ -107,6 +107,7 @@ def logger_debug(*args):
SVN = 'SVN'
GEM = 'GEM'
PLATFORMS = 'PLATFORMS'
BUNDLED = 'BUNDLED WITH'
DEPENDENCIES = 'DEPENDENCIES'
SPECS = ' specs:'

Expand Down Expand Up @@ -338,6 +339,7 @@ def get_option(s):
'$' % locals()).match

PLATS = re.compile('^ (?P<platform>.*)$').match
# Matcher for the indented version line of a BUNDLED WITH section, such as
# '   2.0.1'. The dots are escaped so that only dot-separated numbers match.
BUNDLED_WITH = re.compile(r'^\s+(?P<version>(?:\d+\.)+\d+)\s*$').match


class GemfileLockParser:
Expand All @@ -356,6 +358,7 @@ def __init__(self, lockfile):
self.STATES = {
DEPENDENCIES: self.parse_dependency,
PLATFORMS: self.parse_platform,
BUNDLED: self.parse_bundler_version,
GIT: self.parse_options,
PATH: self.parse_options,
SVN: self.parse_options,
Expand All @@ -366,11 +369,16 @@ def __init__(self, lockfile):
# the final tree of dependencies, keyed by name
self.dependency_tree = {}

# the package that the gemfile.lock is for
self.primary_gem = None

# a flat dict of all gems, keyed by name
self.all_gems = {}

self.platforms = []

self.bundled_with = None

# init parsing state
self.reset_state()

Expand All @@ -397,6 +405,9 @@ def __init__(self, lockfile):
# finally refine the collected data
self.refine()

# set primary gem
self.set_primary_gem()

def reset_state (self):
self.state = None
self.current_options = {}
Expand All @@ -407,6 +418,13 @@ def refine(self):
for gem in self.all_gems.values():
gem.refine()

def set_primary_gem(self):
    """
    Set ``primary_gem`` to the first gem of type PATH found in ``all_gems``.
    Leave ``primary_gem`` untouched when no gem has that type.
    """
    for candidate in self.all_gems.values():
        if candidate.type == PATH:
            self.primary_gem = candidate
            return

def get_or_create(self, name, version=None, platform=None):
"""
Return an existing gem if it exists or creates a new one.
Expand Down Expand Up @@ -521,6 +539,16 @@ def parse_platform(self, line):
plat = plat.group('platform')
self.platforms.append(plat.strip())

def parse_bundler_version(self, line):
    """
    Store in ``bundled_with`` the version found in ``line``.
    Do nothing (beyond an optional trace message) if ``line`` does not match
    the BUNDLED_WITH pattern.
    """
    matched = BUNDLED_WITH(line)
    if matched:
        self.bundled_with = matched.group('version')
        return

    if TRACE:
        # the message is formatted from the ``line`` local through locals()
        logger_debug('ERROR: parse_bundler_version: '
                     'line not matched: %(line)r' % locals())

def flatten(self):
"""
Return the Gems dependency_tree as a sorted list of unique
Expand All @@ -531,4 +559,3 @@ def flatten(self):
flattened.append((None, direct,))
flattened.extend(direct.flatten())
return sorted(set(flattened))

2 changes: 1 addition & 1 deletion gemlock_parser/strings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is a modified version of strings.py from Scancode, based on
# work of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/textcode/strings.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/textcode/strings.py
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
Expand Down
2 changes: 1 addition & 1 deletion gemlock_parser/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This file is a modified version of tokenize.py from Scancode, based on
# work of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/licensedcode/tokenize.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/licensedcode/tokenize.py
#
#
# Copyright (c) nexB Inc. and others. All rights reserved.
Expand Down
138 changes: 138 additions & 0 deletions hack/_update_pyfiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import ast
from pathlib import Path
from typing import Callable, NamedTuple


class ParsedFile(NamedTuple):
    """A Python source file: its text together with its parsed syntax tree."""

    # the text of the file
    source: str
    # the module-level AST obtained by parsing ``source``
    parsed: ast.Module


def update_pyfile(
    our_path: Path,
    their_path: Path,
    how_to_update: Callable[[ParsedFile, ParsedFile], str],
) -> None:
    """
    Rewrite the file at ``our_path`` based on the upstream file at ``their_path``.

    Both files are read and parsed, then passed to ``how_to_update`` as
    ``ParsedFile`` tuples (ours first, theirs second). The source it returns is
    written back to ``our_path``, with a trailing newline appended if missing.

    Files are read and written as UTF-8, the default encoding of Python source,
    instead of whatever the current locale prefers.

    Raise ``SyntaxError`` if either file is not valid Python.
    """
    our_source = our_path.read_text(encoding="utf-8")
    their_source = their_path.read_text(encoding="utf-8")

    # the paths are passed as filenames so that syntax errors point at the file
    our_module = ast.parse(our_source, str(our_path))
    their_module = ast.parse(their_source, str(their_path))

    updated_source = how_to_update(
        ParsedFile(our_source, our_module),
        ParsedFile(their_source, their_module),
    )

    if not updated_source.endswith("\n"):
        updated_source += "\n"

    our_path.write_text(updated_source, encoding="utf-8")


def update_toplevel(our_file: ParsedFile, their_file: ParsedFile) -> str:
    """
    Return our source with each top-level definition replaced by the upstream one.

    Definitions are matched by the names that ``_get_top_level_defs`` assigns.
    Only definitions present in our file are updated; the text between them is
    kept as is.

    Each replacement is spliced in at the exact position of our definition
    instead of searching for its text: the text of a short definition (such as
    ``X = 1``) can also occur inside an unrelated one (such as ``MAX = 1``), and
    a text search would then rewrite the wrong place.

    Raise ``KeyError`` if one of our definitions does not exist upstream.
    """
    import io

    our_defs = _get_top_level_defs(our_file.parsed)
    their_defs = _get_top_level_defs(their_file.parsed)

    # Split into lines the way the Python parser counts them: only on LF, CRLF
    # and CR, with the line endings kept (str.splitlines() would also split on
    # form feeds and other separators and so disagree with AST line numbers).
    our_lines = list(io.StringIO(our_file.source, newline=""))

    # character offset at which each line starts within our source
    line_starts = [0]
    for line in our_lines:
        line_starts.append(line_starts[-1] + len(line))

    def char_offset(lineno: int, byte_col: int) -> int:
        # AST columns are UTF-8 byte offsets, convert them to character offsets
        line = our_lines[lineno - 1]
        n_chars = len(line.encode("utf-8")[:byte_col].decode("utf-8"))
        return line_starts[lineno - 1] + n_chars

    replacements: list[tuple[int, int, str]] = []
    for def_name, our_def in our_defs.items():
        try:
            their_def = their_defs[def_name]
        except KeyError:
            raise KeyError(
                f"top-level definition {def_name!r} does not exist upstream"
            ) from None

        their_src_segment = ast.get_source_segment(their_file.source, their_def)
        assert their_src_segment
        assert our_def.end_lineno is not None and our_def.end_col_offset is not None

        start = char_offset(our_def.lineno, our_def.col_offset)
        end = char_offset(our_def.end_lineno, our_def.end_col_offset)
        replacements.append((start, end, their_src_segment))

    # splice from the end of the file backwards so that earlier offsets stay valid
    updated_source = our_file.source
    for start, end, their_src_segment in sorted(replacements, reverse=True):
        updated_source = (
            updated_source[:start] + their_src_segment + updated_source[end:]
        )

    return updated_source


def _get_top_level_defs(module: ast.Module) -> dict[str, ast.stmt]:
defs: dict[str, ast.stmt] = {}
n_docstrings = 0

for node in module.body:
if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
defs[node.name] = node
# top-level variables (constants)
if isinstance(node, ast.Assign) and isinstance(node.targets[0], ast.Name):
defs[node.targets[0].id] = node
if (
isinstance(node, ast.Expr)
and isinstance(node.value, ast.Constant)
and isinstance(node.value.value, str)
):
n_docstrings += 1
defs[f"__docstring_{n_docstrings}__"] = node

return defs


def update_all_but_imports(our_file: ParsedFile, their_file: ParsedFile) -> str:
    """
    Return our file's head followed by the upstream file's tail.

    The head is every line of our source up to and including the last top-level
    import; the tail is every line of the upstream source after its own last
    top-level import. The pieces are joined with newlines.

    Raise ``ValueError`` if either module has no top-level import.
    """

    def last_import_end(module: ast.Module) -> int:
        # 1-based number of the line on which the last top-level import ends;
        # the unpacking fails with ValueError when there is no import at all
        *_, final_import = (
            stmt
            for stmt in module.body
            if isinstance(stmt, (ast.Import, ast.ImportFrom))
        )
        return final_import.end_lineno or final_import.lineno

    our_cut = last_import_end(our_file.parsed)
    their_cut = last_import_end(their_file.parsed)

    kept_lines = our_file.source.splitlines()[:our_cut]
    upstream_lines = their_file.source.splitlines()[their_cut:]

    return "\n".join([*kept_lines, *upstream_lines])


# (path of our file, path of the upstream file, function that merges the two)
FileToUpdate = tuple[str, str, Callable[[ParsedFile, ParsedFile], str]]


# The files to update, in processing order. Supporting modules only get their
# existing top-level definitions refreshed; gemfile_lock.py and its tests take
# everything below the import block from upstream.
update_matrix: list[FileToUpdate] = [
    (
        "gemlock_parser/analysis.py",
        "scancode-toolkit/src/textcode/analysis.py",
        update_toplevel,
    ),
    (
        "gemlock_parser/strings.py",
        "scancode-toolkit/src/textcode/strings.py",
        update_toplevel,
    ),
    (
        "gemlock_parser/tokenize.py",
        "scancode-toolkit/src/licensedcode/tokenize.py",
        update_toplevel,
    ),
    (
        "gemlock_parser/gemfile_lock.py",
        "scancode-toolkit/src/packagedcode/gemfile_lock.py",
        update_all_but_imports,
    ),
    (
        "tests/test_gemfile_lock.py",
        "scancode-toolkit/tests/packagedcode/test_gemfile_lock.py",
        update_all_but_imports,
    ),
    (
        "tests/scancode_config.py",
        "scancode-toolkit/src/scancode_config.py",
        update_toplevel,
    ),
]


def main() -> None:
    """Update every file listed in ``update_matrix`` and print each updated path."""
    for ours, theirs, strategy in update_matrix:
        update_pyfile(Path(ours), Path(theirs), strategy)
        # one path per line on stdout, printed once the file has been rewritten
        print(ours)


if __name__ == "__main__":
    main()
21 changes: 21 additions & 0 deletions hack/update-from-upstream.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Update the vendored ScanCode files from a local scancode-toolkit checkout:
#   1. clone scancode-toolkit, or pull its develop branch if already cloned
#   2. run hack/_update_pyfiles.py to refresh the Python files
#   3. point the upstream links in the refreshed files at the pulled revision
#   4. copy the upstream Gemfile.lock test data
set -o errexit -o nounset -o pipefail

# Every path below is relative to the repository root, so work from there
# no matter which directory the script was started in.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

if [[ ! -e scancode-toolkit ]]; then
    git clone https://github.com/nexB/scancode-toolkit --depth 1 --single-branch
else
    git -C scancode-toolkit pull origin develop
fi

revision=$(git -C scancode-toolkit rev-parse HEAD)

# _update_pyfiles.py prints the path of each file it rewrote, one per line.
# Only a full 40-character commit hash is replaced, so that a link to a branch
# (e.g. blob/develop, which starts with the hex digits "de") is not mangled.
python hack/_update_pyfiles.py | while read -r updated_file; do
    sed "s|scancode-toolkit/blob/[a-f0-9]\{40\}|scancode-toolkit/blob/$revision|" \
        --in-place "$updated_file"
done

cp -r scancode-toolkit/tests/packagedcode/data/gemfile_lock tests/data/
2 changes: 2 additions & 0 deletions tests/data/gemfile_lock/bundled/Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
BUNDLED WITH
2.0.1
1 change: 1 addition & 0 deletions tests/data/gemfile_lock/bundled/Gemfile.lock.expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
2 changes: 1 addition & 1 deletion tests/scancode_config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is a modified version of scancode_config.py from Scancode, based on work
# of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/src/scancode_config.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/src/scancode_config.py
#
#
# Copyright (c) nexB Inc. and others. All rights reserved.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is a modified version of test_analysis.py from Scancode, based on work
# of nexB Inc. and others. See original file at
# https://github.com/nexB/scancode-toolkit/blob/aba31126dcb3ab57f2b885090f7145f69b67351a/tests/textcode/test_analysis.py
# https://github.com/nexB/scancode-toolkit/blob/a15174f31efaf8816e8c9a65c9f85c4beffc0227/tests/textcode/test_analysis.py
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
Expand Down
Loading

0 comments on commit 40b200f

Please sign in to comment.