diff --git a/README.md b/README.md index f634a1d1..3bddb220 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Software Metadata Extraction Framework (SOMEF) +# Software Metadata Extraction Framework (SOMEF) +[![Documentation Status](https://readthedocs.org/projects/somef/badge/?version=latest)](https://somef.readthedocs.io/en/latest/?badge=latest) [![Python](https://img.shields.io/pypi/pyversions/somef.svg?style=plastic)](https://badge.fury.io/py/somef) [![PyPI](https://badge.fury.io/py/somef.svg)](https://badge.fury.io/py/somef) [![DOI](https://zenodo.org/badge/190487675.svg)](https://zenodo.org/badge/latestdoi/190487675) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) logo diff --git a/src/somef/__main__.py b/src/somef/__main__.py index 0001bf94..0bf50873 100644 --- a/src/somef/__main__.py +++ b/src/somef/__main__.py @@ -17,10 +17,6 @@ class URLParamType(click.types.StringParamType): @click.version_option(__version__) def cli(): click.echo("SOftware Metadata Extraction Framework (SOMEF) Command Line Interface") - # Logging setup - logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s', - datefmt='%d-%b-%y %H:%M:%S') - # logging.getLogger("bibtexparser").setLevel(logging.WARNING) @cli.command(help="Configure GitHub credentials and classifiers file path") diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index f01ea8bb..223c3846 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -1,4 +1,3 @@ -import base64 import logging import os import zipfile @@ -11,6 +10,7 @@ from . import configuration from .process_results import Result + # Constructs a template HTTP header, which: # - has a key for the authorization token if passed via the authorization argument, otherwise # - has a key for the authorization token if specified via config, otherwise @@ -39,8 +39,9 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs): rate_limit_remaining = response.headers["X-RateLimit-Remaining"] epochtime = int(response.headers["X-RateLimit-Reset"]) date_reset = datetime.fromtimestamp(epochtime) - logging.info("Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str( - date_reset)) + logging.info( + "Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str( + date_reset)) if 'message' in response and 'API rate limit exceeded' in response['message']: rate_limited = True logging.warning(f"rate limited. Backing off for {initial_backoff} seconds") @@ -324,7 +325,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, return None logging.info(f"Loading Repository {repository_url} Information....") - + # Create template header with optional authorization token header = header_template(authorization) header['accept'] = constants.GITHUB_ACCEPT_HEADER @@ -438,7 +439,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, # get releases releases_list_raw, date = rate_limit_get(repo_api_base_url + "/releases", - headers=header) + headers=header) releases_list = releases_list_raw.json() if isinstance(releases_list, dict) and 'message' in releases_list.keys(): logging.error("Releases Error: " + releases_list['message']) @@ -492,7 +493,8 @@ def do_crosswalk(data, crosswalk_table): return output -def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, authorization=None): +def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, + authorization=None): """ Given a repository, this method will download its files and return the readme text Parameters @@ -543,7 +545,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization): logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}") repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip" logging.info(f"Trying to download {repo_archive_url}") - repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) + repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) if repo_download.status_code != 200: sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}") @@ -593,6 +595,7 @@ class GithubUrlError(Exception): # print("The URL provided seems to be incorrect") pass + def get_readme_content(readme_url): """Function to retrieve the content of a readme file given its URL (github)""" readme_url = readme_url.replace("/blob/", "/raw/") diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py index 97bd66e1..1d0df072 100644 --- a/src/somef/regular_expressions.py +++ b/src/somef/regular_expressions.py @@ -476,30 +476,33 @@ def extract_bibtex(readme_text, repository_metadata: Result, readme_source) -> R ------- @returns Result object with the bibtex associated with this software component """ - bib_database = bibtexparser.loads(readme_text) - entries = bib_database.entries - for entry in entries: - # dumping the found fields does not seem to work, so rebuilding the object: - exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n" - for key, value in entry.items(): - if key not in ('ENTRYTYPE', 'ID'): - exported_bibtex += f" {key} = {{{value}}},\n" - exported_bibtex += "}" - result = { - constants.PROP_VALUE: exported_bibtex, - constants.PROP_TYPE: constants.TEXT_EXCERPT, - constants.PROP_FORMAT: constants.FORMAT_BIB - } - if constants.PROP_DOI in entry: - result[constants.PROP_DOI] = entry[constants.PROP_DOI] - if constants.PROP_TITLE in entry: - result[constants.PROP_TITLE] = entry[constants.PROP_TITLE] - if constants.PROP_AUTHOR in entry: - result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR] - if constants.PROP_URL in entry: - result[constants.PROP_URL] = entry[constants.PROP_URL] - repository_metadata.add_result(constants.CAT_CITATION, result, 1, - constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source) + try: + bib_database = bibtexparser.loads(readme_text) + entries = bib_database.entries + for entry in entries: + # dumping the found fields does not seem to work, so rebuilding the object: + exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n" + for key, value in entry.items(): + if key not in ('ENTRYTYPE', 'ID'): + exported_bibtex += f" {key} = {{{value}}},\n" + exported_bibtex += "}" + result = { + constants.PROP_VALUE: exported_bibtex, + constants.PROP_TYPE: constants.TEXT_EXCERPT, + constants.PROP_FORMAT: constants.FORMAT_BIB + } + if constants.PROP_DOI in entry: + result[constants.PROP_DOI] = entry[constants.PROP_DOI] + if constants.PROP_TITLE in entry: + result[constants.PROP_TITLE] = entry[constants.PROP_TITLE] + if constants.PROP_AUTHOR in entry: + result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR] + if constants.PROP_URL in entry: + result[constants.PROP_URL] = entry[constants.PROP_URL] + repository_metadata.add_result(constants.CAT_CITATION, result, 1, + constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source) + except Exception as e: + logging.warning("An error occurred when trying to extract bibtex from README " + str(e)) return repository_metadata diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 00d56383..6c281b45 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -37,6 +37,11 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc ------- @return: Dictionary with the results found by SOMEF, formatted as a Result object. """ + # Set up logging + logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s', + datefmt='%d-%b-%y %H:%M:%S', force=True) + logging.getLogger("bibtexparser").setLevel(logging.ERROR) + file_paths = configuration.get_configuration_file() repo_type = constants.RepositoryType.GITHUB repository_metadata = Result() @@ -152,8 +157,6 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc logging.info("Completed extracting regular expressions") return repository_metadata - - except Exception as e: logging.error("Error processing repository " + str(e)) return repository_metadata