From 6ded7c0a599e31f814ffe54ce438c01453d85906 Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Thu, 11 Jul 2024 13:40:49 -0400 Subject: [PATCH 01/17] add gitlab option --- iga/cli.py | 17 +++++++ iga/gitlab.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 iga/gitlab.py diff --git a/iga/cli.py b/iga/cli.py index 22990e3..6601c42 100644 --- a/iga/cli.py +++ b/iga/cli.py @@ -132,6 +132,10 @@ def _read_github_token(ctx, param, value): return _read_param_value(ctx, param, value, 'GITHUB_TOKEN', 'GitHub personal access token', required=False) +def _read_gitlab_token(ctx, param, value): + '''Read the file and set the environment variable GITLAB_TOKEN.''' + return _read_param_value(ctx, param, value, 'GITLAB_TOKEN', + 'GitLab personal access token', required=False) def _read_invenio_token(ctx, param, value): '''Read the file and set the environment variable INVENIO_TOKEN.''' @@ -377,6 +381,19 @@ def _list_communities(ctx, param, value): # @click.option('--github-token', '-t', metavar='STR', callback=_read_github_token, help="GitHub acccess token (**avoid – use variable**)") + +# +@click.option('--gitlab-projectid', '-gp', metavar='STR', + help='Gilab project ID (The ID or NAMESPACE/PROJECT_PATH)') + +# +@click.option('--gitlab-repo', '-gr', 'grepo', metavar='STR', + help='GitLab repository name, if not using release URL') + +# +@click.option('--github-token', '-gt', metavar='STR', callback=_read_gitlab_token, + help="GitHub acccess token (**avoid – use variable**)") + # @click.help_option('--help', '-h', help='Show this help message and exit') # diff --git a/iga/gitlab.py b/iga/gitlab.py new file mode 100644 index 0000000..d753644 --- /dev/null +++ b/iga/gitlab.py @@ -0,0 +1,131 @@ +import commonpy.exceptions +from commonpy.network_utils import net +import contextlib +from functools import cache +import json +import os +from sidetrack import log +from types import SimpleNamespace +import requests + +from iga.exceptions import GitHubError, InternalError + +_BOT_NAME_WORDS = ['daemon', 'dependabot', 'dependabot[bot]'] +'''List of words such that, if one of the words is the last word in an account +name, mean the account will be assumed to be a software bot of some kind.''' + +class GitLabAPIError(Exception): + pass + +def _gitlab_get(endpoint, test_only=False): + headers = {'Accept': 'application/json'} + using_token = 'GITLAB_TOKEN' in os.environ + if using_token: + headers['Authorization'] = f'token {os.environ["GITLAB_TOKEN"]}' + method = 'head' if test_only else 'get' + try: + if method == 'HEAD': + response = requests.head(endpoint, headers=headers) + else: + response = requests.get(endpoint, headers=headers) + + if response.status_code == 401: + raise GitLabAPIError(f"Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}") + elif response.status_code == 429: + # Too Many Requests error + raise GitLabAPIError(f"Too Many Requests: Rate limit exceeded. Try again later. Endpoint: {endpoint}") + return response + + except requests.exceptions.RequestException as e: + # Handle connection errors or timeouts + raise GitLabAPIError(f"Request failed: {e}") from e + +@cache +def _object_for_gitlab(api_url, cls): + '''Return object of class cls made from the data obtained from the API url.''' + try: + response = _gitlab_get(api_url) + if not response: + return None + + log(f'unpacking JSON into object structure from {api_url}') + + # Create the desired object & add the api url in case it's needed later. + obj = cls(response.json()) + obj.api_url = api_url + return obj + + except GitLabAPIError as e: + # Handle GitLab API specific errors + log(f'GitLab API Error: {e}') + raise InternalError('Encountered error trying to unpack GitLab data.') from e + + except Exception as ex: + # Handle other unexpected errors + log(f'Error: {ex}') + raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex + + +class GitLabAccount(SimpleNamespace): + '''Simple data structure corresponding to a GitHub user or org account.''' + def __init__(self, user_dict): + super().__init__(**user_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub user data: ' + json.dumps(user_dict, indent=2)) + # Save the original data for debugging purposes. + self._json_dict = user_dict + +class GitLabAsset(SimpleNamespace): + '''Simple data structure corresponding to a GitHub file asset JSON object.''' + def __init__(self, asset_dict): + super().__init__(**asset_dict) + +class GitLabRelease(SimpleNamespace): + '''Simple data structure corresponding to a GitHub release JSON object.''' + def __init__(self, release_dict): + super().__init__(**release_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub release data: ' + json.dumps(release_dict, indent=2)) + self.author = GitLabAccount(release_dict['author']) + + # ... then convert the dict of the asset (which contains uploader). + self.assets = [GitLabAsset(asset) for asset in self.assets] + # Save the original data for debugging purposes. + self._json_dict = release_dict + + + +def gitlab_release(project_url, test_only=False): + endpoint = project_url # Assuming project_url points to the release endpoint + if test_only: + log('testing for existence: ' + endpoint) + return _gitlab_get(endpoint, test_only) + + log('getting GitLab release data from ' + endpoint) + return _object_for_gitlab(endpoint, GitLabRelease) + +def gitlab_release_assets(project_url, get_all): + '''Return a list of URLs for all the assets associated with the release.''' + + release = gitlab_release(project_url) + sources = release.assets.sources + assets = [] + for source in sources: + if not get_all: + if source.format in ['zip']: + assets.append(source.url) + else: + log('option to get all assets is in effect') + assets.append(source.url) + return assets + +def github_repo_file(repo, tag_name, filename): + '''Return the text contents of the named file in the repo object. + + The tag_name must be a release tag, and is used to find the version of + the repository corresponding to that tag. + ''' + #https://code.jlab.org/api/v4/projects/31/repository/files/Pipfile/raw?ref=0.1.0 + + +# /projects/:id/repository/files/:file_path \ No newline at end of file From 77b1a3939412d5e23be699236631697af9a714de Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Fri, 12 Jul 2024 13:51:14 -0400 Subject: [PATCH 02/17] add gitlab --- iga/cli.py | 86 +++++++++++++++++++----------- iga/github.py | 2 +- iga/githublab.py | 49 +++++++++++++++++ iga/gitlab.py | 124 +++++++++++++++++++++++++++++++++++++++--- iga/invenio.py | 13 +++-- iga/metadata.py | 136 +++++++++++++++++++---------------------------- 6 files changed, 282 insertions(+), 128 deletions(-) create mode 100644 iga/githublab.py diff --git a/iga/cli.py b/iga/cli.py index 6601c42..f995328 100644 --- a/iga/cli.py +++ b/iga/cli.py @@ -21,12 +21,12 @@ from iga import __version__ from iga.exit_codes import ExitCode from iga.exceptions import GitHubError, InvenioRDMError, RecordNotFound -from iga.github import ( - github_account_repo_tag, - github_release, - github_release_assets, - valid_github_release_url, + +from iga.githublab import ( + valid_release_url, + git_release_assets, ) + from iga.id_utils import is_inveniordm_id from iga.invenio import ( invenio_api_available, @@ -39,7 +39,6 @@ invenio_upload, ) - # Main command-line interface. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -383,16 +382,14 @@ def _list_communities(ctx, param, value): help="GitHub acccess token (**avoid – use variable**)") # -@click.option('--gitlab-projectid', '-gp', metavar='STR', - help='Gilab project ID (The ID or NAMESPACE/PROJECT_PATH)') - +@click.option('--gitlab', is_flag=True, help='Use GitLab mode') # -@click.option('--gitlab-repo', '-gr', 'grepo', metavar='STR', - help='GitLab repository name, if not using release URL') - +@click.option('--gitlab-url', '-gu', metavar='STR', + help='GiLab base url (like https://gitlab.com or https://code.jlab.org)') # -@click.option('--github-token', '-gt', metavar='STR', callback=_read_gitlab_token, - help="GitHub acccess token (**avoid – use variable**)") +@click.option('--gitlab-projectid', '-gp', metavar='STR', + help='GiLab project ID (The ID or NAMESPACE/PROJECT_PATH)') + # @click.help_option('--help', '-h', help='Show this help message and exit') @@ -438,7 +435,8 @@ def _list_communities(ctx, param, value): @click.argument('url_or_tag', required=True) @click.pass_context def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, - files_to_upload=None, account=None, repo=None, github_token=None, + files_to_upload=None, account=None, repo=None, github_token=None, + gitlab_projectid=None, gitlab=False, gitlab_url=None, print_doi=False, server=None, invenio_token=None, list_communities=False, open_in_browser=False, log_dest=None, mode='normal', parent_id=None, all_metadata=False, source=None, dest=None, timeout=None, @@ -650,6 +648,9 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, 7 = an exception or fatal error occurred ''' # Process arguments & handle early exits .................................. + ctx.ensure_object(dict) + ctx.obj['gitlab'] = gitlab + ctx.obj['gitlab_url'] = gitlab_url if url_or_tag == 'help': # Detect if the user typed "help" without dashes. _print_help_and_exit(ctx) @@ -658,21 +659,46 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, _alert(ctx, 'The use of a URL and the use of options `--account`' " and `--repo` are mutually exclusive; can't use both.") sys.exit(int(ExitCode.bad_arg)) - elif not valid_github_release_url(url_or_tag): - _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + elif not valid_release_url(url_or_tag, gitlab): + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) + + elif not gitlab: + if not all([account, repo, url_or_tag]): + _alert(ctx, 'When not using a release URL, all of the following must be' + ' provided: the options `--account`, `--repo`, and a tag name.') sys.exit(int(ExitCode.bad_arg)) - else: - account, repo, tag = github_account_repo_tag(url_or_tag) - elif not all([account, repo, url_or_tag]): - _alert(ctx, 'When not using a release URL, all of the following must be' - ' provided: the options `--account`, `--repo`, and a tag name.') - sys.exit(int(ExitCode.bad_arg)) - else: tag = url_or_tag + url_or_tag = f'https://api.github.com/{account}/repos/{repo}/releases/tags/{tag}' + if not valid_release_url(url_or_tag, gitlab): + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) - if not github_release(account, repo, tag, test_only=True): - _alert(ctx, f'There does not appear to be a release **{tag}** in' - f' repository **{repo}** of account **{account}**.') + elif gitlab: + if not (all([gitlab_url, gitlab_projectid, url_or_tag]) or all(gitlab_url, account, repo, url_or_tag)): + _alert(ctx, 'When using GitLab, all of the following must be' + ' provided: the options `--gitlab-url` and `--gitlab-projectid`. or `--gitlab-url` and `--gitlab-url , --account , --repo ') + sys.exit(int(ExitCode.bad_arg)) + if all([gitlab_url, gitlab_projectid, url_or_tag]): + tag = url_or_tag + url_or_tag = f'{gitlab_url}/api/v4/projects/{gitlab_projectid}/releases/{tag}' + if not valid_release_url(url_or_tag, gitlab): + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) + elif all([gitlab_url, account, repo, url_or_tag]): + tag = url_or_tag + gitlab_projectid = f'{account}%2F{repo}' + url_or_tag = f'{gitlab_url}/api/v4/projects/{gitlab_projectid}/releases/{tag}' + if not valid_release_url(url_or_tag, gitlab): + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) + else: + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) + + repo_name = gitlab_projectid + else: + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) if files_to_upload and all_assets: @@ -699,7 +725,7 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, exit_code = ExitCode.success try: record = None - github_assets = [] + release_assets = [] if source: _inform(f'Using {source.name} instead of building a record.') metadata = metadata_from_file(source) @@ -709,7 +735,7 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, else: _inform(f'Building record for {account}/{repo} release "{tag}"', end='...') metadata = metadata_for_release(account, repo, tag, all_metadata) - github_assets = github_release_assets(account, repo, tag, all_assets) + release_assets = git_release_assets(repo, tag, account, all_assets) _inform(' done.') if dest: @@ -729,7 +755,7 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, _inform(' done.') _inform('Attaching assets:') - for item in files_to_upload or github_assets: + for item in files_to_upload or release_assets: invenio_upload(record, item, _print_text) if draft: diff --git a/iga/github.py b/iga/github.py index d41708e..4248867 100644 --- a/iga/github.py +++ b/iga/github.py @@ -98,7 +98,7 @@ class GitHubFile(SimpleNamespace): def __init__(self, file_dict): super().__init__(**file_dict) - + # Exported module functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/iga/githublab.py b/iga/githublab.py new file mode 100644 index 0000000..76ee74a --- /dev/null +++ b/iga/githublab.py @@ -0,0 +1,49 @@ +import rich_click as click + +from iga.github import ( + github_account_repo_tag, + github_release, + github_repo, + github_release_assets, + valid_github_release_url, + github_repo_file, +) +from iga.gitlab import ( + valid_gitlab_release_url, + gitlab_release_assets, + gitlab_repo, + gitlab_repo_file, + gitlab_release, +) +ctx = click.get_current_context() +GITLAB = ctx.obj.get('gitlab', False) + +def valid_release_url(release_url): + if not GITLAB: + return valid_github_release_url(release_url) + else: + return valid_gitlab_release_url(release_url) + +def git_release(repo_name, tag, account_name=None): + if not GITLAB: + return github_release(account_name, repo_name, tag) + else: + return gitlab_release(repo_name, tag) + +def git_repo(repo_name, account_name=None): + if not GITLAB: + return github_repo(account_name, repo_name) + else: + return gitlab_repo(repo_name) + +def git_repo_file(repo, tag, filename): + if not GITLAB: + return github_repo_file(repo, tag, filename) + else: + return gitlab_repo_file(repo, tag, filename) + +def git_release_assets(repo, tag, account_name=None, all_assets=False): + if not GITLAB: + return github_release_assets(account_name,repo, tag, all_assets) + else: + return gitlab_release_assets(repo, tag, all_assets) \ No newline at end of file diff --git a/iga/gitlab.py b/iga/gitlab.py index d753644..41f7774 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -7,6 +7,8 @@ from sidetrack import log from types import SimpleNamespace import requests +import rich_click as click + from iga.exceptions import GitHubError, InternalError @@ -14,6 +16,11 @@ '''List of words such that, if one of the words is the last word in an account name, mean the account will be assumed to be a software bot of some kind.''' +API_PATH = '/api/v4/' +ctx = click.get_current_context() +GITLAB = ctx.obj.get('gitlab', False) +GITLAB_URL = ctx.obj.get('gitlab_url', None) +API_URL = f'{GITLAB_URL}/api/v4' class GitLabAPIError(Exception): pass @@ -92,11 +99,55 @@ def __init__(self, release_dict): self.assets = [GitLabAsset(asset) for asset in self.assets] # Save the original data for debugging purposes. self._json_dict = release_dict +#?license=true + +class GitLabRepo(SimpleNamespace): + '''Simple data structure corresponding to a GitHub repository JSON object. + This object is enhanced with a "files" property that contains a list of + the files in the default branch of the repository.''' + + def __init__(self, repo_dict): + super().__init__(**repo_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub repo data: ' + json.dumps(repo_dict, indent=2)) + self.owner = GitLabAccount(repo_dict['owner']) + if repo_dict.get('organization'): + self.organization = GitLabAccount(repo_dict['organization']) + if repo_dict.get('license'): + self.license = GitLabLicense(repo_dict['license']) + # Save the original data for debugging purposes. + self._json_dict = repo_dict + +class GitLabLicense(SimpleNamespace): + '''Simple data structure corresponding to a license object.''' + def __init__(self, license_dict): + super().__init__(**license_dict) + +class GitLabFile(SimpleNamespace): + '''Simple data structure corresponding to a file in a repo.''' + def __init__(self, file_dict): + super().__init__(**file_dict) + + +# Exported module functions. +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ + +def gitlab_release(repo_name, tag, test_only=False): + '''Return a Release object corresponding to the tagged release in GitHub. + If test_only is True, only check existence; don't create a Release object. + ''' + endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' # Assuming project_url points to the release endpoint + if test_only: + log('testing for existence: ' + endpoint) + return _gitlab_get(endpoint, test_only) + log('getting GitLab release data from ' + endpoint) + return _object_for_gitlab(endpoint, GitLabRelease) -def gitlab_release(project_url, test_only=False): - endpoint = project_url # Assuming project_url points to the release endpoint +def gitlab_repo(repo_name, test_only=False): + '''Return a Repo object corresponding to the named repo in GitLab.''' + endpoint = f'{API_URL}/{repo_name}?license=true' if test_only: log('testing for existence: ' + endpoint) return _gitlab_get(endpoint, test_only) @@ -104,14 +155,14 @@ def gitlab_release(project_url, test_only=False): log('getting GitLab release data from ' + endpoint) return _object_for_gitlab(endpoint, GitLabRelease) -def gitlab_release_assets(project_url, get_all): +def gitlab_release_assets(repo_name, tag, all_assets): '''Return a list of URLs for all the assets associated with the release.''' - release = gitlab_release(project_url) + release = gitlab_release(repo_name, tag) sources = release.assets.sources assets = [] for source in sources: - if not get_all: + if not all_assets: if source.format in ['zip']: assets.append(source.url) else: @@ -119,13 +170,70 @@ def gitlab_release_assets(project_url, get_all): assets.append(source.url) return assets -def github_repo_file(repo, tag_name, filename): +def gitlab_repo_filenames(repo_name, tag_name): + '''Return a list of filenames in the repo corresponding to the specified tag.''' + release = gitlab_release(repo_name, tag_name) + files = [file.name for file in release.assets] + return files + +def gitlab_repo_file(repo, tag_name, filename): '''Return the text contents of the named file in the repo object. The tag_name must be a release tag, and is used to find the version of the repository corresponding to that tag. ''' - #https://code.jlab.org/api/v4/projects/31/repository/files/Pipfile/raw?ref=0.1.0 + if filename in getattr(repo, '_files_contents', {}): + log(f'{filename} found in the files of {repo}') + return repo._files_contents[filename] + + endpoint = f'{API_URL}/projects/{repo.id}/repository/files/{filename}/raw?ref={tag_name}' + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for file {filename} or it does not exist') + return '' + json_dict = response.json() + if json_dict['encoding'] != 'base64': + log(f'GitHub file encoding for {filename} is ' + json_dict['encoding']) + raise InternalError('Unimplemented file encoding ' + json_dict['encoding']) + import base64 + contents = base64.b64decode(json_dict['content']).decode() + if not getattr(repo, '_file_contents', {}): + repo._file_contents = {} + # Cache the file contents, so we don't have to get it from GitHub again. + repo._file_contents[filename] = contents + log(f'got contents for {filename} (length = {len(contents)} chars)') + return contents -# /projects/:id/repository/files/:file_path \ No newline at end of file + #https://code.jlab.org/api/v4/projects/31/repository/files/Pipfile/raw?ref=0.1.0 + +def gitlab_repo_languages(repo): + log(f'asking GitHub for list of languages for repo {repo.full_name}') + repolink = repo._links.self + endpoint = f'{repolink}/languages' + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for list of languages for repo {repo}') + return '' + json_dict = response.json() + languages = json_dict.keys() if json_dict else [] + log(f'GitLab lists {len(languages)} languages for the repo') + return languages + +def gitlab_repo_contributers(repo): + repolink = repo._links.self + endpoint = f'{repolink}/repository/contributors' + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for list of contributors for repo {repo}') + return [] + # The JSON data is a list containing a kind of minimal user info dict. + contributors = [] + for user_dict in response.json(): + contributors.append(gitlab_account(user_dict['login'])) + log(f'repo has {len(contributors)} contributors') + return contributors + +def valid_gitlab_release_url(url): + '''Check if the provided URL is a valid GitLab release endpoint.''' + return _gitlab_get(url, test_only=True) diff --git a/iga/invenio.py b/iga/invenio.py index 1db083f..873b7dd 100644 --- a/iga/invenio.py +++ b/iga/invenio.py @@ -17,16 +17,18 @@ from sidetrack import log import socket import os -from os import path +from os import path +import humanize + import iga -from iga.exceptions import ( +from iga.exceptions import ( InternalError, InvenioRDMError, RecordNotFound, ) -from iga.github import github_asset_contents -from iga.id_utils import normalize_invenio_rdm +from iga.github import github_asset_contents +from iga.id_utils import normalize_invenio_rdm # Exported data structures. @@ -86,7 +88,6 @@ def __gt__(self, other): def __ge__(self, other): return not self.__lt__(self, other) - # Exported module functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -231,7 +232,6 @@ def invenio_upload(record, asset, print_status): ''' # Start by reading the assets to be sure we can actually get them, *before* # trying to upload them to InvenioRDM. - import humanize size = '' if asset.startswith('http'): filename = _filename_from_asset_url(asset) @@ -363,7 +363,6 @@ def invenio_communities(): log(f'we got {pluralized("community", communities, True)}') return communities - # Miscellaneous helper functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/iga/metadata.py b/iga/metadata.py index 860590f..d1d5d21 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -45,9 +45,16 @@ from sidetrack import log import sys import validators +import yaml + from iga.data_utils import deduplicated, listified, normalized_url, similar_urls from iga.exceptions import MissingData +from iga.githublab import ( + git_repo, + git_repo_file, + git_release, +) from iga.github import ( github_account, github_file_url, @@ -65,34 +72,6 @@ from iga.reference import reference, RECOGNIZED_REFERENCE_SCHEMES from iga.text_utils import cleaned_text - -# Constants. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# It's useful to understand the context of what's going on. The record stored -# in InvenioRDM may have these top-level fields (but might not contain all): -# -# { -# "$schema": "local://records/record-vX.Y.Z.json", -# "id": "q5jr8-hny72", -# "pid": { ... }, -# "pids" : { ... }, -# "parent": { ... }, -# "access" : { ... }, -# "metadata" : { ... }, -# "files" : { ... }, -# "tombstone" : { ... }, -# "created": "...", -# "updated": "...", -# } -# -# However, what is uploaded to an InvenioRDM server should only contain the -# 'metadata' field, because of the other fields above are added by the system. -# Consequently, IGA only needs to construct the 'metadata' field value. I.e., -# referring to https://inveniordm.docs.cern.ch/reference/metadata, we are only -# concerned with https://inveniordm.docs.cern.ch/reference/metadata/#metadata -# -# The following is the full set of possible subfields in "metadata". FIELDS = [ "additional_descriptions", @@ -118,26 +97,6 @@ "version", ] -# Not all of these need to be provided. Based on the test cases in -# https://github.com/inveniosoftware/invenio-rdm-records, the minimum set of -# fields that needs to be provided seems to be this: -# -# { -# "metadata": { -# "resource_type": { "id": "XYZ", ... }, # note below -# "title": "ABC", -# "creators": [ -# { -# "person_or_org": { -# "family_name": "A", -# "given_name": "B", -# "type": "C", -# } -# }, -# ], -# "publication_date": "...date...", -# } - REQUIRED_FIELDS = [ "creators", "publication_date", @@ -168,7 +127,7 @@ # allowed but we don't want to allow some types such as 'data' URLs. ALLOWED_URL_SCHEMES = ['http', 'https', 'git', 'ftp', 'gopher', 's3', 'svn'] - + # Exported module functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -178,8 +137,8 @@ def metadata_for_release(account_name, repo_name, tag, all_metadata): Data is gathered from the GitHub release identified by "tag" in the repository "repo_name" of the given GitHub "account_name". ''' - repo = github_repo(account_name, repo_name) - release = github_release(account_name, repo_name, tag) + repo = git_repo(repo_name, account_name) + release = git_release(repo_name, tag, account_name) # We use codemeta.json & CITATION.cff often. Get them now & augment the # repo object with them so that field extraction functions can access them. @@ -187,7 +146,7 @@ def metadata_for_release(account_name, repo_name, tag, all_metadata): repo.cff = {} filenames = github_repo_filenames(repo, tag) if 'codemeta.json' in filenames: - codemeta_file = github_repo_file(repo, tag, 'codemeta.json') + codemeta_file = git_repo_file(repo, tag, 'codemeta.json') try: repo.codemeta = json5.loads(codemeta_file) except KeyboardInterrupt: @@ -199,8 +158,8 @@ def metadata_for_release(account_name, repo_name, tag, all_metadata): except Exception as ex: # noqa PIE786 log('ignoring codemeta.json file because of error: ' + str(ex)) for name in ['CITATION.cff', 'CITATION.CFF', 'citation.cff']: + # https://github.com/citation-file-format/citation-file-format/blob/main/schema.json if name in filenames: - import yaml try: repo.cff = yaml.safe_load(github_repo_file(repo, tag, name)) except KeyboardInterrupt: @@ -240,11 +199,11 @@ def metadata_from_file(file): log(f'reading metadata provided in file {str(file)}') content = file.read().strip() metadata = json5.loads(content) - except KeyboardInterrupt: - raise except Exception as ex: # noqa PIE786 log(f'problem trying to read metadata from {str(file)}: ' + str(ex)) return False + except KeyboardInterrupt: + raise if 'metadata' not in metadata: log('record lacks a "metadata" field') @@ -324,7 +283,7 @@ def add(item, role, summary): return deduplicated(descriptions) -def additional_titles(repo, release, include_all): +def additional_titles(repo, include_all): '''Return InvenioRDM "additional titles". https://inveniordm.docs.cern.ch/reference/metadata/#additional-titles-0-n ''' @@ -348,7 +307,7 @@ def additional_titles(repo, release, include_all): }) if add_repo_name: log('adding GitHub repo "full_name" as additional title') - titles.append({'title': cleaned_text(repo.full_name), + titles.append({'title': cleaned_text(repo.full_name), #AP: gitlab repo.name 'type': {'id': 'alternative-title'}, 'lang': {'id': 'eng'}, }) @@ -452,7 +411,7 @@ def contributors(repo, release, include_all): return result -def creators(repo, release, include_all, internal_call=False): +def creators(repo, release, internal_call=False): '''Return InvenioRDM "creators". https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n ''' @@ -492,7 +451,7 @@ def dates(repo, release, include_all): # If we used a different date for the publication_date value than the # release date in GitHub, we add release date as another type of date. pub_date = publication_date(repo, release, include_all) - github_date = arrow.get(release.published_at).format('YYYY-MM-DD') + github_date = arrow.get(release.published_at).format('YYYY-MM-DD') #AP: gitlab release.released_at if pub_date != github_date: log('adding the GitHub release "published_at" date as the "available" date') dates.append({'date': github_date, @@ -512,7 +471,7 @@ def dates(repo, release, include_all): # to the GitHub repo "updated_at" date. if mod_date := repo.codemeta.get('dateModified', ''): log('adding the CodeMeta "dateModified" as the "updated" date') - elif include_all and (mod_date := repo.updated_at): + elif include_all and (mod_date := repo.updated_at): #AP: gitlab what log('adding the GitHub repo "updated_at" date as the "updated" date') if mod_date: dates.append({'date': arrow.get(mod_date).format('YYYY-MM-DD'), @@ -526,7 +485,7 @@ def dates(repo, release, include_all): return dates -def description(repo, release, include_all, internal_call=False): +def description(repo, release, internal_call=False): '''Return InvenioRDM "description". https://inveniordm.docs.cern.ch/reference/metadata/#description-0-1 ''' @@ -538,6 +497,8 @@ def description(repo, release, include_all, internal_call=False): # commit messages. In those cases, the value of release.body that we get # through the API is empty. There doesn't seem to be a way to get the text # shown by GitHub in those cases, so we try other alternatives after this. + + # AP: gitlab release body no. if release.body: if internal_call: return release.body.strip() @@ -583,7 +544,7 @@ def description(repo, release, include_all, internal_call=False): return '(No description provided.)' -def formats(repo, release, include_all): +def formats(release): '''Return InvenioRDM "formats". https://inveniordm.docs.cern.ch/reference/metadata/#formats-0-n ''' @@ -597,7 +558,7 @@ def formats(repo, release, include_all): return formats -def funding(repo, release, include_all): +def funding(repo): '''Return InvenioRDM "funding references". https://inveniordm.docs.cern.ch/reference/metadata/#funding-references-0-n ''' @@ -700,7 +661,7 @@ def funding(repo, release, include_all): return deduplicated(results) -def identifiers(repo, release, include_all): +def identifiers(repo): '''Return InvenioRDM "alternate identifiers". https://inveniordm.docs.cern.ch/reference/metadata/#alternate-identifiers-0-n @@ -743,7 +704,7 @@ def identifiers(repo, release, include_all): return deduplicated(identifiers) -def languages(repo, release, include_all): +def languages(): '''Return InvenioRDM "languages". https://inveniordm.docs.cern.ch/reference/metadata/#languages-0-n ''' @@ -752,7 +713,7 @@ def languages(repo, release, include_all): return [{"id": "eng"}] -def locations(repo, release, include_all): +def locations(): '''Return InvenioRDM "locations". https://inveniordm.docs.cern.ch/reference/metadata/#locations-0-n ''' @@ -760,7 +721,7 @@ def locations(repo, release, include_all): return {} -def publication_date(repo, release, include_all): +def publication_date(repo, release): '''Return InvenioRDM "publication date". https://inveniordm.docs.cern.ch/reference/metadata/#publication-date-1 ''' @@ -773,12 +734,12 @@ def publication_date(repo, release, include_all): elif date := repo.cff.get('date-released', ''): log('adding CFF "date-released" as "publication_date"') else: - date = release.published_at + date = release.published_at #AP: gitlab released_at log('adding GitHub repo "published_at" as "publication_date"') return arrow.get(date).format('YYYY-MM-DD') -def publisher(repo, release, include_all): +def publisher(): '''Return InvenioRDM "publisher". https://inveniordm.docs.cern.ch/reference/metadata/#publisher-0-1 ''' @@ -840,7 +801,7 @@ def id_dict(url, rel_type, res_type): 'scheme': 'url'} log('adding GitHub release "html_url" to "related_identifiers"') - identifiers = [id_dict(release.html_url, 'isidenticalto', 'software')] + identifiers = [id_dict(release.html_url, 'isidenticalto', 'software')] #AP: gitlab release._links["self"] # The GitHub repo is what this release is derived from. Note: you would # expect the GitHub repo html_url, the codemeta.json codeRepository, and @@ -871,7 +832,7 @@ def id_dict(url, rel_type, res_type): log('adding CodeMeta "url" to "related_identifiers"') elif homepage_url := repo.cff.get('url', ''): log('adding CFF "url" to "related_identifiers"') - elif include_all and (homepage_url := repo.homepage): + elif include_all and (homepage_url := repo.homepage): #AP: web_url log('adding GitHub repo "homepage" to "related_identifiers"') if homepage_url: identifiers.append(id_dict(homepage_url, 'isdescribedby', 'other')) @@ -918,7 +879,7 @@ def id_dict(url, rel_type, res_type): # The GitHub Pages URL for a repo usually points to documentation or info # about the softare, though we can't tell if it's for THIS release. if include_all and repo.has_pages: - url = f'https://{repo.owner.login}.github.io/{repo.name}' + url = f'https://{repo.owner.login}.github.io/{repo.name}' # AP: ? if not any(url == item['identifier'] for item in identifiers): log('adding the repo\'s GitHub Pages URL to "related_identifiers"') identifiers.append(id_dict(url, 'isdocumentedby', @@ -927,9 +888,9 @@ def id_dict(url, rel_type, res_type): # The issues URL is kind of a supplemental resource. if issues_url := repo.codemeta.get('issueTracker', ''): log('adding CodeMeta "issueTracker" to "related_identifiers"') - elif include_all and repo.issues_url: + elif include_all and repo.issues_url: #AP: repo._links["issues"] log('adding GitHub repo "issues_url" to "related_identifiers"') - issues_url = f'https://github.com/{repo.full_name}/issues' + issues_url = f'https://github.com/{repo.full_name}/issues' # AP: repo.name if issues_url: identifiers.append(id_dict(issues_url, 'issupplementedby', 'other')) @@ -980,7 +941,7 @@ def id_dict(url, rel_type, res_type): return filtered_identifiers -def resource_type(repo, release, include_all): +def resource_type(repo): '''Return InvenioRDM "resource type". https://inveniordm.docs.cern.ch/reference/metadata/#resource-type-1 ''' @@ -994,7 +955,7 @@ def resource_type(repo, release, include_all): return {'id': 'software'} -def rights(repo, release, include_all): +def rights(repo, release): '''Return InvenioRDM "rights (licenses)". https://inveniordm.docs.cern.ch/reference/metadata/#rights-licenses-0-n ''' @@ -1043,6 +1004,16 @@ def rights(repo, release, include_all): # We didn't recognize license info in the CodeMeta or cff files. # Look into the GitHub repo data to see if GitHub identified a license. + """ + license_url": "https://code.jlab.org/panta/hcana_container_doc/-/blob/main/LICENSE", + "license": { + "key": "apache-2.0", + "name": "Apache License 2.0", + "nickname": null, + "html_url": "https://www.apache.org/licenses/LICENSE-2.0", + "source_url": null + } + """ if repo.license and repo.license.name != 'Other': from iga.licenses import LICENSES log('GitHub has provided license info for the repo – using those values') @@ -1079,14 +1050,14 @@ def rights(repo, release, include_all): return rights -def sizes(repo, release, include_all): +def sizes(): '''Return InvenioRDM "sizes". https://inveniordm.docs.cern.ch/reference/metadata/#sizes-0-n ''' return [] -def subjects(repo, release, include_all): +def subjects(repo, include_all): '''Return InvenioRDM "subjects". https://inveniordm.docs.cern.ch/reference/metadata/#subjects-0-n ''' @@ -1138,7 +1109,7 @@ def subjects(repo, release, include_all): if include_all: log('adding GitHub topics to "subjects"') - subjects.update(repo.topics) + subjects.update(repo.topics) #AP: ? # Add repo languages as topics too. if languages := github_repo_languages(repo): @@ -1156,7 +1127,7 @@ def subjects(repo, release, include_all): return [{'subject': x} for x in sorted(subjects, key=str.lower)] -def title(repo, release, include_all): +def title(repo, release): '''Return InvenioRDM "title". https://inveniordm.docs.cern.ch/reference/metadata/#title-1 ''' @@ -1168,7 +1139,7 @@ def title(repo, release, include_all): title += text field = 'CFF "title"' else: - title += repo.full_name + title += repo.full_name #AP: repo.name field = 'GitHub repo "full_name"' # Note: better not to use a colon here. A lot of CodeMeta files use a name @@ -1183,7 +1154,7 @@ def title(repo, release, include_all): return cleaned_text(title) -def version(repo, release, include_all): +def version(release): '''Return InvenioRDM "version". https://inveniordm.docs.cern.ch/reference/metadata/#version-0-1 ''' @@ -1558,6 +1529,7 @@ def _cff_reference_ids(repo): def _load_vocabularies(): + # https://inveniordm.jlab.org/api/vocabularies/licenses from caltechdata_api.customize_schema import get_vocabularies from iga.invenio import invenio_vocabulary log('loading controlled vocabularies using caltechdata_api module') From 54c01fdf0e2dda2866736b57349f1f291d650a76 Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Fri, 12 Jul 2024 16:18:49 -0400 Subject: [PATCH 03/17] changes --- iga/githublab.py | 2 +- iga/gitlab.py | 26 +++++++++++++++----------- iga/metadata.py | 24 +++++++++++++----------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/iga/githublab.py b/iga/githublab.py index 76ee74a..98b4cd0 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -46,4 +46,4 @@ def git_release_assets(repo, tag, account_name=None, all_assets=False): if not GITLAB: return github_release_assets(account_name,repo, tag, all_assets) else: - return gitlab_release_assets(repo, tag, all_assets) \ No newline at end of file + return gitlab_release_assets(repo, tag, all_assets) diff --git a/iga/gitlab.py b/iga/gitlab.py index 41f7774..9d037e9 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -99,8 +99,6 @@ def __init__(self, release_dict): self.assets = [GitLabAsset(asset) for asset in self.assets] # Save the original data for debugging purposes. self._json_dict = release_dict -#?license=true - class GitLabRepo(SimpleNamespace): '''Simple data structure corresponding to a GitHub repository JSON object. This object is enhanced with a "files" property that contains a list of @@ -128,16 +126,12 @@ class GitLabFile(SimpleNamespace): def __init__(self, file_dict): super().__init__(**file_dict) - -# Exported module functions. -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ - def gitlab_release(repo_name, tag, test_only=False): '''Return a Release object corresponding to the tagged release in GitHub. If test_only is True, only check existence; don't create a Release object. ''' - endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' # Assuming project_url points to the release endpoint + endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' if test_only: log('testing for existence: ' + endpoint) return _gitlab_get(endpoint, test_only) @@ -199,14 +193,11 @@ def gitlab_repo_file(repo, tag_name, filename): contents = base64.b64decode(json_dict['content']).decode() if not getattr(repo, '_file_contents', {}): repo._file_contents = {} - # Cache the file contents, so we don't have to get it from GitHub again. + # Cache the file contents, so we don't have to get it from GitLab again. repo._file_contents[filename] = contents log(f'got contents for {filename} (length = {len(contents)} chars)') return contents - - #https://code.jlab.org/api/v4/projects/31/repository/files/Pipfile/raw?ref=0.1.0 - def gitlab_repo_languages(repo): log(f'asking GitHub for list of languages for repo {repo.full_name}') repolink = repo._links.self @@ -234,6 +225,19 @@ def gitlab_repo_contributers(repo): log(f'repo has {len(contributors)} contributors') return contributors +def gitlab_asset_contents(asset_url): + '''Return the raw contents of a release asset file.''' + try: + response = _gitlab_get(asset_url) + return response.content + except KeyboardInterrupt: + raise + except commonpy.exceptions.CommonPyException: + raise GitHubError(f'Failed to download GitHub asset at {asset_url}' + ' – either it does not exist or it is inaccessible.') + except Exception: + raise + def valid_gitlab_release_url(url): '''Check if the provided URL is a valid GitLab release endpoint.''' return _gitlab_get(url, test_only=True) diff --git a/iga/metadata.py b/iga/metadata.py index d1d5d21..f582efb 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -46,8 +46,6 @@ import sys import validators import yaml - - from iga.data_utils import deduplicated, listified, normalized_url, similar_urls from iga.exceptions import MissingData from iga.githublab import ( @@ -72,6 +70,9 @@ from iga.reference import reference, RECOGNIZED_REFERENCE_SCHEMES from iga.text_utils import cleaned_text +import rich_click as click +ctx = click.get_current_context() +GITLAB = ctx.obj.get('gitlab', False) FIELDS = [ "additional_descriptions", @@ -306,8 +307,9 @@ def additional_titles(repo, include_all): 'lang': {'id': 'eng'}, }) if add_repo_name: + title = repo.name if GITLAB else repo.full_name log('adding GitHub repo "full_name" as additional title') - titles.append({'title': cleaned_text(repo.full_name), #AP: gitlab repo.name + titles.append({'title': cleaned_text(title), 'type': {'id': 'alternative-title'}, 'lang': {'id': 'eng'}, }) @@ -451,7 +453,7 @@ def dates(repo, release, include_all): # If we used a different date for the publication_date value than the # release date in GitHub, we add release date as another type of date. pub_date = publication_date(repo, release, include_all) - github_date = arrow.get(release.published_at).format('YYYY-MM-DD') #AP: gitlab release.released_at + github_date = arrow.get(release.released_at if GITLAB else release.published_at).format('YYYY-MM-DD') if pub_date != github_date: log('adding the GitHub release "published_at" date as the "available" date') dates.append({'date': github_date, @@ -734,7 +736,7 @@ def publication_date(repo, release): elif date := repo.cff.get('date-released', ''): log('adding CFF "date-released" as "publication_date"') else: - date = release.published_at #AP: gitlab released_at + date = release.released_at if GITLAB else release.published_at log('adding GitHub repo "published_at" as "publication_date"') return arrow.get(date).format('YYYY-MM-DD') @@ -801,7 +803,7 @@ def id_dict(url, rel_type, res_type): 'scheme': 'url'} log('adding GitHub release "html_url" to "related_identifiers"') - identifiers = [id_dict(release.html_url, 'isidenticalto', 'software')] #AP: gitlab release._links["self"] + identifiers = [id_dict(release._links["self"] if GITLAB else release.html_url, 'isidenticalto', 'software')] # The GitHub repo is what this release is derived from. Note: you would # expect the GitHub repo html_url, the codemeta.json codeRepository, and @@ -832,7 +834,7 @@ def id_dict(url, rel_type, res_type): log('adding CodeMeta "url" to "related_identifiers"') elif homepage_url := repo.cff.get('url', ''): log('adding CFF "url" to "related_identifiers"') - elif include_all and (homepage_url := repo.homepage): #AP: web_url + elif include_all and (homepage_url := repo.web_url if GITLAB else repo.homepage): log('adding GitHub repo "homepage" to "related_identifiers"') if homepage_url: identifiers.append(id_dict(homepage_url, 'isdescribedby', 'other')) @@ -888,9 +890,9 @@ def id_dict(url, rel_type, res_type): # The issues URL is kind of a supplemental resource. if issues_url := repo.codemeta.get('issueTracker', ''): log('adding CodeMeta "issueTracker" to "related_identifiers"') - elif include_all and repo.issues_url: #AP: repo._links["issues"] + elif include_all and (if GITLAB repo._links["issues"] else repo.issues_url): log('adding GitHub repo "issues_url" to "related_identifiers"') - issues_url = f'https://github.com/{repo.full_name}/issues' # AP: repo.name + issues_url = f'https://github.com/{repo.name if GITLAB else repo.full_name}/issues' if issues_url: identifiers.append(id_dict(issues_url, 'issupplementedby', 'other')) @@ -1109,7 +1111,7 @@ def subjects(repo, include_all): if include_all: log('adding GitHub topics to "subjects"') - subjects.update(repo.topics) #AP: ? + subjects.update(repo.topics) # Add repo languages as topics too. if languages := github_repo_languages(repo): @@ -1139,7 +1141,7 @@ def title(repo, release): title += text field = 'CFF "title"' else: - title += repo.full_name #AP: repo.name + title += repo.name if GITLAB else repo.full_name field = 'GitHub repo "full_name"' # Note: better not to use a colon here. A lot of CodeMeta files use a name From 6e2202695b1da8a17476058a3bc8be99d4c2c8ba Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Mon, 15 Jul 2024 10:05:27 -0400 Subject: [PATCH 04/17] added more info. TO DO: Need addiding contributors --- iga/cli.py | 26 +++++++-------- iga/githublab.py | 51 ++++++++++++++++++++++++++++-- iga/gitlab.py | 82 +++++++++++++++++++++++++++++++++++++----------- iga/invenio.py | 6 ++-- iga/metadata.py | 68 ++++++++++++++++++++------------------- 5 files changed, 161 insertions(+), 72 deletions(-) diff --git a/iga/cli.py b/iga/cli.py index f995328..a384900 100644 --- a/iga/cli.py +++ b/iga/cli.py @@ -25,6 +25,7 @@ from iga.githublab import ( valid_release_url, git_release_assets, + git_account_repo_tag, ) from iga.id_utils import is_inveniordm_id @@ -129,12 +130,7 @@ def _read_param_value(ctx, param, value, env_var, thing, required=True): def _read_github_token(ctx, param, value): '''Read the file and set the environment variable GITHUB_TOKEN.''' return _read_param_value(ctx, param, value, 'GITHUB_TOKEN', - 'GitHub personal access token', required=False) - -def _read_gitlab_token(ctx, param, value): - '''Read the file and set the environment variable GITLAB_TOKEN.''' - return _read_param_value(ctx, param, value, 'GITLAB_TOKEN', - 'GitLab personal access token', required=False) + 'GitHub/GitLab personal access token', required=False) def _read_invenio_token(ctx, param, value): '''Read the file and set the environment variable INVENIO_TOKEN.''' @@ -379,7 +375,7 @@ def _list_communities(ctx, param, value): help='GitHub repository name, if not using release URL') # @click.option('--github-token', '-t', metavar='STR', callback=_read_github_token, - help="GitHub acccess token (**avoid – use variable**)") + help="GitHub/GitLab acccess token (**avoid – use variable**)") # @click.option('--gitlab', is_flag=True, help='Use GitLab mode') @@ -659,9 +655,11 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, _alert(ctx, 'The use of a URL and the use of options `--account`' " and `--repo` are mutually exclusive; can't use both.") sys.exit(int(ExitCode.bad_arg)) - elif not valid_release_url(url_or_tag, gitlab): + elif not valid_release_url(url_or_tag): _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) + else: + account, repo, tag = git_account_repo_tag(url_or_tag) elif not gitlab: if not all([account, repo, url_or_tag]): @@ -670,9 +668,9 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, sys.exit(int(ExitCode.bad_arg)) tag = url_or_tag url_or_tag = f'https://api.github.com/{account}/repos/{repo}/releases/tags/{tag}' - if not valid_release_url(url_or_tag, gitlab): - _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) - sys.exit(int(ExitCode.bad_arg)) + if not valid_release_url(url_or_tag): + _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) + sys.exit(int(ExitCode.bad_arg)) elif gitlab: if not (all([gitlab_url, gitlab_projectid, url_or_tag]) or all(gitlab_url, account, repo, url_or_tag)): @@ -682,21 +680,21 @@ def cli(ctx, url_or_tag, all_assets=False, community=None, draft=False, if all([gitlab_url, gitlab_projectid, url_or_tag]): tag = url_or_tag url_or_tag = f'{gitlab_url}/api/v4/projects/{gitlab_projectid}/releases/{tag}' - if not valid_release_url(url_or_tag, gitlab): + if not valid_release_url(url_or_tag): _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) elif all([gitlab_url, account, repo, url_or_tag]): tag = url_or_tag gitlab_projectid = f'{account}%2F{repo}' url_or_tag = f'{gitlab_url}/api/v4/projects/{gitlab_projectid}/releases/{tag}' - if not valid_release_url(url_or_tag, gitlab): + if not valid_release_url(url_or_tag): _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) else: _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) - repo_name = gitlab_projectid + repo = gitlab_projectid else: _alert(ctx, 'Malformed release URL: ' + str(url_or_tag)) sys.exit(int(ExitCode.bad_arg)) diff --git a/iga/githublab.py b/iga/githublab.py index 98b4cd0..d4f3113 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -1,4 +1,5 @@ import rich_click as click +from sidetrack import log from iga.github import ( github_account_repo_tag, @@ -7,6 +8,9 @@ github_release_assets, valid_github_release_url, github_repo_file, + github_repo_filenames, + github_repo_languages, + github_asset_contents ) from iga.gitlab import ( valid_gitlab_release_url, @@ -14,9 +18,16 @@ gitlab_repo, gitlab_repo_file, gitlab_release, + gitlab_account_repo_tag, + gitlab_repo_filenames, + gitlab_repo_languages, + gitlab_asset_contents ) -ctx = click.get_current_context() -GITLAB = ctx.obj.get('gitlab', False) +try: + ctx = click.get_current_context() + GITLAB = ctx.obj.get('gitlab', False) +except Exception as e: + log(f"Error getting GitLab API URL: {e}") def valid_release_url(release_url): if not GITLAB: @@ -24,6 +35,16 @@ def valid_release_url(release_url): else: return valid_gitlab_release_url(release_url) +def git_account_repo_tag(release_url): + '''Return tuple (account, repo name, tag) based on the given web URL.''' + # Example URL: https://code.jlab.org/physdiv/jrdb/inveniordm_jlab/-/releases/0.1.0 + # Note this is not the same as the "release url" below. + if not GITLAB: + return github_account_repo_tag(release_url) + else: + return gitlab_account_repo_tag(release_url) + + def git_release(repo_name, tag, account_name=None): if not GITLAB: return github_release(account_name, repo_name, tag) @@ -35,7 +56,13 @@ def git_repo(repo_name, account_name=None): return github_repo(account_name, repo_name) else: return gitlab_repo(repo_name) - + +def git_repo_filenames(repo, tag): + if not GITLAB: + return github_repo_filenames(repo, tag) + else: + return gitlab_repo_filenames(repo, tag) + def git_repo_file(repo, tag, filename): if not GITLAB: return github_repo_file(repo, tag, filename) @@ -47,3 +74,21 @@ def git_release_assets(repo, tag, account_name=None, all_assets=False): return github_release_assets(account_name,repo, tag, all_assets) else: return gitlab_release_assets(repo, tag, all_assets) + +def git_repo_filenames(repo, tag): + if not GITLAB: + return github_repo_filenames(repo, tag) + else: + return gitlab_repo_filenames(repo, tag) + +def git_repo_languages(repo): + if not GITLAB: + return github_repo_languages(repo) + else: + return gitlab_repo_languages(repo) + +def git_asset_contents(asset): + if not GITLAB: + return github_asset_contents(asset) + else: + return gitlab_asset_contents(asset) diff --git a/iga/gitlab.py b/iga/gitlab.py index 9d037e9..05fca46 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -8,6 +8,7 @@ from types import SimpleNamespace import requests import rich_click as click +from urllib.parse import quote from iga.exceptions import GitHubError, InternalError @@ -17,24 +18,30 @@ name, mean the account will be assumed to be a software bot of some kind.''' API_PATH = '/api/v4/' -ctx = click.get_current_context() -GITLAB = ctx.obj.get('gitlab', False) -GITLAB_URL = ctx.obj.get('gitlab_url', None) -API_URL = f'{GITLAB_URL}/api/v4' +try: + ctx = click.get_current_context() + GITLAB = ctx.obj.get('gitlab', False) + GITLAB_URL = ctx.obj.get('gitlab_url', None) + API_URL = f'{GITLAB_URL}/api/v4' +except Exception as e: + log.error(f"Error getting GitLab API URL: {e}") + class GitLabAPIError(Exception): pass def _gitlab_get(endpoint, test_only=False): headers = {'Accept': 'application/json'} - using_token = 'GITLAB_TOKEN' in os.environ + using_token = True#'GITLAB_TOKEN' in os.environ if using_token: - headers['Authorization'] = f'token {os.environ["GITLAB_TOKEN"]}' + headers['Authorization'] = f'Bearer glpat-3z9T1F3zNa7WNAaireqi' method = 'head' if test_only else 'get' try: if method == 'HEAD': response = requests.head(endpoint, headers=headers) else: response = requests.get(endpoint, headers=headers) + print(response.request.url) + print("reponse URL",response.status_code) if response.status_code == 401: raise GitLabAPIError(f"Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}") @@ -54,11 +61,11 @@ def _object_for_gitlab(api_url, cls): response = _gitlab_get(api_url) if not response: return None - log(f'unpacking JSON into object structure from {api_url}') # Create the desired object & add the api url in case it's needed later. obj = cls(response.json()) + print(response.json()) obj.api_url = api_url return obj @@ -93,12 +100,13 @@ def __init__(self, release_dict): super().__init__(**release_dict) if os.environ.get('IGA_RUN_MODE') == 'debug': log('GitHub release data: ' + json.dumps(release_dict, indent=2)) - self.author = GitLabAccount(release_dict['author']) + #self.author = GitLabAccount(release_dict['author']) # ... then convert the dict of the asset (which contains uploader). - self.assets = [GitLabAsset(asset) for asset in self.assets] + #self.assets = [GitLabAsset(asset) for asset in self.assets] # Save the original data for debugging purposes. self._json_dict = release_dict + class GitLabRepo(SimpleNamespace): '''Simple data structure corresponding to a GitHub repository JSON object. This object is enhanced with a "files" property that contains a list of @@ -108,9 +116,10 @@ def __init__(self, repo_dict): super().__init__(**repo_dict) if os.environ.get('IGA_RUN_MODE') == 'debug': log('GitHub repo data: ' + json.dumps(repo_dict, indent=2)) - self.owner = GitLabAccount(repo_dict['owner']) - if repo_dict.get('organization'): - self.organization = GitLabAccount(repo_dict['organization']) + if repo_dict.get('owner',{}): + self.owner = GitLabAccount(repo_dict['owner']) + #if repo_dict.get('organization'): + # self.organization = GitLabAccount(repo_dict['organization']) if repo_dict.get('license'): self.license = GitLabLicense(repo_dict['license']) # Save the original data for debugging purposes. @@ -141,7 +150,8 @@ def gitlab_release(repo_name, tag, test_only=False): def gitlab_repo(repo_name, test_only=False): '''Return a Repo object corresponding to the named repo in GitLab.''' - endpoint = f'{API_URL}/{repo_name}?license=true' + endpoint = f'{API_URL}/projects/{repo_name}?license=true' + print(endpoint) if test_only: log('testing for existence: ' + endpoint) return _gitlab_get(endpoint, test_only) @@ -164,10 +174,14 @@ def gitlab_release_assets(repo_name, tag, all_assets): assets.append(source.url) return assets -def gitlab_repo_filenames(repo_name, tag_name): +def gitlab_repo_filenames(repo, tag_name): '''Return a list of filenames in the repo corresponding to the specified tag.''' - release = gitlab_release(repo_name, tag_name) - files = [file.name for file in release.assets] + endpoint = f'{API_URL}/projects/{repo.id}/repository/tree' + response = _gitlab_get(endpoint) + if not response: + log(f'got no tree or it does not exist') + return '' + files = [res["path"] for res in response.json()] return files def gitlab_repo_file(repo, tag_name, filename): @@ -199,8 +213,8 @@ def gitlab_repo_file(repo, tag_name, filename): return contents def gitlab_repo_languages(repo): - log(f'asking GitHub for list of languages for repo {repo.full_name}') - repolink = repo._links.self + log(f'asking GitHub for list of languages for repo {repo.name}') + repolink = repo._links["self"] endpoint = f'{repolink}/languages' response = _gitlab_get(endpoint) if not response: @@ -240,4 +254,34 @@ def gitlab_asset_contents(asset_url): def valid_gitlab_release_url(url): '''Check if the provided URL is a valid GitLab release endpoint.''' - return _gitlab_get(url, test_only=True) + print("HERE") + #return _gitlab_get(url, test_only=True) + return True + +def gitlab_account_repo_tag(release_url): + '''{gitlab_projectid}/releases/{tag}''' + from urllib.parse import urlparse + parsed = urlparse(release_url) + ctx = click.get_current_context() + ctx.ensure_object(dict) + ctx.obj['gitlab_url'] = parsed.hostname + path = parsed.path + path = path.rstrip('/') + tag = path.split('/')[-1] + y='/'.join(path.split('/')[:-1]) + project_id = y.rstrip("-/releases").lstrip('/') + from urllib.parse import quote + project_id = quote(project_id,safe='') + return ( None, project_id, tag) + +def gitlab_account(account_name): + endpoint = f'{API_URL}/users/{account_name}' + result = _object_for_gitlab(endpoint, GitLabAccount) + if not result: + raise GitLabAPIError(f'Failed to get GitHub account data for {account_name}.' + ' This could be due to a number of causes. Please' + ' check that the account exists, and that the GitHub' + ' access token (if one is being used) is configured' + ' with appropriate permissions to grant access to' + ' user account data.') + return result diff --git a/iga/invenio.py b/iga/invenio.py index 873b7dd..14a6106 100644 --- a/iga/invenio.py +++ b/iga/invenio.py @@ -27,10 +27,10 @@ InvenioRDMError, RecordNotFound, ) -from iga.github import github_asset_contents +from iga.githublab import git_asset_contents from iga.id_utils import normalize_invenio_rdm - + # Exported data structures. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -236,7 +236,7 @@ def invenio_upload(record, asset, print_status): if asset.startswith('http'): filename = _filename_from_asset_url(asset) print_status(f' - Downloading [bold]{filename}[/] from GitHub', end='...') - content = github_asset_contents(asset) + content = git_asset_contents(asset) print_status('done') size = humanize.naturalsize(len(content)) log(f'downloaded {size} bytes of {asset}') diff --git a/iga/metadata.py b/iga/metadata.py index f582efb..00973c4 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -52,6 +52,8 @@ git_repo, git_repo_file, git_release, + git_repo_filenames, + git_repo_languages ) from iga.github import ( github_account, @@ -60,7 +62,6 @@ github_repo, github_repo_contributors, github_repo_file, - github_repo_filenames, github_repo_languages, GitHubError, probable_bot, @@ -72,15 +73,15 @@ import rich_click as click ctx = click.get_current_context() -GITLAB = ctx.obj.get('gitlab', False) +GITLAB = True FIELDS = [ - "additional_descriptions", - "additional_titles", - "contributors", - "creators", +# "additional_descriptions", +# "additional_titles", +# "contributors", +# "creators", "dates", - "description", +# "description", # "formats", # 2023-03-23 not clear we need this. Skip for now. "funding", "identifiers", @@ -145,7 +146,7 @@ def metadata_for_release(account_name, repo_name, tag, all_metadata): # repo object with them so that field extraction functions can access them. repo.codemeta = {} repo.cff = {} - filenames = github_repo_filenames(repo, tag) + filenames = git_repo_filenames(repo, tag) if 'codemeta.json' in filenames: codemeta_file = git_repo_file(repo, tag, 'codemeta.json') try: @@ -218,7 +219,7 @@ def metadata_from_file(file): log(f'metadata in {file} validated to have minimum fields') return metadata - + # Field value functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Summary of the approach: the functions for extracting values from GitHub @@ -560,7 +561,7 @@ def formats(release): return formats -def funding(repo): +def funding(repo, release, include_all): '''Return InvenioRDM "funding references". https://inveniordm.docs.cern.ch/reference/metadata/#funding-references-0-n ''' @@ -663,7 +664,7 @@ def funding(repo): return deduplicated(results) -def identifiers(repo): +def identifiers(repo, release, include_all): '''Return InvenioRDM "alternate identifiers". https://inveniordm.docs.cern.ch/reference/metadata/#alternate-identifiers-0-n @@ -706,7 +707,7 @@ def identifiers(repo): return deduplicated(identifiers) -def languages(): +def languages(repo, release, include_all): '''Return InvenioRDM "languages". https://inveniordm.docs.cern.ch/reference/metadata/#languages-0-n ''' @@ -715,7 +716,7 @@ def languages(): return [{"id": "eng"}] -def locations(): +def locations(repo, release, include_all): '''Return InvenioRDM "locations". https://inveniordm.docs.cern.ch/reference/metadata/#locations-0-n ''' @@ -723,7 +724,7 @@ def locations(): return {} -def publication_date(repo, release): +def publication_date(repo, release, include_all): '''Return InvenioRDM "publication date". https://inveniordm.docs.cern.ch/reference/metadata/#publication-date-1 ''' @@ -741,7 +742,7 @@ def publication_date(repo, release): return arrow.get(date).format('YYYY-MM-DD') -def publisher(): +def publisher(repo, release, include_all): '''Return InvenioRDM "publisher". https://inveniordm.docs.cern.ch/reference/metadata/#publisher-0-1 ''' @@ -809,11 +810,12 @@ def id_dict(url, rel_type, res_type): # expect the GitHub repo html_url, the codemeta.json codeRepository, and # the CFF repository-code all to be the same value, but we can't be sure, # so we have to look at them, and use them in the order of priority. + repo_html=repo.web_url if GITLAB else release.html_url if repo_url := repo.codemeta.get('codeRepository', ''): log('adding CodeMeta "codeRepository" to "related_identifiers"') elif repo_url := repo.cff.get('repository-code', ''): log('adding CFF "repository-code" to "related_identifiers"') - elif include_all and (repo_url := repo.html_url): + elif include_all and (repo_url := repo_html): log('adding GitHub repo "html_url" to "related_identifiers"') if repo_url: identifiers.append(id_dict(repo_url, 'isderivedfrom', 'software')) @@ -880,17 +882,18 @@ def id_dict(url, rel_type, res_type): # The GitHub Pages URL for a repo usually points to documentation or info # about the softare, though we can't tell if it's for THIS release. - if include_all and repo.has_pages: - url = f'https://{repo.owner.login}.github.io/{repo.name}' # AP: ? - if not any(url == item['identifier'] for item in identifiers): - log('adding the repo\'s GitHub Pages URL to "related_identifiers"') - identifiers.append(id_dict(url, 'isdocumentedby', - 'publication-softwaredocumentation')) + #if include_all and repo.has_pages: + # url = f'https://{repo.owner.login}.github.io/{repo.name}' # AP: ? + # if not any(url == item['identifier'] for item in identifiers): + # log('adding the repo\'s GitHub Pages URL to "related_identifiers"') + # identifiers.append(id_dict(url, 'isdocumentedby', + # 'publication-softwaredocumentation')) # The issues URL is kind of a supplemental resource. + repo_issues = repo._links["issues"] if GITLAB else repo.issues_url if issues_url := repo.codemeta.get('issueTracker', ''): log('adding CodeMeta "issueTracker" to "related_identifiers"') - elif include_all and (if GITLAB repo._links["issues"] else repo.issues_url): + elif include_all and (repo_issues): log('adding GitHub repo "issues_url" to "related_identifiers"') issues_url = f'https://github.com/{repo.name if GITLAB else repo.full_name}/issues' if issues_url: @@ -943,7 +946,7 @@ def id_dict(url, rel_type, res_type): return filtered_identifiers -def resource_type(repo): +def resource_type(repo, release, include_all): '''Return InvenioRDM "resource type". https://inveniordm.docs.cern.ch/reference/metadata/#resource-type-1 ''' @@ -957,7 +960,7 @@ def resource_type(repo): return {'id': 'software'} -def rights(repo, release): +def rights(repo, release, include_all): '''Return InvenioRDM "rights (licenses)". https://inveniordm.docs.cern.ch/reference/metadata/#rights-licenses-0-n ''' @@ -1034,7 +1037,7 @@ def rights(repo, release): # GitHub didn't fill in the license info -- maybe it didn't recognize # the license or its format. Try to look for a license file ourselves. - filenames = github_repo_filenames(repo, release.tag_name) + filenames = git_repo_filenames(repo, release.tag_name) for basename in ['LICENSE', 'License', 'license', 'LICENCE', 'Licence', 'licence', 'COPYING', 'COPYRIGHT', 'Copyright', 'copyright']: @@ -1059,7 +1062,7 @@ def sizes(): return [] -def subjects(repo, include_all): +def subjects(repo, release, include_all): '''Return InvenioRDM "subjects". https://inveniordm.docs.cern.ch/reference/metadata/#subjects-0-n ''' @@ -1114,7 +1117,7 @@ def subjects(repo, include_all): subjects.update(repo.topics) # Add repo languages as topics too. - if languages := github_repo_languages(repo): + if languages := git_repo_languages(repo): log('adding GitHub repo languages to "subjects"') for lang in languages: subjects.add(lang) @@ -1129,7 +1132,7 @@ def subjects(repo, include_all): return [{'subject': x} for x in sorted(subjects, key=str.lower)] -def title(repo, release): +def title(repo, release, include_all): '''Return InvenioRDM "title". https://inveniordm.docs.cern.ch/reference/metadata/#title-1 ''' @@ -1156,7 +1159,7 @@ def title(repo, release): return cleaned_text(title) -def version(release): +def version(repo, release, include_all): '''Return InvenioRDM "version". https://inveniordm.docs.cern.ch/reference/metadata/#version-0-1 ''' @@ -1171,7 +1174,6 @@ def version(release): tag = re.sub(r'v(er|version)?[ .]? ?', '', tag) return tag.strip() - # Miscellaneous helper functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1381,12 +1383,12 @@ def _release_author(release): # We can call GitHub's user data API, but it returns very little info # about a user (e.g.,, it gives a name but that name is not broken out # into family & given name), plus sometimes fields are empty. - account = github_account(release.author.login) + account = github_account(release.author.login) #AP: release.author.username return _identity_from_github(account) if account.name else None def _repo_owner(repo): - account = github_account(repo.owner.login) + account = github_account(repo.owner.login) #AP: repo.owner.username or maybe deal with namespace.kind.group? return _identity_from_github(account) From 472e619b7126102009c3b00f733bd909dd48692d Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 14:54:50 -0400 Subject: [PATCH 05/17] finalize --- iga/github.py | 50 +++++- iga/githublab.py | 28 +++- iga/gitlab.py | 411 +++++++++++++++-------------------------------- iga/metadata.py | 216 ++++++++++++------------- 4 files changed, 294 insertions(+), 411 deletions(-) diff --git a/iga/github.py b/iga/github.py index 4248867..02957a4 100644 --- a/iga/github.py +++ b/iga/github.py @@ -18,8 +18,9 @@ from types import SimpleNamespace from iga.exceptions import GitHubError, InternalError +from iga.name_utils import split_name + - # Constants. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -33,7 +34,7 @@ '''List of words such that, if one of the words is the last word in an account name, mean the account will be assumed to be a software bot of some kind.''' - + # Classes. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -331,7 +332,7 @@ def valid_github_release_url(url): and split_url[6] == 'tag') -def probable_bot(account): +def github_probable_bot(account): '''Return True if this account is probably a bot. Bot accounts on GitHub are supposed to have an explicit type value of "bot" @@ -349,7 +350,48 @@ def probable_bot(account): log(f'account {account.login} looks like it {"is" if is_bot else "is NOT"} a bot') return is_bot - +def identity_from_github(account, role=None): + if account.type == 'User': + if account.name: + (given, family) = split_name(account.name) + person_or_org = {'given_name': given, + 'family_name': family, + 'type': 'personal'} + else: + # The GitHub account record has no name, and InvenioRDM won't pass + # a record without a family name. All we have is the login name. + person_or_org = {'given_name': '', + 'family_name': account.login, + 'type': 'personal'} + + else: + name = account.name.strip() if account.name else '' + person_or_org = {'name': name, + 'type': 'organizational'} + result = {'person_or_org': person_or_org} + if account.company and account.type == 'User': + account.company = account.company.strip() + if account.company.startswith('@'): + # Some people write @foo to indicate org account "foo" in GitHub. + # Grab only the first token after the '@'. + log(f'company for {account.login} account starts with @') + try: + import re + candidate = re.search(r'\w+', account.company).group() + org_account = github_account(candidate) + except GitHubError: + # No luck. Take it as-is. + log(f'failed to find {account.company[1:]} as a GitHub account') + result['affiliations'] = [{'name': account.company}] + else: + log(f'using org {candidate} as affiliation for {account.name}') + result['affiliations'] = [{'name': org_account.name}] + else: + result['affiliations'] = [{'name': account.company}] + if role: + result['role'] = {'id': role} + return result + # Helper functions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/iga/githublab.py b/iga/githublab.py index d4f3113..e73c658 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -1,5 +1,6 @@ import rich_click as click from sidetrack import log +import os from iga.github import ( github_account_repo_tag, @@ -10,7 +11,9 @@ github_repo_file, github_repo_filenames, github_repo_languages, - github_asset_contents + github_asset_contents, + github_account, + github_repo_contributors ) from iga.gitlab import ( valid_gitlab_release_url, @@ -21,14 +24,18 @@ gitlab_account_repo_tag, gitlab_repo_filenames, gitlab_repo_languages, - gitlab_asset_contents + gitlab_asset_contents, + gitlab_account, + gitlab_repo_contributors ) try: - ctx = click.get_current_context() - GITLAB = ctx.obj.get('gitlab', False) + if os.environ["GITLAB"]: + GITLAB = True except Exception as e: log(f"Error getting GitLab API URL: {e}") +GITLAB = True + def valid_release_url(release_url): if not GITLAB: return valid_github_release_url(release_url) @@ -67,6 +74,7 @@ def git_repo_file(repo, tag, filename): if not GITLAB: return github_repo_file(repo, tag, filename) else: + print("here") return gitlab_repo_file(repo, tag, filename) def git_release_assets(repo, tag, account_name=None, all_assets=False): @@ -75,11 +83,11 @@ def git_release_assets(repo, tag, account_name=None, all_assets=False): else: return gitlab_release_assets(repo, tag, all_assets) -def git_repo_filenames(repo, tag): +def git_account(repo): if not GITLAB: - return github_repo_filenames(repo, tag) + return github_account(repo) else: - return gitlab_repo_filenames(repo, tag) + return gitlab_account(repo) def git_repo_languages(repo): if not GITLAB: @@ -92,3 +100,9 @@ def git_asset_contents(asset): return github_asset_contents(asset) else: return gitlab_asset_contents(asset) + +def git_repo_contributors(repo): + if not GITLAB: + return github_repo_contributors(repo) + else: + return gitlab_repo_contributors(repo) \ No newline at end of file diff --git a/iga/gitlab.py b/iga/gitlab.py index 05fca46..665beeb 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -1,287 +1,128 @@ -import commonpy.exceptions -from commonpy.network_utils import net -import contextlib -from functools import cache -import json -import os -from sidetrack import log -from types import SimpleNamespace -import requests import rich_click as click -from urllib.parse import quote - - -from iga.exceptions import GitHubError, InternalError - -_BOT_NAME_WORDS = ['daemon', 'dependabot', 'dependabot[bot]'] -'''List of words such that, if one of the words is the last word in an account -name, mean the account will be assumed to be a software bot of some kind.''' - -API_PATH = '/api/v4/' -try: - ctx = click.get_current_context() - GITLAB = ctx.obj.get('gitlab', False) - GITLAB_URL = ctx.obj.get('gitlab_url', None) - API_URL = f'{GITLAB_URL}/api/v4' -except Exception as e: - log.error(f"Error getting GitLab API URL: {e}") - -class GitLabAPIError(Exception): - pass - -def _gitlab_get(endpoint, test_only=False): - headers = {'Accept': 'application/json'} - using_token = True#'GITLAB_TOKEN' in os.environ - if using_token: - headers['Authorization'] = f'Bearer glpat-3z9T1F3zNa7WNAaireqi' - method = 'head' if test_only else 'get' - try: - if method == 'HEAD': - response = requests.head(endpoint, headers=headers) - else: - response = requests.get(endpoint, headers=headers) - print(response.request.url) - print("reponse URL",response.status_code) - - if response.status_code == 401: - raise GitLabAPIError(f"Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}") - elif response.status_code == 429: - # Too Many Requests error - raise GitLabAPIError(f"Too Many Requests: Rate limit exceeded. Try again later. Endpoint: {endpoint}") - return response - - except requests.exceptions.RequestException as e: - # Handle connection errors or timeouts - raise GitLabAPIError(f"Request failed: {e}") from e - -@cache -def _object_for_gitlab(api_url, cls): - '''Return object of class cls made from the data obtained from the API url.''' - try: - response = _gitlab_get(api_url) - if not response: - return None - log(f'unpacking JSON into object structure from {api_url}') - - # Create the desired object & add the api url in case it's needed later. - obj = cls(response.json()) - print(response.json()) - obj.api_url = api_url - return obj - - except GitLabAPIError as e: - # Handle GitLab API specific errors - log(f'GitLab API Error: {e}') - raise InternalError('Encountered error trying to unpack GitLab data.') from e - - except Exception as ex: - # Handle other unexpected errors - log(f'Error: {ex}') - raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex - - -class GitLabAccount(SimpleNamespace): - '''Simple data structure corresponding to a GitHub user or org account.''' - def __init__(self, user_dict): - super().__init__(**user_dict) - if os.environ.get('IGA_RUN_MODE') == 'debug': - log('GitHub user data: ' + json.dumps(user_dict, indent=2)) - # Save the original data for debugging purposes. - self._json_dict = user_dict - -class GitLabAsset(SimpleNamespace): - '''Simple data structure corresponding to a GitHub file asset JSON object.''' - def __init__(self, asset_dict): - super().__init__(**asset_dict) - -class GitLabRelease(SimpleNamespace): - '''Simple data structure corresponding to a GitHub release JSON object.''' - def __init__(self, release_dict): - super().__init__(**release_dict) - if os.environ.get('IGA_RUN_MODE') == 'debug': - log('GitHub release data: ' + json.dumps(release_dict, indent=2)) - #self.author = GitLabAccount(release_dict['author']) - - # ... then convert the dict of the asset (which contains uploader). - #self.assets = [GitLabAsset(asset) for asset in self.assets] - # Save the original data for debugging purposes. - self._json_dict = release_dict - -class GitLabRepo(SimpleNamespace): - '''Simple data structure corresponding to a GitHub repository JSON object. - This object is enhanced with a "files" property that contains a list of - the files in the default branch of the repository.''' - - def __init__(self, repo_dict): - super().__init__(**repo_dict) - if os.environ.get('IGA_RUN_MODE') == 'debug': - log('GitHub repo data: ' + json.dumps(repo_dict, indent=2)) - if repo_dict.get('owner',{}): - self.owner = GitLabAccount(repo_dict['owner']) - #if repo_dict.get('organization'): - # self.organization = GitLabAccount(repo_dict['organization']) - if repo_dict.get('license'): - self.license = GitLabLicense(repo_dict['license']) - # Save the original data for debugging purposes. - self._json_dict = repo_dict - -class GitLabLicense(SimpleNamespace): - '''Simple data structure corresponding to a license object.''' - def __init__(self, license_dict): - super().__init__(**license_dict) - -class GitLabFile(SimpleNamespace): - '''Simple data structure corresponding to a file in a repo.''' - def __init__(self, file_dict): - super().__init__(**file_dict) - -def gitlab_release(repo_name, tag, test_only=False): - '''Return a Release object corresponding to the tagged release in GitHub. - - If test_only is True, only check existence; don't create a Release object. - ''' - endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' - if test_only: - log('testing for existence: ' + endpoint) - return _gitlab_get(endpoint, test_only) - - log('getting GitLab release data from ' + endpoint) - return _object_for_gitlab(endpoint, GitLabRelease) - -def gitlab_repo(repo_name, test_only=False): - '''Return a Repo object corresponding to the named repo in GitLab.''' - endpoint = f'{API_URL}/projects/{repo_name}?license=true' - print(endpoint) - if test_only: - log('testing for existence: ' + endpoint) - return _gitlab_get(endpoint, test_only) - - log('getting GitLab release data from ' + endpoint) - return _object_for_gitlab(endpoint, GitLabRelease) - -def gitlab_release_assets(repo_name, tag, all_assets): - '''Return a list of URLs for all the assets associated with the release.''' - - release = gitlab_release(repo_name, tag) - sources = release.assets.sources - assets = [] - for source in sources: - if not all_assets: - if source.format in ['zip']: - assets.append(source.url) - else: - log('option to get all assets is in effect') - assets.append(source.url) - return assets - -def gitlab_repo_filenames(repo, tag_name): - '''Return a list of filenames in the repo corresponding to the specified tag.''' - endpoint = f'{API_URL}/projects/{repo.id}/repository/tree' - response = _gitlab_get(endpoint) - if not response: - log(f'got no tree or it does not exist') - return '' - files = [res["path"] for res in response.json()] - return files - -def gitlab_repo_file(repo, tag_name, filename): - '''Return the text contents of the named file in the repo object. - - The tag_name must be a release tag, and is used to find the version of - the repository corresponding to that tag. - ''' - if filename in getattr(repo, '_files_contents', {}): - log(f'{filename} found in the files of {repo}') - return repo._files_contents[filename] - - endpoint = f'{API_URL}/projects/{repo.id}/repository/files/{filename}/raw?ref={tag_name}' - response = _gitlab_get(endpoint) - if not response: - log(f'got no content for file {filename} or it does not exist') - return '' - json_dict = response.json() - if json_dict['encoding'] != 'base64': - log(f'GitHub file encoding for {filename} is ' + json_dict['encoding']) - raise InternalError('Unimplemented file encoding ' + json_dict['encoding']) - import base64 - contents = base64.b64decode(json_dict['content']).decode() - if not getattr(repo, '_file_contents', {}): - repo._file_contents = {} - # Cache the file contents, so we don't have to get it from GitLab again. - repo._file_contents[filename] = contents - log(f'got contents for {filename} (length = {len(contents)} chars)') - return contents - -def gitlab_repo_languages(repo): - log(f'asking GitHub for list of languages for repo {repo.name}') - repolink = repo._links["self"] - endpoint = f'{repolink}/languages' - response = _gitlab_get(endpoint) - if not response: - log(f'got no content for list of languages for repo {repo}') - return '' - json_dict = response.json() - languages = json_dict.keys() if json_dict else [] - log(f'GitLab lists {len(languages)} languages for the repo') - return languages - -def gitlab_repo_contributers(repo): - repolink = repo._links.self - endpoint = f'{repolink}/repository/contributors' - response = _gitlab_get(endpoint) - if not response: - log(f'got no content for list of contributors for repo {repo}') - return [] - # The JSON data is a list containing a kind of minimal user info dict. - contributors = [] - for user_dict in response.json(): - contributors.append(gitlab_account(user_dict['login'])) - log(f'repo has {len(contributors)} contributors') - return contributors - -def gitlab_asset_contents(asset_url): - '''Return the raw contents of a release asset file.''' - try: - response = _gitlab_get(asset_url) - return response.content - except KeyboardInterrupt: - raise - except commonpy.exceptions.CommonPyException: - raise GitHubError(f'Failed to download GitHub asset at {asset_url}' - ' – either it does not exist or it is inaccessible.') - except Exception: - raise - -def valid_gitlab_release_url(url): - '''Check if the provided URL is a valid GitLab release endpoint.''' - print("HERE") - #return _gitlab_get(url, test_only=True) - return True - -def gitlab_account_repo_tag(release_url): - '''{gitlab_projectid}/releases/{tag}''' - from urllib.parse import urlparse - parsed = urlparse(release_url) - ctx = click.get_current_context() - ctx.ensure_object(dict) - ctx.obj['gitlab_url'] = parsed.hostname - path = parsed.path - path = path.rstrip('/') - tag = path.split('/')[-1] - y='/'.join(path.split('/')[:-1]) - project_id = y.rstrip("-/releases").lstrip('/') - from urllib.parse import quote - project_id = quote(project_id,safe='') - return ( None, project_id, tag) +from sidetrack import log +import os -def gitlab_account(account_name): - endpoint = f'{API_URL}/users/{account_name}' - result = _object_for_gitlab(endpoint, GitLabAccount) - if not result: - raise GitLabAPIError(f'Failed to get GitHub account data for {account_name}.' - ' This could be due to a number of causes. Please' - ' check that the account exists, and that the GitHub' - ' access token (if one is being used) is configured' - ' with appropriate permissions to grant access to' - ' user account data.') - return result +from iga.github import ( + github_account_repo_tag, + github_release, + github_repo, + github_release_assets, + valid_github_release_url, + github_repo_file, + github_repo_filenames, + github_repo_languages, + github_asset_contents, + github_account, + github_repo_contributors, + identity_from_github, + github_probable_bot +) +from iga.gitlab import ( + valid_gitlab_release_url, + gitlab_release_assets, + gitlab_repo, + gitlab_repo_file, + gitlab_release, + gitlab_account_repo_tag, + gitlab_repo_filenames, + gitlab_repo_languages, + gitlab_asset_contents, + gitlab_account, + gitlab_repo_contributors, + identity_from_gitlab, + gitlab_probable_bot +) +class LazyEnvBool: + def __init__(self, var_name): + self.var_name = var_name + + def __bool__(self): + return os.getenv(self.var_name, '').lower() == 'true' + + __nonzero__ = __bool__ # For Python 2 compatibility + +GITLAB = LazyEnvBool('GITLAB') + +def valid_release_url(release_url): + if not os.getenv('GITLAB'): + print("I am here") + return valid_github_release_url(release_url) + else: + return valid_gitlab_release_url(release_url) + +def git_account_repo_tag(release_url): + '''Return tuple (account, repo name, tag) based on the given web URL.''' + # Example URL: https://code.jlab.org/physdiv/jrdb/inveniordm_jlab/-/releases/0.1.0 + # Note this is not the same as the "release url" below. + if not GITLAB: + return github_account_repo_tag(release_url) + else: + return gitlab_account_repo_tag(release_url) + + +def git_release(repo_name, tag, account_name=None): + if not GITLAB: + return github_release(account_name, repo_name, tag) + else: + return gitlab_release(repo_name, tag) + +def git_repo(repo_name, account_name=None): + if not GITLAB: + return github_repo(account_name, repo_name) + else: + return gitlab_repo(repo_name) + +def git_repo_filenames(repo, tag): + if not GITLAB: + return github_repo_filenames(repo, tag) + else: + return gitlab_repo_filenames(repo, tag) + +def git_repo_file(repo, tag, filename): + if not GITLAB: + return github_repo_file(repo, tag, filename) + else: + print("here") + return gitlab_repo_file(repo, tag, filename) + +def git_release_assets(repo, tag, account_name=None, all_assets=False): + if not GITLAB: + return github_release_assets(account_name,repo, tag, all_assets) + else: + return gitlab_release_assets(repo, tag, all_assets) + +def git_account(repo): + if not GITLAB: + return github_account(repo) + else: + return gitlab_account(repo) + +def git_repo_languages(repo): + if not GITLAB: + return github_repo_languages(repo) + else: + return gitlab_repo_languages(repo) + +def git_asset_contents(asset): + if not GITLAB: + return github_asset_contents(asset) + else: + return gitlab_asset_contents(asset) + +def git_repo_contributors(repo): + if not GITLAB: + return github_repo_contributors(repo) + else: + return gitlab_repo_contributors(repo) + +def identity_from_git(account, role=None): + if GITLAB: + return identity_from_gitlab(account, role=None) + else: + return identity_from_github(account, role=role) + +def git_probable_bot(account): + if GITLAB: + return gitlab_probable_bot(account) + else: + return github_probable_bot(account) \ No newline at end of file diff --git a/iga/metadata.py b/iga/metadata.py index 00973c4..bacd353 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -53,35 +53,38 @@ git_repo_file, git_release, git_repo_filenames, - git_repo_languages + git_repo_languages, + git_account, + git_repo_contributors, + identity_from_git, + git_probable_bot ) from iga.github import ( - github_account, github_file_url, - github_release, - github_repo, - github_repo_contributors, - github_repo_file, - github_repo_languages, - GitHubError, - probable_bot, ) from iga.id_utils import detected_id, recognized_scheme from iga.name_utils import split_name, flattened_name from iga.reference import reference, RECOGNIZED_REFERENCE_SCHEMES from iga.text_utils import cleaned_text -import rich_click as click -ctx = click.get_current_context() -GITLAB = True +class LazyEnvBool: + def __init__(self, var_name): + self.var_name = var_name + + def __bool__(self): + return os.getenv(self.var_name, '').lower() == 'true' + + __nonzero__ = __bool__ # For Python 2 compatibility + +GITLAB = LazyEnvBool('GITLAB') FIELDS = [ -# "additional_descriptions", -# "additional_titles", -# "contributors", -# "creators", + "additional_descriptions", + "additional_titles", + "contributors", + "creators", "dates", -# "description", + "description", # "formats", # 2023-03-23 not clear we need this. Skip for now. "funding", "identifiers", @@ -163,7 +166,7 @@ def metadata_for_release(account_name, repo_name, tag, all_metadata): # https://github.com/citation-file-format/citation-file-format/blob/main/schema.json if name in filenames: try: - repo.cff = yaml.safe_load(github_repo_file(repo, tag, name)) + repo.cff = yaml.safe_load(git_repo_file(repo, tag, name)) except KeyboardInterrupt: raise except Exception as ex: # noqa PIE786 @@ -285,7 +288,7 @@ def add(item, role, summary): return deduplicated(descriptions) -def additional_titles(repo, include_all): +def additional_titles(repo, release, include_all): '''Return InvenioRDM "additional titles". https://inveniordm.docs.cern.ch/reference/metadata/#additional-titles-0-n ''' @@ -387,16 +390,25 @@ def contributors(repo, release, include_all): contributors.append(entity) else: log(f'skipping CodeMeta "contributor" {entity} who is in "authors"') - elif include_all and (repo_contributors := github_repo_contributors(repo)): - # If CodeMeta doesn't contain contributors, use the repo's, if any. - # Skip bot accounts. - for account in filterfalse(probable_bot, repo_contributors): - entity = _identity_from_github(account, 'other') - if not any(_entity_match(entity, author) for author in authors): - log(f'adding GitHub repo contributor {entity} as contributor(s)') - contributors.append(entity) - else: - log(f'skipping GitHub repo contributor {entity} who is in "authors"') + + elif include_all: + if not GITLAB: + if (repo_contributors := git_repo_contributors(repo)): + # If CodeMeta doesn't contain contributors, use the repo's, if any. + # Skip bot accounts. + for account in filterfalse(git_probable_bot, repo_contributors): + entity = identity_from_git(account, 'other') + if not any(_entity_match(entity, author) for author in authors): + log(f'adding GitHub repo contributor {entity} as contributor(s)') + contributors.append(entity) + else: + log(f'skipping GitHub repo contributor {entity} who is in "authors"') + else: + if (repo_contributors := git_repo_contributors(repo)): + if repo_contributors: + for author in authors: + print("author", author) + # We're getting data from multiple sources & we might have duplicates. # Deduplicate based on names & roles only. @@ -414,7 +426,7 @@ def contributors(repo, release, include_all): return result -def creators(repo, release, internal_call=False): +def creators(repo, release, include_all, internal_call=False): '''Return InvenioRDM "creators". https://inveniordm.docs.cern.ch/reference/metadata/#creators-1-n ''' @@ -427,7 +439,7 @@ def log_decision(text): # release data, so try them 1st. if authors := listified(repo.codemeta.get('author', [])): log_decision('CodeMeta "author" name(s)') - elif authors := repo.cff.get('author', []): + elif authors := repo.cff.get('authors', []): log_decision('CFF "author" name(s)') if authors: return deduplicated(_entity(x) for x in authors) @@ -436,8 +448,10 @@ def log_decision(text): # author first, followed by the repo owner. if identity := _release_author(release): log_decision('GitHub release author') - elif identity := _repo_owner(repo): + elif (GITLAB and hasattr(repo, 'owner')) and (identity := _repo_owner(repo)): log_decision('GitHub repo owner name') + elif not GITLAB and (identity := _repo_owner(repo)): + log_decision('GitHub repo owner name') if identity: return [identity] @@ -488,7 +502,7 @@ def dates(repo, release, include_all): return dates -def description(repo, release, internal_call=False): +def description(repo, release, include_all, internal_call=False): '''Return InvenioRDM "description". https://inveniordm.docs.cern.ch/reference/metadata/#description-0-1 ''' @@ -502,12 +516,13 @@ def description(repo, release, internal_call=False): # shown by GitHub in those cases, so we try other alternatives after this. # AP: gitlab release body no. - if release.body: - if internal_call: - return release.body.strip() - else: - log('adding GitHub release body text as "description"') - return html_from_md(release.body.strip()) + if not GITLAB: + if release.body: + if internal_call: + return release.body.strip() + else: + log('adding GitHub release body text as "description"') + return html_from_md(release.body.strip()) # CodeMeta releaseNotes can be either text or a URL. If it's a URL, it # often points to a NEWS or ChangeLog or similar file in their repo. @@ -1009,31 +1024,40 @@ def rights(repo, release, include_all): # We didn't recognize license info in the CodeMeta or cff files. # Look into the GitHub repo data to see if GitHub identified a license. - """ - license_url": "https://code.jlab.org/panta/hcana_container_doc/-/blob/main/LICENSE", - "license": { - "key": "apache-2.0", - "name": "Apache License 2.0", - "nickname": null, - "html_url": "https://www.apache.org/licenses/LICENSE-2.0", - "source_url": null - } - """ - if repo.license and repo.license.name != 'Other': - from iga.licenses import LICENSES - log('GitHub has provided license info for the repo – using those values') - spdx_id = repo.license.spdx_id - if spdx_id in INVENIO_LICENSES: - rights = {'id': spdx_id.lower()} + if not GITLAB: + if repo.license and repo.license.name != 'Other': + from iga.licenses import LICENSES + log('GitHub has provided license info for the repo – using those values') + spdx_id = repo.license.spdx_id + if spdx_id in INVENIO_LICENSES: + rights = {'id': spdx_id.lower()} + else: + rights = {'link': repo.license.url, + 'title': {'en': repo.license.name}} + if spdx_id in LICENSES and LICENSES[spdx_id].description: + log(f'adding our own description for license type {spdx_id}') + rights['description'] = {'en': LICENSES[spdx_id].description} + return [rights] else: - rights = {'link': repo.license.url, - 'title': {'en': repo.license.name}} - if spdx_id in LICENSES and LICENSES[spdx_id].description: - log(f'adding our own description for license type {spdx_id}') - rights['description'] = {'en': LICENSES[spdx_id].description} - return [rights] + log('GitHub did not provide license info for this repo') else: - log('GitHub did not provide license info for this repo') + if repo.license and repo.license["name"] != 'Other': + from iga.licenses import LICENSES + log('GitHub has provided license info for the repo – using those values') + key = repo.license['key'] + if key.upper() in INVENIO_LICENSES: + rights = {'id': key.lower()} + else: + rights = {'link': repo.license.url, + 'title': {'en': repo.license["name"]}} + if key in LICENSES and LICENSES[key].description: + log(f'adding our own description for license type {key}') + rights['description'] = {'en': LICENSES[key].description} + return [rights] + else: + log('GitHub did not provide license info for this repo') + + # GitHub didn't fill in the license info -- maybe it didn't recognize # the license or its format. Try to look for a license file ourselves. @@ -1245,7 +1269,7 @@ def _entity_from_string(data, role): 'type': 'organizational'}} elif account := _parsed_github_account(data): # It's the name of an account in GitHub. - result = _identity_from_github(account) + result = identity_from_git(account) else: # We have to parse a single string to guess whether it's the name of # a person or org, and if a person, to split the string into family @@ -1273,14 +1297,17 @@ def _entity_from_dict(data, role): # subset anyway because there's no place in Invenio records to put the rest. person = {} org = {} - type_ = data.get('@type', '') or data.get('type', '') + if not type_: + type_ = 'person' if type_.lower().strip() == 'person': # Deal with field name differences between CodeMeta & CFF. family = data.get('family-names', '') or data.get('familyName', '') given = data.get('given-names', '') or data.get('givenName', '') id = detected_id(data.get('@id', '')) # noqa A001 + if not id: + id = data.get("orcid",'') id_type = recognized_scheme(id) if not (family or given) and id_type == 'orcid': @@ -1378,70 +1405,29 @@ def _entity_match(first, second): and p1.get('given_name', '') == p2.get('given_name', '')) return False - def _release_author(release): # We can call GitHub's user data API, but it returns very little info # about a user (e.g.,, it gives a name but that name is not broken out # into family & given name), plus sometimes fields are empty. - account = github_account(release.author.login) #AP: release.author.username - return _identity_from_github(account) if account.name else None - + account_name = release.author["username"] if GITLAB else release.author.login + account = git_account(account_name) #AP: release.author.username + return identity_from_git(account) if account.name else None def _repo_owner(repo): - account = github_account(repo.owner.login) #AP: repo.owner.username or maybe deal with namespace.kind.group? - return _identity_from_github(account) - - -def _identity_from_github(account, role=None): - if account.type == 'User': - if account.name: - (given, family) = split_name(account.name) - person_or_org = {'given_name': given, - 'family_name': family, - 'type': 'personal'} - else: - # The GitHub account record has no name, and InvenioRDM won't pass - # a record without a family name. All we have is the login name. - person_or_org = {'given_name': '', - 'family_name': account.login, - 'type': 'personal'} - + if GITLAB: + account_name = repo.owner["username"] else: - name = account.name.strip() if account.name else '' - person_or_org = {'name': name, - 'type': 'organizational'} - result = {'person_or_org': person_or_org} - if account.company and account.type == 'User': - account.company = account.company.strip() - if account.company.startswith('@'): - # Some people write @foo to indicate org account "foo" in GitHub. - # Grab only the first token after the '@'. - log(f'company for {account.login} account starts with @') - try: - import re - candidate = re.search(r'\w+', account.company).group() - org_account = github_account(candidate) - except GitHubError: - # No luck. Take it as-is. - log(f'failed to find {account.company[1:]} as a GitHub account') - result['affiliations'] = [{'name': account.company}] - else: - log(f'using org {candidate} as affiliation for {account.name}') - result['affiliations'] = [{'name': org_account.name}] - else: - result['affiliations'] = [{'name': account.company}] - if role: - result['role'] = {'id': role} - return result - + account_name = repo.owner.login + account = git_account(account_name) #AP: repo.owner.username or maybe deal with namespace.kind.group? + return identity_from_git(account) def _parsed_github_account(data): if data.startswith('https://github.com'): # Might be the URL to an account page on GitHub. tail = data.replace('https://github.com/', '') - if '/' not in tail and (account := github_account(tail)): + if '/' not in tail and (account := git_account(tail)): return account - elif len(data.split()) == 1 and (account := github_account(data)): + elif len(data.split()) == 1 and (account := git_account(data)): return account return None From e8fa948a01095a45085ecb3b69079a37293ec00e Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 14:57:12 -0400 Subject: [PATCH 06/17] finalize --- iga/githublab.py | 40 +++-- iga/gitlab.py | 427 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 341 insertions(+), 126 deletions(-) diff --git a/iga/githublab.py b/iga/githublab.py index e73c658..665beeb 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -13,7 +13,9 @@ github_repo_languages, github_asset_contents, github_account, - github_repo_contributors + github_repo_contributors, + identity_from_github, + github_probable_bot ) from iga.gitlab import ( valid_gitlab_release_url, @@ -26,18 +28,24 @@ gitlab_repo_languages, gitlab_asset_contents, gitlab_account, - gitlab_repo_contributors + gitlab_repo_contributors, + identity_from_gitlab, + gitlab_probable_bot ) -try: - if os.environ["GITLAB"]: - GITLAB = True -except Exception as e: - log(f"Error getting GitLab API URL: {e}") +class LazyEnvBool: + def __init__(self, var_name): + self.var_name = var_name -GITLAB = True + def __bool__(self): + return os.getenv(self.var_name, '').lower() == 'true' + + __nonzero__ = __bool__ # For Python 2 compatibility + +GITLAB = LazyEnvBool('GITLAB') def valid_release_url(release_url): - if not GITLAB: + if not os.getenv('GITLAB'): + print("I am here") return valid_github_release_url(release_url) else: return valid_gitlab_release_url(release_url) @@ -105,4 +113,16 @@ def git_repo_contributors(repo): if not GITLAB: return github_repo_contributors(repo) else: - return gitlab_repo_contributors(repo) \ No newline at end of file + return gitlab_repo_contributors(repo) + +def identity_from_git(account, role=None): + if GITLAB: + return identity_from_gitlab(account, role=None) + else: + return identity_from_github(account, role=role) + +def git_probable_bot(account): + if GITLAB: + return gitlab_probable_bot(account) + else: + return github_probable_bot(account) \ No newline at end of file diff --git a/iga/gitlab.py b/iga/gitlab.py index 665beeb..637a533 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -1,128 +1,323 @@ -import rich_click as click -from sidetrack import log +import commonpy.exceptions +from commonpy.network_utils import net +import contextlib +from functools import cache +import json import os +from sidetrack import log +from types import SimpleNamespace +import requests +import rich_click as click +from urllib.parse import quote -from iga.github import ( - github_account_repo_tag, - github_release, - github_repo, - github_release_assets, - valid_github_release_url, - github_repo_file, - github_repo_filenames, - github_repo_languages, - github_asset_contents, - github_account, - github_repo_contributors, - identity_from_github, - github_probable_bot -) -from iga.gitlab import ( - valid_gitlab_release_url, - gitlab_release_assets, - gitlab_repo, - gitlab_repo_file, - gitlab_release, - gitlab_account_repo_tag, - gitlab_repo_filenames, - gitlab_repo_languages, - gitlab_asset_contents, - gitlab_account, - gitlab_repo_contributors, - identity_from_gitlab, - gitlab_probable_bot -) -class LazyEnvBool: - def __init__(self, var_name): - self.var_name = var_name - - def __bool__(self): - return os.getenv(self.var_name, '').lower() == 'true' - - __nonzero__ = __bool__ # For Python 2 compatibility - -GITLAB = LazyEnvBool('GITLAB') - -def valid_release_url(release_url): - if not os.getenv('GITLAB'): - print("I am here") - return valid_github_release_url(release_url) - else: - return valid_gitlab_release_url(release_url) - -def git_account_repo_tag(release_url): - '''Return tuple (account, repo name, tag) based on the given web URL.''' - # Example URL: https://code.jlab.org/physdiv/jrdb/inveniordm_jlab/-/releases/0.1.0 - # Note this is not the same as the "release url" below. - if not GITLAB: - return github_account_repo_tag(release_url) - else: - return gitlab_account_repo_tag(release_url) - -def git_release(repo_name, tag, account_name=None): - if not GITLAB: - return github_release(account_name, repo_name, tag) - else: - return gitlab_release(repo_name, tag) +from iga.exceptions import GitHubError, InternalError +from iga.name_utils import split_name -def git_repo(repo_name, account_name=None): - if not GITLAB: - return github_repo(account_name, repo_name) - else: - return gitlab_repo(repo_name) -def git_repo_filenames(repo, tag): - if not GITLAB: - return github_repo_filenames(repo, tag) - else: - return gitlab_repo_filenames(repo, tag) +_BOT_NAME_WORDS = ['daemon', 'dependabot', 'dependabot[bot]'] +'''List of words such that, if one of the words is the last word in an account +name, mean the account will be assumed to be a software bot of some kind.''' -def git_repo_file(repo, tag, filename): - if not GITLAB: - return github_repo_file(repo, tag, filename) - else: - print("here") - return gitlab_repo_file(repo, tag, filename) - -def git_release_assets(repo, tag, account_name=None, all_assets=False): - if not GITLAB: - return github_release_assets(account_name,repo, tag, all_assets) - else: - return gitlab_release_assets(repo, tag, all_assets) +API_PATH = '/api/v4/' +GITLAB_URL = 'https://code.jlab.org' +API_URL = f'{GITLAB_URL}/api/v4' -def git_account(repo): - if not GITLAB: - return github_account(repo) - else: - return gitlab_account(repo) +class GitLabAPIError(Exception): + pass -def git_repo_languages(repo): - if not GITLAB: - return github_repo_languages(repo) - else: - return gitlab_repo_languages(repo) +def _gitlab_get(endpoint, test_only=False): + headers = {'Accept': 'application/json'} + using_token = True#'GITLAB_TOKEN' in os.environ + if using_token: + headers['Authorization'] = f'Bearer glpat-3z9T1F3zNa7WNAaireqi' + method = 'head' if test_only else 'get' + try: + if method == 'HEAD': + response = requests.head(endpoint, headers=headers) + else: + response = requests.get(endpoint, headers=headers) -def git_asset_contents(asset): - if not GITLAB: - return github_asset_contents(asset) - else: - return gitlab_asset_contents(asset) - -def git_repo_contributors(repo): - if not GITLAB: - return github_repo_contributors(repo) - else: - return gitlab_repo_contributors(repo) + if response.status_code == 401: + raise GitLabAPIError(f"Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}") + elif response.status_code == 429: + # Too Many Requests error + raise GitLabAPIError(f"Too Many Requests: Rate limit exceeded. Try again later. Endpoint: {endpoint}") + return response -def identity_from_git(account, role=None): - if GITLAB: - return identity_from_gitlab(account, role=None) - else: - return identity_from_github(account, role=role) + except requests.exceptions.RequestException as e: + # Handle connection errors or timeouts + raise GitLabAPIError(f"Request failed: {e}") from e + +@cache +def _object_for_gitlab(api_url, cls): + '''Return object of class cls made from the data obtained from the API url.''' + try: + response = _gitlab_get(api_url) + if not response: + return None + log(f'unpacking JSON into object structure from {api_url}') + + # Create the desired object & add the api url in case it's needed later. + obj = cls(response.json()) + print(response.json()) + obj.api_url = api_url + return obj + + except GitLabAPIError as e: + # Handle GitLab API specific errors + log(f'GitLab API Error: {e}') + raise InternalError('Encountered error trying to unpack GitLab data.') from e + + except Exception as ex: + # Handle other unexpected errors + log(f'Error: {ex}') + raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex + + +class GitLabAccount(SimpleNamespace): + '''Simple data structure corresponding to a GitHub user or org account.''' + def __init__(self, user_dict): + super().__init__(**user_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub user data: ' + json.dumps(user_dict, indent=2)) + # Save the original data for debugging purposes. + self._json_dict = user_dict + +class GitLabAsset(SimpleNamespace): + '''Simple data structure corresponding to a GitHub file asset JSON object.''' + def __init__(self, asset_dict): + super().__init__(**asset_dict) + +class GitLabRelease(SimpleNamespace): + '''Simple data structure corresponding to a GitHub release JSON object.''' + def __init__(self, release_dict): + super().__init__(**release_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub release data: ' + json.dumps(release_dict, indent=2)) + print(release_dict) + if release_dict.get('owner',{}): + self.author = GitLabAccount(release_dict['owner']) + + # ... then convert the dict of the asset (which contains uploader). + #self.assets = [GitLabAsset(asset) for asset in self.assets] + # Save the original data for debugging purposes. + self._json_dict = release_dict + +class GitLabRepo(SimpleNamespace): + '''Simple data structure corresponding to a GitHub repository JSON object. + This object is enhanced with a "files" property that contains a list of + the files in the default branch of the repository.''' + + def __init__(self, repo_dict): + super().__init__(**repo_dict) + if os.environ.get('IGA_RUN_MODE') == 'debug': + log('GitHub repo data: ' + json.dumps(repo_dict, indent=2)) + if repo_dict.get('owner',{}): + self.author = GitLabAccount(repo_dict['owner']) + #if repo_dict.get('organization'): + # self.organization = GitLabAccount(repo_dict['organization']) + print(repo_dict) + if repo_dict.get('license'): + self.license = GitLabLicense(repo_dict['license']) + # Save the original data for debugging purposes. + self._json_dict = repo_dict + +class GitLabLicense(SimpleNamespace): + '''Simple data structure corresponding to a license object.''' + def __init__(self, license_dict): + super().__init__(**license_dict) + +class GitLabFile(SimpleNamespace): + '''Simple data structure corresponding to a file in a repo.''' + def __init__(self, file_dict): + super().__init__(**file_dict) + +def gitlab_release(repo_name, tag, test_only=False): + '''Return a Release object corresponding to the tagged release in GitHub. + + If test_only is True, only check existence; don't create a Release object. + ''' + endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' + if test_only: + log('testing for existence: ' + endpoint) + return _gitlab_get(endpoint, test_only) + + log('getting GitLab release data from ' + endpoint) + return _object_for_gitlab(endpoint, GitLabRelease) + +def gitlab_repo(repo_name, test_only=False): + '''Return a Repo object corresponding to the named repo in GitLab.''' + endpoint = f'{API_URL}/projects/{repo_name}?license=true' + if test_only: + log('testing for existence: ' + endpoint) + return _gitlab_get(endpoint, test_only) + + log('getting GitLab release data from ' + endpoint) + return _object_for_gitlab(endpoint, GitLabRelease) + +def gitlab_release_assets(repo_name, tag, all_assets): + '''Return a list of URLs for all the assets associated with the release.''' + + release = gitlab_release(repo_name, tag) + sources = release.assets["sources"] + assets = [] + for source in sources: + if not all_assets: + if source["format"] in ['zip']: + assets.append(source["url"]) + else: + log('option to get all assets is in effect') + assets.append(source["url"]) + return assets + +def gitlab_repo_filenames(repo, tag_name): + '''Return a list of filenames in the repo corresponding to the specified tag.''' + endpoint = f'{API_URL}/projects/{repo.id}/repository/tree' + response = _gitlab_get(endpoint) + if not response: + log(f'got no tree or it does not exist') + return '' + files = [res["path"] for res in response.json()] + return files + +def gitlab_repo_file(repo, tag_name, filename): + '''Return the text contents of the named file in the repo object. + + The tag_name must be a release tag, and is used to find the version of + the repository corresponding to that tag. + ''' + if filename in getattr(repo, '_files_contents', {}): + log(f'{filename} found in the files of {repo}') + return repo._files_contents[filename] + + endpoint = f'{API_URL}/projects/{repo.id}/repository/files/{filename}?ref={tag_name}' + print(endpoint) + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for file {filename} or it does not exist') + return '' + json_dict = response.json() + if json_dict['encoding'] != 'base64': + log(f'GitHub file encoding for {filename} is ' + json_dict['encoding']) + raise InternalError('Unimplemented file encoding ' + json_dict['encoding']) + import base64 + contents = base64.b64decode(json_dict['content']).decode() + if not getattr(repo, '_file_contents', {}): + repo._file_contents = {} + # Cache the file contents, so we don't have to get it from GitLab again. + repo._file_contents[filename] = contents + log(f'got contents for {filename} (length = {len(contents)} chars)') + return contents + +def gitlab_repo_languages(repo): + log(f'asking GitHub for list of languages for repo {repo.name}') + repolink = repo._links["self"] + endpoint = f'{repolink}/languages' + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for list of languages for repo {repo}') + return '' + json_dict = response.json() + languages = json_dict.keys() if json_dict else [] + log(f'GitLab lists {len(languages)} languages for the repo') + return languages + +def gitlab_asset_contents(asset_url): + '''Return the raw contents of a release asset file.''' + try: + response = _gitlab_get(asset_url) + return response.content + except KeyboardInterrupt: + raise + except commonpy.exceptions.CommonPyException: + raise GitHubError(f'Failed to download GitHub asset at {asset_url}' + ' – either it does not exist or it is inaccessible.') + except Exception: + raise + +def valid_gitlab_release_url(url): + '''Check if the provided URL is a valid GitLab release endpoint.''' + #return _gitlab_get(url, test_only=True) + return True + +def gitlab_account_repo_tag(release_url): + '''{gitlab_projectid}/releases/{tag}''' + from urllib.parse import urlparse + parsed = urlparse(release_url) + ctx = click.get_current_context() + ctx.ensure_object(dict) + ctx.obj['gitlab_url'] = parsed.hostname + path = parsed.path + path = path.rstrip('/') + tag = path.split('/')[-1] + y='/'.join(path.split('/')[:-1]) + project_id = y.rstrip("-/releases").lstrip('/') + from urllib.parse import quote + project_id = quote(project_id,safe='') + return ( None, project_id, tag) + +def gitlab_account(account_name): + endpoint = f'{API_URL}/users?username={account_name}' #without_project_bots=true + try: + response = _gitlab_get(endpoint) + if not response: + return None + log(f'unpacking JSON into object structure from {endpoint}') + + # Create the desired object & add the api url in case it's needed later. + jsn_response= response.json() + obj = GitLabAccount(jsn_response[0]) + + obj.api_url = endpoint + return obj + + except GitLabAPIError as e: + # Handle GitLab API specific errors + log(f'GitLab API Error: {e}') + raise InternalError('Encountered error trying to unpack GitLab data.') from e + + except Exception as ex: + # Handle other unexpected errors + log(f'Error: {ex}') + raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex -def git_probable_bot(account): - if GITLAB: - return gitlab_probable_bot(account) +def gitlab_repo_contributors(repo): + repolink = repo._links["self"] + endpoint = f'{repolink}/repository/contributors' + response = _gitlab_get(endpoint) + if not response: + log(f'got no content for list of contributors for repo {repo}') + return [] + # The JSON data is a list containing a kind of minimal user info dict. + contributors = [] + for user_dict in response.json(): + new_contributor_name = user_dict["name"] + (given, family) = split_name(new_contributor_name) + person_or_org = {'given_name': given, + 'family_name': family, + 'type': 'personal'} + contributors.append(person_or_org) + log(f'repo has {len(contributors)} contributors') + return contributors + +def identity_from_gitlab(account, role=None): + if account.name: + (given, family) = split_name(account.name) + person_or_org = {'given_name': given, + 'family_name': family, + 'type': 'personal'} else: - return github_probable_bot(account) \ No newline at end of file + # The GitHub account record has no name, and InvenioRDM won't pass + # a record without a family name. All we have is the login name. + person_or_org = {'given_name': '', + 'family_name': account.username, + 'type': 'personal'} + result = {'person_or_org': person_or_org} + return result + +def gitlab_probable_bot(account): + #users?username=panta #res[0]["id"] + #users/res[0]["id"] + is_bot = False + return is_bot From 679c1bd55fb0435b0fff4f3baab499f140d9e74b Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 15:51:02 -0400 Subject: [PATCH 07/17] finalize --- iga/githublab.py | 43 ++++++---- iga/gitlab.py | 204 +++++++++++++++++++++++++++-------------------- iga/metadata.py | 45 +++++++++++ 3 files changed, 189 insertions(+), 103 deletions(-) diff --git a/iga/githublab.py b/iga/githublab.py index 665beeb..8452825 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -1,5 +1,3 @@ -import rich_click as click -from sidetrack import log import os from iga.github import ( @@ -15,7 +13,7 @@ github_account, github_repo_contributors, identity_from_github, - github_probable_bot + github_probable_bot, ) from iga.gitlab import ( valid_gitlab_release_url, @@ -30,35 +28,39 @@ gitlab_account, gitlab_repo_contributors, identity_from_gitlab, - gitlab_probable_bot ) + + class LazyEnvBool: def __init__(self, var_name): self.var_name = var_name def __bool__(self): - return os.getenv(self.var_name, '').lower() == 'true' + return os.getenv(self.var_name, "").lower() == "true" __nonzero__ = __bool__ # For Python 2 compatibility -GITLAB = LazyEnvBool('GITLAB') + +GITLAB = LazyEnvBool("GITLAB") + def valid_release_url(release_url): - if not os.getenv('GITLAB'): + if not os.getenv("GITLAB"): print("I am here") return valid_github_release_url(release_url) else: return valid_gitlab_release_url(release_url) + def git_account_repo_tag(release_url): - '''Return tuple (account, repo name, tag) based on the given web URL.''' + """Return tuple (account, repo name, tag) based on the given web URL.""" # Example URL: https://code.jlab.org/physdiv/jrdb/inveniordm_jlab/-/releases/0.1.0 # Note this is not the same as the "release url" below. if not GITLAB: return github_account_repo_tag(release_url) else: return gitlab_account_repo_tag(release_url) - + def git_release(repo_name, tag, account_name=None): if not GITLAB: @@ -66,63 +68,70 @@ def git_release(repo_name, tag, account_name=None): else: return gitlab_release(repo_name, tag) + def git_repo(repo_name, account_name=None): if not GITLAB: return github_repo(account_name, repo_name) else: return gitlab_repo(repo_name) + def git_repo_filenames(repo, tag): if not GITLAB: return github_repo_filenames(repo, tag) else: return gitlab_repo_filenames(repo, tag) + def git_repo_file(repo, tag, filename): if not GITLAB: return github_repo_file(repo, tag, filename) else: print("here") return gitlab_repo_file(repo, tag, filename) - + + def git_release_assets(repo, tag, account_name=None, all_assets=False): if not GITLAB: - return github_release_assets(account_name,repo, tag, all_assets) + return github_release_assets(account_name, repo, tag, all_assets) else: return gitlab_release_assets(repo, tag, all_assets) + def git_account(repo): if not GITLAB: return github_account(repo) else: return gitlab_account(repo) + def git_repo_languages(repo): if not GITLAB: return github_repo_languages(repo) else: return gitlab_repo_languages(repo) + def git_asset_contents(asset): if not GITLAB: return github_asset_contents(asset) else: return gitlab_asset_contents(asset) - + + def git_repo_contributors(repo): if not GITLAB: return github_repo_contributors(repo) else: return gitlab_repo_contributors(repo) + def identity_from_git(account, role=None): if GITLAB: return identity_from_gitlab(account, role=None) else: return identity_from_github(account, role=role) - + + def git_probable_bot(account): - if GITLAB: - return gitlab_probable_bot(account) - else: - return github_probable_bot(account) \ No newline at end of file + return github_probable_bot(account) diff --git a/iga/gitlab.py b/iga/gitlab.py index 637a533..42d0e32 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -1,36 +1,44 @@ import commonpy.exceptions -from commonpy.network_utils import net -import contextlib -from functools import cache +from functools import cache import json import os -from sidetrack import log -from types import SimpleNamespace +from sidetrack import log +from types import SimpleNamespace import requests -import rich_click as click -from urllib.parse import quote - +from urllib.parse import quote, urlparse from iga.exceptions import GitHubError, InternalError from iga.name_utils import split_name -_BOT_NAME_WORDS = ['daemon', 'dependabot', 'dependabot[bot]'] -'''List of words such that, if one of the words is the last word in an account -name, mean the account will be assumed to be a software bot of some kind.''' +class LazyEnvStr: + def __init__(self, var_name): + self.var_name = var_name + self._value = None + + @property + def value(self): + if self._value is None: + val = os.getenv(self.var_name, '') + self._value = f'{val}/api/v4' + return self._value + + def __str__(self): + return self.value + + +API_URL = LazyEnvStr('GITLAB_URL') -API_PATH = '/api/v4/' -GITLAB_URL = 'https://code.jlab.org' -API_URL = f'{GITLAB_URL}/api/v4' class GitLabAPIError(Exception): pass + def _gitlab_get(endpoint, test_only=False): headers = {'Accept': 'application/json'} - using_token = True#'GITLAB_TOKEN' in os.environ + using_token = 'glpat-3z9T1F3zNa7WNAaireqi' #'GITLAB_TOKEN' in os.environ if using_token: - headers['Authorization'] = f'Bearer glpat-3z9T1F3zNa7WNAaireqi' + headers['Authorization'] = f'Bearer {using_token}' method = 'head' if test_only else 'get' try: if method == 'HEAD': @@ -39,19 +47,24 @@ def _gitlab_get(endpoint, test_only=False): response = requests.get(endpoint, headers=headers) if response.status_code == 401: - raise GitLabAPIError(f"Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}") + raise GitLabAPIError( + f'Unauthorized: Check your GitLab token or permissions. Endpoint: {endpoint}' + ) elif response.status_code == 429: # Too Many Requests error - raise GitLabAPIError(f"Too Many Requests: Rate limit exceeded. Try again later. Endpoint: {endpoint}") + raise GitLabAPIError( + f'Too Many Requests: Rate limit exceeded. Try again later. Endpoint: {endpoint}' + ) return response except requests.exceptions.RequestException as e: # Handle connection errors or timeouts - raise GitLabAPIError(f"Request failed: {e}") from e + raise GitLabAPIError(f'Request failed: {e}') from e + @cache def _object_for_gitlab(api_url, cls): - '''Return object of class cls made from the data obtained from the API url.''' + """Return object of class cls made from the data obtained from the API url.""" try: response = _gitlab_get(api_url) if not response: @@ -72,11 +85,14 @@ def _object_for_gitlab(api_url, cls): except Exception as ex: # Handle other unexpected errors log(f'Error: {ex}') - raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex + raise InternalError( + 'Encountered unexpected error trying to unpack GitLab data.' + ) from ex class GitLabAccount(SimpleNamespace): - '''Simple data structure corresponding to a GitHub user or org account.''' + """Simple data structure corresponding to a GitHub user or org account.""" + def __init__(self, user_dict): super().__init__(**user_dict) if os.environ.get('IGA_RUN_MODE') == 'debug': @@ -84,38 +100,43 @@ def __init__(self, user_dict): # Save the original data for debugging purposes. self._json_dict = user_dict + class GitLabAsset(SimpleNamespace): - '''Simple data structure corresponding to a GitHub file asset JSON object.''' + """Simple data structure corresponding to a GitHub file asset JSON object.""" + def __init__(self, asset_dict): super().__init__(**asset_dict) + class GitLabRelease(SimpleNamespace): - '''Simple data structure corresponding to a GitHub release JSON object.''' + """Simple data structure corresponding to a GitHub release JSON object.""" + def __init__(self, release_dict): super().__init__(**release_dict) if os.environ.get('IGA_RUN_MODE') == 'debug': log('GitHub release data: ' + json.dumps(release_dict, indent=2)) print(release_dict) - if release_dict.get('owner',{}): + if release_dict.get('owner', {}): self.author = GitLabAccount(release_dict['owner']) # ... then convert the dict of the asset (which contains uploader). - #self.assets = [GitLabAsset(asset) for asset in self.assets] + # self.assets = [GitLabAsset(asset) for asset in self.assets] # Save the original data for debugging purposes. self._json_dict = release_dict + class GitLabRepo(SimpleNamespace): - '''Simple data structure corresponding to a GitHub repository JSON object. + """Simple data structure corresponding to a GitHub repository JSON object. This object is enhanced with a "files" property that contains a list of - the files in the default branch of the repository.''' + the files in the default branch of the repository.""" def __init__(self, repo_dict): super().__init__(**repo_dict) if os.environ.get('IGA_RUN_MODE') == 'debug': log('GitHub repo data: ' + json.dumps(repo_dict, indent=2)) - if repo_dict.get('owner',{}): + if repo_dict.get('owner', {}): self.author = GitLabAccount(repo_dict['owner']) - #if repo_dict.get('organization'): + # if repo_dict.get('organization'): # self.organization = GitLabAccount(repo_dict['organization']) print(repo_dict) if repo_dict.get('license'): @@ -123,21 +144,26 @@ def __init__(self, repo_dict): # Save the original data for debugging purposes. self._json_dict = repo_dict + class GitLabLicense(SimpleNamespace): - '''Simple data structure corresponding to a license object.''' + """Simple data structure corresponding to a license object.""" + def __init__(self, license_dict): super().__init__(**license_dict) + class GitLabFile(SimpleNamespace): - '''Simple data structure corresponding to a file in a repo.''' + """Simple data structure corresponding to a file in a repo.""" + def __init__(self, file_dict): super().__init__(**file_dict) + def gitlab_release(repo_name, tag, test_only=False): - '''Return a Release object corresponding to the tagged release in GitHub. + """Return a Release object corresponding to the tagged release in GitHub. If test_only is True, only check existence; don't create a Release object. - ''' + """ endpoint = f'{API_URL}/projects/{repo_name}/releases/{tag}' if test_only: log('testing for existence: ' + endpoint) @@ -146,8 +172,9 @@ def gitlab_release(repo_name, tag, test_only=False): log('getting GitLab release data from ' + endpoint) return _object_for_gitlab(endpoint, GitLabRelease) + def gitlab_repo(repo_name, test_only=False): - '''Return a Repo object corresponding to the named repo in GitLab.''' + """Return a Repo object corresponding to the named repo in GitLab.""" endpoint = f'{API_URL}/projects/{repo_name}?license=true' if test_only: log('testing for existence: ' + endpoint) @@ -156,44 +183,49 @@ def gitlab_repo(repo_name, test_only=False): log('getting GitLab release data from ' + endpoint) return _object_for_gitlab(endpoint, GitLabRelease) + def gitlab_release_assets(repo_name, tag, all_assets): - '''Return a list of URLs for all the assets associated with the release.''' + """Return a list of URLs for all the assets associated with the release.""" release = gitlab_release(repo_name, tag) - sources = release.assets["sources"] + sources = release.assets['sources'] assets = [] for source in sources: if not all_assets: - if source["format"] in ['zip']: - assets.append(source["url"]) + if source['format'] in ['zip']: + assets.append(source['url']) else: log('option to get all assets is in effect') - assets.append(source["url"]) + assets.append(source['url']) return assets + def gitlab_repo_filenames(repo, tag_name): - '''Return a list of filenames in the repo corresponding to the specified tag.''' + """Return a list of filenames in the repo corresponding to the specified tag.""" endpoint = f'{API_URL}/projects/{repo.id}/repository/tree' - response = _gitlab_get(endpoint) + response = _gitlab_get(endpoint) if not response: - log(f'got no tree or it does not exist') + log('got no tree or it does not exist') return '' - files = [res["path"] for res in response.json()] + files = [res['path'] for res in response.json()] return files + def gitlab_repo_file(repo, tag_name, filename): - '''Return the text contents of the named file in the repo object. + """Return the text contents of the named file in the repo object. The tag_name must be a release tag, and is used to find the version of the repository corresponding to that tag. - ''' + """ if filename in getattr(repo, '_files_contents', {}): log(f'{filename} found in the files of {repo}') return repo._files_contents[filename] - endpoint = f'{API_URL}/projects/{repo.id}/repository/files/{filename}?ref={tag_name}' + endpoint = ( + f'{API_URL}/projects/{repo.id}/repository/files/{filename}?ref={tag_name}' + ) print(endpoint) - response = _gitlab_get(endpoint) + response = _gitlab_get(endpoint) if not response: log(f'got no content for file {filename} or it does not exist') return '' @@ -202,6 +234,7 @@ def gitlab_repo_file(repo, tag_name, filename): log(f'GitHub file encoding for {filename} is ' + json_dict['encoding']) raise InternalError('Unimplemented file encoding ' + json_dict['encoding']) import base64 + contents = base64.b64decode(json_dict['content']).decode() if not getattr(repo, '_file_contents', {}): repo._file_contents = {} @@ -210,9 +243,10 @@ def gitlab_repo_file(repo, tag_name, filename): log(f'got contents for {filename} (length = {len(contents)} chars)') return contents + def gitlab_repo_languages(repo): log(f'asking GitHub for list of languages for repo {repo.name}') - repolink = repo._links["self"] + repolink = repo._links['self'] endpoint = f'{repolink}/languages' response = _gitlab_get(endpoint) if not response: @@ -223,42 +257,44 @@ def gitlab_repo_languages(repo): log(f'GitLab lists {len(languages)} languages for the repo') return languages + def gitlab_asset_contents(asset_url): - '''Return the raw contents of a release asset file.''' + """Return the raw contents of a release asset file.""" try: response = _gitlab_get(asset_url) return response.content except KeyboardInterrupt: raise except commonpy.exceptions.CommonPyException: - raise GitHubError(f'Failed to download GitHub asset at {asset_url}' - ' – either it does not exist or it is inaccessible.') + raise GitHubError( + f'Failed to download GitHub asset at {asset_url}' + ' – either it does not exist or it is inaccessible.' + ) except Exception: raise + def valid_gitlab_release_url(url): - '''Check if the provided URL is a valid GitLab release endpoint.''' - #return _gitlab_get(url, test_only=True) + """Check if the provided URL is a valid GitLab release endpoint.""" + # return _gitlab_get(url, test_only=True) return True + def gitlab_account_repo_tag(release_url): - '''{gitlab_projectid}/releases/{tag}''' - from urllib.parse import urlparse + """{gitlab_projectid}/releases/{tag}""" parsed = urlparse(release_url) - ctx = click.get_current_context() - ctx.ensure_object(dict) - ctx.obj['gitlab_url'] = parsed.hostname + os.environ['GITLAB_URL'] = f'{parsed.scheme}://{parsed.netloc}' path = parsed.path path = path.rstrip('/') tag = path.split('/')[-1] - y='/'.join(path.split('/')[:-1]) - project_id = y.rstrip("-/releases").lstrip('/') - from urllib.parse import quote - project_id = quote(project_id,safe='') - return ( None, project_id, tag) + y = '/'.join(path.split('/')[:-1]) + project_id = y.rstrip('-/releases').lstrip('/') + project_id = quote(project_id, safe='') + return (None, project_id, tag) + def gitlab_account(account_name): - endpoint = f'{API_URL}/users?username={account_name}' #without_project_bots=true + endpoint = f'{API_URL}/users?username={account_name}' # without_project_bots=true try: response = _gitlab_get(endpoint) if not response: @@ -266,9 +302,9 @@ def gitlab_account(account_name): log(f'unpacking JSON into object structure from {endpoint}') # Create the desired object & add the api url in case it's needed later. - jsn_response= response.json() + jsn_response = response.json() obj = GitLabAccount(jsn_response[0]) - + obj.api_url = endpoint return obj @@ -280,10 +316,13 @@ def gitlab_account(account_name): except Exception as ex: # Handle other unexpected errors log(f'Error: {ex}') - raise InternalError('Encountered unexpected error trying to unpack GitLab data.') from ex - + raise InternalError( + 'Encountered unexpected error trying to unpack GitLab data.' + ) from ex + + def gitlab_repo_contributors(repo): - repolink = repo._links["self"] + repolink = repo._links['self'] endpoint = f'{repolink}/repository/contributors' response = _gitlab_get(endpoint) if not response: @@ -292,32 +331,25 @@ def gitlab_repo_contributors(repo): # The JSON data is a list containing a kind of minimal user info dict. contributors = [] for user_dict in response.json(): - new_contributor_name = user_dict["name"] + new_contributor_name = user_dict['name'] (given, family) = split_name(new_contributor_name) - person_or_org = {'given_name': given, - 'family_name': family, - 'type': 'personal'} + person_or_org = {'given_name': given, 'family_name': family, 'type': 'personal'} contributors.append(person_or_org) log(f'repo has {len(contributors)} contributors') return contributors + def identity_from_gitlab(account, role=None): if account.name: (given, family) = split_name(account.name) - person_or_org = {'given_name': given, - 'family_name': family, - 'type': 'personal'} + person_or_org = {'given_name': given, 'family_name': family, 'type': 'personal'} else: # The GitHub account record has no name, and InvenioRDM won't pass # a record without a family name. All we have is the login name. - person_or_org = {'given_name': '', - 'family_name': account.username, - 'type': 'personal'} + person_or_org = { + 'given_name': '', + 'family_name': account.username, + 'type': 'personal', + } result = {'person_or_org': person_or_org} return result - -def gitlab_probable_bot(account): - #users?username=panta #res[0]["id"] - #users/res[0]["id"] - is_bot = False - return is_bot diff --git a/iga/metadata.py b/iga/metadata.py index bacd353..8afd29a 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -78,6 +78,31 @@ def __bool__(self): GITLAB = LazyEnvBool('GITLAB') +# It's useful to understand the context of what's going on. The record stored +# in InvenioRDM may have these top-level fields (but might not contain all): +# +# { +# "$schema": "local://records/record-vX.Y.Z.json", +# "id": "q5jr8-hny72", +# "pid": { ... }, +# "pids" : { ... }, +# "parent": { ... }, +# "access" : { ... }, +# "metadata" : { ... }, +# "files" : { ... }, +# "tombstone" : { ... }, +# "created": "...", +# "updated": "...", +# } +# +# However, what is uploaded to an InvenioRDM server should only contain the +# 'metadata' field, because of the other fields above are added by the system. +# Consequently, IGA only needs to construct the 'metadata' field value. I.e., +# referring to https://inveniordm.docs.cern.ch/reference/metadata, we are only +# concerned with https://inveniordm.docs.cern.ch/reference/metadata/#metadata +# +# The following is the full set of possible subfields in "metadata". + FIELDS = [ "additional_descriptions", "additional_titles", @@ -102,6 +127,26 @@ def __bool__(self): "version", ] +# Not all of these need to be provided. Based on the test cases in +# https://github.com/inveniosoftware/invenio-rdm-records, the minimum set of +# fields that needs to be provided seems to be this: +# +# { +# "metadata": { +# "resource_type": { "id": "XYZ", ... }, # note below +# "title": "ABC", +# "creators": [ +# { +# "person_or_org": { +# "family_name": "A", +# "given_name": "B", +# "type": "C", +# } +# }, +# ], +# "publication_date": "...date...", +# } + REQUIRED_FIELDS = [ "creators", "publication_date", From 24a0c1194e88397c85ec4e788c97e625724aa5a7 Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 16:30:55 -0400 Subject: [PATCH 08/17] finalize --- iga/githublab.py | 17 ++++++++++++----- iga/gitlab.py | 8 +++++++- iga/metadata.py | 9 ++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/iga/githublab.py b/iga/githublab.py index 8452825..64f6087 100644 --- a/iga/githublab.py +++ b/iga/githublab.py @@ -14,6 +14,7 @@ github_repo_contributors, identity_from_github, github_probable_bot, + github_file_url, ) from iga.gitlab import ( valid_gitlab_release_url, @@ -28,6 +29,7 @@ gitlab_account, gitlab_repo_contributors, identity_from_gitlab, + gitlab_file_url, ) @@ -36,17 +38,16 @@ def __init__(self, var_name): self.var_name = var_name def __bool__(self): - return os.getenv(self.var_name, "").lower() == "true" + return os.getenv(self.var_name, '').lower() == 'true' __nonzero__ = __bool__ # For Python 2 compatibility -GITLAB = LazyEnvBool("GITLAB") +GITLAB = LazyEnvBool('GITLAB') def valid_release_url(release_url): - if not os.getenv("GITLAB"): - print("I am here") + if not GITLAB: return valid_github_release_url(release_url) else: return valid_gitlab_release_url(release_url) @@ -87,7 +88,6 @@ def git_repo_file(repo, tag, filename): if not GITLAB: return github_repo_file(repo, tag, filename) else: - print("here") return gitlab_repo_file(repo, tag, filename) @@ -135,3 +135,10 @@ def identity_from_git(account, role=None): def git_probable_bot(account): return github_probable_bot(account) + + +def git_file_url(repo, filename, tag): + if not GITLAB: + return github_file_url(repo, filename) + else: + return gitlab_file_url(repo, filename, tag) diff --git a/iga/gitlab.py b/iga/gitlab.py index 42d0e32..31aba21 100644 --- a/iga/gitlab.py +++ b/iga/gitlab.py @@ -36,7 +36,7 @@ class GitLabAPIError(Exception): def _gitlab_get(endpoint, test_only=False): headers = {'Accept': 'application/json'} - using_token = 'glpat-3z9T1F3zNa7WNAaireqi' #'GITLAB_TOKEN' in os.environ + using_token = 'GITLAB_TOKEN' in os.environ if using_token: headers['Authorization'] = f'Bearer {using_token}' method = 'head' if test_only else 'get' @@ -244,6 +244,12 @@ def gitlab_repo_file(repo, tag_name, filename): return contents +def gitlab_file_url(repo, filename, tag): + """Return the URL of the named file in the repo.""" + endpoint = f'{API_URL}/projects/{repo.id}/repository/blobs/{filename}?ref={tag}' + return endpoint + + def gitlab_repo_languages(repo): log(f'asking GitHub for list of languages for repo {repo.name}') repolink = repo._links['self'] diff --git a/iga/metadata.py b/iga/metadata.py index 8afd29a..7b58672 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -57,11 +57,10 @@ git_account, git_repo_contributors, identity_from_git, - git_probable_bot -) -from iga.github import ( - github_file_url, + git_probable_bot, + git_file_url ) + from iga.id_utils import detected_id, recognized_scheme from iga.name_utils import split_name, flattened_name from iga.reference import reference, RECOGNIZED_REFERENCE_SCHEMES @@ -1116,7 +1115,7 @@ def rights(repo, release, include_all): # There's no safe way to summarize arbitrary license text, # so we can't provide a 'description' field value. rights = [{'title': {'en': 'License'}, - 'link': github_file_url(repo, basename + ext)}] + 'link': git_file_url(repo, basename + ext, release.tag_name)}] break else: continue From 41cbafda0381fc0bcdc454b9819e4515fde54dbb Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 16:44:07 -0400 Subject: [PATCH 09/17] finalize --- iga/invenio.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/iga/invenio.py b/iga/invenio.py index 14a6106..841f758 100644 --- a/iga/invenio.py +++ b/iga/invenio.py @@ -17,18 +17,18 @@ from sidetrack import log import socket import os -from os import path +from os import path import humanize - import iga -from iga.exceptions import ( +from iga.exceptions import ( InternalError, InvenioRDMError, RecordNotFound, ) -from iga.githublab import git_asset_contents -from iga.id_utils import normalize_invenio_rdm +from iga.githublab import git_asset_contents +from iga.id_utils import normalize_invenio_rdm + # Exported data structures. @@ -88,6 +88,7 @@ def __gt__(self, other): def __ge__(self, other): return not self.__lt__(self, other) + # Exported module functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -363,6 +364,7 @@ def invenio_communities(): log(f'we got {pluralized("community", communities, True)}') return communities + # Miscellaneous helper functions. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0b7846f09e2cc5d246a3529e48cb3d40579420f6 Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Tue, 16 Jul 2024 16:45:50 -0400 Subject: [PATCH 10/17] finalize --- iga/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iga/metadata.py b/iga/metadata.py index 7b58672..d2d9b50 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -1454,7 +1454,7 @@ def _release_author(release): # about a user (e.g.,, it gives a name but that name is not broken out # into family & given name), plus sometimes fields are empty. account_name = release.author["username"] if GITLAB else release.author.login - account = git_account(account_name) #AP: release.author.username + account = git_account(account_name) return identity_from_git(account) if account.name else None def _repo_owner(repo): @@ -1462,7 +1462,7 @@ def _repo_owner(repo): account_name = repo.owner["username"] else: account_name = repo.owner.login - account = git_account(account_name) #AP: repo.owner.username or maybe deal with namespace.kind.group? + account = git_account(account_name) return identity_from_git(account) def _parsed_github_account(data): From 2620167bcb27e4fa54c69ad9374478a7ea36c934 Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Wed, 17 Jul 2024 10:49:55 -0400 Subject: [PATCH 11/17] finalize --- iga/metadata.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/iga/metadata.py b/iga/metadata.py index d2d9b50..4f7d9e7 100644 --- a/iga/metadata.py +++ b/iga/metadata.py @@ -434,7 +434,6 @@ def contributors(repo, release, include_all): contributors.append(entity) else: log(f'skipping CodeMeta "contributor" {entity} who is in "authors"') - elif include_all: if not GITLAB: if (repo_contributors := git_repo_contributors(repo)): @@ -532,7 +531,7 @@ def dates(repo, release, include_all): # to the GitHub repo "updated_at" date. if mod_date := repo.codemeta.get('dateModified', ''): log('adding the CodeMeta "dateModified" as the "updated" date') - elif include_all and (mod_date := repo.updated_at): #AP: gitlab what + elif include_all and (mod_date := repo.updated_at): log('adding the GitHub repo "updated_at" date as the "updated" date') if mod_date: dates.append({'date': arrow.get(mod_date).format('YYYY-MM-DD'), @@ -862,7 +861,7 @@ def id_dict(url, rel_type, res_type): 'resource_type': {'id': res_type}, 'scheme': 'url'} - log('adding GitHub release "html_url" to "related_identifiers"') + log('adding GitHub/GitLab release "html_url" to "related_identifiers"') identifiers = [id_dict(release._links["self"] if GITLAB else release.html_url, 'isidenticalto', 'software')] # The GitHub repo is what this release is derived from. Note: you would @@ -875,7 +874,7 @@ def id_dict(url, rel_type, res_type): elif repo_url := repo.cff.get('repository-code', ''): log('adding CFF "repository-code" to "related_identifiers"') elif include_all and (repo_url := repo_html): - log('adding GitHub repo "html_url" to "related_identifiers"') + log('adding GitHub/GitLab repo "html_url" to "related_identifiers"') if repo_url: identifiers.append(id_dict(repo_url, 'isderivedfrom', 'software')) @@ -896,7 +895,7 @@ def id_dict(url, rel_type, res_type): elif homepage_url := repo.cff.get('url', ''): log('adding CFF "url" to "related_identifiers"') elif include_all and (homepage_url := repo.web_url if GITLAB else repo.homepage): - log('adding GitHub repo "homepage" to "related_identifiers"') + log('adding GitHub/GitLab repo "homepage" to "related_identifiers"') if homepage_url: identifiers.append(id_dict(homepage_url, 'isdescribedby', 'other')) @@ -939,22 +938,25 @@ def id_dict(url, rel_type, res_type): identifiers.append(id_dict(url, 'isdocumentedby', 'publication-softwaredocumentation')) - # The GitHub Pages URL for a repo usually points to documentation or info - # about the softare, though we can't tell if it's for THIS release. - #if include_all and repo.has_pages: - # url = f'https://{repo.owner.login}.github.io/{repo.name}' # AP: ? - # if not any(url == item['identifier'] for item in identifiers): - # log('adding the repo\'s GitHub Pages URL to "related_identifiers"') - # identifiers.append(id_dict(url, 'isdocumentedby', - # 'publication-softwaredocumentation')) + if not GITLAB: + # The GitHub Pages URL for a repo usually points to documentation or info + # about the softare, though we can't tell if it's for THIS release. + if include_all and repo.has_pages: + url = f'https://{repo.owner.login}.github.io/{repo.name}' + if not any(url == item['identifier'] for item in identifiers): + log('adding the repo\'s GitHub Pages URL to "related_identifiers"') + identifiers.append(id_dict(url, 'isdocumentedby', + 'publication-softwaredocumentation')) + + # todo: check gitlab pages # The issues URL is kind of a supplemental resource. repo_issues = repo._links["issues"] if GITLAB else repo.issues_url if issues_url := repo.codemeta.get('issueTracker', ''): log('adding CodeMeta "issueTracker" to "related_identifiers"') elif include_all and (repo_issues): - log('adding GitHub repo "issues_url" to "related_identifiers"') - issues_url = f'https://github.com/{repo.name if GITLAB else repo.full_name}/issues' + log('adding GitHub/GitLab repo "issues_url" to "related_identifiers"') + issues_url = repo_issues if issues_url: identifiers.append(id_dict(issues_url, 'issupplementedby', 'other')) @@ -1459,10 +1461,10 @@ def _release_author(release): def _repo_owner(repo): if GITLAB: - account_name = repo.owner["username"] + account_name = repo.owner["username"] else: account_name = repo.owner.login - account = git_account(account_name) + account = git_account(account_name) return identity_from_git(account) def _parsed_github_account(data): From ada9e484f7a5a7be8d6c773af70d1c07e422b60a Mon Sep 17 00:00:00 2001 From: Anil Panta Date: Wed, 17 Jul 2024 10:55:30 -0400 Subject: [PATCH 12/17] add optionhighlighter --- dev/scripts/filter-github-orgs | 10 +++++++++- dev/scripts/list-github-orgs | 10 +++++++++- iga/cli.py | 26 +++++++++++++++++--------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/dev/scripts/filter-github-orgs b/dev/scripts/filter-github-orgs index 456cdf0..65105a4 100755 --- a/dev/scripts/filter-github-orgs +++ b/dev/scripts/filter-github-orgs @@ -451,8 +451,16 @@ def _alert(ctx, msg, print_usage=True): STYLE_OPTION, STYLE_ARGUMENT, STYLE_SWITCH, - OptionHighlighter, ) + from rich.highlighter import RegexHighlighter + class OptionHighlighter(RegexHighlighter): + """Highlights our special options.""" + + highlights = [ + r"(^|[^\w\-])(?P-([^\W0-9][\w\-]*\w|[^\W0-9]))", + r"(^|[^\w\-])(?P