diff --git a/.gitignore b/.gitignore index 37984fc9..73929920 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,5 @@ docker/ingest-api/nginx/conf.d venv-hm-ingest-api/* .vscode/bookmarks.json .vscode/settings.json +src/venv-hm-ingest-api +.vscode/launch.json \ No newline at end of file diff --git a/src/TSV_helper.py b/src/TSV_helper.py new file mode 100644 index 00000000..fcc7ecee --- /dev/null +++ b/src/TSV_helper.py @@ -0,0 +1,25 @@ +import logging +from pathlib import Path +import csv +# @MAX Is this the right way to get this in here? with a _helper? OR should this go in Utils? +# OR an upcoming Contributors Helper? +from hubmap_commons.hubmap_const import HubmapConst + +# Set logging format and level (default is warning) +# All the API logging is forwarded to the uWSGI server and gets written into the log file `uwsgi-ingest-api.log` +# Log rotation is handled via logrotate on the host system with a configuration file +# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes +logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG, + datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + +class TSVError(Exception): + def __init__(self, error): + self.errors = f"{list(error.keys())[0]}: {list(error.values())[0]}" + +def tsv_reader_wrapper(path, encoding: str) -> list: + with open(path) as f: + rows = list(csv.DictReader(f, dialect="excel-tab")) + # row = list(csv.DictReader(f, dialect="excel-tab")) + f.close() + return rows diff --git a/src/api/cedar_api.py b/src/api/cedar_api.py new file mode 100644 index 00000000..ec50a935 --- /dev/null +++ b/src/api/cedar_api.py @@ -0,0 +1,57 @@ +import requests +from requests.auth import HTTPBasicAuth +from flask import Flask, current_app, request, json, Response +import logging + +logger = logging.getLogger(__name__) + + +# CEDAR Template REST API (resource server): +# https://resource.metadatacenter.org/api/ +class 
CEDARApi: # @MAX Best Practice when the Noun of the API (or other caps acro) is also caps? + + + def __init__(self): + # SWITCH TO ENV VARS + # hubmap_APIkey = "a92e621d1ced5925cbaa0c823a85f13492f86f57e78523035b51f3205eada386" + hubmap_APIkey = current_app.config['CEDAR_API_KEY'] + self.auth = HTTPBasicAuth('apiKey', hubmap_APIkey) + self.ssl_verification_enabed = False # @MAX Needed + + + # Schema Versions Retrieval + # curl -X GET --header 'Accept: application/json' --header 'Authorization: apiKey a92e621d1ced5925cbaa0c823a85f13492f86f57e78523035b51f3205eada386' 'https://resource.metadatacenter.org/templates/https%3A%2F%2Frepo.metadatacenter.org%2Ftemplates%2F [94dae6f8-0756-4ab0-a47b-138e446a9501 ]/versions' + + def get_schema_details(self, schema_id: str) -> object: + logger.debug(f"======get_schema_details: {schema_id}======") + cedar_api_url = "https://resource.metadatacenter.org/templates/" + cedar_repo_url = "https%3A%2F%2Frepo.metadatacenter.org%2Ftemplates%2F"+schema_id + cedar_versions_url = cedar_api_url+cedar_repo_url+"/versions" + response = requests.get( + url=f"{cedar_versions_url}", + headers={ + 'Accept': 'application/json', + 'Authorization': 'apiKey '+current_app.config['CEDAR_API_KEY'], + }, + verify=self.ssl_verification_enabed + ) + response_JSON = response.json() + return response.json() + + # return Response(response.text, response.status_code) + # return response + + # # @MAX: Best practice: Should I limit this to ONLY the Calls going in and out, and meddle with + # # the data in the main code? Or is it ok to have some pre-processing here? 
+ # def get_schema_versions(self, schema_id: str) -> object: + # logger.debug(f"======get_schema_versions: {schema_id}======") + + # schema_details = get_schema_details(schema_id) + + # return response + +class CEDARApiException(Exception): + + def __init__(self, message, error_code=None): + super().__init__(message) + self.error_code = error_code \ No newline at end of file diff --git a/src/app.py b/src/app.py index 02267caf..6a3caa0d 100644 --- a/src/app.py +++ b/src/app.py @@ -2945,7 +2945,8 @@ def files_exist(uuid, data_access_level, group_name, metadata=False): parser = argparse.ArgumentParser() parser.add_argument("-p", "--port") args = parser.parse_args() - port = 8484 + # port = 8484 + port = 5000 if args.port: port = int(args.port) app.run(port=port, host='0.0.0.0') diff --git a/src/routes/validation/__init__.py b/src/routes/validation/__init__.py index c8d53d2c..953d08f9 100644 --- a/src/routes/validation/__init__.py +++ b/src/routes/validation/__init__.py @@ -6,6 +6,7 @@ import logging from typing import Union, Optional from flask import Blueprint, current_app, Response, request +from pathlib import Path import requests from importlib import import_module @@ -15,6 +16,8 @@ from hubmap_commons import file_helper as commons_file_helper from hubmap_commons.hm_auth import AuthHelper +from version_helper import VersionHelper + from utils.string import equals, to_title_case from utils.rest import ( is_json_request, rest_server_err, rest_bad_req, rest_ok, rest_response, full_response, @@ -339,9 +342,12 @@ def validate_records_uuids(records: list, entity_type: str, sub_type, pathname: 'There are invalid `uuids` and/or unmatched entity sub types', errors, dict_only=True) +# @REVIEW? 
@MAX + @validation_blueprint.route('/metadata/validate', methods=['POST']) def validate_metadata_upload(): + print("validate_metadata_upload") try: if is_json_request(): data = request.json @@ -353,6 +359,7 @@ def validate_metadata_upload(): sub_type = data.get('sub_type') validate_uuids = data.get('validate_uuids') tsv_row = data.get('tsv_row') + ensure_latest_cedar_version = data.get('ensure-latest-cedar-version') if pathname is None: upload = check_metadata_upload() @@ -366,6 +373,31 @@ def validate_metadata_upload(): response = error if error is None: + + # @MAX should this be here or in app.py? + if ensure_latest_cedar_version is not None: + # if ensure_latest_cedar_version is == None: #maybe check for true specifically? + path: str = upload.get('fullpath') + latestVersion = False + try: + schema_id = VersionHelper.get_schema_id(path, str) + latestVersion = VersionHelper.get_latest_published_schema( + schema_id) + isLatest = (schema_id == latestVersion) + print(isLatest) + if isLatest == True: + print("Schema ID Matches the latest CEDAR version.") + # return True + response = rest_response(StatusCodes.OK, "Is Latest",{"IsLatest":True}) + else: + print("Schema ID Does Not Match the latest CEDAR version.") + # return False + response = rest_response(StatusCodes.OK, "Is Not Latest",{"IsLatest":False}) + return response + except Exception as e: + return rest_server_err(e, True) + # END VERSION CHECK + if check_cedar(entity_type, sub_type, upload) is False: id_sub_type = get_cedar_schema_ids().get(sub_type) return rest_response(StatusCodes.UNACCEPTABLE, diff --git a/src/version_helper.py b/src/version_helper.py new file mode 100644 index 00000000..cce69fa3 --- /dev/null +++ b/src/version_helper.py @@ -0,0 +1,68 @@ +import logging +from pathlib import Path +from flask import Flask, request,jsonify, json, Response +import csv +from api.cedar_api import CEDARApi, CEDARApiException + + +# @MAX Is this the right way to get this in here? with a _helper? 
OR should this go in Utils? +# OR an upcoming Contributors Helper? +from hubmap_commons.hubmap_const import HubmapConst + +from TSV_helper import TSVError, tsv_reader_wrapper + +# Set logging format and level (default is warning) +# All the API logging is forwarded to the uWSGI server and gets written into the log file `uwsgi-ingest-api.log` +# Log rotation is handled via logrotate on the host system with a configuration file +# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes +logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG, + datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +class VersionHelper: + @staticmethod + def get_schema_id(path: Path, encoding: str) -> object: + message = None + if not Path(path).exists(): + message = {"File does not exist": f"{path}"} + raise TSVError(message) + try: + rows = tsv_reader_wrapper(path, str) + if not rows: + message = {"File has no data rows": f"{path}"} + else: + first_row = rows[0] + if "metadata_schema_id" not in first_row: + message = {"metadata_schema_id not found in header": f"{path}"} + raise TSVError(message) + schema_id = first_row['metadata_schema_id'] + return schema_id + + except IsADirectoryError: + message = {"Expected a TSV, but found a directory": f"{path}"} + # raise TSVError(message) + except TSVError as e: + raise TSVError(e) + + + def get_latest_published_schema(schema_id: str, ) -> object: + latest_published_schema = "" + # API Time + CEDAR_API = CEDARApi() + try: + schema_details = CEDAR_API.get_schema_details(schema_id) + if schema_details["statusCode"] != 200: + return jsonify({"error": f"Error occurred while gathering schemas for schema id {schema_id}. 
{schema_details['errorMessage']}"}), 500 + for schema in schema_details['resources']: + if schema["isLatestVersion"]: + latest_published_schema = schema["@id"].strip("https://repo.metadatacenter.org/templates/") + break + return latest_published_schema + + except CEDARApiException as e: + logger.exception(f"Exception while gathering schemas for schema id {schema_id}. {e}") + return jsonify({"error": f"Error occurred while gathering schemas for schema id {schema_id}. {e}"}), 500 + + +