Skip to content

Commit

Permalink
Changed Monarch KG ingest to automatically pull from the latest version.
Browse files Browse the repository at this point in the history
Also added a process that checks the metadata yaml file for the publishing date of the latest version.
  • Loading branch information
Daniel Korn committed Sep 19, 2024
1 parent bc28a73 commit 5e9151b
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions parsers/monarchkg/src/loadMonarchKG.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
import os
import tarfile
import orjson
import requests

from Common.loader_interface import SourceDataLoader
from Common.kgxmodel import kgxedge
from Common.biolink_constants import *
from Common.utils import GetData
from Common.utils import GetData, GetDataPullError


##############
Expand All @@ -29,7 +30,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):

# pulls from the /latest/ url; get_latest_source_version reads the matching version from /latest/metadata.yaml
# NOTE(review): a mismatch may still be possible if a new release is published between the version check and the download
self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/2024-03-18/'
self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/'
self.monarch_graph_archive = 'monarch-kg.jsonl.tar.gz'
self.monarch_edge_file_archive_path = 'monarch-kg_edges.jsonl'
self.data_files = [self.monarch_graph_archive]
Expand Down Expand Up @@ -63,9 +64,17 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
}

def get_latest_source_version(self) -> str:
    """
    Determine the latest Monarch KG version from the published metadata.

    Downloads metadata.yaml from the /latest/ release directory and returns the value of
    its 'kg-version' entry.

    :return: the version string found under 'kg-version'
    :raises GetDataPullError: if the metadata cannot be retrieved or has no usable 'kg-version' entry
    """
    metadata_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/metadata.yaml'
    version_key = 'kg-version:'
    try:
        # without a timeout a stalled connection would block the ingest indefinitely
        response: requests.Response = requests.get(metadata_url, timeout=60)
        # fail on HTTP errors instead of trying to parse an error page as yaml
        response.raise_for_status()
        latest_version = None
        for line in response.text.splitlines():
            stripped_line = line.strip()
            # match the key at the start of the line so unrelated lines that merely
            # mention 'kg-version:' (e.g. in a comment or another value) are ignored
            if stripped_line.startswith(version_key):
                # drop optional yaml quoting around the scalar value
                latest_version = stripped_line[len(version_key):].strip().strip('\'"')
                break
        # a missing key and an empty value are both unusable as a version
        if not latest_version:
            raise ValueError("Cannot find 'kg-version' in Monarch KG metadata yaml.")
    except Exception as e:
        # every failure is reported as GetDataPullError, as before; the cause is chained for debugging
        raise GetDataPullError(error_message=f'Unable to determine latest version for Monarch KG: {e}') from e
    return latest_version

def get_data(self) -> bool:
Expand Down

0 comments on commit 5e9151b

Please sign in to comment.