diff --git a/README.md b/README.md index df8e5fe..cd1f58f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # ror-etl -Automates ETL of [ROR](https://ror.org) via `ror_dag.py`. +Automates ETL of [ROR](https://ror.org) via `ror_dag.py`. (CSET users) To update Airflow artifacts, run `bash push_to_airflow.sh`. diff --git a/ror_scripts/fetch.py b/ror_scripts/fetch.py index d7f4638..113d333 100644 --- a/ror_scripts/fetch.py +++ b/ror_scripts/fetch.py @@ -29,7 +29,9 @@ def fetch(output_bucket: str, output_loc: str) -> None: f.write(zip_resp.content) ZipFile(zip_f).extractall(td) print(f"Downloaded content: {os.listdir(td)}") - json_files = [js for js in os.listdir(td) if js.endswith(".json")] + json_files = [ + js for js in os.listdir(td) if js.endswith(".json") and ("schema" not in js) + ] assert len(json_files) == 1 output_file = os.path.join(td, output_loc.split("/")[-1]) with open(os.path.join(td, json_files[0])) as f: