From ad505cf832d913ad20712c571db9543c16248233 Mon Sep 17 00:00:00 2001 From: Keegan Smith Date: Mon, 18 Mar 2024 06:15:46 +0800 Subject: [PATCH] Astro Migration (#212) --- .astro/config.yaml | 2 + .github/workflows/deploy.yml | 19 + .github/workflows/unit-tests.yml | 17 +- .gitignore | 14 +- Dockerfile | 13 + .../__init__.py => dags/.airflowignore | 0 {oaebu_workflows/dags => dags}/__init__.py | 0 dags/load_dags.py | 3 + .../oaebu_workflows}/__init__.py | 0 .../oaebu_workflows}/airflow_pools.py | 0 .../oaebu_workflows}/config.py | 22 +- .../google_analytics3_telescope}/__init__.py | 0 .../schema}/__init__.py | 0 ...book_metrics_author_google_analytics3.json | 0 ...ook_metrics_country_google_analytics3.json | 0 .../book_metrics_google_analytics3.json | 0 ...ook_metrics_subject_google_analytics3.json | 0 ...ook_product_metrics_google_analytics3.json | 0 .../schema/google_analytics.json | 0 .../sql}/__init__.py | 0 ..._country_body_google_analytics3.sql.jinja2 | 0 ...metrics_country_join_google_analytics3.sql | 0 ...metrics_country_null_google_analytics3.sql | 0 ...trics_country_struct_google_analytics3.sql | 0 .../sql/book_metrics_google_analytics3.sql | 0 ..._product_body_google_analytics3.sql.jinja2 | 0 ...ok_product_functions_google_analytics3.sql | 0 .../month_metrics_sum_google_analytics3.sql | 0 .../sql/month_null_google_analytics3.sql | 0 .../google_books_telescope}/__init__.py | 0 .../google_books_telescope.py | 437 +++++ .../schema}/__init__.py | 0 ...ook_metrics_author_google_books_sales.json | 0 ...k_metrics_author_google_books_traffic.json | 0 ...ok_metrics_country_google_books_sales.json | 0 .../book_metrics_google_books_sales.json | 0 .../book_metrics_google_books_traffic.json | 0 ...ok_metrics_subject_google_books_sales.json | 0 ..._metrics_subject_google_books_traffic.json | 0 ...k_product_metadata_google_books_sales.json | 0 ...product_metadata_google_books_traffic.json | 0 ...ok_product_metrics_google_books_sales.json | 0 ..._product_metrics_google_books_traffic.json | 0 .../schema/google_books_sales.json | 0 .../schema/google_books_traffic.json | 0 .../google_books_telescope/sql}/__init__.py | 0 ...country_body_google_books_sales.sql.jinja2 | 0 ...etrics_country_join_google_books_sales.sql | 0 ...etrics_country_null_google_books_sales.sql | 0 ...rics_country_struct_google_books_sales.sql | 0 .../sql/book_metrics_google_books_sales.sql | 0 .../sql/book_metrics_google_books_traffic.sql | 0 ...product_body_google_books_sales.sql.jinja2 | 0 ...oduct_body_google_books_traffic.sql.jinja2 | 0 ...k_product_functions_google_books_sales.sql | 0 .../month_metrics_sum_google_books_sales.sql | 0 ...month_metrics_sum_google_books_traffic.sql | 0 .../sql/month_null_google_books_sales.sql | 0 .../sql/month_null_google_books_traffic.sql | 0 .../irus_fulcrum_telescope}/__init__.py | 0 .../irus_fulcrum_telescope.py | 419 +++++ .../schema}/__init__.py | 0 .../book_metrics_author_irus_fulcrum.json | 0 .../book_metrics_country_irus_fulcrum.json | 0 .../schema/book_metrics_irus_fulcrum.json | 0 .../book_metrics_subject_irus_fulcrum.json | 0 .../book_product_metadata_irus_fulcrum.json | 0 .../book_product_metrics_irus_fulcrum.json | 0 .../schema/irus_fulcrum.json | 0 .../irus_fulcrum_telescope/sql}/__init__.py | 0 ...trics_country_body_irus_fulcrum.sql.jinja2 | 0 ...book_metrics_country_join_irus_fulcrum.sql | 0 ...book_metrics_country_null_irus_fulcrum.sql | 0 ...ok_metrics_country_struct_irus_fulcrum.sql | 0 .../sql/book_metrics_irus_fulcrum.sql | 0 .../book_product_body_irus_fulcrum.sql.jinja2 
| 0 .../book_product_functions_irus_fulcrum.sql | 0 .../sql/month_metrics_sum_irus_fulcrum.sql | 0 .../sql/month_null_irus_fulcrum.sql | 0 .../irus_oapen_telescope}/__init__.py | 0 .../irus_oapen_telescope.py | 626 ++++++++ .../irus_oapen_telescope}/schema/__init__.py | 0 .../book_metrics_author_irus_oapen.json | 0 .../book_metrics_country_irus_oapen.json | 0 .../schema/book_metrics_irus_oapen.json | 0 .../book_metrics_subject_irus_oapen.json | 0 .../book_product_metadata_irus_oapen.json | 0 .../book_product_metrics_irus_oapen.json | 0 .../export_metrics_city_irus_oapen.json | 0 .../schema/irus_oapen.json | 0 .../irus_oapen_telescope}/sql/__init__.py | 0 ...metrics_country_body_irus_oapen.sql.jinja2 | 0 .../book_metrics_country_join_irus_oapen.sql | 0 .../book_metrics_country_null_irus_oapen.sql | 0 ...book_metrics_country_struct_irus_oapen.sql | 0 .../sql/book_metrics_irus_oapen.sql | 0 .../book_product_body_irus_oapen.sql.jinja2 | 0 .../sql/book_product_functions_irus_oapen.sql | 0 .../sql/month_metrics_sum_irus_oapen.sql | 0 .../sql/month_null_irus_oapen.sql | 0 .../jstor_telescope}/__init__.py | 0 .../jstor_telescope/jstor_telescope.py | 645 ++++---- .../jstor_telescope/schema}/__init__.py | 0 .../book_metrics_author_jstor_country.json | 0 .../book_metrics_country_jstor_country.json | 0 .../schema/book_metrics_jstor_country.json | 0 .../book_metrics_subject_jstor_country.json | 0 .../book_product_metadata_jstor_country.json | 0 ...ok_product_metadata_jstor_institution.json | 0 .../book_product_metrics_jstor_country.json | 0 ...ook_product_metrics_jstor_institution.json | 0 .../jstor_telescope/schema/jstor_country.json | 0 .../schema/jstor_country_collection.json | 0 .../schema/jstor_institution.json | 0 .../schema/jstor_institution_collection.json | 0 .../jstor_telescope/sql}/__init__.py | 0 .../sql/book_metrics_author_jstor_country.sql | 0 ...rics_country_body_jstor_country.sql.jinja2 | 0 ...ook_metrics_country_join_jstor_country.sql | 0 ...ook_metrics_country_null_jstor_country.sql | 0 ...k_metrics_country_struct_jstor_country.sql | 0 .../sql/book_metrics_jstor_country.sql | 0 ...book_product_body_jstor_country.sql.jinja2 | 0 ..._product_body_jstor_institution.sql.jinja2 | 0 .../book_product_functions_jstor_country.sql | 0 ...ok_product_functions_jstor_institution.sql | 0 .../sql/month_metrics_sum_jstor_country.sql | 0 .../sql/month_null_jstor_country.sql | 0 .../oaebu_workflows}/oaebu_partners.py | 2 +- .../oapen_metadata_telescope}/__init__.py | 0 .../oapen_metadata_telescope.py | 339 ++++ .../schema}/__init__.py | 0 .../schema/oapen_metadata_filter.json | 0 .../oapen_metadata_telescope/sql}/__init__.py | 0 .../onix_telescope}/__init__.py | 0 .../onix_telescope/onix_telescope.py | 323 ++++ .../onix_telescope/schema}/__init__.py | 0 .../onix_telescope/schema/onix.json | 0 .../onix_telescope/sql}/__init__.py | 0 .../oaebu_workflows}/onix_utils.py | 8 +- .../onix_workflow}/__init__.py | 0 .../onix_workflow/onix_work_aggregation.py | 0 .../onix_workflow/onix_workflow.py | 1399 +++++++++++++++++ .../onix_workflow/schema}/__init__.py | 0 .../onix_workflow/schema/book.json | 0 .../schema/book_institution_list.json | 0 .../onix_workflow/schema/book_list.json | 0 .../onix_workflow/schema/book_metrics.json | 0 .../schema/book_metrics_author.json | 0 .../schema/book_metrics_city.json | 0 .../schema/book_metrics_country.json | 0 .../schema/book_metrics_events.json | 0 .../schema/book_metrics_institution.json | 0 .../schema/book_metrics_subject_bic.json | 0 
.../schema/book_metrics_subject_bisac.json | 0 .../schema/book_metrics_subject_thema.json | 0 .../onix_workflow/schema/book_product.json | 0 .../onix_workflow/schema/crossref_events.json | 0 .../schema/crossref_metadata.json | 0 .../schema/onix_aggregate_metrics.json | 0 .../schema/onix_invalid_isbn.json | 0 .../schema/onix_workfamilyid_isbn.json | 0 .../schema/onix_workid_isbn.json | 0 .../schema/onix_workid_isbn_errors.json | 0 .../schema/platform_invalid_isbn.json | 0 .../schema/platform_unmatched_isbn.json | 0 .../onix_workflow/sql}/__init__.py | 0 .../sql/assign_workid_workfamilyid.sql.jinja2 | 0 .../onix_workflow/sql/book.sql.jinja2 | 0 .../sql/book_institution_list.sql.jinja2 | 0 .../onix_workflow/sql/book_list.sql.jinja2 | 0 .../onix_workflow/sql/book_metrics.sql.jinja2 | 0 .../sql/book_metrics_author.sql.jinja2 | 0 .../sql/book_metrics_city.sql.jinja2 | 0 .../sql/book_metrics_country.sql.jinja2 | 0 .../sql/book_metrics_events.sql.jinja2 | 0 .../sql/book_metrics_institution.sql.jinja2 | 0 .../sql/book_metrics_subject_bic.sql.jinja2 | 0 .../sql/book_metrics_subject_bisac.sql.jinja2 | 0 .../sql/book_metrics_subject_thema.sql.jinja2 | 0 .../onix_workflow/sql/book_product.sql.jinja2 | 0 .../sql/crossref_events_filter_doi.sql.jinja2 | 0 .../crossref_metadata_filter_isbn.sql.jinja2 | 0 .../onix_workflow/sql/isbn_utils.sql | 0 .../sql/oaebu_intermediate_metrics.sql.jinja2 | 0 .../sql/onix_aggregate_metrics.sql.jinja2 | 0 .../sql/validate_isbn.sql.jinja2 | 0 .../oaebu_workflows/schema}/__init__.py | 0 ...book_metrics_country_internet_archive.json | 0 .../book_metrics_internet_archive.json | 0 ...ook_product_metadata_internet_archive.json | 0 ...book_product_metrics_internet_archive.json | 0 .../internet_archive/internet_archive.json | 0 .../book_metrics_country_worldreader.json | 0 .../worldreader/book_metrics_worldreader.json | 0 .../book_product_metadata_worldreader.json | 0 .../book_product_metrics_worldreader.json | 0 .../schema/worldreader/worldreader.json | 0 .../oaebu_workflows/sql}/__init__.py | 0 .../sql/internet_archive}/__init__.py | 0 .../book_metrics_internet_archive.sql | 0 ...k_product_body_internet_archive.sql.jinja2 | 0 .../month_null_internet_archive.sql | 0 ...etrics_country_body_worldreader.sql.jinja2 | 0 .../book_metrics_country_join_worldreader.sql | 0 .../book_metrics_country_null_worldreader.sql | 0 ...ook_metrics_country_struct_worldreader.sql | 0 .../worldreader/book_metrics_worldreader.sql | 0 .../book_product_body_worldreader.sql.jinja2 | 0 .../book_product_functions_worldreader.sql | 0 .../worldreader/month_null_worldreader.sql | 0 .../thoth_telescope}/__init__.py | 0 .../thoth_telescope/schema}/__init__.py | 0 .../thoth_telescope/sql}/__init__.py | 0 .../thoth_telescope/thoth_telescope.py | 303 ++++ .../ucl_discovery_telescope}/__init__.py | 0 .../schema}/__init__.py | 0 .../book_metrics_country_ucl_discovery.json | 0 .../schema/book_metrics_ucl_discovery.json | 0 .../book_product_metadata_ucl_discovery.json | 0 .../book_product_metrics_ucl_discovery.json | 0 .../schema/ucl_discovery.json | 0 .../ucl_discovery_telescope/sql}/__init__.py | 0 ...rics_country_body_ucl_discovery.sql.jinja2 | 0 ...ook_metrics_country_join_ucl_discovery.sql | 0 ...ook_metrics_country_null_ucl_discovery.sql | 0 ...k_metrics_country_struct_ucl_discovery.sql | 0 .../sql/book_metrics_ucl_discovery.sql | 0 ...book_product_body_ucl_discovery.sql.jinja2 | 0 .../sql/month_null_ucl_discovery.sql | 0 .../ucl_discovery_telescope.py | 465 ++++++ .../google_analytics3_telescope.py | 583 ------- 
.../tests/fixtures/test_table.json | 3 - .../tests/fixtures/test_table_anu.json | 3 - .../tests/test_google_analytics3_telescope.py | 435 ----- .../google_books_telescope.py | 377 ----- .../tests/test_google_books_telescope.py | 360 ----- .../irus_fulcrum_telescope.py | 342 ---- .../irus_oapen_telescope.py | 583 ------- .../oapen_metadata_telescope.py | 294 ---- .../onix_telescope/onix_telescope.py | 280 ---- .../tests/test_onix_telescope.py | 238 --- .../onix_workflow/onix_workflow.py | 1287 --------------- .../fixtures/crossref_events_request.yaml | 3 - .../thoth_telescope/thoth_telescope.py | 251 --- .../tests/fixtures/__init__.py | 0 .../ucl_discovery_telescope.py | 399 ----- packages.txt | 2 + requirements.sh | 18 - requirements.txt | 9 + setup.cfg | 55 - setup.py | 3 - .../tests/fixtures => tests}/__init__.py | 0 .../fixtures}/__init__.py | 0 .../fixtures/onix_utils}/__init__.py | 0 .../fixtures/onix_utils/input_metadata.xml | 0 .../fixtures/onix_utils/output_metadata.jsonl | 0 .../onix_utils/test_subjects_expected.json | 0 .../onix_utils/test_subjects_input.json | 0 .../google_books_telescope}/__init__.py | 0 .../GoogleBooksTrafficReport_2020_02.csv | 0 .../GoogleBooksTrafficReport_bar2020_02.csv | 0 .../GoogleBooksTrafficReport_foo2020_02.csv | 0 .../GoogleSalesTransactionReport_2020_02.csv | 0 ...oogleSalesTransactionReport_bar2020_02.csv | 0 ...oogleSalesTransactionReport_foo2020_02.csv | 0 ...oogleSalesTransactionReport_foo2020_03.csv | 0 .../fixtures}/__init__.py | 0 .../test_google_books_telescope.py | 344 ++++ .../irus_fulcrum_telescope}/__init__.py | 0 .../fixtures}/__init__.py | 0 .../fixtures/fulcrum_download_cassette.yaml | 0 .../fixtures/test_country_download.jsonl | 0 .../fixtures/test_final_table.json | 0 .../fixtures/test_totals_download.jsonl | 0 .../fixtures/test_transform.jsonl | 0 .../test_irus_fulcrum_telescope.py | 157 +- .../irus_oapen_telescope}/__init__.py | 0 .../fixtures}/__init__.py | 0 .../fixtures/download.jsonl.gz | 0 .../test_irus_oapen_telescope.py | 465 +++--- .../jstor_telescope}/__init__.py | 0 .../jstor_telescope}/fixtures/__init__.py | 0 .../fixtures/collection_country.json | 0 .../fixtures/collection_country_table.json | 0 .../fixtures/collection_institution.json | 0 .../collection_institution_table.json | 0 .../fixtures/country_20220801.tsv | 0 .../fixtures/institution_20220801.tsv | 0 .../jstor_telescope}/test_jstor_telescope.py | 349 ++-- .../oapen_metadata_telescope}/__init__.py | 0 .../fixtures}/__init__.py | 0 .../fixtures/cassette_bad_response.yaml | 0 .../fixtures/cassette_empty.yaml | 0 .../fixtures/cassette_header_only.yaml | 0 .../fixtures/cassette_invalid.yaml | 0 .../fixtures/cassette_valid.yaml | 0 .../fixtures/empty_download.xml | 0 .../fixtures/invalid_products.xml | 0 .../fixtures/invalid_products_removed.xml | 0 .../fixtures/metadata_download_valid.xml | 0 .../fixtures/parsed_valid.xml | 0 .../fixtures/test_table.json | 0 .../test_oapen_metadata_telescope.py | 155 +- .../onix_telescope}/__init__.py | 0 .../fixtures/20210330_CURTINPRESS_ONIX.json | 0 .../fixtures/20210330_CURTINPRESS_ONIX.xml | 0 .../onix_telescope/fixtures}/__init__.py | 0 tests/onix_telescope/test_onix_telescope.py | 250 +++ .../tests => tests/onix_workflow}/__init__.py | 0 .../onix_workflow}/fixtures/__init__.py | 0 .../crossref_download_function_test.yaml | 0 .../fixtures/crossref_events_request.yaml | 3 + .../fixtures/doi_isbn_query_test.jsonl | 0 .../fixtures/e2e_inputs/bic_lookup.jsonl | 0 .../fixtures/e2e_inputs/bisac_lookup.jsonl | 0 
.../fixtures/e2e_inputs/country.jsonl | 0 .../e2e_inputs/crossref_metadata_master.jsonl | 0 .../e2e_inputs/google_analytics3.jsonl | 0 .../e2e_inputs/google_books_sales.jsonl | 0 .../e2e_inputs/google_books_traffic.jsonl | 0 .../e2e_inputs/internet_archive.jsonl | 0 .../fixtures/e2e_inputs/irus_fulcrum.jsonl | 0 .../fixtures/e2e_inputs/irus_oapen.jsonl | 0 .../fixtures/e2e_inputs/jstor_country.jsonl | 0 .../e2e_inputs/jstor_institution.jsonl | 0 .../fixtures/e2e_inputs/onix.jsonl | 0 .../fixtures/e2e_inputs/thema_lookup.jsonl | 0 .../fixtures/e2e_inputs/ucl_discovery.jsonl | 0 .../fixtures/e2e_inputs/worldreader.jsonl | 0 .../fixtures/e2e_outputs/book.json | 0 .../fixtures/e2e_outputs/book_list.json | 0 .../fixtures/e2e_outputs/book_list_dry.json | 0 .../fixtures/e2e_outputs/book_product.json | 0 .../e2e_outputs/book_product_dry.json | 0 .../fixtures/e2e_outputs/crossref_events.json | 0 .../e2e_outputs/crossref_metadata.json | 0 .../e2e_outputs/onix_workfamilyid_isbn.json | 0 .../e2e_outputs/onix_workid_isbn.json | 0 .../e2e_outputs/onix_workid_isbn_errors.json | 0 .../fixtures/schema/bic_lookup.json | 0 .../fixtures/schema/bisac_lookup.json | 0 .../fixtures/schema/country.json | 0 .../fixtures/schema/country_2019-01-01.json | 0 .../schema/crossref_metadata_master.json | 0 .../fixtures/schema/thema_lookup.json | 0 .../test_onix_work_aggregation.py | 0 .../onix_workflow}/test_onix_workflow.py | 458 +++--- .../tests => tests}/test_oaebu_partners.py | 0 .../tests => tests}/test_onix_utils.py | 11 +- .../thoth_telescope}/__init__.py | 0 .../thoth_telescope/fixtures}/__init__.py | 0 .../thoth_telescope}/fixtures/test_table.json | 0 .../fixtures/thoth_download_cassette.yaml | 0 .../thoth_telescope}/test_thoth_telescope.py | 149 +- .../ucl_discovery_telescope}/__init__.py | 0 .../fixtures}/__init__.py | 0 .../fixtures/download_cassette.yaml | 0 .../fixtures/test_table.json | 0 .../test_ucl_discovery_telescope.py | 159 +- 360 files changed, 6392 insertions(+), 6689 deletions(-) create mode 100644 .astro/config.yaml create mode 100644 .github/workflows/deploy.yml create mode 100644 Dockerfile rename oaebu_workflows/__init__.py => dags/.airflowignore (100%) rename {oaebu_workflows/dags => dags}/__init__.py (100%) create mode 100644 dags/load_dags.py rename {oaebu_workflows/google_analytics3_telescope => dags/oaebu_workflows}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/airflow_pools.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/config.py (73%) rename {oaebu_workflows/google_analytics3_telescope/schema => dags/oaebu_workflows/google_analytics3_telescope}/__init__.py (100%) rename {oaebu_workflows/google_analytics3_telescope/sql => dags/oaebu_workflows/google_analytics3_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/book_metrics_author_google_analytics3.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/book_metrics_country_google_analytics3.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/book_metrics_google_analytics3.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/book_metrics_subject_google_analytics3.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/book_product_metrics_google_analytics3.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/schema/google_analytics.json 
(100%) rename {oaebu_workflows/google_analytics3_telescope/tests => dags/oaebu_workflows/google_analytics3_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_metrics_country_body_google_analytics3.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_metrics_country_join_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_metrics_country_null_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_metrics_country_struct_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_metrics_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/book_product_functions_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_analytics3_telescope/sql/month_null_google_analytics3.sql (100%) rename {oaebu_workflows/google_analytics3_telescope/tests/fixtures => dags/oaebu_workflows/google_books_telescope}/__init__.py (100%) create mode 100644 dags/oaebu_workflows/google_books_telescope/google_books_telescope.py rename {oaebu_workflows/google_books_telescope => dags/oaebu_workflows/google_books_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_author_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_author_google_books_traffic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_country_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_google_books_traffic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_subject_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_metrics_subject_google_books_traffic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_product_metadata_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_product_metadata_google_books_traffic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_product_metrics_google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/book_product_metrics_google_books_traffic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/google_books_sales.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/schema/google_books_traffic.json (100%) rename {oaebu_workflows/google_books_telescope/schema => dags/oaebu_workflows/google_books_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => 
dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_country_body_google_books_sales.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_country_join_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_country_null_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_country_struct_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_metrics_google_books_traffic.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/book_product_functions_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/month_metrics_sum_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/month_metrics_sum_google_books_traffic.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/month_null_google_books_sales.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/google_books_telescope/sql/month_null_google_books_traffic.sql (100%) rename {oaebu_workflows/google_books_telescope/sql => dags/oaebu_workflows/irus_fulcrum_telescope}/__init__.py (100%) create mode 100644 dags/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py rename {oaebu_workflows/google_books_telescope/tests => dags/oaebu_workflows/irus_fulcrum_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_metrics_author_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_metrics_country_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_metrics_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_metrics_subject_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_product_metadata_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/book_product_metrics_irus_fulcrum.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/schema/irus_fulcrum.json (100%) rename {oaebu_workflows/google_books_telescope/tests/fixtures => dags/oaebu_workflows/irus_fulcrum_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_metrics_country_body_irus_fulcrum.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_metrics_country_join_irus_fulcrum.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_metrics_country_null_irus_fulcrum.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_metrics_country_struct_irus_fulcrum.sql (100%) rename {oaebu_workflows => 
dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_metrics_irus_fulcrum.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/book_product_functions_irus_fulcrum.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/month_metrics_sum_irus_fulcrum.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql (100%) rename {oaebu_workflows/irus_fulcrum_telescope => dags/oaebu_workflows/irus_oapen_telescope}/__init__.py (100%) create mode 100644 dags/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py rename {oaebu_workflows/irus_fulcrum_telescope => dags/oaebu_workflows/irus_oapen_telescope}/schema/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_metrics_author_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_metrics_country_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_metrics_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_metrics_subject_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_product_metadata_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/book_product_metrics_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/export_metrics_city_irus_oapen.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/schema/irus_oapen.json (100%) rename {oaebu_workflows/irus_fulcrum_telescope => dags/oaebu_workflows/irus_oapen_telescope}/sql/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_metrics_country_body_irus_oapen.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_metrics_country_join_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_metrics_country_null_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_metrics_country_struct_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_metrics_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/book_product_functions_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/month_metrics_sum_irus_oapen.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/irus_oapen_telescope/sql/month_null_irus_oapen.sql (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => dags/oaebu_workflows/jstor_telescope}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/jstor_telescope.py (58%) rename {oaebu_workflows/irus_fulcrum_telescope/tests/fixtures => dags/oaebu_workflows/jstor_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_metrics_author_jstor_country.json (100%) rename {oaebu_workflows => 
dags/oaebu_workflows}/jstor_telescope/schema/book_metrics_country_jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_metrics_jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_metrics_subject_jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_product_metadata_jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_product_metadata_jstor_institution.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_product_metrics_jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/book_product_metrics_jstor_institution.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/jstor_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/jstor_country_collection.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/jstor_institution.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/schema/jstor_institution_collection.json (100%) rename {oaebu_workflows/irus_oapen_telescope => dags/oaebu_workflows/jstor_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_author_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_country_body_jstor_country.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_country_join_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_country_null_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_country_struct_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_metrics_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_product_functions_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/book_product_functions_jstor_institution.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/month_metrics_sum_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/jstor_telescope/sql/month_null_jstor_country.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/oaebu_partners.py (99%) rename {oaebu_workflows/irus_oapen_telescope/schema => dags/oaebu_workflows/oapen_metadata_telescope}/__init__.py (100%) create mode 100644 dags/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py rename {oaebu_workflows/irus_oapen_telescope/sql => dags/oaebu_workflows/oapen_metadata_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/oapen_metadata_telescope/schema/oapen_metadata_filter.json (100%) rename {oaebu_workflows/irus_oapen_telescope/tests => dags/oaebu_workflows/oapen_metadata_telescope/sql}/__init__.py (100%) rename {oaebu_workflows/irus_oapen_telescope/tests/fixtures => dags/oaebu_workflows/onix_telescope}/__init__.py (100%) create mode 100644 
dags/oaebu_workflows/onix_telescope/onix_telescope.py rename {oaebu_workflows/jstor_telescope => dags/oaebu_workflows/onix_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_telescope/schema/onix.json (100%) rename {oaebu_workflows/jstor_telescope/schema => dags/oaebu_workflows/onix_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_utils.py (99%) rename {oaebu_workflows/jstor_telescope/sql => dags/oaebu_workflows/onix_workflow}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/onix_work_aggregation.py (100%) create mode 100644 dags/oaebu_workflows/onix_workflow/onix_workflow.py rename {oaebu_workflows/jstor_telescope/tests => dags/oaebu_workflows/onix_workflow/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_institution_list.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_list.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_author.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_city.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_country.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_events.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_institution.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_subject_bic.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_subject_bisac.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_metrics_subject_thema.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/book_product.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/crossref_events.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/crossref_metadata.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/onix_aggregate_metrics.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/onix_invalid_isbn.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/onix_workfamilyid_isbn.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/onix_workid_isbn.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/onix_workid_isbn_errors.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/platform_invalid_isbn.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/schema/platform_unmatched_isbn.json (100%) rename {oaebu_workflows/jstor_telescope/tests/fixtures => dags/oaebu_workflows/onix_workflow/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/assign_workid_workfamilyid.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_institution_list.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_list.sql.jinja2 
(100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_author.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_city.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_country.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_events.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_institution.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/book_product.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/crossref_events_filter_doi.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/crossref_metadata_filter_isbn.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/isbn_utils.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/oaebu_intermediate_metrics.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/onix_aggregate_metrics.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/onix_workflow/sql/validate_isbn.sql.jinja2 (100%) rename {oaebu_workflows/oapen_metadata_telescope => dags/oaebu_workflows/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/internet_archive/book_metrics_country_internet_archive.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/internet_archive/book_metrics_internet_archive.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/internet_archive/book_product_metadata_internet_archive.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/internet_archive/book_product_metrics_internet_archive.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/internet_archive/internet_archive.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/worldreader/book_metrics_country_worldreader.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/worldreader/book_metrics_worldreader.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/worldreader/book_product_metadata_worldreader.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/worldreader/book_product_metrics_worldreader.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/schema/worldreader/worldreader.json (100%) rename {oaebu_workflows/oapen_metadata_telescope/schema => dags/oaebu_workflows/sql}/__init__.py (100%) rename {oaebu_workflows/oapen_metadata_telescope/sql => dags/oaebu_workflows/sql/internet_archive}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/internet_archive/book_metrics_internet_archive.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/internet_archive/book_product_body_internet_archive.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/internet_archive/month_null_internet_archive.sql (100%) rename {oaebu_workflows => 
dags/oaebu_workflows}/sql/worldreader/book_metrics_country_body_worldreader.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_metrics_country_join_worldreader.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_metrics_country_null_worldreader.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_metrics_country_struct_worldreader.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_metrics_worldreader.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_product_body_worldreader.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/book_product_functions_worldreader.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/sql/worldreader/month_null_worldreader.sql (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => dags/oaebu_workflows/thoth_telescope}/__init__.py (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests/fixtures => dags/oaebu_workflows/thoth_telescope/schema}/__init__.py (100%) rename {oaebu_workflows/onix_telescope => dags/oaebu_workflows/thoth_telescope/sql}/__init__.py (100%) create mode 100644 dags/oaebu_workflows/thoth_telescope/thoth_telescope.py rename {oaebu_workflows/onix_telescope/schema => dags/oaebu_workflows/ucl_discovery_telescope}/__init__.py (100%) rename {oaebu_workflows/onix_telescope/sql => dags/oaebu_workflows/ucl_discovery_telescope/schema}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/schema/book_metrics_country_ucl_discovery.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/schema/book_metrics_ucl_discovery.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/schema/book_product_metadata_ucl_discovery.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/schema/book_product_metrics_ucl_discovery.json (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/schema/ucl_discovery.json (100%) rename {oaebu_workflows/onix_telescope/tests => dags/oaebu_workflows/ucl_discovery_telescope/sql}/__init__.py (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_metrics_country_body_ucl_discovery.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_metrics_country_join_ucl_discovery.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_metrics_country_null_ucl_discovery.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_metrics_country_struct_ucl_discovery.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_metrics_ucl_discovery.sql (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/book_product_body_ucl_discovery.sql.jinja2 (100%) rename {oaebu_workflows => dags/oaebu_workflows}/ucl_discovery_telescope/sql/month_null_ucl_discovery.sql (100%) create mode 100644 dags/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py delete mode 100644 oaebu_workflows/google_analytics3_telescope/google_analytics3_telescope.py delete mode 100644 oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table.json delete mode 100644 oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table_anu.json delete mode 100644 
oaebu_workflows/google_analytics3_telescope/tests/test_google_analytics3_telescope.py delete mode 100644 oaebu_workflows/google_books_telescope/google_books_telescope.py delete mode 100644 oaebu_workflows/google_books_telescope/tests/test_google_books_telescope.py delete mode 100644 oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py delete mode 100644 oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py delete mode 100644 oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py delete mode 100644 oaebu_workflows/onix_telescope/onix_telescope.py delete mode 100644 oaebu_workflows/onix_telescope/tests/test_onix_telescope.py delete mode 100644 oaebu_workflows/onix_workflow/onix_workflow.py delete mode 100644 oaebu_workflows/onix_workflow/tests/fixtures/crossref_events_request.yaml delete mode 100644 oaebu_workflows/thoth_telescope/thoth_telescope.py delete mode 100644 oaebu_workflows/ucl_discovery_telescope/tests/fixtures/__init__.py delete mode 100644 oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py create mode 100644 packages.txt delete mode 100644 requirements.sh delete mode 100644 setup.cfg delete mode 100644 setup.py rename {oaebu_workflows/onix_telescope/tests/fixtures => tests}/__init__.py (100%) rename {oaebu_workflows/onix_workflow => tests/fixtures}/__init__.py (100%) rename {oaebu_workflows/onix_workflow/schema => tests/fixtures/onix_utils}/__init__.py (100%) rename {oaebu_workflows/tests => tests}/fixtures/onix_utils/input_metadata.xml (100%) rename {oaebu_workflows/tests => tests}/fixtures/onix_utils/output_metadata.jsonl (100%) rename {oaebu_workflows/tests => tests}/fixtures/onix_utils/test_subjects_expected.json (100%) rename {oaebu_workflows/tests => tests}/fixtures/onix_utils/test_subjects_input.json (100%) rename {oaebu_workflows/onix_workflow/sql => tests/google_books_telescope}/__init__.py (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleBooksTrafficReport_2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleBooksTrafficReport_bar2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleBooksTrafficReport_foo2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleSalesTransactionReport_2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleSalesTransactionReport_bar2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleSalesTransactionReport_foo2020_02.csv (100%) rename {oaebu_workflows/google_books_telescope/tests => tests/google_books_telescope}/fixtures/GoogleSalesTransactionReport_foo2020_03.csv (100%) rename {oaebu_workflows/onix_workflow/tests => tests/google_books_telescope/fixtures}/__init__.py (100%) create mode 100644 tests/google_books_telescope/test_google_books_telescope.py rename {oaebu_workflows/onix_workflow/tests/fixtures => tests/irus_fulcrum_telescope}/__init__.py (100%) rename {oaebu_workflows/schema => tests/irus_fulcrum_telescope/fixtures}/__init__.py (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/fixtures/fulcrum_download_cassette.yaml (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/fixtures/test_country_download.jsonl (100%) rename 
{oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/fixtures/test_final_table.json (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/fixtures/test_totals_download.jsonl (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/fixtures/test_transform.jsonl (100%) rename {oaebu_workflows/irus_fulcrum_telescope/tests => tests/irus_fulcrum_telescope}/test_irus_fulcrum_telescope.py (60%) rename {oaebu_workflows/sql => tests/irus_oapen_telescope}/__init__.py (100%) rename {oaebu_workflows/sql/internet_archive => tests/irus_oapen_telescope/fixtures}/__init__.py (100%) rename {oaebu_workflows/irus_oapen_telescope/tests => tests/irus_oapen_telescope}/fixtures/download.jsonl.gz (100%) rename {oaebu_workflows/irus_oapen_telescope/tests => tests/irus_oapen_telescope}/test_irus_oapen_telescope.py (59%) rename {oaebu_workflows/tests => tests/jstor_telescope}/__init__.py (100%) rename {oaebu_workflows/tests => tests/jstor_telescope}/fixtures/__init__.py (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/collection_country.json (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/collection_country_table.json (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/collection_institution.json (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/collection_institution_table.json (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/country_20220801.tsv (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/fixtures/institution_20220801.tsv (100%) rename {oaebu_workflows/jstor_telescope/tests => tests/jstor_telescope}/test_jstor_telescope.py (75%) rename {oaebu_workflows/tests/fixtures/onix_utils => tests/oapen_metadata_telescope}/__init__.py (100%) rename {oaebu_workflows/thoth_telescope => tests/oapen_metadata_telescope/fixtures}/__init__.py (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/cassette_bad_response.yaml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/cassette_empty.yaml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/cassette_header_only.yaml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/cassette_invalid.yaml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/cassette_valid.yaml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/empty_download.xml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/invalid_products.xml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/invalid_products_removed.xml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/metadata_download_valid.xml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/parsed_valid.xml (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => tests/oapen_metadata_telescope}/fixtures/test_table.json (100%) rename {oaebu_workflows/oapen_metadata_telescope/tests => 
tests/oapen_metadata_telescope}/test_oapen_metadata_telescope.py (65%) rename {oaebu_workflows/thoth_telescope/schema => tests/onix_telescope}/__init__.py (100%) rename {oaebu_workflows/onix_telescope/tests => tests/onix_telescope}/fixtures/20210330_CURTINPRESS_ONIX.json (100%) rename {oaebu_workflows/onix_telescope/tests => tests/onix_telescope}/fixtures/20210330_CURTINPRESS_ONIX.xml (100%) rename {oaebu_workflows/thoth_telescope/sql => tests/onix_telescope/fixtures}/__init__.py (100%) create mode 100644 tests/onix_telescope/test_onix_telescope.py rename {oaebu_workflows/thoth_telescope/tests => tests/onix_workflow}/__init__.py (100%) rename {oaebu_workflows/thoth_telescope/tests => tests/onix_workflow}/fixtures/__init__.py (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/crossref_download_function_test.yaml (100%) create mode 100644 tests/onix_workflow/fixtures/crossref_events_request.yaml rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/doi_isbn_query_test.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/bic_lookup.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/bisac_lookup.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/country.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/crossref_metadata_master.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/google_analytics3.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/google_books_sales.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/google_books_traffic.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/internet_archive.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/irus_fulcrum.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/irus_oapen.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/jstor_country.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/jstor_institution.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/onix.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/thema_lookup.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/ucl_discovery.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_inputs/worldreader.jsonl (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/book.json (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/book_list.json (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/book_list_dry.json (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/book_product.json (100%) rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/book_product_dry.json (100%) rename {oaebu_workflows/onix_workflow/tests => 
tests/onix_workflow}/fixtures/e2e_outputs/crossref_events.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/crossref_metadata.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/onix_workfamilyid_isbn.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/onix_workid_isbn.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/e2e_outputs/onix_workid_isbn_errors.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/bic_lookup.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/bisac_lookup.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/country.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/country_2019-01-01.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/crossref_metadata_master.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/fixtures/schema/thema_lookup.json (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/test_onix_work_aggregation.py (100%)
rename {oaebu_workflows/onix_workflow/tests => tests/onix_workflow}/test_onix_workflow.py (78%)
rename {oaebu_workflows/tests => tests}/test_oaebu_partners.py (100%)
rename {oaebu_workflows/tests => tests}/test_onix_utils.py (99%)
rename {oaebu_workflows/ucl_discovery_telescope => tests/thoth_telescope}/__init__.py (100%)
rename {oaebu_workflows/ucl_discovery_telescope/schema => tests/thoth_telescope/fixtures}/__init__.py (100%)
rename {oaebu_workflows/thoth_telescope/tests => tests/thoth_telescope}/fixtures/test_table.json (100%)
rename {oaebu_workflows/thoth_telescope/tests => tests/thoth_telescope}/fixtures/thoth_download_cassette.yaml (100%)
rename {oaebu_workflows/thoth_telescope/tests => tests/thoth_telescope}/test_thoth_telescope.py (60%)
rename {oaebu_workflows/ucl_discovery_telescope/sql => tests/ucl_discovery_telescope}/__init__.py (100%)
rename {oaebu_workflows/ucl_discovery_telescope/tests => tests/ucl_discovery_telescope/fixtures}/__init__.py (100%)
rename {oaebu_workflows/ucl_discovery_telescope/tests => tests/ucl_discovery_telescope}/fixtures/download_cassette.yaml (100%)
rename {oaebu_workflows/ucl_discovery_telescope/tests => tests/ucl_discovery_telescope}/fixtures/test_table.json (100%)
rename {oaebu_workflows/ucl_discovery_telescope/tests => tests/ucl_discovery_telescope}/test_ucl_discovery_telescope.py (81%)
diff --git a/.astro/config.yaml b/.astro/config.yaml
new file mode 100644
index 00000000..87830aca
--- /dev/null
+++ b/.astro/config.yaml
@@ -0,0 +1,2 @@
+project:
+  name: oaebu-workflows
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 00000000..aa46ec79
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,19 @@
+name: Astronomer CI - Deploy code
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+env:
+  ## Set API Token as an environment variable
+  ASTRO_API_TOKEN: ${{ secrets.ASTRO_API_TOKEN }}
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Deploy to Astro
+        uses: astronomer/deploy-action@v0.4
+        with:
+          deployment-id: ${{ secrets.ASTRO_DEPLOYMENT_ID }}
\ No newline at end of file
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 8bd77a5d..b5cd8179 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -1,6 +1,6 @@
 name: Unit Tests
 
-on: [push, merge_group]
+on: [push]
 
 jobs:
   build:
@@ -13,12 +13,12 @@ jobs:
     steps:
       - name: Checkout ${{ matrix.python-version }}
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
           lfs: true
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
 
@@ -27,14 +27,17 @@ jobs:
           python -m pip install --upgrade pip
           cd ..
-         git clone https://github.com/The-Academic-Observatory/observatory-platform.git
+         git clone -b feature/astro-refactor https://github.com/The-Academic-Observatory/observatory-platform.git
          cd observatory-platform
-         pip install -e observatory-api --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-no-providers-${{ matrix.python-version }}.txt
-         pip install -e observatory-platform --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-no-providers-${{ matrix.python-version }}.txt
+         pip install -e .[tests] --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-no-providers-${{ matrix.python-version }}.txt
          cd ..
          cd oaebu-workflows
-         pip install -e .[tests] --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-no-providers-${{ matrix.python-version }}.txt
+         pip install -r requirements.txt --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-no-providers-${{ matrix.python-version }}.txt
+
+      - name: Add dag folder to PYTHONPATH
+        run: |
+          echo "PYTHONPATH=$HOME/work/oaebu-workflows/oaebu-workflows/dags" >> "$GITHUB_ENV"
 
       # Required for testing ONIX Telescope
       - name: Set up JDK 11
diff --git a/.gitignore b/.gitignore
index 72e709fe..f977fc9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,26 +1,20 @@
 .idea
 venv
-*.egg-info
 dist
 .vscode
 docs/_build
-config.yaml
 *.pkrvars.hcl
-*.tfvars
-*.tfstate
-*.tfstate.*
-.terraform.tfstate.lock.info
-.terraform/**
 .coverage
 coverage.xml
-config_terraform.yaml
 *.pyc
 AUTHORS
 ChangeLog
-.eggs/
 .DS_Store
-/oaebu_workflows/workflows/oapen_cloud_function.zip
 docs/schemas
 *.Rhistory
 .env
 __pycache__
+/oaebu-workflows/build
+
+# Astronomer
+airflow_settings.yaml
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..9f8f9e97
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM quay.io/astronomer/astro-runtime:9.10.0
+
+# Root user for installations
+USER root
+
+# Install git
+RUN apt-get update && apt-get install git -y
+
+USER astro
+
+# Install Observatory Platform
+RUN git clone --branch feature/astro-refactor https://github.com/The-Academic-Observatory/observatory-platform.git
+RUN pip install -e ./observatory-platform/ --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-no-providers-3.10.txt
\ No newline at end of file
diff --git a/oaebu_workflows/__init__.py b/dags/.airflowignore
similarity index 100%
rename from oaebu_workflows/__init__.py
rename to dags/.airflowignore
diff --git a/oaebu_workflows/dags/__init__.py b/dags/__init__.py
similarity index 100%
rename from oaebu_workflows/dags/__init__.py
rename to dags/__init__.py
diff --git a/dags/load_dags.py b/dags/load_dags.py
new file mode 100644
index 00000000..c4611150
--- /dev/null
+++ b/dags/load_dags.py
@@ -0,0 +1,3 @@
+from observatory_platform.airflow.workflow import load_dags_from_config
+
+load_dags_from_config() diff --git a/oaebu_workflows/google_analytics3_telescope/__init__.py b/dags/oaebu_workflows/__init__.py similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/__init__.py rename to dags/oaebu_workflows/__init__.py diff --git a/oaebu_workflows/airflow_pools.py b/dags/oaebu_workflows/airflow_pools.py similarity index 100% rename from oaebu_workflows/airflow_pools.py rename to dags/oaebu_workflows/airflow_pools.py diff --git a/oaebu_workflows/config.py b/dags/oaebu_workflows/config.py similarity index 73% rename from oaebu_workflows/config.py rename to dags/oaebu_workflows/config.py index c4da216b..b8781a32 100644 --- a/oaebu_workflows/config.py +++ b/dags/oaebu_workflows/config.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ import os from typing import Optional -from observatory.platform.config import module_file_path +from observatory_platform.config import module_file_path def test_fixtures_folder(workflow_module: Optional[str] = None) -> str: @@ -29,9 +29,9 @@ def test_fixtures_folder(workflow_module: Optional[str] = None) -> str: """ if workflow_module is not None: - return construct_module_path("oaebu_workflows", workflow_module, "tests", "fixtures") + return construct_module_path("tests", workflow_module, "fixtures") - return construct_module_path("oaebu_workflows", "tests", "fixtures") + return construct_module_path("tests", "fixtures") def schema_folder(workflow_module: Optional[str] = None) -> str: @@ -43,9 +43,9 @@ def schema_folder(workflow_module: Optional[str] = None) -> str: """ if workflow_module is not None: - return construct_module_path("oaebu_workflows", workflow_module, "schema") + return construct_module_path("dags", "oaebu_workflows", workflow_module, "schema") - return construct_module_path("oaebu_workflows", "schema") + return construct_module_path("dags", "oaebu_workflows", "schema") def sql_folder(workflow_module: Optional[str] = None) -> str: @@ -57,9 +57,9 @@ def sql_folder(workflow_module: Optional[str] = None) -> str: """ if workflow_module is not None: - return construct_module_path("oaebu_workflows", workflow_module, "sql") + return construct_module_path("dags", "oaebu_workflows", workflow_module, "sql") - return construct_module_path("oaebu_workflows", "sql") + return construct_module_path("dags", "oaebu_workflows", "sql") def construct_module_path(*parts: str) -> str: @@ -71,3 +71,9 @@ def construct_module_path(*parts: str) -> str: raise FileNotFoundError(f"construct_module_path: directory {file_path} does not exist!") return file_path + + +def oaebu_user_agent_header() -> dict: + return { + "User-Agent": "oaebu-workflows v1.0.0 (+https://github.com/The-Academic-Observatory/oaebu-workflows; mailto:agent@observatory.academy) " + } diff --git a/oaebu_workflows/google_analytics3_telescope/schema/__init__.py b/dags/oaebu_workflows/google_analytics3_telescope/__init__.py similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/__init__.py rename to dags/oaebu_workflows/google_analytics3_telescope/__init__.py diff --git a/oaebu_workflows/google_analytics3_telescope/sql/__init__.py b/dags/oaebu_workflows/google_analytics3_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/__init__.py rename to 
dags/oaebu_workflows/google_analytics3_telescope/schema/__init__.py diff --git a/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_author_google_analytics3.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_author_google_analytics3.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/book_metrics_author_google_analytics3.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_author_google_analytics3.json diff --git a/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_country_google_analytics3.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_country_google_analytics3.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/book_metrics_country_google_analytics3.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_country_google_analytics3.json diff --git a/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_google_analytics3.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_google_analytics3.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/book_metrics_google_analytics3.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_google_analytics3.json diff --git a/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_subject_google_analytics3.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_subject_google_analytics3.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/book_metrics_subject_google_analytics3.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/book_metrics_subject_google_analytics3.json diff --git a/oaebu_workflows/google_analytics3_telescope/schema/book_product_metrics_google_analytics3.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/book_product_metrics_google_analytics3.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/book_product_metrics_google_analytics3.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/book_product_metrics_google_analytics3.json diff --git a/oaebu_workflows/google_analytics3_telescope/schema/google_analytics.json b/dags/oaebu_workflows/google_analytics3_telescope/schema/google_analytics.json similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/schema/google_analytics.json rename to dags/oaebu_workflows/google_analytics3_telescope/schema/google_analytics.json diff --git a/oaebu_workflows/google_analytics3_telescope/tests/__init__.py b/dags/oaebu_workflows/google_analytics3_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/tests/__init__.py rename to dags/oaebu_workflows/google_analytics3_telescope/sql/__init__.py diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_body_google_analytics3.sql.jinja2 b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_body_google_analytics3.sql.jinja2 similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_body_google_analytics3.sql.jinja2 rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_body_google_analytics3.sql.jinja2 diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_join_google_analytics3.sql 
b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_join_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_join_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_join_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_null_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_null_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_null_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_null_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_struct_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_struct_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_struct_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_country_struct_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_metrics_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_metrics_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_product_body_google_analytics3.sql.jinja2 diff --git a/oaebu_workflows/google_analytics3_telescope/sql/book_product_functions_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/book_product_functions_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/book_product_functions_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/book_product_functions_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/month_metrics_sum_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/sql/month_null_google_analytics3.sql b/dags/oaebu_workflows/google_analytics3_telescope/sql/month_null_google_analytics3.sql similarity index 100% rename from oaebu_workflows/google_analytics3_telescope/sql/month_null_google_analytics3.sql rename to dags/oaebu_workflows/google_analytics3_telescope/sql/month_null_google_analytics3.sql diff --git a/oaebu_workflows/google_analytics3_telescope/tests/fixtures/__init__.py b/dags/oaebu_workflows/google_books_telescope/__init__.py similarity index 100% rename from 
oaebu_workflows/google_analytics3_telescope/tests/fixtures/__init__.py rename to dags/oaebu_workflows/google_books_telescope/__init__.py diff --git a/dags/oaebu_workflows/google_books_telescope/google_books_telescope.py b/dags/oaebu_workflows/google_books_telescope/google_books_telescope.py new file mode 100644 index 00000000..09e7a525 --- /dev/null +++ b/dags/oaebu_workflows/google_books_telescope/google_books_telescope.py @@ -0,0 +1,437 @@ +# Copyright 2020-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Aniek Roelofs, Keegan Smith + +import csv +import os +import re +from collections import OrderedDict, defaultdict +from typing import List, Tuple, Union + +import pendulum +from airflow.exceptions import AirflowException, AirflowSkipException +from airflow.decorators import dag, task, task_group +from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition, Client + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import convert, add_partition_date, save_jsonl_gz +from observatory_platform.google.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path, gcs_download_blob +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.google.bigquery import bq_load_table, bq_table_id, bq_create_dataset +from observatory_platform.sftp import SftpFolders, make_sftp_connection +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.release import PartitionRelease, set_task_state +from observatory_platform.airflow.airflow import on_failure_callback + + +class GoogleBooksRelease(PartitionRelease): + def __init__( + self, + dag_id: str, + run_id: str, + partition_date: pendulum.DateTime, + sftp_files: List[str], + ): + """Construct a GoogleBooksRelease. + + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param partition_date: the partition date, corresponds to the last day of the month being processed. + :param sftp_files: List of full filepaths to download from sftp service (incl. 
in_progress folder) + """ + super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) + self.download_sales_file_name = "google_books_sales.csv" + self.download_traffic_file_name = "google_books_traffic.csv" + self.transform_sales_file_name = "google_books_sales.jsonl.gz" + self.transform_traffic_file_name = "google_books_traffic.jsonl.gz" + self.sftp_files = sftp_files + + @property + def download_sales_path(self): + return os.path.join(self.download_folder, self.download_sales_file_name) + + @property + def download_traffic_path(self): + return os.path.join(self.download_folder, self.download_traffic_file_name) + + @property + def transform_sales_path(self): + return os.path.join(self.transform_folder, self.transform_sales_file_name) + + @property + def transform_traffic_path(self): + return os.path.join(self.transform_folder, self.transform_traffic_file_name) + + @property + def download_sales_blob_name(self): + return gcs_blob_name_from_path(self.download_sales_path) + + @property + def download_traffic_blob_name(self): + return gcs_blob_name_from_path(self.download_traffic_path) + + @property + def transform_sales_blob_name(self): + return gcs_blob_name_from_path(self.transform_sales_path) + + @property + def transform_traffic_blob_name(self): + return gcs_blob_name_from_path(self.transform_traffic_path) + + @staticmethod + def from_dict(dict_: dict): + return GoogleBooksRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + partition_date=pendulum.parse(dict_["partition_date"]), + sftp_files=dict_["sftp_files"], + ) + + def to_dict(self): + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "partition_date": self.partition_date.to_date_string(), + "sftp_files": self.sftp_files, + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + sftp_root: str = "/", + sales_partner: Union[str, OaebuPartner] = "google_books_sales", + traffic_partner: Union[str, OaebuPartner] = "google_books_traffic", + bq_dataset_description: str = "Data from Google sources", + bq_sales_table_description: str = None, + bq_traffic_table_description: str = None, + api_dataset_id: str = "google_books", + sftp_service_conn_id: str = "sftp_service", + catchup: bool = False, + schedule: str = "0 12 * * Sun", # Midday every sunday + start_date: pendulum.DateTime = pendulum.datetime(2018, 1, 1), + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct a GoogleBooks DAG. 
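+ + The DAG lists new sales and traffic report files on the publisher's SFTP server, moves them to an in-progress folder, downloads and transforms them, loads the results into monthly partitioned BigQuery tables, records a dataset release and then cleans up the workflow folders.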
+ :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param sftp_root: The root of the SFTP filesystem to work with + :param sales_partner: The name of the sales partner + :param traffic_partner: The name of the traffic partner + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_sales_table_description: Description for the BigQuery Google Books Sales table + :param bq_traffic_table_description: Description for the BigQuery Google Books Traffic table + :param api_dataset_id: The ID to store the dataset release in the API + :param sftp_service_conn_id: Airflow connection ID for the SFTP service + :param catchup: Whether to catchup the DAG or not + :param schedule: The schedule interval of the DAG + :param start_date: The start date of the DAG + :param max_active_runs: The maximum number of active DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + sales_partner = partner_from_str(sales_partner) + traffic_partner = partner_from_str(traffic_partner) + + # Extra SFTP parameters + sftp_folders = SftpFolders(dag_id, sftp_conn_id=sftp_service_conn_id, sftp_root=sftp_root) + sftp_regex = r"^Google(SalesTransaction|BooksTraffic)Report_\d{4}_\d{2}.csv$" + + @dag( + dag_id=dag_id, + start_date=start_date, + schedule=schedule, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def google_books(): + @task + def fetch_releases(**context) -> List[dict]: + """Lists all Google Books releases available on the SFTP server + + :returns: List of release dictionaries + """ + + reports = defaultdict(list) + # List all reports in the 'upload' folder of the organisation + with make_sftp_connection(sftp_service_conn_id) as sftp: + files = sftp.listdir(sftp_folders.upload) + for file_name in files: + match = re.match(sftp_regex, file_name) + if match: + # Get the release date from file name + date_str = file_name[-11:].strip(".csv") + release_date = pendulum.from_format(date_str, "YYYY_MM").end_of("month") + release_date = release_date.format("YYYYMMDD") + report_type = match.group(1) # Get the report type from file name + # Create the full path of the file for the 'in progress' folder + sftp_file = os.path.join(sftp_folders.in_progress, file_name) + reports[report_type + release_date].append(sftp_file) + + # Check that for each report type + date combination there is a report available + release_info = defaultdict(list) + for report, sftp_files in reports.items(): + release_date = report[-8:] + release_info[release_date] += sftp_files + + if not bool(release_info): + raise AirflowSkipException("No new releases available. 
Skipping downstream DAG tasks.") + + releases = [] + run_id = context["run_id"] + for partition_date, sftp_files in release_info.items(): + releases.append( + GoogleBooksRelease( + dag_id, run_id=run_id, partition_date=pendulum.parse(partition_date), sftp_files=sftp_files + ) + ) + return [r.to_dict() for r in releases] + + @task_group(group_id="process_release") + def process_release(data: dict, **context): + """Process the Google Books release.""" + + @task + def move_files_to_in_progress(release: dict, **context) -> None: + """Move Google Books files to SFTP in-progress folder.""" + + release = GoogleBooksRelease.from_dict(release) + sftp_folders.move_files_to_in_progress(release.sftp_files) + + @task + def download(release: dict, **context) -> None: + """Downloads the Google Books release files and uploads them to GCS""" + + release = GoogleBooksRelease.from_dict(release) + with make_sftp_connection(sftp_service_conn_id) as sftp: + for file in release.sftp_files: + if "Traffic" in file: + sftp.get(file, localpath=release.download_traffic_path) + elif "Transaction" in file: + sftp.get(file, localpath=release.download_sales_path) + if not os.path.exists(release.download_traffic_path) or not os.path.exists(release.download_sales_path): + raise FileNotFoundError( + f"Release files not found. {release.download_traffic_path} | {release.download_sales_path}" + ) + + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, + file_paths=[release.download_sales_path, release.download_traffic_path], + ) + if not success: + raise AirflowException( + f"Files could not be uploaded to cloud storage bucket: {cloud_workspace.download_bucket}" + ) + + @task + def transform(release: dict, **context) -> None: + """Transforms the Google Books release files and uploads them to GCS""" + + release = GoogleBooksRelease.from_dict(release) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_sales_blob_name, + file_path=release.download_sales_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_sales_blob_name}") + + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_traffic_blob_name, + file_path=release.download_traffic_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_traffic_blob_name}") + + gb_transform( + download_files=(release.download_sales_path, release.download_traffic_path), + sales_path=release.transform_sales_path, + traffic_path=release.transform_traffic_path, + release_date=release.partition_date, + ) + # Upload the transformed files to GCS for this release + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, + file_paths=[release.transform_sales_path, release.transform_traffic_path], + ) + if not success: + raise AirflowException( + f"Files could not be uploaded to cloud storage bucket: {cloud_workspace.transform_bucket}" + ) + + @task + def move_files_to_finished(release: dict, **context) -> None: + """Move Google Books files to SFTP finished folder.""" + + release = GoogleBooksRelease.from_dict(release) + sftp_folders.move_files_to_finished(release.sftp_files) + + @task + def bq_load(release: dict, **context) -> None: + """Loads the sales and traffic data into BigQuery""" + + release = GoogleBooksRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + for partner, table_description, file_path in [ +
[sales_partner, bq_sales_table_description, release.transform_sales_path], + [traffic_partner, bq_traffic_table_description, release.transform_traffic_path], + ]: + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(file_path)) + table_id = bq_table_id(cloud_workspace.project_id, partner.bq_dataset_id, partner.bq_table_name) + success = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + partition_type=TimePartitioningType.MONTH, + partition=True, + partition_field="release_date", + write_disposition=WriteDisposition.WRITE_APPEND, + table_description=table_description, + ignore_unknown_values=True, + client=client, + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task + def add_new_dataset_release(release: dict, **context) -> None: + """Adds release information to API.""" + + release = GoogleBooksRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + data_interval_start=context["data_interval_start"], + data_interval_end=context["data_interval_end"], + partition_date=release.partition_date, + ) + api.add_dataset_release(dataset_release) + + @task + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = GoogleBooksRelease.from_dict(release) + cleanup( + dag_id=dag_id, execution_date=context["execution_date"], workflow_folder=release.workflow_folder + ) + + ( + move_files_to_in_progress(data) + >> download(data) + >> transform(data) + >> move_files_to_finished(data) + >> bq_load(data) + >> add_new_dataset_release(data) + >> cleanup_workflow(data) + ) + + # Define dag tasks + task_check_dependencies = check_dependencies(airflow_conns=[sftp_service_conn_id]) + xcom_releases = fetch_releases() + process_release_task_group = process_release.expand(data=xcom_releases) + + task_check_dependencies >> xcom_releases >> process_release_task_group + + return google_books() + + +def gb_transform( + download_files: Tuple[str, str], sales_path: str, traffic_path: str, release_date: pendulum.DateTime +) -> None: + """Transforms sales and traffic reports. For both reports it transforms the csv into a jsonl file and + replaces spaces in the keys with underscores. 
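+ For sales rows, the Transaction_Date is normalised to YYYY-MM-DD and checked against the release month and the Publisher_Revenue_Perc value has its percentage sign stripped; for traffic rows, the Buy_Link_CTR percentage sign is stripped. A release_date partition column is added to every row.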
+ + :param download_files: The Google Books Sales and Traffic files + :param sales_path: The file path to save the transformed sales data to + :param traffic_path: The file path to save the transformed traffic data to + :param release_date: The release date to use as a partitioning date + """ + # Sort files to get same hash for unit tests + + results = defaultdict(list) + results["sales"] = [] + results["traffic"] = [] + for file in download_files: + report_type = "sales" if "sales" in os.path.basename(file).lower() else "traffic" + with open(file, encoding="utf-16") as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter="\t") + for row in csv_reader: + transformed_row = OrderedDict((convert(k.replace("%", "Perc")), v) for k, v in row.items()) + # Sales transaction report + if report_type == "sales": + transaction_date = pendulum.from_format(transformed_row["Transaction_Date"], "MM/DD/YY") + + # Sanity check that transaction date is in month of release date + if release_date.start_of("month") <= transaction_date <= release_date.end_of("month"): + pass + else: + raise AirflowException( + "Transaction date does not fall within release month. " + f"Transaction date: {transaction_date.strftime('%Y-%m-%d')}, " + f"release month: {release_date.strftime('%Y-%m')}" + ) + + # Transform to valid date format + transformed_row["Transaction_Date"] = transaction_date.strftime("%Y-%m-%d") + + # Remove percentage sign + transformed_row["Publisher_Revenue_Perc"] = transformed_row["Publisher_Revenue_Perc"].strip("%") + # This field is not present for some publishers (UCL Press), for ANU Press the field value is + # “E-Book” + try: + transformed_row["Line_of_Business"] + except KeyError: + transformed_row["Line_of_Business"] = None + # Traffic report + else: + # Remove percentage sign + transformed_row["Buy_Link_CTR"] = transformed_row["Buy_Link_CTR"].strip("%") + + # Append results + results[report_type].append(transformed_row) + + for report_type, report_results in results.items(): + report_results = add_partition_date( + report_results, + partition_date=release_date, + partition_type=TimePartitioningType.MONTH, + partition_field="release_date", + ) + save_path = sales_path if report_type == "sales" else traffic_path + print(f"SAVING REPORT '{report_type}' to {save_path}") + save_jsonl_gz(save_path, report_results) diff --git a/oaebu_workflows/google_books_telescope/__init__.py b/dags/oaebu_workflows/google_books_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/google_books_telescope/__init__.py rename to dags/oaebu_workflows/google_books_telescope/schema/__init__.py diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_traffic.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_author_google_books_traffic.json diff --git 
a/oaebu_workflows/google_books_telescope/schema/book_metrics_country_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_country_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_country_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_country_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_traffic.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_google_books_traffic.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_traffic.json b/dags/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_metrics_subject_google_books_traffic.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_traffic.json b/dags/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_product_metadata_google_books_traffic.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_traffic.json 
b/dags/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/book_product_metrics_google_books_traffic.json diff --git a/oaebu_workflows/google_books_telescope/schema/google_books_sales.json b/dags/oaebu_workflows/google_books_telescope/schema/google_books_sales.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/google_books_sales.json rename to dags/oaebu_workflows/google_books_telescope/schema/google_books_sales.json diff --git a/oaebu_workflows/google_books_telescope/schema/google_books_traffic.json b/dags/oaebu_workflows/google_books_telescope/schema/google_books_traffic.json similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/google_books_traffic.json rename to dags/oaebu_workflows/google_books_telescope/schema/google_books_traffic.json diff --git a/oaebu_workflows/google_books_telescope/schema/__init__.py b/dags/oaebu_workflows/google_books_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/google_books_telescope/schema/__init__.py rename to dags/oaebu_workflows/google_books_telescope/sql/__init__.py diff --git a/oaebu_workflows/google_books_telescope/sql/book_metrics_country_body_google_books_sales.sql.jinja2 b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_body_google_books_sales.sql.jinja2 similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_country_body_google_books_sales.sql.jinja2 rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_body_google_books_sales.sql.jinja2 diff --git a/oaebu_workflows/google_books_telescope/sql/book_metrics_country_join_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_join_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_country_join_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_join_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/book_metrics_country_null_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_null_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_country_null_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_null_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/book_metrics_country_struct_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_struct_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_country_struct_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_country_struct_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_sales.sql diff --git 
a/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_traffic.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_traffic.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_traffic.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_metrics_google_books_traffic.sql diff --git a/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 b/dags/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 rename to dags/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_sales.sql.jinja2 diff --git a/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 b/dags/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 rename to dags/oaebu_workflows/google_books_telescope/sql/book_product_body_google_books_traffic.sql.jinja2 diff --git a/oaebu_workflows/google_books_telescope/sql/book_product_functions_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/book_product_functions_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/book_product_functions_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/book_product_functions_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_traffic.sql b/dags/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_traffic.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_traffic.sql rename to dags/oaebu_workflows/google_books_telescope/sql/month_metrics_sum_google_books_traffic.sql diff --git a/oaebu_workflows/google_books_telescope/sql/month_null_google_books_sales.sql b/dags/oaebu_workflows/google_books_telescope/sql/month_null_google_books_sales.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/month_null_google_books_sales.sql rename to dags/oaebu_workflows/google_books_telescope/sql/month_null_google_books_sales.sql diff --git a/oaebu_workflows/google_books_telescope/sql/month_null_google_books_traffic.sql b/dags/oaebu_workflows/google_books_telescope/sql/month_null_google_books_traffic.sql similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/month_null_google_books_traffic.sql rename to dags/oaebu_workflows/google_books_telescope/sql/month_null_google_books_traffic.sql diff --git a/oaebu_workflows/google_books_telescope/sql/__init__.py b/dags/oaebu_workflows/irus_fulcrum_telescope/__init__.py similarity index 100% rename from oaebu_workflows/google_books_telescope/sql/__init__.py rename to dags/oaebu_workflows/irus_fulcrum_telescope/__init__.py diff --git 
a/dags/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py b/dags/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py new file mode 100644 index 00000000..37983e36 --- /dev/null +++ b/dags/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py @@ -0,0 +1,419 @@ +# Copyright 2022-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Keegan Smith + +import logging +import os +from typing import List, Tuple, Union + +import pendulum +from airflow.hooks.base import BaseHook +from airflow.decorators import dag, task +from google.cloud.bigquery import SourceFormat, WriteDisposition, Client +from google.cloud.bigquery.table import TimePartitioningType + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import save_jsonl_gz, load_jsonl, add_partition_date +from observatory_platform.google.gcs import gcs_blob_name_from_path, gcs_upload_files, gcs_blob_uri, gcs_download_blob +from observatory_platform.google.bigquery import bq_load_table, bq_create_dataset, bq_table_id +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.release import PartitionRelease, set_task_state +from observatory_platform.airflow.airflow import on_failure_callback +from observatory_platform.url_utils import retry_get_url + +IRUS_FULCRUM_ENDPOINT_TEMPLATE = ( + "https://irus.jisc.ac.uk/api/v3/irus/reports/irus_ir/?platform=235" + "&requestor_id={requestor_id}&begin_date={start_date}&end_date={end_date}" +) + + +class IrusFulcrumRelease(PartitionRelease): + def __init__( + self, + dag_id: str, + run_id: str, + data_interval_start: pendulum.DateTime, + data_interval_end: pendulum.DateTime, + partition_date: pendulum.DateTime, + ): + """Create a IrusFulcrumRelease instance. 
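+ + Each release covers one calendar month of IRUS Fulcrum usage data; the partition date is the last day of that month and is used to partition the BigQuery table.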
+ + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param data_interval_start: The beginning of the data interval + :param data_interval_end: The end of the data interval + :param partition_date: The release/partition date + """ + super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) + self.data_interval_start = data_interval_start + self.data_interval_end = data_interval_end + self.download_totals_file_name = "fulcrum_totals.jsonl.gz" + self.download_country_file_name = "fulcrum_country.json.gz" + self.transform_file_name = "fulcrum.jsonl.gz" + + @property + def download_totals_path(self): + return os.path.join(self.download_folder, self.download_totals_file_name) + + @property + def download_country_path(self): + return os.path.join(self.download_folder, self.download_country_file_name) + + @property + def transform_path(self): + return os.path.join(self.transform_folder, self.transform_file_name) + + @property + def download_totals_blob_name(self): + return gcs_blob_name_from_path(self.download_totals_path) + + @property + def download_country_blob_name(self): + return gcs_blob_name_from_path(self.download_country_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return IrusFulcrumRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + data_interval_start=pendulum.parse(dict_["data_interval_start"]), + data_interval_end=pendulum.parse(dict_["data_interval_end"]), + partition_date=pendulum.parse(dict_["partition_date"]), + ) + + def to_dict(self): + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "data_interval_start": self.data_interval_start.to_date_string(), + "data_interval_end": self.data_interval_end.to_date_string(), + "partition_date": self.partition_date.to_date_string(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + publishers: List[str], + data_partner: Union[str, OaebuPartner] = "irus_fulcrum", + bq_dataset_description: str = "IRUS dataset", + bq_table_description: str = "Fulcrum metrics as recorded by the IRUS platform", + api_dataset_id: str = "fulcrum", + irus_oapen_api_conn_id: str = "irus_api", + catchup: bool = True, + schedule: str = "0 0 4 * *", # Run on the 4th of every month + start_date: pendulum.DateTime = pendulum.datetime(2022, 4, 1), # Earliest available data + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """The Fulcrum Telescope + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param publishers: The publishers pertaining to this DAG instance (as listed in Fulcrum) + :param data_partner: The name of the data partner + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the BigQuery table + :param api_dataset_id: The ID to store the dataset release in the API + :param irus_oapen_api_conn_id: Airflow connection ID for OAPEN IRUS UK (COUNTER 5) + :param catchup: Whether to catchup the DAG or not + :param schedule: The schedule interval of the DAG + :param start_date: The start date of the DAG + :param max_active_runs: The maximum number of active DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + + data_partner = partner_from_str(data_partner) + + @dag( + dag_id=dag_id, + schedule=schedule, + start_date=start_date, +
catchup=catchup, + tags=["oaebu"], + on_failure_callback=on_failure_callback, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def irus_fulcrum(): + @task + def make_release(**context) -> dict: + """Create an IrusFulcrumRelease instance + Dates are best explained with an example + Say the DAG is scheduled to run on 2022-04-07 + Interval_start will be 2022-03-01 + Interval_end will be 2022-04-01 + partition_date will be 2022-03-31 + """ + data_interval_start = context["data_interval_start"].start_of("month") + data_interval_end = context["data_interval_end"].start_of("month") + partition_date = data_interval_start.end_of("month") + return IrusFulcrumRelease( + dag_id, + context["run_id"], + data_interval_start=data_interval_start, + data_interval_end=data_interval_end, + partition_date=partition_date, + ).to_dict() + + @task + def download(release: dict, **context) -> None: + """Task to download the Fulcrum data from IRUS and upload to cloud storage + + :param release: the IrusFulcrumRelease instance. + """ + release = IrusFulcrumRelease.from_dict(release) + requestor_id = BaseHook.get_connection(irus_oapen_api_conn_id).login + totals_data, country_data = download_fulcrum_month_data(release.partition_date, requestor_id) + if not totals_data or not country_data: + raise RuntimeError(f"Data not available for supplied release month: {release.partition_date}") + save_jsonl_gz(release.download_totals_path, totals_data) + save_jsonl_gz(release.download_country_path, country_data) + + # Upload to GCS + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, + file_paths=[release.download_totals_path, release.download_country_path], + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task + def transform(release: dict, **context) -> None: + """Task to transform the Fulcrum data and upload to cloud storage""" + + release = IrusFulcrumRelease.from_dict(release) + # Download files + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_totals_blob_name, + file_path=release.download_totals_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_totals_blob_name}") + + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_country_blob_name, + file_path=release.download_country_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_country_blob_name}") + + logging.info(f"Transforming the Fulcrum dataset with the following publisher filter: {publishers}") + totals_data = load_jsonl(release.download_totals_path) + country_data = load_jsonl(release.download_country_path) + transformed_data = transform_fulcrum_data( + totals_data=totals_data, + country_data=country_data, + publishers=publishers, + ) + transformed_data = add_partition_date( + transformed_data, + partition_date=release.partition_date.end_of("month"), + partition_type=TimePartitioningType.MONTH, + partition_field="release_date", + ) + save_jsonl_gz(release.transform_path, transformed_data) + + # Upload to GCS + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, file_paths=[release.transform_path] + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task + def bq_load(release: dict, **context) -> None: + """Load the transformed data into BigQuery""" + release =
IrusFulcrumRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=data_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + + # Load each transformed release + uri = gcs_blob_uri(cloud_workspace.transform_bucket, release.transform_blob_name) + table_id = bq_table_id(cloud_workspace.project_id, data_partner.bq_dataset_id, data_partner.bq_table_name) + client = Client(project=cloud_workspace.project_id) + success = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=data_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + table_description=bq_table_description, + partition=True, + partition_type=TimePartitioningType.MONTH, + write_disposition=WriteDisposition.WRITE_APPEND, + partition_field="release_date", + ignore_unknown_values=True, + client=client, + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = IrusFulcrumRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + data_interval_start=release.data_interval_start, + data_interval_end=release.data_interval_end, + partition_date=release.partition_date, + ) + api.add_dataset_release(dataset_release) + + @task + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files and folders associated with this release.""" + release = IrusFulcrumRelease.from_dict(release) + cleanup(dag_id, execution_date=context["execution_date"], workflow_folder=release.workflow_folder) + + # Define DAG tasks + task_check = check_dependencies(airflow_conns=[irus_oapen_api_conn_id]) + xcom_release = make_release() + task_download = download(xcom_release) + task_transform = transform(xcom_release) + task_bq_load = bq_load(xcom_release) + task_add_release = add_new_dataset_releases(xcom_release) + task_cleanup_workflow = cleanup_workflow(xcom_release) + + ( + task_check + >> xcom_release + >> task_download + >> task_transform + >> task_bq_load + >> task_add_release + >> task_cleanup_workflow + ) + + return irus_fulcrum() + + +def download_fulcrum_month_data( + download_month: pendulum.DateTime, + requestor_id: str, + num_retries: int = 3, +) -> Tuple[List[dict], List[dict]]: + """Download Fulcrum data for the release month + + :param download_month: The month to download usage data from + :param requestor_id: The requestor ID - used to access the IRUS platform + :param num_retries: Number of attempts to make for the URL + """ + download_month = download_month.format("YYYY-MM") + base_url = IRUS_FULCRUM_ENDPOINT_TEMPLATE.format( + requestor_id=requestor_id, + start_date=download_month, + end_date=download_month, + ) + country_url = base_url + "&attributes_to_show=Country" + logging.info(f"Downloading Fulcrum metrics for month: {download_month}") + totals_data = retry_get_url(base_url, num_retries=num_retries).json() + country_data = retry_get_url(country_url, num_retries=num_retries).json() + totals_data = totals_data.get("Report_Items") + country_data = country_data.get("Report_Items") + + return totals_data, country_data + + +def transform_fulcrum_data( + totals_data:
List[dict], + country_data: List[dict], + publishers: List[str] = None, +) -> List[dict]: + """ + Transforms Fulcrum downloaded "totals" and "country" data. + + :param totals_data: Fulcrum usage data aggregated over all countries + :param country_data: Fulcrum usage data split by country + :param publishers: Fulcrum publishers to retain. If None, use all publishers + """ + # Extract only the publishers related to this organisation name + if publishers: + totals_data = [i for i in totals_data if i["Publisher"] in publishers] + country_data = [i for i in country_data if i["Publisher"] in publishers] + + # Total and Country-granulated results should all have the same item entries and be ordered the same, but we should check anyway + c_ids = [i["IRUS_Item_ID"] for i in country_data] + t_ids = [i["IRUS_Item_ID"] for i in totals_data] + assert len(c_ids) == len(t_ids), "Country entry data is not the same length as total entry data" + + # Mapping the IDs to list elements + c_id_mapping = {entry["IRUS_Item_ID"]: i for (i, entry) in enumerate(country_data)} + t_id_mapping = {entry["IRUS_Item_ID"]: i for (i, entry) in enumerate(totals_data)} + + transformed_data = [] + for t_id, c_id in zip(t_ids, c_ids): + transformed_row = {} + t_entry = totals_data[t_id_mapping[t_id]] + c_entry = country_data[c_id_mapping[c_id]] + + # Metrics with country granulation + country_metrics = [] + for c_metric in c_entry["Performance_Instances"]: # For each country + country_metrics.append( + { + "name": c_metric["Country"]["Country"], + "code": c_metric["Country"]["Country_Code"], + "Total_Item_Investigations": c_metric["Metric_Type_Counts"].get("Total_Item_Investigations"), + "Total_Item_Requests": c_metric["Metric_Type_Counts"].get("Total_Item_Requests"), + "Unique_Item_Investigations": c_metric["Metric_Type_Counts"].get("Unique_Item_Investigations"), + "Unique_Item_Requests": c_metric["Metric_Type_Counts"].get("Unique_Item_Requests"), + } + ) + + # Total Metrics + t_metric = t_entry["Performance_Instances"][0] + total_item_investigations = t_metric["Metric_Type_Counts"].get("Total_Item_Investigations") + total_item_requests = t_metric["Metric_Type_Counts"].get("Total_Item_Requests") + unique_item_investigations = t_metric["Metric_Type_Counts"].get("Unique_Item_Investigations") + unique_item_requests = t_metric["Metric_Type_Counts"].get("Unique_Item_Requests") + + # Row structure + transformed_row = { + "proprietary_id": t_id, # t_id == c_id + "ISBN": t_entry.get("ISBN"), + "book_title": t_entry.get("Item"), + "publisher": t_entry.get("Publisher"), + "authors": t_entry.get("Authors"), + "event_month": pendulum.parse(t_entry["Performance_Instances"][0]["Event_Month"]).format("YYYY-MM"), + "total_item_investigations": total_item_investigations, + "total_item_requests": total_item_requests, + "unique_item_investigations": unique_item_investigations, + "unique_item_requests": unique_item_requests, + "country": country_metrics, + } + transformed_data.append(transformed_row) + + return transformed_data diff --git a/oaebu_workflows/google_books_telescope/tests/__init__.py b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/__init__.py rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/__init__.py diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_author_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_author_irus_fulcrum.json similarity index 100% rename from 
oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_author_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_author_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_country_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_country_irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_country_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_country_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_subject_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_subject_irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_subject_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_metrics_subject_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metadata_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metadata_irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metadata_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metadata_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metrics_irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metrics_irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metrics_irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/book_product_metrics_irus_fulcrum.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/irus_fulcrum.json b/dags/oaebu_workflows/irus_fulcrum_telescope/schema/irus_fulcrum.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/irus_fulcrum.json rename to dags/oaebu_workflows/irus_fulcrum_telescope/schema/irus_fulcrum.json diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/__init__.py b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/__init__.py rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/__init__.py diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_body_irus_fulcrum.sql.jinja2 b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_body_irus_fulcrum.sql.jinja2 similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_body_irus_fulcrum.sql.jinja2 rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_body_irus_fulcrum.sql.jinja2 diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_join_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_join_irus_fulcrum.sql similarity index 100% rename from 
oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_join_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_join_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_null_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_null_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_null_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_null_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_struct_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_struct_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_struct_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_country_struct_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_metrics_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_body_irus_fulcrum.sql.jinja2 diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_functions_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_functions_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/book_product_functions_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/book_product_functions_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/month_metrics_sum_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/month_metrics_sum_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/month_metrics_sum_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/month_metrics_sum_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql b/dags/oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql rename to dags/oaebu_workflows/irus_fulcrum_telescope/sql/month_null_irus_fulcrum.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/__init__.py b/dags/oaebu_workflows/irus_oapen_telescope/__init__.py similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/__init__.py rename to dags/oaebu_workflows/irus_oapen_telescope/__init__.py diff --git a/dags/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py b/dags/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py new file mode 100644 index 00000000..872ee2a0 --- /dev/null +++ b/dags/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py @@ -0,0 +1,626 @@ +# Copyright 2020-2024 Curtin University +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Aniek Roelofs, Keegan Smith + +import gzip +import json +import logging +import os +import time +from typing import List, Optional, Tuple, Union + +import pendulum +import requests +from airflow.decorators import dag, task, task_group +from airflow.exceptions import AirflowException, AirflowSkipException +from airflow.hooks.base import BaseHook +from google.auth import environment_vars +from google.auth.transport.requests import AuthorizedSession +from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition, Client +from google.oauth2.service_account import IDTokenCredentials +from googleapiclient.discovery import Resource, build +from googleapiclient.errors import HttpError +from oauth2client.service_account import ServiceAccountCredentials + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import get_file_hash, save_jsonl_gz, add_partition_date +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.google.bigquery import bq_load_table, bq_table_id, bq_create_dataset +from observatory_platform.airflow.release import PartitionRelease, set_task_state +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback +from observatory_platform.google.gcs import ( + gcs_copy_blob, + gcs_create_bucket, + gcs_download_blob, + gcs_upload_file, + gcs_upload_files, + gcs_blob_uri, + gcs_blob_name_from_path, +) + +IRUS_FUNCTION_NAME = "oapen-access-stats" # Name of the google cloud function +IRUS_FUNCTION_REGION = "europe-west1" # Region of the google cloud function +IRUS_FUNCTION_SOURCE_URL = ( + "https://github.com/The-Academic-Observatory/oapen-irus-uk-cloud-function/releases/" + "download/v1.1.9/oapen-irus-uk-cloud-function.zip" +) # URL to the zipped source code of the cloud function +IRUS_FUNCTION_MD5_HASH = "946bb4d7ca229b15aba36ad7b5ed56d0" # MD5 hash of the zipped source code +IRUS_FUNCTION_BLOB_NAME = "cloud_function_source_code.zip" # blob name of zipped source code +IRUS_FUNCTION_TIMEOUT = 1500 # Timeout of cloud function in seconds. Maximum of 60 minutes, +# see https://cloud.google.com/functions/docs/2nd-gen/overview#enhanced_infrastructure + + +class IrusOapenRelease(PartitionRelease): + def __init__( + self, + dag_id: str, + run_id: str, + data_interval_start: pendulum.DateTime, + data_interval_end: pendulum.DateTime, + partition_date: pendulum.DateTime, + ): + """Create a IrusOapenRelease instance. 
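IrusOapenRelease instances are handed between TaskFlow tasks as plain dictionaries (see the to_dict and from_dict helpers further down), so the pendulum dates are serialised with isoformat() and re-parsed with pendulum.parse() on the way back in. A minimal sketch of that round trip, using hypothetical values rather than anything taken from this patch:

import pendulum

# Hypothetical payload, shaped like the output of IrusOapenRelease.to_dict()
release_dict = {
    "dag_id": "irus_oapen_example_press",  # hypothetical DAG ID
    "run_id": "scheduled__2023-02-04T00:00:00+00:00",  # hypothetical Airflow run ID
    "data_interval_start": "2023-01-01T00:00:00+00:00",
    "data_interval_end": "2023-02-01T00:00:00+00:00",
    "partition_date": "2023-01-31T00:00:00+00:00",
}

# from_dict() re-hydrates the ISO strings into pendulum.DateTime objects
partition_date = pendulum.parse(release_dict["partition_date"])
assert partition_date.format("YYYY-MM") == "2023-01"
# and to_dict() goes the other way via isoformat()
assert partition_date.isoformat() == "2023-01-31T00:00:00+00:00"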
+ + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param partition_date: The date of the partition/release + """ + super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) + self.data_interval_start = data_interval_start + self.data_interval_end = data_interval_end + self.download_file_name = "irus_oapen.jsonl.gz" + self.transform_file_name = "irus_oapen.jsonl.gz" + + @property + def download_path(self): + return os.path.join(self.download_folder, self.download_file_name) + + @property + def transform_path(self): + return os.path.join(self.transform_folder, self.transform_file_name) + + @property + def cloud_function_path(self): + return os.path.join(self.download_folder, "oapen_cloud_function.zip") + + @property + def download_blob_name(self): + return gcs_blob_name_from_path(self.download_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return IrusOapenRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + data_interval_start=pendulum.parse(dict_["data_interval_start"]), + data_interval_end=pendulum.parse(dict_["data_interval_end"]), + partition_date=pendulum.parse(dict_["partition_date"]), + ) + + def to_dict(self) -> dict: + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "data_interval_start": self.data_interval_start.isoformat(), + "data_interval_end": self.data_interval_end.isoformat(), + "partition_date": self.partition_date.isoformat(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + publisher_name_v4: str, + publisher_uuid_v5: str, + data_partner: Union[str, OaebuPartner] = "irus_oapen", + bq_dataset_description: str = "IRUS dataset", + bq_table_description: str = "OAPEN metrics as recorded by the IRUS platform", + gdpr_oapen_project_id: str = "oapen-usage-data-gdpr-proof", + gdpr_oapen_bucket_id: str = "oapen-usage-data-gdpr-proof_cloud-function", + api_dataset_id: str = "oapen", + max_cloud_function_instances: int = 0, + geoip_license_conn_id: str = "geoip_license_key", + irus_oapen_api_conn_id: str = "irus_api", + irus_oapen_login_conn_id: str = "irus_login", + catchup: bool = True, + start_date: pendulum.DateTime = pendulum.datetime(2015, 6, 1), + schedule: str = "0 0 4 * *", # Run on the 4th of every month + max_active_runs: int = 5, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """The OAPEN irus uk telescope. + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param publisher_name_v4: The publisher's name for version 4 + :param publisher_uuid_v5: The publisher's uuid for version 5 + :param data_partner: The data partner + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the biguery table + :param gdpr_oapen_project_id: The gdpr-proof oapen project id. 
+ :param gdpr_oapen_bucket_id: The gdpr-proof oapen bucket + :param api_dataset_id: The ID to store the dataset release in the API + :param max_cloud_function_instances: The maximum number of cloud function instances that may run at once + :param geoip_license_conn_id: The Airflow connection ID for the GEOIP license + :param irus_oapen_api_conn_id: The Airflow connection ID for IRUS API - for counter 5 + :param irus_oapen_login_conn_id: The Airflow connection ID for IRUS API (login) - for counter 4 + :param catchup: Whether to catchup the DAG or not + :param start_date: The start date of the DAG + :param schedule: The schedule interval of the DAG + :param max_active_runs: The maximum number of active DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + + data_partner = partner_from_str(data_partner) + + @dag( + dag_id=dag_id, + schedule=schedule, + start_date=start_date, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def irus_oapen(): + @task() + def fetch_releases(**context) -> List[dict]: + """Create a list of IrusOapenRelease instances for a given month. + Say the dag is scheduled to run on 2022-04-07 + Interval_start will be 2022-03-01 + Interval_end will be 2022-04-01 + partition_date will be 2022-03-31 + + :param context: the context passed from the PythonOperator. + See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed + :return: list of IrusOapenRelease instances + """ + + # Get release_date + data_interval_start = context["data_interval_start"].start_of("month") + data_interval_end = context["data_interval_end"].start_of("month") + partition_date = data_interval_start.end_of("month") + + logging.info(f"Release/partition date: {partition_date}") + releases = [ + IrusOapenRelease( + dag_id=dag_id, + run_id=context["run_id"], + data_interval_start=data_interval_start, + data_interval_end=data_interval_end, + partition_date=partition_date, + ) + ] + return [r.to_dict() for r in releases] + + @task() + def create_cloud_function_(releases: List[dict], **context): + """Task to create the cloud function for each release.""" + + release = IrusOapenRelease.from_dict(releases[0]) + # set up cloud function variables + location = f"projects/{gdpr_oapen_project_id}/locations/{IRUS_FUNCTION_REGION}" + full_name = f"{location}/functions/{IRUS_FUNCTION_NAME}" + + # zip source code and upload to bucket + success, upload = upload_source_code_to_bucket( + source_url=IRUS_FUNCTION_SOURCE_URL, + project_id=gdpr_oapen_project_id, + bucket_name=gdpr_oapen_bucket_id, + blob_name=IRUS_FUNCTION_BLOB_NAME, + cloud_function_path=release.cloud_function_path, + ) + set_task_state(success, context["ti"].task_id, release=release) + + # initialise cloud functions api + creds = ServiceAccountCredentials.from_json_keyfile_name(os.environ.get(environment_vars.CREDENTIALS)) + service = build( + "cloudfunctions", "v2beta", credentials=creds, cache_discovery=False, static_discovery=False + ) + + # update or create cloud function + exists = cloud_function_exists(service, full_name) + if not exists or upload is True: + update = True if exists else False + success, msg = create_cloud_function( + service=service, + location=location, + full_name=full_name, + source_bucket=gdpr_oapen_bucket_id, +
blob_name=IRUS_FUNCTION_BLOB_NAME, + max_active_runs=max_cloud_function_instances, + update=update, + ) + set_task_state(success, context["ti"].task_id, release=release) + logging.info(f"Creating or patching cloud function successful, response: {msg}") + else: + logging.info(f"Using existing cloud function, source code has not changed.") + + @task_group(group_id="process_release") + def process_release(data, **context): + @task() + def call_cloud_function_(release: dict, **context): + """Task to call the cloud function for each release.""" + + release = IrusOapenRelease.from_dict(release) + # set up cloud function variables + location = f"projects/{gdpr_oapen_project_id}/locations/{IRUS_FUNCTION_REGION}" + full_name = f"{location}/functions/{IRUS_FUNCTION_NAME}" + geoip_license_key = BaseHook.get_connection(geoip_license_conn_id).password + + # get the publisher_uuid or publisher_id, both are set to empty strings when publisher id is 'oapen' + if release.partition_date >= pendulum.datetime(2020, 4, 1): + airflow_conn = irus_oapen_api_conn_id + else: + airflow_conn = irus_oapen_login_conn_id + username = BaseHook.get_connection(airflow_conn).login + password = BaseHook.get_connection(airflow_conn).password + + # initialise cloud functions api + creds = ServiceAccountCredentials.from_json_keyfile_name(os.environ.get(environment_vars.CREDENTIALS)) + service = build( + "cloudfunctions", "v2beta", credentials=creds, cache_discovery=False, static_discovery=False + ) + + # Get cloud function uri + function_uri = cloud_function_exists(service, full_name) + + call_cloud_function( + function_uri=function_uri, + release_date=release.partition_date.format("YYYY-MM"), + username=username, + password=password, + geoip_license_key=geoip_license_key, + publisher_name_v4=publisher_name_v4, + publisher_uuid_v5=publisher_uuid_v5, + bucket_name=gdpr_oapen_bucket_id, + blob_name=release.download_blob_name, + ) + + @task() + def transfer(release: dict, **context): + """Task to transfer the file for each release. + + :param releases: the list of IrusOapenRelease instances. 
+ """ + + release = IrusOapenRelease.from_dict(release) + success = gcs_copy_blob( + blob_name=release.download_blob_name, + src_bucket=gdpr_oapen_bucket_id, + dst_bucket=cloud_workspace.download_bucket, + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def transform(release: dict, **context): + """Task to download the access stats to a local file for each release.""" + + release = IrusOapenRelease.from_dict(release) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_blob_name, + file_path=release.download_path, + ) + if not success: + raise FileNotFoundError(f"Could not find file: {release.download_blob_name}") + + # Read gzipped data and create list of dicts + with gzip.open(release.download_path, "r") as f: + results = [json.loads(line) for line in f] + + # Add partition date + results = add_partition_date( + results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date" + ) + + # Write list into gzipped JSON Lines file + save_jsonl_gz(release.transform_path, results) + + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, + file_paths=[release.transform_path], + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def bq_load(release: dict, **context) -> None: + """Loads the sales and traffic data into BigQuery""" + + release = IrusOapenRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=data_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + client = Client(project=cloud_workspace.project_id) + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) + table_id = bq_table_id( + cloud_workspace.project_id, data_partner.bq_dataset_id, data_partner.bq_table_name + ) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=data_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + partition_type=TimePartitioningType.MONTH, + partition=True, + partition_field="release_date", + write_disposition=WriteDisposition.WRITE_APPEND, + table_description=bq_table_description, + ignore_unknown_values=True, + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = IrusOapenRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + data_interval_start=release.data_interval_start, + data_interval_end=release.data_interval_end, + partition_date=release.partition_date, + ) + api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = IrusOapenRelease.from_dict(release) + cleanup( + dag_id=dag_id, + execution_date=context["execution_date"], + workflow_folder=release.workflow_folder, + ) + + ( + call_cloud_function_(data) + >> transfer(data) + >> transform(data) + >> bq_load(data) + >> add_new_dataset_releases(data) + >> 
cleanup_workflow(data) + ) + + # Define DAG tasks + task_check_dependencies = check_dependencies( + airflow_conns=[geoip_license_conn_id, irus_oapen_api_conn_id, irus_oapen_login_conn_id] + ) + xcom_release = fetch_releases() + cloud_function_task = create_cloud_function_(xcom_release, task_concurrency=1) + process_release_task_group = process_release.expand(data=xcom_release) + + (task_check_dependencies >> xcom_release >> cloud_function_task >> process_release_task_group) + + return irus_oapen() + + +def upload_source_code_to_bucket( + source_url: str, + project_id: str, + bucket_name: str, + blob_name: str, + cloud_function_path: str, + expected_md5_hash: str = IRUS_FUNCTION_MD5_HASH, +) -> Tuple[bool, bool]: + """Upload source code of cloud function to storage bucket + + :param source_url: The url to the zip file with source code + :param project_id: The project id with the bucket + :param bucket_name: The bucket name + :param blob_name: The blob name + :param cloud_function_path: The local path to the cloud function + :param expected_md5_hash: The expected md5 hash of the source code + :return: Whether task was successful and whether file was uploaded + """ + + # Get zip file with source code from github release + response = requests.get(source_url) + with open(cloud_function_path, "wb") as f: + f.write(response.content) + + # Check if current md5 hash matches expected md5 hash + actual_md5_hash = get_file_hash(file_path=cloud_function_path, algorithm="md5") + if actual_md5_hash != expected_md5_hash: + raise AirflowException(f"md5 hashes do not match, expected: {expected_md5_hash}, actual: {actual_md5_hash}") + + # Create storage bucket + gcs_create_bucket(bucket_name=bucket_name, location="EU", project_id=project_id, lifecycle_delete_age=1) + + # upload zip to cloud storage + success, upload = gcs_upload_file(bucket_name=bucket_name, blob_name=blob_name, file_path=cloud_function_path) + return success, upload + + +def cloud_function_exists(service: Resource, full_name: str) -> Optional[str]: + """Check if cloud function with a given name already exists + + :param service: Cloud function service + :param full_name: Name of the cloud function + :return: URI if cloud function exists, else None + """ + try: + response = service.projects().locations().functions().get(name=full_name).execute() + uri = response["serviceConfig"]["uri"] + except HttpError: + return None + return uri + + +def create_cloud_function( + service: Resource, + location: str, + full_name: str, + source_bucket: str, + blob_name: str, + max_active_runs: int, + update: bool, +) -> Tuple[bool, dict]: + """Create cloud function. 
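upload_source_code_to_bucket above pins the cloud function source to a known release by comparing the downloaded zip's md5 against IRUS_FUNCTION_MD5_HASH before uploading it. A standalone sketch of that integrity check using only the standard library; the observatory_platform get_file_hash helper is assumed to do something equivalent:

import hashlib

def md5_of_file(path: str, chunk_size: int = 8192) -> str:
    # Stream the file through hashlib so a large archive never sits fully in memory
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Hypothetical usage against the pinned hash of the downloaded zip:
# if md5_of_file("oapen_cloud_function.zip") != IRUS_FUNCTION_MD5_HASH:
#     raise ValueError("Downloaded source archive failed the md5 check")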
+ + :param service: Cloud function service + :param location: Location of the cloud function + :param full_name: Name of the cloud function + :param source_bucket: Name of bucket where the source code is stored + :param blob_name: Blob name of source code inside bucket + :param max_active_runs: The limit on the maximum number of function instances that may coexist at a given time + :param update: Whether a new function is created or an existing one is updated + :return: Status of the cloud function and error/success message + """ + body = { + "name": full_name, + "environment": "GEN_2", + "description": "Pulls oapen irus uk data and replaces ip addresses with city and country info.", + "buildConfig": { + "runtime": "python39", + "entryPoint": "download", + "source": {"storageSource": {"bucket": source_bucket, "object": blob_name}}, + }, + "serviceConfig": { + "timeoutSeconds": IRUS_FUNCTION_TIMEOUT, + "availableMemory": "4096M", + "maxInstanceCount": max_active_runs, + "allTrafficOnLatestRevision": True, + }, + } + if update: + update_mask = ",".join(body.keys()) + response = ( + service.projects() + .locations() + .functions() + .patch(name=full_name, updateMask=update_mask, body=body) + .execute() + ) + logging.info(f"Patching cloud function, response: {response}") + else: + response = ( + service.projects() + .locations() + .functions() + .create(parent=location, functionId=IRUS_FUNCTION_NAME, body=body) + .execute() + ) + logging.info(f"Creating cloud function, response: {response}") + + operation_name = response.get("name") + done = response.get("done") + while not done: + time.sleep(10) + response = service.projects().locations().operations().get(name=operation_name).execute() + done = response.get("done") + + error = response.get("error") + response = response.get("response") + if response: + msg = response + success = True + else: + msg = error + success = False + + return success, msg + + +def call_cloud_function( + function_uri: str, + release_date: str, + username: str, + password: str, + geoip_license_key: str, + publisher_name_v4: str, + publisher_uuid_v5: str, + bucket_name: str, + blob_name: str, +) -> None: + """Iteratively call cloud function, until it has finished processing all publishers. + When a publisher name/uuid is given, there is only 1 publisher, if it is empty the cloud function will process + all available publishers. In that case, when the data is downloaded from the new platform it can be done in 1 + iteration, however for the old platform two files have to be downloaded separately for each publisher, + this might take longer than the timeout time of the cloud function, so the process is split up in multiple calls. 
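The loop below relies on two fields in the cloud function's JSON response: entries (how many usage records were processed) and unprocessed_publishers (which publishers still need a call). A toy illustration of that contract, with made-up payloads standing in for real cloud function responses:

# Made-up responses shaped like the JSON the loop below expects
responses = iter(
    [
        {"entries": 120, "unprocessed_publishers": ["publisher_b", "publisher_c"]},
        {"entries": 80, "unprocessed_publishers": None},  # a falsy value ends the loop
    ]
)

payload = {"release_date": "2023-01"}  # trimmed-down request body
finished = False
while not finished:
    response_json = next(responses)  # stands in for authed_session.post(...).json()
    if response_json["unprocessed_publishers"]:
        # The next call only asks for the publishers that were not processed yet
        payload["unprocessed_publishers"] = response_json["unprocessed_publishers"]
    else:
        finished = True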
+ + :param function_uri: URI of the cloud function + :param release_date: The release date in YYYY-MM + :param username: Oapen username (email or requestor_id) + :param password: Oapen password (password or api_key) + :param geoip_license_key: License key of geoip database + :param publisher_name_v4: URL encoded name of the publisher (used for counter version 4) + :param publisher_uuid_v5: UUID of the publisher (used for counter version 5) + :param bucket_name: Name of the bucket to store oapen access stats data + :param blob_name: Blob name to store oapen access stats data + """ + creds = IDTokenCredentials.from_service_account_file( + os.environ.get(environment_vars.CREDENTIALS), target_audience=function_uri + ) + authed_session = AuthorizedSession(creds) + data = { + "release_date": release_date, + "username": username, + "password": password, + "geoip_license_key": geoip_license_key, + "publisher_name_v4": publisher_name_v4, + "publisher_uuid_v5": publisher_uuid_v5, + "bucket_name": bucket_name, + "blob_name": blob_name, + } + finished = False + while not finished: + response = authed_session.post( + function_uri, + data=json.dumps(data), + headers={"Content-Type": "application/json"}, + timeout=IRUS_FUNCTION_TIMEOUT, + ) + logging.info(f"Call cloud function response status code: {response.status_code}, reason: {response.reason}") + if response.status_code != 200: + raise AirflowException("Cloud function unsuccessful") + + response_json = response.json() + if response_json["unprocessed_publishers"]: + data["unprocessed_publishers"] = response_json["unprocessed_publishers"] + remaining_publishers = len(response_json["unprocessed_publishers"]) + else: + finished = True + remaining_publishers = 0 + + entries = response_json["entries"] + if entries == 0 and remaining_publishers == 0: + raise AirflowSkipException("No access stats entries for publisher(s) in month.") + + logging.info(f"Processed {entries} entries in total. 
{remaining_publishers} publishers " f"left to process") diff --git a/oaebu_workflows/irus_fulcrum_telescope/schema/__init__.py b/dags/oaebu_workflows/irus_oapen_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/schema/__init__.py rename to dags/oaebu_workflows/irus_oapen_telescope/schema/__init__.py diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_author_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_author_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_metrics_author_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_author_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_country_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_country_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_metrics_country_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_country_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_metrics_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_subject_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_subject_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_metrics_subject_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_metrics_subject_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_product_metadata_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_product_metadata_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_product_metadata_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_product_metadata_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/book_product_metrics_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/book_product_metrics_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/book_product_metrics_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/book_product_metrics_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/export_metrics_city_irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/export_metrics_city_irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/export_metrics_city_irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/export_metrics_city_irus_oapen.json diff --git a/oaebu_workflows/irus_oapen_telescope/schema/irus_oapen.json b/dags/oaebu_workflows/irus_oapen_telescope/schema/irus_oapen.json similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/irus_oapen.json rename to dags/oaebu_workflows/irus_oapen_telescope/schema/irus_oapen.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/sql/__init__.py b/dags/oaebu_workflows/irus_oapen_telescope/sql/__init__.py similarity index 100% rename from 
oaebu_workflows/irus_fulcrum_telescope/sql/__init__.py rename to dags/oaebu_workflows/irus_oapen_telescope/sql/__init__.py diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_body_irus_oapen.sql.jinja2 b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_body_irus_oapen.sql.jinja2 similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_body_irus_oapen.sql.jinja2 rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_body_irus_oapen.sql.jinja2 diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_join_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_join_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_join_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_join_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_null_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_null_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_null_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_null_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_struct_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_struct_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_struct_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_country_struct_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_metrics_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_metrics_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_product_body_irus_oapen.sql.jinja2 diff --git a/oaebu_workflows/irus_oapen_telescope/sql/book_product_functions_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/book_product_functions_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/book_product_functions_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/book_product_functions_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/month_metrics_sum_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/month_metrics_sum_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/month_metrics_sum_irus_oapen.sql rename to dags/oaebu_workflows/irus_oapen_telescope/sql/month_metrics_sum_irus_oapen.sql diff --git a/oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql b/dags/oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql rename to 
dags/oaebu_workflows/irus_oapen_telescope/sql/month_null_irus_oapen.sql diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/__init__.py b/dags/oaebu_workflows/jstor_telescope/__init__.py similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/__init__.py rename to dags/oaebu_workflows/jstor_telescope/__init__.py diff --git a/oaebu_workflows/jstor_telescope/jstor_telescope.py b/dags/oaebu_workflows/jstor_telescope/jstor_telescope.py similarity index 58% rename from oaebu_workflows/jstor_telescope/jstor_telescope.py rename to dags/oaebu_workflows/jstor_telescope/jstor_telescope.py index 49bb8d64..878f0353 100644 --- a/oaebu_workflows/jstor_telescope/jstor_telescope.py +++ b/dags/oaebu_workflows/jstor_telescope/jstor_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,31 +29,39 @@ import pendulum import requests -from airflow.exceptions import AirflowException +from airflow.exceptions import AirflowException, AirflowSkipException from airflow.hooks.base import BaseHook -from airflow.models.taskinstance import TaskInstance +from airflow.decorators import dag, task, task_group from bs4 import BeautifulSoup, SoupStrainer -from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition +from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition, Client from google.oauth2.credentials import Credentials from googleapiclient.discovery import Resource, build from tenacity import retry, stop_after_attempt, wait_exponential, wait_fixed +from oaebu_workflows.config import oaebu_user_agent_header from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import save_jsonl_gz -from observatory.platform.utils.url_utils import get_user_agent, retry_get_url -from observatory.platform.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path -from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.files import add_partition_date, convert -from observatory.platform.workflows.workflow import ( - Workflow, - PartitionRelease, - cleanup, - set_task_state, - check_workflow_inputs, +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import save_jsonl_gz +from observatory_platform.url_utils import retry_get_url +from observatory_platform.google.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path, gcs_download_blob +from observatory_platform.google.bigquery import bq_load_table, bq_table_id, bq_create_dataset +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.files import add_partition_date, convert +from observatory_platform.airflow.release import PartitionRelease, set_task_state +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback + + +JSTOR_PROCESSED_LABEL_NAME = "processed_report" + +# download settings +JSTOR_MAX_ATTEMPTS = 3 +JSTOR_FIXED_WAIT = 20 # seconds 
+JSTOR_MAX_WAIT_TIME = 60 * 10 # seconds +JSTOR_EXP_BASE = 3 +JSTOR_MULTIPLIER = 10 +JSTOR_WAIT_FN = wait_fixed(JSTOR_FIXED_WAIT) + wait_exponential( + multiplier=JSTOR_MULTIPLIER, exp_base=JSTOR_EXP_BASE, max=JSTOR_MAX_WAIT_TIME ) @@ -84,277 +92,323 @@ def __init__( self.reports = reports self.data_interval_start = data_interval_start self.data_interval_end = data_interval_end - self.download_country_path = os.path.join(self.download_folder, "country.tsv") - self.download_institution_path = os.path.join(self.download_folder, "institution.tsv") - self.transform_country_path = os.path.join(self.transform_folder, "country.jsonl.gz") - self.transform_institution_path = os.path.join(self.transform_folder, "institution.jsonl.gz") - - -class JstorTelescope(Workflow): - """The JSTOR telescope.""" - - REPORTS_INFO = "reports" - PROCESSED_LABEL_NAME = "processed_report" - - # download settings - MAX_ATTEMPTS = 3 - FIXED_WAIT = 20 # seconds - MAX_WAIT_TIME = 60 * 10 # seconds - EXP_BASE = 3 - MULTIPLIER = 10 - WAIT_FN = wait_fixed(FIXED_WAIT) + wait_exponential(multiplier=MULTIPLIER, exp_base=EXP_BASE, max=MAX_WAIT_TIME) - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - entity_id: str, - entity_type: Literal["publisher", "collection"] = "publisher", - country_partner: Union[str, OaebuPartner] = "jstor_country", - institution_partner: Union[str, OaebuPartner] = "jstor_institution", - bq_dataset_description: str = "Data from JSTOR sources", - bq_country_table_description: Optional[str] = None, - bq_institution_table_description: Optional[str] = None, - api_dataset_id: str = "jstor", - gmail_api_conn_id: str = "gmail_api", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - catchup: bool = False, - max_active_runs: int = 1, - schedule: str = "0 0 4 * *", # 4th day of every month - start_date: pendulum.DateTime = pendulum.datetime(2016, 10, 1), - ): - """Construct a JstorTelescope instance. 
- :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param entity_id: The ID of the publisher for this DAG - :param entity_type: Whether this entity should be treated as a publisher or a collection - :param country_partner: The name of the country partner - :param institution_partner: The name of the institution partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_country_table_description: Description for the BigQuery JSTOR country table - :param bq_institution_table_description: Description for the BigQuery JSTOR institution table - :param api_dataset_id: The ID to store the dataset release in the API - :param gmail_api_conn_id: Airflow connection ID for the Gmail API - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param catchup: Whether to catchup the DAG or not - :param max_active_runs: The maximum number of DAG runs that can be run concurrently - :param schedule: The schedule interval of the DAG - :param start_date: The start date of the DAG - """ - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - airflow_conns=[gmail_api_conn_id, observatory_api_conn_id], - max_active_runs=max_active_runs, - tags=["oaebu"], - ) - - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.entity_id = entity_id - self.entity_type = entity_type - self.country_partner = partner_from_str(country_partner) - self.institution_partner = partner_from_str(institution_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_country_table_description = bq_country_table_description - self.bq_institution_table_description = bq_institution_table_description - self.api_dataset_id = api_dataset_id - self.gmail_api_conn_id = gmail_api_conn_id - self.observatory_api_conn_id = observatory_api_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_setup_task(self.list_reports) - self.add_setup_task(self.download_reports) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> List[JstorRelease]: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. - - :param kwargs: the context passed from the PythonOperator. 
- See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: A list of grid release instances - """ - - ti: TaskInstance = kwargs["ti"] - available_releases = ti.xcom_pull( - key=JstorTelescope.RELEASE_INFO, task_ids=self.download_reports.__name__, include_prior_dates=False + self.download_country_file_name = "country.tsv" + self.download_institution_file_name = "institution.tsv" + self.transform_country_file_name = "country.jsonl.gz" + self.transform_institution_file_name = "institution.jsonl.gz" + + @property + def download_country_path(self): + return os.path.join(self.download_folder, self.download_country_file_name) + + @property + def download_institution_path(self): + return os.path.join(self.download_folder, self.download_institution_file_name) + + @property + def transform_country_path(self): + return os.path.join(self.transform_folder, self.transform_country_file_name) + + @property + def transform_institution_path(self): + return os.path.join(self.transform_folder, self.transform_institution_file_name) + + @property + def download_country_blob_name(self): + return gcs_blob_name_from_path(self.download_country_path) + + @property + def download_institution_blob_name(self): + return gcs_blob_name_from_path(self.download_institution_path) + + @property + def transform_country_blob_name(self): + return gcs_blob_name_from_path(self.transform_country_path) + + @property + def transform_institution_blob_name(self): + return gcs_blob_name_from_path(self.transform_institution_path) + + @staticmethod + def from_dict(dict_: dict): + return JstorRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + data_interval_start=pendulum.parse(dict_["data_interval_start"]), + data_interval_end=pendulum.parse(dict_["data_interval_end"]), + partition_date=pendulum.parse(dict_["partition_date"]), + reports=dict_["reports"], ) - releases = [] - for release_date in available_releases: - reports = available_releases[release_date] - partition_date = pendulum.parse(release_date) - data_interval_start = partition_date.start_of("month") - data_interval_end = partition_date.add(days=1).start_of("month") - releases.append( - JstorRelease( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - partition_date=partition_date, - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - reports=reports, - ) - ) - return releases - def list_reports(self, **kwargs) -> bool: - """Lists all Jstor releases for a given month and publishes their report_type, download_url and - release_date's as an XCom. + def to_dict(self): + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "data_interval_start": self.data_interval_start.to_date_string(), + "data_interval_end": self.data_interval_end.to_date_string(), + "partition_date": self.partition_date.to_date_string(), + "reports": self.reports, + } - :return: Whether to continue the DAG - """ - api = make_jstor_api(self.entity_type, self.entity_id) - available_reports = api.list_reports() - continue_dag = len(available_reports) > 0 - if continue_dag: - # Push messages - ti: TaskInstance = kwargs["ti"] - ti.xcom_push(JstorTelescope.REPORTS_INFO, available_reports) - - return continue_dag - - def download_reports(self, **kwargs) -> bool: - """Download the JSTOR reports based on the list with available reports. - The release date for each report is only known after downloading the report. 
Therefore they are first - downloaded to a temporary location, afterwards the release info can be pushed as an xcom and the report is - moved to the correct location. - - :return: Whether to continue the DAG (always True) - """ - ti: TaskInstance = kwargs["ti"] - available_reports = ti.xcom_pull( - key=JstorTelescope.REPORTS_INFO, task_ids=self.list_reports.__name__, include_prior_dates=False - ) - available_releases = {} - api = make_jstor_api(self.entity_type, self.entity_id) - for report in available_reports: - # Download report to temporary file - tmp_download_path = NamedTemporaryFile().name - api.download_report(report, download_path=tmp_download_path) - start_date, end_date = api.get_release_date(tmp_download_path) - - # Create temporary release and move report to correct path - release = JstorRelease( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - data_interval_start=start_date, - data_interval_end=end_date.add(days=1).start_of("month"), - partition_date=end_date, - reports=[report], - ) - download_path = ( - release.download_country_path if report["type"] == "country" else release.download_institution_path - ) - shutil.move(tmp_download_path, download_path) - # Add reports to list with available releases - release_date = release.partition_date.format("YYYYMMDD") - try: - available_releases[release_date].append(report) - except KeyError: - available_releases[release_date] = [report] +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + entity_id: str, + entity_type: Literal["publisher", "collection"] = "publisher", + country_partner: Union[str, OaebuPartner] = "jstor_country", + institution_partner: Union[str, OaebuPartner] = "jstor_institution", + bq_dataset_description: str = "Data from JSTOR sources", + bq_country_table_description: Optional[str] = None, + bq_institution_table_description: Optional[str] = None, + api_dataset_id: str = "jstor", + gmail_api_conn_id: str = "gmail_api", + catchup: bool = False, + schedule: str = "0 0 4 * *", # 4th day of every month + start_date: pendulum.DateTime = pendulum.datetime(2016, 10, 1), + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct a Jstor DAG. 
+ :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param entity_id: The ID of the publisher for this DAG + :param entity_type: Whether this entity should be treated as a publisher or a collection + :param country_partner: The name of the country partner + :param institution_partner: The name of the institution partner + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_country_table_description: Description for the BigQuery JSTOR country table + :param bq_institution_table_description: Description for the BigQuery JSTOR institution table + :param api_dataset_id: The ID to store the dataset release in the API + :param gmail_api_conn_id: Airflow connection ID for the Gmail API + :param catchup: Whether to catchup the DAG or not + :param max_active_runs: The maximum number of DAG runs that can be run concurrently + :param schedule: The schedule interval of the DAG + :param start_date: The start date of the DAG + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ - ti.xcom_push(JstorTelescope.RELEASE_INFO, available_releases) - return True + country_partner = partner_from_str(country_partner) + institution_partner = partner_from_str(institution_partner) + + @dag( + dag_id=dag_id, + schedule=schedule, + start_date=start_date, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def jstor(): + @task + def list_reports(**context) -> List[dict]: + """Lists all Jstor releases for a given month and publishes their report_type, download_url and + release_date's as an XCom. + + :raises AirflowSkipException: Raised if there are no available reports + :return: A list of available reports + """ + + # Get the reports from GMAIL API + api = make_jstor_api(entity_type, entity_id) + available_reports = api.list_reports() + if not len(available_reports) > 0: + raise AirflowSkipException("No reports available. Skipping downstream DAG tasks.") + return available_reports + + @task + def download(available_reports: List[dict], **context) -> List[dict]: + """Downloads the reports from the GMAIL API. A release is created for each unique release date. 
+ Upoads the reports to GCS + + :returns: List of unique releases + """ + + # Download reports and determine dates + available_releases = {} + api = make_jstor_api(entity_type, entity_id) + for report in available_reports: + # Download report to temporary file + tmp_download_path = NamedTemporaryFile().name + api.download_report(report, download_path=tmp_download_path) + start_date, end_date = api.get_release_date(tmp_download_path) + + # Create temporary release and move report to correct path + release = JstorRelease( + dag_id=dag_id, + run_id=context["run_id"], + data_interval_start=start_date, + data_interval_end=end_date.add(days=1).start_of("month"), + partition_date=end_date, + reports=[report], + ) + download_path = ( + release.download_country_path if report["type"] == "country" else release.download_institution_path + ) + shutil.move(tmp_download_path, download_path) - def upload_downloaded(self, releases: List[JstorRelease], **kwargs) -> None: - """Uploads the downloaded files to GCS for each release + # Add reports to list with available releases + release_date = release.partition_date.format("YYYYMMDD") + try: + available_releases[release_date].append(report) + except KeyError: + available_releases[release_date] = [report] + + # Generate the release for each release date + releases = [] + for release_date, reports in available_releases.items(): + partition_date = pendulum.parse(release_date) + data_interval_start = partition_date.start_of("month") + data_interval_end = partition_date.add(days=1).start_of("month") + releases.append( + JstorRelease( + dag_id=dag_id, + run_id=context["run_id"], + partition_date=partition_date, + data_interval_start=data_interval_start, + data_interval_end=data_interval_end, + reports=reports, + ) + ) - :param releases: List of JstorRelease instances: - """ - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, - file_paths=[release.download_country_path, release.download_institution_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, releases: List[JstorRelease], **kwargs): - """Task to transform the Jstor releases for a given month.""" - api = make_jstor_api(self.entity_type, self.entity_id) - for release in releases: - api.transform_reports( - download_country=release.download_country_path, - download_institution=release.download_institution_path, - transform_country=release.transform_country_path, - transform_institution=release.transform_institution_path, - partition_date=release.partition_date, - ) + # Upload to GCS + for release in releases: + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, + file_paths=[release.download_institution_path, release.download_country_path], + ) + set_task_state(success, context["ti"].task_id, release=release) + + return [r.to_dict() for r in releases] + + @task_group(group_id="process_release") + def process_release(data): + @task + def transform(release: dict, **context) -> None: + """Task to transform the Jstor releases for a given month.""" + + release = JstorRelease.from_dict(release) + api = make_jstor_api(entity_type, entity_id) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_country_blob_name, + file_path=release.download_country_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_country_blob_name}") + success = gcs_download_blob( + 
bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_institution_blob_name, + file_path=release.download_institution_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_institution_blob_name}") + + api.transform_reports( + download_country=release.download_country_path, + download_institution=release.download_institution_path, + transform_country=release.transform_country_path, + transform_institution=release.transform_institution_path, + partition_date=release.partition_date, + ) - def upload_transformed(self, releases: List[JstorRelease], **kwargs) -> None: - """Uploads the transformed files to GCS for each release""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, - file_paths=[release.transform_country_path, release.transform_institution_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, releases: List[JstorRelease], **kwargs) -> None: - """Loads the sales and traffic data into BigQuery""" - - for release in releases: - for partner, table_description, file_path in [ - (self.country_partner, self.bq_country_table_description, release.transform_country_path), - (self.institution_partner, self.bq_institution_table_description, release.transform_institution_path), - ]: - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, + file_paths=[release.transform_country_path, release.transform_institution_path], ) - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(file_path)) - table_id = bq_table_id(self.cloud_workspace.project_id, partner.bq_dataset_id, partner.bq_table_name) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - partition_type=TimePartitioningType.MONTH, - partition=True, - partition_field="release_date", - write_disposition=WriteDisposition.WRITE_APPEND, - table_description=table_description, - ignore_unknown_values=True, + set_task_state(success, context["ti"].task_id, release=release) + + @task + def bq_load(release: dict, **context) -> None: + """Loads the sales and traffic data into BigQuery""" + + release = JstorRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + for partner, table_description, file_path in [ + (country_partner, bq_country_table_description, release.transform_country_path), + ( + institution_partner, + bq_institution_table_description, + release.transform_institution_path, + ), + ]: + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(file_path)) + table_id = bq_table_id(cloud_workspace.project_id, partner.bq_dataset_id, partner.bq_table_name) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + partition_type=TimePartitioningType.MONTH, + partition=True, + partition_field="release_date", + write_disposition=WriteDisposition.WRITE_APPEND, + table_description=table_description, + ignore_unknown_values=True, + 
client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = JstorRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + data_interval_start=release.data_interval_start, + data_interval_end=release.data_interval_end, + partition_date=release.partition_date, ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, releases: List[JstorRelease], **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - for release in releases: - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - data_interval_start=release.data_interval_start, - data_interval_end=release.data_interval_end, - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) + api.add_dataset_release(dataset_release) + + @task + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release. + Assign a label to the gmail messages that have been processed.""" + + release = JstorRelease.from_dict(release) + api = make_jstor_api(entity_type, entity_id) + cleanup( + dag_id=dag_id, + execution_date=context["execution_date"], + workflow_folder=release.workflow_folder, + ) + success = api.add_labels(release.reports) + set_task_state(success, context["ti"].task_id, release=release) - def cleanup(self, releases: List[JstorRelease], **kwargs) -> None: - """Delete all files, folders and XComs associated with this release. 
- Assign a label to the gmail messages that have been processed.""" + transform(data) >> bq_load(data) >> add_new_dataset_releases(data) >> cleanup_workflow(data) - api = make_jstor_api(self.entity_type, self.entity_id) - for release in releases: - cleanup( - dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder - ) - success = api.add_labels(release.reports) - set_task_state(success, kwargs["ti"].task_id, release=release) + # Define DAG tasks + task_check_dependencies = check_dependencies(airflow_conns=[gmail_api_conn_id], start_date=start_date) + xcom_reports = list_reports() + xcom_releases = download(xcom_reports) + process_release_task_group = process_release.expand(data=xcom_releases) + + (task_check_dependencies >> xcom_reports >> xcom_releases >> process_release_task_group) + + return jstor() def create_gmail_service() -> Resource: @@ -480,7 +534,7 @@ def list_reports(self) -> List[dict]: # List messages with specific query list_params = { "userId": "me", - "q": f'-label:{JstorTelescope.PROCESSED_LABEL_NAME} subject:"JSTOR Publisher Report Available" from:no-reply@ithaka.org', + "q": f'-label:{JSTOR_PROCESSED_LABEL_NAME} subject:"JSTOR Publisher Report Available" from:no-reply@ithaka.org', "labelIds": ["INBOX"], "maxResults": 500, } @@ -501,7 +555,7 @@ def list_reports(self) -> List[dict]: f"Can't find download link for report in e-mail, message snippet: {message.snippet}" ) - # Get filename and extension from head + # Get filename and extension from header filename, extension = self.get_header_info(download_url) # Get publisher @@ -547,15 +601,14 @@ def download_report(self, report: dict, download_path: str) -> None: raise KeyError(f"'url' not found in report: {report}") logging.info(f"Downloading report: {url} to: {download_path}") - headers = {"User-Agent": get_user_agent(package_name="oaebu_workflows")} response = retry_get_url( - url, headers=headers, wait=JstorTelescope.WAIT_FN, num_retries=JstorTelescope.MAX_ATTEMPTS + url, headers=oaebu_user_agent_header(), wait=JSTOR_WAIT_FN, num_retries=JSTOR_MAX_ATTEMPTS ) content = response.content.decode("utf-8") with open(download_path, "w") as f: f.write(content) - @retry(stop=stop_after_attempt(JstorTelescope.MAX_ATTEMPTS), reraise=True, wait=JstorTelescope.WAIT_FN) + @retry(stop=stop_after_attempt(JSTOR_MAX_ATTEMPTS), reraise=True, wait=JSTOR_WAIT_FN) def get_header_info(self, url: str) -> Tuple[str, str]: """Get header info from url and parse for filename and extension of file. 
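The jstor() DAG above fans its per-release work out with Airflow dynamic task mapping: list_reports() and download() publish lists through XCom, and process_release.expand(data=xcom_releases) creates one mapped instance of the task group per release. Below is a minimal, self-contained sketch of that pattern (it relies on Airflow 2.5+ mapped task groups); the DAG id, dates and task bodies are illustrative assumptions and are not taken from this patch.

    import pendulum
    from airflow.decorators import dag, task, task_group


    @dag(dag_id="expand_example", start_date=pendulum.datetime(2024, 1, 1), schedule=None, catchup=False)
    def expand_example():
        @task
        def list_releases() -> list:
            # Stand-in for list_reports()/download(): one XCom-serialisable dict per release
            return [{"release_id": "2024-01"}, {"release_id": "2024-02"}]

        @task_group(group_id="process_release")
        def process_release(data: dict):
            @task
            def transform(release: dict):
                print(f"transforming {release['release_id']}")

            @task
            def load(release: dict):
                print(f"loading {release['release_id']}")

            transform(data) >> load(data)

        # One mapped process_release group is created per element of the upstream XCom list
        process_release.expand(data=list_releases())


    expand_example()

Each element of the upstream list becomes its own group of task instances at run time, which is what lets a single DAG run handle a variable number of JSTOR reports.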
@@ -567,8 +620,7 @@ def get_header_info(self, url: str) -> Tuple[str, str]: f'attempt: {self.get_header_info.retry.statistics["attempt_number"]}, ' f'idle for: {self.get_header_info.retry.statistics["idle_for"]}' ) - headers = {"User-Agent": get_user_agent(package_name="oaebu_workflows")} - response = requests.head(url, allow_redirects=True, headers=headers) + response = requests.head(url, allow_redirects=True, headers=oaebu_user_agent_header()) if response.status_code != 200: raise AirflowException( f"Could not get HEAD of report download url, reason: {response.reason}, " @@ -697,15 +749,13 @@ def add_labels(self, reports: List[dict]) -> bool: :param reports: List of report info :return: True if successful, False otherwise """ - label_id = self.get_label_id(JstorTelescope.PROCESSED_LABEL_NAME) + label_id = self.get_label_id(JSTOR_PROCESSED_LABEL_NAME) body = {"addLabelIds": [label_id]} for message in [report["id"] for report in reports]: response = self.service.users().messages().modify(userId="me", id=message, body=body).execute() try: message_id = response["id"] - logging.info( - f"Added label '{JstorTelescope.PROCESSED_LABEL_NAME}' to GMAIL message, message_id: {message_id}" - ) + logging.info(f"Added label '{JSTOR_PROCESSED_LABEL_NAME}' to GMAIL message, message_id: {message_id}") except KeyError: return False return True @@ -724,13 +774,13 @@ def list_reports(self) -> List[dict]: # List messages with specific query list_params = { "userId": "me", - "q": f"-label:{JstorTelescope.PROCESSED_LABEL_NAME} from:grp_ithaka_data_intelligence@ithaka.org", + "q": f"-label:{JSTOR_PROCESSED_LABEL_NAME} from:grp_ithaka_data_intelligence@ithaka.org", "labelIds": ["INBOX"], "maxResults": 500, } available_reports = [] - country_regex = rf"^{self.entity_id}_(Open_Country|Country_Open)_Usage\.csv$" - institution_regex = rf"^{self.entity_id}_(Open_Institution|Institution_Open)_Usage\.csv$" + country_regex = rf"^{self.entity_id}_Open_Country_Usage\.csv$" + institution_regex = rf"^{self.entity_id}_Open_Institution_Usage\.csv$" for message_info in self.get_messages(list_params): message_id = message_info["id"] message = self.service.users().messages().get(userId="me", id=message_id).execute() @@ -826,10 +876,7 @@ def transform_reports( row["Publisher"] = row.pop("publisher") row["Book_ID"] = row.pop("item_doi") row["Usage_Month"] = partition_date.strftime("%Y-%m") - try: - row[entity] = row.pop("\ufeffentity_name") - except KeyError: - row[entity] = row.pop("entity_name") + row[entity] = row.pop("\ufeffentity_name") row["Book_Title"] = row.pop("book_title") row["Authors"] = row.pop("authors") row["ISBN"] = row.pop("eisbn") @@ -849,15 +896,13 @@ def add_labels(self, reports: List[dict]) -> bool: :param reports: List of report info :return: True if successful, False otherwise """ - label_id = self.get_label_id(JstorTelescope.PROCESSED_LABEL_NAME) + label_id = self.get_label_id(JSTOR_PROCESSED_LABEL_NAME) body = {"addLabelIds": [label_id]} message = reports[0]["id"] # Only one message for collections response = self.service.users().messages().modify(userId="me", id=message, body=body).execute() try: message_id = response["id"] - logging.info( - f"Added label '{JstorTelescope.PROCESSED_LABEL_NAME}' to GMAIL message, message_id: {message_id}" - ) + logging.info(f"Added label '{JSTOR_PROCESSED_LABEL_NAME}' to GMAIL message, message_id: {message_id}") except KeyError: return False return True diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/__init__.py 
b/dags/oaebu_workflows/jstor_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/__init__.py rename to dags/oaebu_workflows/jstor_telescope/schema/__init__.py diff --git a/oaebu_workflows/jstor_telescope/schema/book_metrics_author_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_metrics_author_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_metrics_author_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_metrics_author_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_metrics_country_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_metrics_country_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_metrics_country_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_metrics_country_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_metrics_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_metrics_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_metrics_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_metrics_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_metrics_subject_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_metrics_subject_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_metrics_subject_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_metrics_subject_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_institution.json b/dags/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_institution.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_institution.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_product_metadata_jstor_institution.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_country.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_institution.json b/dags/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_institution.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_institution.json rename to dags/oaebu_workflows/jstor_telescope/schema/book_product_metrics_jstor_institution.json diff --git a/oaebu_workflows/jstor_telescope/schema/jstor_country.json b/dags/oaebu_workflows/jstor_telescope/schema/jstor_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/jstor_country.json rename to 
dags/oaebu_workflows/jstor_telescope/schema/jstor_country.json diff --git a/oaebu_workflows/jstor_telescope/schema/jstor_country_collection.json b/dags/oaebu_workflows/jstor_telescope/schema/jstor_country_collection.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/jstor_country_collection.json rename to dags/oaebu_workflows/jstor_telescope/schema/jstor_country_collection.json diff --git a/oaebu_workflows/jstor_telescope/schema/jstor_institution.json b/dags/oaebu_workflows/jstor_telescope/schema/jstor_institution.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/jstor_institution.json rename to dags/oaebu_workflows/jstor_telescope/schema/jstor_institution.json diff --git a/oaebu_workflows/jstor_telescope/schema/jstor_institution_collection.json b/dags/oaebu_workflows/jstor_telescope/schema/jstor_institution_collection.json similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/jstor_institution_collection.json rename to dags/oaebu_workflows/jstor_telescope/schema/jstor_institution_collection.json diff --git a/oaebu_workflows/irus_oapen_telescope/__init__.py b/dags/oaebu_workflows/jstor_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/__init__.py rename to dags/oaebu_workflows/jstor_telescope/sql/__init__.py diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_author_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_author_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_metrics_author_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_author_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_country_body_jstor_country.sql.jinja2 b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_body_jstor_country.sql.jinja2 similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_metrics_country_body_jstor_country.sql.jinja2 rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_body_jstor_country.sql.jinja2 diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_country_join_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_join_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_metrics_country_join_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_join_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_country_null_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_null_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_metrics_country_null_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_null_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_country_struct_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_struct_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_metrics_country_struct_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_country_struct_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_metrics_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_metrics_jstor_country.sql similarity index 100% rename from 
oaebu_workflows/jstor_telescope/sql/book_metrics_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_metrics_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 b/dags/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 rename to dags/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_country.sql.jinja2 diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 b/dags/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 rename to dags/oaebu_workflows/jstor_telescope/sql/book_product_body_jstor_institution.sql.jinja2 diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_institution.sql b/dags/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_institution.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_institution.sql rename to dags/oaebu_workflows/jstor_telescope/sql/book_product_functions_jstor_institution.sql diff --git a/oaebu_workflows/jstor_telescope/sql/month_metrics_sum_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/month_metrics_sum_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/month_metrics_sum_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/month_metrics_sum_jstor_country.sql diff --git a/oaebu_workflows/jstor_telescope/sql/month_null_jstor_country.sql b/dags/oaebu_workflows/jstor_telescope/sql/month_null_jstor_country.sql similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/month_null_jstor_country.sql rename to dags/oaebu_workflows/jstor_telescope/sql/month_null_jstor_country.sql diff --git a/oaebu_workflows/oaebu_partners.py b/dags/oaebu_workflows/oaebu_partners.py similarity index 99% rename from oaebu_workflows/oaebu_partners.py rename to dags/oaebu_workflows/oaebu_partners.py index cac87ecf..a876d27d 100644 --- a/oaebu_workflows/oaebu_partners.py +++ b/dags/oaebu_workflows/oaebu_partners.py @@ -387,7 +387,7 @@ def partner_from_str(partner: Union[str, OaebuPartner], metadata_partner: bool = :return: The OaebuPartner """ - if isinstance(partner, OaebuPartner): + if not isinstance(partner, str): return partner partners_dict = OAEBU_METADATA_PARTNERS if metadata_partner else OAEBU_DATA_PARTNERS diff --git a/oaebu_workflows/irus_oapen_telescope/schema/__init__.py b/dags/oaebu_workflows/oapen_metadata_telescope/__init__.py similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/schema/__init__.py rename to dags/oaebu_workflows/oapen_metadata_telescope/__init__.py diff --git a/dags/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py b/dags/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py new file mode 100644 index 00000000..73090333 --- /dev/null +++ 
b/dags/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py @@ -0,0 +1,339 @@ +# Copyright 2020-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Aniek Roelofs, Keegan Smith + +from __future__ import annotations + +import logging +import os +import requests +import xmltodict +from xml.parsers.expat import ExpatError +from typing import Union + +import pendulum +from airflow.decorators import dag, task +from airflow.exceptions import AirflowException +from google.cloud.bigquery import SourceFormat, Client +from tenacity import ( + retry, + stop_after_attempt, + wait_chain, + wait_fixed, + retry_if_exception_type, +) + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from oaebu_workflows.config import schema_folder, oaebu_user_agent_header +from oaebu_workflows.onix_utils import OnixTransformer +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.google.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path, gcs_download_blob +from observatory_platform.google.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset +from observatory_platform.airflow.release import SnapshotRelease, set_task_state, make_snapshot_date +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback + + +# Download job will wait 120 seconds between first 2 attempts, then 30 minutes for the following 3 +DOWNLOAD_RETRY_CHAIN = wait_chain(*[wait_fixed(120) for _ in range(2)] + [wait_fixed(1800) for _ in range(3)]) + + +class OapenMetadataRelease(SnapshotRelease): + def __init__(self, dag_id: str, run_id: str, snapshot_date: pendulum.DateTime): + """Construct a OapenMetadataRelease instance + + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param snapshot_date: The date of the snapshot_date/release + """ + super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) + self.download_file_name = f"metadata_{snapshot_date.format('YYYYMMDD')}.xml" + self.transform_file_name = "transformed.jsonl.gz" # Final onix file + + @property + def download_path(self): + return os.path.join(self.download_folder, self.download_file_name) + + @property + def transform_path(self): + return os.path.join(self.transform_folder, self.transform_file_name) + + @property + def transform_files(self): + files = os.listdir(self.transform_folder) + return [os.path.join(self.transform_folder, f) for f in files] + + @property + def download_blob_name(self): + return gcs_blob_name_from_path(self.download_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return OapenMetadataRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + snapshot_date=pendulum.from_format(dict_["snapshot_date"], "YYYY-MM-DD"), + ) + + def 
to_dict(self) -> dict: + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "snapshot_date": self.snapshot_date.to_date_string(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + metadata_uri: str, + metadata_partner: Union[str, OaebuPartner] = "oapen_metadata", + elevate_related_products: bool = False, + bq_dataset_description: str = "OAPEN Metadata converted to ONIX", + bq_table_description: str = None, + api_dataset_id: str = "oapen", + catchup: bool = False, + start_date: pendulum.DateTime = pendulum.datetime(2018, 5, 14), + schedule: str = "0 12 * * Sun", # Midday every Sunday + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct an OapenMetadata DAG. + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param metadata_uri: The URI of the metadata XML file + :param metadata_partner: The metadata partner name + :param elevate_related_products: Whether to pull out the related products to the product level. + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the BigQuery table + :param api_dataset_id: The ID to store the dataset release in the API + :param catchup: Whether to catch up the DAG or not + :param start_date: The start date of the DAG + :param schedule: The schedule interval of the DAG + :param max_active_runs: The maximum number of active DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + + metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) + + # Filter schema file path + oapen_schema = os.path.join(schema_folder(workflow_module="oapen_metadata_telescope"), "oapen_metadata_filter.json") + + @dag( + dag_id=dag_id, + schedule=schedule, + start_date=start_date, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def oapen_metadata(): + @task() + def make_release(**context) -> dict: + """Make a release instance. The release is serialised to a dictionary so that it can be passed to + downstream tasks via XCom. + + :param context: the context passed from the PythonOperator. + See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed + :return: The OAPEN metadata release instance as a dictionary""" + snapshot_date = make_snapshot_date(**context) + return OapenMetadataRelease(dag_id, context["run_id"], snapshot_date).to_dict() + + @task() + def download(release: dict, **context) -> None: + """Task to download the OapenMetadataRelease release. + + :param context: the context passed from the PythonOperator. + :param release: an OapenMetadataRelease instance. 
+ """ + + release = OapenMetadataRelease.from_dict(release) + logging.info(f"Downloading metadata XML from url: {metadata_uri}") + download_metadata(metadata_uri, release.download_path) + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, + file_paths=[release.download_path], + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def transform(release: dict, **context) -> None: + """Transform the oapen metadata XML file into a valid ONIX file""" + + release = OapenMetadataRelease.from_dict(release) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_blob_name, + file_path=release.download_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_blob_name}") + + # Parse the downloaded metadata through the schema to extract relevant fields only + transformer = OnixTransformer( + input_path=release.download_path, + output_dir=release.transform_folder, + filter_products=True, + error_removal=True, + normalise_related_products=True, + deduplicate_related_products=True, + elevate_related_products=elevate_related_products, + add_name_fields=True, + collapse_subjects=True, + filter_schema=oapen_schema, + keep_intermediate=True, + ) + out_file = transformer.transform() + if release.transform_path != out_file: + raise FileNotFoundError( + f"Expected file {release.transform_path} not equal to transformed file: {out_file}" + ) + + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, + file_paths=release.transform_files, + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def bq_load(release: dict, **context) -> None: + """Load the transformed ONIX file into bigquery""" + + release = OapenMetadataRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=metadata_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + uri = gcs_blob_uri( + cloud_workspace.transform_bucket, + gcs_blob_name_from_path(release.transform_path), + ) + table_id = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.snapshot_date, + ) + client = Client(project=cloud_workspace.project_id) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=metadata_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + table_description=bq_table_description, + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = OapenMetadataRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + snapshot_date=release.snapshot_date, + data_interval_start=context["data_interval_start"], + data_interval_end=context["data_interval_end"], + ) + api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = 
OapenMetadataRelease.from_dict(release) + cleanup( + dag_id=dag_id, + execution_date=context["execution_date"], + workflow_folder=release.workflow_folder, + ) + + task_check_dependencies = check_dependencies() + xcom_release = make_release() + task_download = download(xcom_release) + task_transform = transform(xcom_release) + task_bq_load = bq_load(xcom_release) + task_add_release = add_new_dataset_releases(xcom_release) + task_cleanup_workflow = cleanup_workflow(xcom_release) + + ( + task_check_dependencies + >> xcom_release + >> task_download + >> task_transform + >> task_bq_load + >> task_add_release + >> task_cleanup_workflow + ) + + return oapen_metadata() + + +@retry( + stop=stop_after_attempt(5), + wait=DOWNLOAD_RETRY_CHAIN, + retry=retry_if_exception_type((ExpatError, ConnectionError, AirflowException)), + reraise=True, +) +def download_metadata(uri: str, download_path: str) -> None: + """Downloads the OAPEN metadata XML file. + OAPEN's downloader can give an incomplete file if the metadata is partially generated. + In this scenario, we should wait until the metadata generator has finished. + Otherwise, an attempt to parse the data will result in an XML ParseError. + Another scenario is that OAPEN returns only a header in the XML. We want this to also raise an error. + OAPEN metadata generation can take over an hour. + + :param uri: the url to query for the metadata + :param download_path: filepath to store the downloaded file + :raises ConnectionError: raised if the response from the metadata server does not have code 200 + :raises AirflowException: raised if the response does not contain any Product fields + """ + response = requests.get(uri, headers=oaebu_user_agent_header()) + if response.status_code != 200: + raise ConnectionError(f"Expected status code 200 from url {uri}, instead got response: {response.text}") + with open(download_path, "w") as f: + f.write(response.content.decode("utf-8")) + logging.info(f"Successfully downloaded XML to {download_path}") + + # Attempt to parse the XML, will raise an ExpatError if it's invalid + with open(download_path, "rb") as f: + xmltodict.parse(f) + logging.info("XML file is valid") + + # Check that more than just the header is returned + if "<Product>" not in response.content.decode("utf-8"): + raise AirflowException("No products found in metadata") diff --git a/oaebu_workflows/irus_oapen_telescope/sql/__init__.py b/dags/oaebu_workflows/oapen_metadata_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/sql/__init__.py rename to dags/oaebu_workflows/oapen_metadata_telescope/schema/__init__.py diff --git a/oaebu_workflows/oapen_metadata_telescope/schema/oapen_metadata_filter.json b/dags/oaebu_workflows/oapen_metadata_telescope/schema/oapen_metadata_filter.json similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/schema/oapen_metadata_filter.json rename to dags/oaebu_workflows/oapen_metadata_telescope/schema/oapen_metadata_filter.json diff --git a/oaebu_workflows/irus_oapen_telescope/tests/__init__.py b/dags/oaebu_workflows/oapen_metadata_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/tests/__init__.py rename to dags/oaebu_workflows/oapen_metadata_telescope/sql/__init__.py diff --git a/oaebu_workflows/irus_oapen_telescope/tests/fixtures/__init__.py b/dags/oaebu_workflows/onix_telescope/__init__.py similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/tests/fixtures/__init__.py rename to 
dags/oaebu_workflows/onix_telescope/__init__.py diff --git a/dags/oaebu_workflows/onix_telescope/onix_telescope.py b/dags/oaebu_workflows/onix_telescope/onix_telescope.py new file mode 100644 index 00000000..830643e4 --- /dev/null +++ b/dags/oaebu_workflows/onix_telescope/onix_telescope.py @@ -0,0 +1,323 @@ +# Copyright 2021-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: James Diprose, Keegan Smith + +import logging +import os +import re +from typing import List, Union + +import pendulum +from airflow.decorators import dag, task, task_group +from airflow.exceptions import AirflowException, AirflowSkipException +from google.cloud.bigquery import SourceFormat, Client + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from oaebu_workflows.onix_utils import collapse_subjects, onix_parser_download, onix_parser_execute +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import load_jsonl, save_jsonl_gz +from observatory_platform.google.gcs import gcs_blob_uri, gcs_upload_files, gcs_blob_name_from_path, gcs_download_blob +from observatory_platform.google.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.sftp import SftpFolders, make_sftp_connection +from observatory_platform.airflow.release import SnapshotRelease, set_task_state +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback + + +class OnixRelease(SnapshotRelease): + def __init__( + self, + *, + dag_id: str, + run_id: str, + snapshot_date: pendulum.DateTime, + onix_file_name: str, + ): + """Construct an OnixRelease. + + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param snapshot_date: The date of the snapshot/release + :param onix_file_name: The ONIX file name. 
+ """ + super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) + self.onix_file_name = onix_file_name + self.download_file_name = self.onix_file_name + self.parsed_file_name = "full.jsonl" + self.transform_file_name = "onix.jsonl.gz" + + @property + def download_path(self): + return os.path.join(self.download_folder, self.download_file_name) + + @property + def parsed_path(self): + return os.path.join(self.transform_folder, self.parsed_file_name) + + @property + def transform_path(self): + return os.path.join(self.transform_folder, self.transform_file_name) + + @property + def download_blob_name(self): + return gcs_blob_name_from_path(self.download_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return OnixRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + snapshot_date=pendulum.from_format(dict_["snapshot_date"], "YYYY-MM-DD"), + onix_file_name=dict_["onix_file_name"], + ) + + def to_dict(self) -> dict: + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "snapshot_date": self.snapshot_date.to_date_string(), + "onix_file_name": self.onix_file_name, + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + date_regex: str, + sftp_root: str = "/", + metadata_partner: Union[str, OaebuPartner] = "onix", + bq_dataset_description: str = "ONIX data provided by Org", + bq_table_description: str = None, + api_dataset_id: str = "onix", + sftp_service_conn_id: str = "sftp_service", + catchup: bool = False, + schedule: str = "0 12 * * Sun", # Midday every sunday + start_date: pendulum.DateTime = pendulum.datetime(2021, 3, 28), + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct an OINX DAG. + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param sftp_root: The working root of the SFTP server, passed to the SftoFolders class + :param metadata_partner: The metadata partner name + :param date_regex: Regular expression for extracting a date string from an ONIX file name + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the biguery table + :param api_dataset_id: The ID to store the dataset release in the API + :param sftp_service_conn_id: Airflow connection ID for the SFTP service + :param catchup: Whether to catchup the DAG or not + :param schedule: The schedule interval of the DAG + :param start_date: The start date of the DAG + :param max_active_runs: The maximum number of active DAG runs. + :param retries: The number of times to retry failed tasks. + :param retry_delay: The delay between retries in minutes. + """ + metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) + sftp_folders = SftpFolders(dag_id, sftp_conn_id=sftp_service_conn_id, sftp_root=sftp_root) + + @dag( + dag_id=dag_id, + start_date=start_date, + schedule=schedule, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def onix_telescope(): + @task() + def fetch_releases(**context) -> List[dict]: + """Lists all ONIX releases and publishes their file names as an XCom. + + :param context: the context passed from the BranchPythonOperator. 
+ See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed + :return: a list of ONIX release dictionaries. + """ + + # List release dates + release_info = [] + with make_sftp_connection(sftp_service_conn_id) as sftp: + files = sftp.listdir(sftp_folders.upload) + for file_name in files: + if re.match(r"^.*\.(onx|xml)$", file_name): + try: + date_str = re.search(date_regex, file_name).group(0) + except AttributeError: + msg = f"Could not find date with pattern `{date_regex}` in file name {file_name}" + logging.error(msg) + raise AirflowException(msg) + release_info.append({"release_date": date_str, "file_name": file_name}) + + if not bool(release_info): + raise AirflowSkipException("No new releases available. Skipping downstream DAG tasks.") + + releases = [] + for record in release_info: + onix_file_name = record["file_name"] + releases.append( + OnixRelease( + dag_id=dag_id, + run_id=context["run_id"], + snapshot_date=pendulum.parse(record["release_date"]), + onix_file_name=onix_file_name, + ) + ) + return [r.to_dict() for r in releases] + + @task_group(group_id="process_release") + def process_release(data: dict): + @task() + def move_files_to_in_progress(release: dict, **context): + """Move ONIX files to SFTP in-progress folder. + :param release: an ONIX release dictionary""" + + release = OnixRelease.from_dict(release) + sftp_folders.move_files_to_in_progress(release.onix_file_name) + + @task() + def download(release: dict, **context): + """Task to download an ONIX release.""" + + release = OnixRelease.from_dict(release) + with make_sftp_connection(sftp_service_conn_id) as sftp: + in_progress_file = os.path.join(sftp_folders.in_progress, release.onix_file_name) + sftp.get(in_progress_file, localpath=release.download_path) + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, file_paths=[release.download_path] + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def transform(release: dict, **context) -> None: + """Task to transform an ONIX release.""" + + release = OnixRelease.from_dict(release) + + # Download the parser + success, parser_path = onix_parser_download() + set_task_state(success, context["ti"].task_id) + + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_blob_name, + file_path=release.download_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file {release.download_blob_name}") + + onix_parser_execute( + parser_path=parser_path, input_dir=release.download_folder, output_dir=release.transform_folder + ) + onix = collapse_subjects(load_jsonl(release.parsed_path)) + save_jsonl_gz(release.transform_path, onix) + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, file_paths=[release.transform_path] + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def bq_load(release: dict, **context) -> None: + """Task to load each transformed release to BigQuery.""" + + release = OnixRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=metadata_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + client = Client(project=cloud_workspace.project_id) + # Load each transformed release + table_id = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + 
metadata_partner.bq_table_name, + release.snapshot_date, + ) + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=metadata_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + table_description=bq_table_description, + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def move_files_to_finished(release: dict, **context) -> None: + """Move ONIX files to SFTP finished folder.""" + + release = OnixRelease.from_dict(release) + sftp_folders.move_files_to_finished(release.onix_file_name) + + @task() + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = OnixRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + snapshot_date=release.snapshot_date, + data_interval_start=context["data_interval_start"], + data_interval_end=context["data_interval_end"], + ) + api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = OnixRelease.from_dict(release) + cleanup( + dag_id=dag_id, execution_date=context["execution_date"], workflow_folder=release.workflow_folder + ) + + ( + move_files_to_in_progress(data) + >> download(data) + >> transform(data) + >> bq_load(data) + >> move_files_to_finished(data) + >> add_new_dataset_releases(data) + >> cleanup_workflow(data) + ) + + task_check_dependencies = check_dependencies(airflow_conns=[sftp_service_conn_id]) + xcom_releases = fetch_releases() + process_release_task_group = process_release.expand(data=xcom_releases) + + task_check_dependencies >> xcom_releases >> process_release_task_group + + return onix_telescope() diff --git a/oaebu_workflows/jstor_telescope/__init__.py b/dags/oaebu_workflows/onix_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/jstor_telescope/__init__.py rename to dags/oaebu_workflows/onix_telescope/schema/__init__.py diff --git a/oaebu_workflows/onix_telescope/schema/onix.json b/dags/oaebu_workflows/onix_telescope/schema/onix.json similarity index 100% rename from oaebu_workflows/onix_telescope/schema/onix.json rename to dags/oaebu_workflows/onix_telescope/schema/onix.json diff --git a/oaebu_workflows/jstor_telescope/schema/__init__.py b/dags/oaebu_workflows/onix_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/jstor_telescope/schema/__init__.py rename to dags/oaebu_workflows/onix_telescope/sql/__init__.py diff --git a/oaebu_workflows/onix_utils.py b/dags/oaebu_workflows/onix_utils.py similarity index 99% rename from oaebu_workflows/onix_utils.py rename to dags/oaebu_workflows/onix_utils.py index 12761c1c..23d2dc6d 100644 --- a/oaebu_workflows/onix_utils.py +++ b/dags/oaebu_workflows/onix_utils.py @@ -31,10 +31,10 @@ from onixcheck import validate as validate_onix from oaebu_workflows.config import schema_folder -from observatory.platform.config import observatory_home -from observatory.platform.utils.http_download import download_file -from observatory.platform.utils.proc_utils import 
wait_for_process -from observatory.platform.files import save_jsonl_gz, load_jsonl +from observatory_platform.config import observatory_home +from observatory_platform.http_download import download_file +from observatory_platform.proc_utils import wait_for_process +from observatory_platform.files import save_jsonl_gz, load_jsonl @dataclass diff --git a/oaebu_workflows/jstor_telescope/sql/__init__.py b/dags/oaebu_workflows/onix_workflow/__init__.py similarity index 100% rename from oaebu_workflows/jstor_telescope/sql/__init__.py rename to dags/oaebu_workflows/onix_workflow/__init__.py diff --git a/oaebu_workflows/onix_workflow/onix_work_aggregation.py b/dags/oaebu_workflows/onix_workflow/onix_work_aggregation.py similarity index 100% rename from oaebu_workflows/onix_workflow/onix_work_aggregation.py rename to dags/oaebu_workflows/onix_workflow/onix_work_aggregation.py diff --git a/dags/oaebu_workflows/onix_workflow/onix_workflow.py b/dags/oaebu_workflows/onix_workflow/onix_workflow.py new file mode 100644 index 00000000..9b1d4470 --- /dev/null +++ b/dags/oaebu_workflows/onix_workflow/onix_workflow.py @@ -0,0 +1,1399 @@ +# Copyright 2020-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Author: Tuan Chien, Richard Hosking, Keegan Smith + +import os +from datetime import timedelta, datetime +from typing import List, Optional, Tuple, Union, Iterable +from concurrent.futures import ThreadPoolExecutor, as_completed +import re +import logging +import json + +import pendulum +from google.cloud.bigquery import SourceFormat, Client +from ratelimit import limits, sleep_and_retry +from tenacity import wait_exponential_jitter +from jinja2 import Environment, FileSystemLoader +from airflow.decorators import dag, task, task_group +from airflow.models.baseoperator import chain +from airflow.models import DagRun +from airflow.utils.session import provide_session +from sqlalchemy.orm.scoping import scoped_session + +from oaebu_workflows.airflow_pools import CrossrefEventsPool +from oaebu_workflows.config import schema_folder as default_schema_folder +from oaebu_workflows.config import sql_folder, oaebu_user_agent_header +from oaebu_workflows.oaebu_partners import OaebuPartner, DataPartner, partner_from_str +from oaebu_workflows.onix_workflow.onix_work_aggregation import BookWorkAggregator, BookWorkFamilyAggregator +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.airflow.sensors import DagCompleteSensor +from observatory_platform.files import save_jsonl_gz +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.url_utils import retry_get_url +from observatory_platform.google.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path +from observatory_platform.jinja2_utils import render_template +from observatory_platform.airflow.release import SnapshotRelease, make_snapshot_date, set_task_state +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from 
observatory_platform.airflow.airflow import on_failure_callback +from observatory_platform.google.bigquery import ( + bq_load_table, + bq_table_id, + bq_sharded_table_id, + bq_create_dataset, + bq_create_table_from_query, + bq_run_query, + bq_select_table_shard_dates, + bq_copy_table, + bq_find_schema, +) + + +CROSSREF_EVENT_URL_TEMPLATE = ( + "https://api.eventdata.crossref.org/v1/events?mailto={mailto}" + "&from-collected-date={start_date}&until-collected-date={end_date}&rows=1000" + "&obj-id={doi}" +) + + +class OnixWorkflowRelease(SnapshotRelease): + """Release information for OnixWorkflow""" + + def __init__( + self, + *, + dag_id: str, + run_id: str, + snapshot_date: pendulum.DateTime, + onix_snapshot_date: pendulum.DateTime, + crossref_master_snapshot_date: pendulum.DateTime, + ): + """ + Construct the OnixWorkflow Release + :param dag_id: DAG ID. + :param run_id: The Airflow run ID. + :param snapshot_date: The date of the partition/release + :param onix_snapshot_date: The ONIX snapshot/release date. + :param crossref_master_snapshot_date: The release date/suffix of the crossref master table + """ + + super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) + + # Dates + self.onix_snapshot_date = onix_snapshot_date + self.crossref_master_snapshot_date = crossref_master_snapshot_date + + # Files + self.workslookup_file_name = "worksid.jsonl.gz" + self.workslookup_errors_file_name = "worksid_errors.jsonl.gz" + self.worksfamilylookup_file_name = "workfamilyid.jsonl.gz" + self.crossref_metadata_file_name = "crossref_metadata.jsonl.gz" + self.crossref_events_file_name = "crossref_events.jsonl.gz" + + # Generated Schemas + self.book_product_schema_file_name = "book_product_schema.json" + self.author_metrics_schema_file_name = "author_metrics_schema.json" + self.book_metrics_schema_file_name = "metrics_books_metrics_schema.json" + self.country_metrics_schema_file_name = "country_metrics_schema.json" + self.subject_metrics_bic_schema_file_name = "subject_metrics_bic_schema.json" + self.subject_metrics_bisac_schema_file_name = "subject_metrics_bisac_schema.json" + self.subject_metrics_thema_schema_file_name = "subject_metrics_thema_schema.json" + + ## File Paths ## + @property + def workslookup_path(self): + return os.path.join(self.transform_folder, self.workslookup_file_name) + + @property + def workslookup_errors_path(self): + return os.path.join(self.transform_folder, self.workslookup_errors_file_name) + + @property + def worksfamilylookup_path(self): + return os.path.join(self.transform_folder, self.worksfamilylookup_file_name) + + @property + def crossref_metadata_path(self): + return os.path.join(self.transform_folder, self.crossref_metadata_file_name) + + @property + def crossref_events_path(self): + return os.path.join(self.transform_folder, self.crossref_events_file_name) + + @property + def book_product_schema_path(self): + return os.path.join(self.transform_folder, self.book_product_schema_file_name) + + @property + def author_metrics_schema(self): + return os.path.join(self.transform_folder, self.author_metrics_schema_file_name) + + @property + def book_metrics_schema(self): + return os.path.join(self.transform_folder, self.book_metrics_schema_file_name) + + @property + def country_metrics_schema(self): + return os.path.join(self.transform_folder, self.country_metrics_schema_file_name) + + @property + def subject_metrics_bic_schema(self): + return os.path.join(self.transform_folder, self.subject_metrics_bic_schema_file_name) + + @property + def subject_metrics_bisac_schema(self): + return 
os.path.join(self.transform_folder, self.subject_metrics_bisac_schema_file_name) + + @property + def subject_metrics_thema_schema(self): + return os.path.join(self.transform_folder, self.subject_metrics_thema_schema_file_name) + + ## Blob Names ## + @property + def workslookup_blob_name(self): + return gcs_blob_name_from_path(self.workslookup_path) + + @property + def workslookup_errors_blob_name(self): + return gcs_blob_name_from_path(self.workslookup_errors_path) + + @property + def worksfamilylookup_blob_name(self): + return gcs_blob_name_from_path(self.worksfamilylookup_path) + + @property + def crossref_metadata_blob_name(self): + return gcs_blob_name_from_path(self.crossref_metadata_path) + + @property + def crossref_events_blob_name(self): + return gcs_blob_name_from_path(self.crossref_events_path) + + @staticmethod + def from_dict(dict_: dict): + return OnixWorkflowRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + snapshot_date=pendulum.from_format(dict_["snapshot_date"], "YYYY-MM-DD"), + onix_snapshot_date=pendulum.from_format(dict_["onix_snapshot_date"], "YYYY-MM-DD"), + crossref_master_snapshot_date=pendulum.from_format(dict_["crossref_master_snapshot_date"], "YYYY-MM-DD"), + ) + + def to_dict(self): + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "snapshot_date": self.snapshot_date.to_date_string(), + "onix_snapshot_date": self.onix_snapshot_date.to_date_string(), + "crossref_master_snapshot_date": self.crossref_master_snapshot_date.to_date_string(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + metadata_partner: Union[str, OaebuPartner], + # BigQuery parameters + bq_master_crossref_project_id: str = "academic-observatory", + bq_master_crossref_dataset_id: str = "crossref_metadata", + bq_oaebu_crossref_dataset_id: str = "crossref", + bq_master_crossref_metadata_table_name: str = "crossref_metadata", + bq_oaebu_crossref_metadata_table_name: str = "crossref_metadata", + bq_crossref_events_table_name: str = "crossref_events", + bq_country_project_id: str = "oaebu-public-data", + bq_country_dataset_id: str = "oaebu_reference", + bq_subject_project_id: str = "oaebu-public-data", + bq_subject_dataset_id: str = "oaebu_reference", + bq_book_table_name: str = "book", + bq_book_product_table_name: str = "book_product", + bq_onix_workflow_dataset: str = "onix_workflow", + bq_oaebu_intermediate_dataset: str = "oaebu_intermediate", + bq_oaebu_dataset: str = "oaebu", + bq_oaebu_export_dataset: str = "data_export", + bq_oaebu_latest_export_dataset: str = "data_export_latest", + bq_worksid_table_name: str = "onix_workid_isbn", + bq_worksid_error_table_name: str = "onix_workid_isbn_errors", + bq_workfamilyid_table_name: str = "onix_workfamilyid_isbn", + oaebu_intermediate_match_suffix: str = "_matched", + # Run parameters + data_partners: List[Union[str, OaebuPartner]] = None, + ga3_views_field="page_views", + schema_folder: str = default_schema_folder(workflow_module="onix_workflow"), + mailto: str = "agent@observatory.academy", + crossref_start_date: pendulum.DateTime = pendulum.datetime(2018, 5, 14), + api_dataset_id: str = "onix_workflow", + max_threads: int = 2 * os.cpu_count() - 1, + # Airflow parameters + sensor_dag_ids: List[str] = None, + catchup: Optional[bool] = False, + start_date: Optional[pendulum.DateTime] = pendulum.datetime(2022, 8, 1), + schedule: Optional[str] = "0 0 * * Mon", # Mondays at midnight + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, + +): + """ + Initialises the 
ONIX workflow DAG. + + :param dag_id: DAG ID. + :param cloud_workspace: The CloudWorkspace object for this DAG + + :param bq_master_crossref_project_id: GCP project ID of crossref master data + :param bq_master_crossref_dataset_id: GCP dataset ID of crossref master data + :param bq_oaebu_crossref_dataset_id: GCP dataset ID of crossref OAeBU data + :param bq_master_crossref_metadata_table_name: The name of the master crossref metadata table + :param bq_oaebu_crossref_metadata_table_name: The name of the OAeBU crossref metadata table + :param bq_crossref_events_table_name: The name of the crossref events table + :param bq_country_project_id: GCP project ID of the country table + :param bq_country_dataset_id: GCP dataset containing the country table + :param bq_subject_project_id: GCP project ID of the subject tables + :param bq_subject_dataset_id: GCP dataset ID of the subject tables + :param bq_book_table_name: The name of the book table + :param bq_book_product_table_name: The name of the book product table + :param bq_onix_workflow_dataset: Onix workflow dataset. + :param bq_oaebu_intermediate_dataset: OAEBU intermediate dataset. + :param bq_oaebu_dataset: OAEBU dataset. + :param bq_oaebu_export_dataset: OAEBU data export dataset. + :param bq_oaebu_latest_export_dataset: OAEBU data export dataset with the latest export tables + :param bq_worksid_table_name: table ID of the worksid table + :param bq_worksid_error_table_name: table ID of the worksid error table + :param bq_workfamilyid_table_name: table ID of the workfamilyid table + :param oaebu_intermediate_match_suffix: Suffix to append to intermediate tables + + :param data_partners: OAEBU data sources. + :param ga3_views_field: The name of the GA3 views field - should be either 'page_views' or 'unique_views' + :param schema_folder: the SQL schema path. + :param mailto: email address used to identify the user when sending requests to an API. + :param crossref_start_date: The starting date of crossref's API calls + :param api_dataset_id: The ID to store the dataset release in the API + :param max_threads: The maximum number of threads to use for parallel tasks. + + :param sensor_dag_ids: Dag IDs for dependent tasks + :param catchup: Whether to catch up missed DAG runs. + :param start_date: Start date of the DAG. + :param schedule: Scheduled interval for running the DAG. + :param max_active_runs: The maximum number of active DAG runs. + :param retries: The number of times to retry failed tasks. + :param retry_delay: The delay between retries in minutes. + """ + + if not sensor_dag_ids: + sensor_dag_ids = [] + + if data_partners is None: + data_partners = list() + + metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) + data_partners = [partner_from_str(p) for p in data_partners] + + # Create pool for crossref API calls (if it doesn't exist) + # Pools are necessary to throttle the maximum number of requests we can make per second and avoid 429 errors + crossref_events_pool = CrossrefEventsPool(pool_slots=15) + crossref_events_pool.create_pool() + + @dag( + dag_id=dag_id, + schedule=schedule, + start_date=start_date, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def onix_workflow(): + """Construct the DAG""" + + @task_group(group_id="sensors") + def make_sensors(): + """Create the sensor tasks for the DAG. 
These check that the data partner dag runs are complete""" + + tasks = [] + for ext_dag_id in sensor_dag_ids: + sensor = DagCompleteSensor( + task_id=f"{ext_dag_id}_sensor", + external_dag_id=ext_dag_id, + mode="reschedule", + poke_interval=int(1200), # Check if dag run is ready every 20 minutes + timeout=int(timedelta(days=1).total_seconds()), # Sensor will fail after 1 day of waiting + check_existence=True, + # Custom date retrieval fn. Airflow expects a callable with the execution_date as an argument only. + execution_date_fn=lambda dt: latest_execution_timedelta(dt, ext_dag_id), + ) + + tasks.append(sensor) + chain(tasks) + + @task() + def make_release(**context) -> dict: + """Creates a release object. + + :param context: From Airflow. Contains the execution_date. + :return: a dictionary representation of the OnixWorkflowRelease object. + """ + + # Get ONIX release date + onix_table_id = bq_table_id( + project_id=cloud_workspace.project_id, + dataset_id=metadata_partner.bq_dataset_id, + table_id=metadata_partner.bq_table_name, + ) + snapshot_date = make_snapshot_date(**context) + client = Client(project=cloud_workspace.project_id) + onix_snapshot_dates = bq_select_table_shard_dates( + table_id=onix_table_id, end_date=snapshot_date, client=client + ) + if not len(onix_snapshot_dates): + raise RuntimeError("OnixWorkflow.make_release: no ONIX releases found") + + onix_snapshot_date = onix_snapshot_dates[0] # Get most recent snapshot + + # Get Crossref Metadata release date + crossref_table_id = bq_table_id( + project_id=bq_master_crossref_project_id, + dataset_id=bq_master_crossref_dataset_id, + table_id=bq_master_crossref_metadata_table_name, + ) + crossref_metadata_snapshot_dates = bq_select_table_shard_dates( + table_id=crossref_table_id, end_date=snapshot_date, client=client + ) + if not len(crossref_metadata_snapshot_dates): + raise RuntimeError("OnixWorkflow.make_release: no Crossref Metadata releases found") + crossref_master_snapshot_date = crossref_metadata_snapshot_dates[0] # Get most recent snapshot + + # Make the release object + return OnixWorkflowRelease( + dag_id=dag_id, + run_id=context["run_id"], + snapshot_date=snapshot_date, + onix_snapshot_date=onix_snapshot_date, + crossref_master_snapshot_date=crossref_master_snapshot_date, + ).to_dict() + + @task() + def aggregate_works(release: dict, **context) -> None: + """Fetches the ONIX product records from our ONIX database, aggregates them into works, workfamilies, + and outputs it into jsonl files. 
+ + :param release: The onix workflow release object + """ + + release = OnixWorkflowRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_onix_workflow_dataset, + location=cloud_workspace.data_location, + description="Onix Workflow Aggregations", + ) + + # Fetch ONIX data + sharded_onix_table = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.onix_snapshot_date, + ) + client = Client(project=cloud_workspace.project_id) + products = get_onix_records(sharded_onix_table, client=client) + + # Aggregate into works + agg = BookWorkAggregator(products) + works = agg.aggregate() + lookup_table = agg.get_works_lookup_table() + save_jsonl_gz(release.workslookup_path, lookup_table) + + # Save errors from aggregation + error_table = [{"Error": error} for error in agg.errors] + save_jsonl_gz(release.workslookup_errors_path, error_table) + + # Aggregate work families + agg = BookWorkFamilyAggregator(works) + agg.aggregate() + lookup_table = agg.get_works_family_lookup_table() + save_jsonl_gz(release.worksfamilylookup_path, lookup_table) + + # Upload the aggregation tables and error tables to a GCP bucket in preparation for BQ loading + files = [release.workslookup_path, release.workslookup_errors_path, release.worksfamilylookup_path] + gcs_upload_files(bucket_name=cloud_workspace.transform_bucket, file_paths=files) + + # Load the 'WorkID lookup', 'WorkID lookup table errors' and 'WorkFamilyID lookup' tables into BigQuery + aggregation_blobs = [ + release.workslookup_blob_name, + release.workslookup_errors_blob_name, + release.worksfamilylookup_blob_name, + ] + aggregation_tables = [ + bq_worksid_table_name, + bq_worksid_error_table_name, + bq_workfamilyid_table_name, + ] + for blob, table_name in zip(aggregation_blobs, aggregation_tables): + uri = gcs_blob_uri(cloud_workspace.transform_bucket, blob) + table_id = bq_sharded_table_id( + cloud_workspace.project_id, bq_onix_workflow_dataset, table_name, release.snapshot_date + ) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=bq_find_schema(path=schema_folder, table_name=table_name), + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + write_disposition="WRITE_TRUNCATE", + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def create_crossref_metadata_table(release: dict, **context) -> None: + """Creates the crossref metadata table by querying the AO master table and matching on this publisher's ISBNs""" + + release = OnixWorkflowRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_crossref_dataset_id, + location=cloud_workspace.data_location, + description="Data from Crossref sources", + ) + + onix_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.onix_snapshot_date, + ) + master_crossref_metadata_table_id = bq_sharded_table_id( + bq_master_crossref_project_id, + bq_master_crossref_dataset_id, + bq_master_crossref_metadata_table_name, + release.crossref_master_snapshot_date, + ) + sql = render_template( + os.path.join(sql_folder(workflow_module="onix_workflow"), "crossref_metadata_filter_isbn.sql.jinja2"), + onix_table_id=onix_table_id, + crossref_metadata_table_id=master_crossref_metadata_table_id, + ) + logging.info("Creating crossref metadata table from master table") + schema_file_path = 
bq_find_schema(path=schema_folder, table_name=bq_oaebu_crossref_metadata_table_name) + oaebu_crossref_metadata_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_crossref_dataset_id, + bq_oaebu_crossref_metadata_table_name, + release.snapshot_date, + ) + client = Client(project=cloud_workspace.project_id) + state = bq_create_table_from_query( + sql=sql, table_id=oaebu_crossref_metadata_table_id, schema_file_path=schema_file_path, client=client + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def create_crossref_events_table(release: dict, **context) -> None: + """Download, transform, upload and create a table for crossref events""" + + release = OnixWorkflowRelease.from_dict(release) + + # Get the unique dois from the metadata table + metadata_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_crossref_dataset_id, + bq_oaebu_crossref_metadata_table_name, + release.snapshot_date, + ) + client = Client(project=cloud_workspace.project_id) + dois = dois_from_table(metadata_table_id, doi_column_name="DOI", distinct=True, client=client) + + # Download and transform all events + start_date = crossref_start_date + end_date = release.snapshot_date + events = download_crossref_events(dois, start_date, end_date, mailto, max_threads=max_threads) + events = transform_crossref_events(events, max_threads=max_threads) + + # Zip and upload to google cloud + save_jsonl_gz(release.crossref_events_path, events) + gcs_upload_files(bucket_name=cloud_workspace.transform_bucket, file_paths=[release.crossref_events_path]) + table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_crossref_dataset_id, + bq_crossref_events_table_name, + release.snapshot_date, + ) + + state = bq_load_table( + uri=gcs_blob_uri(cloud_workspace.transform_bucket, release.crossref_events_blob_name), + table_id=table_id, + schema_file_path=bq_find_schema(path=schema_folder, table_name=bq_crossref_events_table_name), + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + write_disposition="WRITE_TRUNCATE", + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def create_book_table(release: dict, **context) -> None: + """Create the oaebu book table using the crossref event and metadata tables""" + + release = OnixWorkflowRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_dataset, + location=cloud_workspace.data_location, + description="OAEBU Tables", + ) + book_table_id = bq_sharded_table_id( + cloud_workspace.project_id, bq_oaebu_dataset, bq_book_table_name, release.snapshot_date + ) + crossref_metadata_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_crossref_dataset_id, + bq_oaebu_crossref_metadata_table_name, + release.snapshot_date, + ) + crossref_events_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_crossref_dataset_id, + bq_crossref_events_table_name, + release.snapshot_date, + ) + sql = render_template( + os.path.join(sql_folder(workflow_module="onix_workflow"), "book.sql.jinja2"), + crossref_events_table_id=crossref_events_table_id, + crossref_metadata_table_id=crossref_metadata_table_id, + ) + logging.info(sql) + + client = Client(project=cloud_workspace.project_id) + status = bq_create_table_from_query( + sql=sql, + table_id=book_table_id, + schema_file_path=os.path.join(schema_folder, "book.json"), + client=client, + ) + set_task_state(status, context["ti"].task_id, release=release) + + 
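[Editor's note - not part of the patch.] The tasks above and the intermediate-table tasks that follow all repeat the same pattern: compose a date-sharded BigQuery table ID, render a Jinja2 SQL template against the input table IDs, and materialise the result with bq_create_table_from_query. The following minimal, self-contained sketch illustrates that convention only; it assumes the observatory helpers address date-sharded tables as <project>.<dataset>.<table>YYYYMMDD, uses placeholder project/dataset names, and stands in a cut-down template for the real book.sql.jinja2.

import pendulum
from jinja2 import Template

def sharded_table_id(project_id: str, dataset_id: str, table_name: str, date: pendulum.DateTime) -> str:
    # Assumed convention: date-sharded tables are addressed as <project>.<dataset>.<table><YYYYMMDD>,
    # mirroring what bq_sharded_table_id is used for in the tasks above.
    return f"{project_id}.{dataset_id}.{table_name}{date.format('YYYYMMDD')}"

snapshot_date = pendulum.datetime(2024, 1, 7)

# Hypothetical project/dataset names, standing in for cloud_workspace.project_id and the bq_* arguments.
events_table_id = sharded_table_id("example-project", "crossref", "crossref_events", snapshot_date)
metadata_table_id = sharded_table_id("example-project", "crossref", "crossref_metadata", snapshot_date)

# A cut-down stand-in for book.sql.jinja2: the real template is much richer, but the rendering
# step is the same - substitute fully qualified table IDs into the SQL before running the query.
sql = Template(
    "SELECT m.DOI, e.* FROM `{{ crossref_metadata_table_id }}` m "
    "LEFT JOIN `{{ crossref_events_table_id }}` e ON m.DOI = e.DOI"
).render(
    crossref_metadata_table_id=metadata_table_id,
    crossref_events_table_id=events_table_id,
)
print(sharded_table_id("example-project", "oaebu", "book", snapshot_date))  # example-project.oaebu.book20240107
print(sql)

The real helpers (bq_sharded_table_id, render_template, bq_create_table_from_query) come from observatory_platform; the sketch only shows the naming and templating convention the DAG tasks rely on.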
@task_group(group_id="intermediate_tables") + def create_tasks_intermediate_tables(release: dict): + tasks = [] + for data_partner in data_partners: + task = create_intermediate_table.override(task_id=f"intermediate_{data_partner.bq_table_name}")( + release, + orig_project_id=cloud_workspace.project_id, + orig_dataset=data_partner.bq_dataset_id, + orig_table=data_partner.bq_table_name, + orig_isbn=data_partner.isbn_field_name, + sharded=data_partner.sharded, + ) + tasks.append(task) + chain(tasks) + + @task() + def create_intermediate_table( + release: dict, + *, + orig_project_id: str, + orig_dataset: str, + orig_table: str, + orig_isbn: str, + sharded: bool, + **context, + ) -> None: + """Create an intermediate oaebu table. They are of the form datasource_matched + + :param release: Onix workflow release information. + :param orig_project_id: Project ID for the partner data. + :param orig_dataset: Dataset ID for the partner data. + :param orig_table: Table ID for the partner data. + :param orig_isbn: Name of the ISBN field in the partner data table. + :param sharded: Whether the data partner table is sharded + """ + + release = OnixWorkflowRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_intermediate_dataset, + location=cloud_workspace.data_location, + description="Intermediate OAEBU Tables", + ) + orig_table_id = ( + bq_sharded_table_id(orig_project_id, orig_dataset, orig_table, release.snapshot_date) + if sharded + else bq_table_id(orig_project_id, orig_dataset, orig_table) + ) + output_table_name = f"{orig_table}{oaebu_intermediate_match_suffix}" + template_path = os.path.join( + sql_folder(workflow_module="onix_workflow"), "assign_workid_workfamilyid.sql.jinja2" + ) + output_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_intermediate_dataset, + output_table_name, + release.snapshot_date, + ) + wid_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_worksid_table_name, + release.snapshot_date, + ) + wfam_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_workfamilyid_table_name, + release.snapshot_date, + ) + + # Make the table from SQL query + client = Client(project=cloud_workspace.project_id) + sql = render_template( + template_path, + orig_table_id=orig_table_id, + orig_isbn=orig_isbn, + wid_table_id=wid_table_id, + wfam_table_id=wfam_table_id, + ) + status = bq_create_table_from_query(sql=sql, table_id=output_table_id, client=client) + set_task_state(status, context["ti"].task_id, release=release) + + @task() + def create_book_product_table(release: dict, **context) -> None: + """Create the Book Product Table""" + + release = OnixWorkflowRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_dataset, + location=cloud_workspace.data_location, + description="OAEBU Tables", + ) + + # Data partner table names + dp_tables = { + f"{dp.type_id}_table_id": bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_intermediate_dataset, + f"{dp.type_id}_matched", + release.snapshot_date, + ) + for dp in data_partners + } + + # Metadata table name + onix_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.onix_snapshot_date, + ) + + # ONIX WF table names + workid_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_onix_workflow_dataset, + 
bq_worksid_table_name, + release.snapshot_date, + ) + workfamilyid_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_workfamilyid_table_name, + release.snapshot_date, + ) + country_table_id = bq_table_id(bq_country_project_id, bq_country_dataset_id, "country") + book_table_id = bq_sharded_table_id( + cloud_workspace.project_id, bq_oaebu_dataset, bq_book_table_name, release.snapshot_date + ) + + # Render the SQL + env = create_data_partner_env( + main_template=os.path.join(sql_folder(workflow_module="onix_workflow"), "book_product.sql.jinja2"), + data_partners=data_partners, + ) + sql = env.render( + onix_table_id=onix_table_id, + data_partners=data_partners, + book_table_id=book_table_id, + country_table_id=country_table_id, + workid_table_id=workid_table_id, + workfamilyid_table_id=workfamilyid_table_id, + ga3_views_field=ga3_views_field, + **dp_tables, + ) + logging.info(f"Book Product SQL:\n{sql}") + + # Create the table + with open(os.path.join(default_schema_folder("onix_workflow"), "book_product.json"), "r") as f: + schema = json.load(f) + + # Create the schema + for dp in data_partners: + months_schema_file = os.path.join(dp.schema_directory, dp.files.book_product_metrics_schema) + with open(months_schema_file, "r") as f: + months_schema = json.load(f) + schema = insert_into_schema(schema, insert_field=months_schema, schema_field_name="months") + + metadata_schema_file = os.path.join(dp.schema_directory, dp.files.book_product_metadata_schema) + if dp.has_metadata: + with open(metadata_schema_file, "r") as f: + metadata_schema = json.load(f) + schema = insert_into_schema(schema, insert_field=metadata_schema, schema_field_name="metadata") + + table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_dataset, + bq_book_product_table_name, + release.snapshot_date, + ) + + # Run the query + with open(release.book_product_schema_path, mode="w+") as f: + json.dump(schema, f) + client = Client(project=cloud_workspace.project_id) + status = bq_create_table_from_query( + sql=sql, table_id=table_id, schema_file_path=release.book_product_schema_path, client=client + ) + set_task_state(status, context["ti"].task_id, release=release) + + @task_group(group_id="export_tables") + def create_tasks_export_tables(release): + """Create tasks for exporting final metrics from our OAEBU data. + These are split into two categories: generic and custom. + The custom exports change their schema depending on the data partners. 
Generic tables do not.""" + + generic_export_tables = [ + { + "output_table": "book_list", + "query_template": os.path.join(sql_folder("onix_workflow"), "book_list.sql.jinja2"), + "schema": os.path.join(default_schema_folder("onix_workflow"), "book_list.json"), + }, + { + "output_table": "book_metrics_events", + "query_template": os.path.join(sql_folder("onix_workflow"), "book_metrics_events.sql.jinja2"), + "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_events.json"), + }, + ] + if "jstor_institution" in [dp.type_id for dp in data_partners]: + generic_export_tables.append( + { + "output_table": "book_institution_list", + "query_template": os.path.join(sql_folder("onix_workflow"), "book_institution_list.sql.jinja2"), + "schema": os.path.join(default_schema_folder("onix_workflow"), "book_institution_list.json"), + } + ) + generic_export_tables.append( + { + "output_table": "book_metrics_institution", + "query_template": os.path.join( + sql_folder("onix_workflow"), "book_metrics_institution.sql.jinja2" + ), + "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_institution.json"), + }, + ) + if "irus_oapen" in [dp.type_id for dp in data_partners]: + generic_export_tables.append( + { + "output_table": "book_metrics_city", + "query_template": os.path.join(sql_folder("onix_workflow"), "book_metrics_city.sql.jinja2"), + "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_city.json"), + } + ) + + # Create each export table in BigQuery + tasks = [] + + for export_table in generic_export_tables: + task = export_oaebu_table.override(task_id=f"export_{export_table['output_table']}")( + release, + output_table=export_table["output_table"], + query_template_path=export_table["query_template"], + schema_file_path=export_table["schema"], + ) + tasks.append(task) + + tasks.append(export_book_metrics(release)) + tasks.append(export_book_metrics_country(release)) + tasks.append(export_book_metrics_author(release)) + tasks.append(export_book_metrics_subjects(release)) + chain(tasks) + + @task() + def export_oaebu_table( + release: dict, *, output_table, query_template_path, schema_file_path, **context + ) -> None: + """Create an export table. 
+ + Takes several kwargs: + :param output_table: The name of the table to create + :param query_template: The name of the template SQL file + :param schema_file_path: The path to the schema + """ + release = OnixWorkflowRelease.from_dict(release) + _export_oaebu_table(release, output_table, query_template_path, schema_file_path) + + def _export_oaebu_table( + release: OnixWorkflowRelease, output_table, query_template_path, schema_file_path + ) -> bool: + """Non-task version of export_oaebu_table() + + :return: Whether the table creation was a success""" + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_export_dataset, + location=cloud_workspace.data_location, + description="OAEBU Tables for Dashboarding", + ) + + output_table_name = f"{cloud_workspace.project_id.replace('-', '_')}_{output_table}" + output_table_id = bq_sharded_table_id( + cloud_workspace.project_id, bq_oaebu_export_dataset, output_table_name, release.snapshot_date + ) + book_product_table_id = bq_sharded_table_id( + cloud_workspace.project_id, + bq_oaebu_dataset, + bq_book_product_table_name, + release.snapshot_date, + ) + country_table_id = bq_table_id(bq_country_project_id, bq_country_dataset_id, "country") + bic_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "bic_lookup") + bisac_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "bisac_lookup") + thema_table_id = bq_table_id(bq_subject_project_id, bq_subject_dataset_id, "thema_lookup") + + env = create_data_partner_env(main_template=query_template_path, data_partners=data_partners) + sql = env.render( + project_id=cloud_workspace.project_id, + dataset_id=bq_oaebu_dataset, + release=release.snapshot_date, + data_partners=data_partners, + book_product_table_id=book_product_table_id, + country_table_id=country_table_id, + bic_table_id=bic_table_id, + bisac_table_id=bisac_table_id, + thema_table_id=thema_table_id, + ) + logging.info(f"{output_table} SQL:\n{sql}") + + client = Client(project=cloud_workspace.project_id) + status = bq_create_table_from_query( + sql=sql, table_id=output_table_id, schema_file_path=schema_file_path, client=client + ) + return status + + @task() + def export_book_metrics_country(release: dict, **context) -> None: + """Create table for country metrics""" + + release = OnixWorkflowRelease.from_dict(release) + country_schema_base = os.path.join(default_schema_folder("onix_workflow"), "book_metrics_country.json") + with open(country_schema_base, "r") as f: + country_schema = json.load(f) + + for dp in [dp for dp in data_partners if dp.export_country]: + _file = dp.files.book_metrics_country_schema + with open(os.path.join(dp.schema_directory, _file), "r") as f: + dp_schema = json.load(f) + country_schema = insert_into_schema(country_schema, dp_schema) + + with open(release.country_metrics_schema, "w") as f: + json.dump(country_schema, f) + + query_template_path = os.path.join( + sql_folder(workflow_module="onix_workflow"), "book_metrics_country.sql.jinja2" + ) + status = _export_oaebu_table( + release=release, + output_table="book_metrics_country", + query_template_path=query_template_path, + schema_file_path=release.country_metrics_schema, + ) + set_task_state(status, context["ti"].task_id, release=release) + + @task() + def export_book_metrics_author(release: dict, **context) -> None: + """Create table for author metrics""" + + release = OnixWorkflowRelease.from_dict(release) + author_schema_base = os.path.join(default_schema_folder("onix_workflow"), 
"book_metrics_author.json") + with open(author_schema_base, "r") as f: + author_schema = json.load(f) + + for dp in [dp for dp in data_partners if dp.export_author]: + _file = dp.files.book_metrics_author_schema + with open(os.path.join(dp.schema_directory, _file), "r") as f: + dp_schema = json.load(f) + author_schema = insert_into_schema(author_schema, dp_schema) + + with open(release.author_metrics_schema, "w") as f: + json.dump(author_schema, f) + + query_template_path = os.path.join( + sql_folder(workflow_module="onix_workflow"), "book_metrics_author.sql.jinja2" + ) + status = _export_oaebu_table( + release=release, + output_table="book_metrics_author", + query_template_path=query_template_path, + schema_file_path=release.author_metrics_schema, + ) + set_task_state(status, context["ti"].task_id, release=release) + + @task() + def export_book_metrics(release: dict, **context) -> None: + """Create table for book metrics""" + + release = OnixWorkflowRelease.from_dict(release) + book_schema_base = os.path.join(default_schema_folder("onix_workflow"), "book_metrics.json") + with open(book_schema_base, "r") as f: + book_schema = json.load(f) + + for dp in [dp for dp in data_partners if dp.export_book_metrics]: + _file = dp.files.book_metrics_schema + with open(os.path.join(dp.schema_directory, _file), "r") as f: + dp_schema = json.load(f) + book_schema = insert_into_schema(book_schema, dp_schema) + + with open(release.book_metrics_schema, "w") as f: + json.dump(book_schema, f) + + query_template_path = os.path.join(sql_folder(workflow_module="onix_workflow"), "book_metrics.sql.jinja2") + status = _export_oaebu_table( + release=release, + output_table="book_metrics", + query_template_path=query_template_path, + schema_file_path=release.book_metrics_schema, + ) + set_task_state(status, context["ti"].task_id, release=release) + + @task() + def export_book_metrics_subjects(release: dict, **context) -> None: + """Create tables for subject metrics""" + + release = OnixWorkflowRelease.from_dict(release) + for sub, schema_dump in [ + ("bic", release.subject_metrics_bic_schema), + ("bisac", release.subject_metrics_bisac_schema), + ("thema", release.subject_metrics_thema_schema), + ]: + subject_schema_base = os.path.join( + default_schema_folder("onix_workflow"), f"book_metrics_subject_{sub}.json" + ) + with open(subject_schema_base, "r") as f: + subject_schema = json.load(f) + + for dp in [dp for dp in data_partners if dp.export_subject]: + _file = dp.files.book_metrics_subject_schema + with open(os.path.join(dp.schema_directory, _file), "r") as f: + dp_schema = json.load(f) + subject_schema = insert_into_schema(subject_schema, dp_schema) + + with open(schema_dump, "w") as f: + json.dump(subject_schema, f) + + query_template_path = os.path.join( + sql_folder(workflow_module="onix_workflow"), f"book_metrics_subject_{sub}.sql.jinja2" + ) + status = _export_oaebu_table( + release=release, + output_table=f"book_metrics_subject_{sub}", + query_template_path=query_template_path, + schema_file_path=schema_dump, + ) + set_task_state(status, context["ti"].task_id, release=release) + + @task() + def update_latest_export_tables(release: dict, **context) -> None: + """Create copies of the latest data export tables in bigquery""" + + release = OnixWorkflowRelease.from_dict(release) + copy_latest_export_tables( + project_id=cloud_workspace.project_id, + from_dataset=bq_oaebu_export_dataset, + to_dataset=bq_oaebu_latest_export_dataset, + date_match=release.snapshot_date.strftime("%Y%m%d"), + 
data_location=cloud_workspace.data_location, + ) + + @task() + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = OnixWorkflowRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + snapshot_date=release.snapshot_date, + data_interval_start=context["data_interval_start"], + data_interval_end=context["data_interval_end"], + ) + api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **context): + """Cleanup temporary files.""" + + release = OnixWorkflowRelease.from_dict(release) + cleanup(dag_id=dag_id, execution_date=context["execution_date"], workflow_folder=release.workflow_folder) + + # Define DAG tasks + task_check_dependencies = check_dependencies() + task_group_sensors = make_sensors() + xcom_release = make_release() + task_aggregate_works = aggregate_works(xcom_release) + task_create_crossref_metadata_table = create_crossref_metadata_table(xcom_release) + task_create_crossref_events_table = create_crossref_events_table( + xcom_release, + pool=crossref_events_pool.pool_name, + pool_slots=min(max_threads, crossref_events_pool.pool_slots), + ) + task_create_book_table = create_book_table(xcom_release) + task_group_create_intermediate_tables = create_tasks_intermediate_tables(xcom_release) + task_create_book_product_table = create_book_product_table(xcom_release) + task_group_create_export_tables = create_tasks_export_tables(xcom_release) + task_update_latest_export_tables = update_latest_export_tables(xcom_release) + task_add_release = add_new_dataset_releases(xcom_release) + task_cleanup_workflow = cleanup_workflow(xcom_release) + + ( + task_check_dependencies + >> task_group_sensors + >> xcom_release + >> task_aggregate_works + >> task_create_crossref_metadata_table + >> task_create_crossref_events_table + >> task_create_book_table + >> task_group_create_intermediate_tables + >> task_create_book_product_table + >> task_group_create_export_tables + >> task_update_latest_export_tables + >> task_add_release + >> task_cleanup_workflow + ) + + return onix_workflow() + + +def dois_from_table( + table_id: str, doi_column_name: str = "DOI", distinct: bool = True, client: Client = None +) -> List[str]: + """ + Queries a metadata table to retrieve its unique DOIs, provided the DOIs are not stored in a nested structure. + + :param table_id: The fully qualified ID of the metadata table on GCP + :param doi_column_name: The name of the DOI column + :param distinct: Whether to retrieve only unique DOIs + :param client: The BigQuery client to use + :return: All DOIs present in the metadata table + """ + + select_field = f"DISTINCT({doi_column_name})" if distinct else doi_column_name + sql = f"SELECT {select_field} FROM `{table_id}`" + query_results = bq_run_query(sql, client=client) + dois = [r["DOI"] for r in query_results] + return dois + + +def download_crossref_events( + dois: List[str], + start_date: pendulum.DateTime, + end_date: pendulum.DateTime, + mailto: str, + max_threads: int = 1, +) -> List[dict]: + """ + Spawns multiple threads to download event data (DOI and publisher only) for each doi supplied. 
+ The url template was made with reference to the crossref event api: + https://www.eventdata.crossref.org/guide/service/query-api/ + Note that the max_threads will cap at 15 because the events API will return a 429 if more than 15 requests are made + per second. Each API request happens to take roughly 1 second. Having more threads than necessary slows down the + download process as the retry script will wait a minimum of two seconds between each attempt. + + :param dois: The list of DOIs to download the events for + :param start_date: The start date for events we're interested in + :param end_date: The end date for events we're interested in + :param mailto: The email to use as a reference for who is requesting the data + :param max_threads: The maximum threads to spawn for the downloads. + :return: All events for the input DOIs + """ + + url_start_date = start_date.strftime("%Y-%m-%d") + url_end_date = end_date.strftime("%Y-%m-%d") + max_threads = min(max_threads, 15) + + event_urls = [ + CROSSREF_EVENT_URL_TEMPLATE.format(doi=doi, mailto=mailto, start_date=url_start_date, end_date=url_end_date) + for doi in dois + ] + + logging.info(f"Beginning crossref event data download from {len(event_urls)} URLs with {max_threads} workers") + logging.info( + f"Downloading DOI data using URL: {CROSSREF_EVENT_URL_TEMPLATE.format(doi='***', mailto=mailto, start_date=url_start_date, end_date=url_end_date)}" + ) + all_events = [] + with ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + for i, url in enumerate(event_urls): + futures.append(executor.submit(download_crossref_event_url, url, i=i)) + for future in as_completed(futures): + all_events.extend(future.result()) + + return all_events + + +def download_crossref_event_url(url: str, i: int = 0) -> List[dict]: + """ + Downloads all crossref events from a url, iterating through pages if there is more than one + + :param url: The url to send the request to + :param i: Worker number + :return: The events from this URL + """ + + events = [] + headers = oaebu_user_agent_header() + next_cursor, page_counts, total_events, page_events = download_crossref_page_events(url, headers) + events.extend(page_events) + total_counts = page_counts + while next_cursor: + tmp_url = url + f"&cursor={next_cursor}" + next_cursor, page_counts, _, page_events = download_crossref_page_events(tmp_url, headers) + total_counts += page_counts + events.extend(page_events) + logging.info(f"{i + 1}: {url} successful") + logging.info(f"{i + 1}: Total no. 
events: {total_events}, downloaded " f"events: {total_counts}") + return events + + + def download_crossref_page_events(url: str, headers: dict) -> Tuple[str, int, int, List[dict]]: + """ + Download crossref events from a single page + + :param url: The url to send the request to + :param headers: Headers to send with the request + :return: The cursor, event counter, total number of events and the events for the URL + """ + + crossref_events_limiter() + response = retry_get_url(url, num_retries=5, wait=wait_exponential_jitter(initial=0.5, max=60), headers=headers) + response_json = response.json() + total_events = response_json["message"]["total-results"] + events = response_json["message"]["events"] + next_cursor = response_json["message"]["next-cursor"] + counter = len(events) + + return next_cursor, counter, total_events, events + + +@sleep_and_retry +@limits(calls=15, period=1) +def crossref_events_limiter(): + """Throttle the calls to the crossref events API""" + return + + +def transform_crossref_events(events: List[dict], max_threads: int = 1) -> List[dict]: + """ + Spawns workers to transform crossref events + + :param events: A list of the events to transform + :param max_threads: The maximum number of threads to utilise for the transforming process + :return: transformed events, the order of the events in the input list is not preserved + """ + + logging.info(f"Beginning crossref event transform with {max_threads} workers") + transformed_events = [] + with ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + for event in events: + futures.append(executor.submit(transform_event, event)) + for future in as_completed(futures): + transformed_events.append(future.result()) + logging.info("Crossref event transformation complete") + return transformed_events + + +def transform_event(event: dict) -> dict: + """Transform the dictionary with event data by replacing '-' with '_' in key names, converting all int values to + string except for the 'total' field and parsing datetime columns for a valid datetime. + + :param event: The event dictionary + :return: The transformed event dictionary + """ + + if isinstance(event, (str, int, float)): + return event + if isinstance(event, dict): + new = event.__class__() + for k, v in event.items(): + if isinstance(v, int) and k != "total": + v = str(v) + if k in ["timestamp", "occurred_at", "issued", "dateModified", "updated_date"]: + try: + v = str(pendulum.parse(v)) + except ValueError: + v = "0001-01-01T00:00:00Z" + + # Replace hyphens with underscores for BigQuery compatibility + k = k.replace("-", "_") + + # Replace @ symbol in keys left by DataCite between 15 and 22 March 2019 + k = k.replace("@", "") + + new[k] = transform_event(v) + return new + + +def copy_latest_export_tables( + project_id: str, from_dataset: str, to_dataset: str, date_match: str, data_location: str, description: str = None +) -> None: + """Creates copies of all sharded tables from a dataset with a matching date string. + + :param project_id: The project id + :param from_dataset: The dataset containing the sharded tables + :param to_dataset: The dataset to contain the copied tables - will create if does not exist + :param date_match: The date string to match. e.g. 
for a table named 'this_table20220101', this would be '20220101' + :param data_location: The regional location of the data in google cloud + :param description: The description for dataset housing the copied tables + """ + + if description is None: + description = "OAEBU Export tables for Dashboarding" + + # Make to_dataset if it doesn't exist + bq_create_dataset( + project_id=project_id, + dataset_id=to_dataset, + location=data_location, + description=description, + ) + + # Get the tables from the from_dataset + client = Client(project_id) + tables = [t.table_id for t in client.list_tables(from_dataset)] + + # Find the tables with specified date string + regex_string = rf"^\w+{date_match}\b" + matched_tables = [re.findall(regex_string, t) for t in tables] + matched_tables = [t[0] for t in matched_tables if t] + assert len(matched_tables), f"No tables matching date {date_match} in dataset {project_id}.{from_dataset}" + + # Copy all of the tables + for table in matched_tables: + table_id = bq_table_id(project_id, from_dataset, table) + unsharded_name = re.match(r"(.*?)\d{8}$", table).group(1) # Drop the date from the table for copied table + copy_table_id = bq_table_id(project_id, to_dataset, unsharded_name) + bq_copy_table( + src_table_id=table_id, dst_table_id=copy_table_id, write_disposition="WRITE_TRUNCATE", client=client + ) + + +def get_onix_records(table_id: str, client: Client = None) -> List[dict]: + """Fetch the latest onix snapshot from BigQuery. + :param table_id: Fully qualified table ID. + :return: List of onix product records. + """ + + sql = f"SELECT * FROM {table_id}" + records = bq_run_query(sql, client=client) + products = [{key: records[i][key] for key in records[i].keys()} for i in range(len(records))] + return products + + +def get_isbn_utils_sql_string() -> str: + """Load the ISBN utils sql functions. + :return BQ SQL string. + """ + + isbn_utils_file = "isbn_utils.sql" + isbn_utils_path = os.path.join(sql_folder(workflow_module="onix_workflow"), isbn_utils_file) + with open(isbn_utils_path, "r") as f: + isbn_utils_sql = f.read() + + return isbn_utils_sql + + +def create_data_partner_env(main_template: str, data_partners: Iterable[DataPartner]) -> Environment: + """Creates a jinja2 environment for any number of data partners + + :param main_template: The name of the main jinja2 template + :param data_partners: The data partners + :return: Jinja2 environment with data partners sql folders loaded + """ + + directories = [dp.sql_directory for dp in data_partners] + with open(main_template) as f: + contents = f.read() + loader = FileSystemLoader(directories) + env = Environment(loader=loader).from_string(contents) + return env + + +def insert_into_schema(schema_base: List[dict], insert_field: dict, schema_field_name: Optional[str] = None): + """ + Inserts a given field into a schema. + + :param schema_base: (List[dict]): The base schema to insert the field into. + :param insert_field: (dict): The field to be inserted into the schema. + :param schema_field_name: (Optional[str], optional): The name of the field in the schema. + If provided, the field will be inserted into the matching field. + If not provided, the field will be appended to the end of the schema. + :return: The updated schema with the field inserted. + + Raises ValueError If the provided schema_field_name is not found in the schema. 
+ """ + + if schema_field_name: + field_found = False + for row in schema_base: + if row["name"] == schema_field_name: + field_found = True + row["fields"].append(insert_field) + break + if not field_found: + raise ValueError(f"Field {schema_field_name} not found in schema") + else: + schema_base.append(insert_field) + + return schema_base + + +@provide_session +def latest_execution_timedelta( + data_interval_start: datetime, ext_dag_id: str, session: scoped_session = None, **context +) -> int: + """ + Get the latest execution for a given external dag and returns its data_interval_start (logical date) + + :param ext_dag_id: The dag_id to get the latest execution date for. + :return: The latest execution date in the window. + """ + dagruns = ( + session.query(DagRun) + .filter( + DagRun.dag_id == ext_dag_id, + ) + .all() + ) + dates = [d.data_interval_start for d in dagruns] # data_interval start is what ExternalTaskSensor checks + dates.sort(reverse=True) + + if not len(dates): # If no execution is found return the logical date for the Workflow + logging.warn(f"No Executions found for dag id: {ext_dag_id}") + return data_interval_start + + return dates[0] diff --git a/oaebu_workflows/jstor_telescope/tests/__init__.py b/dags/oaebu_workflows/onix_workflow/schema/__init__.py similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/__init__.py rename to dags/oaebu_workflows/onix_workflow/schema/__init__.py diff --git a/oaebu_workflows/onix_workflow/schema/book.json b/dags/oaebu_workflows/onix_workflow/schema/book.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book.json rename to dags/oaebu_workflows/onix_workflow/schema/book.json diff --git a/oaebu_workflows/onix_workflow/schema/book_institution_list.json b/dags/oaebu_workflows/onix_workflow/schema/book_institution_list.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_institution_list.json rename to dags/oaebu_workflows/onix_workflow/schema/book_institution_list.json diff --git a/oaebu_workflows/onix_workflow/schema/book_list.json b/dags/oaebu_workflows/onix_workflow/schema/book_list.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_list.json rename to dags/oaebu_workflows/onix_workflow/schema/book_list.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_author.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_author.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_author.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_author.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_city.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_city.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_city.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_country.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_country.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_country.json diff --git 
a/oaebu_workflows/onix_workflow/schema/book_metrics_events.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_events.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_events.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_institution.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_institution.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bic.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bic.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_subject_bic.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bic.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bisac.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bisac.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_subject_bisac.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_bisac.json diff --git a/oaebu_workflows/onix_workflow/schema/book_metrics_subject_thema.json b/dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_thema.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_metrics_subject_thema.json rename to dags/oaebu_workflows/onix_workflow/schema/book_metrics_subject_thema.json diff --git a/oaebu_workflows/onix_workflow/schema/book_product.json b/dags/oaebu_workflows/onix_workflow/schema/book_product.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/book_product.json rename to dags/oaebu_workflows/onix_workflow/schema/book_product.json diff --git a/oaebu_workflows/onix_workflow/schema/crossref_events.json b/dags/oaebu_workflows/onix_workflow/schema/crossref_events.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/crossref_events.json rename to dags/oaebu_workflows/onix_workflow/schema/crossref_events.json diff --git a/oaebu_workflows/onix_workflow/schema/crossref_metadata.json b/dags/oaebu_workflows/onix_workflow/schema/crossref_metadata.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/crossref_metadata.json rename to dags/oaebu_workflows/onix_workflow/schema/crossref_metadata.json diff --git a/oaebu_workflows/onix_workflow/schema/onix_aggregate_metrics.json b/dags/oaebu_workflows/onix_workflow/schema/onix_aggregate_metrics.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/onix_aggregate_metrics.json rename to dags/oaebu_workflows/onix_workflow/schema/onix_aggregate_metrics.json diff --git a/oaebu_workflows/onix_workflow/schema/onix_invalid_isbn.json b/dags/oaebu_workflows/onix_workflow/schema/onix_invalid_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/onix_invalid_isbn.json rename to dags/oaebu_workflows/onix_workflow/schema/onix_invalid_isbn.json diff --git a/oaebu_workflows/onix_workflow/schema/onix_workfamilyid_isbn.json b/dags/oaebu_workflows/onix_workflow/schema/onix_workfamilyid_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/onix_workfamilyid_isbn.json rename to dags/oaebu_workflows/onix_workflow/schema/onix_workfamilyid_isbn.json diff --git 
a/oaebu_workflows/onix_workflow/schema/onix_workid_isbn.json b/dags/oaebu_workflows/onix_workflow/schema/onix_workid_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/onix_workid_isbn.json rename to dags/oaebu_workflows/onix_workflow/schema/onix_workid_isbn.json diff --git a/oaebu_workflows/onix_workflow/schema/onix_workid_isbn_errors.json b/dags/oaebu_workflows/onix_workflow/schema/onix_workid_isbn_errors.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/onix_workid_isbn_errors.json rename to dags/oaebu_workflows/onix_workflow/schema/onix_workid_isbn_errors.json diff --git a/oaebu_workflows/onix_workflow/schema/platform_invalid_isbn.json b/dags/oaebu_workflows/onix_workflow/schema/platform_invalid_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/platform_invalid_isbn.json rename to dags/oaebu_workflows/onix_workflow/schema/platform_invalid_isbn.json diff --git a/oaebu_workflows/onix_workflow/schema/platform_unmatched_isbn.json b/dags/oaebu_workflows/onix_workflow/schema/platform_unmatched_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/schema/platform_unmatched_isbn.json rename to dags/oaebu_workflows/onix_workflow/schema/platform_unmatched_isbn.json diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/__init__.py b/dags/oaebu_workflows/onix_workflow/sql/__init__.py similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/__init__.py rename to dags/oaebu_workflows/onix_workflow/sql/__init__.py diff --git a/oaebu_workflows/onix_workflow/sql/assign_workid_workfamilyid.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/assign_workid_workfamilyid.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/assign_workid_workfamilyid.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/assign_workid_workfamilyid.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_institution_list.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_list.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_author.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 
b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_city.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_country.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_events.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_institution.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bic.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_bisac.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_metrics_subject_thema.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/book_product.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/crossref_events_filter_doi.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/crossref_events_filter_doi.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/crossref_events_filter_doi.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/crossref_events_filter_doi.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/crossref_metadata_filter_isbn.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/crossref_metadata_filter_isbn.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/crossref_metadata_filter_isbn.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/crossref_metadata_filter_isbn.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/isbn_utils.sql b/dags/oaebu_workflows/onix_workflow/sql/isbn_utils.sql similarity index 100% rename from oaebu_workflows/onix_workflow/sql/isbn_utils.sql rename 
to dags/oaebu_workflows/onix_workflow/sql/isbn_utils.sql diff --git a/oaebu_workflows/onix_workflow/sql/oaebu_intermediate_metrics.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/oaebu_intermediate_metrics.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/oaebu_intermediate_metrics.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/oaebu_intermediate_metrics.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/onix_aggregate_metrics.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/onix_aggregate_metrics.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/onix_aggregate_metrics.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/onix_aggregate_metrics.sql.jinja2 diff --git a/oaebu_workflows/onix_workflow/sql/validate_isbn.sql.jinja2 b/dags/oaebu_workflows/onix_workflow/sql/validate_isbn.sql.jinja2 similarity index 100% rename from oaebu_workflows/onix_workflow/sql/validate_isbn.sql.jinja2 rename to dags/oaebu_workflows/onix_workflow/sql/validate_isbn.sql.jinja2 diff --git a/oaebu_workflows/oapen_metadata_telescope/__init__.py b/dags/oaebu_workflows/schema/__init__.py similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/__init__.py rename to dags/oaebu_workflows/schema/__init__.py diff --git a/oaebu_workflows/schema/internet_archive/book_metrics_country_internet_archive.json b/dags/oaebu_workflows/schema/internet_archive/book_metrics_country_internet_archive.json similarity index 100% rename from oaebu_workflows/schema/internet_archive/book_metrics_country_internet_archive.json rename to dags/oaebu_workflows/schema/internet_archive/book_metrics_country_internet_archive.json diff --git a/oaebu_workflows/schema/internet_archive/book_metrics_internet_archive.json b/dags/oaebu_workflows/schema/internet_archive/book_metrics_internet_archive.json similarity index 100% rename from oaebu_workflows/schema/internet_archive/book_metrics_internet_archive.json rename to dags/oaebu_workflows/schema/internet_archive/book_metrics_internet_archive.json diff --git a/oaebu_workflows/schema/internet_archive/book_product_metadata_internet_archive.json b/dags/oaebu_workflows/schema/internet_archive/book_product_metadata_internet_archive.json similarity index 100% rename from oaebu_workflows/schema/internet_archive/book_product_metadata_internet_archive.json rename to dags/oaebu_workflows/schema/internet_archive/book_product_metadata_internet_archive.json diff --git a/oaebu_workflows/schema/internet_archive/book_product_metrics_internet_archive.json b/dags/oaebu_workflows/schema/internet_archive/book_product_metrics_internet_archive.json similarity index 100% rename from oaebu_workflows/schema/internet_archive/book_product_metrics_internet_archive.json rename to dags/oaebu_workflows/schema/internet_archive/book_product_metrics_internet_archive.json diff --git a/oaebu_workflows/schema/internet_archive/internet_archive.json b/dags/oaebu_workflows/schema/internet_archive/internet_archive.json similarity index 100% rename from oaebu_workflows/schema/internet_archive/internet_archive.json rename to dags/oaebu_workflows/schema/internet_archive/internet_archive.json diff --git a/oaebu_workflows/schema/worldreader/book_metrics_country_worldreader.json b/dags/oaebu_workflows/schema/worldreader/book_metrics_country_worldreader.json similarity index 100% rename from oaebu_workflows/schema/worldreader/book_metrics_country_worldreader.json rename to 
dags/oaebu_workflows/schema/worldreader/book_metrics_country_worldreader.json diff --git a/oaebu_workflows/schema/worldreader/book_metrics_worldreader.json b/dags/oaebu_workflows/schema/worldreader/book_metrics_worldreader.json similarity index 100% rename from oaebu_workflows/schema/worldreader/book_metrics_worldreader.json rename to dags/oaebu_workflows/schema/worldreader/book_metrics_worldreader.json diff --git a/oaebu_workflows/schema/worldreader/book_product_metadata_worldreader.json b/dags/oaebu_workflows/schema/worldreader/book_product_metadata_worldreader.json similarity index 100% rename from oaebu_workflows/schema/worldreader/book_product_metadata_worldreader.json rename to dags/oaebu_workflows/schema/worldreader/book_product_metadata_worldreader.json diff --git a/oaebu_workflows/schema/worldreader/book_product_metrics_worldreader.json b/dags/oaebu_workflows/schema/worldreader/book_product_metrics_worldreader.json similarity index 100% rename from oaebu_workflows/schema/worldreader/book_product_metrics_worldreader.json rename to dags/oaebu_workflows/schema/worldreader/book_product_metrics_worldreader.json diff --git a/oaebu_workflows/schema/worldreader/worldreader.json b/dags/oaebu_workflows/schema/worldreader/worldreader.json similarity index 100% rename from oaebu_workflows/schema/worldreader/worldreader.json rename to dags/oaebu_workflows/schema/worldreader/worldreader.json diff --git a/oaebu_workflows/oapen_metadata_telescope/schema/__init__.py b/dags/oaebu_workflows/sql/__init__.py similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/schema/__init__.py rename to dags/oaebu_workflows/sql/__init__.py diff --git a/oaebu_workflows/oapen_metadata_telescope/sql/__init__.py b/dags/oaebu_workflows/sql/internet_archive/__init__.py similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/sql/__init__.py rename to dags/oaebu_workflows/sql/internet_archive/__init__.py diff --git a/oaebu_workflows/sql/internet_archive/book_metrics_internet_archive.sql b/dags/oaebu_workflows/sql/internet_archive/book_metrics_internet_archive.sql similarity index 100% rename from oaebu_workflows/sql/internet_archive/book_metrics_internet_archive.sql rename to dags/oaebu_workflows/sql/internet_archive/book_metrics_internet_archive.sql diff --git a/oaebu_workflows/sql/internet_archive/book_product_body_internet_archive.sql.jinja2 b/dags/oaebu_workflows/sql/internet_archive/book_product_body_internet_archive.sql.jinja2 similarity index 100% rename from oaebu_workflows/sql/internet_archive/book_product_body_internet_archive.sql.jinja2 rename to dags/oaebu_workflows/sql/internet_archive/book_product_body_internet_archive.sql.jinja2 diff --git a/oaebu_workflows/sql/internet_archive/month_null_internet_archive.sql b/dags/oaebu_workflows/sql/internet_archive/month_null_internet_archive.sql similarity index 100% rename from oaebu_workflows/sql/internet_archive/month_null_internet_archive.sql rename to dags/oaebu_workflows/sql/internet_archive/month_null_internet_archive.sql diff --git a/oaebu_workflows/sql/worldreader/book_metrics_country_body_worldreader.sql.jinja2 b/dags/oaebu_workflows/sql/worldreader/book_metrics_country_body_worldreader.sql.jinja2 similarity index 100% rename from oaebu_workflows/sql/worldreader/book_metrics_country_body_worldreader.sql.jinja2 rename to dags/oaebu_workflows/sql/worldreader/book_metrics_country_body_worldreader.sql.jinja2 diff --git a/oaebu_workflows/sql/worldreader/book_metrics_country_join_worldreader.sql 
b/dags/oaebu_workflows/sql/worldreader/book_metrics_country_join_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/book_metrics_country_join_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/book_metrics_country_join_worldreader.sql diff --git a/oaebu_workflows/sql/worldreader/book_metrics_country_null_worldreader.sql b/dags/oaebu_workflows/sql/worldreader/book_metrics_country_null_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/book_metrics_country_null_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/book_metrics_country_null_worldreader.sql diff --git a/oaebu_workflows/sql/worldreader/book_metrics_country_struct_worldreader.sql b/dags/oaebu_workflows/sql/worldreader/book_metrics_country_struct_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/book_metrics_country_struct_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/book_metrics_country_struct_worldreader.sql diff --git a/oaebu_workflows/sql/worldreader/book_metrics_worldreader.sql b/dags/oaebu_workflows/sql/worldreader/book_metrics_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/book_metrics_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/book_metrics_worldreader.sql diff --git a/oaebu_workflows/sql/worldreader/book_product_body_worldreader.sql.jinja2 b/dags/oaebu_workflows/sql/worldreader/book_product_body_worldreader.sql.jinja2 similarity index 100% rename from oaebu_workflows/sql/worldreader/book_product_body_worldreader.sql.jinja2 rename to dags/oaebu_workflows/sql/worldreader/book_product_body_worldreader.sql.jinja2 diff --git a/oaebu_workflows/sql/worldreader/book_product_functions_worldreader.sql b/dags/oaebu_workflows/sql/worldreader/book_product_functions_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/book_product_functions_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/book_product_functions_worldreader.sql diff --git a/oaebu_workflows/sql/worldreader/month_null_worldreader.sql b/dags/oaebu_workflows/sql/worldreader/month_null_worldreader.sql similarity index 100% rename from oaebu_workflows/sql/worldreader/month_null_worldreader.sql rename to dags/oaebu_workflows/sql/worldreader/month_null_worldreader.sql diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/__init__.py b/dags/oaebu_workflows/thoth_telescope/__init__.py similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/__init__.py rename to dags/oaebu_workflows/thoth_telescope/__init__.py diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/__init__.py b/dags/oaebu_workflows/thoth_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/__init__.py rename to dags/oaebu_workflows/thoth_telescope/schema/__init__.py diff --git a/oaebu_workflows/onix_telescope/__init__.py b/dags/oaebu_workflows/thoth_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/onix_telescope/__init__.py rename to dags/oaebu_workflows/thoth_telescope/sql/__init__.py diff --git a/dags/oaebu_workflows/thoth_telescope/thoth_telescope.py b/dags/oaebu_workflows/thoth_telescope/thoth_telescope.py new file mode 100644 index 00000000..bbb43831 --- /dev/null +++ b/dags/oaebu_workflows/thoth_telescope/thoth_telescope.py @@ -0,0 +1,303 @@ +# Copyright 2023-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Keegan Smith + +import os +import logging +from typing import Union + +import pendulum +from pendulum.datetime import DateTime +from airflow.decorators import dag, task +from airflow.exceptions import AirflowException +from google.cloud.bigquery import SourceFormat, Client + +from oaebu_workflows.onix_utils import OnixTransformer +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.google.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.url_utils import retry_get_url +from observatory_platform.google.gcs import gcs_upload_files, gcs_blob_name_from_path, gcs_blob_uri, gcs_download_blob +from observatory_platform.airflow.release import SnapshotRelease, set_task_state, make_snapshot_date +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback + + +THOTH_URL = "{host_name}/specifications/{format_specification}/publisher/{publisher_id}" +DEFAULT_HOST_NAME = "https://export.thoth.pub" + + +class ThothRelease(SnapshotRelease): + def __init__( + self, + *, + dag_id: str, + run_id: str, + snapshot_date: DateTime, + ): + """Construct a ThothRelease. 
+ :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID + :param snapshot_date: The date of the snapshot/release + """ + super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) + self.download_file_name = f"thoth_{snapshot_date.format('YYYY_MM_DD')}.xml" + self.transform_file_name = "transformed.jsonl.gz" + + @property + def download_path(self) -> str: + return os.path.join(self.download_folder, self.download_file_name) + + @property + def transform_path(self) -> str: + return os.path.join(self.transform_folder, self.transform_file_name) + + @property + def download_blob_name(self): + return gcs_blob_name_from_path(self.download_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return ThothRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + snapshot_date=pendulum.from_format(dict_["snapshot_date"], "YYYY-MM-DD"), + ) + + def to_dict(self) -> dict: + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "snapshot_date": self.snapshot_date.to_date_string(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + publisher_id: str, + format_specification: str, + elevate_related_products: bool = False, + metadata_partner: Union[str, OaebuPartner] = "thoth", + bq_dataset_description: str = "Thoth ONIX Feed", + bq_table_description: str = "Thoth ONIX Feed", + api_dataset_id: str = "onix", + catchup: bool = False, + start_date: DateTime = pendulum.datetime(2022, 12, 1), + schedule: str = "0 12 * * Sun", # Midday every Sunday + max_active_runs: int = 1, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct a Thoth DAG. + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param publisher_id: The Thoth ID for this publisher + :param format_specification: The Thoth ONIX/metadata format specification. e.g. "onix_3.0::oapen" + :param elevate_related_products: Whether to pull out the related products to the product level. + :param metadata_partner: The metadata partner name + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the BigQuery table + :param api_dataset_id: The ID to store the dataset release in the API + :param catchup: Whether to catchup the DAG or not + :param start_date: The start date of the DAG + :param schedule: The schedule interval of the DAG + :param max_active_runs: The maximum number of active DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + + metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) + + @dag( + dag_id=dag_id, + start_date=start_date, + schedule=schedule, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def thoth_telescope(): + @task() + def make_release(**content) -> dict: + """Creates a new Thoth release instance + + :param content: the context passed from the PythonOperator.
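For context, a DAG definition file would normally call the create_dag factory above to register the Thoth telescope. The sketch below is illustrative only: it assumes CloudWorkspace can be built directly from its project and bucket fields, and the publisher UUID and workspace values are placeholders rather than values taken from this repository.

from oaebu_workflows.thoth_telescope.thoth_telescope import create_dag
from observatory_platform.airflow.workflow import CloudWorkspace

# Placeholder workspace; real deployments load these values from configuration.
workspace = CloudWorkspace(
    project_id="example-project",
    download_bucket="example-download-bucket",
    transform_bucket="example-transform-bucket",
    data_location="us",
)

dag = create_dag(
    dag_id="thoth_onix_example_press",
    cloud_workspace=workspace,
    publisher_id="00000000-0000-0000-0000-000000000000",  # Thoth publisher UUID (placeholder)
    format_specification="onix_3.0::oapen",
)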
+ See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed + :return: The Thoth release instance + """ + + snapshot_date = make_snapshot_date(**content) + return ThothRelease(dag_id=dag_id, run_id=content["run_id"], snapshot_date=snapshot_date).to_dict() + + @task() + def download(release: dict, **content) -> None: + """Task to download the ONIX release from Thoth. + + :param release: The Thoth release instance + """ + + release = ThothRelease.from_dict(release) + thoth_download_onix( + publisher_id=publisher_id, + format_spec=format_specification, + download_path=release.download_path, + ) + success = gcs_upload_files(bucket_name=cloud_workspace.download_bucket, file_paths=[release.download_path]) + set_task_state(success, content["ti"].task_id, release=release) + + @task() + def transform(release: dict, **content) -> None: + """Task to transform the Thoth ONIX data""" + + release = ThothRelease.from_dict(release) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_blob_name, + file_path=release.download_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_blob_name}") + + transformer = OnixTransformer( + input_path=release.download_path, + output_dir=release.transform_folder, + deduplicate_related_products=elevate_related_products, + elevate_related_products=elevate_related_products, + add_name_fields=True, + collapse_subjects=True, + ) + out_file = transformer.transform() + if release.transform_path != out_file: + raise FileNotFoundError( + f"Expected file {release.transform_path} not equal to transformed file: {out_file}" + ) + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, file_paths=[release.transform_path] + ) + set_task_state(success, content["ti"].task_id, release=release) + + @task() + def bq_load(release: dict, **content) -> None: + """Task to load the transformed ONIX jsonl file to BigQuery.""" + + release = ThothRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + dataset_id=metadata_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) + table_id = bq_sharded_table_id( + cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.snapshot_date, + ) + client = Client(project=cloud_workspace.project_id) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=metadata_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + table_description=bq_table_description, + client=client, + ) + set_task_state(state, content["ti"].task_id, release=release) + + @task() + def add_new_dataset_releases(release: dict, **content) -> None: + """Adds release information to API.""" + + release = ThothRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + snapshot_date=release.snapshot_date, + data_interval_start=content["data_interval_start"], + data_interval_end=content["data_interval_end"], + ) + 
api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **content) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = ThothRelease.from_dict(release) + cleanup(dag_id=dag_id, execution_date=content["execution_date"], workflow_folder=release.workflow_folder) + + task_check_dependencies = check_dependencies() + xcom_release = make_release() + task_download = download(xcom_release) + task_transform = transform(xcom_release) + task_bq_load = bq_load(xcom_release) + task_add_new_dataset_releases = add_new_dataset_releases(xcom_release) + task_cleanup_workflow = cleanup_workflow(xcom_release) + + ( + task_check_dependencies + >> xcom_release + >> task_download + >> task_transform + >> task_bq_load + >> task_add_new_dataset_releases + >> task_cleanup_workflow + ) + + return thoth_telescope() + + +def thoth_download_onix( + publisher_id: str, + download_path: str, + format_spec: str, + host_name: str = DEFAULT_HOST_NAME, + num_retries: int = 3, +) -> None: + """Hits the Thoth API and requests the ONIX feed for a particular publisher. + Creates a file called onix.xml at the specified location + + :param publisher_id: The ID of the publisher. Can be found using Thoth GraphiQL API + :param download_path: The path to download ONIX the file to + :param format_spec: The ONIX format specification to use. Options can be found with the /formats endpoint of the API + :param host_name: The Thoth host URL + :param num_retries: The number of times to retry the download, given an unsuccessful return code + """ + url = THOTH_URL.format(host_name=host_name, format_specification=format_spec, publisher_id=publisher_id) + logging.info(f"Downloading ONIX XML from {url}") + response = retry_get_url(url, num_retries=num_retries) + if response.status_code != 200: + raise AirflowException( + f"Request for URL {url} was unsuccessful with code: {response.status_code}\nContent response: {response.content.decode('utf-8')}" + ) + with open(download_path, "wb") as f: + f.write(response.content) diff --git a/oaebu_workflows/onix_telescope/schema/__init__.py b/dags/oaebu_workflows/ucl_discovery_telescope/__init__.py similarity index 100% rename from oaebu_workflows/onix_telescope/schema/__init__.py rename to dags/oaebu_workflows/ucl_discovery_telescope/__init__.py diff --git a/oaebu_workflows/onix_telescope/sql/__init__.py b/dags/oaebu_workflows/ucl_discovery_telescope/schema/__init__.py similarity index 100% rename from oaebu_workflows/onix_telescope/sql/__init__.py rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/__init__.py diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_country_ucl_discovery.json b/dags/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_country_ucl_discovery.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_country_ucl_discovery.json rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_country_ucl_discovery.json diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_ucl_discovery.json b/dags/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_ucl_discovery.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_ucl_discovery.json rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/book_metrics_ucl_discovery.json diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metadata_ucl_discovery.json 
b/dags/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metadata_ucl_discovery.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/book_product_metadata_ucl_discovery.json rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metadata_ucl_discovery.json diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metrics_ucl_discovery.json b/dags/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metrics_ucl_discovery.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/book_product_metrics_ucl_discovery.json rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/book_product_metrics_ucl_discovery.json diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/ucl_discovery.json b/dags/oaebu_workflows/ucl_discovery_telescope/schema/ucl_discovery.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/ucl_discovery.json rename to dags/oaebu_workflows/ucl_discovery_telescope/schema/ucl_discovery.json diff --git a/oaebu_workflows/onix_telescope/tests/__init__.py b/dags/oaebu_workflows/ucl_discovery_telescope/sql/__init__.py similarity index 100% rename from oaebu_workflows/onix_telescope/tests/__init__.py rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/__init__.py diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_body_ucl_discovery.sql.jinja2 b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_body_ucl_discovery.sql.jinja2 similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_body_ucl_discovery.sql.jinja2 rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_body_ucl_discovery.sql.jinja2 diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_join_ucl_discovery.sql b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_join_ucl_discovery.sql similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_join_ucl_discovery.sql rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_join_ucl_discovery.sql diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_null_ucl_discovery.sql b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_null_ucl_discovery.sql similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_null_ucl_discovery.sql rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_null_ucl_discovery.sql diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_struct_ucl_discovery.sql b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_struct_ucl_discovery.sql similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_struct_ucl_discovery.sql rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_country_struct_ucl_discovery.sql diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_ucl_discovery.sql b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_ucl_discovery.sql similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_ucl_discovery.sql rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_metrics_ucl_discovery.sql diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/book_product_body_ucl_discovery.sql.jinja2 
b/dags/oaebu_workflows/ucl_discovery_telescope/sql/book_product_body_ucl_discovery.sql.jinja2 similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/book_product_body_ucl_discovery.sql.jinja2 rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/book_product_body_ucl_discovery.sql.jinja2 diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/month_null_ucl_discovery.sql b/dags/oaebu_workflows/ucl_discovery_telescope/sql/month_null_ucl_discovery.sql similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/month_null_ucl_discovery.sql rename to dags/oaebu_workflows/ucl_discovery_telescope/sql/month_null_ucl_discovery.sql diff --git a/dags/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py b/dags/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py new file mode 100644 index 00000000..897fc846 --- /dev/null +++ b/dags/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py @@ -0,0 +1,465 @@ +# Copyright 2023-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Aniek Roelofs, Keegan Smith + + +import logging +import os +from typing import List, Union +from concurrent.futures import ThreadPoolExecutor, as_completed + +import pendulum +from airflow.decorators import dag, task +from airflow.hooks.base import BaseHook +from google.cloud.bigquery import SourceFormat, TimePartitioningType, WriteDisposition, Client +from google.oauth2 import service_account +from apiclient import discovery + +from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str +from observatory_platform.dataset_api import DatasetAPI, DatasetRelease +from observatory_platform.files import save_jsonl_gz, load_jsonl +from observatory_platform.google.gcs import gcs_blob_uri, gcs_upload_files, gcs_blob_name_from_path, gcs_download_blob +from observatory_platform.google.bigquery import bq_load_table, bq_table_id, bq_create_dataset +from observatory_platform.url_utils import retry_get_url +from observatory_platform.airflow.tasks import check_dependencies +from observatory_platform.files import add_partition_date +from observatory_platform.airflow.release import PartitionRelease, set_task_state +from observatory_platform.airflow.workflow import CloudWorkspace, cleanup +from observatory_platform.airflow.airflow import on_failure_callback + + +class UclDiscoveryRelease(PartitionRelease): + def __init__( + self, + dag_id: str, + run_id: str, + data_interval_start: pendulum.DateTime, + data_interval_end: pendulum.DateTime, + partition_date: pendulum.DateTime, + ): + """Construct a UclDiscoveryRelease instance. + + :param dag_id: The ID of the DAG + :param run_id: The Airflow run ID. + :param data_interval_start: The start of the data interval. + :param data_interval_end: The end of the data interval. + :param partition_date: The partition date for this release. 
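Like the Thoth release above, UclDiscoveryRelease is passed between tasks as a plain dictionary via XCom; the to_dict/from_dict pair defined just below round-trips the date fields as YYYY-MM-DD strings. A minimal sketch, assuming the release class can be instantiated outside a running DAG (all values are illustrative):

import pendulum

release = UclDiscoveryRelease(
    dag_id="ucl_discovery",
    run_id="manual__2024-01-01T00:00:00+00:00",
    data_interval_start=pendulum.datetime(2023, 12, 1),
    data_interval_end=pendulum.datetime(2024, 1, 1),
    partition_date=pendulum.datetime(2023, 12, 31),
)
payload = release.to_dict()  # JSON-serialisable, safe to push to XCom
restored = UclDiscoveryRelease.from_dict(payload)
assert restored.partition_date.to_date_string() == "2023-12-31"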
+ """ + super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) + self.data_interval_start = data_interval_start + self.data_interval_end = data_interval_end + self.download_country_file_name = "ucl_discovery_country.jsonl.gz" + self.download_totals_file_name = "ucl_discovery_totals.jsonl.gz" + self.transform_file_name = "ucl_discovery.jsonl.gz" + + @property + def download_country_path(self): + return os.path.join(self.download_folder, "ucl_discovery_country.jsonl.gz") + + @property + def download_totals_path(self): + return os.path.join(self.download_folder, "ucl_discovery_totals.jsonl.gz") + + @property + def transform_path(self): + return os.path.join(self.transform_folder, "ucl_discovery.jsonl.gz") + + @property + def download_country_blob_name(self): + return gcs_blob_name_from_path(self.download_country_path) + + @property + def download_totals_blob_name(self): + return gcs_blob_name_from_path(self.download_totals_path) + + @property + def transform_blob_name(self): + return gcs_blob_name_from_path(self.transform_path) + + @staticmethod + def from_dict(dict_: dict): + return UclDiscoveryRelease( + dag_id=dict_["dag_id"], + run_id=dict_["run_id"], + data_interval_start=pendulum.from_format(dict_["data_interval_start"], "YYYY-MM-DD"), + data_interval_end=pendulum.from_format(dict_["data_interval_end"], "YYYY-MM-DD"), + partition_date=pendulum.from_format(dict_["partition_date"], "YYYY-MM-DD"), + ) + + def to_dict(self) -> dict: + return { + "dag_id": self.dag_id, + "run_id": self.run_id, + "data_interval_start": self.data_interval_start.to_date_string(), + "data_interval_end": self.data_interval_end.to_date_string(), + "partition_date": self.partition_date.to_date_string(), + } + + +def create_dag( + *, + dag_id: str, + cloud_workspace: CloudWorkspace, + sheet_id: str, + data_partner: Union[str, OaebuPartner] = "ucl_discovery", + bq_dataset_description: str = "UCL Discovery dataset", + bq_table_description: str = "UCL Discovery table", + api_dataset_id: str = "ucl", + oaebu_service_account_conn_id: str = "oaebu_service_account", + max_threads: int = os.cpu_count() * 2, + schedule: str = "0 0 4 * *", # run on the 4th of every month + start_date: pendulum.DateTime = pendulum.datetime(2015, 6, 1), + catchup: bool = True, + max_active_runs: int = 10, + retries: int = 3, + retry_delay: Union[int, float] = 5, +): + """Construct a UclDiscovery DAG. 
+ + :param dag_id: The ID of the DAG + :param cloud_workspace: The CloudWorkspace object for this DAG + :param sheet_id: The ID of the Google sheet that maps eprint IDs to ISBN13 + :param data_partner: The name of the data partner + :param bq_dataset_description: Description for the BigQuery dataset + :param bq_table_description: Description for the BigQuery table + :param api_dataset_id: The ID to store the dataset release in the API + :param oaebu_service_account_conn_id: Airflow connection ID for the oaebu service account + :param max_threads: The maximum number of threads to utilise for parallel processes + :param schedule: The schedule interval of the DAG + :param start_date: The start date of the DAG + :param catchup: Whether to catchup the DAG or not + :param max_active_runs: The maximum number of concurrent DAG runs + :param retries: The number of times to retry failed tasks + :param retry_delay: The delay between retries in minutes + """ + data_partner = partner_from_str(data_partner) + + @dag( + dag_id=dag_id, + start_date=start_date, + schedule=schedule, + catchup=catchup, + tags=["oaebu"], + max_active_runs=max_active_runs, + default_args=dict( + retries=retries, retry_delay=pendulum.duration(minutes=retry_delay), on_failure_callback=on_failure_callback + ), + ) + def ucl_discovery(): + @task() + def make_release(**context) -> dict: + """Creates a new UCL Discovery release instance + + :param context: the context passed from the PythonOperator. + See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed + :return: The UCL Discovery release instance, serialised as a dict. + """ + + data_interval_start = context["data_interval_start"].start_of("month") + data_interval_end = context["data_interval_end"].start_of("month") + partition_date = data_interval_start.end_of("month") + run_id = context["run_id"] + + logging.info( + f"Interval Start: {data_interval_start}, Interval End: {data_interval_end}, Partition date: {partition_date}, Run ID: {run_id}" + ) + return UclDiscoveryRelease( + dag_id, + context["run_id"], + data_interval_start=data_interval_start, + data_interval_end=data_interval_end, + partition_date=partition_date, + ).to_dict() + + @task() + def download(release: dict, **context) -> None: + """Download the UCL Discovery data for a given release. + :param release: The UCL Discovery release.
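make_release above pins each release to calendar months: the data interval is normalised to the first of the month and the partition date becomes the last day of the interval's starting month. A short worked example with illustrative dates:

import pendulum

data_interval_start = pendulum.datetime(2023, 12, 1).start_of("month")  # 2023-12-01T00:00:00+00:00
data_interval_end = pendulum.datetime(2024, 1, 1).start_of("month")     # 2024-01-01T00:00:00+00:00
partition_date = data_interval_start.end_of("month")                    # 2023-12-31T23:59:59+00:00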
+ """ + + release = UclDiscoveryRelease.from_dict(release) + mappings = get_isbn_eprint_mappings(sheet_id, oaebu_service_account_conn_id, release.partition_date) + with ThreadPoolExecutor(max_threads) as executor: + futures = [] + for eprint_id in mappings.keys(): + future = executor.submit( + download_discovery_stats, eprint_id, release.data_interval_start, release.partition_date + ) + futures.append(future) + totals = [] + country = [] + for future in as_completed(futures): + c, t = future.result() + country.append(c) + totals.append(t) + + logging.info(f"Saving totals data to file: {release.download_totals_path}") + save_jsonl_gz(release.download_totals_path, totals) + logging.info(f"Saving country data to file: {release.download_country_path}") + save_jsonl_gz(release.download_country_path, country) + + success = gcs_upload_files( + bucket_name=cloud_workspace.download_bucket, + file_paths=[release.download_country_path, release.download_totals_path], + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def transform(release: dict, **context) -> None: + """Transform the ucl discovery data for a given release.""" + + release = UclDiscoveryRelease.from_dict(release) + # Download files from GCS + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_country_blob_name, + file_path=release.download_country_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_country_blob_name}") + success = gcs_download_blob( + bucket_name=cloud_workspace.download_bucket, + blob_name=release.download_totals_blob_name, + file_path=release.download_totals_path, + ) + if not success: + raise FileNotFoundError(f"Error downloading file: {release.download_totals_blob_name}") + + # Load the records and sort them by eprint id + mappings = get_isbn_eprint_mappings(sheet_id, oaebu_service_account_conn_id, release.partition_date) + country_records = load_jsonl(release.download_country_path) + totals_records = load_jsonl(release.download_totals_path) + country_records = sorted(country_records, key=lambda x: x["set"]["value"]) # ["set"]["value"] = eprint_id + totals_records = sorted(totals_records, key=lambda x: x["set"]["value"]) + if not len(country_records) == len(totals_records): + raise RuntimeError(f"{len(country_records)} != {len(totals_records)}") + + with ThreadPoolExecutor(max_threads) as executor: + futures = [] + for country_record, totals_record in zip(country_records, totals_records): + isbn = mappings[country_record["set"]["value"]]["ISBN13"] + title = mappings[country_record["set"]["value"]]["title"] + future = executor.submit(transform_discovery_stats, country_record, totals_record, isbn, title) + futures.append(future) + results = [] + for future in as_completed(futures): + results.append(future.result()) + + # Add the release date to the data as a parition field + results = add_partition_date( + results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date" + ) + save_jsonl_gz(release.transform_path, results) + success = gcs_upload_files( + bucket_name=cloud_workspace.transform_bucket, file_paths=[release.transform_path] + ) + set_task_state(success, context["ti"].task_id, release=release) + + @task() + def bq_load(release: dict, **context) -> None: + """Loads the transformed data into BigQuery""" + + release = UclDiscoveryRelease.from_dict(release) + bq_create_dataset( + project_id=cloud_workspace.project_id, + 
dataset_id=data_partner.bq_dataset_id, + location=cloud_workspace.data_location, + description=bq_dataset_description, + ) + + uri = gcs_blob_uri(cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) + table_id = bq_table_id(cloud_workspace.project_id, data_partner.bq_dataset_id, data_partner.bq_table_name) + client = Client(project=cloud_workspace.project_id) + state = bq_load_table( + uri=uri, + table_id=table_id, + schema_file_path=data_partner.schema_path, + source_format=SourceFormat.NEWLINE_DELIMITED_JSON, + partition_type=TimePartitioningType.MONTH, + partition=True, + partition_field="release_date", + write_disposition=WriteDisposition.WRITE_APPEND, + table_description=bq_table_description, + ignore_unknown_values=True, + client=client, + ) + set_task_state(state, context["ti"].task_id, release=release) + + @task() + def add_new_dataset_releases(release: dict, **context) -> None: + """Adds release information to API.""" + + release = UclDiscoveryRelease.from_dict(release) + client = Client(project=cloud_workspace.project_id) + api = DatasetAPI(project_id=cloud_workspace.project_id, dataset_id=api_dataset_id, client=client) + api.seed_db() + dataset_release = DatasetRelease( + dag_id=dag_id, + dataset_id=api_dataset_id, + dag_run_id=release.run_id, + created=pendulum.now(), + modified=pendulum.now(), + data_interval_start=context["data_interval_start"], + data_interval_end=context["data_interval_end"], + partition_date=release.partition_date, + ) + api.add_dataset_release(dataset_release) + + @task() + def cleanup_workflow(release: dict, **context) -> None: + """Delete all files, folders and XComs associated with this release.""" + + release = UclDiscoveryRelease.from_dict(release) + cleanup(dag_id=dag_id, execution_date=context["execution_date"], workflow_folder=release.workflow_folder) + + task_check_dependencies = check_dependencies(airflow_conns=[oaebu_service_account_conn_id]) + xcom_release = make_release() + task_download = download(xcom_release) + task_transform = transform(xcom_release) + task_bq_load = bq_load(xcom_release) + task_add_new_dataset_releases = add_new_dataset_releases(xcom_release) + task_cleanup_workflow = cleanup_workflow(xcom_release) + + ( + task_check_dependencies + >> xcom_release + >> task_download + >> task_transform + >> task_bq_load + >> task_add_new_dataset_releases + >> task_cleanup_workflow + ) + + return ucl_discovery() + + +def get_isbn_eprint_mappings(sheet_id: str, service_account_conn_id: str, cutoff_date: pendulum.DateTime) -> dict: + """Get the eprint id to isbn mapping from the google sheet + + :param sheet_id: The ID of the google sheet. + :param credentials: The credentials object to authenticate with. + :param cutoff_date: The cutoff date. If an item is published after this date, it will be skipped. 
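To make the expected sheet layout concrete: get_isbn_eprint_mappings reads a worksheet range named isbn_mapping whose header row must contain ISBN13, discovery_eprintid, date and title_list_title, and it returns a dict keyed by eprint ID. The rows below are invented purely for illustration:

# Example sheet contents (header row plus two data rows; values are made up):
sheet_contents = [
    ["ISBN13", "discovery_eprintid", "date", "title_list_title"],
    ["9781780000000", "10012345", "2020-05-01", "An Example Monograph"],
    ["9781780000017", "10067890", "2030-01-01", "Published After The Cutoff"],  # skipped by the cutoff check
]

# With a cutoff date of 2024-12-31 the function would return:
# {"10012345": {"ISBN13": "9781780000000", "title": "An Example Monograph"}}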
+ """ + scopes = [ + "https://www.googleapis.com/auth/drive", + "https://www.googleapis.com/auth/drive.file", + "https://www.googleapis.com/auth/spreadsheets", + ] + service_account_conn = BaseHook.get_connection(service_account_conn_id) + credentials = service_account.Credentials.from_service_account_info( + service_account_conn.extra_dejson, scopes=scopes + ) + service = discovery.build("sheets", "v4", credentials=credentials) + result = service.spreadsheets().values().get(spreadsheetId=sheet_id, range="isbn_mapping").execute() + sheet_contents = result.get("values") + if not sheet_contents: + raise ValueError(f"No content found for sheet with ID {sheet_id}") + + items = [] + header = sheet_contents[0] + if not all(heading in header for heading in ["ISBN13", "discovery_eprintid", "date", "title_list_title"]): + raise ValueError(f"Invalid header found for sheet: {header}") + for row in sheet_contents[1:]: + items.append(dict(zip(header, row))) + + mappings = {} + for item in items: + eprint_id = item.get("discovery_eprintid") + isbn = item.get("ISBN13") + title = item.get("title_list_title") + if not eprint_id or not isbn: + logging.warn(f"Item with missing information will be skipped: {item}") + continue + if pendulum.parse(item["date"]) > cutoff_date: + logging.info(f"Item released after cutoff date will be skipped: {item}") + continue + mappings[eprint_id] = {"ISBN13": isbn, "title": title} + + return mappings + + +def download_discovery_stats(eprint_id: str, start_date: pendulum.DateTime, end_date: pendulum.DateTime): + """Downloads the discovery stats for a given eprint ID within a specified date range. + + :param eprint_id: The eprint ID of the item to get the stats for. + :param start_date: The start date of the date range. + :param end_date: The end date of the date range. + :return: A tuple containing the country statistics and the total downloads statistics. 
+ """ + countries_url = ( + "https://discovery.ucl.ac.uk/cgi/stats/get" + f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}" + f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=countries&top=countries" + "&view=Table&limit=all&export=JSON" + ) + totals_url = ( + "https://discovery.ucl.ac.uk/cgi/stats/get" + f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}" + f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=downloads&graph_type=column" + "&view=Google%3A%3AGraph&date_resolution=month&title=Download+activity+-+last+12+months&export=JSON" + ) + response = retry_get_url(countries_url) + country = response.json() + response = retry_get_url(totals_url) + totals = response.json() + + # Perform some checks on the returned data + timescale = (start_date.format("YYYYMMDD"), end_date.format("YYYYMMDD")) + country_timescale = (country["timescale"]["from"], country["timescale"]["to"]) + totals_timescale = (totals["timescale"]["from"], totals["timescale"]["to"]) + if country_timescale != timescale or totals_timescale != timescale: + raise ValueError( + f"Invalid timescale value(s): country: {country['timescale']} | totals: {totals['timescale']} != {timescale}" + ) + if country["set"]["value"] != eprint_id or totals["set"]["value"] != eprint_id: + raise ValueError( + f"Invalid eprint ID values downloaded: {totals['set']['value']} | {country['set']['value']} != {eprint_id}" + ) + + return country, totals + + +def transform_discovery_stats(country_record: dict, totals_record: dict, isbn: str, title: str) -> dict: + """Transforms the discovery stats for a single set of records + + :param country_record: The country record + :param totals_record: The totals record + :param isbn: The isbn that matches the eprint id + :return: The transformed stats + """ + # Sanity check the records + country_eprint_id = country_record["set"]["value"] + totals_eprint_id = totals_record["set"]["value"] + if country_eprint_id != totals_eprint_id: + raise ValueError(f"Country and totals eprint ID do not match: {country_eprint_id} != {totals_eprint_id}") + + country_timescale = country_record["timescale"] + totals_timescale = totals_record["timescale"] + if country_timescale != totals_timescale: + raise ValueError(f"Timescales do not match: {country_timescale} != {totals_timescale}") + + # If there are no downloads for the time period, there is no "records" field in country stats + country_records = country_record.get("records", []) + + transformed = { + "ISBN": isbn, + "title": title, + "eprint_id": totals_record["set"]["value"], + "timescale": totals_record["timescale"], + "origin": totals_record["origin"], + "total_downloads": totals_record["records"][0]["count"], + "country": country_records, + } + return transformed diff --git a/oaebu_workflows/google_analytics3_telescope/google_analytics3_telescope.py b/oaebu_workflows/google_analytics3_telescope/google_analytics3_telescope.py deleted file mode 100644 index 8aca7e50..00000000 --- a/oaebu_workflows/google_analytics3_telescope/google_analytics3_telescope.py +++ /dev/null @@ -1,583 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Aniek Roelofs, Keegan Smith - -from __future__ import annotations - -import logging -import os -from typing import Dict, List, Tuple, Union - -import pendulum -from airflow.exceptions import AirflowException, AirflowSkipException -from airflow.hooks.base import BaseHook -from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition -from googleapiclient.discovery import Resource, build -from oauth2client.service_account import ServiceAccountCredentials - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.files import save_jsonl_gz -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import add_partition_date -from observatory.platform.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path -from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.workflows.workflow import ( - Workflow, - PartitionRelease, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -class GoogleAnalytics3Release(PartitionRelease): - def __init__( - self, - dag_id: str, - run_id: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - partition_date: pendulum.DateTime, - ): - """Construct a GoogleAnalytics3Release. - - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param data_interval_start: The start date of the DAG the start date of the download period. - :param data_interval_end: end date of the download period, also used as release date for BigQuery table and file paths - """ - super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) - self.data_interval_start = data_interval_start - self.data_interval_end = data_interval_end - self.transform_path = os.path.join(self.transform_folder, f"{partition_date.format('YYYY_MM_DD')}.json.gz") - - -class GoogleAnalytics3Telescope(Workflow): - """Google Analytics Telescope.""" - - ANU_ORG_NAME = "ANU Press" - - def __init__( - self, - dag_id: str, - organisation_name: str, - cloud_workspace: CloudWorkspace, - view_id: str, - pagepath_regex: str, - data_partner: Union[str, OaebuPartner] = "google_analytics3", - bq_dataset_description: str = "Data from Google sources", - bq_table_description: str = None, - api_dataset_id: str = "google_analytics", - oaebu_service_account_conn_id: str = "oaebu_service_account", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - catchup: bool = True, - start_date: pendulum.DateTime = pendulum.datetime(2018, 1, 1), - schedule: str = "@monthly", - ): - """Construct a GoogleAnalytics3Telescope instance. 
- :param dag_id: The ID of the DAG - :param organisation_name: The organisation name as per Google Analytics - :param cloud_workspace: The CloudWorkspace object for this DAG - :param view_id: The Google Analytics view ID - :param pagepath_regex: The pagepath regex - :param data_partner: The name of the data partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param oaebu_service_account_conn_id: Airflow connection ID for the OAEBU service account - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param catchup: Whether to catchup the DAG or not - :param start_date: The start date of the DAG - :param schedule: The schedule interval of the DAG - """ - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - airflow_conns=[oaebu_service_account_conn_id, observatory_api_conn_id], - tags=["oaebu"], - ) - - self.dag_id = dag_id - self.organisation_name = organisation_name - self.cloud_workspace = cloud_workspace - self.view_id = view_id - self.pagepath_regex = pagepath_regex - self.data_partner = partner_from_str(data_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.oaebu_service_account_conn_id = oaebu_service_account_conn_id - self.observatory_api_conn_id = observatory_api_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_task(self.download_transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> List[GoogleAnalytics3Release]: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. - - :param kwargs: the context passed from the PythonOperator. - See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: A list of grid release instances - """ - # Get start and end date (data_interval_end = release_date) - data_interval_start = kwargs["data_interval_start"].start_of("month") - data_interval_end = kwargs["data_interval_end"].start_of("month") - partition_date = data_interval_start.end_of("month") - - logging.info( - f"Start date: {data_interval_start}, end date:{data_interval_end}, parition_date: {partition_date}" - ) - releases = [ - GoogleAnalytics3Release( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - partition_date=partition_date, - ) - ] - return releases - - def check_dependencies(self, **kwargs) -> bool: - """Check dependencies of DAG. Add to parent method to additionally check for a view id and pagepath regex - - :param kwargs: the context passed from the Airflow Operator. - :return: True if dependencies are valid. 
- """ - super().check_dependencies() - - if self.view_id is None or self.pagepath_regex is None: - expected_extra = {"view_id": "the_view_id", "pagepath_regex": r"pagepath_regex"} - raise AirflowException( - f"View ID and/or pagepath regex is not set in 'extra' of telescope, extra example: " f"{expected_extra}" - ) - return True - - def download_transform(self, releases: List[GoogleAnalytics3Release], **kwargs) -> None: - """Task to download and transform the google analytics release for a given month. - - :param releases: a list with one google analytics release. - """ - data_found = False - for release in releases: - service = initialize_analyticsreporting(self.oaebu_service_account_conn_id) - results = get_reports( - service, - self.organisation_name, - self.view_id, - self.pagepath_regex, - release.data_interval_start, - release.data_interval_end.subtract( - days=1 - ), # Subtract 1 day because GA uses inclusive dates, Airlfow data intervals are not - ) - results = add_partition_date( - results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date" - ) - if results: - save_jsonl_gz(release.transform_path, results) - data_found = True - else: - if (pendulum.today("UTC") - self.data_interval_end).in_months() >= 26: - logging.info( - "No data available. Google Analytics data is only available for 26 months, see " - "https://support.google.com/analytics/answer/7667196?hl=en for more info" - ) - - if not data_found: - raise AirflowSkipException("No Google Analytics data available to download.") - - def upload_transformed(self, releases: List[GoogleAnalytics3Release], **kwargs) -> None: - """Uploads the transformed file to GCS""" - for release in releases: - state = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path] - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def bq_load(self, releases: List[GoogleAnalytics3Release], **kwargs) -> None: - """Loads the data into BigQuery""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.data_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - for release in releases: - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - table_id = bq_table_id( - self.cloud_workspace.project_id, self.data_partner.bq_dataset_id, self.data_partner.bq_table_name - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.data_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - partition_type=TimePartitioningType.MONTH, - partition=True, - partition_field="release_date", - write_disposition=WriteDisposition.WRITE_APPEND, - table_description=self.bq_table_description, - ignore_unknown_values=True, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, releases: List[GoogleAnalytics3Release], **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - for release in releases: - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - data_interval_start=release.data_interval_start, - data_interval_end=release.data_interval_end, - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, releases: 
List[GoogleAnalytics3Release], **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - for release in releases: - cleanup( - dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder - ) - - -def initialize_analyticsreporting(oaebu_service_account_conn_id: str) -> Resource: - """Initializes an Analytics Reporting API V4 service object. - - :return: An authorized Analytics Reporting API V4 service object. - """ - oaebu_account_conn = BaseHook.get_connection(oaebu_service_account_conn_id) - - scopes = ["https://www.googleapis.com/auth/analytics.readonly"] - creds = ServiceAccountCredentials.from_json_keyfile_dict(oaebu_account_conn.extra_dejson, scopes=scopes) - - # Build the service object. - service = build("analyticsreporting", "v4", credentials=creds, cache_discovery=False) - - return service - - -def list_all_books( - service: Resource, - view_id: str, - pagepath_regex: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - organisation_name: str, - metrics: list, -) -> Tuple[List[dict], list]: - """List all available books by getting all pagepaths of a view id in a given period. - Note: Google API will not return a result for any entry in which all supplied metrics are zero. - However, it will return 'some' results if you supply no metrics, contrary to the documentation. - Date ranges are inclusive. - - :param service: The Google Analytics Reporting service object. - :param view_id: The view id. - :param pagepath_regex: The regex expression for the pagepath of a book. - :param data_interval_start: The start date of the DAG Start date of analytics period - :param data_interval_end: End date of analytics period - :param organisation_name: The organisation name. - :param: metrics: The metrics to return return with the book results - :return: A list with dictionaries, one for each book entry (the dict contains the pagepath, title and average time - on page) and a list of all pagepaths. 
- """ - # Get pagepath, pagetitle and average time on page for each path - body = { - "reportRequests": [ - { - "viewId": view_id, - "pageSize": 10000, - "dateRanges": [ - { - "startDate": data_interval_start.strftime("%Y-%m-%d"), - "endDate": data_interval_end.strftime("%Y-%m-%d"), - } - ], - "metrics": metrics, - "dimensions": [{"name": "ga:pagepath"}, {"name": "ga:pageTitle"}], - "dimensionFilterClauses": [ - { - "operator": "AND", - "filters": [ - {"dimensionName": "ga:pagepath", "operator": "REGEXP", "expressions": [pagepath_regex]} - ], - } - ], - } - ] - } - - # add all 6 custom dimensions for anu press - if organisation_name == GoogleAnalytics3Telescope.ANU_ORG_NAME: - for i in range(1, 7): - body["reportRequests"][0]["dimensions"].append({"name": f"ga:dimension{str(i)}"}) - - reports = service.reports().batchGet(body=body).execute() - all_book_entries = reports["reports"][0]["data"].get("rows") - next_page_token = reports["reports"][0].get("nextPageToken") - - while next_page_token: - body["reportRequests"][0]["pageToken"] = next_page_token - reports = service.reports().batchGet(body=body).execute() - book_entries = reports["reports"][0]["data"].get("rows") - next_page_token = reports["reports"][0].get("nextPageToken") - all_book_entries += book_entries - - # create list with just pagepaths - if all_book_entries: - pagepaths = [path["dimensions"][0] for path in all_book_entries] - else: - pagepaths = [] - - return all_book_entries, pagepaths - - -def create_book_result_dicts( - book_entries: List[dict], - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - organisation_name: str, -) -> Dict[dict]: - """Create a dictionary to store results for a single book. Pagepath, title and avg time on page are already given. - The other metrics will be added to the dictionary later. - - :param book_entries: List with dictionaries of book entries. - :param data_interval_start: The start date of the DAG Start date of analytics period. - :param data_interval_end: End date of analytics period. - :param organisation_name: The organisation name. 
- :return: Dict to store results - """ - book_results = {} - for entry in book_entries: - pagepath = entry["dimensions"][0] - pagetitle = entry["dimensions"][1] - average_time = float(entry["metrics"][0]["values"][-1]) - book_result = { - "url": pagepath, - "title": pagetitle, - "start_date": data_interval_start.strftime("%Y-%m-%d"), - "end_date": data_interval_end.strftime("%Y-%m-%d"), - "average_time": average_time, - "unique_views": {"country": {}, "referrer": {}, "social_network": {}}, - "page_views": {"country": {}, "referrer": {}, "social_network": {}}, - "sessions": {"country": {}, "source": {}}, - } - # add custom dimension data for ANU Press - if organisation_name == GoogleAnalytics3Telescope.ANU_ORG_NAME: - # matches dimension order in 'list_all_books' - custom_dimensions = { - "publication_id": entry["dimensions"][2], - "publication_type": entry["dimensions"][3], - "publication_imprint": entry["dimensions"][4], - "publication_group": entry["dimensions"][5], - "publication_whole_or_part": entry["dimensions"][6], - "publication_format": entry["dimensions"][7], - } - book_result = dict(book_result, **custom_dimensions) - book_results[pagepath] = book_result - - return book_results - - -def get_dimension_data( - service: Resource, - view_id: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - metrics: list, - dimension: dict, - pagepaths: list, -) -> list: - """Get reports data from the Google Analytics Reporting service for a single dimension and multiple metrics. - The results are filtered by pagepaths of interest and ordered by pagepath as well. - - :param service: The Google Analytics Reporting service. - :param view_id: The view id. - :param data_interval_start: The start date of the DAG The start date of the analytics period. - :param data_interval_end: The end date of the analytics period. - :param metrics: List with dictionaries of metric. - :param dimension: The dimension. - :param pagepaths: List with pagepaths to filter and sort on. - :return: List with reports data for dimension and metrics. - """ - body = { - "reportRequests": [ - { - "viewId": view_id, - "pageSize": 10000, - "dateRanges": [ - { - "startDate": data_interval_start.strftime("%Y-%m-%d"), - "endDate": data_interval_end.strftime("%Y-%m-%d"), - } - ], - "metrics": metrics, - "dimensions": [{"name": "ga:pagePath"}, dimension], - "dimensionFilterClauses": [ - {"filters": [{"dimensionName": "ga:pagePath", "operator": "IN_LIST", "expressions": pagepaths}]} - ], - "orderBys": [{"fieldName": "ga:pagepath"}], - } - ] - } - reports = service.reports().batchGet(body=body).execute() - all_dimension_data = reports["reports"][0]["data"].get("rows") - next_page_token = reports["reports"][0].get("nextPageToken") - - while next_page_token: - body["reportRequests"][0]["pageToken"] = next_page_token - reports = service.reports().batchGet(body=body).execute() - dimension_data = reports["reports"][0]["data"].get("rows") - next_page_token = reports["reports"][0].get("nextPageToken") - all_dimension_data += dimension_data - - return all_dimension_data - - -def add_to_book_result_dict( - book_results: dict, dimension: dict, pagepath: str, unique_views: dict, page_views: dict, sessions: dict -): - """Add the 'unique_views', 'page_views' and 'sessions' results to the book results dict if these metrics are of interest for the - current dimension. - - :param book_results: A dictionary with all book results. - :param dimension: Current dimension for which 'unique_views' and 'sessions' data is given. 
- :param pagepath: Pagepath of the book. - :param unique_views: Number of unique views for the pagepath&dimension - :param page_views: Number of page views for the pagepath&dimension - :param sessions: Number of sessions for the pagepath&dimension - :return: None - """ - # map the dimension name to the field name in BigQuery. The ga:dimensionX are obtained from custom ANU press - # dimensions - mapping = { - "ga:country": "country", - "ga:fullReferrer": "referrer", - "ga:socialNetwork": "social_network", - "ga:source": "source", - } - column_name = mapping[dimension["name"]] - if column_name in ["country", "referrer", "social_network"]: - book_results[pagepath]["unique_views"][column_name] = unique_views - book_results[pagepath]["page_views"][column_name] = page_views - if column_name in ["country", "source"]: - book_results[pagepath]["sessions"][column_name] = sessions - - -def get_reports( - service: Resource, - organisation_name: str, - view_id: str, - pagepath_regex: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, -) -> list: - """Get reports data from the Google Analytics Reporting API. - - :param service: The Google Analytics Reporting service. - :param organisation_name: Name of the organisation. - :param view_id: The view id. - :param pagepath_regex: The regex expression for the pagepath of a book. - :param data_interval_start: The start date of the DAG Start date of analytics period - :param data_interval_end: End date of analytics period - :return: List with google analytics data for each book - """ - - metric_names = ["uniquePageviews", "Pageviews", "sessions", "avgTimeOnPage"] - metrics = [{"expression": f"ga:{metric}"} for metric in metric_names] - - # list all books - book_entries, pagepaths = list_all_books( - service, view_id, pagepath_regex, data_interval_start, data_interval_end, organisation_name, metrics - ) - # if no books in period return empty list and raise airflow skip exception - if not book_entries: - return [] - # create dict with dict for each book to store results - book_results = create_book_result_dicts(book_entries, data_interval_start, data_interval_end, organisation_name) - - dimension_names = ["country", "fullReferrer", "socialNetwork", "source"] - dimensions = [{"name": f"ga:{dimension}"} for dimension in dimension_names] - - # get data per dimension - for dimension in dimensions: - dimension_data = get_dimension_data( - service, view_id, data_interval_start, data_interval_end, metrics, dimension, pagepaths - ) - - prev_pagepath = None - unique_views = {} - page_views = {} - sessions = {} - # entry is combination of book pagepath & dimension - for entry in dimension_data: - pagepath = entry["dimensions"][0] - dimension_value = entry["dimensions"][1] # e.g. 
'Australia' for 'country' dimension - - if prev_pagepath and pagepath != prev_pagepath: - add_to_book_result_dict(book_results, dimension, prev_pagepath, unique_views, page_views, sessions) - - unique_views = {} - page_views = {} - sessions = {} - - # add values if they are not 0 - # ["values"][n] maps to the nth value of metric_names - unique_views_metric = int(entry["metrics"][0]["values"][0]) - page_views_metric = int(entry["metrics"][0]["values"][1]) - sessions_metric = int(entry["metrics"][0]["values"][2]) - if unique_views_metric > 0: - unique_views[dimension_value] = unique_views_metric - if page_views_metric > 0: - page_views[dimension_value] = page_views_metric - if sessions_metric > 0: - sessions[dimension_value] = sessions_metric - - prev_pagepath = pagepath - else: - add_to_book_result_dict(book_results, dimension, prev_pagepath, unique_views, page_views, sessions) - - # transform nested dict to list of dicts - for book, result in book_results.items(): - for field, value in result.items(): - # field is 'unique_views' or 'sessions' - if isinstance(value, dict): - # nested_field is 'country', 'referrer' or 'social_network' - for nested_field, nested_value in value.items(): - values = [] - # k is e.g. 'Australia', v is e.g. 1 - for k, v in nested_value.items(): - values.append({"name": k, "value": v}) - book_results[book][field][nested_field] = values - - # convert dict to list of results - book_results = [book_results[k] for k in book_results] - - return book_results diff --git a/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table.json b/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table.json deleted file mode 100644 index 926fa418..00000000 --- a/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e710bd4f4fabb23925ba5dee504aeb7be6a5b7e9fc04fa252c41c1f767aca5c5 -size 4333 diff --git a/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table_anu.json b/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table_anu.json deleted file mode 100644 index b2d01ce9..00000000 --- a/oaebu_workflows/google_analytics3_telescope/tests/fixtures/test_table_anu.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b527970796c8a63c9f54ae1ad4153c4be723cc952f1875925af26b88bdf51697 -size 5061 diff --git a/oaebu_workflows/google_analytics3_telescope/tests/test_google_analytics3_telescope.py b/oaebu_workflows/google_analytics3_telescope/tests/test_google_analytics3_telescope.py deleted file mode 100644 index 61cb2afa..00000000 --- a/oaebu_workflows/google_analytics3_telescope/tests/test_google_analytics3_telescope.py +++ /dev/null @@ -1,435 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
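For reference, a minimal, self-contained sketch (not part of the removed module) of the nested-dict flattening that the removed get_reports() helper performs above: per-book counters keyed by country, referrer, and so on are converted into repeated {"name", "value"} records matching the BigQuery schema. The sample values are illustrative.

import json

book_results = {
    "/base/path/151420": {
        "url": "/base/path/151420",
        "unique_views": {"country": {"Australia": 3}, "referrer": {}, "social_network": {}},
        "page_views": {"country": {"Australia": 4}, "referrer": {}, "social_network": {}},
        "sessions": {"country": {"Australia": 1}, "source": {}},
    }
}

# Replace each nested {key: count} mapping with a list of {"name", "value"} records
for result in book_results.values():
    for field, value in result.items():
        if isinstance(value, dict):
            for nested_field, counts in value.items():
                value[nested_field] = [{"name": k, "value": v} for k, v in counts.items()]

rows = list(book_results.values())
print(json.dumps(rows[0]["unique_views"]["country"]))  # [{"name": "Australia", "value": 3}]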
- -# Author: Aniek Roelofs, Keegan Smith - -import gzip -import json -import os -from unittest.mock import patch - -import pendulum -from airflow.models import Connection -from airflow.utils.state import State -from googleapiclient.discovery import build -from googleapiclient.http import HttpMockSequence - -from oaebu_workflows.google_analytics3_telescope.google_analytics3_telescope import GoogleAnalytics3Telescope -from oaebu_workflows.oaebu_partners import partner_from_str -from oaebu_workflows.config import test_fixtures_folder -from observatory.platform.api import get_dataset_releases -from observatory.platform.observatory_config import Workflow -from observatory.platform.bigquery import bq_table_id -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) - - -class TestGoogleAnalytics3Telescope(ObservatoryTestCase): - """Tests for the Google Analytics telescope""" - - def __init__(self, *args, **kwargs): - """Constructor which sets up variables used by tests. - :param args: arguments. - :param kwargs: keyword arguments. - """ - super(TestGoogleAnalytics3Telescope, self).__init__(*args, **kwargs) - self.project_id = os.getenv("TEST_GCP_PROJECT_ID") - self.data_location = os.getenv("TEST_GCP_DATA_LOCATION") - self.view_id = "11235141" - self.pagepath_regex = r".*regex$" - self.organisation_name = "UCL Press" - - fixtures_folder = test_fixtures_folder(workflow_module="google_analytics3_telescope") - self.test_table = os.path.join(fixtures_folder, "test_table.json") - self.test_table_anu = os.path.join(fixtures_folder, "test_table_anu.json") - - def test_dag_structure(self): - """Test that the Google Analytics DAG has the correct structure. - :return: None - """ - cloud_workspace = self.fake_cloud_workspace - dag = GoogleAnalytics3Telescope( - dag_id="google_analytics_test", - organisation_name="Organisation Name", - cloud_workspace=cloud_workspace, - view_id=self.view_id, - pagepath_regex=self.pagepath_regex, - ).make_dag() - self.assert_dag_structure( - { - "check_dependencies": ["download_transform"], - "download_transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], - "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], - }, - dag, - ) - - def test_dag_load(self): - """Test that the Google Analytics DAG can be loaded from a DAG bag.""" - - env = ObservatoryEnvironment( - workflows=[ - Workflow( - dag_id="google_analytics3", - name="My Google Analytics Workflow", - class_name="oaebu_workflows.google_analytics3_telescope.google_analytics3_telescope.GoogleAnalytics3Telescope", - cloud_workspace=self.fake_cloud_workspace, - kwargs=dict(organisation_name="My Organisation", pagepath_regex="", view_id="123456"), - ) - ] - ) - with env.create(): - self.assert_dag_load_from_config("google_analytics3") - - # Errors should be raised if kwargs dict not supplied - env.workflows[0].kwargs = {} - with env.create(): - with self.assertRaises(AssertionError) as cm: - self.assert_dag_load_from_config("google_analytics3") - msg = cm.exception.args[0] - self.assertTrue("missing 3 required positional arguments" in msg) - self.assertTrue("organisation_name" in msg) - self.assertTrue("pagepath_regex" in msg) - self.assertTrue("view_id" in msg) - - @patch("oaebu_workflows.google_analytics3_telescope.google_analytics3_telescope.build") - @patch( - "oaebu_workflows.google_analytics3_telescope.google_analytics3_telescope.ServiceAccountCredentials" - ) - 
def test_telescope(self, mock_account_credentials, mock_build): - """Test the Google Analytics telescope end to end specifically for ANU Press, to test custom dimensions. - :return: None. - """ - # Mock the Google Reporting Analytics API service - mock_account_credentials.from_json_keyfile_dict.return_value = "" - - http = HttpMockSequence(create_http_mock_sequence(GoogleAnalytics3Telescope.ANU_ORG_NAME)) - mock_build.return_value = build("analyticsreporting", "v4", http=http) - - # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) - # Setup Telescope - execution_date = pendulum.datetime(year=2022, month=6, day=1) - partner = partner_from_str("google_analytics3") - partner.bq_dataset_id = env.add_dataset() - telescope = GoogleAnalytics3Telescope( - dag_id="google_analytics_test", - organisation_name=GoogleAnalytics3Telescope.ANU_ORG_NAME, - cloud_workspace=env.cloud_workspace, - view_id=self.view_id, - pagepath_regex=self.pagepath_regex, - data_partner=partner, - ) - dag = telescope.make_dag() - - # Create the Observatory environment and run tests - with env.create(): - with env.create_dag_run(dag, execution_date): - # Add OAEBU service account connection connection - conn = Connection( - conn_id="oaebu_service_account", - uri=f"google-cloud-platform://?type=service_account&private_key_id=private_key_id" - f"&private_key=private_key" - f"&client_email=client_email" - f"&client_id=client_id", - ) - env.add_connection(conn) - - # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Test download_transform task - ti = env.run_task(telescope.download_transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Test that transformed file uploaded - ti = env.run_task(telescope.upload_transformed.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Test that data loaded into BigQuery - ti = env.run_task(telescope.bq_load.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Use release to check tasks - release = telescope.make_release( - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(str(env.dag_run.data_interval_start)), - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - )[0] - - # Test download_transform task - self.assertTrue(os.path.exists(release.transform_path)) - self.assertTrue(os.path.isfile(release.transform_path)) - # Use frozenset to test results are as expected, many dict transformations re-order items in dict - actual_list = [] - with gzip.open(release.transform_path, "rb") as f: - for line in f: - actual_list.append(json.loads(line)) - expected_list = [ - { - "url": "/base/path/151420", - "title": "Anything public program drive north.", - "start_date": "2022-06-01", - "end_date": "2022-06-30", - "average_time": 59.5, - "unique_views": { - "country": [{"name": "country 1", "value": 3}, {"name": "country 2", "value": 3}], - "referrer": [{"name": "referrer 1", "value": 3}, {"name": "referrer 2", "value": 3}], - "social_network": [ - {"name": "social_network 1", "value": 3}, - {"name": "social_network 2", "value": 3}, - ], - }, - "page_views": { - "country": [{"name": "country 1", "value": 4}, {"name": "country 2", "value": 4}], - "referrer": [{"name": "referrer 1", "value": 4}, {"name": "referrer 2", "value": 4}], - "social_network": [ - {"name": "social_network 1", "value": 4}, - {"name": 
"social_network 2", "value": 4}, - ], - }, - "sessions": { - "country": [{"name": "country 1", "value": 1}, {"name": "country 2", "value": 1}], - "source": [{"name": "source 1", "value": 1}, {"name": "source 2", "value": 1}], - }, - "publication_id": "1234567890123", - "publication_type": "book", - "publication_imprint": "imprint", - "publication_group": "group", - "publication_whole_or_part": "whole", - "publication_format": "PDF", - "release_date": "2022-06-30", - }, - { - "url": "/base/path/833557", - "title": "Standard current never no.", - "start_date": "2022-06-01", - "end_date": "2022-06-30", - "average_time": 49.6, - "unique_views": {"country": [], "referrer": [], "social_network": []}, - "page_views": {"country": [], "referrer": [], "social_network": []}, - "sessions": {"country": [], "source": []}, - "publication_id": "1234567891234", - "publication_type": "book", - "publication_imprint": "imprint", - "publication_group": "(none)", - "publication_whole_or_part": "part", - "publication_format": "HTML", - "release_date": "2022-06-30", - }, - { - "url": "/base/path/833557?fbclid=123", - "title": "Standard current never no.", - "start_date": "2022-06-01", - "end_date": "2022-06-30", - "average_time": 38.8, - "unique_views": { - "country": [{"name": "country 2", "value": 2}], - "referrer": [{"name": "referrer 2", "value": 2}], - "social_network": [{"name": "social_network 2", "value": 2}], - }, - "page_views": { - "country": [{"name": "country 2", "value": 4}], - "referrer": [{"name": "referrer 2", "value": 4}], - "social_network": [{"name": "social_network 2", "value": 4}], - }, - "sessions": {"country": [], "source": []}, - "publication_id": "1234567891234", - "publication_type": "book", - "publication_imprint": "imprint", - "publication_group": "(none)", - "publication_whole_or_part": "part", - "publication_format": "HTML", - "release_date": "2022-06-30", - }, - ] - self.assertEqual(3, len(actual_list)) - self.assertEqual(frozenset(expected_list[0]), frozenset(actual_list[0])) - self.assertEqual(frozenset(expected_list[1]), frozenset(actual_list[1])) - self.assertEqual(frozenset(expected_list[2]), frozenset(actual_list[2])) - - # Test that data loaded into BigQuery - table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.data_partner.bq_dataset_id, - telescope.data_partner.bq_table_name, - ) - self.assert_table_integrity(table_id, expected_rows=3) - self.assert_table_content( - table_id, - load_and_parse_json(self.test_table_anu, date_fields=["release_date", "start_date", "end_date"]), - primary_key="url", - ) - - # add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) - self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) - self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) - self.assertEqual(len(dataset_releases), 1) - - # Test that all telescope data deleted - ti = env.run_task(telescope.cleanup.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) - - -def create_http_mock_sequence(organisation_name: str) -> list: - """Create a list of http mock sequences for listing books and getting dimension data - - :param organisation_name: The organisation name (add custom dimensions for ANU) - :return: A list with HttpMockSequence instances - """ - http_mock_sequence = [] - list_books = { - 
"reports": [ - { - "columnHeader": { - "dimensions": ["ga:pagepath", "ga:pageTitle"], - "metricHeader": {"metricHeaderEntries": [{"name": "ga:avgTimeOnPage", "type": "TIME"}]}, - }, - "data": { - "rows": [ - { - "dimensions": ["/base/path/151420", "Anything public program drive north."], - "metrics": [{"values": ["59.5"]}], - }, - { - "dimensions": ["/base/path/833557", "Standard current never no."], - "metrics": [{"values": ["49.6"]}], - }, - ], - "totals": [{"values": ["109.1"]}], - "rowCount": 2, - "minimums": [{"values": ["49.6"]}], - "maximums": [{"values": ["59.5"]}], - "isDataGolden": True, - }, - "nextPageToken": "200", - } - ] - } - # Add custom dimensions from ANU Press - if organisation_name == GoogleAnalytics3Telescope.ANU_ORG_NAME: - list_books["reports"][0]["columnHeader"]["dimensions"] += [f"ga:dimension{(str(i))}" for i in range(1, 7)] - list_books["reports"][0]["data"]["rows"][0]["dimensions"] += [ - "1234567890123", - "book", - "imprint", - "group", - "whole", - "PDF", - ] - list_books["reports"][0]["data"]["rows"][1]["dimensions"] += [ - "1234567891234", - "book", - "imprint", - "(none)", - "part", - "HTML", - ] - list_books_next_page = { - "reports": [ - { - "columnHeader": { - "dimensions": ["ga:pagepath", "ga:pageTitle"], - "metricHeader": {"metricHeaderEntries": [{"name": "ga:avgTimeOnPage", "type": "TIME"}]}, - }, - "data": { - "rows": [ - { - "dimensions": ["/base/path/833557?fbclid=123", "Standard current never no."], - "metrics": [{"values": ["38.8"]}], - } - ], - "totals": [{"values": ["38.8"]}], - "rowCount": 1, - "minimums": [{"values": ["38.8"]}], - "maximums": [{"values": ["38.8"]}], - "isDataGolden": True, - }, - } - ] - } - # Add custom dimensions from ANU Press - if organisation_name == GoogleAnalytics3Telescope.ANU_ORG_NAME: - list_books_next_page["reports"][0]["columnHeader"]["dimensions"] += [ - f"ga:dimension{(str(i))}" for i in range(1, 7) - ] - list_books_next_page["reports"][0]["data"]["rows"][0]["dimensions"] += [ - "1234567891234", - "book", - "imprint", - "(none)", - "part", - "HTML", - ] - http_mock_sequence.append(({"status": "200"}, json.dumps(list_books))) - http_mock_sequence.append(({"status": "200"}, json.dumps(list_books_next_page))) - for dimension in ["country", "referrer", "social_network", "source"]: - results = { - "reports": [ - { - "columnHeader": { - "dimensions": ["ga:pagePath", "ga:country"], - "metricHeader": { - "metricHeaderEntries": [ - {"name": "ga:uniquePageviews", "type": "INTEGER"}, - {"name": "ga:Pageviews", "type": "INTEGER"}, - {"name": "ga:sessions", "type": "INTEGER"}, - ] - }, - }, - "data": { - "rows": [ - { - "dimensions": ["/base/path/151420", dimension + " 1"], - "metrics": [{"values": ["3", "4", "1"]}], - }, - { - "dimensions": ["/base/path/151420", dimension + " 2"], - "metrics": [{"values": ["3", "4", "1"]}], - }, - { - "dimensions": ["/base/path/833557", dimension + " 1"], - "metrics": [{"values": ["0", "0", "0"]}], # Added a zero case for code coverage - }, - { - "dimensions": ["/base/path/833557?fbclid=123", dimension + " 2"], - "metrics": [{"values": ["2", "4", "0"]}], - }, - ], - "totals": [{"values": ["6", "9", "1"]}], - "rowCount": 3, - "minimums": [{"values": ["1", "3", "0"]}], - "maximums": [{"values": ["3", "4", "1"]}], - "isDataGolden": True, - }, - } - ] - } - http_mock_sequence.append(({"status": "200"}, json.dumps(results))) - - return http_mock_sequence diff --git a/oaebu_workflows/google_books_telescope/google_books_telescope.py 
b/oaebu_workflows/google_books_telescope/google_books_telescope.py deleted file mode 100644 index 10e8cc48..00000000 --- a/oaebu_workflows/google_books_telescope/google_books_telescope.py +++ /dev/null @@ -1,377 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Aniek Roelofs - -import csv -import os -import re -from collections import OrderedDict, defaultdict -from typing import List, Tuple, Union - -import pendulum -from airflow.exceptions import AirflowException -from airflow.models.taskinstance import TaskInstance -from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import save_jsonl_gz -from observatory.platform.files import convert, add_partition_date -from observatory.platform.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset -from observatory.platform.sftp import SftpFolders, make_sftp_connection -from observatory.platform.workflows.workflow import ( - PartitionRelease, - Workflow, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -class GoogleBooksRelease(PartitionRelease): - def __init__( - self, - dag_id: str, - run_id: str, - partition_date: pendulum.DateTime, - sftp_files: List[str], - ): - """Construct a GoogleBooksRelease. - - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param partition_date: the partition date, corresponds to the last day of the month being processed. - :param sftp_files: List of full filepaths to download from sftp service (incl. 
in_progress folder) - """ - super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) - self.download_sales_path = os.path.join(self.download_folder, "google_books_sales.csv") - self.download_traffic_path = os.path.join(self.download_folder, "google_books_traffic.csv") - self.transform_sales_path = os.path.join(self.transform_folder, "google_books_sales.jsonl.gz") - self.transform_traffic_path = os.path.join(self.transform_folder, "google_books_traffic.jsonl.gz") - self.sftp_files = sftp_files - - -class GoogleBooksTelescope(Workflow): - """The Google Books telescope.""" - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - sftp_root: str = "/", - sales_partner: Union[str, OaebuPartner] = "google_books_sales", - traffic_partner: Union[str, OaebuPartner] = "google_books_traffic", - bq_dataset_description: str = "Data from Google sources", - bq_sales_table_description: str = None, - bq_traffic_table_description: str = None, - api_dataset_id: str = "google_books", - sftp_service_conn_id: str = "sftp_service", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - catchup: bool = False, - schedule: str = "@weekly", - start_date: pendulum.DateTime = pendulum.datetime(2018, 1, 1), - ): - """Construct a GoogleBooksTelescope instance. - :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param sftp_root: The root of the SFTP filesystem to work with - :param sales_partner: The name of the sales partner - :param traffic_partner: The name of the traffic partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_sales_table_description: Description for the BigQuery Google Books Sales table - :param bq_traffic_table_description: Description for the BigQuery Google Books Traffic table - :param api_dataset_id: The ID to store the dataset release in the API - :param sftp_service_conn_id: Airflow connection ID for the SFTP service - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param catchup: Whether to catchup the DAG or not - :param schedule: The schedule interval of the DAG - :param start_date: The start date of the DAG - """ - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - airflow_conns=[sftp_service_conn_id, observatory_api_conn_id], - tags=["oaebu"], - ) - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.sftp_root = sftp_root - self.sales_partner = partner_from_str(sales_partner) - self.traffic_partner = partner_from_str(traffic_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_sales_table_description = bq_sales_table_description - self.bq_traffic_table_description = bq_traffic_table_description - self.api_dataset_id = api_dataset_id - self.sftp_service_conn_id = sftp_service_conn_id - self.observatory_api_conn_id = observatory_api_conn_id - - # Extra SFTP parameters - self.sftp_folders = SftpFolders(dag_id, sftp_conn_id=sftp_service_conn_id, sftp_root=sftp_root) - self.sftp_regex = r"^Google(SalesTransaction|BooksTraffic)Report_\d{4}_\d{2}.csv$" - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_setup_task(self.list_release_info) - self.add_task(self.move_files_to_in_progress) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.move_files_to_finished) - 
self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> List[GoogleBooksRelease]: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. - - :param kwargs: the context passed from the PythonOperator. - See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: A list of google books release instances - """ - ti: TaskInstance = kwargs["ti"] - reports_info = ti.xcom_pull( - key=GoogleBooksTelescope.RELEASE_INFO, task_ids=self.list_release_info.__name__, include_prior_dates=False - ) - releases = [] - run_id = kwargs["run_id"] - for partition_date, sftp_files in reports_info.items(): - releases.append( - GoogleBooksRelease( - self.dag_id, run_id=run_id, partition_date=pendulum.parse(partition_date), sftp_files=sftp_files - ) - ) - return releases - - def list_release_info(self, **kwargs) -> bool: - """Lists all Google Books releases available on the SFTP server and publishes sftp file paths and - release_date's as an XCom. - - :return: the identifier of the task to execute next. - """ - - reports = defaultdict(list) - # List all reports in the 'upload' folder of the organisation - with make_sftp_connection(self.sftp_service_conn_id) as sftp: - files = sftp.listdir(self.sftp_folders.upload) - for file_name in files: - match = re.match(self.sftp_regex, file_name) - if match: - # Get the release date from file name - date_str = file_name[-11:].strip(".csv") - release_date = pendulum.from_format(date_str, "YYYY_MM").end_of("month") - release_date = release_date.format("YYYYMMDD") - - # Get the report type from file name - report_type = match.group(1) - - # Create the full path of the file for the 'in progress' folder - sftp_file = os.path.join(self.sftp_folders.in_progress, file_name) - - # Append report - reports[report_type + release_date].append(sftp_file) - - # Check that for each report type + date combination there is a report available - release_info = defaultdict(list) - for report, sftp_files in reports.items(): - release_date = report[-8:] - release_info[release_date] += sftp_files - - continue_dag = bool(release_info) - if continue_dag: - # Push messages - ti: TaskInstance = kwargs["ti"] - ti.xcom_push(GoogleBooksTelescope.RELEASE_INFO, release_info) - - return continue_dag - - def move_files_to_in_progress(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Move Google Books files to SFTP in-progress folder.""" - - for release in releases: - self.sftp_folders.move_files_to_in_progress(release.sftp_files) - - def download(self, releases: List[GoogleBooksRelease], **kwargs): - """Task to download the Google Books releases for a given month.""" - for release in releases: - with make_sftp_connection(self.sftp_service_conn_id) as sftp: - for file in release.sftp_files: - if "Traffic" in file: - sftp.get(file, localpath=release.download_traffic_path) - elif "Transaction" in file: - sftp.get(file, localpath=release.download_sales_path) - assert os.path.exists(release.download_traffic_path) and os.path.exists(release.download_sales_path) - - def upload_downloaded(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Uploads the downloaded files to GCS for each release""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, - file_paths=[release.download_sales_path, release.download_traffic_path], - ) - if not 
success: - raise AirflowException(f"Files could not be uploaded to cloud storage bucket: {self.transform_bucket}") - - def transform(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Task to transform the Google Books releases for a given month.""" - for release in releases: - gb_transform( - download_files=(release.download_sales_path, release.download_traffic_path), - sales_path=release.transform_sales_path, - traffic_path=release.transform_traffic_path, - release_date=release.partition_date, - ) - - def upload_transformed(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Uploads the transformed files to GCS for each release""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, - file_paths=[release.transform_sales_path, release.transform_traffic_path], - ) - if not success: - raise AirflowException(f"Files could not be uploaded to cloud storage bucket: {self.transform_bucket}") - - def move_files_to_finished(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Move Google Books files to SFTP finished folder.""" - - for release in releases: - self.sftp_folders.move_files_to_finished(release.sftp_files) - - def bq_load(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Loads the sales and traffic data into BigQuery""" - for release in releases: - for partner, table_description, file_path in [ - [self.sales_partner, self.bq_sales_table_description, release.transform_sales_path], - [self.traffic_partner, self.bq_traffic_table_description, release.transform_traffic_path], - ]: - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(file_path)) - table_id = bq_table_id(self.cloud_workspace.project_id, partner.bq_dataset_id, partner.bq_table_name) - success = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - partition_type=TimePartitioningType.MONTH, - partition=True, - partition_field="release_date", - write_disposition=WriteDisposition.WRITE_APPEND, - table_description=table_description, - ignore_unknown_values=True, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Adds release information to API.""" - - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - for release in releases: - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, releases: List[GoogleBooksRelease], **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - for release in releases: - cleanup( - dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder - ) - - -def gb_transform( - download_files: Tuple[str, str], sales_path: str, traffic_path: str, release_date: pendulum.DateTime -) -> None: - """Transforms sales and traffic reports. 
For both reports it transforms the csv into a jsonl file and - replaces spaces in the keys with underscores. - - :param download_files: The Google Books Sales and Traffic files - :param sales_path: The file path to save the transformed sales data to - :param traffic_path: The file path to save the transformed traffic data to - :param release_date: The release date to use as a partitioning date - """ - # Sort files to get same hash for unit tests - - results = defaultdict(list) - results["sales"] = [] - results["traffic"] = [] - for file in download_files: - report_type = "sales" if "sales" in os.path.basename(file).lower() else "traffic" - with open(file, encoding="utf-16") as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter="\t") - for row in csv_reader: - transformed_row = OrderedDict((convert(k.replace("%", "Perc")), v) for k, v in row.items()) - # Sales transaction report - if report_type == "sales": - transaction_date = pendulum.from_format(transformed_row["Transaction_Date"], "MM/DD/YY") - - # Sanity check that transaction date is in month of release date - if release_date.start_of("month") <= transaction_date <= release_date.end_of("month"): - pass - else: - raise AirflowException( - "Transaction date does not fall within release month. " - f"Transaction date: {transaction_date.strftime('%Y-%m-%d')}, " - f"release month: {release_date.strftime('%Y-%m')}" - ) - - # Transform to valid date format - transformed_row["Transaction_Date"] = transaction_date.strftime("%Y-%m-%d") - - # Remove percentage sign - transformed_row["Publisher_Revenue_Perc"] = transformed_row["Publisher_Revenue_Perc"].strip("%") - # This field is not present for some publishers (UCL Press), for ANU Press the field value is - # “E-Book” - try: - transformed_row["Line_of_Business"] - except KeyError: - transformed_row["Line_of_Business"] = None - # Traffic report - else: - # Remove percentage sign - transformed_row["Buy_Link_CTR"] = transformed_row["Buy_Link_CTR"].strip("%") - - # Append results - results[report_type].append(transformed_row) - - for report_type, report_results in results.items(): - report_results = add_partition_date( - report_results, - partition_date=release_date, - partition_type=TimePartitioningType.MONTH, - partition_field="release_date", - ) - save_path = sales_path if report_type == "sales" else traffic_path - print(f"SAVING REPORT '{report_type}' to {save_path}") - save_jsonl_gz(save_path, report_results) diff --git a/oaebu_workflows/google_books_telescope/tests/test_google_books_telescope.py b/oaebu_workflows/google_books_telescope/tests/test_google_books_telescope.py deleted file mode 100644 index 485dfb3a..00000000 --- a/oaebu_workflows/google_books_telescope/tests/test_google_books_telescope.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
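As a reading aid, here is a hedged, self-contained sketch of the per-row sanity check and key normalisation that the removed gb_transform() above applies to the sales report. The raw column header and the dates are illustrative, and normalise_key only stands in for the platform's convert() helper.

import pendulum


def normalise_key(key: str) -> str:
    # Illustrative stand-in: spell out the percent sign and replace spaces with underscores
    return key.replace("%", "Perc").replace(" ", "_")


release_date = pendulum.datetime(2020, 2, 29)  # partition date: last day of the release month
transaction_date = pendulum.from_format("02/15/20", "MM/DD/YY")

# Reject rows whose transaction date falls outside the release month
if not (release_date.start_of("month") <= transaction_date <= release_date.end_of("month")):
    raise ValueError(
        f"Transaction date {transaction_date.to_date_string()} is outside release month "
        f"{release_date.format('YYYY-MM')}"
    )

print(normalise_key("Publisher Revenue %"))  # Publisher_Revenue_Perc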
- -# Author: Aniek Roelofs - -import os -import shutil -from collections import defaultdict -from unittest.mock import patch - -import pendulum -from airflow.exceptions import AirflowException -from airflow.models.connection import Connection -from airflow.utils.state import State -from click.testing import CliRunner - -from oaebu_workflows.config import test_fixtures_folder -from oaebu_workflows.oaebu_partners import partner_from_str -from oaebu_workflows.google_books_telescope.google_books_telescope import ( - GoogleBooksRelease, - GoogleBooksTelescope, -) -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - SftpServer, - find_free_port, - random_id, -) -from observatory.platform.bigquery import bq_table_id -from observatory.platform.observatory_config import Workflow -from observatory.platform.sftp import SftpFolders -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.api import get_dataset_releases - - -class TestGoogleBooksTelescope(ObservatoryTestCase): - """Tests for the GoogleBooks telescope""" - - def __init__(self, *args, **kwargs): - """Constructor which sets up variables used by tests. - :param args: arguments. - :param kwargs: keyword arguments. - """ - - super(TestGoogleBooksTelescope, self).__init__(*args, **kwargs) - self.project_id = os.getenv("TEST_GCP_PROJECT_ID") - self.data_location = os.getenv("TEST_GCP_DATA_LOCATION") - self.sftp_port = find_free_port() - - def test_dag_structure(self): - """Test that the Google Books DAG has the correct structure.""" - dag = GoogleBooksTelescope( - dag_id="test_dag", cloud_workspace=self.fake_cloud_workspace, sftp_root="/" - ).make_dag() - self.assert_dag_structure( - { - "check_dependencies": ["list_release_info"], - "list_release_info": ["move_files_to_in_progress"], - "move_files_to_in_progress": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], - "bq_load": ["move_files_to_finished"], - "move_files_to_finished": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], - }, - dag, - ) - - def test_dag_load(self): - """Test that the Google Books DAG can be loaded from a DAG bag.""" - # Run tests both for telescope with file suffixes and without - env = ObservatoryEnvironment( - workflows=[ - Workflow( - dag_id="google_books", - name="My Google Books Telescope", - class_name="oaebu_workflows.google_books_telescope.google_books_telescope.GoogleBooksTelescope", - cloud_workspace=self.fake_cloud_workspace, - ) - ] - ) - with env.create(): - self.assert_dag_load_from_config("google_books") - - def test_telescope(self): - """Test the Google Books telescope end to end.""" - fixtures_folder = test_fixtures_folder(workflow_module="google_books_telescope") - params = { - "no_download_files": 2, - "bq_rows": 4, - "traffic_download_hash": "db4dca44d5231e0c4e2ad95db41b79b6", - "traffic_transform_hash": "b8073007", - "sales_download_hash": "6496518be1ea73694d0a8f89c0b42f20", - "sales_transform_hash": "ebe49987", - "test_files": { - "GoogleBooksTrafficReport_2020_02.csv": os.path.join( - fixtures_folder, "GoogleBooksTrafficReport_2020_02.csv" - ), - "GoogleSalesTransactionReport_2020_02.csv": os.path.join( - fixtures_folder, "GoogleSalesTransactionReport_2020_02.csv" - ), - }, - } - - # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", 
api_port=find_free_port() - ) - sftp_server = SftpServer(host="localhost", port=self.sftp_port) - dataset_id = env.add_dataset() - - # Create the Observatory environment and run tests - with env.create(): - with sftp_server.create() as sftp_root: - # Setup Telescope - execution_date = pendulum.datetime(year=2021, month=3, day=31) - sales_partner = partner_from_str("google_books_sales") - sales_partner.bq_dataset_id = dataset_id - traffic_partner = partner_from_str("google_books_traffic") - traffic_partner.bq_dataset_id = dataset_id - telescope = GoogleBooksTelescope( - dag_id="google_books_test", - cloud_workspace=env.cloud_workspace, - sftp_root="/", - sales_partner=sales_partner, - traffic_partner=traffic_partner, - ) - dag = telescope.make_dag() - - # Add SFTP connection - conn = Connection( - conn_id=telescope.sftp_service_conn_id, uri=f"ssh://:password@localhost:{self.sftp_port}" - ) - env.add_connection(conn) - with env.create_dag_run(dag, execution_date): - # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Add file to SFTP server - local_sftp_folders = SftpFolders(telescope.dag_id, telescope.sftp_service_conn_id, sftp_root) - os.makedirs(local_sftp_folders.upload, exist_ok=True) - for file_name, file_path in params["test_files"].items(): - upload_file = os.path.join(local_sftp_folders.upload, file_name) - shutil.copy(file_path, upload_file) - - # Check that the correct release info is returned via Xcom - ti = env.run_task(telescope.list_release_info.__name__) - self.assertEqual(ti.state, State.SUCCESS) - release_info = ti.xcom_pull( - key=GoogleBooksTelescope.RELEASE_INFO, - task_ids=telescope.list_release_info.__name__, - include_prior_dates=False, - ) - - # Get release info from SFTP server and create expected release info - expected_release_info = defaultdict(list) - for file_name, file_path in params["test_files"].items(): - expected_release_date = pendulum.from_format(file_name[-11:].strip(".csv"), "YYYY_MM").end_of( - "month" - ) - release_date_str = expected_release_date.format("YYYYMMDD") - if release_date_str == "20200229": - expected_release_file = os.path.join(telescope.sftp_folders.in_progress, file_name) - expected_release_info[release_date_str].append(expected_release_file) - self.assertTrue(1, len(release_info)) - self.assertEqual(expected_release_info["20200229"].sort(), release_info["20200229"].sort()) - - # Use release info for other tasks - releases = [] - for release_date, sftp_files in release_info.items(): - releases.append( - GoogleBooksRelease( - dag_id=telescope.dag_id, - run_id=env.dag_run.run_id, - partition_date=pendulum.parse(release_date), - sftp_files=sftp_files, - ) - ) - self.assertTrue(1, len(releases)) - release = releases[0] - - # Test move file to in progress - ti = env.run_task(telescope.move_files_to_in_progress.__name__) - self.assertEqual(ti.state, State.SUCCESS) - for file in release.sftp_files: - file_name = os.path.basename(file) - upload_file = os.path.join(local_sftp_folders.upload, file_name) - self.assertFalse(os.path.isfile(upload_file)) - in_progress_file = os.path.join(local_sftp_folders.in_progress, file_name) - self.assertTrue(os.path.isfile(in_progress_file)) - - # Run main telescope tasks - ti = env.run_task(telescope.download.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = 
env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.upload_transformed.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.bq_load.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Make assertions for the above tasks - # Test download - self.assertTrue(os.path.exists(release.download_traffic_path)) - self.assertTrue(os.path.exists(release.download_sales_path)) - self.assert_file_integrity(release.download_traffic_path, params["traffic_download_hash"], "md5") - self.assert_file_integrity(release.download_sales_path, params["sales_download_hash"], "md5") - - # Test upload downloaded - self.assert_blob_integrity( - env.download_bucket, - gcs_blob_name_from_path(release.download_traffic_path), - release.download_traffic_path, - ) - self.assert_blob_integrity( - env.download_bucket, - gcs_blob_name_from_path(release.download_sales_path), - release.download_sales_path, - ) - - # Test that file transformed - self.assertTrue(os.path.exists(release.transform_sales_path)) - self.assertTrue(os.path.exists(release.transform_traffic_path)) - self.assert_file_integrity(release.transform_sales_path, params["sales_transform_hash"], "gzip_crc") - self.assert_file_integrity( - release.transform_traffic_path, params["traffic_transform_hash"], "gzip_crc" - ) - - # Test that transformed file uploaded - self.assert_blob_integrity( - env.transform_bucket, - gcs_blob_name_from_path(release.transform_traffic_path), - release.transform_traffic_path, - ) - self.assert_blob_integrity( - env.transform_bucket, - gcs_blob_name_from_path(release.transform_sales_path), - release.transform_sales_path, - ) - - # Test that data loaded into BigQuery - table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.sales_partner.bq_dataset_id, - telescope.sales_partner.bq_table_name, - ) - self.assert_table_integrity(table_id, params["bq_rows"]) - table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.traffic_partner.bq_dataset_id, - telescope.traffic_partner.bq_table_name, - ) - self.assert_table_integrity(table_id, params["bq_rows"]) - - # Test move files to finished - ti = env.run_task(telescope.move_files_to_finished.__name__) - self.assertEqual(ti.state, State.SUCCESS) - for file in release.sftp_files: - file_name = os.path.basename(file) - in_progress_file = os.path.join(local_sftp_folders.in_progress, file_name) - self.assertFalse(os.path.isfile(in_progress_file)) - - finished_file = os.path.join(local_sftp_folders.finished, file_name) - self.assertTrue(os.path.isfile(finished_file)) - - # Add_dataset_release_task - dataset_releases = get_dataset_releases( - dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id - ) - self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) - self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases( - dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id - ) - self.assertEqual(len(dataset_releases), 1) - - # Test cleanup - ti = env.run_task(telescope.cleanup.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) - - @patch("observatory.platform.airflow.Variable.get") - def test_gb_transform(self, mock_variable_get): - """Test sanity check in transform method when transaction date falls outside release month - - :param mock_variable_get: Mock Airflow Variable 'data' - """ - with CliRunner().isolated_filesystem(): - 
mock_variable_get.return_value = os.path.join(os.getcwd(), "data") - - # Objects to create release instance - telescope = GoogleBooksTelescope( - dag_id="google_books_test", - cloud_workspace=self.fake_cloud_workspace, - sftp_root="/", - sales_partner=partner_from_str("google_books_sales"), - traffic_partner=partner_from_str("google_books_traffic"), - ) - fixtures_folder = test_fixtures_folder(workflow_module="google_books_telescope") - sales_file_path = os.path.join(fixtures_folder, "GoogleSalesTransactionReport_2020_02.csv") - traffic_file_path = os.path.join(fixtures_folder, "GoogleBooksTrafficReport_2020_02.csv") - sftp_files = [ - os.path.join(telescope.sftp_folders.in_progress, os.path.basename(sales_file_path)), - os.path.join(telescope.sftp_folders.in_progress, os.path.basename(traffic_file_path)), - ] - - # test transaction date inside of release month - release = GoogleBooksRelease( - dag_id=telescope.dag_id, - run_id=random_id(), - partition_date=pendulum.parse("2020-02-01"), - sftp_files=sftp_files, - ) - shutil.copy(sales_file_path, os.path.join(release.download_folder, "google_books_sales.csv")) - shutil.copy(traffic_file_path, os.path.join(release.download_folder, "google_books_traffic.csv")) - telescope.transform([release]) - self.assertTrue(os.path.exists(release.transform_sales_path)) - - # test transaction date before release month - release = GoogleBooksRelease( - dag_id=telescope.dag_id, - run_id=random_id(), - partition_date=pendulum.parse("2020-01-31"), - sftp_files=sftp_files, - ) - shutil.copy(sales_file_path, os.path.join(release.download_folder, "google_books_sales.csv")) - shutil.copy(traffic_file_path, os.path.join(release.download_folder, "google_books_traffic.csv")) - with self.assertRaises(AirflowException): - telescope.transform([release]) - - # test transaction date after release month - release = GoogleBooksRelease( - dag_id=telescope.dag_id, - run_id=random_id(), - partition_date=pendulum.parse("2020-03-01"), - sftp_files=sftp_files, - ) - shutil.copy(sales_file_path, os.path.join(release.download_folder, "google_books_sales.csv")) - shutil.copy(traffic_file_path, os.path.join(release.download_folder, "google_books_traffic.csv")) - with self.assertRaises(AirflowException): - telescope.transform([release]) diff --git a/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py b/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py deleted file mode 100644 index 8dcdf7d8..00000000 --- a/oaebu_workflows/irus_fulcrum_telescope/irus_fulcrum_telescope.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright 2022-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
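The irus_fulcrum_telescope module whose removal continues below derives its release window from the Airflow data interval. The following is a small illustrative sketch of that monthly date arithmetic, assuming a DAG that runs early each month for the previous month's usage; the interval values are made up.

import pendulum

# Example run: the data interval covers March 2022
data_interval_start = pendulum.datetime(2022, 3, 4).start_of("month")  # 2022-03-01
data_interval_end = pendulum.datetime(2022, 4, 4).start_of("month")    # 2022-04-01
partition_date = data_interval_start.end_of("month")                   # 2022-03-31

print(data_interval_start.to_date_string(), data_interval_end.to_date_string(), partition_date.to_date_string())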
- -# Author: Keegan Smith - -import logging -import os -from typing import List, Tuple, Union - -import pendulum -from airflow.hooks.base import BaseHook -from google.cloud.bigquery import SourceFormat, WriteDisposition -from google.cloud.bigquery.table import TimePartitioningType - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.platform.files import add_partition_date -from observatory.platform.api import make_observatory_api, DatasetRelease -from observatory.platform.airflow import AirflowConns -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.files import save_jsonl_gz, load_jsonl -from observatory.platform.gcs import gcs_blob_name_from_path, gcs_upload_files, gcs_blob_uri -from observatory.platform.bigquery import bq_load_table, bq_create_dataset, bq_table_id -from observatory.platform.workflows.workflow import ( - Workflow, - PartitionRelease, - cleanup, - set_task_state, - check_workflow_inputs, -) -from observatory.platform.utils.url_utils import retry_get_url - -IRUS_FULCRUM_ENDPOINT_TEMPLATE = ( - "https://irus.jisc.ac.uk/api/v3/irus/reports/irus_ir/?platform=235" - "&requestor_id={requestor_id}&begin_date={start_date}&end_date={end_date}" -) - - -class IrusFulcrumRelease(PartitionRelease): - def __init__( - self, - dag_id: str, - run_id: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - partition_date: pendulum.DateTime, - ): - """Create a IrusFulcrumRelease instance. - - :param dag_id: The ID of the DAG - :param run_id: The airflow run ID - :param data_interval_start: The beginning of the data interval - :param data_interval_end: The end of the data interval - :param partition_date: The release/partition date - """ - super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) - self.data_interval_start = data_interval_start - self.data_interval_end = data_interval_end - self.download_totals_path = os.path.join(self.download_folder, "fulcrum_totals.jsonl.gz") - self.download_country_path = os.path.join(self.download_folder, "fulcrum_country.json.gz") - self.transform_path = os.path.join(self.transform_folder, "fulcrum.json.gz") - - -class IrusFulcrumTelescope(Workflow): - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - publishers: List[str], - data_partner: Union[str, OaebuPartner] = "irus_fulcrum", - bq_dataset_description: str = "IRUS dataset", - bq_table_description: str = None, - api_dataset_id: str = "fulcrum", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - irus_oapen_api_conn_id: str = "irus_api", - catchup: bool = True, - schedule: str = "0 0 4 * *", # Run on the 4th of every month - start_date: pendulum.DateTime = pendulum.datetime(2022, 4, 1), # Earliest available data - ): - """The Fulcrum Telescope - :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param publishers: The publishers pertaining to this DAG instance (as listed in Fulcrum) - :param data_partner: The name of the data partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param irus_oapen_api_conn_id: Airflow connection ID OAPEN IRUS UK (counter 5) - :param catchup: Whether to catchup the DAG or not - :param schedule: The schedule 
interval of the DAG - :param start_date: The start date of the DAG - """ - if bq_table_description is None: - bq_table_description = "Fulcrum metrics as recorded by the IRUS platform" - - super().__init__( - dag_id, - start_date, - schedule, - airflow_conns=[observatory_api_conn_id, irus_oapen_api_conn_id], - catchup=catchup, - tags=["oaebu"], - ) - - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.publishers = publishers - self.data_partner = partner_from_str(data_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.observatory_api_conn_id = observatory_api_conn_id - self.irus_oapen_api_conn_id = irus_oapen_api_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> IrusFulcrumRelease: - """Create a IrusFulcrumRelease instance - Dates are best explained with an example - Say the dag is scheduled to run on 2022-04-07 - Interval_start will be 2022-03-01 - Interval_end will be 2022-04-01 - partition_date will be 2022-03-31 - """ - data_interval_start = kwargs["data_interval_start"].start_of("month") - data_interval_end = kwargs["data_interval_end"].start_of("month") - partition_date = data_interval_start.end_of("month") - return IrusFulcrumRelease( - self.dag_id, - kwargs["run_id"], - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - partition_date=partition_date, - ) - - def download(self, release: IrusFulcrumRelease, **kwargs): - """Task to download the Fulcrum data for a release - - :param releases: the IrusFulcrumRelease instance. 
- """ - requestor_id = BaseHook.get_connection(self.irus_oapen_api_conn_id).login - totals_data, country_data = download_fulcrum_month_data(release.partition_date, requestor_id) - assert totals_data and country_data, f"Data not available for supplied release month: {release.partition_date}" - save_jsonl_gz(release.download_totals_path, totals_data) - save_jsonl_gz(release.download_country_path, country_data) - - def upload_downloaded(self, release: IrusFulcrumRelease, **kwargs): - """Upload the downloaded fulcrum data to the google cloud download bucket""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, - file_paths=[release.download_totals_path, release.download_country_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, release: IrusFulcrumRelease, **kwargs): - """Task to transform the fulcrum data""" - logging.info(f"Transforming the Fulcrum dataset with the following publisher filter: {self.publishers}") - totals_data = load_jsonl(release.download_totals_path) - country_data = load_jsonl(release.download_country_path) - transformed_data = transform_fulcrum_data( - totals_data=totals_data, - country_data=country_data, - publishers=self.publishers, - ) - transformed_data = add_partition_date( - transformed_data, - partition_date=release.partition_date.end_of("month"), - partition_type=TimePartitioningType.MONTH, - partition_field="release_date", - ) - save_jsonl_gz(release.transform_path, transformed_data) - - def upload_transformed(self, release: IrusFulcrumRelease, **kwargs): - """Upload the transformed fulcrum data to the google cloud download bucket""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path] - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, release: IrusFulcrumRelease, **kwargs) -> None: - """Load the transfromed data into bigquery""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.data_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - - # Load each transformed release - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - table_id = bq_table_id( - self.cloud_workspace.project_id, self.data_partner.bq_dataset_id, self.data_partner.bq_table_name - ) - success = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.data_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - table_description=self.bq_table_description, - partition=True, - partition_type=TimePartitioningType.MONTH, - write_disposition=WriteDisposition.WRITE_APPEND, - partition_field="release_date", - ignore_unknown_values=True, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, release: IrusFulcrumRelease, **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - data_interval_start=release.data_interval_start, - data_interval_end=release.data_interval_end, - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, release: IrusFulcrumRelease, **kwargs) -> None: - """Delete all files and folders associated with this 
release.""" - cleanup(self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder) - - -def download_fulcrum_month_data( - download_month: pendulum.DateTime, - requestor_id: str, - num_retries: str = 3, -) -> Tuple[List[dict], List[dict]]: - """Download Fulcrum data for the release month - - :param download_month: The month to download usage data from - :param requestor_id: The requestor ID - used to access irus platform - :param num_retries: Number of attempts to make for the URL - """ - download_month = download_month.format("YYYY-MM") - base_url = IRUS_FULCRUM_ENDPOINT_TEMPLATE.format( - requestor_id=requestor_id, - start_date=download_month, - end_date=download_month, - ) - country_url = base_url + "&attributes_to_show=Country" - logging.info(f"Downloading Fulcrum metrics for month: {download_month}") - totals_data = retry_get_url(base_url, num_retries=num_retries).json() - country_data = retry_get_url(country_url, num_retries=num_retries).json() - totals_data = totals_data.get("Report_Items") - country_data = country_data.get("Report_Items") - - return totals_data, country_data - - -def transform_fulcrum_data( - totals_data: List[dict], - country_data: List[dict], - publishers: List[str] = None, -) -> List[dict]: - """ - Transforms Fulcrum downloaded "totals" and "country" data. - - :param totals_data: Fulcrum usage data aggregated over all countries - :param country_data: Fulcrum usage data split by country - :param publishers: Fulcrum publishers to retain. If None, use all publishers - """ - # Extract only the publishers related to this organisation name - if publishers: - totals_data = [i for i in totals_data if i["Publisher"] in publishers] - country_data = [i for i in country_data if i["Publisher"] in publishers] - - # Total and Country-granulated results should all have the same item entries and be ordered the same, but we should check anyway - c_ids = [i["IRUS_Item_ID"] for i in country_data] - t_ids = [i["IRUS_Item_ID"] for i in totals_data] - assert len(c_ids) == len(t_ids), "Country entry data is not the same length as total entry data" - - # Mapping the IDs to list elements - c_id_mapping = {entry["IRUS_Item_ID"]: i for (i, entry) in enumerate(country_data)} - t_id_mapping = {entry["IRUS_Item_ID"]: i for (i, entry) in enumerate(totals_data)} - - transformed_data = [] - for t_id, c_id in zip(t_ids, c_ids): - transformed_row = {} - t_entry = totals_data[t_id_mapping[t_id]] - c_entry = country_data[c_id_mapping[c_id]] - - # Metrics with country granulation - country_metrics = [] - for c_metric in c_entry["Performance_Instances"]: # For each country - country_metrics.append( - { - "name": c_metric["Country"]["Country"], - "code": c_metric["Country"]["Country_Code"], - "Total_Item_Investigations": c_metric["Metric_Type_Counts"].get("Total_Item_Investigations"), - "Total_Item_Requests": c_metric["Metric_Type_Counts"].get("Total_Item_Requests"), - "Unique_Item_Investigations": c_metric["Metric_Type_Counts"].get("Unique_Item_Investigations"), - "Unique_Item_Requests": c_metric["Metric_Type_Counts"].get("Unique_Item_Requests"), - } - ) - - # Total Metrics - t_metric = t_entry["Performance_Instances"][0] - total_item_investigations = t_metric["Metric_Type_Counts"].get("Total_Item_Investigations") - total_item_requests = t_metric["Metric_Type_Counts"].get("Total_Item_Requests") - unique_item_investigations = t_metric["Metric_Type_Counts"].get("Unique_Item_Investigations") - unique_item_requests = 
t_metric["Metric_Type_Counts"].get("Unique_Item_Requests") - - # Row structure - transformed_row = { - "proprietary_id": t_id, # t_id == c_id - "ISBN": t_entry.get("ISBN"), - "book_title": t_entry.get("Item"), - "publisher": t_entry.get("Publisher"), - "authors": t_entry.get("Authors"), - "event_month": pendulum.parse(t_entry["Performance_Instances"][0]["Event_Month"]).format("YYYY-MM"), - "total_item_investigations": total_item_investigations, - "total_item_requests": total_item_requests, - "unique_item_investigations": unique_item_investigations, - "unique_item_requests": unique_item_requests, - "country": country_metrics, - } - transformed_data.append(transformed_row) - - return transformed_data diff --git a/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py b/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py deleted file mode 100644 index 6a710de6..00000000 --- a/oaebu_workflows/irus_oapen_telescope/irus_oapen_telescope.py +++ /dev/null @@ -1,583 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Aniek Roelofs - -import gzip -import json -import logging -import os -import time -from typing import Dict, List, Optional, Tuple, Union - -import pendulum -import requests -from airflow.exceptions import AirflowException, AirflowSkipException -from airflow.hooks.base import BaseHook -from google.auth import environment_vars -from google.auth.transport.requests import AuthorizedSession -from google.cloud.bigquery import TimePartitioningType, SourceFormat, WriteDisposition -from google.oauth2.service_account import IDTokenCredentials -from googleapiclient.discovery import Resource, build -from googleapiclient.errors import HttpError -from oauth2client.service_account import ServiceAccountCredentials - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import get_file_hash, save_jsonl_gz -from observatory.platform.files import add_partition_date -from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.workflows.workflow import ( - PartitionRelease, - Workflow, - cleanup, - set_task_state, - check_workflow_inputs, -) -from observatory.platform.gcs import ( - gcs_copy_blob, - gcs_create_bucket, - gcs_download_blob, - gcs_upload_file, - gcs_upload_files, - gcs_blob_uri, - gcs_blob_name_from_path, -) - - -class IrusOapenRelease(PartitionRelease): - def __init__( - self, - dag_id: str, - run_id: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - partition_date: pendulum.DateTime, - ): - """Create a IrusOapenRelease instance. 
- - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param partition_date: The date of the partition/release - """ - super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) - self.data_interval_start = data_interval_start - self.data_interval_end = data_interval_end - self.download_path = os.path.join(self.download_folder, "irus_oapen.jsonl.gz") - self.transform_path = os.path.join(self.transform_folder, "irus_oapen.jsonl.gz") - self.blob_name = gcs_blob_name_from_path( - os.path.join(self.download_folder, f'{self.partition_date.format("YYYY_MM")}.jsonl.gz') - ) - self.cloud_function_path = os.path.join(self.download_folder, "oapen_cloud_function.zip") - - -class IrusOapenTelescope(Workflow): - OAPEN_PROJECT_ID = "oapen-usage-data-gdpr-proof" # The oapen project id. - OAPEN_BUCKET = f"{OAPEN_PROJECT_ID}_cloud-function" # Storage bucket with the source code - FUNCTION_NAME = "oapen-access-stats" # Name of the google cloud function - FUNCTION_REGION = "europe-west1" # Region of the google cloud function - FUNCTION_SOURCE_URL = ( - "https://github.com/The-Academic-Observatory/oapen-irus-uk-cloud-function/releases/" - "download/v1.1.9/oapen-irus-uk-cloud-function.zip" - ) # URL to the zipped source code of the cloud function - FUNCTION_MD5_HASH = "946bb4d7ca229b15aba36ad7b5ed56d0" # MD5 hash of the zipped source code - FUNCTION_BLOB_NAME = "cloud_function_source_code.zip" # blob name of zipped source code - FUNCTION_TIMEOUT = 1500 # Timeout of cloud function in seconds. Maximum of 60 minutes, - # see https://cloud.google.com/functions/docs/2nd-gen/overview#enhanced_infrastructure - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - publisher_name_v4: str, - publisher_uuid_v5: str, - data_partner: Union[str, OaebuPartner] = "irus_oapen", - bq_dataset_description: str = "IRUS dataset", - bq_table_description: str = None, - api_dataset_id: str = "oapen", - max_cloud_function_instances: int = 0, - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - geoip_license_conn_id: str = "geoip_license_key", - irus_oapen_api_conn_id: str = "irus_api", - irus_oapen_login_conn_id: str = "irus_login", - catchup: bool = True, - start_date: pendulum.DateTime = pendulum.datetime(2015, 6, 1), - schedule: str = "0 0 4 * *", # Run on the 4th of every month - max_active_runs: int = 5, - ): - """The OAPEN irus uk telescope. 
- :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param publisher_name_v4: The publisher's name for version 4 - :param publisher_uuid_v5: The publisher's uuid for version 5 - :param data_partner: The data partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param max_cloud_function_instances: - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param geoip_license_conn_id: The Airflow connection ID for the GEOIP license - :param irus_oapen_api_conn_id: The Airflow connection ID for IRUS API - for counter 5 - :param irus_oapen_login_conn_id: The Airflow connection ID for IRUS API (login) - for counter 4 - :param catchup: Whether to catchup the DAG or not - :param start_date: The start date of the DAG - :param schedule: The schedule interval of the DAG - :param max_active_runs: The maximum number of concurrent DAG instances - """ - if bq_table_description is None: - bq_table_description = "OAPEN metrics as recorded by the IRUS platform" - - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - airflow_conns=[ - observatory_api_conn_id, - geoip_license_conn_id, - irus_oapen_api_conn_id, - irus_oapen_login_conn_id, - ], - max_active_runs=max_active_runs, - tags=["oaebu"], - ) - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.publisher_name_v4 = publisher_name_v4 - self.publisher_uuid_v5 = publisher_uuid_v5 - self.data_partner = partner_from_str(data_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.max_cloud_function_instances = max_cloud_function_instances - self.observatory_api_conn_id = observatory_api_conn_id - self.geoip_license_conn_id = geoip_license_conn_id - self.irus_oapen_api_conn_id = irus_oapen_api_conn_id - self.irus_oapen_login_conn_id = irus_oapen_login_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - # create PythonOperator with task concurrency of 1, so tasks to create cloud function never run in parallel - self.add_task(self.create_cloud_function, task_concurrency=1) - self.add_task(self.call_cloud_function) - self.add_task(self.transfer) - self.add_task(self.download_transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> List[IrusOapenRelease]: - """Create a list of IrusOapenRelease instances for a given month. - Say the dag is scheduled to run on 2022-04-07 - Interval_start will be 2022-03-01 - Interval_end will be 2022-04-01 - partition_date will be 2022-03-31 - - :param kwargs: the context passed from the PythonOperator. 
- See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: list of IrusOapenRelease instances - """ - # Get release_date - data_interval_start = kwargs["data_interval_start"].start_of("month") - data_interval_end = kwargs["data_interval_end"].start_of("month") - partition_date = data_interval_start.end_of("month") - - logging.info(f"Release/partition date: {partition_date}") - releases = [ - IrusOapenRelease( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - partition_date=partition_date, - ) - ] - return releases - - def transfer(self, releases: List[IrusOapenRelease], **kwargs): - """Task to transfer the file for each release. - - :param releases: the list of IrusOapenRelease instances. - """ - for release in releases: - success = gcs_copy_blob( - blob_name=release.blob_name, - src_bucket=IrusOapenTelescope.OAPEN_BUCKET, - dst_bucket=self.cloud_workspace.download_bucket, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def download_transform(self, releases: List[IrusOapenRelease], **kwargs): - """Task to download the access stats to a local file for each release.""" - for release in releases: - success = gcs_download_blob( - bucket_name=self.cloud_workspace.download_bucket, - blob_name=release.blob_name, - file_path=release.download_path, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - # Read gzipped data and create list of dicts - with gzip.open(release.download_path, "r") as f: - results = [json.loads(line) for line in f] - - # Add partition date - results = add_partition_date( - results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date" - ) - - # Write list into gzipped JSON Lines file - save_jsonl_gz(release.transform_path, results) - - def create_cloud_function(self, releases: List[IrusOapenRelease], **kwargs): - """Task to create the cloud function for each release.""" - for release in releases: - # set up cloud function variables - oapen_project_id = IrusOapenTelescope.OAPEN_PROJECT_ID - source_bucket = IrusOapenTelescope.OAPEN_BUCKET - function_name = IrusOapenTelescope.FUNCTION_NAME - function_region = IrusOapenTelescope.FUNCTION_REGION - function_source_url = IrusOapenTelescope.FUNCTION_SOURCE_URL - function_blob_name = IrusOapenTelescope.FUNCTION_BLOB_NAME - location = f"projects/{oapen_project_id}/locations/{function_region}" - full_name = f"{location}/functions/{function_name}" - - # zip source code and upload to bucket - success, upload = upload_source_code_to_bucket( - source_url=function_source_url, - project_id=oapen_project_id, - bucket_name=source_bucket, - blob_name=function_blob_name, - cloud_function_path=release.cloud_function_path, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - # initialise cloud functions api - creds = ServiceAccountCredentials.from_json_keyfile_name(os.environ.get(environment_vars.CREDENTIALS)) - service = build( - "cloudfunctions", "v2beta", credentials=creds, cache_discovery=False, static_discovery=False - ) - - # update or create cloud function - exists = cloud_function_exists(service, full_name) - if not exists or upload is True: - update = True if exists else False - success, msg = create_cloud_function( - service, - location, - full_name, - source_bucket, - function_blob_name, - self.max_cloud_function_instances, - update, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - 
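For reference, a minimal sketch of the date arithmetic described in the make_release docstring above, using the example values it gives and assuming only pendulum; the variable names are illustrative:

import pendulum

# For a monthly schedule, a run launched on 2022-04-07 covers March 2022
data_interval_start = pendulum.datetime(2022, 3, 1).start_of("month")  # 2022-03-01
data_interval_end = pendulum.datetime(2022, 4, 1).start_of("month")    # 2022-04-01
partition_date = data_interval_start.end_of("month")                   # 2022-03-31

assert partition_date.format("YYYY-MM-DD") == "2022-03-31"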
logging.info(f"Creating or patching cloud function successful, response: {msg}") - else: - logging.info(f"Using existing cloud function, source code has not changed.") - - def call_cloud_function(self, releases: List[IrusOapenRelease], **kwargs): - """Task to call the cloud function for each release.""" - for release in releases: - # set up cloud function variables - oapen_project_id = IrusOapenTelescope.OAPEN_PROJECT_ID - source_bucket = IrusOapenTelescope.OAPEN_BUCKET - function_name = IrusOapenTelescope.FUNCTION_NAME - function_region = IrusOapenTelescope.FUNCTION_REGION - location = f"projects/{oapen_project_id}/locations/{function_region}" - full_name = f"{location}/functions/{function_name}" - geoip_license_key = BaseHook.get_connection(self.geoip_license_conn_id).password - - # get the publisher_uuid or publisher_id, both are set to empty strings when publisher id is 'oapen' - if release.partition_date >= pendulum.datetime(2020, 4, 1): - airflow_conn = self.irus_oapen_api_conn_id - else: - airflow_conn = self.irus_oapen_login_conn_id - username = BaseHook.get_connection(airflow_conn).login - password = BaseHook.get_connection(airflow_conn).password - - # initialise cloud functions api - creds = ServiceAccountCredentials.from_json_keyfile_name(os.environ.get(environment_vars.CREDENTIALS)) - service = build( - "cloudfunctions", "v2beta", credentials=creds, cache_discovery=False, static_discovery=False - ) - - # Get cloud function uri - function_uri = cloud_function_exists(service, full_name) - - call_cloud_function( - function_uri, - release.partition_date.format("YYYY-MM"), - username, - password, - geoip_license_key, - self.publisher_name_v4, - self.publisher_uuid_v5, - source_bucket, - release.blob_name, - ) - - def upload_transformed(self, releases: List[IrusOapenRelease], **kwargs) -> None: - """Uploads the transformed files to GCS for each release""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, - file_paths=[release.transform_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, releases: List[IrusOapenRelease], **kwargs) -> None: - """Loads the sales and traffic data into BigQuery""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.data_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - for release in releases: - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - table_id = bq_table_id( - self.cloud_workspace.project_id, self.data_partner.bq_dataset_id, self.data_partner.bq_table_name - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.data_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - partition_type=TimePartitioningType.MONTH, - partition=True, - partition_field="release_date", - write_disposition=WriteDisposition.WRITE_APPEND, - table_description=self.bq_table_description, - ignore_unknown_values=True, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, releases: List[IrusOapenRelease], **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - for release in releases: - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - 
data_interval_start=release.data_interval_start, - data_interval_end=release.data_interval_end, - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, releases: List[IrusOapenRelease], **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - for release in releases: - cleanup( - dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder - ) - - -def upload_source_code_to_bucket( - source_url: str, project_id: str, bucket_name: str, blob_name: str, cloud_function_path: str -) -> Tuple[bool, bool]: - """Upload source code of cloud function to storage bucket - - :param source_url: The url to the zip file with source code - :param project_id: The project id with the bucket - :param bucket_name: The bucket name - :param blob_name: The blob name - :param cloud_function_path: The local path to the cloud function - :return: Whether task was successful and whether file was uploaded - """ - - # Get zip file with source code from github release - response = requests.get(source_url) - with open(cloud_function_path, "wb") as f: - f.write(response.content) - - # Check if current md5 hash matches expected md5 hash - expected_md5_hash = IrusOapenTelescope.FUNCTION_MD5_HASH - actual_md5_hash = get_file_hash(file_path=cloud_function_path, algorithm="md5") - if expected_md5_hash != actual_md5_hash: - raise AirflowException(f"md5 hashes do not match, expected: {expected_md5_hash}, actual: {actual_md5_hash}") - - # Create storage bucket - gcs_create_bucket(bucket_name=bucket_name, location="EU", project_id=project_id, lifecycle_delete_age=1) - - # upload zip to cloud storage - success, upload = gcs_upload_file( - bucket_name=bucket_name, blob_name=blob_name, file_path=cloud_function_path, project_id=project_id - ) - return success, upload - - -def cloud_function_exists(service: Resource, full_name: str) -> Optional[str]: - """Check if cloud function with a given name already exists - - :param service: Cloud function service - :param full_name: Name of the cloud function - :return: URI if cloud function exists, else None - """ - try: - response = service.projects().locations().functions().get(name=full_name).execute() - uri = response["serviceConfig"]["uri"] - except HttpError: - return None - return uri - - -def create_cloud_function( - service: Resource, - location: str, - full_name: str, - source_bucket: str, - blob_name: str, - max_active_runs: int, - update: bool, -) -> Tuple[bool, dict]: - """Create cloud function. 
- - :param service: Cloud function service - :param location: Location of the cloud function - :param full_name: Name of the cloud function - :param source_bucket: Name of bucket where the source code is stored - :param blob_name: Blob name of source code inside bucket - :param max_active_runs: The limit on the maximum number of function instances that may coexist at a given time - :param update: Whether a new function is created or an existing one is updated - :return: Status of the cloud function and error/success message - """ - body = { - "name": full_name, - "environment": "GEN_2", - "description": "Pulls oapen irus uk data and replaces ip addresses with city and country info.", - "buildConfig": { - "runtime": "python39", - "entryPoint": "download", - "source": {"storageSource": {"bucket": source_bucket, "object": blob_name}}, - }, - "serviceConfig": { - "timeoutSeconds": IrusOapenTelescope.FUNCTION_TIMEOUT, - "availableMemory": "4096M", - "maxInstanceCount": max_active_runs, - "allTrafficOnLatestRevision": True, - }, - } - if update: - update_mask = ",".join(body.keys()) - response = ( - service.projects() - .locations() - .functions() - .patch(name=full_name, updateMask=update_mask, body=body) - .execute() - ) - logging.info(f"Patching cloud function, response: {response}") - else: - response = ( - service.projects() - .locations() - .functions() - .create(parent=location, functionId=IrusOapenTelescope.FUNCTION_NAME, body=body) - .execute() - ) - logging.info(f"Creating cloud function, response: {response}") - - operation_name = response.get("name") - done = response.get("done") - while not done: - time.sleep(10) - response = service.projects().locations().operations().get(name=operation_name).execute() - done = response.get("done") - - error = response.get("error") - response = response.get("response") - if response: - msg = response - success = True - else: - msg = error - success = False - - return success, msg - - -def call_cloud_function( - function_uri: str, - release_date: str, - username: str, - password: str, - geoip_license_key: str, - publisher_name_v4: str, - publisher_uuid_v5: str, - bucket_name: str, - blob_name: str, -) -> None: - """Iteratively call cloud function, until it has finished processing all publishers. - When a publisher name/uuid is given, there is only 1 publisher, if it is empty the cloud function will process - all available publishers. In that case, when the data is downloaded from the new platform it can be done in 1 - iteration, however for the old platform two files have to be downloaded separately for each publisher, - this might take longer than the timeout time of the cloud function, so the process is split up in multiple calls. 
- - :param function_uri: URI of the cloud function - :param release_date: The release date in YYYY-MM - :param username: Oapen username (email or requestor_id) - :param password: Oapen password (password or api_key) - :param geoip_license_key: License key of geoip database - :param publisher_name_v4: URL encoded name of the publisher (used for counter version 4) - :param publisher_uuid_v5: UUID of the publisher (used for counter version 5) - :param bucket_name: Name of the bucket to store oapen access stats data - :param blob_name: Blob name to store oapen access stats data - """ - creds = IDTokenCredentials.from_service_account_file( - os.environ.get(environment_vars.CREDENTIALS), target_audience=function_uri - ) - authed_session = AuthorizedSession(creds) - data = { - "release_date": release_date, - "username": username, - "password": password, - "geoip_license_key": geoip_license_key, - "publisher_name_v4": publisher_name_v4, - "publisher_uuid_v5": publisher_uuid_v5, - "bucket_name": bucket_name, - "blob_name": blob_name, - } - finished = False - while not finished: - response = authed_session.post( - function_uri, - data=json.dumps(data), - headers={"Content-Type": "application/json"}, - timeout=IrusOapenTelescope.FUNCTION_TIMEOUT, - ) - logging.info(f"Call cloud function response status code: {response.status_code}, reason: {response.reason}") - if response.status_code != 200: - raise AirflowException("Cloud function unsuccessful") - - response_json = response.json() - if response_json["unprocessed_publishers"]: - data["unprocessed_publishers"] = response_json["unprocessed_publishers"] - remaining_publishers = len(response_json["unprocessed_publishers"]) - else: - finished = True - remaining_publishers = 0 - - entries = response_json["entries"] - if entries == 0 and remaining_publishers == 0: - raise AirflowSkipException("No access stats entries for publisher(s) in month.") - - logging.info(f"Processed {entries} entries in total. {remaining_publishers} publishers " f"left to process") diff --git a/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py b/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py deleted file mode 100644 index 1b6bbcb1..00000000 --- a/oaebu_workflows/oapen_metadata_telescope/oapen_metadata_telescope.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright 2020-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
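For reference, a simplified sketch of the polling pattern implemented by call_cloud_function above: the same payload is re-posted, carrying forward the unprocessed_publishers returned by each call, until none remain. poll_cloud_function is a hypothetical name, and plain requests stands in for the authorised session used in the deleted code:

import json
import logging

import requests


def poll_cloud_function(function_uri: str, payload: dict, timeout: int = 1500) -> int:
    """Re-post the payload until the cloud function reports no unprocessed publishers."""
    while True:
        response = requests.post(
            function_uri,
            data=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        if response.status_code != 200:
            raise RuntimeError("Cloud function call unsuccessful")
        body = response.json()
        remaining = body.get("unprocessed_publishers") or []
        if not remaining:
            return body["entries"]  # entries processed, as reported by the function
        payload["unprocessed_publishers"] = remaining
        logging.info(f"{len(remaining)} publishers left to process")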
- -# Author: Aniek Roelofs, Keegan Smith - -from __future__ import annotations - -import logging -import os -import requests -import xmltodict -from xml.parsers.expat import ExpatError -from typing import Union - -import pendulum -from airflow.exceptions import AirflowException -from google.cloud.bigquery import SourceFormat -from tenacity import ( - retry, - stop_after_attempt, - wait_chain, - wait_fixed, - retry_if_exception_type, -) - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from oaebu_workflows.config import schema_folder -from oaebu_workflows.onix_utils import OnixTransformer -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.utils.url_utils import get_user_agent -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.gcs import ( - gcs_upload_files, - gcs_blob_uri, - gcs_blob_name_from_path, -) -from observatory.platform.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset -from observatory.platform.workflows.workflow import ( - SnapshotRelease, - Workflow, - make_snapshot_date, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -# Download job will wait 120 seconds between first 2 attempts, then 30 minutes for the following 3 -DOWNLOAD_RETRY_CHAIN = wait_chain(*[wait_fixed(120) for _ in range(2)] + [wait_fixed(1800) for _ in range(3)]) - - -class OapenMetadataRelease(SnapshotRelease): - def __init__(self, dag_id: str, run_id: str, snapshot_date: pendulum.DateTime): - """Construct a OapenMetadataRelease instance - - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param snapshot_date: The date of the snapshot_date/release - """ - super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) - self.download_path = os.path.join(self.download_folder, f"metadata_{snapshot_date.format('YYYYMMDD')}.xml") - self.transform_path = os.path.join(self.transform_folder, "transformed.jsonl.gz") # Final onix file - - @property - def transform_files(self): - files = os.listdir(self.transform_folder) - return [os.path.join(self.transform_folder, f) for f in files] - - -class OapenMetadataTelescope(Workflow): - """Oapen Metadata Telescope""" - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - metadata_uri: str, - metadata_partner: Union[str, OaebuPartner] = "oapen_metadata", - elevate_related_products: bool = False, - bq_dataset_id: str = "onix", - bq_table_name: str = "onix", - bq_dataset_description: str = "OAPEN Metadata converted to ONIX", - bq_table_description: str = None, - api_dataset_id: str = "oapen", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - catchup: bool = False, - start_date: pendulum.DateTime = pendulum.datetime(2018, 5, 14), - schedule: str = "0 12 * * Sun", # Midday every sunday - ): - """Construct a OapenMetadataTelescope instance. - :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param metadata_uri: The URI of the metadata XML file - :param metadata_partner: The metadata partner name - :param elevate_related_products: Whether to pull out the related products to the product level. 
- :param bq_dataset_id: The BigQuery dataset ID - :param bq_table_name: The BigQuery table name - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param catchup: Whether to catchup the DAG or not - :param start_date: The start date of the DAG - :param schedule: The schedule interval of the DAG - """ - super().__init__( - dag_id, - start_date, - schedule, - airflow_conns=[observatory_api_conn_id], - catchup=catchup, - tags=["oaebu"], - ) - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.metadata_uri = metadata_uri - self.metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) - self.elevate_related_products = elevate_related_products - self.bq_dataset_id = bq_dataset_id - self.bq_table_name = bq_table_name - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.observatory_api_conn_id = observatory_api_conn_id - - # Fixture file paths - self.oapen_schema = os.path.join( - schema_folder(workflow_module="oapen_metadata_telescope"), "oapen_metadata_filter.json" - ) - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> OapenMetadataRelease: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. - - :param kwargs: the context passed from the PythonOperator. - See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: The Oapen metadata release instance""" - snapshot_date = make_snapshot_date(**kwargs) - release = OapenMetadataRelease(self.dag_id, kwargs["run_id"], snapshot_date) - return release - - def download(self, release: OapenMetadataRelease, **kwargs) -> None: - """Task to download the OapenMetadataRelease release. - - :param kwargs: the context passed from the PythonOperator. - :param release: an OapenMetadataRelease instance. 
- """ - logging.info(f"Downloading metadata XML from url: {self.metadata_uri}") - download_metadata(self.metadata_uri, release.download_path) - - def upload_downloaded(self, release: OapenMetadataRelease, **kwargs) -> None: - """Task to upload the downloaded OAPEN metadata""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, - file_paths=[release.download_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, release: OapenMetadataRelease, **kwargs) -> None: - """Transform the oapen metadata XML file into a valid ONIX file""" - # Parse the downloaded metadata through the schema to extract relevant fields only - transformer = OnixTransformer( - input_path=release.download_path, - output_dir=release.transform_folder, - filter_products=True, - error_removal=True, - normalise_related_products=True, - deduplicate_related_products=True, - elevate_related_products=self.elevate_related_products, - add_name_fields=True, - collapse_subjects=True, - filter_schema=self.oapen_schema, - keep_intermediate=True, - ) - out_file = transformer.transform() - if release.transform_path != out_file: - raise FileNotFoundError(f"Expected file {release.transform_path} not equal to transformed file: {out_file}") - - def upload_transformed(self, release: OapenMetadataRelease, **kwargs) -> None: - """Task to upload the transformed OAPEN metadata""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, - file_paths=release.transform_files, - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, release: OapenMetadataRelease, **kwargs) -> None: - """Load the transformed ONIX file into bigquery""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.metadata_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - uri = gcs_blob_uri( - self.cloud_workspace.transform_bucket, - gcs_blob_name_from_path(release.transform_path), - ) - table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.snapshot_date, - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.metadata_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - table_description=self.bq_table_description, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, release: OapenMetadataRelease, **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - snapshot_date=release.snapshot_date, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, release: OapenMetadataRelease, **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - cleanup( - dag_id=self.dag_id, - execution_date=kwargs["execution_date"], - workflow_folder=release.workflow_folder, - ) - - -@retry( - stop=stop_after_attempt(5), - wait=DOWNLOAD_RETRY_CHAIN, - retry=retry_if_exception_type((ExpatError, ConnectionError, AirflowException)), - reraise=True, -) -def download_metadata(uri: str, download_path: str) -> None: - 
"""Downloads the OAPEN metadata XML file - OAPEN's downloader can give an incomplete file if the metadata is partially generated. - In this scenario, we should wait until the metadata generator has finished. - Otherwise, an attempt to parse the data will result in an XML ParseError. - Another scenario is that OAPEN returns only a header in the XML. We want this to also raise an error. - OAPEN metadata generation can take over an hour - - :param uri: the url to query for the metadata - :param download_path: filepath to store te downloaded file - :raises ConnectionError: raised if the response from the metadata server does not have code 200 - :raises AirflowException: raised if the response does not contain any Product fields - """ - headers = {"User-Agent": f"{get_user_agent(package_name='oaebu_workflows')}"} - response = requests.get(uri, headers=headers) - if response.status_code != 200: - raise ConnectionError(f"Expected status code 200 from url {uri}, instead got response: {response.text}") - with open(download_path, "w") as f: - f.write(response.content.decode("utf-8")) - logging.info(f"Successfully downloadeded XML to {download_path}") - - # Attempt to parse the XML, will raise an ExpatError if it's invalid - with open(download_path, "rb") as f: - xmltodict.parse(f) - logging.info("XML file is valid") - - # Check that more than just the header is returned - if "" not in response.content.decode("utf-8"): - raise AirflowException("No products found in metadata") diff --git a/oaebu_workflows/onix_telescope/onix_telescope.py b/oaebu_workflows/onix_telescope/onix_telescope.py deleted file mode 100644 index 67ef27d4..00000000 --- a/oaebu_workflows/onix_telescope/onix_telescope.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright 2021-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Author: James Diprose, Keegan Smith - -import logging -import os -import re -from typing import List, Union - -import pendulum -from airflow.exceptions import AirflowException -from airflow.models.taskinstance import TaskInstance -from google.cloud.bigquery import SourceFormat - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from oaebu_workflows.onix_utils import collapse_subjects, onix_parser_download, onix_parser_execute -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import load_jsonl, save_jsonl_gz -from observatory.platform.gcs import gcs_blob_uri, gcs_upload_files, gcs_blob_name_from_path -from observatory.platform.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.sftp import SftpFolders, make_sftp_connection -from observatory.platform.workflows.workflow import ( - SnapshotRelease, - Workflow, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -class OnixRelease(SnapshotRelease): - def __init__( - self, - *, - dag_id: str, - run_id: str, - snapshot_date: pendulum.DateTime, - onix_file_name: str, - ): - """Construct an OnixRelease. - - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param snapshot_date: The date of the snapshot/release - :param onix_file_name: The ONIX file name. - """ - super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) - self.onix_file_name = onix_file_name - self.download_path = os.path.join(self.download_folder, self.onix_file_name) - self.parsed_path = os.path.join(self.transform_folder, "full.jsonl") - self.transform_path = os.path.join(self.transform_folder, "onix.jsonl.gz") - - -class OnixTelescope(Workflow): - def __init__( - self, - *, - dag_id: str, - cloud_workspace: CloudWorkspace, - date_regex: str, - sftp_root: str = "/", - metadata_partner: Union[str, OaebuPartner] = "onix", - bq_dataset_description: str = "ONIX data provided by Org", - bq_table_description: str = None, - api_dataset_id: str = "onix", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - sftp_service_conn_id: str = "sftp_service", - catchup: bool = False, - schedule: str = "@weekly", - start_date: pendulum.DateTime = pendulum.datetime(2021, 3, 28), - ): - """Construct an OnixTelescope instance. 
- :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param sftp_root: The working root of the SFTP server, passed to the SftoFolders class - :param metadata_partner: The metadata partner name - :param date_regex: Regular expression for extracting a date string from an ONIX file name - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param sftp_service_conn_id: Airflow connection ID for the SFTP service - :param catchup: Whether to catchup the DAG or not - :param schedule: The schedule interval of the DAG - :param start_date: The start date of the DAG - """ - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - airflow_conns=[observatory_api_conn_id, sftp_service_conn_id], - tags=["oaebu"], - ) - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.sftp_root = sftp_root - self.metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) - self.date_regex = date_regex - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.observatory_api_conn_id = observatory_api_conn_id - self.sftp_service_conn_id = sftp_service_conn_id - self.sftp_folders = SftpFolders(dag_id, sftp_conn_id=sftp_service_conn_id, sftp_root=sftp_root) - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_setup_task(self.list_release_info) - self.add_task(self.move_files_to_in_progress) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.move_files_to_finished) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def list_release_info(self, **kwargs): - """Lists all ONIX releases and publishes their file names as an XCom. - - :param kwargs: the context passed from the BranchPythonOperator. - See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: the identifier of the task to execute next. - """ - - # List release dates - release_info = [] - with make_sftp_connection(self.sftp_service_conn_id) as sftp: - files = sftp.listdir(self.sftp_folders.upload) - for file_name in files: - if re.match(r"^.*\.(onx|xml)$", file_name): - try: - date_str = re.search(self.date_regex, file_name).group(0) - except AttributeError: - msg = f"Could not find date with pattern `{self.date_regex}` in file name {file_name}" - logging.error(msg) - raise AirflowException(msg) - release_info.append({"release_date": date_str, "file_name": file_name}) - - # Publish XCom - continue_dag = len(release_info) - if continue_dag: - ti: TaskInstance = kwargs["ti"] - execution_date = kwargs["execution_date"] - ti.xcom_push(OnixTelescope.RELEASE_INFO, release_info, execution_date) - - return continue_dag - - def make_release(self, **kwargs) -> List[OnixRelease]: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. - - :return: a list of Onix release instances. 
- """ - - ti: TaskInstance = kwargs["ti"] - records = ti.xcom_pull( - key=OnixTelescope.RELEASE_INFO, task_ids=self.list_release_info.__name__, include_prior_dates=False - ) - releases = [] - for record in records: - onix_file_name = record["file_name"] - releases.append( - OnixRelease( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - snapshot_date=pendulum.parse(record["release_date"]), - onix_file_name=onix_file_name, - ) - ) - return releases - - def move_files_to_in_progress(self, releases: List[OnixRelease], **kwargs): - """Move ONIX files to SFTP in-progress folder. - :param releases: a list of Onix release instances""" - self.sftp_folders.move_files_to_in_progress([release.onix_file_name for release in releases]) - - def download(self, releases: List[OnixRelease], **kwargs): - """Task to download the ONIX releases.""" - with make_sftp_connection(self.sftp_service_conn_id) as sftp: - for release in releases: - in_progress_file = os.path.join(self.sftp_folders.in_progress, release.onix_file_name) - sftp.get(in_progress_file, localpath=release.download_path) - - def upload_downloaded(self, releases: List[OnixRelease], **kwargs): - """Uploads the downloaded onix file to GCS""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, file_paths=[release.download_path] - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, releases: List[OnixRelease], **kwargs): - """Task to transform the ONIX releases.""" - - success, parser_path = onix_parser_download() - set_task_state(success, kwargs["ti"].task_id) - for release in releases: - onix_parser_execute( - parser_path=parser_path, input_dir=release.download_folder, output_dir=release.transform_folder - ) - onix = collapse_subjects(load_jsonl(release.parsed_path)) - save_jsonl_gz(release.transform_path, onix) - - def upload_transformed(self, releases: List[OnixRelease], **kwargs): - """Uploads the transformed file to GCS""" - for release in releases: - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path] - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, releases: List[OnixRelease], **kwargs): - """Task to load each transformed release to BigQuery.""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.metadata_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - # Load each transformed release - for release in releases: - table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.snapshot_date, - ) - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.metadata_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - table_description=self.bq_table_description, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def move_files_to_finished(self, releases: List[OnixRelease], **kwargs): - """Move ONIX files to SFTP finished folder.""" - self.sftp_folders.move_files_to_finished([release.onix_file_name for release in releases]) - - def add_new_dataset_releases(self, releases: List[OnixRelease], **kwargs) -> None: - """Adds release information to API.""" - api = 
make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - for release in releases: - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - snapshot_date=release.snapshot_date, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, releases: List[OnixRelease], **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - for release in releases: - cleanup( - dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder - ) diff --git a/oaebu_workflows/onix_telescope/tests/test_onix_telescope.py b/oaebu_workflows/onix_telescope/tests/test_onix_telescope.py deleted file mode 100644 index f20a12ac..00000000 --- a/oaebu_workflows/onix_telescope/tests/test_onix_telescope.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright 2021-2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: James Diprose - -import os -import shutil - -import pendulum -from airflow.models import Connection -from airflow.utils.state import State - -from oaebu_workflows.onix_telescope.onix_telescope import OnixTelescope, OnixRelease -from oaebu_workflows.oaebu_partners import partner_from_str -from oaebu_workflows.config import test_fixtures_folder -from observatory.platform.api import get_dataset_releases -from observatory.platform.bigquery import bq_sharded_table_id -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.sftp import SftpFolders -from observatory.platform.observatory_config import Workflow -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - SftpServer, - find_free_port, - load_and_parse_json, -) - - -class TestOnixTelescope(ObservatoryTestCase): - """Tests for the ONIX telescope""" - - def __init__(self, *args, **kwargs): - """Constructor which sets up variables used by tests. - - :param args: arguments. - :param kwargs: keyword arguments. 
- """ - - super(TestOnixTelescope, self).__init__(*args, **kwargs) - self.project_id = os.getenv("TEST_GCP_PROJECT_ID") - self.data_location = os.getenv("TEST_GCP_DATA_LOCATION") - self.sftp_root = "/" - self.date_regex = "\\d{8}" - self.sftp_port = find_free_port() - - # Test file - fixtures_folder = test_fixtures_folder(workflow_module="onix_telescope") - self.onix_xml_path = os.path.join(fixtures_folder, "20210330_CURTINPRESS_ONIX.xml") - self.onix_json_path = os.path.join(fixtures_folder, "20210330_CURTINPRESS_ONIX.json") - - def test_dag_structure(self): - """Test that the ONIX DAG has the correct structure.""" - dag = OnixTelescope( - dag_id="onix_telescope", - cloud_workspace=self.fake_cloud_workspace, - sftp_root=self.sftp_root, - date_regex=self.date_regex, - ).make_dag() - self.assert_dag_structure( - { - "check_dependencies": ["list_release_info"], - "list_release_info": ["move_files_to_in_progress"], - "move_files_to_in_progress": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], - "bq_load": ["move_files_to_finished"], - "move_files_to_finished": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], - }, - dag, - ) - - def test_dag_load(self): - """Test that the Geonames DAG can be loaded from a DAG bag.""" - env = ObservatoryEnvironment( - workflows=[ - Workflow( - dag_id="onix", - name="ONIX Telescope", - class_name="oaebu_workflows.onix_telescope.onix_telescope.OnixTelescope", - cloud_workspace=self.fake_cloud_workspace, - kwargs=dict(date_regex=self.date_regex), - ) - ], - ) - with env.create(): - self.assert_dag_load_from_config("onix") - - # Errors should be raised if kwargs dict not supplied - env.workflows[0].kwargs = {} - with env.create(): - with self.assertRaises(AssertionError) as cm: - self.assert_dag_load_from_config("onix") - msg = cm.exception.args[0] - self.assertTrue("missing 1 required keyword-only argument" in msg) - self.assertTrue("date_regex" in msg) - - def test_telescope(self): - """Test the ONIX telescope end to end.""" - # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) - sftp_server = SftpServer(host="localhost", port=self.sftp_port) - dataset_id = env.add_dataset() - - # Create the Observatory environment and run tests - - with env.create(), sftp_server.create() as sftp_root: - # Setup Telescope - execution_date = pendulum.datetime(year=2021, month=3, day=31) - partner = partner_from_str("onix", metadata_partner=True) - partner.bq_dataset_id = dataset_id - telescope = OnixTelescope( - dag_id="onix_telescope_test", - cloud_workspace=env.cloud_workspace, - sftp_root="/", - date_regex=self.date_regex, - metadata_partner=partner, - ) - dag = telescope.make_dag() - - # Release settings - release_date = pendulum.datetime(year=2021, month=3, day=30) - - # Add SFTP connection - conn = Connection(conn_id=telescope.sftp_service_conn_id, uri=f"ssh://:password@localhost:{self.sftp_port}") - env.add_connection(conn) - with env.create_dag_run(dag, execution_date): - # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Add ONIX file to SFTP server - local_sftp_folders = SftpFolders(telescope.dag_id, telescope.sftp_service_conn_id, sftp_root) - os.makedirs(local_sftp_folders.upload, 
exist_ok=True) - onix_file_name = os.path.basename(self.onix_xml_path) - onix_file_dst = os.path.join(local_sftp_folders.upload, onix_file_name) - shutil.copy(self.onix_xml_path, onix_file_dst) - - # Get release info from SFTP server and check that the correct release info is returned via Xcom - ti = env.run_task(telescope.list_release_info.__name__) - self.assertEqual(ti.state, State.SUCCESS) - expected_release_info = [{"release_date": release_date, "file_name": onix_file_name}] - release_info = ti.xcom_pull( - key=OnixTelescope.RELEASE_INFO, - task_ids=telescope.list_release_info.__name__, - include_prior_dates=False, - ) - for release in release_info: - rdate = release["release_date"] - release["release_date"] = pendulum.parse(rdate) - self.assertEqual(expected_release_info, release_info) - - release = OnixRelease( - dag_id=telescope.dag_id, - run_id=env.dag_run.run_id, - snapshot_date=release_date, - onix_file_name=onix_file_name, - ) - - # Test move file to in progress - ti = env.run_task(telescope.move_files_to_in_progress.__name__) - self.assertEqual(ti.state, State.SUCCESS) - in_progress_path = os.path.join(local_sftp_folders.in_progress, release.onix_file_name) - self.assertFalse(os.path.isfile(onix_file_dst)) - self.assertTrue(os.path.isfile(in_progress_path)) - - # Test download - ti = env.run_task(telescope.download.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_file_integrity(release.download_path, "28f85c488ab01b0cff769d9da6b4be24", "md5") - - # Test upload downloaded - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_blob_integrity( - env.download_bucket, gcs_blob_name_from_path(release.download_path), release.download_path - ) - - # Test transform - ti = env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_file_integrity(release.transform_path, "2164a300", "gzip_crc") - - # Test upload to cloud storage - ti = env.run_task(telescope.upload_transformed.__name__) - self.assertEqual(ti.state, State.SUCCESS) - self.assert_blob_integrity( - env.transform_bucket, gcs_blob_name_from_path(release.transform_path), release.transform_path - ) - - # Test load into BigQuery - ti = env.run_task(telescope.bq_load.__name__) - self.assertEqual(ti.state, State.SUCCESS) - table_id = bq_sharded_table_id( - telescope.cloud_workspace.project_id, - telescope.metadata_partner.bq_dataset_id, - telescope.metadata_partner.bq_table_name, - release.snapshot_date, - ) - self.assert_table_integrity(table_id, expected_rows=1) - self.assert_table_content(table_id, load_and_parse_json(self.onix_json_path), primary_key="ISBN13") - - # Test move files to finished - ti = env.run_task(telescope.move_files_to_finished.__name__) - self.assertEqual(ti.state, State.SUCCESS) - finished_path = os.path.join(local_sftp_folders.finished, onix_file_name) - self.assertFalse(os.path.isfile(local_sftp_folders.in_progress)) - self.assertTrue(os.path.isfile(finished_path)) - - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) - self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) - self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) - self.assertEqual(len(dataset_releases), 1) - - # Test cleanup - ti = env.run_task(telescope.cleanup.__name__) - 
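For reference, a small sketch of the filename-to-release-date logic this test exercises, reusing the fixture name and the \d{8} date regex from the deleted telescope; nothing here introduces new API:

import re

import pendulum

file_name = "20210330_CURTINPRESS_ONIX.xml"  # fixture used by the test above
if re.match(r"^.*\.(onx|xml)$", file_name):
    date_str = re.search(r"\d{8}", file_name).group(0)  # "20210330"
    release_date = pendulum.parse(date_str)  # 2021-03-30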
self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) diff --git a/oaebu_workflows/onix_workflow/onix_workflow.py b/oaebu_workflows/onix_workflow/onix_workflow.py deleted file mode 100644 index a039a994..00000000 --- a/oaebu_workflows/onix_workflow/onix_workflow.py +++ /dev/null @@ -1,1287 +0,0 @@ -# Copyright 2020-2024 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -# Author: Tuan Chien, Richard Hosking, Keegan Smith - -import os -from datetime import timedelta -from typing import List, Optional, Tuple, Union, Iterable -import re -import logging -import json -from concurrent.futures import ThreadPoolExecutor, as_completed - -import pendulum -from google.cloud.bigquery import SourceFormat, Client -from ratelimit import limits, sleep_and_retry -from tenacity import wait_exponential_jitter -from jinja2 import Environment, FileSystemLoader -from airflow import DAG -from airflow.models.baseoperator import chain -from airflow.utils.task_group import TaskGroup - -from oaebu_workflows.airflow_pools import CrossrefEventsPool -from oaebu_workflows.config import schema_folder as default_schema_folder -from oaebu_workflows.config import sql_folder -from oaebu_workflows.oaebu_partners import OaebuPartner, DataPartner, partner_from_str -from oaebu_workflows.onix_workflow.onix_work_aggregation import BookWorkAggregator, BookWorkFamilyAggregator -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.utils.dag_run_sensor import DagRunSensor -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import save_jsonl_gz -from observatory.platform.utils.url_utils import get_user_agent, retry_get_url -from observatory.platform.gcs import gcs_upload_files, gcs_blob_uri, gcs_blob_name_from_path -from observatory.platform.utils.jinja2_utils import render_template -from observatory.platform.api import make_observatory_api -from observatory.platform.bigquery import ( - bq_load_table, - bq_table_id, - bq_sharded_table_id, - bq_table_id_parts, - bq_create_dataset, - bq_create_table_from_query, - bq_run_query, - bq_select_table_shard_dates, - bq_copy_table, - bq_find_schema, -) -from observatory.platform.workflows.workflow import ( - SnapshotRelease, - Workflow, - make_snapshot_date, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -CROSSREF_EVENT_URL_TEMPLATE = ( - "https://api.eventdata.crossref.org/v1/events?mailto={mailto}" - "&from-collected-date={start_date}&until-collected-date={end_date}&rows=1000" - "&obj-id={doi}" -) - - -class OnixWorkflowRelease(SnapshotRelease): - """Release information for OnixWorkflow""" - - def __init__( - self, - *, - dag_id: str, - run_id: str, - snapshot_date: pendulum.DateTime, - onix_snapshot_date: pendulum.DateTime, - crossref_master_snapshot_date: pendulum.DateTime, - ): - """ - Construct the OnixWorkflow Release - :param dag_id: DAG ID. 
- :param release_date: The date of the partition/release - :param onix_snapshot_date: The ONIX snapshot/release date. - :param crossref_master_snapshot_date: The release date/suffix of the crossref master table - """ - - super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) - self.onix_snapshot_date = onix_snapshot_date - self.crossref_master_snapshot_date = crossref_master_snapshot_date - - # Files - self.workslookup_path = os.path.join(self.transform_folder, "worksid.jsonl.gz") - self.workslookup_errors_path = os.path.join(self.transform_folder, "worksid_errors.jsonl.gz") - self.worksfamilylookup_path = os.path.join(self.transform_folder, "workfamilyid.jsonl.gz") - self.crossref_metadata_path = os.path.join(self.transform_folder, "crossref_metadata.jsonl.gz") - self.crossref_events_path = os.path.join(self.transform_folder, "crossref_events.jsonl.gz") - - # Generated Schemas - self.book_product_schema_path = os.path.join(self.transform_folder, "book_product_schema.json") - self.author_metrics_schema = os.path.join(self.transform_folder, "author_metrics_schema.json") - self.book_metrics_schema = os.path.join(self.transform_folder, "metrics_books_metrics_schema.json") - self.country_metrics_schema = os.path.join(self.transform_folder, "country_metrics_schema.json") - self.subject_metrics_bic_schema = os.path.join(self.transform_folder, "subject_metrics_bic_schema.json") - self.subject_metrics_bisac_schema = os.path.join(self.transform_folder, "subject_metrics_bisac_schema.json") - self.subject_metrics_thema_schema = os.path.join(self.transform_folder, "subject_metrics_thema_schema.json") - - -class OnixWorkflow(Workflow): - """Onix Workflow Instance""" - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - metadata_partner: Union[str, OaebuPartner], - # Bigquery parameters - bq_master_crossref_project_id: str = "academic-observatory", - bq_master_crossref_dataset_id: str = "crossref_metadata", - bq_oaebu_crossref_dataset_id: str = "crossref", - bq_master_crossref_metadata_table_name: str = "crossref_metadata", - bq_oaebu_crossref_metadata_table_name: str = "crossref_metadata", - bq_crossref_events_table_name: str = "crossref_events", - bq_country_project_id: str = "oaebu-public-data", - bq_country_dataset_id: str = "oaebu_reference", - bq_subject_project_id: str = "oaebu-public-data", - bq_subject_dataset_id: str = "oaebu_reference", - bq_book_table_name: str = "book", - bq_book_product_table_name: str = "book_product", - bq_onix_workflow_dataset: str = "onix_workflow", - bq_oaebu_intermediate_dataset: str = "oaebu_intermediate", - bq_oaebu_dataset: str = "oaebu", - bq_oaebu_export_dataset: str = "data_export", - bq_oaebu_latest_export_dataset: str = "data_export_latest", - bq_worksid_table_name: str = "onix_workid_isbn", - bq_worksid_error_table_name: str = "onix_workid_isbn_errors", - bq_workfamilyid_table_name: str = "onix_workfamilyid_isbn", - bq_dataset_description: str = "ONIX workflow tables", - oaebu_intermediate_match_suffix: str = "_matched", - # Run parameters - data_partners: List[Union[str, OaebuPartner]] = None, - ga3_views_field="page_views", - schema_folder: str = default_schema_folder(workflow_module="onix_workflow"), - mailto: str = "agent@observatory.academy", - crossref_start_date: pendulum.DateTime = pendulum.datetime(2018, 5, 14), - api_dataset_id: str = "onix_workflow", - max_threads: int = 2 * os.cpu_count() - 1, - # Ariflow parameters - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - sensor_dag_ids: 
List[str] = None, - catchup: Optional[bool] = False, - start_date: Optional[pendulum.DateTime] = pendulum.datetime(2022, 8, 1), - schedule: Optional[str] = "@weekly", - ): - """ - Initialises the workflow object. - - :param dag_id: DAG ID. - :param cloud_workspace: The CloudWorkspace object for this DAG - - :param bq_master_crossref_project_id: GCP project ID of crossref master data - :param bq_master_crossref_dataset_id: GCP dataset ID of crossref master data - :param bq_oaebu_crossref_dataset_id: GCP dataset ID of crossref OAeBU data - :param bq_master_crossref_metadata_table_name: The name of the master crossref metadata table - :param bq_oaebu_crossref_metadata_table_name: The name of the OAeBU crossref metadata table - :param bq_crossref_events_table_name: The name of the crossref events table - :param bq_country_project_id: GCP project ID of the country table - :param bq_country_dataset_id: GCP dataset containing the country table - :param bq_subject_project_id: GCP project ID of the subject tables - :param bq_subject_dataset_id: GCP dataset ID of the subject tables - :param bq_book_table_name: The name of the book table - :param bq_book_product_table_name: The name of the book product table - :param bq_onix_workflow_dataset: Onix workflow dataset. - :param bq_oaebu_intermediate_dataset: OAEBU intermediate dataset. - :param bq_oaebu_dataset: OAEBU dataset. - :param bq_oaebu_export_dataset: OAEBU data export dataset. - :param bq_oaebu_latest_export_dataset: OAEBU data export dataset with the latest export tables - :param bq_worksid_table_name: table ID of the worksid table - :param bq_worksid_error_table_name: table ID of the worksid error table - :param bq_workfamilyid_table_name: table ID of the workfamilyid table - :param bq_dataset_description: Description to give to the workflow tables - :param oaebu_intermediate_match_suffix: Suffix to append to intermediate tables - - :param data_partners: OAEBU data sources. - :param ga3_views_field: The name of the GA3 views field - should be either 'page_views' or 'unique_views' - :param schema_folder: the SQL schema path. - :param mailto: email address used to identify the user when sending requests to an API. - :param crossref_start_date: The starting date of crossref's API calls - :param api_dataset_id: The ID to store the dataset release in the API - :param max_threads: The maximum number of threads to use for parallel tasks. - - :param observatory_api_conn_id: The connection ID for the observatory API - :param sensor_dag_ids: Dag IDs for dependent tasks - :param catchup: Whether to catch up missed DAG runs. - :param start_date: Start date of the DAG. - :param schedule: Scheduled interval for running the DAG. 
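For orientation, a minimal sketch of how the constructor documented above might be wired up in a DAG-loading script. This is an illustrative, non-runnable sketch: the partner names and DAG IDs are placeholders, and `make_cloud_workspace()` is an assumed helper standing in for however the observatory platform normally builds a `CloudWorkspace` from its config; none of these values come from this repository.

```python
# Hypothetical wiring of the OnixWorkflow constructor described above.
# make_cloud_workspace() is an assumed helper, not part of this codebase;
# partner names, DAG IDs and sensor IDs are illustrative only.
import pendulum

workflow = OnixWorkflow(
    dag_id="onix_workflow_example_press",
    cloud_workspace=make_cloud_workspace(),  # assumption: built from observatory config
    metadata_partner="onix",  # illustrative metadata partner string
    data_partners=["google_books_sales", "google_books_traffic", "irus_fulcrum"],
    sensor_dag_ids=["onix_example_press", "google_books_example_press"],
    start_date=pendulum.datetime(2022, 8, 1),
    schedule="@weekly",
)
dag = workflow.make_dag()
```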
- """ - - if not sensor_dag_ids: - sensor_dag_ids = [] - - if data_partners is None: - data_partners = list() - - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) - # Bigquery projects, datasets and tables - self.bq_master_crossref_project_id = bq_master_crossref_project_id - self.bq_master_crossref_dataset_id = bq_master_crossref_dataset_id - self.bq_oaebu_crossref_dataset_id = bq_oaebu_crossref_dataset_id - self.bq_master_crossref_metadata_table_name = bq_master_crossref_metadata_table_name - self.bq_oaebu_crossref_metadata_table_name = bq_oaebu_crossref_metadata_table_name - self.bq_crossref_events_table_name = bq_crossref_events_table_name - self.bq_country_project_id = bq_country_project_id - self.bq_country_dataset_id = bq_country_dataset_id - self.bq_subject_project_id = bq_subject_project_id - self.bq_subject_dataset_id = bq_subject_dataset_id - self.bq_book_table_name = bq_book_table_name - self.bq_book_product_table_name = bq_book_product_table_name - self.bq_onix_workflow_dataset = bq_onix_workflow_dataset - self.bq_oaebu_intermediate_dataset = bq_oaebu_intermediate_dataset - self.bq_oaebu_dataset = bq_oaebu_dataset - self.bq_oaebu_export_dataset = bq_oaebu_export_dataset - self.bq_oaebu_latest_export_dataset = bq_oaebu_latest_export_dataset - self.bq_worksid_table_name = bq_worksid_table_name - self.bq_worksid_error_table_name = bq_worksid_error_table_name - self.bq_workfamilyid_table_name = bq_workfamilyid_table_name - self.bq_dataset_description = bq_dataset_description - self.oaebu_intermediate_match_suffix = oaebu_intermediate_match_suffix - # Run parameters - self.data_partners = [partner_from_str(p) for p in data_partners] - self.ga3_views_field = ga3_views_field - self.schema_folder = schema_folder - self.mailto = mailto - self.crossref_start_date = crossref_start_date - self.api_dataset_id = api_dataset_id - self.max_threads = max_threads - # Airflow Parameters - self.observatory_api_conn_id = observatory_api_conn_id - self.sensor_dag_ids = sensor_dag_ids - self.catchup = catchup - self.start_date = start_date - self.schedule = schedule - - # Initialise Telesecope base class - super().__init__( - dag_id=self.dag_id, - start_date=start_date, - schedule=schedule, - catchup=catchup, - airflow_conns=[observatory_api_conn_id], - tags=["oaebu"], - ) - - check_workflow_inputs(self) - - def make_dag(self) -> DAG: - """Construct the DAG""" - - with self.dag: - # DAG Sensors - Check the data partner dag runs are complete - task_sensors = [] - with TaskGroup(group_id="sensors"): - for ext_dag_id in self.sensor_dag_ids: - sensor = DagRunSensor( - task_id=f"{ext_dag_id}_sensor", - external_dag_id=ext_dag_id, - mode="reschedule", - duration=timedelta(days=7), # Look back up to 7 days from execution date - poke_interval=int( - timedelta(hours=1).total_seconds() - ), # Check at this interval if dag run is ready - timeout=int(timedelta(days=2).total_seconds()), # Sensor will fail after 2 days of waiting - ) - task_sensors.append(sensor) - - # Aggregate Works - task_aggregate_works = self.make_python_operator(self.aggregate_works, "aggregate_works") - - # Create crossref metadata and event tables - task_create_crossref_metadata_table = self.make_python_operator( - self.create_crossref_metadata_table, "create_crossref_metadata_table" - ) - # Create pool for crossref API calls (if they don't exist) - # Pools are necessary to throttle the maxiumum number of requests we can make per second and 
avoid 429 errors - crossref_events_pool = CrossrefEventsPool(pool_slots=15) - crossref_events_pool.create_pool() - task_create_crossref_events_table = self.make_python_operator( - self.create_crossref_events_table, - "create_crossref_events_table", - op_kwargs=dict( - pool=crossref_events_pool.pool_name, - pool_slots=min(self.max_threads, crossref_events_pool.pool_slots), - ), - ) - - # Create book table - task_create_book_table = self.make_python_operator(self.create_book_table, "create_book_table") - - # Create OAEBU Intermediate tables for data partners - task_create_intermediate_tables = [] - with TaskGroup(group_id="intermediate_tables"): - for data_partner in self.data_partners: - task_id = f"intermediate_{data_partner.bq_table_name}" - intermediate = self.make_python_operator( - self.create_intermediate_table, - task_id, - op_kwargs=dict( - orig_project_id=self.cloud_workspace.project_id, - orig_dataset=data_partner.bq_dataset_id, - orig_table=data_partner.bq_table_name, - orig_isbn=data_partner.isbn_field_name, - sharded=data_partner.sharded, - ), - ) - task_create_intermediate_tables.append(intermediate) - - # Book product table - task_create_book_product_table = self.make_python_operator( - self.create_book_product_table, "create_book_product_table" - ) - - # Create OAEBU Elastic Export tables - task_create_export_tables = self.create_tasks_export_tables() - - # Create the (non-sharded) copies of the sharded tables - task_update_latest_export_tables = self.make_python_operator( - self.update_latest_export_tables, "update_latest_export_tables" - ) - - # Final tasks - task_add_release = self.make_python_operator(self.add_new_dataset_releases, "add_new_dataset_releases") - task_cleanup = self.make_python_operator(self.cleanup, "cleanup") - - chain( - task_sensors, - task_aggregate_works, - task_create_crossref_metadata_table, - task_create_crossref_events_table, - task_create_book_table, - task_create_intermediate_tables, - task_create_book_product_table, - task_create_export_tables, - task_update_latest_export_tables, - task_add_release, - task_cleanup, - ) - - return self.dag - - def create_tasks_export_tables(self): - """Create tasks for exporting final metrics from our OAEBU data. - These are split into two categories: generic and custom. 
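A note on the `chain(...)` call above: when a list of tasks sits next to a single task, Airflow creates the cross dependencies (fan-in/fan-out), which is how all sensors gate `aggregate_works` and all intermediate tables gate the book product table. A small self-contained sketch with `EmptyOperator` stand-ins, assuming a recent Airflow 2.4+ install; the task IDs are illustrative:

```python
# Minimal demonstration of how chain() wires a list of tasks against single tasks.
import pendulum
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.empty import EmptyOperator

with DAG(dag_id="chain_demo", start_date=pendulum.datetime(2022, 8, 1), schedule=None):
    sensors = [EmptyOperator(task_id=f"sensor_{i}") for i in range(2)]
    aggregate = EmptyOperator(task_id="aggregate_works")
    intermediates = [EmptyOperator(task_id=f"intermediate_{i}") for i in range(3)]
    book_product = EmptyOperator(task_id="create_book_product_table")

    # every sensor -> aggregate -> every intermediate -> book_product
    chain(sensors, aggregate, intermediates, book_product)
```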
- The custom exports change their schema depending on the data partners.""" - - generic_export_tables = [ - { - "output_table": "book_list", - "query_template": os.path.join(sql_folder("onix_workflow"), "book_list.sql.jinja2"), - "schema": os.path.join(default_schema_folder("onix_workflow"), "book_list.json"), - }, - { - "output_table": "book_metrics_events", - "query_template": os.path.join(sql_folder("onix_workflow"), "book_metrics_events.sql.jinja2"), - "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_events.json"), - }, - ] - if "jstor_institution" in [dp.type_id for dp in self.data_partners]: - generic_export_tables.append( - { - "output_table": "book_institution_list", - "query_template": os.path.join(sql_folder("onix_workflow"), "book_institution_list.sql.jinja2"), - "schema": os.path.join(default_schema_folder("onix_workflow"), "book_institution_list.json"), - } - ) - generic_export_tables.append( - { - "output_table": "book_metrics_institution", - "query_template": os.path.join(sql_folder("onix_workflow"), "book_metrics_institution.sql.jinja2"), - "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_institution.json"), - }, - ) - if "irus_oapen" in [dp.type_id for dp in self.data_partners]: - generic_export_tables.append( - { - "output_table": "book_metrics_city", - "query_template": os.path.join(sql_folder("onix_workflow"), "book_metrics_city.sql.jinja2"), - "schema": os.path.join(default_schema_folder("onix_workflow"), "book_metrics_city.json"), - } - ) - - # Create each export table in BiqQuery - tasks = [] - with TaskGroup(group_id="export_tables"): - for export_table in generic_export_tables: - task_id = f"export_{export_table['output_table']}" - tasks.append( - self.make_python_operator( - self.export_oaebu_table, - task_id, - op_kwargs=dict( - output_table=export_table["output_table"], - query_template_path=export_table["query_template"], - schema_file_path=export_table["schema"], - ), - ) - ) - tasks.append(self.make_python_operator(self.export_book_metrics, "export_book_metrics")) - tasks.append(self.make_python_operator(self.export_book_metrics_country, "export_book_metrics_country")) - tasks.append(self.make_python_operator(self.export_book_metrics_author, "export_book_metrics_author")) - tasks.append(self.make_python_operator(self.export_book_metrics_subjects, "export_book_metrics_subjects")) - return tasks - - def make_release(self, **kwargs) -> OnixWorkflowRelease: - """Creates a release object. - - :param kwargs: From Airflow. Contains the execution_date. - :return: an OnixWorkflowRelease object. 
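The release logic that follows picks, for both the ONIX and Crossref Metadata tables, the most recent shard dated on or before the snapshot date via `bq_select_table_shard_dates`. A plain-Python illustration of that selection rule; the dates are made up and the real helper queries BigQuery table shards rather than an in-memory list:

```python
# Toy version of the "latest shard on or before the snapshot date" rule used by make_release.
import pendulum

shard_dates = [pendulum.date(2023, 1, 1), pendulum.date(2023, 2, 5), pendulum.date(2023, 3, 12)]
snapshot_date = pendulum.date(2023, 3, 1)

candidates = sorted(d for d in shard_dates if d <= snapshot_date)
if not candidates:
    raise RuntimeError("no ONIX releases found")
onix_snapshot_date = candidates[-1]
print(onix_snapshot_date)  # 2023-02-05
```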
- """ - - # Make snapshot date - snapshot_date = make_snapshot_date(**kwargs) - - # Get ONIX release date - onix_table_id = bq_table_id( - project_id=self.cloud_workspace.project_id, - dataset_id=self.metadata_partner.bq_dataset_id, - table_id=self.metadata_partner.bq_table_name, - ) - onix_snapshot_dates = bq_select_table_shard_dates(table_id=onix_table_id, end_date=snapshot_date) - if not len(onix_snapshot_dates): - raise RuntimeError("OnixWorkflow.make_release: no ONIX releases found") - - onix_snapshot_date = onix_snapshot_dates[0] # Get most recent snapshot - - # Get Crossref Metadata release date - crossref_table_id = bq_table_id( - project_id=self.bq_master_crossref_project_id, - dataset_id=self.bq_master_crossref_dataset_id, - table_id=self.bq_master_crossref_metadata_table_name, - ) - crossref_metadata_snapshot_dates = bq_select_table_shard_dates( - table_id=crossref_table_id, end_date=snapshot_date - ) - if not len(crossref_metadata_snapshot_dates): - raise RuntimeError("OnixWorkflow.make_release: no Crossref Metadata releases found") - crossref_master_snapshot_date = crossref_metadata_snapshot_dates[0] # Get most recent snapshot - - # Make the release object - return OnixWorkflowRelease( - dag_id=self.dag_id, - run_id=kwargs["run_id"], - snapshot_date=snapshot_date, - onix_snapshot_date=onix_snapshot_date, - crossref_master_snapshot_date=crossref_master_snapshot_date, - ) - - def aggregate_works(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Fetches the ONIX product records from our ONIX database, aggregates them into works, workfamilies, - and outputs it into jsonl files. - - :param release: The onix workflow release object - """ - - # Fetch ONIX data - sharded_onix_table = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.onix_snapshot_date, - ) - products = get_onix_records(sharded_onix_table) - - # Aggregate into works - agg = BookWorkAggregator(products) - works = agg.aggregate() - lookup_table = agg.get_works_lookup_table() - save_jsonl_gz(release.workslookup_path, lookup_table) - - # Save errors from aggregation - error_table = [{"Error": error} for error in agg.errors] - save_jsonl_gz(release.workslookup_errors_path, error_table) - - # Aggregate work families - agg = BookWorkFamilyAggregator(works) - agg.aggregate() - lookup_table = agg.get_works_family_lookup_table() - save_jsonl_gz(release.worksfamilylookup_path, lookup_table) - - # Upload the aggregation tables and error tables to a GCP bucket in preparation for BQ loading - files = [release.workslookup_path, release.workslookup_errors_path, release.worksfamilylookup_path] - gcs_upload_files(bucket_name=self.cloud_workspace.transform_bucket, file_paths=files) - - # Load the 'WorkID lookup', 'WorkID lookup table errors' and 'WorkFamilyID lookup' tables into BigQuery - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_onix_workflow_dataset, - location=self.cloud_workspace.data_location, - description="Onix Workflow Aggregations", - ) - - aggregation_paths = [release.workslookup_path, release.workslookup_errors_path, release.worksfamilylookup_path] - aggregation_tables = [ - self.bq_worksid_table_name, - self.bq_worksid_error_table_name, - self.bq_workfamilyid_table_name, - ] - for path, table_name in zip(aggregation_paths, aggregation_tables): - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(path)) - table_id = bq_sharded_table_id( - 
self.cloud_workspace.project_id, self.bq_onix_workflow_dataset, table_name, release.snapshot_date - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=bq_find_schema(path=self.schema_folder, table_name=table_name), - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - write_disposition="WRITE_TRUNCATE", - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def create_crossref_metadata_table(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Creates the crossref metadata table by querying the AO master table and matching on this publisher's ISBNs""" - - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_crossref_dataset_id, - location=self.cloud_workspace.data_location, - description="Data from Crossref sources", - ) - - onix_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.onix_snapshot_date, - ) - master_crossref_metadata_table_id = bq_sharded_table_id( - self.bq_master_crossref_project_id, - self.bq_master_crossref_dataset_id, - self.bq_master_crossref_metadata_table_name, - release.crossref_master_snapshot_date, - ) - sql = render_template( - os.path.join(sql_folder(workflow_module="onix_workflow"), "crossref_metadata_filter_isbn.sql.jinja2"), - onix_table_id=onix_table_id, - crossref_metadata_table_id=master_crossref_metadata_table_id, - ) - logging.info("Creating crossref metadata table from master table") - schema_file_path = bq_find_schema( - path=self.schema_folder, table_name=self.bq_oaebu_crossref_metadata_table_name - ) - oaebu_crossref_metadata_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_crossref_dataset_id, - self.bq_oaebu_crossref_metadata_table_name, - release.snapshot_date, - ) - state = bq_create_table_from_query( - sql=sql, table_id=oaebu_crossref_metadata_table_id, schema_file_path=schema_file_path - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def create_crossref_events_table(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Download, transform, upload and create a table for crossref events""" - - # Get the unique dois from the metadata table - metadata_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_crossref_dataset_id, - self.bq_oaebu_crossref_metadata_table_name, - release.snapshot_date, - ) - dois = dois_from_table(metadata_table_id, doi_column_name="DOI", distinct=True) - - # Download and transform all events - start_date = self.crossref_start_date - end_date = release.snapshot_date.subtract(days=1).date() - events = download_crossref_events(dois, start_date, end_date, self.mailto, max_threads=self.max_threads) - events = transform_crossref_events(events, max_threads=self.max_threads) - - # Zip and upload to google cloud - save_jsonl_gz(release.crossref_events_path, events) - gcs_upload_files(bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.crossref_events_path]) - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.crossref_events_path)) - table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_crossref_dataset_id, - self.bq_crossref_events_table_name, - release.snapshot_date, - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=bq_find_schema(path=self.schema_folder, table_name=self.bq_crossref_events_table_name), - 
source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - write_disposition="WRITE_TRUNCATE", - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def create_book_table(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create the oaebu book table using the crossref event and metadata tables""" - - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_dataset, - location=self.cloud_workspace.data_location, - description="OAEBU Tables", - ) - book_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, self.bq_oaebu_dataset, self.bq_book_table_name, release.snapshot_date - ) - crossref_metadata_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_crossref_dataset_id, - self.bq_oaebu_crossref_metadata_table_name, - release.snapshot_date, - ) - crossref_events_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_crossref_dataset_id, - self.bq_crossref_events_table_name, - release.snapshot_date, - ) - sql = render_template( - os.path.join(sql_folder(workflow_module="onix_workflow"), "book.sql.jinja2"), - crossref_events_table_id=crossref_events_table_id, - crossref_metadata_table_id=crossref_metadata_table_id, - ) - logging.info(sql) - - status = bq_create_table_from_query( - sql=sql, - table_id=book_table_id, - schema_file_path=os.path.join(self.schema_folder, "book.json"), - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def create_intermediate_table( - self, - release: OnixWorkflowRelease, - *, - orig_project_id: str, - orig_dataset: str, - orig_table: str, - orig_isbn: str, - sharded: bool, - **kwargs, - ) -> None: - """Create an intermediate oaebu table. They are of the form datasource_matched - - :param release: Onix workflow release information. - :param orig_project_id: Project ID for the partner data. - :param orig_dataset: Dataset ID for the partner data. - :param orig_table: Table ID for the partner data. - :param orig_isbn: Name of the ISBN field in the partner data table. 
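The intermediate-table step renders a Jinja2 SQL template against the partner table and the work-ID lookup tables. A simplified, self-contained rendering example; the SQL, column names and table IDs here are stand-ins, not the real `assign_workid_workfamilyid.sql.jinja2` template:

```python
# Simplified stand-in for the template rendering used by create_intermediate_table.
from jinja2 import Template

template = Template(
    "SELECT partner.*, wid.work_id\n"
    "FROM `{{ orig_table_id }}` AS partner\n"
    "LEFT JOIN `{{ wid_table_id }}` AS wid ON partner.{{ orig_isbn }} = wid.isbn13"
)
sql = template.render(
    orig_table_id="my-project.google_books.google_books_sales20230101",  # illustrative IDs
    orig_isbn="Primary_ISBN",  # illustrative ISBN field name
    wid_table_id="my-project.onix_workflow.onix_workid_isbn20230101",
)
print(sql)
```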
- :param sharded: Whether the data partner table is sharded - """ - - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_intermediate_dataset, - location=self.cloud_workspace.data_location, - description="Intermediate OAEBU Tables", - ) - orig_table_id = ( - bq_sharded_table_id(orig_project_id, orig_dataset, orig_table, release.snapshot_date) - if sharded - else bq_table_id(orig_project_id, orig_dataset, orig_table) - ) - output_table_name = f"{orig_table}{self.oaebu_intermediate_match_suffix}" - template_path = os.path.join( - sql_folder(workflow_module="onix_workflow"), "assign_workid_workfamilyid.sql.jinja2" - ) - output_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - output_table_name, - release.snapshot_date, - ) - wid_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_onix_workflow_dataset, - self.bq_worksid_table_name, - release.snapshot_date, - ) - wfam_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_onix_workflow_dataset, - self.bq_workfamilyid_table_name, - release.snapshot_date, - ) - - # Make the table from SQL query - sql = render_template( - template_path, - orig_table_id=orig_table_id, - orig_isbn=orig_isbn, - wid_table_id=wid_table_id, - wfam_table_id=wfam_table_id, - ) - status = bq_create_table_from_query(sql=sql, table_id=output_table_id) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def create_book_product_table( - self, - release: OnixWorkflowRelease, - **kwargs, - ) -> None: - """Create the Book Product Table""" - - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_dataset, - location=self.cloud_workspace.data_location, - description="OAEBU Tables", - ) - - # Data partner table names - dp_tables = { - f"{dp.type_id}_table_id": bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_intermediate_dataset, - f"{dp.type_id}_matched", - release.snapshot_date, - ) - for dp in self.data_partners - } - - # Metadata table name - onix_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.onix_snapshot_date, - ) - - # ONIX WF table names - workid_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_onix_workflow_dataset, - self.bq_worksid_table_name, - release.snapshot_date, - ) - workfamilyid_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_onix_workflow_dataset, - self.bq_workfamilyid_table_name, - release.snapshot_date, - ) - country_table_id = bq_table_id(self.bq_country_project_id, self.bq_country_dataset_id, "country") - book_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, self.bq_oaebu_dataset, self.bq_book_table_name, release.snapshot_date - ) - - # Render the SQL - env = create_data_partner_env( - main_template=os.path.join(sql_folder(workflow_module="onix_workflow"), "book_product.sql.jinja2"), - data_partners=self.data_partners, - ) - sql = env.render( - onix_table_id=onix_table_id, - data_partners=self.data_partners, - book_table_id=book_table_id, - country_table_id=country_table_id, - workid_table_id=workid_table_id, - workfamilyid_table_id=workfamilyid_table_id, - ga3_views_field=self.ga3_views_field, - **dp_tables, - ) - logging.info(f"Book Product SQL:\n{sql}") - - # Create the table - with open(os.path.join(default_schema_folder("onix_workflow"), 
"book_product.json"), "r") as f: - schema = json.load(f) - - # Create the schema - for dp in self.data_partners: - months_schema_file = os.path.join(dp.schema_directory, dp.files.book_product_metrics_schema) - with open(months_schema_file, "r") as f: - months_schema = json.load(f) - schema = insert_into_schema(schema, insert_field=months_schema, schema_field_name="months") - - metadata_schema_file = os.path.join(dp.schema_directory, dp.files.book_product_metadata_schema) - if dp.has_metadata: - with open(metadata_schema_file, "r") as f: - metadata_schema = json.load(f) - schema = insert_into_schema(schema, insert_field=metadata_schema, schema_field_name="metadata") - - table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_dataset, - self.bq_book_product_table_name, - release.snapshot_date, - ) - - # Run the query - with open(release.book_product_schema_path, mode="w+") as f: - json.dump(schema, f) - status = bq_create_table_from_query( - sql=sql, table_id=table_id, schema_file_path=release.book_product_schema_path - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def export_oaebu_table(self, release: OnixWorkflowRelease, **kwargs) -> bool: - """Create an export table. - - Takes several kwargs: - :param output_table: The name of the table to create - :param query_template: The name of the template SQL file - :param schema_file_path: The path to the schema - :return: Whether the table creation was a success - """ - - output_table: str = kwargs["output_table"] - query_template_path: str = kwargs["query_template_path"] - schema_file_path: str = kwargs["schema_file_path"] - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_export_dataset, - location=self.cloud_workspace.data_location, - description="OAEBU Tables for Dashboarding", - ) - output_table_name = f"{self.cloud_workspace.project_id.replace('-', '_')}_{output_table}" - output_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, self.bq_oaebu_export_dataset, output_table_name, release.snapshot_date - ) - - book_product_table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.bq_oaebu_dataset, - self.bq_book_product_table_name, - release.snapshot_date, - ) - country_table_id = bq_table_id(self.bq_country_project_id, self.bq_country_dataset_id, "country") - bic_table_id = bq_table_id(self.bq_subject_project_id, self.bq_subject_dataset_id, "bic_lookup") - bisac_table_id = bq_table_id(self.bq_subject_project_id, self.bq_subject_dataset_id, "bisac_lookup") - thema_table_id = bq_table_id(self.bq_subject_project_id, self.bq_subject_dataset_id, "thema_lookup") - - env = create_data_partner_env(main_template=query_template_path, data_partners=self.data_partners) - sql = env.render( - project_id=self.cloud_workspace.project_id, - dataset_id=self.bq_oaebu_dataset, - release=release.snapshot_date, - data_partners=self.data_partners, - book_product_table_id=book_product_table_id, - country_table_id=country_table_id, - bic_table_id=bic_table_id, - bisac_table_id=bisac_table_id, - thema_table_id=thema_table_id, - ) - logging.info(f"{output_table} SQL:\n{sql}") - - status = bq_create_table_from_query(sql=sql, table_id=output_table_id, schema_file_path=schema_file_path) - return status - - def export_book_metrics_country(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create table for country metrics""" - - country_schema_base = os.path.join(default_schema_folder("onix_workflow"), "book_metrics_country.json") - with 
open(country_schema_base, "r") as f: - country_schema = json.load(f) - - for dp in [dp for dp in self.data_partners if dp.export_country]: - _file = dp.files.book_metrics_country_schema - with open(os.path.join(dp.schema_directory, _file), "r") as f: - dp_schema = json.load(f) - country_schema = insert_into_schema(country_schema, dp_schema) - - with open(release.country_metrics_schema, "w") as f: - json.dump(country_schema, f) - - query_template_path = os.path.join( - sql_folder(workflow_module="onix_workflow"), "book_metrics_country.sql.jinja2" - ) - status = self.export_oaebu_table( - release=release, - output_table="book_metrics_country", - query_template_path=query_template_path, - schema_file_path=release.country_metrics_schema, - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def export_book_metrics_author(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create table for author metrics""" - - author_schema_base = os.path.join(default_schema_folder("onix_workflow"), "book_metrics_author.json") - with open(author_schema_base, "r") as f: - author_schema = json.load(f) - - for dp in [dp for dp in self.data_partners if dp.export_author]: - _file = dp.files.book_metrics_author_schema - with open(os.path.join(dp.schema_directory, _file), "r") as f: - dp_schema = json.load(f) - author_schema = insert_into_schema(author_schema, dp_schema) - - with open(release.author_metrics_schema, "w") as f: - json.dump(author_schema, f) - - query_template_path = os.path.join( - sql_folder(workflow_module="onix_workflow"), "book_metrics_author.sql.jinja2" - ) - status = self.export_oaebu_table( - release=release, - output_table="book_metrics_author", - query_template_path=query_template_path, - schema_file_path=release.author_metrics_schema, - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def export_book_metrics(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create table for book metrics""" - - book_schema_base = os.path.join(default_schema_folder("onix_workflow"), "book_metrics.json") - with open(book_schema_base, "r") as f: - book_schema = json.load(f) - - for dp in [dp for dp in self.data_partners if dp.export_book_metrics]: - _file = dp.files.book_metrics_schema - with open(os.path.join(dp.schema_directory, _file), "r") as f: - dp_schema = json.load(f) - book_schema = insert_into_schema(book_schema, dp_schema) - - with open(release.book_metrics_schema, "w") as f: - json.dump(book_schema, f) - - query_template_path = os.path.join(sql_folder(workflow_module="onix_workflow"), "book_metrics.sql.jinja2") - status = self.export_oaebu_table( - release=release, - output_table="book_metrics", - query_template_path=query_template_path, - schema_file_path=release.book_metrics_schema, - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def export_book_metrics_subjects(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create tables for subject metrics""" - - for sub, schema_dump in [ - ("bic", release.subject_metrics_bic_schema), - ("bisac", release.subject_metrics_bisac_schema), - ("thema", release.subject_metrics_thema_schema), - ]: - subject_schema_base = os.path.join( - default_schema_folder("onix_workflow"), f"book_metrics_subject_{sub}.json" - ) - with open(subject_schema_base, "r") as f: - subject_schema = json.load(f) - - for dp in [dp for dp in self.data_partners if dp.export_subject]: - _file = dp.files.book_metrics_subject_schema - with open(os.path.join(dp.schema_directory, _file), "r") as f: - dp_schema 
= json.load(f) - subject_schema = insert_into_schema(subject_schema, dp_schema) - - with open(schema_dump, "w") as f: - json.dump(subject_schema, f) - - query_template_path = os.path.join( - sql_folder(workflow_module="onix_workflow"), f"book_metrics_subject_{sub}.sql.jinja2" - ) - status = self.export_oaebu_table( - release=release, - output_table=f"book_metrics_subject_{sub}", - query_template_path=query_template_path, - schema_file_path=schema_dump, - ) - set_task_state(status, kwargs["ti"].task_id, release=release) - - def update_latest_export_tables(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Create copies of the latest data export tables in bigquery""" - copy_latest_export_tables( - project_id=self.cloud_workspace.project_id, - from_dataset=self.bq_oaebu_export_dataset, - to_dataset=self.bq_oaebu_latest_export_dataset, - date_match=release.snapshot_date.strftime("%Y%m%d"), - data_location=self.cloud_workspace.data_location, - ) - - def add_new_dataset_releases(self, release: OnixWorkflowRelease, **kwargs) -> None: - """Adds release information to API.""" - - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - snapshot_date=release.snapshot_date, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, release: OnixWorkflowRelease, **kwargs): - """Cleanup temporary files.""" - cleanup(dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder) - - -def dois_from_table(table_id: str, doi_column_name: str = "DOI", distinct: bool = True) -> List[str]: - """ - Queries a metadata table to retrieve the unique DOIs. Assumes the DOIs are not in a nested structure. - - :param table_id: The fully qualified ID of the metadata table on GCP - :param doi_column_name: The name of the DOI column - :param distinct: Whether to retrieve only unique DOIs - :return: All DOIs present in the metadata table - """ - - select_field = f"DISTINCT({doi_column_name})" if distinct else doi_column_name - sql = f"SELECT {select_field} FROM `{table_id}`" - query_results = bq_run_query(sql) - dois = [r["DOI"] for r in query_results] - return dois - - -def download_crossref_events( - dois: List[str], - start_date: pendulum.DateTime, - end_date: pendulum.DateTime, - mailto: str, - max_threads: int = 1, -) -> List[dict]: - """ - Spawns multiple threads to download event data (DOI and publisher only) for each doi supplied. - The url template was made with reference to the crossref event api: - https://www.eventdata.crossref.org/guide/service/query-api/ - Note that the max_threads will cap at 15 because the events API will return a 429 if more than 15 requests are made - per second. Each API request happens to take roughly 1 second. Having more threads than necessary slows down the - download process as the retry script will wait a minimum of two seconds between each attempt. - - :param dois: The list of DOIs to download the events for - :param start_date: The start date for events we're interested in - :param end_date: The end date for events we're interested in - :param mailto: The email to use as a reference for who is requesting the data - :param max_threads: The maximum threads to spawn for the downloads.
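Before the download workers are spawned, one query URL per DOI is built from `CROSSREF_EVENT_URL_TEMPLATE` (defined near the top of this module) and the thread count is capped at 15 to respect the Events API rate limit. A runnable illustration with made-up DOIs; the mailto address and start date mirror the defaults documented in this workflow:

```python
# Runnable illustration of how the event-data query URLs are built before download.
CROSSREF_EVENT_URL_TEMPLATE = (
    "https://api.eventdata.crossref.org/v1/events?mailto={mailto}"
    "&from-collected-date={start_date}&until-collected-date={end_date}&rows=1000"
    "&obj-id={doi}"
)

dois = ["10.1234/example-book-1", "10.1234/example-book-2"]  # illustrative DOIs
max_threads = min(32, 15)  # the Events API tolerates at most ~15 requests per second

event_urls = [
    CROSSREF_EVENT_URL_TEMPLATE.format(
        doi=doi,
        mailto="agent@observatory.academy",
        start_date="2018-05-14",
        end_date="2023-12-31",
    )
    for doi in dois
]
print(event_urls[0])
```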
- :return: All events for the input DOIs - """ - - event_url_template = CROSSREF_EVENT_URL_TEMPLATE - url_start_date = start_date.strftime("%Y-%m-%d") - url_end_date = end_date.strftime("%Y-%m-%d") - max_threads = min(max_threads, 15) - - event_urls = [ - event_url_template.format(doi=doi, mailto=mailto, start_date=url_start_date, end_date=url_end_date) - for doi in dois - ] - - logging.info(f"Beginning crossref event data download from {len(event_urls)} URLs with {max_threads} workers") - logging.info( - f"Downloading DOI data using URL: {event_url_template.format(doi='***', mailto=mailto, start_date=url_start_date, end_date=url_end_date)}" - ) - all_events = [] - with ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = [] - for i, url in enumerate(event_urls): - futures.append(executor.submit(download_crossref_event_url, url, i=i)) - for future in as_completed(futures): - all_events.extend(future.result()) - - return all_events - - -def download_crossref_event_url(url: str, i: int = 0) -> List[dict]: - """ - Downloads all crossref events from a url, iterating through pages if there is more than one - - :param url: The url send the request to - :param i: Worker number - :return: The events from this URL - """ - - events = [] - headers = {"User-Agent": get_user_agent(package_name="oaebu_workflows")} - next_cursor, page_counts, total_events, page_events = download_crossref_page_events(url, headers) - events.extend(page_events) - total_counts = page_counts - while next_cursor: - tmp_url = url + f"&cursor={next_cursor}" - next_cursor, page_counts, _, page_events = download_crossref_page_events(tmp_url, headers) - total_counts += page_counts - events.extend(page_events) - logging.info(f"{i + 1}: {url} successful") - logging.info(f"{i + 1}: Total no. 
events: {total_events}, downloaded " f"events: {total_counts}") - return events - - -def download_crossref_page_events(url: str, headers: dict) -> Tuple[str, int, int, List[dict]]: - """ - Download crossref events from a single page - - :param url: The url to send the request to - :param headers: Headers to send with the request - :return: The cursor, event counter, total number of events and the events for the URL - """ - - crossref_events_limiter() - response = retry_get_url(url, num_retries=5, wait=wait_exponential_jitter(initial=0.5, max=60), headers=headers) - response_json = response.json() - total_events = response_json["message"]["total-results"] - events = response_json["message"]["events"] - next_cursor = response_json["message"]["next-cursor"] - counter = len(events) - - return next_cursor, counter, total_events, events - - -@sleep_and_retry -@limits(calls=15, period=1) -def crossref_events_limiter(): - """Task to throttle the calls to the crossref events API""" - return - - -def transform_crossref_events(events: List[dict], max_threads: int = 1) -> List[dict]: - """ - Spawns workers to transform crossref events - - :param events: A list of the events to transform - :param max_threads: The maximum number of threads to utilise for the transforming process - :return: transformed events, the order of the events in the input list is not preserved - """ - - logging.info(f"Beginning crossref event transform with {max_threads} workers") - transformed_events = [] - with ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = [] - for event in events: - futures.append(executor.submit(transform_event, event)) - for future in as_completed(futures): - transformed_events.append(future.result()) - logging.info("Crossref event transformation complete") - return transformed_events - - -def transform_event(event: dict) -> dict: - """Transform the dictionary with event data by replacing '-' with '_' in key names, converting all int values to - string except for the 'total' field and parsing datetime columns for a valid datetime. - - :param event: The event dictionary - :return: The transformed event dictionary - """ - - if isinstance(event, (str, int, float)): - return event - if isinstance(event, dict): - new = event.__class__() - for k, v in event.items(): - if isinstance(v, int) and k != "total": - v = str(v) - if k in ["timestamp", "occurred_at", "issued", "dateModified", "updated_date"]: - try: - v = str(pendulum.parse(v)) - except ValueError: - v = "0001-01-01T00:00:00Z" - - # Replace hyphens with underscores for BigQuery compatibility - k = k.replace("-", "_") - - # Replace @ symbol in keys left by DataCite between 15 and 22 March 2019 - k = k.replace("@", "") - - new[k] = transform_event(v) - return new - - -def copy_latest_export_tables( - project_id: str, from_dataset: str, to_dataset: str, date_match: str, data_location: str, description: str = None -) -> None: - """Creates copies of all sharded tables from a dataset with a matching date string. - - :param project_id: The project id - :param from_dataset: The dataset containing the sharded tables - :param to_dataset: The dataset to contain the copied tables - will create if does not exist - :param date_match: The date string to match. e.g.
for a table named 'this_table20220101', this would be '20220101' - :param data_location: The regional location of the data in google cloud - :param description: The description for dataset housing the copied tables - """ - - if description is None: - description = "OAEBU Export tables for Dashboarding" - - # Make to_dataset if it doesn't exist - bq_create_dataset( - project_id=project_id, - dataset_id=to_dataset, - location=data_location, - description=description, - ) - - # Get the tables from the from_dataset - client = Client(project_id) - tables = [t.table_id for t in client.list_tables(from_dataset)] - - # Find the tables with specified date string - regex_string = rf"^\w+{date_match}\b" - matched_tables = [re.findall(regex_string, t) for t in tables] - matched_tables = [t[0] for t in matched_tables if t] - assert len(matched_tables), f"No tables matching date {date_match} in dataset {project_id}.{from_dataset}" - - # Copy all of the tables - for table in matched_tables: - table_id = bq_table_id(project_id, from_dataset, table) - table_name = bq_table_id_parts(table_id)[2] # Drop the date from the table for copied table - unsharded_id = bq_table_id(project_id, to_dataset, table_name) - bq_copy_table(src_table_id=table_id, dst_table_id=unsharded_id, write_disposition="WRITE_TRUNCATE") - - -def get_onix_records(table_id: str) -> List[dict]: - """Fetch the latest onix snapshot from BigQuery. - :param table_id: Fully qualified table ID. - :return: List of onix product records. - """ - - sql = f"SELECT * FROM {table_id}" - records = bq_run_query(sql) - products = [{key: records[i][key] for key in records[i].keys()} for i in range(len(records))] - return products - - -def get_isbn_utils_sql_string() -> str: - """Load the ISBN utils sql functions. - :return BQ SQL string. - """ - - isbn_utils_file = "isbn_utils.sql" - isbn_utils_path = os.path.join(sql_folder(workflow_module="onix_workflow"), isbn_utils_file) - with open(isbn_utils_path, "r") as f: - isbn_utils_sql = f.read() - - return isbn_utils_sql - - -def create_data_partner_env(main_template: str, data_partners: Iterable[DataPartner]) -> Environment: - """Creates a jinja2 environment for any number of data partners - - :param main_template: The name of the main jinja2 template - :param data_partners: The data partners - :return: Jinja2 environment with data partners sql folders loaded - """ - - directories = [dp.sql_directory for dp in data_partners] - with open(main_template) as f: - contents = f.read() - loader = FileSystemLoader(directories) - env = Environment(loader=loader).from_string(contents) - return env - - -def insert_into_schema(schema_base: List[dict], insert_field: dict, schema_field_name: Optional[str] = None): - """ - Inserts a given field into a schema. - - :param schema_base: (List[dict]): The base schema to insert the field into. - :param insert_field: (dict): The field to be inserted into the schema. - :param schema_field_name: (Optional[str], optional): The name of the field in the schema. - If provided, the field will be inserted into the matching field. - If not provided, the field will be appended to the end of the schema. - :return: The updated schema with the field inserted. - - Raises ValueError If the provided schema_field_name is not found in the schema. 
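Since `insert_into_schema` is used throughout the export tasks to graft partner fields into the base BigQuery schemas, a toy usage example may help. It assumes the function above is importable; the field definitions are illustrative JSON-schema fragments, not the real schema files:

```python
# Toy demonstration of insert_into_schema: a partner's month-level metrics field
# is nested under the base schema's "months" RECORD field.
base_schema = [
    {"name": "ISBN13", "type": "STRING", "mode": "REQUIRED"},
    {"name": "months", "type": "RECORD", "mode": "REPEATED", "fields": []},
]
partner_field = {"name": "google_books_sales", "type": "RECORD", "mode": "NULLABLE", "fields": []}

schema = insert_into_schema(base_schema, partner_field, schema_field_name="months")
assert schema[1]["fields"][0]["name"] == "google_books_sales"
```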
- """ - - if schema_field_name: - field_found = False - for row in schema_base: - if row["name"] == schema_field_name: - field_found = True - row["fields"].append(insert_field) - break - if not field_found: - raise ValueError(f"Field {schema_field_name} not found in schema") - else: - schema_base.append(insert_field) - - return schema_base diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/crossref_events_request.yaml b/oaebu_workflows/onix_workflow/tests/fixtures/crossref_events_request.yaml deleted file mode 100644 index 3686cd15..00000000 --- a/oaebu_workflows/onix_workflow/tests/fixtures/crossref_events_request.yaml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3805d1bdc847b49b818ea0a6e455708e79278265d13172e675b7d473cecb9c4 -size 10742 diff --git a/oaebu_workflows/thoth_telescope/thoth_telescope.py b/oaebu_workflows/thoth_telescope/thoth_telescope.py deleted file mode 100644 index d0c8e7b1..00000000 --- a/oaebu_workflows/thoth_telescope/thoth_telescope.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: Keegan Smith - -import os -import logging -from typing import Union, Optional - -import pendulum -from pendulum.datetime import DateTime -from airflow.exceptions import AirflowException -from google.cloud.bigquery import SourceFormat - -from oaebu_workflows.onix_utils import OnixTransformer -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.bigquery import bq_load_table, bq_sharded_table_id, bq_create_dataset -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.utils.url_utils import retry_get_url -from observatory.platform.gcs import gcs_upload_files, gcs_blob_name_from_path, gcs_blob_uri -from observatory.platform.workflows.workflow import ( - Workflow, - SnapshotRelease, - make_snapshot_date, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -THOTH_URL = "{host_name}/specifications/{format_specification}/publisher/{publisher_id}" -DEFAULT_HOST_NAME = "https://export.thoth.pub" - - -class ThothRelease(SnapshotRelease): - def __init__( - self, - *, - dag_id: str, - run_id: str, - snapshot_date: DateTime, - ): - """Construct a ThothRelease. 
- :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID - :param snapshot_date: The date of the snapshot/release - """ - super().__init__(dag_id=dag_id, run_id=run_id, snapshot_date=snapshot_date) - self.download_path = os.path.join(self.download_folder, f"thoth_{snapshot_date.format('YYYY_MM_DD')}.xml") - self.transform_path = os.path.join(self.transform_folder, f"transformed.jsonl.gz") - - -class ThothTelescope(Workflow): - def __init__( - self, - *, - dag_id: str, - cloud_workspace: CloudWorkspace, - publisher_id: str, - format_specification: str, - elevate_related_products: bool = False, - metadata_partner: Union[str, OaebuPartner] = "thoth", - bq_dataset_description: str = "Thoth ONIX Feed", - bq_table_description: Optional[str] = None, - api_dataset_id: str = "onix", - host_name: str = "https://export.thoth.pub", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - catchup: bool = False, - start_date: DateTime = pendulum.datetime(2022, 12, 1), - schedule: str = "@weekly", - ): - """Construct a ThothTelescope instance. - :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param publisher_id: The Thoth ID for this publisher - :param format_specification: The Thoth ONIX/metadata format specification. e.g. "onix_3.0::oapen" - :param elevate_related_products: Whether to pull out the related products to the product level. - :param metadata_partner: The metadata partner name - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the BigQuery table - :param api_dataset_id: The ID to store the dataset release in the API - :param host_name: The Thoth host name - :param observatory_api_conn_id: Airflow connection ID for the observatory API - :param catchup: Whether to catch up the DAG or not - :param start_date: The start date of the DAG - :param schedule: The schedule interval of the DAG - """ - super().__init__( - dag_id, - start_date=start_date, - schedule=schedule, - airflow_conns=[observatory_api_conn_id], - catchup=catchup, - tags=["oaebu"], - ) - - if bq_table_description is None: - bq_table_description = "Thoth ONIX Feed" - - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.publisher_id = publisher_id - self.elevate_related_products = elevate_related_products - self.metadata_partner = partner_from_str(metadata_partner, metadata_partner=True) - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.host_name = host_name - self.format_specification = format_specification - self.observatory_api_conn_id = observatory_api_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> ThothRelease: - """Creates a new Thoth release instance - - :param kwargs: the context passed from the PythonOperator.
- See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: The Thoth release instance - """ - snapshot_date = make_snapshot_date(**kwargs) - release = ThothRelease(dag_id=self.dag_id, run_id=kwargs["run_id"], snapshot_date=snapshot_date) - return release - - def download(self, release: ThothRelease, **kwargs) -> None: - """Task to download the ONIX release from Thoth. - - :param release: The Thoth release instance - """ - thoth_download_onix( - publisher_id=self.publisher_id, - format_spec=self.format_specification, - download_path=release.download_path, - ) - - def upload_downloaded(self, release: ThothRelease, **kwargs) -> None: - """Upload the downloaded thoth onix XML to google cloud bucket""" - success = gcs_upload_files(bucket_name=self.cloud_workspace.download_bucket, file_paths=[release.download_path]) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, release: ThothRelease, **kwargs) -> None: - """Task to transform the Thoth ONIX data""" - transformer = OnixTransformer( - input_path=release.download_path, - output_dir=release.transform_folder, - deduplicate_related_products=self.elevate_related_products, - elevate_related_products=self.elevate_related_products, - add_name_fields=True, - collapse_subjects=True, - ) - out_file = transformer.transform() - if release.transform_path != out_file: - raise FileNotFoundError(f"Expected file {release.transform_path} not equal to transformed file: {out_file}") - - def upload_transformed(self, release: ThothRelease, **kwargs) -> None: - """Upload the downloaded thoth onix .jsonl to google cloud bucket""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path] - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, release: ThothRelease, **kwargs) -> None: - """Task to load the transformed ONIX jsonl file to BigQuery.""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.metadata_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - table_id = bq_sharded_table_id( - self.cloud_workspace.project_id, - self.metadata_partner.bq_dataset_id, - self.metadata_partner.bq_table_name, - release.snapshot_date, - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.metadata_partner.schema_path, - source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - table_description=self.bq_table_description, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, release: ThothRelease, **kwargs) -> None: - """Adds release information to API.""" - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - snapshot_date=release.snapshot_date, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - ) - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - api.post_dataset_release(dataset_release) - - def cleanup(self, release: ThothRelease, **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - cleanup(dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder) - - -def 
thoth_download_onix( - publisher_id: str, - download_path: str, - format_spec: str, - host_name: str = DEFAULT_HOST_NAME, - num_retries: int = 3, -) -> None: - """Hits the Thoth API and requests the ONIX feed for a particular publisher. - Creates a file called onix.xml at the specified location - - :param publisher_id: The ID of the publisher. Can be found using Thoth GraphiQL API - :param download_path: The path to download ONIX the file to - :param format_spec: The ONIX format specification to use. Options can be found with the /formats endpoint of the API - :param host_name: The Thoth host URL - :param num_retries: The number of times to retry the download, given an unsuccessful return code - """ - url = THOTH_URL.format(host_name=host_name, format_specification=format_spec, publisher_id=publisher_id) - logging.info(f"Downloading ONIX XML from {url}") - response = retry_get_url(url, num_retries=num_retries) - if response.status_code != 200: - raise AirflowException( - f"Request for URL {url} was unsuccessful with code: {response.status_code}\nContent response: {response.content.decode('utf-8')}" - ) - with open(download_path, "wb") as f: - f.write(response.content) diff --git a/oaebu_workflows/ucl_discovery_telescope/tests/fixtures/__init__.py b/oaebu_workflows/ucl_discovery_telescope/tests/fixtures/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py b/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py deleted file mode 100644 index 442a130e..00000000 --- a/oaebu_workflows/ucl_discovery_telescope/ucl_discovery_telescope.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright 2023 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
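For reference, a hypothetical call to `thoth_download_onix` as defined above; the publisher UUID and output path are placeholders, and `onix_3.0::oapen` is the example format specification mentioned in the telescope's docstring:

```python
# Hypothetical usage sketch of thoth_download_onix; the publisher ID and path are placeholders.
thoth_download_onix(
    publisher_id="00000000-0000-0000-0000-000000000000",
    download_path="/tmp/thoth_onix.xml",
    format_spec="onix_3.0::oapen",
    num_retries=3,
)
```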
- -# Author: Aniek Roelofs, Keegan Smith - - -import logging -import os -from typing import List, Union -from concurrent.futures import ThreadPoolExecutor, as_completed - -import pendulum -from airflow.hooks.base import BaseHook -from google.cloud.bigquery import SourceFormat, TimePartitioningType, WriteDisposition -from google.oauth2 import service_account -from apiclient import discovery - -from oaebu_workflows.oaebu_partners import OaebuPartner, partner_from_str -from observatory.api.client.model.dataset_release import DatasetRelease -from observatory.platform.api import make_observatory_api -from observatory.platform.airflow import AirflowConns -from observatory.platform.files import save_jsonl_gz, load_jsonl -from observatory.platform.gcs import gcs_blob_uri, gcs_upload_files, gcs_blob_name_from_path -from observatory.platform.bigquery import bq_load_table, bq_table_id, bq_create_dataset -from observatory.platform.utils.url_utils import retry_get_url -from observatory.platform.observatory_config import CloudWorkspace -from observatory.platform.files import add_partition_date -from observatory.platform.workflows.workflow import ( - Workflow, - PartitionRelease, - cleanup, - set_task_state, - check_workflow_inputs, -) - - -class UclDiscoveryRelease(PartitionRelease): - def __init__( - self, - dag_id: str, - run_id: str, - data_interval_start: pendulum.DateTime, - data_interval_end: pendulum.DateTime, - partition_date: pendulum.DateTime, - ): - """Construct a UclDiscoveryRelease instance. - - :param dag_id: The ID of the DAG - :param run_id: The Airflow run ID. - :param data_interval_start: The start of the data interval. - :param data_interval_end: The end of the data interval. - :param partition_date: The partition date for this release. - """ - super().__init__(dag_id=dag_id, run_id=run_id, partition_date=partition_date) - self.data_interval_start = data_interval_start - self.data_interval_end = data_interval_end - - self.download_country_path = os.path.join(self.download_folder, "ucl_discovery_country.jsonl.gz") - self.download_totals_path = os.path.join(self.download_folder, "ucl_discovery_totals.jsonl.gz") - self.transform_path = os.path.join(self.transform_folder, "ucl_discovery.jsonl.gz") - - -class UclDiscoveryTelescope(Workflow): - """The UCL Discovery telescope.""" - - def __init__( - self, - dag_id: str, - cloud_workspace: CloudWorkspace, - sheet_id: str, - data_partner: Union[str, OaebuPartner] = "ucl_discovery", - bq_dataset_description: str = "UCL Discovery dataset", - bq_table_description: str = "UCL Discovery table", - api_dataset_id: str = "ucl", - observatory_api_conn_id: str = AirflowConns.OBSERVATORY_API, - oaebu_service_account_conn_id: str = "oaebu_service_account", - max_threads: int = os.cpu_count() * 2, - schedule: str = "0 0 4 * *", # run on the 4th of every month - start_date: pendulum.DateTime = pendulum.datetime(2015, 6, 1), - catchup: bool = True, - max_active_runs: int = 10, - ): - """Construct a UclDiscoveryTelescope instance. 
- - :param dag_id: The ID of the DAG - :param cloud_workspace: The CloudWorkspace object for this DAG - :param sheet_id: The ID of the google sheet match eprint ID to ISBN13 - :param data_partner: The name of the data partner - :param bq_dataset_description: Description for the BigQuery dataset - :param bq_table_description: Description for the biguery table - :param api_dataset_id: The ID to store the dataset release in the API - :param observatory_api_conn_id: Airflow connection ID for the overvatory API - :param oaebu_service_account_conn_id: Airflow connection ID for the oaebu service account - :param max_threads: The maximum number threads to utilise for parallel processes - :param schedule: The schedule interval of the DAG - :param start_date: The start date of the DAG - :param catchup: Whether to catchup the DAG or not - :param max_active_runs: The maximum number of concurrent DAG runs - """ - super().__init__( - dag_id, - start_date, - schedule, - catchup=catchup, - max_active_runs=max_active_runs, - airflow_conns=[observatory_api_conn_id, oaebu_service_account_conn_id], - tags=["oaebu"], - ) - - self.dag_id = dag_id - self.cloud_workspace = cloud_workspace - self.sheet_id = sheet_id - self.data_partner = partner_from_str(data_partner) - self.bq_dataset_description = bq_dataset_description - self.bq_table_description = bq_table_description - self.api_dataset_id = api_dataset_id - self.oaebu_service_account_conn_id = oaebu_service_account_conn_id - self.max_threads = max_threads - self.observatory_api_conn_id = observatory_api_conn_id - - check_workflow_inputs(self) - - self.add_setup_task(self.check_dependencies) - self.add_task(self.download) - self.add_task(self.upload_downloaded) - self.add_task(self.transform) - self.add_task(self.upload_transformed) - self.add_task(self.bq_load) - self.add_task(self.add_new_dataset_releases) - self.add_task(self.cleanup) - - def make_release(self, **kwargs) -> List[UclDiscoveryRelease]: - """Make release instances. The release is passed as an argument to the function (TelescopeFunction) that is - called in 'task_callable'. There will only be 1 release, but it is passed on as a list so the - SnapshotTelescope template methods can be used. - - :param kwargs: the context passed from the PythonOperator. - See https://airflow.apache.org/docs/stable/macros-ref.html for the keyword arguments that can be passed - :return: A list with one ucldiscovery release instance. - """ - data_interval_start = kwargs["data_interval_start"].start_of("month") - data_interval_end = kwargs["data_interval_end"].start_of("month") - partition_date = data_interval_start.end_of("month") - run_id = kwargs["run_id"] - - logging.info( - f"Interval Start: {data_interval_start}, Interval End:{data_interval_end}, Partition date: {partition_date}, Run ID: {run_id}" - ) - return UclDiscoveryRelease( - self.dag_id, - kwargs["run_id"], - data_interval_start=data_interval_start, - data_interval_end=data_interval_end, - partition_date=partition_date, - ) - - def download(self, release: UclDiscoveryRelease, **kwargs): - """Fownload the ucl discovery data for a given release. - :param releases: The UCL discovery release. 
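# A worked example of the release-date arithmetic in make_release above, using
# pendulum directly; the dates are illustrative only. The release covers one
# calendar month and is partitioned on the last day of that month.
import pendulum

data_interval_start = pendulum.datetime(2023, 5, 15).start_of("month")  # 2023-05-01 00:00:00
data_interval_end = pendulum.datetime(2023, 6, 15).start_of("month")    # 2023-06-01 00:00:00
partition_date = data_interval_start.end_of("month")                    # 2023-05-31 23:59:59.999999
print(data_interval_start, data_interval_end, partition_date)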
- """ - mappings = get_isbn_eprint_mappings(self.sheet_id, self.oaebu_service_account_conn_id, release.partition_date) - - with ThreadPoolExecutor(self.max_threads) as executor: - futures = [] - for eprint_id in mappings.keys(): - future = executor.submit( - download_discovery_stats, eprint_id, release.data_interval_start, release.partition_date - ) - futures.append(future) - totals = [] - country = [] - for future in as_completed(futures): - c, t = future.result() - country.append(c) - totals.append(t) - - logging.info(f"Saving totals data to file: {release.download_totals_path}") - save_jsonl_gz(release.download_totals_path, totals) - logging.info(f"Saving country data to file: {release.download_country_path}") - save_jsonl_gz(release.download_country_path, country) - - def upload_downloaded(self, release: UclDiscoveryRelease, **kwargs): - """Uploads the downloaded files to GCS""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.download_bucket, - file_paths=[release.download_country_path, release.download_totals_path], - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def transform(self, release: UclDiscoveryRelease, **kwargs): - """Transform the ucl discovery data for a given release.""" - mappings = get_isbn_eprint_mappings(self.sheet_id, self.oaebu_service_account_conn_id, release.partition_date) - - # Load the records and sort them by eprint id - country_records = load_jsonl(release.download_country_path) - totals_records = load_jsonl(release.download_totals_path) - country_records = sorted(country_records, key=lambda x: x["set"]["value"]) # ["set"]["value"] = eprint_id - totals_records = sorted(totals_records, key=lambda x: x["set"]["value"]) - assert len(country_records) == len(totals_records), f"{len(country_records)} != {len(totals_records)}" - - with ThreadPoolExecutor(self.max_threads) as executor: - futures = [] - for country_record, totals_record in zip(country_records, totals_records): - isbn = mappings[country_record["set"]["value"]]["ISBN13"] - title = mappings[country_record["set"]["value"]]["title"] - future = executor.submit(transform_discovery_stats, country_record, totals_record, isbn, title) - futures.append(future) - results = [] - for future in as_completed(futures): - results.append(future.result()) - - # Add the release date to the data as a parition field - results = add_partition_date( - results, release.partition_date, TimePartitioningType.MONTH, partition_field="release_date" - ) - print(results) - save_jsonl_gz(release.transform_path, results) - - def upload_transformed(self, release: UclDiscoveryRelease, **kwargs): - """Uploads the transformed file to GCS""" - success = gcs_upload_files( - bucket_name=self.cloud_workspace.transform_bucket, file_paths=[release.transform_path] - ) - set_task_state(success, kwargs["ti"].task_id, release=release) - - def bq_load(self, release: UclDiscoveryRelease, **kwargs) -> None: - """Loads the transformed data into BigQuery""" - bq_create_dataset( - project_id=self.cloud_workspace.project_id, - dataset_id=self.data_partner.bq_dataset_id, - location=self.cloud_workspace.data_location, - description=self.bq_dataset_description, - ) - - uri = gcs_blob_uri(self.cloud_workspace.transform_bucket, gcs_blob_name_from_path(release.transform_path)) - table_id = bq_table_id( - self.cloud_workspace.project_id, self.data_partner.bq_dataset_id, self.data_partner.bq_table_name - ) - state = bq_load_table( - uri=uri, - table_id=table_id, - schema_file_path=self.data_partner.schema_path, - 
source_format=SourceFormat.NEWLINE_DELIMITED_JSON, - partition_type=TimePartitioningType.MONTH, - partition=True, - partition_field="release_date", - write_disposition=WriteDisposition.WRITE_APPEND, - table_description=self.bq_table_description, - ignore_unknown_values=True, - ) - set_task_state(state, kwargs["ti"].task_id, release=release) - - def add_new_dataset_releases(self, release: UclDiscoveryRelease, **kwargs) -> None: - """Adds release information to API.""" - api = make_observatory_api(observatory_api_conn_id=self.observatory_api_conn_id) - dataset_release = DatasetRelease( - dag_id=self.dag_id, - dataset_id=self.api_dataset_id, - dag_run_id=release.run_id, - data_interval_start=kwargs["data_interval_start"], - data_interval_end=kwargs["data_interval_end"], - partition_date=release.partition_date, - ) - api.post_dataset_release(dataset_release) - - def cleanup(self, release: UclDiscoveryRelease, **kwargs) -> None: - """Delete all files, folders and XComs associated with this release.""" - cleanup(dag_id=self.dag_id, execution_date=kwargs["execution_date"], workflow_folder=release.workflow_folder) - - -def get_isbn_eprint_mappings(sheet_id: str, service_account_conn_id: str, cutoff_date: pendulum.DateTime) -> dict: - """Get the eprint id to isbn mapping from the google sheet - - :param sheet_id: The ID of the google sheet. - :param credentials: The credentials object to authenticate with. - :param cutoff_date: The cutoff date. If an item is published after this date, it will be skipped. - """ - scopes = [ - "https://www.googleapis.com/auth/drive", - "https://www.googleapis.com/auth/drive.file", - "https://www.googleapis.com/auth/spreadsheets", - ] - service_account_conn = BaseHook.get_connection(service_account_conn_id) - credentials = service_account.Credentials.from_service_account_info( - service_account_conn.extra_dejson, scopes=scopes - ) - service = discovery.build("sheets", "v4", credentials=credentials) - result = service.spreadsheets().values().get(spreadsheetId=sheet_id, range="isbn_mapping").execute() - sheet_contents = result.get("values") - if not sheet_contents: - raise ValueError(f"No content found for sheet with ID {sheet_id}") - - items = [] - header = sheet_contents[0] - if not all(heading in header for heading in ["ISBN13", "discovery_eprintid", "date", "title_list_title"]): - raise ValueError(f"Invalid header found for sheet: {header}") - for row in sheet_contents[1:]: - items.append(dict(zip(header, row))) - - mappings = {} - for item in items: - eprint_id = item.get("discovery_eprintid") - isbn = item.get("ISBN13") - title = item.get("title_list_title") - if not eprint_id or not isbn: - logging.warn(f"Item with missing information will be skipped: {item}") - continue - if pendulum.parse(item["date"]) > cutoff_date: - logging.info(f"Item released after cutoff date will be skipped: {item}") - continue - mappings[eprint_id] = {"ISBN13": isbn, "title": title} - - return mappings - - -def download_discovery_stats(eprint_id: str, start_date: pendulum.DateTime, end_date: pendulum.DateTime): - """Downloads the discovery stats for a given eprint ID within a specified date range. - - :param eprint_id: The eprint ID of the item to get the stats for. - :param start_date: The start date of the date range. - :param end_date: The end date of the date range. - :return: A tuple containing the country statistics and the total downloads statistics. 
- """ - countries_url = ( - "https://discovery.ucl.ac.uk/cgi/stats/get" - f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}" - f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=countries&top=countries" - "&view=Table&limit=all&export=JSON" - ) - totals_url = ( - "https://discovery.ucl.ac.uk/cgi/stats/get" - f"?from={start_date.format('YYYYMMDD')}&to={end_date.format('YYYYMMDD')}" - f"&irs2report=eprint&set_name=eprint&set_value={eprint_id}&datatype=downloads&graph_type=column" - "&view=Google%3A%3AGraph&date_resolution=month&title=Download+activity+-+last+12+months&export=JSON" - ) - response = retry_get_url(countries_url) - country = response.json() - response = retry_get_url(totals_url) - totals = response.json() - - # Perform some checks on the returned data - timescale = (start_date.format("YYYYMMDD"), end_date.format("YYYYMMDD")) - country_timescale = (country["timescale"]["from"], country["timescale"]["to"]) - totals_timescale = (totals["timescale"]["from"], totals["timescale"]["to"]) - if country_timescale != timescale or totals_timescale != timescale: - raise ValueError( - f"Invalid timescale value(s): country: {country['timescale']} | totals: {totals['timescale']} != {timescale}" - ) - if country["set"]["value"] != eprint_id or totals["set"]["value"] != eprint_id: - raise ValueError( - f"Invalid eprint ID values downloaded: {totals['set']['value']} | {country['set']['value']} != {eprint_id}" - ) - - return country, totals - - -def transform_discovery_stats(country_record: dict, totals_record: dict, isbn: str, title: str) -> dict: - """Transforms the discovery stats for a single set of records - - :param country_record: The country record - :param totals_record: The totals record - :param isbn: The isbn that matches the eprint id - :return: The transformed stats - """ - # Sanity check the records - country_eprint_id = country_record["set"]["value"] - totals_eprint_id = totals_record["set"]["value"] - if country_eprint_id != totals_eprint_id: - raise ValueError(f"Country and totals eprint ID do not match: {country_eprint_id} != {totals_eprint_id}") - - country_timescale = country_record["timescale"] - totals_timescale = totals_record["timescale"] - if country_timescale != totals_timescale: - raise ValueError(f"Timescales do not match: {country_timescale} != {totals_timescale}") - - # If there are no downloads for the time period, there is no "records" field in country stats - country_records = country_record.get("records", []) - - transformed = { - "ISBN": isbn, - "title": title, - "eprint_id": totals_record["set"]["value"], - "timescale": totals_record["timescale"], - "origin": totals_record["origin"], - "total_downloads": totals_record["records"][0]["count"], - "country": country_records, - } - return transformed diff --git a/packages.txt b/packages.txt new file mode 100644 index 00000000..6c557598 --- /dev/null +++ b/packages.txt @@ -0,0 +1,2 @@ +# ONIX Telescope +openjdk-11-jre \ No newline at end of file diff --git a/requirements.sh b/requirements.sh deleted file mode 100644 index 356a2aad..00000000 --- a/requirements.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2021 Curtin University -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ONIX Telescope -apt-get install openjdk-11-jre -y -echo "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> /etc/profile.d/observatory.sh diff --git a/requirements.txt b/requirements.txt index 92cb54b2..c47ba573 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,13 @@ +# Astro Runtime includes the following pre-installed providers packages: https://docs.astronomer.io/astro/runtime-image-architecture#provider-packages beautifulsoup4>=4.9.3,<5 onixcheck>=0.9.7,<1 ratelimit>=2.2.1,<3 +oauth2client==4.1.3 +# Tests +liccheck>=0.4.9,<1 +flake8>=3.8.0,<4 +coverage>=5.2,<6 +faker>=8.12.1,<9 +vcrpy>=4.1.1,<5 +responses>=0,<1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index f593403a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,55 +0,0 @@ -[metadata] -name = oaebu-workflows -author = Curtin University -author_email = agent@observatory.academy -summary = Oaebu Workflows provides Apache Airflow Workflows for fetching, processing and analysing data about open access books. -description_file = README.md -description_content_type = text/markdown; charset=UTF-8 -home_page = https://github.com/The-Academic-Observatory/oaebu-workflows -project_urls = - Bug Tracker = https://github.com/The-Academic-Observatory/oaebu-workflows/issues - Documentation = https://oaebu-workflows.readthedocs.io/en/latest/ - Source Code = https://github.com/The-Academic-Observatory/oaebu-workflows -python_requires = >=3.10 -license = Apache License Version 2.0 -classifier = - Development Status :: 2 - Pre-Alpha - Environment :: Console - Environment :: Web Environment - Intended Audience :: Developers - Intended Audience :: Science/Research - License :: OSI Approved :: Apache Software License - Operating System :: OS Independent - Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.10 - Topic :: Scientific/Engineering - Topic :: Software Development :: Libraries - Topic :: Software Development :: Libraries :: Python Modules - Topic :: Utilities -keywords = - science - data - workflows - academic institutes - oaebu-workflows - -[files] -packages = - oaebu_workflows -data_files = - requirements.txt = requirements.txt - requirements.sh = requirements.sh - oaebu_workflows/database = oaebu_workflows/database/* - -[extras] -tests = - liccheck>=0.4.9,<1 - flake8>=3.8.0,<4 - coverage>=5.2,<6 - faker>=8.12.1,<9 - vcrpy>=4.1.1,<5 - responses>=0,<1 - -[pbr] -skip_authors = true diff --git a/setup.py b/setup.py deleted file mode 100644 index 2c3a057c..00000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup(setup_requires=["pbr"], pbr=True, python_requires=">=3.10") diff --git a/oaebu_workflows/onix_telescope/tests/fixtures/__init__.py b/tests/__init__.py similarity index 100% rename from oaebu_workflows/onix_telescope/tests/fixtures/__init__.py rename to tests/__init__.py diff --git a/oaebu_workflows/onix_workflow/__init__.py b/tests/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/onix_workflow/__init__.py rename to tests/fixtures/__init__.py 
diff --git a/oaebu_workflows/onix_workflow/schema/__init__.py b/tests/fixtures/onix_utils/__init__.py similarity index 100% rename from oaebu_workflows/onix_workflow/schema/__init__.py rename to tests/fixtures/onix_utils/__init__.py diff --git a/oaebu_workflows/tests/fixtures/onix_utils/input_metadata.xml b/tests/fixtures/onix_utils/input_metadata.xml similarity index 100% rename from oaebu_workflows/tests/fixtures/onix_utils/input_metadata.xml rename to tests/fixtures/onix_utils/input_metadata.xml diff --git a/oaebu_workflows/tests/fixtures/onix_utils/output_metadata.jsonl b/tests/fixtures/onix_utils/output_metadata.jsonl similarity index 100% rename from oaebu_workflows/tests/fixtures/onix_utils/output_metadata.jsonl rename to tests/fixtures/onix_utils/output_metadata.jsonl diff --git a/oaebu_workflows/tests/fixtures/onix_utils/test_subjects_expected.json b/tests/fixtures/onix_utils/test_subjects_expected.json similarity index 100% rename from oaebu_workflows/tests/fixtures/onix_utils/test_subjects_expected.json rename to tests/fixtures/onix_utils/test_subjects_expected.json diff --git a/oaebu_workflows/tests/fixtures/onix_utils/test_subjects_input.json b/tests/fixtures/onix_utils/test_subjects_input.json similarity index 100% rename from oaebu_workflows/tests/fixtures/onix_utils/test_subjects_input.json rename to tests/fixtures/onix_utils/test_subjects_input.json diff --git a/oaebu_workflows/onix_workflow/sql/__init__.py b/tests/google_books_telescope/__init__.py similarity index 100% rename from oaebu_workflows/onix_workflow/sql/__init__.py rename to tests/google_books_telescope/__init__.py diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_2020_02.csv b/tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_2020_02.csv rename to tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_bar2020_02.csv b/tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_bar2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_bar2020_02.csv rename to tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_bar2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_foo2020_02.csv b/tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_foo2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleBooksTrafficReport_foo2020_02.csv rename to tests/google_books_telescope/fixtures/GoogleBooksTrafficReport_foo2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_2020_02.csv b/tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_2020_02.csv rename to tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_bar2020_02.csv b/tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_bar2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_bar2020_02.csv 
rename to tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_bar2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_foo2020_02.csv b/tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_foo2020_02.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_foo2020_02.csv rename to tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_foo2020_02.csv diff --git a/oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_foo2020_03.csv b/tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_foo2020_03.csv similarity index 100% rename from oaebu_workflows/google_books_telescope/tests/fixtures/GoogleSalesTransactionReport_foo2020_03.csv rename to tests/google_books_telescope/fixtures/GoogleSalesTransactionReport_foo2020_03.csv diff --git a/oaebu_workflows/onix_workflow/tests/__init__.py b/tests/google_books_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/onix_workflow/tests/__init__.py rename to tests/google_books_telescope/fixtures/__init__.py diff --git a/tests/google_books_telescope/test_google_books_telescope.py b/tests/google_books_telescope/test_google_books_telescope.py new file mode 100644 index 00000000..77b3f27b --- /dev/null +++ b/tests/google_books_telescope/test_google_books_telescope.py @@ -0,0 +1,344 @@ +# Copyright 2020-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: Aniek Roelofs, Keegan Smith + +import os +import shutil +from unittest.mock import patch + +import pendulum +from airflow.exceptions import AirflowException +from airflow.models.connection import Connection +from airflow.utils.state import State +from click.testing import CliRunner + +from oaebu_workflows.config import test_fixtures_folder, module_file_path +from oaebu_workflows.oaebu_partners import partner_from_str +from oaebu_workflows.google_books_telescope.google_books_telescope import ( + GoogleBooksRelease, + create_dag, + gb_transform, +) +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment +from observatory_platform.sandbox.test_utils import SandboxTestCase, find_free_port +from observatory_platform.sandbox.sftp_server import SftpServer +from observatory_platform.google.bigquery import bq_table_id +from observatory_platform.sftp import SftpFolders +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.airflow.workflow import Workflow + + +class TestGoogleBooksTelescope(SandboxTestCase): + """Tests for the GoogleBooks telescope""" + + def __init__(self, *args, **kwargs): + """Constructor which sets up variables used by tests. + :param args: arguments. + :param kwargs: keyword arguments. 
+ """ + + super(TestGoogleBooksTelescope, self).__init__(*args, **kwargs) + self.project_id = os.getenv("TEST_GCP_PROJECT_ID") + self.data_location = os.getenv("TEST_GCP_DATA_LOCATION") + self.sftp_port = find_free_port() + + def test_dag_structure(self): + """Test that the Google Books DAG has the correct structure.""" + dag = create_dag(dag_id="test_dag", cloud_workspace=self.fake_cloud_workspace, sftp_root="/") + self.assert_dag_structure( + { + "check_dependencies": ["fetch_releases"], + "fetch_releases": [ + "process_release.move_files_to_in_progress", + "process_release.download", + "process_release.transform", + "process_release.move_files_to_finished", + "process_release.bq_load", + "process_release.add_new_dataset_release", + "process_release.cleanup_workflow", + ], + "process_release.move_files_to_in_progress": ["process_release.download"], + "process_release.download": ["process_release.transform"], + "process_release.transform": ["process_release.move_files_to_finished"], + "process_release.move_files_to_finished": ["process_release.bq_load"], + "process_release.bq_load": ["process_release.add_new_dataset_release"], + "process_release.add_new_dataset_release": ["process_release.cleanup_workflow"], + "process_release.cleanup_workflow": [], + }, + dag, + ) + + def test_dag_load(self): + """Test that the Google Books DAG can be loaded from a DAG bag.""" + # Run tests both for telescope with file suffixes and without + env = SandboxEnvironment( + workflows=[ + Workflow( + dag_id="google_books", + name="My Google Books Telescope", + class_name="oaebu_workflows.google_books_telescope.google_books_telescope.create_dag", + cloud_workspace=self.fake_cloud_workspace, + ) + ] + ) + with env.create(): + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("google_books", dag_file) + + def test_telescope(self): + """Test the Google Books telescope end to end.""" + fixtures_folder = test_fixtures_folder(workflow_module="google_books_telescope") + params = { + "no_download_files": 2, + "bq_rows": 4, + "traffic_download_hash": "db4dca44d5231e0c4e2ad95db41b79b6", + "traffic_transform_hash": "b8073007", + "sales_download_hash": "6496518be1ea73694d0a8f89c0b42f20", + "sales_transform_hash": "ebe49987", + "test_files": { + "GoogleBooksTrafficReport_2020_02.csv": os.path.join( + fixtures_folder, "GoogleBooksTrafficReport_2020_02.csv" + ), + "GoogleSalesTransactionReport_2020_02.csv": os.path.join( + fixtures_folder, "GoogleSalesTransactionReport_2020_02.csv" + ), + }, + } + + # Setup Observatory environment + env = SandboxEnvironment(project_id=self.project_id, data_location=self.data_location) + sftp_server = SftpServer(host="localhost", port=self.sftp_port) + dataset_id = env.add_dataset() + + # Create the Observatory environment and run tests + with env.create(): + with sftp_server.create() as sftp_root: + + # Setup DAG + execution_date = pendulum.datetime(year=2021, month=3, day=31) + sales_partner = partner_from_str("google_books_sales") + sales_partner.bq_dataset_id = dataset_id + traffic_partner = partner_from_str("google_books_traffic") + traffic_partner.bq_dataset_id = dataset_id + sftp_service_conn_id = "sftp_service" + dag_id = "google_books_test" + api_dataset_id = env.add_dataset() + dag = create_dag( + dag_id=dag_id, + cloud_workspace=env.cloud_workspace, + sftp_root="/", # Unintuitive, but this is correct + sales_partner=sales_partner, + traffic_partner=traffic_partner, + sftp_service_conn_id=sftp_service_conn_id, + 
api_dataset_id=api_dataset_id, + ) + + # Add SFTP connection + env.add_connection( + Connection(conn_id=sftp_service_conn_id, uri=f"ssh://:password@localhost:{self.sftp_port}") + ) + with env.create_dag_run(dag, execution_date): + # Test that all dependencies are specified: no error should be thrown + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + + # Add file to SFTP server + sftp_folders = SftpFolders(dag_id, sftp_conn_id=sftp_service_conn_id, sftp_root=sftp_root) + os.makedirs(sftp_folders.upload, exist_ok=True) + for file_name, file_path in params["test_files"].items(): + upload_file = os.path.join(sftp_folders.upload, file_name) + shutil.copy(file_path, upload_file) + + # Test that make release is successful + ti = env.run_task("fetch_releases") + self.assertEqual(ti.state, State.SUCCESS) + release_dicts = ti.xcom_pull(task_ids="fetch_releases", include_prior_dates=False) + expected_release_dicts = [ + { + "dag_id": "google_books_test", + "run_id": "scheduled__2021-03-31T00:00:00+00:00", + "partition_date": "2020-02-29", + "sftp_files": [ + "/workflows/google_books_test/in_progress/GoogleBooksTrafficReport_2020_02.csv", + "/workflows/google_books_test/in_progress/GoogleSalesTransactionReport_2020_02.csv", + ], + } + ] + self.assertEqual(release_dicts, expected_release_dicts) + release = GoogleBooksRelease.from_dict(release_dicts[0]) + + # Test move file to in progress + ti = env.run_task("process_release.move_files_to_in_progress", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + for file in release.sftp_files: + file_name = os.path.basename(file) + upload_file = os.path.join(sftp_folders.upload, file_name) + self.assertFalse(os.path.isfile(upload_file)) + in_progress_file = os.path.join(sftp_folders.in_progress, file_name) + self.assertTrue(os.path.isfile(in_progress_file)) + + # Run main telescope tasks + ti = env.run_task("process_release.download", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + ti = env.run_task("process_release.transform", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + ti = env.run_task("process_release.move_files_to_finished", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + ti = env.run_task("process_release.bq_load", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + + # Make assertions for the above tasks + # Test download + self.assertTrue(os.path.exists(release.download_traffic_path)) + self.assertTrue(os.path.exists(release.download_sales_path)) + self.assert_file_integrity(release.download_traffic_path, params["traffic_download_hash"], "md5") + self.assert_file_integrity(release.download_sales_path, params["sales_download_hash"], "md5") + + # Test upload downloaded + self.assert_blob_integrity( + env.download_bucket, + gcs_blob_name_from_path(release.download_traffic_path), + release.download_traffic_path, + ) + self.assert_blob_integrity( + env.download_bucket, + gcs_blob_name_from_path(release.download_sales_path), + release.download_sales_path, + ) + + # Test that file transformed + self.assertTrue(os.path.exists(release.transform_sales_path)) + self.assertTrue(os.path.exists(release.transform_traffic_path)) + self.assert_file_integrity(release.transform_sales_path, params["sales_transform_hash"], "gzip_crc") + self.assert_file_integrity( + release.transform_traffic_path, params["traffic_transform_hash"], "gzip_crc" + ) + + # Test that transformed file uploaded + self.assert_blob_integrity( + env.transform_bucket, + 
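# A minimal sketch of the release round-trip used above: fetch_releases pushes
# plain dicts to XCom and downstream mapped tasks rebuild release objects with
# from_dict. The Release class here is illustrative, not GoogleBooksRelease.
from dataclasses import dataclass, asdict


@dataclass
class Release:
    dag_id: str
    run_id: str
    partition_date: str

    def to_dict(self) -> dict:
        return asdict(self)

    @staticmethod
    def from_dict(d: dict) -> "Release":
        return Release(**d)


release = Release.from_dict(
    {"dag_id": "google_books_test", "run_id": "scheduled__2021-03-31T00:00:00+00:00", "partition_date": "2020-02-29"}
)
assert release.to_dict()["partition_date"] == "2020-02-29"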
gcs_blob_name_from_path(release.transform_traffic_path), + release.transform_traffic_path, + ) + self.assert_blob_integrity( + env.transform_bucket, + gcs_blob_name_from_path(release.transform_sales_path), + release.transform_sales_path, + ) + + # Test that files correctly moved to "finished" + for file in release.sftp_files: + file_name = os.path.basename(file) + in_progress_file = os.path.join(sftp_folders.in_progress, file_name) + self.assertFalse(os.path.isfile(in_progress_file)) + + finished_file = os.path.join(sftp_folders.finished, file_name) + self.assertTrue(os.path.isfile(finished_file)) + + # Test that data loaded into BigQuery + table_id = bq_table_id( + env.cloud_workspace.project_id, + sales_partner.bq_dataset_id, + sales_partner.bq_table_name, + ) + self.assert_table_integrity(table_id, params["bq_rows"]) + table_id = bq_table_id( + env.cloud_workspace.project_id, + traffic_partner.bq_dataset_id, + traffic_partner.bq_table_name, + ) + self.assert_table_integrity(table_id, params["bq_rows"]) + + # Set up the API and check + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) + self.assertEqual(len(dataset_releases), 0) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch( + "oaebu_workflows.google_books_telescope.google_books_telescope.pendulum.now" + ) as mock_now: + mock_now.return_value = now + ti = env.run_task("process_release.add_new_dataset_release", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) + self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2021-03-31T00:00:00+00:00", + "data_interval_end": "2021-04-04T12:00:00+00:00", + "snapshot_date": None, + "partition_date": "2020-02-29T00:00:00+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) + + # Test cleanup + workflow_folder_path = release.workflow_folder + ti = env.run_task("process_release.cleanup_workflow", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + self.assert_cleanup(workflow_folder_path) + + def test_gb_transform(self): + """Test sanity check in transform method when transaction date falls outside release month""" + with CliRunner().isolated_filesystem(): + + # Files and folders + transform_dir = os.path.join(os.getcwd(), "transform") + os.makedirs(transform_dir) + fixtures_folder = test_fixtures_folder(workflow_module="google_books_telescope") + sales_file_path = os.path.join(fixtures_folder, "GoogleSalesTransactionReport_2020_02.csv") + traffic_file_path = os.path.join(fixtures_folder, "GoogleBooksTrafficReport_2020_02.csv") + transform_sales_path = os.path.join(transform_dir, "GoogleSalesTransactionReport_2020_02.csv") + transform_traffic_path = os.path.join(transform_dir, "GoogleBooksTrafficReport_2020_02.csv") + + # test transaction date inside of release month + gb_transform( + [sales_file_path, traffic_file_path], + transform_sales_path, + transform_traffic_path, + pendulum.parse("2020-02-01"), + ) + 
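# A small sketch of the timestamp-freezing pattern used above for
# add_new_dataset_release: pendulum.now is patched so the created/modified
# fields of the expected DatasetRelease dict are deterministic. In the tests the
# pendulum.now imported inside the telescope module is patched; patching the
# top-level attribute here is just for illustration.
from unittest.mock import patch

import pendulum

now = pendulum.now("Europe/London")
with patch("pendulum.now") as mock_now:
    mock_now.return_value = now
    assert pendulum.now() == now  # every call inside the block returns the frozen value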
self.assertTrue(os.path.exists(transform_sales_path)) + self.assertTrue(os.path.exists(transform_traffic_path)) + + # test transaction date before release month + with self.assertRaises(AirflowException): + gb_transform( + [sales_file_path, traffic_file_path], + transform_sales_path, + transform_traffic_path, + pendulum.parse("2020-01-31"), + ) + + # test transaction date after release month + with self.assertRaises(AirflowException): + gb_transform( + [sales_file_path, traffic_file_path], + transform_sales_path, + transform_traffic_path, + pendulum.parse("2020-03-01"), + ) diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/__init__.py b/tests/irus_fulcrum_telescope/__init__.py similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/__init__.py rename to tests/irus_fulcrum_telescope/__init__.py diff --git a/oaebu_workflows/schema/__init__.py b/tests/irus_fulcrum_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/schema/__init__.py rename to tests/irus_fulcrum_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/fulcrum_download_cassette.yaml b/tests/irus_fulcrum_telescope/fixtures/fulcrum_download_cassette.yaml similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/fulcrum_download_cassette.yaml rename to tests/irus_fulcrum_telescope/fixtures/fulcrum_download_cassette.yaml diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_country_download.jsonl b/tests/irus_fulcrum_telescope/fixtures/test_country_download.jsonl similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_country_download.jsonl rename to tests/irus_fulcrum_telescope/fixtures/test_country_download.jsonl diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_final_table.json b/tests/irus_fulcrum_telescope/fixtures/test_final_table.json similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_final_table.json rename to tests/irus_fulcrum_telescope/fixtures/test_final_table.json diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_totals_download.jsonl b/tests/irus_fulcrum_telescope/fixtures/test_totals_download.jsonl similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_totals_download.jsonl rename to tests/irus_fulcrum_telescope/fixtures/test_totals_download.jsonl diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_transform.jsonl b/tests/irus_fulcrum_telescope/fixtures/test_transform.jsonl similarity index 100% rename from oaebu_workflows/irus_fulcrum_telescope/tests/fixtures/test_transform.jsonl rename to tests/irus_fulcrum_telescope/fixtures/test_transform.jsonl diff --git a/oaebu_workflows/irus_fulcrum_telescope/tests/test_irus_fulcrum_telescope.py b/tests/irus_fulcrum_telescope/test_irus_fulcrum_telescope.py similarity index 60% rename from oaebu_workflows/irus_fulcrum_telescope/tests/test_irus_fulcrum_telescope.py rename to tests/irus_fulcrum_telescope/test_irus_fulcrum_telescope.py index 1250fcc9..269e0801 100644 --- a/oaebu_workflows/irus_fulcrum_telescope/tests/test_irus_fulcrum_telescope.py +++ b/tests/irus_fulcrum_telescope/test_irus_fulcrum_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2022-2023 Curtin University +# Copyright 2022-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
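# A hedged sketch of the month-bounds sanity check that gb_transform is being
# exercised for above: a transaction date outside the release month should fail
# the task. This helper is illustrative, not the gb_transform implementation.
import pendulum
from airflow.exceptions import AirflowException


def check_within_release_month(transaction_date: pendulum.DateTime, release_date: pendulum.DateTime) -> None:
    if not (release_date.start_of("month") <= transaction_date <= release_date.end_of("month")):
        raise AirflowException(
            f"Transaction date {transaction_date} falls outside release month {release_date.format('YYYY-MM')}"
        )


check_within_release_month(pendulum.datetime(2020, 2, 15), pendulum.datetime(2020, 2, 1))  # inside the month: passes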
@@ -15,6 +15,7 @@ # Author: Keegan Smith import os +from unittest.mock import patch import pendulum import vcr @@ -24,26 +25,24 @@ from oaebu_workflows.config import test_fixtures_folder from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.irus_fulcrum_telescope.irus_fulcrum_telescope import ( - IrusFulcrumTelescope, + IrusFulcrumRelease, + create_dag, download_fulcrum_month_data, transform_fulcrum_data, ) -from observatory.platform.files import load_jsonl -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) -from observatory.platform.api import get_dataset_releases -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.bigquery import bq_table_id -from observatory.platform.observatory_config import Workflow +from observatory_platform.files import load_jsonl +from observatory_platform.config import module_file_path +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, load_and_parse_json +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.google.bigquery import bq_table_id FAKE_PUBLISHERS = ["Fake Publisher 1", "Fake Publisher 2", "Fake Publisher 3"] -class TestIrusFulcrumTelescope(ObservatoryTestCase): +class TestIrusFulcrumTelescope(SandboxTestCase): """Tests for the Fulcrum telescope""" def __init__(self, *args, **kwargs): @@ -66,97 +65,100 @@ def __init__(self, *args, **kwargs): def test_dag_structure(self): """Test that the ONIX DAG has the correct structure and raises errors when necessary""" - dag = IrusFulcrumTelescope( - dag_id="fulcrum_test", cloud_workspace=self.fake_cloud_workspace, publishers=FAKE_PUBLISHERS - ).make_dag() - + dag = create_dag(dag_id="fulcrum_test", cloud_workspace=self.fake_cloud_workspace, publishers=FAKE_PUBLISHERS) self.assert_dag_structure( { - "check_dependencies": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], + "check_dependencies": ["make_release"], + "make_release": ["transform", "cleanup_workflow", "download", "add_new_dataset_releases", "bq_load"], + "download": ["transform"], + "transform": ["bq_load"], "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], }, dag, ) def test_dag_load(self): """Test that the DAG can be loaded from a DAG bag.""" - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="fulcrum_test", name="Fulcrum Telescope", - class_name="oaebu_workflows.irus_fulcrum_telescope.irus_fulcrum_telescope.IrusFulcrumTelescope", + class_name="oaebu_workflows.irus_fulcrum_telescope.irus_fulcrum_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(publishers=[FAKE_PUBLISHERS]), ) ] ) with env.create(): - self.assert_dag_load_from_config("fulcrum_test") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("fulcrum_test", dag_file) def test_telescope(self): """Test the Fulcrum telescope end to end.""" + # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, 
self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.project_id, self.data_location) + # Create the Observatory environment and run tests with env.create(): - # Setup Telescope + # Setup DAG execution_date = pendulum.datetime(year=2022, month=4, day=7) - partner = partner_from_str("irus_fulcrum") - partner.bq_dataset_id = env.add_dataset() - telescope = IrusFulcrumTelescope( - dag_id="fulcrum_test", + data_partner = partner_from_str("irus_fulcrum") + data_partner.bq_dataset_id = env.add_dataset() + api_dataset_id = env.add_dataset() + dag_id = "fulcrum_test" + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, publishers=FAKE_PUBLISHERS, - data_partner=partner, + data_partner=data_partner, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() - env.add_connection(Connection(conn_id=telescope.irus_oapen_api_conn_id, uri=f"http://fake_api_login:@")) + env.add_connection(Connection(conn_id="irus_api", uri=f"http://fake_api_login:@")) # Add the fake requestor ID as a connection with env.create_dag_run(dag, execution_date): # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + + # Test that make release is successful + ti = env.run_task("make_release") self.assertEqual(ti.state, State.SUCCESS) + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + expected_release_dict = { + "dag_id": "fulcrum_test", + "run_id": "scheduled__2022-04-07T00:00:00+00:00", + "data_interval_start": "2022-04-01", + "data_interval_end": "2022-05-01", + "partition_date": "2022-04-30", + } + self.assertEqual(release_dict, expected_release_dict) # Test download - fulcrum_vcr = vcr.VCR(record_mode="none") + # Ignore the googleapis host so the upload step works + fulcrum_vcr = vcr.VCR( + record_mode="none", ignore_hosts=["oauth2.googleapis.com", "storage.googleapis.com"] + ) with fulcrum_vcr.use_cassette(self.download_cassette): - ti = env.run_task(telescope.download.__name__) + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) - # Test upload downloaded - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) - # Test transform - ti = env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Test upload to cloud storage - ti = env.run_task(telescope.upload_transformed.__name__) + ti = env.run_task("transform") self.assertEqual(ti.state, State.SUCCESS) # Test load into BigQuery - ti = env.run_task(telescope.bq_load.__name__) + ti = env.run_task("bq_load") self.assertEqual(ti.state, State.SUCCESS) ### Make assertions ## # Create the release - release = telescope.make_release( - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(str(env.dag_run.data_interval_start)), - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - ) + release = IrusFulcrumRelease.from_dict(release_dict) # Downloaded files self.assert_file_integrity(release.download_totals_path, "95b7dceb", "gzip_crc") @@ -184,9 +186,7 @@ def test_telescope(self): # Uploaded table table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.data_partner.bq_dataset_id, - telescope.data_partner.bq_table_name, + env.cloud_workspace.project_id, data_partner.bq_dataset_id, data_partner.bq_table_name ) self.assert_table_integrity(table_id, expected_rows=3) 
self.assert_table_content( @@ -195,18 +195,43 @@ def test_telescope(self): primary_key="proprietary_id", ) - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.irus_fulcrum_telescope.irus_fulcrum_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("add_new_dataset_releases") self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2022-04-01T00:00:00+00:00", + "data_interval_end": "2022-05-01T00:00:00+00:00", + "snapshot_date": None, + "partition_date": "2022-04-30T00:00:00+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test cleanup - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("cleanup_workflow") self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) def test_download_fulcrum_month_data(self): """Tests the download_fuclrum_month_data function""" diff --git a/oaebu_workflows/sql/__init__.py b/tests/irus_oapen_telescope/__init__.py similarity index 100% rename from oaebu_workflows/sql/__init__.py rename to tests/irus_oapen_telescope/__init__.py diff --git a/oaebu_workflows/sql/internet_archive/__init__.py b/tests/irus_oapen_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/sql/internet_archive/__init__.py rename to tests/irus_oapen_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/irus_oapen_telescope/tests/fixtures/download.jsonl.gz b/tests/irus_oapen_telescope/fixtures/download.jsonl.gz similarity index 100% rename from oaebu_workflows/irus_oapen_telescope/tests/fixtures/download.jsonl.gz rename to tests/irus_oapen_telescope/fixtures/download.jsonl.gz diff --git a/oaebu_workflows/irus_oapen_telescope/tests/test_irus_oapen_telescope.py b/tests/irus_oapen_telescope/test_irus_oapen_telescope.py similarity index 59% rename from oaebu_workflows/irus_oapen_telescope/tests/test_irus_oapen_telescope.py rename to tests/irus_oapen_telescope/test_irus_oapen_telescope.py index e9c7c621..34e1cca7 100644 --- a/oaebu_workflows/irus_oapen_telescope/tests/test_irus_oapen_telescope.py +++ b/tests/irus_oapen_telescope/test_irus_oapen_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -30,29 +30,29 @@ from googleapiclient.discovery import build from googleapiclient.http import RequestMockBuilder -from oaebu_workflows.config import test_fixtures_folder +from oaebu_workflows.config import test_fixtures_folder, module_file_path from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope import ( + IRUS_FUNCTION_SOURCE_URL, + IRUS_FUNCTION_BLOB_NAME, + IRUS_FUNCTION_NAME, + IRUS_FUNCTION_REGION, IrusOapenRelease, - IrusOapenTelescope, call_cloud_function, cloud_function_exists, create_cloud_function, upload_source_code_to_bucket, + create_dag, ) -from observatory.platform.api import get_dataset_releases -from observatory.platform.observatory_config import CloudWorkspace, Workflow -from observatory.platform.gcs import gcs_blob_name_from_path, gcs_upload_file -from observatory.platform.bigquery import bq_table_id -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - random_id, -) +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.airflow.workflow import CloudWorkspace, Workflow +from observatory_platform.google.gcs import gcs_blob_name_from_path, gcs_upload_file +from observatory_platform.google.bigquery import bq_table_id +from observatory_platform.sandbox.test_utils import SandboxTestCase, find_free_port, random_id +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment -class TestIrusOapenTelescope(ObservatoryTestCase): +class TestIrusOapenTelescope(SandboxTestCase): """Tests for the Oapen Irus Uk telescope""" def __init__(self, *args, **kwargs): @@ -71,23 +71,31 @@ def __init__(self, *args, **kwargs): def test_dag_structure(self): """Test that the Oapen Irus Uk DAG has the correct structure.""" - dag = IrusOapenTelescope( + dag = create_dag( dag_id="irus_oapen_test_dag", cloud_workspace=self.fake_cloud_workspace, publisher_name_v4=self.publisher_name_v4, publisher_uuid_v5=self.publisher_uuid_v5, - ).make_dag() + ) self.assert_dag_structure( { - "check_dependencies": ["create_cloud_function"], - "create_cloud_function": ["call_cloud_function"], - "call_cloud_function": ["transfer"], - "transfer": ["download_transform"], - "download_transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], - "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "check_dependencies": ["fetch_releases"], + "fetch_releases": [ + "process_release.transfer", + "process_release.transform", + "process_release.call_cloud_function_", + "create_cloud_function_", + "process_release.bq_load", + "process_release.add_new_dataset_releases", + "process_release.cleanup_workflow", + ], + "create_cloud_function_": ["process_release.call_cloud_function_"], + "process_release.call_cloud_function_": ["process_release.transfer"], + "process_release.transfer": ["process_release.transform"], + "process_release.transform": ["process_release.bq_load"], + "process_release.bq_load": ["process_release.add_new_dataset_releases"], + "process_release.add_new_dataset_releases": ["process_release.cleanup_workflow"], + "process_release.cleanup_workflow": [], }, dag, ) @@ -95,44 +103,48 @@ def test_dag_structure(self): def test_dag_load(self): """Test that the Oapen Irus Uk DAG can be loaded from a DAG bag.""" - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="irus_oapen_test", name="My Oapen Irus UK Workflow", - 
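# A hedged sketch of the DAG shape implied by the new task IDs above
# ("process_release.*" tasks run with map_index=0): a TaskFlow task group
# expanded over the releases returned by fetch_releases. All names here are
# illustrative; this is not the actual create_dag factory in the telescope
# module.
import pendulum
from airflow.decorators import dag, task, task_group


@dag(schedule=None, start_date=pendulum.datetime(2024, 1, 1), catchup=False)
def example_dag():
    @task
    def fetch_releases() -> list[dict]:
        return [{"partition_date": "2021-02-28"}]  # one dict per release

    @task_group(group_id="process_release")
    def process_release(release: dict):
        @task
        def transform(release: dict) -> dict:
            return release  # stands in for the real transfer/transform/bq_load chain

        transform(release)

    # Expanding the group yields mapped task instances such as
    # "process_release.transform" with map_index 0, 1, ...
    process_release.expand(release=fetch_releases())


example_dag()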
class_name="oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.IrusOapenTelescope", + class_name="oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(publisher_name_v4=self.publisher_name_v4, publisher_uuid_v5=self.publisher_uuid_v5), ) ], ) with env.create(): - self.assert_dag_load_from_config("irus_oapen_test") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("irus_oapen_test", dag_file) @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.build") @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.ServiceAccountCredentials") @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.AuthorizedSession.post") def test_telescope(self, mock_authorized_session, mock_account_credentials, mock_build): - """Test the Oapen Irus Uk telescope end to end.""" + """Test the IRUS OAPEN telescope end to end.""" + # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.project_id, self.data_location) - # Setup Telescope + # Setup DAG execution_date = pendulum.datetime(year=2021, month=2, day=14) - patner = partner_from_str("irus_oapen") - patner.bq_dataset_id = env.add_dataset() - telescope = IrusOapenTelescope( - dag_id="irus_oapen_test", + data_partner = partner_from_str("irus_oapen") + data_partner.bq_dataset_id = env.add_dataset() + dag_id = "irus_oapen_test" + gdpr_bucket_id = env.add_bucket() + api_dataset_id = env.add_dataset() + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, publisher_name_v4=self.publisher_name_v4, publisher_uuid_v5=self.publisher_uuid_v5, - data_partner=patner, + data_partner=data_partner, + gdpr_oapen_project_id=env.project_id, + gdpr_oapen_bucket_id=gdpr_bucket_id, + api_dataset_id=api_dataset_id, ) # Fake oapen project and bucket - IrusOapenTelescope.OAPEN_PROJECT_ID = env.project_id - IrusOapenTelescope.OAPEN_BUCKET = random_id() # Mock the Google Cloud Functions API service mock_account_credentials.from_json_keyfile_dict.return_value = "" @@ -168,35 +180,43 @@ def test_telescope(self, mock_authorized_session, mock_account_credentials, mock requestBuilder=request_builder, ) - dag = telescope.make_dag() - # Create the Observatory environment and run tests with env.create(task_logging=True): with env.create_dag_run(dag, execution_date): - # Use release to check results from tasks - # release = IrusOapenRelease( - # dag_id=telescope.dag_id, run_id=env.dag_run.run_id, partition_date=execution_date.end_of("month") - # ) - release = telescope.make_release( - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(str(env.dag_run.data_interval_start)), - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - )[0] # Add airflow connections - conn = Connection(conn_id=telescope.geoip_license_conn_id, uri="http://email_address:password@") + geoip_license_conn_id = "geoip_license_key" + conn = Connection(conn_id=geoip_license_conn_id, uri="http://email_address:password@") env.add_connection(conn) - conn = Connection(conn_id=telescope.irus_oapen_api_conn_id, uri="mysql://requestor_id:api_key@") + irus_oapen_api_conn_id = "irus_api" + conn = Connection(conn_id=irus_oapen_api_conn_id, uri="mysql://requestor_id:api_key@") env.add_connection(conn) - conn = Connection(conn_id=telescope.irus_oapen_login_conn_id, uri="mysql://user_id:license_key@") 
+ irus_oapen_login_conn_id = "irus_login" + conn = Connection(conn_id=irus_oapen_login_conn_id, uri="mysql://user_id:license_key@") env.add_connection(conn) # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") self.assertEqual(ti.state, State.SUCCESS) + # Make the release + ti = env.run_task("fetch_releases") + self.assertEqual(ti.state, State.SUCCESS) + release_dicts = ti.xcom_pull(task_ids="fetch_releases", include_prior_dates=False) + expected_release_dicts = [ + { + "dag_id": "irus_oapen_test", + "run_id": "scheduled__2021-02-14T00:00:00+00:00", + "data_interval_start": "2021-02-01T00:00:00+00:00", + "data_interval_end": "2021-03-01T00:00:00+00:00", + "partition_date": "2021-02-28T23:59:59.999999+00:00", + } + ] + self.assertEqual(release_dicts, expected_release_dicts) + release = IrusOapenRelease.from_dict(release_dicts[0]) + # Test create cloud function task: no error should be thrown - ti = env.run_task(telescope.create_cloud_function.__name__) + ti = env.run_task("create_cloud_function_") self.assertEqual(ti.state, State.SUCCESS) # Test call cloud function task: no error should be thrown @@ -213,57 +233,78 @@ def test_telescope(self, mock_authorized_session, mock_account_credentials, mock ) url = "https://oapen-access-stats-kkinbzaigla-ew.a.run.app" httpretty.register_uri(httpretty.POST, url, body="") - ti = env.run_task(telescope.call_cloud_function.__name__) + ti = env.run_task("process_release.call_cloud_function_", map_index=0) self.assertEqual(ti.state, State.SUCCESS) # Test transfer task gcs_upload_file( - bucket_name=IrusOapenTelescope.OAPEN_BUCKET, - blob_name=release.blob_name, + bucket_name=gdpr_bucket_id, + blob_name=release.download_blob_name, file_path=self.download_path, ) - ti = env.run_task(telescope.transfer.__name__) + ti = env.run_task("process_release.transfer", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - self.assert_blob_integrity(env.download_bucket, release.blob_name, self.download_path) + self.assert_blob_integrity(env.download_bucket, release.download_blob_name, self.download_path) - # Test download_transform task - ti = env.run_task(telescope.download_transform.__name__) + # Test transform task + ti = env.run_task("process_release.transform", map_index=0) self.assertEqual(ti.state, State.SUCCESS) self.assertTrue(os.path.exists(release.transform_path)) self.assert_file_integrity(release.transform_path, "0b111b2f", "gzip_crc") - - # Test that transformed file uploaded - ti = env.run_task(telescope.upload_transformed.__name__) - self.assertEqual(ti.state, State.SUCCESS) self.assert_blob_integrity( env.transform_bucket, gcs_blob_name_from_path(release.transform_path), release.transform_path ) - # Test that data loaded into BigQuery - ti = env.run_task(telescope.bq_load.__name__) + # Test that data loads into BigQuery + ti = env.run_task("process_release.bq_load", map_index=0) self.assertEqual(ti.state, State.SUCCESS) table_id = bq_table_id( - project_id=telescope.cloud_workspace.project_id, - dataset_id=telescope.data_partner.bq_dataset_id, - table_id=telescope.data_partner.bq_table_name, + project_id=env.cloud_workspace.project_id, + dataset_id=data_partner.bq_dataset_id, + table_id=data_partner.bq_table_name, ) self.assert_table_integrity(table_id, 2) # Delete oapen bucket - env._delete_bucket(IrusOapenTelescope.OAPEN_BUCKET) + env._delete_bucket(gdpr_bucket_id) # Add_dataset_release_task - dataset_releases = 
get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("process_release.add_new_dataset_releases", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2021-02-01T00:00:00+00:00", + "data_interval_end": "2021-03-01T00:00:00+00:00", + "snapshot_date": None, + "partition_date": "2021-02-28T23:59:59.999999+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test that all telescope data deleted - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("process_release.cleanup_workflow", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) @patch("observatory.platform.airflow.Variable.get") @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.upload_source_code_to_bucket") @@ -282,10 +323,10 @@ def reset_mocks(): def assert_mocks(create: bool, update: bool): mock_upload.assert_called_once_with( - telescope.FUNCTION_SOURCE_URL, - telescope.OAPEN_PROJECT_ID, - telescope.OAPEN_BUCKET, - telescope.FUNCTION_BLOB_NAME, + IRUS_FUNCTION_SOURCE_URL, + gdpr_oapen_project_id, + gdpr_oapen_bucket_id, + IRUS_FUNCTION_BLOB_NAME, release.cloud_function_path, ) mock_function_exists.assert_called_once_with(ANY, full_name) @@ -294,9 +335,9 @@ def assert_mocks(create: bool, update: bool): ANY, location, full_name, - telescope.OAPEN_BUCKET, - telescope.FUNCTION_BLOB_NAME, - telescope.max_active_runs, + gdpr_oapen_bucket_id, + IRUS_FUNCTION_BLOB_NAME, + max_active_runs, update, ) else: @@ -311,122 +352,73 @@ def assert_mocks(create: bool, update: bool): transform_bucket="transform_bucket", data_location="us", ) - telescope = IrusOapenTelescope( - dag_id="irus_oapen_test", + dag_id = "irus_oapen_test" + publisher_name_v4 = "publisher" + publisher_uuid_v5 = "publisherUUID" + bq_dataset_id = "dataset_id" + gdpr_oapen_project_id = "oapen-usage-data-gdpr-proof" + gdpr_oapen_bucket_id = "oapen-usage-data-gdpr-proof_cloud-function" + max_active_runs = 5 + dag = create_dag( + dag_id=dag_id, cloud_workspace=cloud_workspace, - publisher_name_v4="publisher", - publisher_uuid_v5="publisherUUID", - bq_dataset_id="dataset_id", - ) - release = IrusOapenRelease( - dag_id=telescope.dag_id, run_id=random_id(), partition_date=pendulum.parse("2020-02-01") + publisher_name_v4=publisher_name_v4, + 
publisher_uuid_v5=publisher_uuid_v5, + bq_dataset_id=bq_dataset_id, + gdpr_oapen_project_id=gdpr_oapen_project_id, + gdpr_oapen_bucket_id=gdpr_oapen_bucket_id, + max_active_runs=max_active_runs, ) - location = f"projects/{telescope.OAPEN_PROJECT_ID}/locations/{telescope.FUNCTION_REGION}" - full_name = f"{location}/functions/{telescope.FUNCTION_NAME}" + release = IrusOapenRelease(dag_id=dag_id, run_id=random_id(), partition_date=pendulum.parse("2020-02-01")) + location = f"projects/{gdpr_oapen_project_id}/locations/{IRUS_FUNCTION_REGION}" + full_name = f"{location}/functions/{IRUS_FUNCTION_NAME}" - # Test when source code upload was unsuccessful - mock_upload.return_value = False, False - task_instance = MagicMock() - # context = dict(ti=task_instance) - with self.assertRaises(AirflowException): - telescope.create_cloud_function(releases=[release], ti=task_instance) - - # Test when cloud function does not exist - reset_mocks() - mock_upload.return_value = True, True - mock_function_exists.return_value = False - mock_create_function.return_value = True, "response" - telescope.create_cloud_function(releases=[release], ti=task_instance) - assert_mocks(create=True, update=False) - - # Test when cloud function exists, but source code has changed - reset_mocks() - mock_upload.return_value = True, True - mock_function_exists.return_value = True - mock_create_function.return_value = True, "response" - telescope.create_cloud_function(telescope.max_active_runs) - assert_mocks(create=False, update=True) - - # Test when cloud function exists and source code has not changed - reset_mocks() - mock_upload.return_value = True, False - mock_function_exists.return_value = True - telescope.create_cloud_function(releases=[release], ti=task_instance) - assert_mocks(create=False, update=False) - - # Test when create cloud function was unsuccessful - reset_mocks() - mock_upload.return_value = True, True - mock_function_exists.return_value = True - mock_create_function.return_value = False, "response" - with self.assertRaises(AirflowException): - telescope.create_cloud_function(releases=[release], ti=task_instance) - - @patch("observatory.platform.airflow.Variable.get") - @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.BaseHook.get_connection") - @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.call_cloud_function") - @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.cloud_function_exists") - def test_call_cloud_function(self, mock_function_exists, mock_call_function, mock_conn_get, mock_variable_get): - """Test the call_cloud_function method of the IrusOapenRelease - - :param mock_variable_get: Mock Airflow Variable 'data' - """ - connections = { - "geoip_license_key": Connection("geoip_license_key", uri="http://user_id:key@"), - "irus_oapen_api": Connection("irus_oapen_api", uri="http://requestor_id:api_key@"), - "irus_oapen_login": Connection("irus_oapen_login", uri="http://email:password@"), - } - mock_conn_get.side_effect = lambda x: connections[x] - - # Set URI to function url - function_url = "https://oapen-access-stats-kkinbzfjal-ew.a.run.app" - mock_function_exists.return_value = function_url - - with CliRunner().isolated_filesystem(): - mock_variable_get.return_value = os.path.join(os.getcwd(), "data") - cloud_workspace = CloudWorkspace( - project_id=self.project_id, - download_bucket="download_bucket", - transform_bucket="transform_bucket", - data_location="us", + env = SandboxEnvironment( + self.project_id, self.data_location, api_host="localhost", 
api_port=find_free_port() ) - - # Test new platform and old platform - for date in ["2020-03", "2020-04"]: - # Test for a given publisher name and the 'oapen' publisher - for publisher in [("publisher", "uuid1"), ("oapen", "uuid2")]: - mock_call_function.reset_mock() - - telescope = IrusOapenTelescope( - dag_id="irus_oapen_test", - cloud_workspace=cloud_workspace, - publisher_name_v4=publisher[0], - publisher_uuid_v5=publisher[1], - bq_dataset_id="dataset_id", - ) - release = IrusOapenRelease( - dag_id=telescope.dag_id, run_id=random_id(), partition_date=pendulum.parse(date + "-01") - ) - telescope.call_cloud_function(releases=[release]) - - # Test that the call function is called with the correct args - if date == "2020-04": - username = "requestor_id" - password = "api_key" - else: - username = "email" - password = "password" - mock_call_function.assert_called_once_with( - function_url, - date, - username, - password, - "key", - telescope.publisher_name_v4, - telescope.publisher_uuid_v5, - telescope.OAPEN_BUCKET, - release.blob_name, - ) + with env.create_dag_run(dag, pendulum.datetime(year=2023, month=1, day=1)): + + ti = env.run_task("fetch_releases") + + # Test when source code upload was unsuccessful + mock_upload.return_value = False, False + with self.assertRaises(AirflowException): + env.run_task("create_cloud_function") + dag.clear(task_ids=["create_cloud_function"]) + + # Test when cloud function does not exist + reset_mocks() + mock_upload.return_value = True, True + mock_function_exists.return_value = False + mock_create_function.return_value = True, "response" + env.run_task("create_cloud_function") + assert_mocks(create=True, update=False) + dag.clear(task_ids=["create_cloud_function"]) + + # Test when cloud function exists, but source code has changed + reset_mocks() + mock_upload.return_value = True, True + mock_function_exists.return_value = True + mock_create_function.return_value = True, "response" + env.run_task("create_cloud_function") + assert_mocks(create=False, update=True) + dag.clear(task_ids=["create_cloud_function"]) + + # Test when cloud function exists and source code has not changed + reset_mocks() + mock_upload.return_value = True, False + mock_function_exists.return_value = True + env.run_task("create_cloud_function") + assert_mocks(create=False, update=False) + dag.clear(task_ids=["create_cloud_function"]) + + # Test when create cloud function was unsuccessful + reset_mocks() + mock_upload.return_value = True, True + mock_function_exists.return_value = True + mock_create_function.return_value = False, "response" + with self.assertRaises(AirflowException): + env.run_task("create_cloud_function") @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.gcs_upload_file") @patch("oaebu_workflows.irus_oapen_telescope.irus_oapen_telescope.gcs_create_bucket") @@ -438,23 +430,24 @@ def test_upload_source_code_to_bucket(self, mock_create_bucket, mock_upload_to_b with CliRunner().isolated_filesystem(): cloud_function_path = os.path.join(os.getcwd(), "cloud_function.zip") success, upload = upload_source_code_to_bucket( - IrusOapenTelescope.FUNCTION_SOURCE_URL, - IrusOapenTelescope.OAPEN_PROJECT_ID, - IrusOapenTelescope.OAPEN_BUCKET, - IrusOapenTelescope.FUNCTION_BLOB_NAME, - cloud_function_path, + source_url=IRUS_FUNCTION_SOURCE_URL, + project_id="project_id", + bucket_name="bucket", + blob_name=IRUS_FUNCTION_BLOB_NAME, + cloud_function_path=cloud_function_path, ) self.assertEqual(success, True) self.assertEqual(upload, True) - 
IrusOapenTelescope.FUNCTION_MD5_HASH = "different" + # Check that an error is raised if the md5 hash doesn't match with self.assertRaises(AirflowException): upload_source_code_to_bucket( - IrusOapenTelescope.FUNCTION_SOURCE_URL, - IrusOapenTelescope.OAPEN_PROJECT_ID, - IrusOapenTelescope.OAPEN_BUCKET, - IrusOapenTelescope.FUNCTION_BLOB_NAME, - cloud_function_path, + source_url=IRUS_FUNCTION_SOURCE_URL, + project_id="project_id", + bucket_name="bucket", + blob_name=IRUS_FUNCTION_BLOB_NAME, + cloud_function_path=cloud_function_path, + expected_md5_hash="different", ) def test_cloud_function_exists(self): @@ -591,7 +584,7 @@ def test_create_cloud_function(self): def test_call_cloud_function(self, mock_authorized_session): """Test the function that calls the cloud function""" - function_url = "function_url" + function_uri = "function_uri" release_date = "2020-01-01" username = "username" password = "password" @@ -625,42 +618,42 @@ def test_call_cloud_function(self, mock_authorized_session): ] # Test when there are unprocessed publishers (first 2 responses from side effect) call_cloud_function( - function_url, - release_date, - username, - password, - geoip_license_key, - publisher_name, - publisher_uuid, - bucket_name, - blob_name, + function_uri=function_uri, + release_date=release_date, + username=username, + password=password, + geoip_license_key=geoip_license_key, + publisher_name_v4=publisher_name, + publisher_uuid_v5=publisher_uuid, + bucket_name=bucket_name, + blob_name=blob_name, ) self.assertEqual(2, mock_authorized_session.call_count) # Test when entries is 0 (3rd response from side effect) with self.assertRaises(AirflowSkipException): call_cloud_function( - function_url, - release_date, - username, - password, - geoip_license_key, - publisher_name, - publisher_uuid, - bucket_name, - blob_name, + function_uri=function_uri, + release_date=release_date, + username=username, + password=password, + geoip_license_key=geoip_license_key, + publisher_name_v4=publisher_name, + publisher_uuid_v5=publisher_uuid, + bucket_name=bucket_name, + blob_name=blob_name, ) # Test when response status code is not 200 (last response from side effect) with self.assertRaises(AirflowException): call_cloud_function( - function_url, - release_date, - username, - password, - geoip_license_key, - publisher_name, - publisher_uuid, - bucket_name, - blob_name, + function_uri=function_uri, + release_date=release_date, + username=username, + password=password, + geoip_license_key=geoip_license_key, + publisher_name_v4=publisher_name, + publisher_uuid_v5=publisher_uuid, + bucket_name=bucket_name, + blob_name=blob_name, ) diff --git a/oaebu_workflows/tests/__init__.py b/tests/jstor_telescope/__init__.py similarity index 100% rename from oaebu_workflows/tests/__init__.py rename to tests/jstor_telescope/__init__.py diff --git a/oaebu_workflows/tests/fixtures/__init__.py b/tests/jstor_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/tests/fixtures/__init__.py rename to tests/jstor_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/collection_country.json b/tests/jstor_telescope/fixtures/collection_country.json similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/collection_country.json rename to tests/jstor_telescope/fixtures/collection_country.json diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/collection_country_table.json b/tests/jstor_telescope/fixtures/collection_country_table.json similarity index 100% 
rename from oaebu_workflows/jstor_telescope/tests/fixtures/collection_country_table.json rename to tests/jstor_telescope/fixtures/collection_country_table.json diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/collection_institution.json b/tests/jstor_telescope/fixtures/collection_institution.json similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/collection_institution.json rename to tests/jstor_telescope/fixtures/collection_institution.json diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/collection_institution_table.json b/tests/jstor_telescope/fixtures/collection_institution_table.json similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/collection_institution_table.json rename to tests/jstor_telescope/fixtures/collection_institution_table.json diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/country_20220801.tsv b/tests/jstor_telescope/fixtures/country_20220801.tsv similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/country_20220801.tsv rename to tests/jstor_telescope/fixtures/country_20220801.tsv diff --git a/oaebu_workflows/jstor_telescope/tests/fixtures/institution_20220801.tsv b/tests/jstor_telescope/fixtures/institution_20220801.tsv similarity index 100% rename from oaebu_workflows/jstor_telescope/tests/fixtures/institution_20220801.tsv rename to tests/jstor_telescope/fixtures/institution_20220801.tsv diff --git a/oaebu_workflows/jstor_telescope/tests/test_jstor_telescope.py b/tests/jstor_telescope/test_jstor_telescope.py similarity index 75% rename from oaebu_workflows/jstor_telescope/tests/test_jstor_telescope.py rename to tests/jstor_telescope/test_jstor_telescope.py index ae52cd84..0016d6a8 100644 --- a/oaebu_workflows/jstor_telescope/tests/test_jstor_telescope.py +++ b/tests/jstor_telescope/test_jstor_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -28,25 +28,23 @@ from googleapiclient.discovery import build from googleapiclient.http import HttpMockSequence -from oaebu_workflows.config import test_fixtures_folder +from oaebu_workflows.config import test_fixtures_folder, module_file_path from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.jstor_telescope.jstor_telescope import ( + JSTOR_PROCESSED_LABEL_NAME, JstorRelease, - JstorTelescope, JstorPublishersAPI, JstorCollectionsAPI, + create_dag, make_jstor_api, ) -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) -from observatory.platform.observatory_config import Workflow -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.bigquery import bq_table_id -from observatory.platform.api import get_dataset_releases + +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, load_and_parse_json +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment +from observatory_platform.google.gcs import gcs_blob_name_from_path, gcs_upload_files +from observatory_platform.google.bigquery import bq_table_id +from observatory_platform.dataset_api import DatasetAPI def dummy_gmail_connection() -> Connection: @@ -57,34 +55,37 @@ def dummy_gmail_connection() -> Connection: ) -class TestTelescopeSetup(ObservatoryTestCase): +class TestTelescopeSetup(SandboxTestCase): def __init__(self, *args, **kwargs): super(TestTelescopeSetup, self).__init__(*args, **kwargs) self.entity_id = "anupress" def test_dag_structure(self): """Test that the Jstor DAG has the correct structure.""" - env = ObservatoryEnvironment() + env = SandboxEnvironment() with env.create(): env.add_connection(dummy_gmail_connection()) for entity_type in ["publisher", "collection"]: - dag = JstorTelescope( - "jstor", + dag = create_dag( + dag_id="jstor_test", cloud_workspace=self.fake_cloud_workspace, entity_id=self.entity_id, entity_type=entity_type, - ).make_dag() + ) self.assert_dag_structure( { "check_dependencies": ["list_reports"], - "list_reports": ["download_reports"], - "download_reports": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], - "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "list_reports": ["download"], + "download": [ + "process_release.transform", + "process_release.bq_load", + "process_release.add_new_dataset_releases", + "process_release.cleanup_workflow", + ], + "process_release.transform": ["process_release.bq_load"], + "process_release.bq_load": ["process_release.add_new_dataset_releases"], + "process_release.add_new_dataset_releases": ["process_release.cleanup_workflow"], + "process_release.cleanup_workflow": [], }, dag, ) @@ -92,12 +93,12 @@ def test_dag_structure(self): def test_dag_load(self): """Test that the Jstor DAG can be loaded from a DAG bag.""" for entity_type in ["publisher", "collection"]: - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="jstor_test_telescope", name="My JSTOR Workflow", - class_name="oaebu_workflows.jstor_telescope.jstor_telescope.JstorTelescope", + class_name="oaebu_workflows.jstor_telescope.jstor_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(entity_id=self.entity_id, entity_type=entity_type), ) @@ -106,10 +107,12 
@@ def test_dag_load(self): with env.create(): env.add_connection(dummy_gmail_connection()) - self.assert_dag_load_from_config("jstor_test_telescope") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("jstor_test_telescope", dag_file) def test_entity_type(self): - env = ObservatoryEnvironment() + """Test that the correct entity type is enforced.""" + env = SandboxEnvironment() with env.create(): env.add_connection(dummy_gmail_connection()) for entity_type in ["collection", "publisher"]: @@ -118,7 +121,7 @@ def test_entity_type(self): make_jstor_api("invalid", self.entity_id) -class TestJstorTelescopePublisher(ObservatoryTestCase): +class TestJstorTelescopePublisher(SandboxTestCase): """Tests for the Jstor telescope""" def __init__(self, *args, **kwargs): @@ -180,10 +183,7 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): mock_build.return_value = build("gmail", "v1", http=http) # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) - dataset_id = env.add_dataset() + env = SandboxEnvironment(self.project_id, self.data_location) # Create the Observatory environment and run tests with env.create(task_logging=True): @@ -193,23 +193,27 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): # Setup Telescope execution_date = pendulum.datetime(year=2020, month=11, day=1) country_partner = partner_from_str("jstor_country") + dataset_id = env.add_dataset() country_partner.bq_dataset_id = dataset_id institution_partner = partner_from_str("jstor_institution") institution_partner.bq_dataset_id = dataset_id - telescope = JstorTelescope( - dag_id="jstor_test_telescope", + api_dataset_id = env.add_dataset() + dag_id = "jstor_test_telescope" + entity_type = "publisher" + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, entity_id=self.entity_id, - entity_type="publisher", + entity_type=entity_type, country_partner=country_partner, institution_partner=institution_partner, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() # Begin DAG run with env.create_dag_run(dag, execution_date): # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") self.assertEqual(ti.state, State.SUCCESS) # Test list releases task with files available @@ -218,11 +222,9 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): self.setup_mock_file_download( report["url"], report["path"], headers=report["headers"], method=httpretty.HEAD ) - ti = env.run_task(telescope.list_reports.__name__) + ti = env.run_task("list_reports") self.assertEqual(ti.state, State.SUCCESS) - available_reports = ti.xcom_pull( - key=JstorTelescope.REPORTS_INFO, task_ids=telescope.list_reports.__name__, include_prior_dates=False - ) + available_reports = ti.xcom_pull(task_ids="list_reports", include_prior_dates=False) self.assertIsInstance(available_reports, list) expected_reports_info = [ {"type": "country", "url": self.country_report["url"], "id": "1788ec9e91f3de62"}, @@ -231,42 +233,57 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): self.assertListEqual(expected_reports_info, available_reports) # Test download_reports task - with httpretty.enabled(): + with httpretty.enabled(), patch( + "oaebu_workflows.jstor_telescope.jstor_telescope.gcs_upload_files" + ) as 
mock_upload: + mock_upload.return_value = True for report in [self.country_report, self.institution_report]: self.setup_mock_file_download(report["url"], report["path"], headers=report["headers"]) - ti = env.run_task(telescope.download_reports.__name__) + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) # use release info for other tasks - available_releases = ti.xcom_pull( - key=JstorTelescope.RELEASE_INFO, - task_ids=telescope.download_reports.__name__, - include_prior_dates=False, - ) - self.assertIsInstance(available_releases, dict) - self.assertEqual(1, len(available_releases)) - for release_date, reports in available_releases.items(): - self.assertEqual(self.release_date.date(), pendulum.parse(release_date).date()) - self.assertIsInstance(reports, list) - self.assertListEqual(expected_reports_info, reports) - release = JstorRelease( - dag_id=telescope.dag_id, - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(release_date).start_of("month"), - data_interval_end=pendulum.parse(release_date).add(days=1).start_of("month"), - partition_date=pendulum.parse(release_date), - reports=reports, - ) + release_dicts = ti.xcom_pull(task_ids="download", include_prior_dates=False) + expected_releases = [ + { + "dag_id": "jstor_test_telescope", + "run_id": "scheduled__2020-11-01T00:00:00+00:00", + "data_interval_start": "2022-07-01", + "data_interval_end": "2022-08-01", + "partition_date": "2022-07-31", + "reports": [ + { + "type": "country", + "url": "https://www.jstor.org/admin/reports/download/249192019", + "id": "1788ec9e91f3de62", + }, + { + "type": "institution", + "url": "https://www.jstor.org/admin/reports/download/129518301", + "id": "1788ebe4ecbab055", + }, + ], + } + ] + self.assertEqual(release_dicts, expected_releases) + release = JstorRelease.from_dict(release_dicts[0]) + # Check that the files were "downloaded" self.assertTrue(os.path.exists(release.download_country_path)) self.assertTrue(os.path.exists(release.download_institution_path)) self.assert_file_integrity(release.download_country_path, self.country_report["download_hash"], "md5") self.assert_file_integrity( release.download_institution_path, self.institution_report["download_hash"], "md5" ) + + # Do the upload that we patched above + success = gcs_upload_files( + bucket_name=env.cloud_workspace.download_bucket, + file_paths=[release.download_institution_path, release.download_country_path], + ) + self.assertTrue(success) + # Test that file uploaded - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) self.assert_blob_integrity( env.download_bucket, gcs_blob_name_from_path(release.download_country_path), @@ -279,7 +296,7 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): ) # Test that file transformed - ti = env.run_task(telescope.transform.__name__) + ti = env.run_task("process_release.transform", map_index=0) self.assertEqual(ti.state, State.SUCCESS) self.assertTrue(os.path.exists(release.transform_country_path)) self.assertTrue(os.path.exists(release.transform_institution_path)) @@ -290,8 +307,7 @@ def test_telescope_publisher(self, mock_account_credentials, mock_build): release.transform_institution_path, self.institution_report["transform_hash"], "gzip_crc" ) - # Test that transformed file uploaded - ti = env.run_task(telescope.upload_transformed.__name__) + # Test that transformed file was uploaded self.assertEqual(ti.state, State.SUCCESS) self.assert_blob_integrity( env.transform_bucket, @@ -304,34 +320,59 
@@ def test_telescope_publisher(self, mock_account_credentials, mock_build): release.transform_institution_path, ) - # Test that data loaded into BigQuery - ti = env.run_task(telescope.bq_load.__name__) + # Test that data is loaded into BigQuery + ti = env.run_task("process_release.bq_load", map_index=0) self.assertEqual(ti.state, State.SUCCESS) country_table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.country_partner.bq_dataset_id, - telescope.country_partner.bq_table_name, + env.cloud_workspace.project_id, + country_partner.bq_dataset_id, + country_partner.bq_table_name, ) institution_table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.institution_partner.bq_dataset_id, - telescope.institution_partner.bq_table_name, + env.cloud_workspace.project_id, + institution_partner.bq_dataset_id, + institution_partner.bq_table_name, ) self.assert_table_integrity(country_table_id, self.country_report["table_rows"]) self.assert_table_integrity(institution_table_id, self.institution_report["table_rows"]) - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.jstor_telescope.jstor_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("process_release.add_new_dataset_releases", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2022-07-01T00:00:00+00:00", + "data_interval_end": "2022-08-01T00:00:00+00:00", + "snapshot_date": None, + "partition_date": "2022-07-31T00:00:00+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test that all telescope data deleted - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("process_release.cleanup_workflow", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) def test_get_release_date(self): """Test that the get_release_date returns the correct release date and raises an exception when dates are @@ -380,7 +421,7 @@ def test_get_release_date(self): api.get_release_date(reports[3]["file"]) -class TestJstorTelescopeCollection(ObservatoryTestCase): +class TestJstorTelescopeCollection(SandboxTestCase): """Tests for the Jstor telescope""" def __init__(self, *args, **kwargs): @@ -420,45 +461,44 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): mock_build.return_value = build("gmail", "v1", 
http=http) # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) - dataset_id = env.add_dataset() + env = SandboxEnvironment(self.project_id, self.data_location) # Create the Observatory environment and run tests with env.create(task_logging=True): # Add gmail connection env.add_connection(dummy_gmail_connection()) - # Setup Telescope + # Setup DAG execution_date = pendulum.datetime(year=2023, month=10, day=4) country_partner = partner_from_str("jstor_country_collection") + dataset_id = env.add_dataset() country_partner.bq_dataset_id = dataset_id institution_partner = partner_from_str("jstor_institution_collection") institution_partner.bq_dataset_id = dataset_id - telescope = JstorTelescope( - dag_id="jstor_test_telescope", + api_dataset_id = env.add_dataset() + dag_id = "jstor_test_telescope" + entity_type = "collection" + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, entity_id=self.entity_id, - entity_type="collection", + entity_type=entity_type, country_partner=country_partner, institution_partner=institution_partner, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() # Begin DAG run with env.create_dag_run(dag, execution_date): # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") self.assertEqual(ti.state, State.SUCCESS) # Test list releases task with files available - ti = env.run_task(telescope.list_reports.__name__) + ti = env.run_task("list_reports") self.assertEqual(ti.state, State.SUCCESS) - available_reports = ti.xcom_pull( - key=JstorTelescope.REPORTS_INFO, task_ids=telescope.list_reports.__name__, include_prior_dates=False - ) + available_reports = ti.xcom_pull(task_ids="list_reports", include_prior_dates=False) self.assertIsInstance(available_reports, list) expected_reports_info = [ {"type": "country", "attachment_id": "2", "id": "18af0b40b64fe408"}, @@ -467,32 +507,28 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): self.assertListEqual(expected_reports_info, available_reports) # Test download_reports task - ti = env.run_task(telescope.download_reports.__name__) + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) # use release info for other tasks - available_releases = ti.xcom_pull( - key=JstorTelescope.RELEASE_INFO, - task_ids=telescope.download_reports.__name__, - include_prior_dates=False, - ) - self.assertIsInstance(available_releases, dict) - self.assertEqual(1, len(available_releases)) - for release_date, reports in available_releases.items(): - self.assertEqual(self.release_date.date(), pendulum.parse(release_date).date()) - self.assertIsInstance(reports, list) - self.assertListEqual(expected_reports_info, reports) - - release = JstorRelease( - dag_id=telescope.dag_id, - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(release_date).start_of("month"), - data_interval_end=pendulum.parse(release_date).add(days=1).start_of("month"), - partition_date=pendulum.parse(release_date), - reports=reports, - ) + release_dicts = ti.xcom_pull(task_ids="download", include_prior_dates=False) + expected_releases = [ + { + "dag_id": "jstor_test_telescope", + "run_id": "scheduled__2023-10-04T00:00:00+00:00", + "data_interval_start": "2023-09-01", + "data_interval_end": "2023-10-01", + "partition_date": "2023-09-30", + "reports": [ + {"type": "country", 
"attachment_id": "2", "id": "18af0b40b64fe408"}, + {"type": "institution", "attachment_id": "3", "id": "18af0b40b64fe408"}, + ], + } + ] + self.assertEqual(release_dicts, expected_releases) + release = JstorRelease.from_dict(release_dicts[0]) - # Test that files download + # Check that the files were "downloaded" self.assertTrue(os.path.exists(release.download_country_path)) self.assertTrue(os.path.exists(release.download_institution_path)) self.assert_file_integrity(release.download_country_path, self.country_report["download_hash"], "md5") @@ -501,8 +537,6 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): ) # Test that file uploaded - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) self.assert_blob_integrity( env.download_bucket, gcs_blob_name_from_path(release.download_country_path), @@ -515,7 +549,7 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): ) # Test that file transformed - ti = env.run_task(telescope.transform.__name__) + ti = env.run_task("process_release.transform", map_index=0) self.assertEqual(ti.state, State.SUCCESS) self.assertTrue(os.path.exists(release.transform_country_path)) self.assertTrue(os.path.exists(release.transform_institution_path)) @@ -527,8 +561,6 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): ) # Test that transformed file uploaded - ti = env.run_task(telescope.upload_transformed.__name__) - self.assertEqual(ti.state, State.SUCCESS) self.assert_blob_integrity( env.transform_bucket, gcs_blob_name_from_path(release.transform_country_path), @@ -541,17 +573,17 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): ) # Test that data loaded into BigQuery - ti = env.run_task(telescope.bq_load.__name__) + ti = env.run_task("process_release.bq_load", map_index=0) self.assertEqual(ti.state, State.SUCCESS) country_table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.country_partner.bq_dataset_id, - telescope.country_partner.bq_table_name, + env.cloud_workspace.project_id, + country_partner.bq_dataset_id, + country_partner.bq_table_name, ) institution_table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.institution_partner.bq_dataset_id, - telescope.institution_partner.bq_table_name, + env.cloud_workspace.project_id, + institution_partner.bq_dataset_id, + institution_partner.bq_table_name, ) self.assert_table_integrity(country_table_id, self.country_report["table_rows"]) self.assert_table_integrity(institution_table_id, self.institution_report["table_rows"]) @@ -560,18 +592,43 @@ def test_telescope_collection(self, mock_account_credentials, mock_build): expected = load_and_parse_json(self.institution_report["table"], date_fields=["release_date"]) self.assert_table_content(institution_table_id, expected, primary_key="ISBN") - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.jstor_telescope.jstor_telescope.pendulum.now") as mock_now: + 
mock_now.return_value = now + ti = env.run_task("process_release.add_new_dataset_releases", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2023-09-01T00:00:00+00:00", + "data_interval_end": "2023-10-01T00:00:00+00:00", + "snapshot_date": None, + "partition_date": "2023-09-30T00:00:00+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test that all telescope data deleted - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("process_release.cleanup_workflow", map_index=0) self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) def test_get_release_date(self): """Test that the get_release_date returns the correct release date and raises an exception when dates are @@ -631,7 +688,7 @@ def test_get_label_id(self, mock_account_credentials, mock_build): } create_label = { "id": "created_label", - "name": JstorTelescope.PROCESSED_LABEL_NAME, + "name": JSTOR_PROCESSED_LABEL_NAME, "messageListVisibility": "show", "labelListVisibility": "labelShow", } @@ -646,7 +703,7 @@ def test_get_label_id(self, mock_account_credentials, mock_build): }, { "id": "existing_label", - "name": JstorTelescope.PROCESSED_LABEL_NAME, + "name": JSTOR_PROCESSED_LABEL_NAME, "messageListVisibility": "show", "labelListVisibility": "labelShow", "type": "user", @@ -664,11 +721,11 @@ def test_get_label_id(self, mock_account_credentials, mock_build): api = JstorPublishersAPI(service, self.entity_id) # call function without match for label, so label is created - label_id = api.get_label_id(service, JstorTelescope.PROCESSED_LABEL_NAME) + label_id = api.get_label_id(service, JSTOR_PROCESSED_LABEL_NAME) self.assertEqual("created_label", label_id) # call function with match for label - label_id = api.get_label_id(service, JstorTelescope.PROCESSED_LABEL_NAME) + label_id = api.get_label_id(service, JSTOR_PROCESSED_LABEL_NAME) self.assertEqual("existing_label", label_id) @@ -693,7 +750,7 @@ def publisher_http_mock_sequence( }, { "id": "Label_1", - "name": JstorTelescope.PROCESSED_LABEL_NAME, + "name": JSTOR_PROCESSED_LABEL_NAME, "messageListVisibility": "show", "labelListVisibility": "labelShow", "type": "user", @@ -837,7 +894,7 @@ def collection_http_mock_sequence(country_json: str, institution_json: str) -> l "parts": [ {"partId": "0", "filename": "", "body": {"attachmentId": "0"}}, {"partId": "1", "filename": "BTAA_Overall_Open_Usage.xlsx", "body": {"attachmentId": "1"}}, - {"partId": "2", "filename": "BTAA_Country_Open_Usage.csv", "body": {"attachmentId": "2"}}, + {"partId": "2", "filename": "BTAA_Open_Country_Usage.csv", "body": {"attachmentId": "2"}}, {"partId": "3", "filename": "BTAA_Open_Institution_Usage.csv", "body": {"attachmentId": "3"}}, ], }, @@ -880,7 +937,7 @@ def collection_http_mock_sequence(country_json: str, institution_json: str) -> l }, { "id": "Label_1", - "name": 
JstorTelescope.PROCESSED_LABEL_NAME, + "name": JSTOR_PROCESSED_LABEL_NAME, "messageListVisibility": "show", "labelListVisibility": "labelShow", "type": "user", diff --git a/oaebu_workflows/tests/fixtures/onix_utils/__init__.py b/tests/oapen_metadata_telescope/__init__.py similarity index 100% rename from oaebu_workflows/tests/fixtures/onix_utils/__init__.py rename to tests/oapen_metadata_telescope/__init__.py diff --git a/oaebu_workflows/thoth_telescope/__init__.py b/tests/oapen_metadata_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/thoth_telescope/__init__.py rename to tests/oapen_metadata_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_bad_response.yaml b/tests/oapen_metadata_telescope/fixtures/cassette_bad_response.yaml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_bad_response.yaml rename to tests/oapen_metadata_telescope/fixtures/cassette_bad_response.yaml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_empty.yaml b/tests/oapen_metadata_telescope/fixtures/cassette_empty.yaml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_empty.yaml rename to tests/oapen_metadata_telescope/fixtures/cassette_empty.yaml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_header_only.yaml b/tests/oapen_metadata_telescope/fixtures/cassette_header_only.yaml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_header_only.yaml rename to tests/oapen_metadata_telescope/fixtures/cassette_header_only.yaml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_invalid.yaml b/tests/oapen_metadata_telescope/fixtures/cassette_invalid.yaml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_invalid.yaml rename to tests/oapen_metadata_telescope/fixtures/cassette_invalid.yaml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_valid.yaml b/tests/oapen_metadata_telescope/fixtures/cassette_valid.yaml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/cassette_valid.yaml rename to tests/oapen_metadata_telescope/fixtures/cassette_valid.yaml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/empty_download.xml b/tests/oapen_metadata_telescope/fixtures/empty_download.xml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/empty_download.xml rename to tests/oapen_metadata_telescope/fixtures/empty_download.xml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/invalid_products.xml b/tests/oapen_metadata_telescope/fixtures/invalid_products.xml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/invalid_products.xml rename to tests/oapen_metadata_telescope/fixtures/invalid_products.xml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/invalid_products_removed.xml b/tests/oapen_metadata_telescope/fixtures/invalid_products_removed.xml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/invalid_products_removed.xml rename to tests/oapen_metadata_telescope/fixtures/invalid_products_removed.xml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/metadata_download_valid.xml 
b/tests/oapen_metadata_telescope/fixtures/metadata_download_valid.xml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/metadata_download_valid.xml rename to tests/oapen_metadata_telescope/fixtures/metadata_download_valid.xml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/parsed_valid.xml b/tests/oapen_metadata_telescope/fixtures/parsed_valid.xml similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/parsed_valid.xml rename to tests/oapen_metadata_telescope/fixtures/parsed_valid.xml diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/fixtures/test_table.json b/tests/oapen_metadata_telescope/fixtures/test_table.json similarity index 100% rename from oaebu_workflows/oapen_metadata_telescope/tests/fixtures/test_table.json rename to tests/oapen_metadata_telescope/fixtures/test_table.json diff --git a/oaebu_workflows/oapen_metadata_telescope/tests/test_oapen_metadata_telescope.py b/tests/oapen_metadata_telescope/test_oapen_metadata_telescope.py similarity index 65% rename from oaebu_workflows/oapen_metadata_telescope/tests/test_oapen_metadata_telescope.py rename to tests/oapen_metadata_telescope/test_oapen_metadata_telescope.py index f5eedfb9..bb154d2c 100644 --- a/oaebu_workflows/oapen_metadata_telescope/tests/test_oapen_metadata_telescope.py +++ b/tests/oapen_metadata_telescope/test_oapen_metadata_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ import os import unittest -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch from tempfile import NamedTemporaryFile from xml.parsers.expat import ExpatError @@ -26,25 +26,22 @@ from airflow.utils.state import State from tenacity import stop_after_attempt -from oaebu_workflows.config import test_fixtures_folder +from oaebu_workflows.config import test_fixtures_folder, module_file_path from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.oapen_metadata_telescope.oapen_metadata_telescope import ( - OapenMetadataTelescope, + OapenMetadataRelease, download_metadata, + create_dag, ) -from observatory.platform.api import get_dataset_releases -from observatory.platform.observatory_config import Workflow -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.bigquery import bq_sharded_table_id -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.google.bigquery import bq_sharded_table_id +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, load_and_parse_json +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment -class TestOapenMetadataTelescope(ObservatoryTestCase): +class TestOapenMetadataTelescope(SandboxTestCase): """Tests for the Oapen Metadata Telescope DAG""" def __init__(self, *args, **kwargs): @@ -64,96 +61,98 @@ def __init__(self, *args, **kwargs): def test_dag_structure(self): """Test that the Oapen Metadata DAG has the correct structure""" - dag = OapenMetadataTelescope( + dag = create_dag( 
dag_id="oapen_metadata", cloud_workspace=self.fake_cloud_workspace, metadata_uri="", - ).make_dag() + ) self.assert_dag_structure( { - "check_dependencies": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], + "check_dependencies": ["make_release"], + "make_release": ["download", "transform", "bq_load", "add_new_dataset_releases", "cleanup_workflow"], + "download": ["transform"], + "transform": ["bq_load"], "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], }, dag, ) def test_dag_load(self): """Test that the OapenMetadata DAG can be loaded from a DAG bag""" - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="oapen_metadata", name="OAPEN Metadata Telescope", - class_name="oaebu_workflows.oapen_metadata_telescope.oapen_metadata_telescope.OapenMetadataTelescope", + class_name="oaebu_workflows.oapen_metadata_telescope.oapen_metadata_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(metadata_uri=""), ) ], ) with env.create(): - self.assert_dag_load_from_config("oapen_metadata") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("oapen_metadata", dag_file) def test_telescope(self): """Test telescope task execution.""" - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.project_id, self.data_location) dataset_id = env.add_dataset() + api_dataset_id = env.add_dataset() with env.create(): - partner = partner_from_str("oapen_metadata", metadata_partner=True) - partner.bq_dataset_id = dataset_id - telescope = OapenMetadataTelescope( - dag_id="oapen_metadata", + metadata_partner = partner_from_str("oapen_metadata", metadata_partner=True) + metadata_partner.bq_dataset_id = dataset_id + dag_id = "oapen_metadata" + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, metadata_uri=self.metadata_uri, - metadata_partner=partner, + metadata_partner=metadata_partner, elevate_related_products=True, - bq_dataset_id=dataset_id, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() # first run with env.create_dag_run(dag, pendulum.datetime(year=2021, month=2, day=1)): # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") self.assertEqual(ti.state, State.SUCCESS) + # Make release task + ti = env.run_task("make_release") + self.assertEqual(ti.state, State.SUCCESS) + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + expected_release_dict = { + "dag_id": "oapen_metadata", + "run_id": "scheduled__2021-02-01T00:00:00+00:00", + "snapshot_date": "2021-02-07", + } + self.assertEqual(release_dict, expected_release_dict) + release = OapenMetadataRelease.from_dict(release_dict) + # Download task - with vcr.VCR().use_cassette(self.valid_download_cassette, record_mode="None"): - ti = env.run_task(telescope.download.__name__) + with vcr.VCR().use_cassette( + self.valid_download_cassette, + record_mode="None", + ignore_hosts=["oauth2.googleapis.com", "storage.googleapis.com"], + ): + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) - # Upload download task - ti = 
env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) - # Transform task - ti = env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # Upload transform task - ti = env.run_task(telescope.upload_transformed.__name__) + ti = env.run_task("transform") self.assertEqual(ti.state, State.SUCCESS) # Bigquery load task - ti = env.run_task(telescope.bq_load.__name__) + ti = env.run_task("bq_load") self.assertEqual(ti.state, State.SUCCESS) ### Make Assertions ### - # Create the release - release = telescope.make_release( - run_id=env.dag_run.run_id, data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)) - ) - # Test download task self.assertTrue(os.path.exists(release.download_path)) self.assert_file_integrity(release.download_path, "c246a8f7487de756f4dd47cd0ab94363", "md5") @@ -180,26 +179,52 @@ def test_telescope(self): # Test that table is loaded to BQ table_id = bq_sharded_table_id( - telescope.cloud_workspace.project_id, - telescope.metadata_partner.bq_dataset_id, - telescope.metadata_partner.bq_table_name, + env.cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, release.snapshot_date, ) self.assert_table_integrity(table_id, expected_rows=5) self.assert_table_content(table_id, load_and_parse_json(self.test_table), primary_key="ISBN13") - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) - self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch( + "oaebu_workflows.oapen_metadata_telescope.oapen_metadata_telescope.pendulum.now" + ) as mock_now: + mock_now.return_value = now + ti = env.run_task("add_new_dataset_releases") + self.assertEqual(ti.state, State.SUCCESS) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2021-02-01T00:00:00+00:00", + "data_interval_end": "2021-02-07T12:00:00+00:00", + "snapshot_date": "2021-02-07T00:00:00+00:00", + "partition_date": None, + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test that all data deleted - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("cleanup_workflow") self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) class TestDownloadMetadata(unittest.TestCase): diff --git a/oaebu_workflows/thoth_telescope/schema/__init__.py b/tests/onix_telescope/__init__.py similarity index 100% rename from oaebu_workflows/thoth_telescope/schema/__init__.py rename to 
tests/onix_telescope/__init__.py diff --git a/oaebu_workflows/onix_telescope/tests/fixtures/20210330_CURTINPRESS_ONIX.json b/tests/onix_telescope/fixtures/20210330_CURTINPRESS_ONIX.json similarity index 100% rename from oaebu_workflows/onix_telescope/tests/fixtures/20210330_CURTINPRESS_ONIX.json rename to tests/onix_telescope/fixtures/20210330_CURTINPRESS_ONIX.json diff --git a/oaebu_workflows/onix_telescope/tests/fixtures/20210330_CURTINPRESS_ONIX.xml b/tests/onix_telescope/fixtures/20210330_CURTINPRESS_ONIX.xml similarity index 100% rename from oaebu_workflows/onix_telescope/tests/fixtures/20210330_CURTINPRESS_ONIX.xml rename to tests/onix_telescope/fixtures/20210330_CURTINPRESS_ONIX.xml diff --git a/oaebu_workflows/thoth_telescope/sql/__init__.py b/tests/onix_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/thoth_telescope/sql/__init__.py rename to tests/onix_telescope/fixtures/__init__.py diff --git a/tests/onix_telescope/test_onix_telescope.py b/tests/onix_telescope/test_onix_telescope.py new file mode 100644 index 00000000..04c80973 --- /dev/null +++ b/tests/onix_telescope/test_onix_telescope.py @@ -0,0 +1,250 @@ +# Copyright 2021-2024 Curtin University +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Author: James Diprose + +import os +import shutil +from unittest.mock import patch + +import pendulum +from airflow.models import Connection +from airflow.utils.state import State +from google.cloud.bigquery import Client + +from oaebu_workflows.onix_telescope.onix_telescope import OnixRelease, create_dag +from oaebu_workflows.oaebu_partners import partner_from_str +from oaebu_workflows.config import test_fixtures_folder, module_file_path +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.google.bigquery import bq_sharded_table_id +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.sftp import SftpFolders +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, find_free_port, load_and_parse_json +from observatory_platform.sandbox.sftp_server import SftpServer +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment + + +class TestOnixTelescope(SandboxTestCase): + """Tests for the ONIX telescope""" + + def __init__(self, *args, **kwargs): + """Constructor which sets up variables used by tests. + + :param args: arguments. + :param kwargs: keyword arguments. 
+ """ + + super(TestOnixTelescope, self).__init__(*args, **kwargs) + self.project_id = os.getenv("TEST_GCP_PROJECT_ID") + self.data_location = os.getenv("TEST_GCP_DATA_LOCATION") + self.sftp_root = "/" + self.date_regex = "\\d{8}" + self.sftp_port = find_free_port() + + # Test file + fixtures_folder = test_fixtures_folder(workflow_module="onix_telescope") + self.onix_xml_path = os.path.join(fixtures_folder, "20210330_CURTINPRESS_ONIX.xml") + self.onix_json_path = os.path.join(fixtures_folder, "20210330_CURTINPRESS_ONIX.json") + + def test_dag_structure(self): + """Test that the ONIX DAG has the correct structure.""" + dag = create_dag( + dag_id="onix_telescope", + cloud_workspace=self.fake_cloud_workspace, + sftp_root=self.sftp_root, + date_regex=self.date_regex, + ) + self.assert_dag_structure( + { + "check_dependencies": ["fetch_releases"], + "fetch_releases": [ + "process_release.move_files_to_in_progress", + "process_release.download", + "process_release.transform", + "process_release.bq_load", + "process_release.move_files_to_finished", + "process_release.add_new_dataset_releases", + "process_release.cleanup_workflow", + ], + "process_release.move_files_to_in_progress": ["process_release.download"], + "process_release.download": ["process_release.transform"], + "process_release.transform": ["process_release.bq_load"], + "process_release.bq_load": ["process_release.move_files_to_finished"], + "process_release.move_files_to_finished": ["process_release.add_new_dataset_releases"], + "process_release.add_new_dataset_releases": ["process_release.cleanup_workflow"], + "process_release.cleanup_workflow": [], + }, + dag, + ) + + def test_dag_load(self): + """Test that the Geonames DAG can be loaded from a DAG bag.""" + env = SandboxEnvironment( + workflows=[ + Workflow( + dag_id="onix", + name="ONIX Telescope", + class_name="oaebu_workflows.onix_telescope.onix_telescope.create_dag", + cloud_workspace=self.fake_cloud_workspace, + kwargs=dict(date_regex=self.date_regex), + ) + ], + ) + with env.create(): + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("onix", dag_file) + + # Errors should be raised if kwargs dict not supplied + env.workflows[0].kwargs = {} + with env.create(): + with self.assertRaises(AssertionError) as cm: + self.assert_dag_load_from_config("onix", dag_file) + msg = cm.exception.args[0] + self.assertTrue("missing 1 required keyword-only argument" in msg) + self.assertTrue("date_regex" in msg) + + def test_telescope(self): + """Test the ONIX telescope end to end.""" + # Setup Observatory environmento + env = SandboxEnvironment(self.project_id, self.data_location) + sftp_server = SftpServer(host="localhost", port=self.sftp_port) + + with env.create(), sftp_server.create() as sftp_root: + # Setup DAG + execution_date = pendulum.datetime(year=2021, month=3, day=31) + metadata_partner = partner_from_str("onix", metadata_partner=True) + metadata_partner.bq_dataset_id = env.add_dataset() + api_dataset_id = env.add_dataset() + sftp_service_conn_id = "sftp_service" + dag_id = "onix_telescope_test" + dag = create_dag( + dag_id=dag_id, + cloud_workspace=env.cloud_workspace, + sftp_root="/", + date_regex=self.date_regex, + metadata_partner=metadata_partner, + sftp_service_conn_id=sftp_service_conn_id, + api_dataset_id=api_dataset_id, + ) + + # Add SFTP connection + conn = Connection(conn_id=sftp_service_conn_id, uri=f"ssh://:password@localhost:{self.sftp_port}") + env.add_connection(conn) + with env.create_dag_run(dag, 
execution_date): + # Test that all dependencies are specified: no error should be thrown + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + + # Add ONIX file to SFTP server + local_sftp_folders = SftpFolders(dag_id, sftp_service_conn_id, sftp_root) + os.makedirs(local_sftp_folders.upload, exist_ok=True) + onix_file_name = os.path.basename(self.onix_xml_path) + onix_file_dst = os.path.join(local_sftp_folders.upload, onix_file_name) + shutil.copy(self.onix_xml_path, onix_file_dst) + + # Get release info from SFTP server and check that the correct release info is returned via Xcom + ti = env.run_task("fetch_releases") + self.assertEqual(ti.state, State.SUCCESS) + release_dicts = ti.xcom_pull(task_ids="fetch_releases", include_prior_dates=False) + expected_release_dicts = [ + { + "dag_id": "onix_telescope_test", + "run_id": "scheduled__2021-03-31T00:00:00+00:00", + "snapshot_date": "2021-03-30", + "onix_file_name": "20210330_CURTINPRESS_ONIX.xml", + } + ] + self.assertEqual(release_dicts, expected_release_dicts) + release = OnixRelease.from_dict(release_dicts[0]) + + # Test move file to in progress + ti = env.run_task("process_release.move_files_to_in_progress", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + in_progress_path = os.path.join(local_sftp_folders.in_progress, release.onix_file_name) + self.assertFalse(os.path.isfile(onix_file_dst)) + self.assertTrue(os.path.isfile(in_progress_path)) + + # Test download + ti = env.run_task("process_release.download", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + self.assert_file_integrity(release.download_path, "28f85c488ab01b0cff769d9da6b4be24", "md5") + self.assert_blob_integrity( + env.download_bucket, gcs_blob_name_from_path(release.download_path), release.download_path + ) + + # Test transform + ti = env.run_task("process_release.transform", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + self.assert_file_integrity(release.transform_path, "2164a300", "gzip_crc") + self.assert_blob_integrity( + env.transform_bucket, gcs_blob_name_from_path(release.transform_path), release.transform_path + ) + + # Test load into BigQuery + ti = env.run_task("process_release.bq_load", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + table_id = bq_sharded_table_id( + env.cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, + release.snapshot_date, + ) + self.assert_table_integrity(table_id, expected_rows=1) + self.assert_table_content(table_id, load_and_parse_json(self.onix_json_path), primary_key="ISBN13") + + # Test move files to finished + ti = env.run_task("process_release.move_files_to_finished", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + finished_path = os.path.join(local_sftp_folders.finished, onix_file_name) + self.assertFalse(os.path.isfile(local_sftp_folders.in_progress)) + self.assertTrue(os.path.isfile(finished_path)) + + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) + self.assertEqual(len(dataset_releases), 0) + + # Set up the API + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.onix_telescope.onix_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("process_release.add_new_dataset_releases", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + dataset_releases 
= api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) + self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2021-03-31T00:00:00+00:00", + "data_interval_end": "2021-04-04T12:00:00+00:00", + "snapshot_date": "2021-03-30T00:00:00+00:00", + "partition_date": None, + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) + + # Test cleanup + workflow_folder_path = release.workflow_folder + ti = env.run_task("process_release.cleanup_workflow", map_index=0) + self.assertEqual(ti.state, State.SUCCESS) + self.assert_cleanup(workflow_folder_path) diff --git a/oaebu_workflows/thoth_telescope/tests/__init__.py b/tests/onix_workflow/__init__.py similarity index 100% rename from oaebu_workflows/thoth_telescope/tests/__init__.py rename to tests/onix_workflow/__init__.py diff --git a/oaebu_workflows/thoth_telescope/tests/fixtures/__init__.py b/tests/onix_workflow/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/thoth_telescope/tests/fixtures/__init__.py rename to tests/onix_workflow/fixtures/__init__.py diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/crossref_download_function_test.yaml b/tests/onix_workflow/fixtures/crossref_download_function_test.yaml similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/crossref_download_function_test.yaml rename to tests/onix_workflow/fixtures/crossref_download_function_test.yaml diff --git a/tests/onix_workflow/fixtures/crossref_events_request.yaml b/tests/onix_workflow/fixtures/crossref_events_request.yaml new file mode 100644 index 00000000..c841a920 --- /dev/null +++ b/tests/onix_workflow/fixtures/crossref_events_request.yaml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f98ce1d52adc15cba007cb0dd900766bfdf96bf7c3c6c0d00ddd4db5c24b953 +size 10742 diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/doi_isbn_query_test.jsonl b/tests/onix_workflow/fixtures/doi_isbn_query_test.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/doi_isbn_query_test.jsonl rename to tests/onix_workflow/fixtures/doi_isbn_query_test.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/bic_lookup.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/bic_lookup.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/bic_lookup.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/bic_lookup.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/bisac_lookup.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/bisac_lookup.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/bisac_lookup.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/bisac_lookup.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/country.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/country.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/country.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/country.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/crossref_metadata_master.jsonl 
b/tests/onix_workflow/fixtures/e2e_inputs/crossref_metadata_master.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/crossref_metadata_master.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/crossref_metadata_master.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_analytics3.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/google_analytics3.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_analytics3.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/google_analytics3.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_books_sales.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/google_books_sales.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_books_sales.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/google_books_sales.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_books_traffic.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/google_books_traffic.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/google_books_traffic.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/google_books_traffic.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/internet_archive.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/internet_archive.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/internet_archive.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/internet_archive.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/irus_fulcrum.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/irus_fulcrum.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/irus_fulcrum.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/irus_fulcrum.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/irus_oapen.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/irus_oapen.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/irus_oapen.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/irus_oapen.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/jstor_country.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/jstor_country.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/jstor_country.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/jstor_country.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/jstor_institution.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/jstor_institution.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/jstor_institution.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/jstor_institution.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/onix.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/onix.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/onix.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/thema_lookup.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/thema_lookup.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/thema_lookup.jsonl 
rename to tests/onix_workflow/fixtures/e2e_inputs/thema_lookup.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/ucl_discovery.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/ucl_discovery.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/ucl_discovery.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/ucl_discovery.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/worldreader.jsonl b/tests/onix_workflow/fixtures/e2e_inputs/worldreader.jsonl similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_inputs/worldreader.jsonl rename to tests/onix_workflow/fixtures/e2e_inputs/worldreader.jsonl diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book.json b/tests/onix_workflow/fixtures/e2e_outputs/book.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book.json rename to tests/onix_workflow/fixtures/e2e_outputs/book.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_list.json b/tests/onix_workflow/fixtures/e2e_outputs/book_list.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_list.json rename to tests/onix_workflow/fixtures/e2e_outputs/book_list.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_list_dry.json b/tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_list_dry.json rename to tests/onix_workflow/fixtures/e2e_outputs/book_list_dry.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_product.json b/tests/onix_workflow/fixtures/e2e_outputs/book_product.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_product.json rename to tests/onix_workflow/fixtures/e2e_outputs/book_product.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_product_dry.json b/tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/book_product_dry.json rename to tests/onix_workflow/fixtures/e2e_outputs/book_product_dry.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/crossref_events.json b/tests/onix_workflow/fixtures/e2e_outputs/crossref_events.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/crossref_events.json rename to tests/onix_workflow/fixtures/e2e_outputs/crossref_events.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/crossref_metadata.json b/tests/onix_workflow/fixtures/e2e_outputs/crossref_metadata.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/crossref_metadata.json rename to tests/onix_workflow/fixtures/e2e_outputs/crossref_metadata.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workfamilyid_isbn.json b/tests/onix_workflow/fixtures/e2e_outputs/onix_workfamilyid_isbn.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workfamilyid_isbn.json rename to tests/onix_workflow/fixtures/e2e_outputs/onix_workfamilyid_isbn.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workid_isbn.json b/tests/onix_workflow/fixtures/e2e_outputs/onix_workid_isbn.json similarity 
index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workid_isbn.json rename to tests/onix_workflow/fixtures/e2e_outputs/onix_workid_isbn.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workid_isbn_errors.json b/tests/onix_workflow/fixtures/e2e_outputs/onix_workid_isbn_errors.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/e2e_outputs/onix_workid_isbn_errors.json rename to tests/onix_workflow/fixtures/e2e_outputs/onix_workid_isbn_errors.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/bic_lookup.json b/tests/onix_workflow/fixtures/schema/bic_lookup.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/bic_lookup.json rename to tests/onix_workflow/fixtures/schema/bic_lookup.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/bisac_lookup.json b/tests/onix_workflow/fixtures/schema/bisac_lookup.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/bisac_lookup.json rename to tests/onix_workflow/fixtures/schema/bisac_lookup.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/country.json b/tests/onix_workflow/fixtures/schema/country.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/country.json rename to tests/onix_workflow/fixtures/schema/country.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/country_2019-01-01.json b/tests/onix_workflow/fixtures/schema/country_2019-01-01.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/country_2019-01-01.json rename to tests/onix_workflow/fixtures/schema/country_2019-01-01.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/crossref_metadata_master.json b/tests/onix_workflow/fixtures/schema/crossref_metadata_master.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/crossref_metadata_master.json rename to tests/onix_workflow/fixtures/schema/crossref_metadata_master.json diff --git a/oaebu_workflows/onix_workflow/tests/fixtures/schema/thema_lookup.json b/tests/onix_workflow/fixtures/schema/thema_lookup.json similarity index 100% rename from oaebu_workflows/onix_workflow/tests/fixtures/schema/thema_lookup.json rename to tests/onix_workflow/fixtures/schema/thema_lookup.json diff --git a/oaebu_workflows/onix_workflow/tests/test_onix_work_aggregation.py b/tests/onix_workflow/test_onix_work_aggregation.py similarity index 100% rename from oaebu_workflows/onix_workflow/tests/test_onix_work_aggregation.py rename to tests/onix_workflow/test_onix_work_aggregation.py diff --git a/oaebu_workflows/onix_workflow/tests/test_onix_workflow.py b/tests/onix_workflow/test_onix_workflow.py similarity index 78% rename from oaebu_workflows/onix_workflow/tests/test_onix_workflow.py rename to tests/onix_workflow/test_onix_workflow.py index 829d1051..1d4ce691 100644 --- a/oaebu_workflows/onix_workflow/tests/test_onix_workflow.py +++ b/tests/onix_workflow/test_onix_workflow.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
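The hunks that follow migrate test_onix_workflow.py from the class-based OnixWorkflow and ObservatoryEnvironment pattern to the create_dag factory, SandboxEnvironment, and string task ids, with each release pushed between tasks as a plain dict over XCom and rebuilt with from_dict. Below is a minimal sketch of that XCom round-trip; it assumes only pendulum, and the class and field names are illustrative stand-ins, not the project's actual implementation.

    # Sketch of the release round-trip used by the migrated tests: make_release
    # returns a JSON-serialisable dict (so Airflow can store it in XCom) and
    # downstream tasks rebuild a release object with from_dict. Names here are
    # hypothetical; they only mirror the shape seen in the diff.
    from dataclasses import dataclass

    import pendulum


    @dataclass
    class SketchRelease:
        """Illustrative stand-in for release classes such as OnixWorkflowRelease."""

        dag_id: str
        run_id: str
        snapshot_date: pendulum.DateTime

        def to_dict(self) -> dict:
            # Only JSON-serialisable primitives should cross the XCom boundary.
            return {
                "dag_id": self.dag_id,
                "run_id": self.run_id,
                "snapshot_date": self.snapshot_date.to_date_string(),
            }

        @staticmethod
        def from_dict(d: dict) -> "SketchRelease":
            # Downstream tasks rebuild the release object from the XCom dict.
            return SketchRelease(
                dag_id=d["dag_id"],
                run_id=d["run_id"],
                snapshot_date=pendulum.parse(d["snapshot_date"]),
            )


    if __name__ == "__main__":
        release = SketchRelease(
            dag_id="onix_workflow_test",
            run_id="scheduled__2021-05-17T00:00:00+00:00",
            snapshot_date=pendulum.datetime(2021, 5, 24),
        )
        # Round-tripping through the dict form loses no information.
        assert SketchRelease.from_dict(release.to_dict()) == release

Keeping the XCom payload to plain primitives is what lets the tests below pull the make_release result with xcom_pull and compare it directly against a literal expected dict.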
@@ -16,7 +16,7 @@ import os from datetime import timedelta -from unittest.mock import MagicMock, patch +from unittest.mock import patch import vcr import shutil from typing import List @@ -29,9 +29,9 @@ from oaebu_workflows.config import test_fixtures_folder from oaebu_workflows.oaebu_partners import OaebuPartner, OAEBU_DATA_PARTNERS, partner_from_str from oaebu_workflows.onix_workflow.onix_workflow import ( - OnixWorkflow, OnixWorkflowRelease, CROSSREF_EVENT_URL_TEMPLATE, + create_dag, download_crossref_events, transform_crossref_events, transform_event, @@ -41,25 +41,25 @@ get_onix_records, insert_into_schema, ) -from observatory.platform.api import get_dataset_releases -from observatory.platform.observatory_config import Workflow -from observatory.platform.files import load_jsonl -from observatory.platform.bigquery import bq_find_schema, bq_run_query, bq_sharded_table_id, bq_table_id -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.config import module_file_path -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.files import load_jsonl +from observatory_platform.google.bigquery import bq_find_schema, bq_run_query, bq_sharded_table_id, bq_table_id +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.config import module_file_path + +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment +from observatory_platform.sandbox.test_utils import ( + SandboxTestCase, Table, - find_free_port, - random_id, - make_dummy_dag, bq_load_tables, load_and_parse_json, + random_id, + make_dummy_dag, ) -class TestOnixWorkflow(ObservatoryTestCase): +class TestOnixWorkflow(SandboxTestCase): """Functionally test the workflow""" onix_data = [ @@ -143,27 +143,24 @@ def __init__(self, *args, **kwargs): @patch("oaebu_workflows.onix_workflow.onix_workflow.bq_select_table_shard_dates") def test_make_release(self, mock_sel_table_suffixes): """Tests that the make_release function works as intended""" + # Use a different onix snapshot date for testing purposes onix_snapshot_date = self.snapshot_date.add(days=1) crossref_snapshot_date = self.snapshot_date mock_sel_table_suffixes.side_effect = [[onix_snapshot_date], [crossref_snapshot_date]] - env = ObservatoryEnvironment( - self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.gcp_project_id, self.data_location) with env.create(): - wf = OnixWorkflow( + dag = create_dag( dag_id="test_make_release", cloud_workspace=self.fake_cloud_workspace, data_partners=[self.fake_onix_data_partner], metadata_partner="onix", ) - dag = wf.make_dag() with env.create_dag_run(dag, self.snapshot_date.add(days=1)): - release = wf.make_release( - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - run_id=env.dag_run.run_id, - ) - self.assertEqual(release.dag_id, wf.dag_id) + ti = env.run_task("make_release") + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + release = OnixWorkflowRelease.from_dict(release_dict) + self.assertEqual(release.dag_id, dag.dag_id) # Test release file names are as expected self.assertEqual(release.workslookup_path, os.path.join(release.transform_folder, "worksid.jsonl.gz")) @@ -187,57 +184,26 @@ def test_make_release(self, 
mock_sel_table_suffixes): self.assertEqual(crossref_snapshot_date, release.crossref_master_snapshot_date) # Test for case - no ONIX releases found + dag.clear(task_ids=["make_release"]) mock_sel_table_suffixes.side_effect = [[]] with self.assertRaisesRegex(RuntimeError, "ONIX"): - release = wf.make_release( - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - run_id=env.dag_run.run_id, - ) + env.run_task("make_release") # Test for case - no Crossref releases found + dag.clear(task_ids=["make_release"]) mock_sel_table_suffixes.side_effect = [[onix_snapshot_date], []] # No crossref releases with self.assertRaisesRegex(RuntimeError, "Crossref"): - release = wf.make_release( - data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - run_id=env.dag_run.run_id, - ) - - def test_cleanup(self): - """Tests the cleanup function of the workflow""" - env = ObservatoryEnvironment( - self.gcp_project_id, - self.data_location, - api_host="localhost", - api_port=find_free_port(), - ) - with env.create(): - wf = OnixWorkflow( - dag_id="test_cleanup", - cloud_workspace=self.fake_cloud_workspace, - data_partners=[self.fake_onix_data_partner], - metadata_partner="onix", - ) - release = OnixWorkflowRelease( - dag_id=wf.dag_id, - run_id="test_run_id", - snapshot_date=self.snapshot_date, - onix_snapshot_date=self.snapshot_date, - crossref_master_snapshot_date=self.snapshot_date, - ) - self.assertTrue(os.path.exists(release.download_folder)) - self.assertTrue(os.path.exists(release.extract_folder)) - self.assertTrue(os.path.exists(release.transform_folder)) - wf.cleanup(release, execution_date=self.snapshot_date) - self.assert_cleanup(release.workflow_folder) + env.run_task("make_release") def test_dag_load(self): """Test that the DAG loads""" - env = ObservatoryEnvironment( + + env = SandboxEnvironment( workflows=[ Workflow( dag_id="onix_workflow_test_dag_load", name="Onix Workflow Test Dag Load", - class_name="oaebu_workflows.onix_workflow.onix_workflow.OnixWorkflow", + class_name="oaebu_workflows.onix_workflow.onix_workflow.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict( sensor_dag_ids=[ @@ -257,45 +223,60 @@ def test_dag_load(self): with env.create() as dag_folder: # This should raise one error for nonexistent partner - shutil.copy( - os.path.join(module_file_path("observatory.platform.dags"), "load_workflows.py"), "load_workflows.py" - ) + shutil.copy(os.path.join(module_file_path("dags"), "load_dags.py"), "load_dags.py") dag_bag = DagBag(dag_folder=dag_folder) self.assertNotEqual({}, dag_bag.import_errors) self.assertEqual(len(dag_bag.import_errors), 1) + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") # Remove the nonexistent partner and onix partner env.workflows[0].kwargs["data_partners"] = env.workflows[0].kwargs["data_partners"][:-1] with env.create(): # Should not raise any errors - self.assert_dag_load_from_config("onix_workflow_test_dag_load") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("onix_workflow_test_dag_load", dag_file) def test_dag_structure(self): """Tests that the dag structure is created as expected on dag load""" ## No data partners - dry run - env = ObservatoryEnvironment( - self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.gcp_project_id, self.data_location) with env.create(): - dag = OnixWorkflow( + dag = create_dag( dag_id=self.dag_id, cloud_workspace=self.fake_cloud_workspace, 
data_partners=[], metadata_partner="onix", sensor_dag_ids=[], - ).make_dag() + ) expected_dag_structure = { + "check_dependencies": [], + "make_release": [ + "export_tables.export_book_metrics_author", + "export_tables.export_book_list", + "export_tables.export_book_metrics_country", + "aggregate_works", + "export_tables.export_book_metrics", + "create_book_product_table", + "export_tables.export_book_metrics_events", + "add_new_dataset_releases", + "create_crossref_metadata_table", + "create_book_table", + "update_latest_export_tables", + "cleanup_workflow", + "export_tables.export_book_metrics_subjects", + "create_crossref_events_table", + ], "aggregate_works": ["create_crossref_metadata_table"], "create_crossref_metadata_table": ["create_crossref_events_table"], "create_crossref_events_table": ["create_book_table"], "create_book_table": [], "create_book_product_table": [ + "export_tables.export_book_metrics_author", "export_tables.export_book_list", - "export_tables.export_book_metrics_events", - "export_tables.export_book_metrics", "export_tables.export_book_metrics_country", - "export_tables.export_book_metrics_author", + "export_tables.export_book_metrics", + "export_tables.export_book_metrics_events", "export_tables.export_book_metrics_subjects", ], "export_tables.export_book_list": ["update_latest_export_tables"], @@ -305,44 +286,78 @@ def test_dag_structure(self): "export_tables.export_book_metrics_author": ["update_latest_export_tables"], "export_tables.export_book_metrics_subjects": ["update_latest_export_tables"], "update_latest_export_tables": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], } self.assert_dag_structure(expected_dag_structure, dag) ## All data partners - env = ObservatoryEnvironment( - self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.gcp_project_id, self.data_location) with env.create(): sensor_dag_ids = ["jstor", "irus_oapen", "google_books", "onix", "google_analytics3"] - dag = OnixWorkflow( + dag = create_dag( dag_id=self.dag_id, cloud_workspace=self.fake_cloud_workspace, data_partners=self.data_partner_list, metadata_partner="onix", sensor_dag_ids=sensor_dag_ids, - ).make_dag() + ) expected_dag_structure = { - "sensors.jstor_sensor": ["aggregate_works"], - "sensors.irus_oapen_sensor": ["aggregate_works"], - "sensors.google_books_sensor": ["aggregate_works"], - "sensors.onix_sensor": ["aggregate_works"], - "sensors.google_analytics3_sensor": ["aggregate_works"], + "check_dependencies": [ + "sensors.onix_sensor", + "sensors.google_analytics3_sensor", + "sensors.jstor_sensor", + "sensors.google_books_sensor", + "sensors.irus_oapen_sensor", + ], + "sensors.jstor_sensor": ["make_release"], + "sensors.irus_oapen_sensor": ["make_release"], + "sensors.google_books_sensor": ["make_release"], + "sensors.onix_sensor": ["make_release"], + "sensors.google_analytics3_sensor": ["make_release"], + "make_release": [ + "aggregate_works", + "intermediate_tables.intermediate_irus_fulcrum", + "export_tables.export_book_metrics_author", + "intermediate_tables.intermediate_google_books_sales", + "export_tables.export_book_institution_list", + "create_crossref_events_table", + "export_tables.export_book_metrics", + "add_new_dataset_releases", + "intermediate_tables.intermediate_jstor_country", + "intermediate_tables.intermediate_google_analytics3", + 
"intermediate_tables.intermediate_irus_oapen", + "intermediate_tables.intermediate_google_books_traffic", + "create_book_table", + "export_tables.export_book_metrics_events", + "export_tables.export_book_metrics_subjects", + "update_latest_export_tables", + "export_tables.export_book_list", + "cleanup_workflow", + "intermediate_tables.intermediate_worldreader", + "create_book_product_table", + "intermediate_tables.intermediate_jstor_institution", + "intermediate_tables.intermediate_ucl_discovery", + "export_tables.export_book_metrics_country", + "intermediate_tables.intermediate_internet_archive", + "export_tables.export_book_metrics_city", + "create_crossref_metadata_table", + "export_tables.export_book_metrics_institution", + ], "aggregate_works": ["create_crossref_metadata_table"], "create_crossref_metadata_table": ["create_crossref_events_table"], "create_crossref_events_table": ["create_book_table"], "create_book_table": [ - "intermediate_tables.intermediate_google_analytics3", - "intermediate_tables.intermediate_google_books_sales", - "intermediate_tables.intermediate_google_books_traffic", + "intermediate_tables.intermediate_worldreader", "intermediate_tables.intermediate_jstor_country", + "intermediate_tables.intermediate_google_analytics3", "intermediate_tables.intermediate_jstor_institution", "intermediate_tables.intermediate_irus_oapen", + "intermediate_tables.intermediate_google_books_sales", "intermediate_tables.intermediate_irus_fulcrum", "intermediate_tables.intermediate_ucl_discovery", + "intermediate_tables.intermediate_google_books_traffic", "intermediate_tables.intermediate_internet_archive", - "intermediate_tables.intermediate_worldreader", ], "intermediate_tables.intermediate_google_analytics3": ["create_book_product_table"], "intermediate_tables.intermediate_google_books_sales": ["create_book_product_table"], @@ -355,28 +370,28 @@ def test_dag_structure(self): "intermediate_tables.intermediate_internet_archive": ["create_book_product_table"], "intermediate_tables.intermediate_worldreader": ["create_book_product_table"], "create_book_product_table": [ - "export_tables.export_book_list", + "export_tables.export_book_metrics_author", "export_tables.export_book_institution_list", - "export_tables.export_book_metrics_institution", - "export_tables.export_book_metrics_city", - "export_tables.export_book_metrics_events", - "export_tables.export_book_metrics", "export_tables.export_book_metrics_country", - "export_tables.export_book_metrics_author", + "export_tables.export_book_metrics_events", + "export_tables.export_book_metrics_city", + "export_tables.export_book_metrics_institution", "export_tables.export_book_metrics_subjects", + "export_tables.export_book_list", + "export_tables.export_book_metrics", ], "export_tables.export_book_list": ["update_latest_export_tables"], + "export_tables.export_book_metrics_events": ["update_latest_export_tables"], "export_tables.export_book_institution_list": ["update_latest_export_tables"], "export_tables.export_book_metrics_institution": ["update_latest_export_tables"], "export_tables.export_book_metrics_city": ["update_latest_export_tables"], - "export_tables.export_book_metrics_events": ["update_latest_export_tables"], "export_tables.export_book_metrics": ["update_latest_export_tables"], "export_tables.export_book_metrics_country": ["update_latest_export_tables"], "export_tables.export_book_metrics_author": ["update_latest_export_tables"], "export_tables.export_book_metrics_subjects": ["update_latest_export_tables"], 
"update_latest_export_tables": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], } self.assert_dag_structure(expected_dag_structure, dag) @@ -398,27 +413,32 @@ def test_create_and_load_aggregate_works_table(self, mock_bq_query): {"isbn13": "111", "work_family_id": "111"}, {"isbn13": "211", "work_family_id": "111"}, ] - env = ObservatoryEnvironment( - self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.gcp_project_id, self.data_location) with env.create(): - wf = OnixWorkflow( + bq_onix_workflow_dataset = env.add_dataset() + bq_worksid_table_name = "onix_workid_isbn" + bq_worksid_error_table_name = "onix_workid_isbn_errors" + bq_workfamilyid_table_name = "onix_workfamilyid_isbn" + dag = create_dag( dag_id=self.dag_id, cloud_workspace=env.cloud_workspace, data_partners=[self.fake_onix_data_partner], - bq_onix_workflow_dataset=env.add_dataset(), + bq_onix_workflow_dataset=bq_onix_workflow_dataset, + bq_worksid_table_name=bq_worksid_table_name, + bq_worksid_error_table_name=bq_worksid_error_table_name, + bq_workfamilyid_table_name=bq_workfamilyid_table_name, metadata_partner="onix", ) - dag = wf.make_dag() with env.create_dag_run(dag, self.snapshot_date.add(days=1)): - release = OnixWorkflowRelease( - dag_id="aggregation_test", - run_id=env.dag_run.run_id, - snapshot_date=self.snapshot_date, - onix_snapshot_date=self.snapshot_date, - crossref_master_snapshot_date=self.snapshot_date, - ) - wf.aggregate_works(release, ti=MagicMock(task_id="")) # Test works aggregation + # Mock the table shard dates so the release can be made + with patch("oaebu_workflows.onix_workflow.onix_workflow.bq_select_table_shard_dates") as mock_date: + mock_date.return_value = [self.snapshot_date] + ti = env.run_task("make_release") + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + release = OnixWorkflowRelease.from_dict(release_dict) + + # Run aggregations + env.run_task("aggregate_works") ### Make Assertions ### self.assertTrue(os.path.exists(release.workslookup_path)) @@ -442,9 +462,9 @@ def test_create_and_load_aggregate_works_table(self, mock_bq_query): ) table_id = bq_sharded_table_id( - wf.cloud_workspace.project_id, - wf.bq_onix_workflow_dataset, - wf.bq_worksid_table_name, + env.cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_worksid_table_name, release.snapshot_date, ) self.assert_table_integrity(table_id, len(workslookup_expected)) @@ -452,9 +472,9 @@ def test_create_and_load_aggregate_works_table(self, mock_bq_query): self.assert_table_content(table_id, workslookup_expected, primary_key="isbn13") table_id = bq_sharded_table_id( - wf.cloud_workspace.project_id, - wf.bq_onix_workflow_dataset, - wf.bq_worksid_error_table_name, + env.cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_worksid_error_table_name, release.snapshot_date, ) self.assert_table_integrity(table_id, len(workslookup_errors_expected)) @@ -462,9 +482,9 @@ def test_create_and_load_aggregate_works_table(self, mock_bq_query): self.assert_table_content(table_id, workslookup_errors_expected, primary_key="Error") table_id = bq_sharded_table_id( - wf.cloud_workspace.project_id, - wf.bq_onix_workflow_dataset, - wf.bq_workfamilyid_table_name, + env.cloud_workspace.project_id, + bq_onix_workflow_dataset, + bq_workfamilyid_table_name, release.snapshot_date, ) self.assert_table_integrity(table_id, 
len(worksfamilylookup_expected))
@@ -492,13 +512,12 @@ def test_crossref_API_calls(self):
            end_date=events_end.strftime("%Y-%m-%d"),
        )
        events = download_crossref_event_url(event_url)
-        assert events == [{"passed": True}], f"Event return incorrect. Got {events}"
+        self.assertListEqual(events, [{"passed": True}], f"Event return incorrect. Got {events}")

        good_events = download_crossref_events([good_test_doi], events_start, events_end, mailto, max_threads=1)
        bad_events = download_crossref_events([bad_test_doi], events_start, events_end, mailto, max_threads=1)
-        assert good_events, "Events should have returned something"
-        assert len(good_events) == 4
-        assert not bad_events, f"Events should have returned nothing, instead returned {bad_events}"
+        self.assertEqual(len(good_events), 4)
+        self.assertFalse(bad_events, f"Events should have returned nothing, instead returned {bad_events}")

    @patch("oaebu_workflows.onix_workflow.onix_workflow.bq_run_query")
    def test_get_onix_records(self, mock_bq_query):
@@ -570,11 +589,11 @@ def test_crossref_transform(self):
        ]
        # Standalone transform
        actual_transformed_events = transform_event(input_events[0])
-        assert expected_transformed_events[0] == actual_transformed_events
+        self.assertEqual(expected_transformed_events[0], actual_transformed_events)
        # List transform
        actual_transformed_events = transform_crossref_events(input_events)
-        assert len(actual_transformed_events) == 1
-        assert expected_transformed_events == actual_transformed_events
+        self.assertEqual(len(actual_transformed_events), 1)
+        self.assertEqual(expected_transformed_events, actual_transformed_events)

    def test_insert_into_schema(self):
        """Tests the insert_into_schema function"""
@@ -606,9 +625,7 @@ def test_utility_functions(self):
        ### Test dois_from_table ###
        ############################

-        env = ObservatoryEnvironment(
-            self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port()
-        )
+        env = SandboxEnvironment(self.gcp_project_id, self.data_location)
        fake_doi_isbn_dataset_id = env.add_dataset(prefix="doi_isbn_test")
        fake_sharded_dataset = env.add_dataset(prefix="sharded_data")
        fake_copied_export_dataset = env.add_dataset(prefix="copied_export")
@@ -637,14 +654,14 @@ def test_utility_functions(self):
        fake_doi_isbns = [entry["DOI"] for entry in fake_doi_isbn_table]

        # Check there are no duplicates and the contents are the same
-        assert len(actual_dois) == len(set(fake_doi_isbns))
-        assert set(actual_dois) == set(fake_doi_isbns)
+        self.assertEqual(len(actual_dois), len(set(fake_doi_isbns)))
+        self.assertEqual(set(actual_dois), set(fake_doi_isbns))

        # Do the same but allow duplicates
        actual_dois = dois_from_table(table_id, doi_column_name="DOI", distinct=False)
        fake_doi_isbns = [entry["DOI"] for entry in fake_doi_isbn_table]
-        assert len(actual_dois) == len(fake_doi_isbns)
-        assert sorted(actual_dois) == sorted(fake_doi_isbns)
+        self.assertEqual(len(actual_dois), len(fake_doi_isbns))
+        self.assertEqual(sorted(actual_dois), sorted(fake_doi_isbns))

        #############################################
        ### Test copy_latest_export_tables ###
@@ -678,11 +695,11 @@ def test_utility_functions(self):
        actual_isbns = [entry["ISBN13"] for entry in fake_doi_isbn_table]
        actual_dois = [entry["DOI"] for entry in fake_doi_isbn_table]

-        assert len(copied_data) == len(fake_doi_isbn_table)
-        assert len(actual_isbns) == len(copied_isbns)
-        assert sorted(actual_isbns) == sorted(copied_isbns)
-        assert len(actual_dois) == len(copied_dois)
-        assert sorted(actual_dois) == sorted(copied_dois)
+
self.assertEqual(len(copied_data), len(fake_doi_isbn_table)) + self.assertEqual(len(actual_isbns), len(copied_isbns)) + self.assertEqual(sorted(actual_isbns), sorted(copied_isbns)) + self.assertEqual(len(actual_dois), len(copied_dois)) + self.assertEqual(sorted(actual_dois), sorted(copied_dois)) def setup_fake_lookup_tables( self, settings_dataset_id: str, fixtures_dataset_id: str, release_date: pendulum.DateTime, bucket_name: str @@ -827,9 +844,7 @@ def vcr_ignore_condition(request): return request # Setup Observatory environment - env = ObservatoryEnvironment( - self.gcp_project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.gcp_project_id, self.data_location) # Create workflow datasets onix_workflow_dataset_id = env.add_dataset(prefix="onix_workflow") @@ -876,14 +891,28 @@ def vcr_ignore_condition(request): else [] ) - workflow = OnixWorkflow( - dag_id=f"onix_workflow_test", + start_date = pendulum.datetime(year=2021, month=5, day=9) + dag_id = "onix_workflow_test" + bq_oaebu_crossref_metadata_table_name = "crossref_metadata" + bq_crossref_events_table_name = "crossref_events" + bq_book_table_name = "book" + bq_book_product_table_name = "book_product" + bq_worksid_table_name = "onix_workid_isbn" + bq_worksid_error_table_name = "onix_workid_isbn_errors" + bq_workfamilyid_table_name = "onix_workfamilyid_isbn" + api_dataset_id = env.add_dataset() + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, metadata_partner=metadata_partner, bq_master_crossref_project_id=env.cloud_workspace.project_id, bq_master_crossref_dataset_id=master_crossref_dataset_id, bq_oaebu_crossref_dataset_id=oaebu_crossref_dataset_id, + bq_oaebu_crossref_metadata_table_name=bq_oaebu_crossref_metadata_table_name, bq_master_crossref_metadata_table_name="crossref_metadata_master", # Set in setup_input_data() + bq_crossref_events_table_name=bq_crossref_events_table_name, + bq_book_table_name=bq_book_table_name, + bq_book_product_table_name=bq_book_product_table_name, bq_country_project_id=env.cloud_workspace.project_id, bq_country_dataset_id=oaebu_settings_dataset_id, bq_subject_project_id=env.cloud_workspace.project_id, @@ -891,64 +920,68 @@ def vcr_ignore_condition(request): bq_onix_workflow_dataset=onix_workflow_dataset_id, bq_oaebu_intermediate_dataset=oaebu_intermediate_dataset_id, bq_oaebu_dataset=oaebu_output_dataset_id, + bq_worksid_table_name=bq_worksid_table_name, + bq_worksid_error_table_name=bq_worksid_error_table_name, + bq_workfamilyid_table_name=bq_workfamilyid_table_name, bq_oaebu_export_dataset=oaebu_export_dataset_id, bq_oaebu_latest_export_dataset=oaebu_latest_export_dataset_id, + api_dataset_id=api_dataset_id, data_partners=data_partners, sensor_dag_ids=sensor_dag_ids, - start_date=pendulum.datetime(year=2021, month=5, day=9), + start_date=start_date, crossref_start_date=pendulum.datetime(year=2018, month=5, day=14), max_threads=1, # Use 1 thread for tests ) - workflow_dag = workflow.make_dag() # Skip dag existence check in sensor. - for sensor in [task for task in workflow_dag.tasks if task.node_id.startswith("sensors.")]: + for sensor in [task for task in dag.tasks if task.node_id.startswith("sensors.")]: sensor.check_exists = False + sensor.check_existence = False sensor.grace_period = timedelta(seconds=1) - # If there is no dag run in the search interval, sensor will return success. 
- expected_state = "success" - with env.create_dag_run(workflow_dag, workflow.start_date): - for sensor_id in sensor_dag_ids: - ti = env.run_task(f"sensors.{sensor_id}_sensor") - self.assertEqual(ti.state, State.SUCCESS) - # Run Dummy Dags - execution_date = pendulum.datetime(year=2021, month=5, day=16) - for dag_id in sensor_dag_ids: - dag = make_dummy_dag(dag_id, execution_date) - with env.create_dag_run(dag, execution_date): + execution_date = pendulum.datetime(year=2021, month=5, day=17) + for sensor_id in sensor_dag_ids: + dummy_dag = make_dummy_dag(sensor_id, execution_date) + with env.create_dag_run(dummy_dag, execution_date): # Running all of a DAGs tasks sets the DAG to finished ti = env.run_task("dummy_task") self.assertEqual(ti.state, State.SUCCESS) # Run end to end tests for DOI DAG - with env.create_dag_run(workflow_dag, execution_date): + with env.create_dag_run(dag, execution_date): + # Run dependency check + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + # Test that sensors go into 'success' state as the DAGs that they are waiting for have finished - for dag_id in sensor_dag_ids: - ti = env.run_task(f"sensors.{dag_id}_sensor") + for sensor_id in sensor_dag_ids: + ti = env.run_task(f"sensors.{sensor_id}_sensor") self.assertEqual(ti.state, State.SUCCESS) # Mock make_release - release_date = pendulum.datetime(year=2021, month=5, day=22) - workflow.make_release = MagicMock( - return_value=OnixWorkflowRelease( - dag_id=workflow.dag_id, - run_id=env.dag_run.run_id, - snapshot_date=release_date, - onix_snapshot_date=partner_release_date, - crossref_master_snapshot_date=partner_release_date, - ) - ) - release = workflow.make_release() + with patch("oaebu_workflows.onix_workflow.onix_workflow.bq_select_table_shard_dates") as mock_date: + mock_date.return_value = [partner_release_date] + ti = env.run_task("make_release") + self.assertEqual(ti.state, State.SUCCESS) + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + expected_release_dict = { + "dag_id": "onix_workflow_test", + "run_id": "scheduled__2021-05-17T00:00:00+00:00", + "snapshot_date": "2021-05-24", + "onix_snapshot_date": "2021-05-15", + "crossref_master_snapshot_date": "2021-05-15", + } + self.assertEqual(release_dict, expected_release_dict) + release = OnixWorkflowRelease.from_dict(release_dict) release_suffix = release.snapshot_date.strftime("%Y%m%d") # Aggregate works - ti = env.run_task(workflow.aggregate_works.__name__) + ti = env.run_task("aggregate_works") self.assertEqual(ti.state, State.SUCCESS) table_id = bq_sharded_table_id( - self.gcp_project_id, onix_workflow_dataset_id, workflow.bq_worksid_table_name, release_date + self.gcp_project_id, onix_workflow_dataset_id, bq_worksid_table_name, release.snapshot_date ) self.assert_table_content( table_id, @@ -956,7 +989,7 @@ def vcr_ignore_condition(request): primary_key="isbn13", ) table_id = bq_sharded_table_id( - self.gcp_project_id, onix_workflow_dataset_id, workflow.bq_worksid_error_table_name, release_date + self.gcp_project_id, onix_workflow_dataset_id, bq_worksid_error_table_name, release.snapshot_date ) self.assert_table_content( table_id, @@ -966,7 +999,7 @@ def vcr_ignore_condition(request): primary_key="Error", ) table_id = bq_sharded_table_id( - self.gcp_project_id, onix_workflow_dataset_id, workflow.bq_workfamilyid_table_name, release_date + self.gcp_project_id, onix_workflow_dataset_id, bq_workfamilyid_table_name, release.snapshot_date ) self.assert_table_content( table_id, @@ -977,14 
+1010,14 @@ def vcr_ignore_condition(request): ) # Load crossref metadata table into bigquery - ti = env.run_task(workflow.create_crossref_metadata_table.__name__) + ti = env.run_task("create_crossref_metadata_table") self.assertEqual(ti.state, State.SUCCESS) table_id = bq_sharded_table_id( self.gcp_project_id, oaebu_crossref_dataset_id, - workflow.bq_oaebu_crossref_metadata_table_name, - release_date, + bq_oaebu_crossref_metadata_table_name, + release.snapshot_date, ) self.assert_table_content( table_id, @@ -996,13 +1029,13 @@ def vcr_ignore_condition(request): with vcr.use_cassette( self.events_cassette, record_mode="none", before_record_request=vcr_ignore_condition ): - ti = env.run_task(workflow.create_crossref_events_table.__name__) + ti = env.run_task("create_crossref_events_table") self.assertEqual(ti.state, State.SUCCESS) table_id = bq_sharded_table_id( self.gcp_project_id, oaebu_crossref_dataset_id, - workflow.bq_crossref_events_table_name, - release_date, + bq_crossref_events_table_name, + release.snapshot_date, ) crossref_fixture_table = load_and_parse_json( os.path.join(self.fixtures_folder, "e2e_outputs", "crossref_events.json"), @@ -1011,10 +1044,10 @@ def vcr_ignore_condition(request): self.assert_table_content(table_id, crossref_fixture_table, primary_key="id") # Create book table in bigquery - ti = env.run_task(workflow.create_book_table.__name__) + ti = env.run_task("create_book_table") self.assertEqual(ti.state, State.SUCCESS) table_id = bq_sharded_table_id( - self.gcp_project_id, oaebu_output_dataset_id, workflow.bq_book_table_name, release_date + self.gcp_project_id, oaebu_output_dataset_id, bq_book_table_name, release.snapshot_date ) self.assert_table_content( table_id, @@ -1028,10 +1061,10 @@ def vcr_ignore_condition(request): self.assertEqual(ti.state, State.SUCCESS) # Create book product table - ti = env.run_task(workflow.create_book_product_table.__name__) + ti = env.run_task("create_book_product_table") self.assertEqual(ti.state, State.SUCCESS) table_id = bq_sharded_table_id( - self.gcp_project_id, oaebu_output_dataset_id, workflow.bq_book_product_table_name, release_date + self.gcp_project_id, oaebu_output_dataset_id, bq_book_product_table_name, release.snapshot_date ) expected_book_product_table = "book_product.json" if not dry_run else "book_product_dry.json" self.assert_table_content( @@ -1101,13 +1134,13 @@ def vcr_ignore_condition(request): # Check that the data_export tables tables exist and have the correct number of rows for table, exp_rows in export_tables: table_id = bq_sharded_table_id( - self.gcp_project_id, oaebu_export_dataset_id, f"{export_prefix}_{table}", release_date + self.gcp_project_id, oaebu_export_dataset_id, f"{export_prefix}_{table}", release.snapshot_date ) self.assert_table_integrity(table_id, expected_rows=exp_rows) # Book product list content assertion table_id = bq_sharded_table_id( - self.gcp_project_id, oaebu_export_dataset_id, f"{export_prefix}_book_list", release_date + self.gcp_project_id, oaebu_export_dataset_id, f"{export_prefix}_book_list", release.snapshot_date ) expected_book_list_table = "book_list.json" if not dry_run else "book_list_dry.json" fixture_table = load_and_parse_json( @@ -1166,7 +1199,7 @@ def vcr_ignore_condition(request): ### Create and validate export copies ### ######################################### - ti = env.run_task(workflow.update_latest_export_tables.__name__) + ti = env.run_task("update_latest_export_tables") self.assertEqual(ti.state, State.SUCCESS) # Check export views are the same as the 
tables @@ -1174,7 +1207,7 @@ def vcr_ignore_condition(request): export_copy = bq_run_query( f"SELECT * FROM {self.gcp_project_id}.{oaebu_latest_export_dataset_id}.{self.gcp_project_id.replace('-', '_')}_{export_table}" ) - self.assertEqual(expected_state, ti.state, msg=f"table: {table}") + self.assertEqual(ti.state, State.SUCCESS, msg=f"table: {table}") # Check that the data_export table has the correct number of rows self.assertEqual(len(export_copy), exp_rows) @@ -1182,18 +1215,43 @@ def vcr_ignore_condition(request): ### Add releases and Cleanup ### ################################ - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=workflow.dag_id, dataset_id=workflow.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.gcp_project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(workflow.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.onix_workflow.onix_workflow.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("add_new_dataset_releases") self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=workflow.dag_id, dataset_id=workflow.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2021-05-17T00:00:00+00:00", + "data_interval_end": "2021-05-24T00:00:00+00:00", + "snapshot_date": "2021-05-24T00:00:00+00:00", + "partition_date": None, + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test cleanup - ti = env.run_task(workflow.cleanup.__name__) + release_workflow_folder = release.workflow_folder + ti = env.run_task("cleanup_workflow") self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(release_workflow_folder) def test_workflow_e2e(self): """Test that ONIX Workflow works as expected""" diff --git a/oaebu_workflows/tests/test_oaebu_partners.py b/tests/test_oaebu_partners.py similarity index 100% rename from oaebu_workflows/tests/test_oaebu_partners.py rename to tests/test_oaebu_partners.py diff --git a/oaebu_workflows/tests/test_onix_utils.py b/tests/test_onix_utils.py similarity index 99% rename from oaebu_workflows/tests/test_onix_utils.py rename to tests/test_onix_utils.py index ab5cc430..80d2a520 100644 --- a/oaebu_workflows/tests/test_onix_utils.py +++ b/tests/test_onix_utils.py @@ -36,16 +36,13 @@ deduplicate_related_products, ) from oaebu_workflows.config import test_fixtures_folder, schema_folder -from observatory.platform.observatory_environment import ( - ObservatoryTestCase, - compare_lists_of_dicts, -) -from observatory.platform.files import load_jsonl +from observatory_platform.files import load_jsonl +from observatory_platform.sandbox.test_utils import SandboxTestCase, compare_lists_of_dicts FIXTURES_FOLDER = os.path.join(test_fixtures_folder(), "onix_utils") -class 
TestOnixTransformer(ObservatoryTestCase): +class TestOnixTransformer(SandboxTestCase): """Tests for the ONIX transformer end to end""" filtered_name = "filtered.xml" @@ -155,7 +152,7 @@ def _json_loader(file_path: str): loader(transformer_output_path) -class TestOnixFunctions(ObservatoryTestCase): +class TestOnixFunctions(SandboxTestCase): """Tests for the ONIX telescope""" def test_onix_parser_download_execute(self): diff --git a/oaebu_workflows/ucl_discovery_telescope/__init__.py b/tests/thoth_telescope/__init__.py similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/__init__.py rename to tests/thoth_telescope/__init__.py diff --git a/oaebu_workflows/ucl_discovery_telescope/schema/__init__.py b/tests/thoth_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/schema/__init__.py rename to tests/thoth_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/thoth_telescope/tests/fixtures/test_table.json b/tests/thoth_telescope/fixtures/test_table.json similarity index 100% rename from oaebu_workflows/thoth_telescope/tests/fixtures/test_table.json rename to tests/thoth_telescope/fixtures/test_table.json diff --git a/oaebu_workflows/thoth_telescope/tests/fixtures/thoth_download_cassette.yaml b/tests/thoth_telescope/fixtures/thoth_download_cassette.yaml similarity index 100% rename from oaebu_workflows/thoth_telescope/tests/fixtures/thoth_download_cassette.yaml rename to tests/thoth_telescope/fixtures/thoth_download_cassette.yaml diff --git a/oaebu_workflows/thoth_telescope/tests/test_thoth_telescope.py b/tests/thoth_telescope/test_thoth_telescope.py similarity index 60% rename from oaebu_workflows/thoth_telescope/tests/test_thoth_telescope.py rename to tests/thoth_telescope/test_thoth_telescope.py index 21e50b39..6551a64b 100644 --- a/oaebu_workflows/thoth_telescope/tests/test_thoth_telescope.py +++ b/tests/thoth_telescope/test_thoth_telescope.py @@ -16,6 +16,7 @@ import os from tempfile import TemporaryDirectory +from unittest.mock import patch import pendulum import vcr @@ -23,28 +24,25 @@ from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.thoth_telescope.thoth_telescope import ( - ThothTelescope, - thoth_download_onix, DEFAULT_HOST_NAME, + ThothRelease, + thoth_download_onix, + create_dag, ) -from oaebu_workflows.config import test_fixtures_folder -from observatory.platform.api import get_dataset_releases -from observatory.platform.bigquery import bq_sharded_table_id -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.utils.url_utils import retry_get_url -from observatory.platform.observatory_config import Workflow -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) +from oaebu_workflows.config import test_fixtures_folder, module_file_path +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.google.bigquery import bq_sharded_table_id +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.url_utils import retry_get_url +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, load_and_parse_json +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment FAKE_PUBLISHER_ID = "fake_publisher_id" -class TestThothTelescope(ObservatoryTestCase): +class TestThothTelescope(SandboxTestCase): 
"""Tests for the Thoth telescope""" def __init__(self, *args, **kwargs): @@ -71,48 +69,48 @@ def __init__(self, *args, **kwargs): def test_dag_structure(self): """Test that the ONIX DAG has the correct structure.""" - dag = ThothTelescope( + dag = create_dag( dag_id="thoth_telescope_test", cloud_workspace=self.fake_cloud_workspace, publisher_id=FAKE_PUBLISHER_ID, format_specification="onix_3.0::jstor", - ).make_dag() + ) self.assert_dag_structure( { - "check_dependencies": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], + "check_dependencies": ["make_release"], + "make_release": ["download", "transform", "bq_load", "add_new_dataset_releases", "cleanup_workflow"], + "download": ["transform"], + "transform": ["bq_load"], "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], }, dag, ) def test_dag_load(self): """Test that the DAG can be loaded from a DAG bag.""" - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="thoth_telescope_test", name="Thoth Telescope", - class_name="oaebu_workflows.thoth_telescope.thoth_telescope.ThothTelescope", + class_name="oaebu_workflows.thoth_telescope.thoth_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(publisher_id=FAKE_PUBLISHER_ID, format_specification="onix::oapen"), ) ], ) with env.create(): - self.assert_dag_load_from_config("thoth_telescope_test") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("thoth_telescope_test", dag_file) # Error should be raised for no publisher_id env.workflows[0].kwargs = {} with env.create(): with self.assertRaises(AssertionError) as cm: - self.assert_dag_load_from_config("onix_workflow_test_dag_load") + self.assert_dag_load_from_config("onix_workflow_test_dag_load", dag_file) msg = cm.exception.args[0] self.assertIn("missing 2 required keyword-only arguments", msg) self.assertIn("publisher_id", msg) @@ -120,9 +118,7 @@ def test_dag_load(self): def test_telescope(self): """Test the Thoth telescope end to end.""" - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.project_id, self.data_location) # Create the Observatory environment and run tests with env.create(): @@ -130,39 +126,54 @@ def test_telescope(self): execution_date = pendulum.datetime(year=2022, month=12, day=1) metadata_partner = partner_from_str("thoth", metadata_partner=True) metadata_partner.bq_dataset_id = env.add_dataset() - telescope = ThothTelescope( - dag_id="thoth_telescope_test", + dag_id = "thoth_telescope_test" + api_dataset_id = env.add_dataset() + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, format_specification="onix_3.0::oapen", elevate_related_products=True, publisher_id=FAKE_PUBLISHER_ID, metadata_partner=metadata_partner, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() with env.create_dag_run(dag, execution_date): - ti = env.run_task(telescope.check_dependencies.__name__) + # Check dependencies task + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + + # Make release task + ti = env.run_task("make_release") self.assertEqual(ti.state, State.SUCCESS) - thoth_vcr = vcr.VCR(record_mode="none") + release_dict = 
ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + expected_release_dict = { + "dag_id": "thoth_telescope_test", + "run_id": "scheduled__2022-12-01T00:00:00+00:00", + "snapshot_date": "2022-12-04", + } + self.assertEqual(release_dict, expected_release_dict) + release = ThothRelease.from_dict(release_dict) + + # Download task + # Ignore the googleapis host so the upload step works + thoth_vcr = vcr.VCR( + record_mode="none", ignore_hosts=["oauth2.googleapis.com", "storage.googleapis.com"] + ) with thoth_vcr.use_cassette(self.download_cassette): - ti = env.run_task(telescope.download.__name__) + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.upload_downloaded.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.upload_transformed.__name__) + + # Transform task + ti = env.run_task("transform") self.assertEqual(ti.state, State.SUCCESS) - ti = env.run_task(telescope.bq_load.__name__) + + # Bigquery load task + ti = env.run_task("bq_load") self.assertEqual(ti.state, State.SUCCESS) ### Make assertions ### - # Make the release - release = telescope.make_release( - run_id=env.dag_run.run_id, data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)) - ) - # Downloaded file self.assert_file_integrity(release.download_path, "043e9c474e14e2776b22fc590ea1773c", "md5") @@ -178,26 +189,50 @@ def test_telescope(self): # Uploaded table table_id = bq_sharded_table_id( - telescope.cloud_workspace.project_id, - telescope.metadata_partner.bq_dataset_id, - telescope.metadata_partner.bq_table_name, + env.cloud_workspace.project_id, + metadata_partner.bq_dataset_id, + metadata_partner.bq_table_name, release.snapshot_date, ) self.assert_table_integrity(table_id, expected_rows=5) self.assert_table_content(table_id, load_and_parse_json(self.test_table), primary_key="ISBN13") - # add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=self.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.thoth_telescope.thoth_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("add_new_dataset_releases") self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2022-12-01T00:00:00+00:00", + "data_interval_end": "2022-12-04T12:00:00+00:00", + "snapshot_date": "2022-12-04T00:00:00+00:00", + "partition_date": None, + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test cleanup - ti = 
env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("cleanup_workflow") self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) # Function tests def test_download_onix(self): diff --git a/oaebu_workflows/ucl_discovery_telescope/sql/__init__.py b/tests/ucl_discovery_telescope/__init__.py similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/sql/__init__.py rename to tests/ucl_discovery_telescope/__init__.py diff --git a/oaebu_workflows/ucl_discovery_telescope/tests/__init__.py b/tests/ucl_discovery_telescope/fixtures/__init__.py similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/tests/__init__.py rename to tests/ucl_discovery_telescope/fixtures/__init__.py diff --git a/oaebu_workflows/ucl_discovery_telescope/tests/fixtures/download_cassette.yaml b/tests/ucl_discovery_telescope/fixtures/download_cassette.yaml similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/tests/fixtures/download_cassette.yaml rename to tests/ucl_discovery_telescope/fixtures/download_cassette.yaml diff --git a/oaebu_workflows/ucl_discovery_telescope/tests/fixtures/test_table.json b/tests/ucl_discovery_telescope/fixtures/test_table.json similarity index 100% rename from oaebu_workflows/ucl_discovery_telescope/tests/fixtures/test_table.json rename to tests/ucl_discovery_telescope/fixtures/test_table.json diff --git a/oaebu_workflows/ucl_discovery_telescope/tests/test_ucl_discovery_telescope.py b/tests/ucl_discovery_telescope/test_ucl_discovery_telescope.py similarity index 81% rename from oaebu_workflows/ucl_discovery_telescope/tests/test_ucl_discovery_telescope.py rename to tests/ucl_discovery_telescope/test_ucl_discovery_telescope.py index 7f93087b..4de795c9 100644 --- a/oaebu_workflows/ucl_discovery_telescope/tests/test_ucl_discovery_telescope.py +++ b/tests/ucl_discovery_telescope/test_ucl_discovery_telescope.py @@ -1,4 +1,4 @@ -# Copyright 2020-2023 Curtin University +# Copyright 2020-2024 Curtin University # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
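The hunks above illustrate the pattern every migrated telescope test now follows: the class-based telescope is replaced by a create_dag factory, ObservatoryEnvironment becomes SandboxEnvironment, tasks are run by their string task IDs, and the release is rebuilt from the dict that make_release pushes to XCom. Below is a minimal sketch of that pattern, assuming only the interfaces already shown in this patch; the test class name, project ID and data location are placeholders, and the remaining create_dag arguments are left at their defaults as in the dag-structure test.

import pendulum
from airflow.utils.state import State

from oaebu_workflows.thoth_telescope.thoth_telescope import ThothRelease, create_dag
from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment
from observatory_platform.sandbox.test_utils import SandboxTestCase


class TestMigratedTelescopePattern(SandboxTestCase):
    def test_pattern(self):
        # Placeholder project/location; the real tests pass self.project_id and self.data_location.
        env = SandboxEnvironment("my-project-id", "us")
        dag = create_dag(
            dag_id="thoth_telescope_test",
            cloud_workspace=env.cloud_workspace,
            publisher_id="fake_publisher_id",
            format_specification="onix_3.0::oapen",
        )
        execution_date = pendulum.datetime(year=2022, month=12, day=1)
        with env.create(), env.create_dag_run(dag, execution_date):
            # Tasks are addressed by string task ID rather than bound-method __name__.
            ti = env.run_task("check_dependencies")
            self.assertEqual(ti.state, State.SUCCESS)

            # make_release pushes a plain dict to XCom; later assertions rebuild the
            # release object from it instead of calling telescope.make_release().
            ti = env.run_task("make_release")
            self.assertEqual(ti.state, State.SUCCESS)
            release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False)
            self.assertEqual(release_dict["dag_id"], "thoth_telescope_test")
            release = ThothRelease.from_dict(release_dict)
            # The remaining tasks ("download", "transform", "bq_load",
            # "add_new_dataset_releases", "cleanup_workflow") run the same way,
            # with the download wrapped in a VCR cassette as in the tests above.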
@@ -22,28 +22,26 @@ from airflow.utils.state import State from airflow.models.connection import Connection import vcr +from google.cloud.bigquery import Client -from oaebu_workflows.config import test_fixtures_folder +from oaebu_workflows.config import test_fixtures_folder, module_file_path from oaebu_workflows.oaebu_partners import partner_from_str from oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope import ( - UclDiscoveryTelescope, + UclDiscoveryRelease, + create_dag, get_isbn_eprint_mappings, download_discovery_stats, transform_discovery_stats, ) -from observatory.platform.api import get_dataset_releases -from observatory.platform.observatory_config import Workflow -from observatory.platform.bigquery import bq_table_id -from observatory.platform.gcs import gcs_blob_name_from_path -from observatory.platform.observatory_environment import ( - ObservatoryEnvironment, - ObservatoryTestCase, - find_free_port, - load_and_parse_json, -) +from observatory_platform.dataset_api import DatasetAPI +from observatory_platform.google.bigquery import bq_table_id +from observatory_platform.google.gcs import gcs_blob_name_from_path +from observatory_platform.airflow.workflow import Workflow +from observatory_platform.sandbox.test_utils import SandboxTestCase, load_and_parse_json +from observatory_platform.sandbox.sandbox_environment import SandboxEnvironment -class TestUclDiscoveryTelescope(ObservatoryTestCase): +class TestUclDiscoveryTelescope(SandboxTestCase): """Tests for the Ucl Discovery telescope""" def __init__(self, *args, **kwargs): @@ -58,62 +56,59 @@ def __init__(self, *args, **kwargs): def test_dag_structure(self): """Test that the UCL Discovery DAG has the correct structure.""" - dag = UclDiscoveryTelescope( - dag_id="Test_Dag", cloud_workspace=self.fake_cloud_workspace, sheet_id="foo" - ).make_dag() + dag = create_dag(dag_id="Test_Dag", cloud_workspace=self.fake_cloud_workspace, sheet_id="foo") self.assert_dag_structure( { - "check_dependencies": ["download"], - "download": ["upload_downloaded"], - "upload_downloaded": ["transform"], - "transform": ["upload_transformed"], - "upload_transformed": ["bq_load"], + "check_dependencies": ["make_release"], + "make_release": ["download", "transform", "bq_load", "add_new_dataset_releases", "cleanup_workflow"], + "download": ["transform"], + "transform": ["bq_load"], "bq_load": ["add_new_dataset_releases"], - "add_new_dataset_releases": ["cleanup"], - "cleanup": [], + "add_new_dataset_releases": ["cleanup_workflow"], + "cleanup_workflow": [], }, dag, ) def test_dag_load(self): """Test that the UCL Discovery DAG can be loaded from a DAG bag.""" - env = ObservatoryEnvironment( + env = SandboxEnvironment( workflows=[ Workflow( dag_id="ucl_discovery", name="UCL Discovery Telescope", - class_name="oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope.UclDiscoveryTelescope", + class_name="oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope.create_dag", cloud_workspace=self.fake_cloud_workspace, kwargs=dict(sheet_id="foo"), ) ] ) with env.create(): - self.assert_dag_load_from_config("ucl_discovery") + dag_file = os.path.join(module_file_path("dags"), "load_dags.py") + self.assert_dag_load_from_config("ucl_discovery", dag_file) def test_telescope(self): """Test the UCL Discovery telescope end to end.""" # Setup Observatory environment - env = ObservatoryEnvironment( - self.project_id, self.data_location, api_host="localhost", api_port=find_free_port() - ) + env = SandboxEnvironment(self.project_id, self.data_location) - 
# Setup Telescope + # Setup DAG data_partner = partner_from_str("ucl_discovery") data_partner.bq_dataset_id = env.add_dataset() - telescope = UclDiscoveryTelescope( - dag_id="ucl_discovery", + dag_id = "ucl_discovery" + api_dataset_id = env.add_dataset() + dag = create_dag( + dag_id=dag_id, cloud_workspace=env.cloud_workspace, sheet_id="foo", data_partner=data_partner, max_threads=1, + api_dataset_id=api_dataset_id, ) - dag = telescope.make_dag() execution_date = pendulum.datetime(year=2023, month=6, day=1) # Create the Observatory environment and run tests with env.create(), env.create_dag_run(dag, execution_date): - # env.add_connection(Connection(conn_id=telescope.oaebu_service_account_conn_id)) # Mock return values of download function interval_start = pendulum.instance(env.dag_run.data_interval_start) sheet_return = [ @@ -139,7 +134,22 @@ def test_telescope(self): ############################ # Test that all dependencies are specified: no error should be thrown - ti = env.run_task(telescope.check_dependencies.__name__) + ti = env.run_task("check_dependencies") + self.assertEqual(ti.state, State.SUCCESS) + + # Make the release + ti = env.run_task("make_release") + self.assertEqual(ti.state, State.SUCCESS) + release_dict = ti.xcom_pull(task_ids="make_release", include_prior_dates=False) + expected_release_dict = { + "dag_id": "ucl_discovery", + "run_id": "scheduled__2023-06-01T00:00:00+00:00", + "data_interval_start": "2023-06-01", + "data_interval_end": "2023-06-01", + "partition_date": "2023-06-30", + } + self.assertEqual(release_dict, expected_release_dict) + release = UclDiscoveryRelease.from_dict(release_dict) # download cassette = vcr.VCR(record_mode="none") @@ -148,46 +158,34 @@ def test_telescope(self): "oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope.BaseHook.get_connection" ) build_patch = patch("oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope.discovery.build") - with sa_patch, conn_patch, build_patch as mock_build, cassette.use_cassette(self.download_cassette): + with sa_patch, conn_patch, build_patch as mock_build, cassette.use_cassette( + self.download_cassette, ignore_hosts=["oauth2.googleapis.com", "storage.googleapis.com"] + ): mock_service = mock_build.return_value.spreadsheets.return_value.values.return_value.get.return_value mock_service.execute.return_value = {"values": sheet_return} - ti = env.run_task(telescope.download.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # upload_downloaded - ti = env.run_task(telescope.upload_downloaded.__name__) + ti = env.run_task("download") self.assertEqual(ti.state, State.SUCCESS) # transform with sa_patch, conn_patch, build_patch as mock_build: mock_service = mock_build.return_value.spreadsheets.return_value.values.return_value.get.return_value mock_service.execute.return_value = {"values": sheet_return} - ti = env.run_task(telescope.transform.__name__) - self.assertEqual(ti.state, State.SUCCESS) - - # upload_transformed - ti = env.run_task(telescope.upload_transformed.__name__) + ti = env.run_task("transform") self.assertEqual(ti.state, State.SUCCESS) # bq_load - ti = env.run_task(telescope.bq_load.__name__) + ti = env.run_task("bq_load") self.assertEqual(ti.state, State.SUCCESS) - ############################################## - ### Create the release and make assertions ### - ############################################## - - release = telescope.make_release( - run_id=env.dag_run.run_id, - data_interval_start=pendulum.parse(str(env.dag_run.data_interval_start)), - 
data_interval_end=pendulum.parse(str(env.dag_run.data_interval_end)), - ) + ####################### + ### Make Assertions ### + ####################### # Download self.assertTrue(os.path.exists(release.download_country_path)) self.assertTrue(os.path.exists(release.download_totals_path)) - # Upload Downloaded + # Check downloaded files uploaded download_country_blob = gcs_blob_name_from_path(release.download_country_path) self.assert_blob_integrity(env.download_bucket, download_country_blob, release.download_country_path) download_totals_blob = gcs_blob_name_from_path(release.download_totals_path) @@ -196,16 +194,16 @@ def test_telescope(self): # Transform self.assertTrue(os.path.exists(release.transform_path)) - # Upload Transform + # Check transformed files uploaded self.assert_blob_integrity( env.transform_bucket, gcs_blob_name_from_path(release.transform_path), release.transform_path ) # Bigquery load table_id = bq_table_id( - telescope.cloud_workspace.project_id, - telescope.data_partner.bq_dataset_id, - telescope.data_partner.bq_table_name, + env.cloud_workspace.project_id, + data_partner.bq_dataset_id, + data_partner.bq_table_name, ) self.assert_table_integrity(table_id, 2) self.assert_table_content( @@ -216,18 +214,43 @@ def test_telescope(self): ### Final tasks ### ################### - # Add_dataset_release_task - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + # Set up the API + api = DatasetAPI(project_id=env.cloud_workspace.project_id, dataset_id=api_dataset_id) + api.seed_db() + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 0) - ti = env.run_task(telescope.add_new_dataset_releases.__name__) + + # Add_dataset_release_task + now = pendulum.now("Europe/London") # Use Europe/London to ensure +00UTC timezone + with patch("oaebu_workflows.ucl_discovery_telescope.ucl_discovery_telescope.pendulum.now") as mock_now: + mock_now.return_value = now + ti = env.run_task("add_new_dataset_releases") self.assertEqual(ti.state, State.SUCCESS) - dataset_releases = get_dataset_releases(dag_id=telescope.dag_id, dataset_id=telescope.api_dataset_id) + dataset_releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id) self.assertEqual(len(dataset_releases), 1) + expected_release = { + "dag_id": dag_id, + "dataset_id": api_dataset_id, + "dag_run_id": release.run_id, + "created": now.to_iso8601_string(), + "modified": now.to_iso8601_string(), + "data_interval_start": "2023-06-01T00:00:00+00:00", + "data_interval_end": "2023-06-04T00:00:00+00:00", + "snapshot_date": None, + "partition_date": "2023-06-30T00:00:00+00:00", + "changefile_start_date": None, + "changefile_end_date": None, + "sequence_start": None, + "sequence_end": None, + "extra": None, + } + self.assertEqual(expected_release, dataset_releases[0].to_dict()) # Test that all telescope data deleted - ti = env.run_task(telescope.cleanup.__name__) + workflow_folder_path = release.workflow_folder + ti = env.run_task("cleanup_workflow") self.assertEqual(ti.state, State.SUCCESS) - self.assert_cleanup(release.workflow_folder) + self.assert_cleanup(workflow_folder_path) class TestGetIsbnEprintMappings(TestCase):
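The dataset-release assertions in these tests also share one shape: a DatasetAPI is pointed at a throwaway dataset, pendulum.now is patched so the created/modified timestamps of the stored release are deterministic, and the row returned by get_dataset_releases is compared with an expected dict. The following is a condensed sketch of that block, assuming the DatasetAPI calls used above; the project, dataset and DAG IDs are placeholders, and the patch target is whichever telescope module actually calls pendulum.now().

from unittest.mock import patch

import pendulum

from observatory_platform.dataset_api import DatasetAPI

# Placeholder identifiers; the real tests use the sandbox project and env.add_dataset().
project_id = "my-project-id"
api_dataset_id = "dataset_api_test"
dag_id = "my_telescope"

api = DatasetAPI(project_id=project_id, dataset_id=api_dataset_id)
api.seed_db()
assert len(api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id)) == 0

# Pin "now" so the created/modified fields written by add_new_dataset_releases are predictable.
now = pendulum.now("Europe/London")
with patch("oaebu_workflows.thoth_telescope.thoth_telescope.pendulum.now") as mock_now:
    mock_now.return_value = now
    # env.run_task("add_new_dataset_releases") runs here inside the sandbox DAG run.

releases = api.get_dataset_releases(dag_id=dag_id, dataset_id=api_dataset_id)
# After the task has run, each stored release serialises via to_dict(), and its
# created/modified values equal now.to_iso8601_string(), as asserted in the tests above.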