
Commit 4a9c008

Update reingestion workflows to load and report data (#618)
* Refactor Wikimedia Commons to use ProviderDataIngester
* Add main function
* Refactor Wikimedia Commons to use ProviderDataIngester
* Refactor provider_dag_factory to separate out the ingestion tasks from the reporting
* Append the day_shift to the end of ingestion tasks if not zero
* Update reporting task to sum values if a list is provided
* Fix reporting tests
* Update MediaStores to take a date and append to end of tsv filename
* Initialize MediaStores with date in ProviderDataIngester
* Refactor ingestion_workflow dag factory to include load/reporting steps
* Update ingestion dag factory to accept ProviderDataIngester classes
* Update Wikimedia ingestion workflow configuration
* Update reingestion workflow tests
* Add pre/post ingestion tasks and load timeouts to reingestion flows
* Add date_partition_for_prefix macro
* Fix tests
* Remove changes from rebase
* Rename ingestion to reingestion
* Update types in reporting module
* Refactor args and types

  There were many methods that had large argument lists that were simply being
  passed through, making documentation very messy. Updated to pass the entire
  workflow conf object through. Also updated some docstrings and refined the
  way defaults are set on the workflow confs.
* Update reporting tests
* Report DAG id instead of provider name, pass through dates
* Add disclaimer about aggregate duration
* Update dag parsing tests
* Temporarily change reingestion schedule to weekly
* Update DAG docs
* Update test
* Remove TODO
* Update variable name to clarify that it is a boolean
* Remove TODO
* Adjust function order
* Add tags for media types
* Improve variable names
* Fix tests
* Fix test import
* Partition by reingestion date in tsv filenames

  Avoids collisions on tsv filenames when reingestion runs for two different
  dates at the same time.
  - Removes the old implementation, which appended ingestion date to the actual filename
  - Instead partitions by the reingestion date
* Fix and update tsv partition tests
* Make task names more verbose
* Pluralize 'tasks' in slack message
1 parent fc62774 · commit 4a9c008

17 files changed (+675, -374 lines)

DAGs.md

Lines changed: 50 additions & 3 deletions
```diff
@@ -102,9 +102,9 @@ The following are DAGs grouped by their primary tag:
 
 | DAG ID | Schedule Interval |
 | --- | --- |
-| `europeana_ingestion_workflow` | `@daily` |
-| `flickr_ingestion_workflow` | `@daily` |
-| `wikimedia_ingestion_workflow` | `@daily` |
+| [`europeana_reingestion_workflow`](#europeana_reingestion_workflow) | `@weekly` |
+| [`flickr_reingestion_workflow`](#flickr_reingestion_workflow) | `@weekly` |
+| [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) | `@weekly` |
 
 
 # DAG documentation
@@ -114,7 +114,9 @@ The following is documentation associated with each DAG (where available):
 1. [`airflow_log_cleanup`](#airflow_log_cleanup)
 1. [`audio_data_refresh`](#audio_data_refresh)
 1. [`check_silenced_dags`](#check_silenced_dags)
+1. [`europeana_reingestion_workflow`](#europeana_reingestion_workflow)
 1. [`europeana_workflow`](#europeana_workflow)
+1. [`flickr_reingestion_workflow`](#flickr_reingestion_workflow)
 1. [`flickr_workflow`](#flickr_workflow)
 1. [`freesound_workflow`](#freesound_workflow)
 1. [`image_data_refresh`](#image_data_refresh)
@@ -133,6 +135,7 @@ The following is documentation associated with each DAG (where available):
 1. [`tsv_to_postgres_loader`](#tsv_to_postgres_loader)
 1. [`walters_workflow`](#walters_workflow)
 1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow)
+1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow)
 1. [`wordpress_workflow`](#wordpress_workflow)
 
 
@@ -215,6 +218,20 @@ The DAG runs weekly.
 
 
 
+## `europeana_reingestion_workflow`
+
+
+Content Provider: Europeana
+
+ETL Process: Use the API to identify all CC licensed images.
+
+Output: TSV file containing the images and the
+respective meta-data.
+
+Notes: https://www.europeana.eu/api/v2/search.json
+
+
+
 ## `europeana_workflow`
 
 
@@ -229,6 +246,21 @@ Notes: https://www.europeana.eu/api/v2/search.json
 
 
 
+## `flickr_reingestion_workflow`
+
+
+Content Provider: Flickr
+
+ETL Process: Use the API to identify all CC licensed images.
+
+Output: TSV file containing the images and the
+respective meta-data.
+
+Notes: https://www.flickr.com/help/terms/api
+Rate limit: 3600 requests per hour.
+
+
+
 ## `flickr_workflow`
 
 
@@ -599,6 +631,21 @@ Notes: https://commons.wikimedia.org/wiki/API:Main_page
 
 
 
+## `wikimedia_reingestion_workflow`
+
+
+Content Provider: Wikimedia Commons
+
+ETL Process: Use the API to identify all CC-licensed images.
+
+Output: TSV file containing the image, the respective
+meta-data.
+
+Notes: https://commons.wikimedia.org/wiki/API:Main_page
+No rate limit specified.
+
+
+
 ## `wordpress_workflow`
 
 
```
openverse_catalog/dags/common/helpers.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -6,13 +6,13 @@ class IngestionInput(NamedTuple):
     repeats: int
 
 
-def get_reingestion_day_list_list(inputs: List[IngestionInput]):
+def get_partitioned_reingestion_days(inputs: List[IngestionInput]):
     """
     This method calculates day-shift lists for Provider API workflows.
 
     The input should be a list of pairs of integers:
 
-    `get_reingestion_day_list_list((x_0, y_0), ..., (x_n, y_n))`
+    `get_partitioned_reingestion_days((x_0, y_0), ..., (x_n, y_n))`
 
     The return will be a list of lists of integers. The zeroth inner
     list will be a list of integers counting by x_0, of length y_0. The
@@ -23,7 +23,7 @@ def get_reingestion_day_list_list(inputs: List[IngestionInput]):
     list.
 
     For example,
-        get_reingestion_day_list_list((1, 2), (2, 3), (3, 2))
+        get_partitioned_reingestion_days((1, 2), (2, 3), (3, 2))
     returns
         [[1, 2], [4, 6, 8], [11, 14]]
     """
```

openverse_catalog/dags/common/loader/reporting.py

Lines changed: 64 additions & 10 deletions
```diff
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import logging
-from typing import NamedTuple, Optional
+from typing import NamedTuple, Optional, Sequence
 
 from common.slack import send_message
 
```
```diff
@@ -26,11 +26,24 @@ class RecordMetrics(NamedTuple):
     foreign_id_dup: Optional[int]
     url_dup: Optional[int]
 
+    def _add_counts(self, a, b):
+        return (a or 0) + (b or 0)
+
+    def __add__(self, other):
+        if other is None:
+            return self
+        return RecordMetrics(
+            self._add_counts(self.upserted, other.upserted),
+            self._add_counts(self.missing_columns, other.missing_columns),
+            self._add_counts(self.foreign_id_dup, other.foreign_id_dup),
+            self._add_counts(self.url_dup, other.url_dup),
+        )
+
 
 MediaTypeRecordMetrics = dict[str, RecordMetrics]
 
 
-def humanize_time_duration(seconds: float) -> str:
+def humanize_time_duration(seconds: float | int) -> str:
     if seconds == 0:
         return "inf"
     elif seconds < 1:
```
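
The `__add__` override is what lets mapped-task results be combined with Python's built-in `sum`, treating `None` counts as zero. A self-contained sketch reproducing the class from this hunk, with a usage example:

```python
from typing import NamedTuple, Optional


class RecordMetrics(NamedTuple):
    upserted: Optional[int]
    missing_columns: Optional[int]
    foreign_id_dup: Optional[int]
    url_dup: Optional[int]

    def _add_counts(self, a, b):
        # Treat missing (None) counts as zero so addition never fails
        return (a or 0) + (b or 0)

    def __add__(self, other):
        if other is None:
            return self
        return RecordMetrics(
            self._add_counts(self.upserted, other.upserted),
            self._add_counts(self.missing_columns, other.missing_columns),
            self._add_counts(self.foreign_id_dup, other.foreign_id_dup),
            self._add_counts(self.url_dup, other.url_dup),
        )


# Partial reports still sum cleanly:
total = RecordMetrics(10, None, 2, None) + RecordMetrics(5, 1, None, None)
assert total == RecordMetrics(15, 1, 2, 0)
```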
```diff
@@ -43,10 +56,39 @@ def humanize_time_duration(seconds: float) -> str:
     return ", ".join(parts)
 
 
+def clean_duration(duration: float | list[float]):
+    # If a list of duration values is provided, get the sum of all non-None values
+    if isinstance(duration, list):
+        duration = sum([x for x in duration if x])
+
+    # Truncate the duration value if it's provided
+    if isinstance(duration, float) or isinstance(duration, int):
+        duration = humanize_time_duration(duration)
+
+    return duration
+
+
+def clean_record_counts(
+    record_counts_by_media_type: MediaTypeRecordMetrics | list[MediaTypeRecordMetrics],
+    media_types: Sequence[str],
+):
+    # If a list of record_counts dicts is provided, sum all of the individual values
+    if isinstance(record_counts_by_media_type, list):
+        return {
+            media_type: sum(
+                [x[media_type] for x in record_counts_by_media_type],
+                RecordMetrics(0, 0, 0, 0),
+            )
+            for media_type in media_types
+        }
+    return record_counts_by_media_type
+
+
 def report_completion(
-    provider_name: str,
-    duration: float | str | None,
-    record_counts_by_media_type: MediaTypeRecordMetrics,
+    dag_id: str,
+    media_types: Sequence[str],
+    duration: float | str | list[float] | None,
+    record_counts_by_media_type: MediaTypeRecordMetrics | list[MediaTypeRecordMetrics],
     dated: bool = False,
     date_range_start: str | None = None,
     date_range_end: str | None = None,
```
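
Continuing the sketch above, `clean_record_counts` collapses a list of per-task dicts (one per reingestion day) into a single dict keyed by media type, seeding `sum` with `RecordMetrics(0, 0, 0, 0)`. Hypothetical data, assuming `RecordMetrics` and `clean_record_counts` as defined above:

```python
per_day_counts = [
    {"image": RecordMetrics(10, 0, 1, 0), "audio": RecordMetrics(3, None, 0, 0)},
    {"image": RecordMetrics(7, 2, 0, 1), "audio": RecordMetrics(None, 0, 0, 0)},
]

totals = clean_record_counts(per_day_counts, media_types=["image", "audio"])
assert totals["image"] == RecordMetrics(17, 2, 1, 1)
assert totals["audio"] == RecordMetrics(3, 0, 0, 0)
```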
```diff
@@ -72,9 +114,12 @@
     - `date_range`: The range of time this ingestion covers. If the ingestion covers
       the entire provided dataset, "all" is provided
     """
-    # Truncate the duration value if it's provided
-    if isinstance(duration, float):
-        duration = humanize_time_duration(duration)
+    is_aggregate_duration = isinstance(duration, list)
+
+    duration = clean_duration(duration)
+    record_counts_by_media_type = clean_record_counts(
+        record_counts_by_media_type, media_types
+    )
 
     # List record count per media type
     media_type_reports = ""
```
```diff
@@ -104,10 +149,19 @@
 
     # Collect data into a single message
     message = f"""
-*Provider*: `{provider_name}`
+*DAG*: `{dag_id}`
 *Date range*: {date_range}
-*Duration of data pull task*: {duration or '_No data_'}
+*Duration of data pull tasks*: {duration or '_No data_'}
 *Number of records upserted per media type*:
 {media_type_reports}"""
+
+    if is_aggregate_duration:
+        # Add disclaimer about duration for aggregate data
+        message += (
+            "\n_Duration is the sum of the duration for each data pull task."
+            " It does not include loading time and does not account for data"
+            " pulls that may happen concurrently."
+        )
+
     send_message(message, username="Airflow DAG Load Data Complete")
     return message
```
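
A hypothetical call from a reingestion DAG's reporting step; list-valued `duration` and `record_counts_by_media_type` (as produced by the per-day ingestion tasks) trigger the aggregate-duration disclaimer added above:

```python
report_completion(
    dag_id="wikimedia_reingestion_workflow",
    media_types=["image", "audio"],
    # One duration per data pull task; None entries are ignored in the sum
    duration=[742.1, None, 518.6],
    # One dict per task, summed per media type by clean_record_counts
    record_counts_by_media_type=per_day_counts,
)
```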

openverse_catalog/dags/common/storage/audio.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -18,6 +18,9 @@ class AudioStore(MediaStore):
 
     Optional init arguments:
     provider:       String marking the provider in the `audio` table of the DB.
+    date:           Date String in the form YYYY-MM-DD. This is the date for
+                    which data is being stored. If provided, it will be appended to
+                    the tsv filename.
     output_file:    String giving a temporary .tsv filename (*not* the
                     full path) where the audio info should be stored.
     output_dir:     String giving a path where `output_file` should be placed.
```

openverse_catalog/dags/common/storage/image.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -18,6 +18,9 @@ class ImageStore(MediaStore):
 
     Optional init arguments:
     provider:       String marking the provider in the `image` table of the DB.
+    date:           Date String in the form YYYY-MM-DD. This is the date for
+                    which data is being stored. If provided, it will be appended to
+                    the tsv filename.
     output_file:    String giving a temporary .tsv filename (*not* the
                     full path) where the image info should be stored.
     output_dir:     String giving a path where `output_file` should be placed.
```

openverse_catalog/dags/common/storage/media.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -46,6 +46,9 @@ class MediaStore(metaclass=abc.ABCMeta):
     Optional init arguments:
     provider:       String marking the provider in the `media`
                     (`image`, `audio` etc) table of the DB.
+    date:           Date String in the form YYYY-MM-DD. This is the date for
+                    which data is being stored. If provided, it will be appended to
+                    the tsv filename.
     output_file:    String giving a temporary .tsv filename (*not* the
                     full path) where the media info should be stored.
     output_dir:     String giving a path where `output_file` should be placed.
@@ -66,9 +69,7 @@ def __init__(
         self.provider = provider
         self.buffer_length = buffer_length
         self.output_path = self._initialize_output_path(
-            output_dir,
-            output_file,
-            provider,
+            output_dir, output_file, provider
         )
         self.columns = None
         self._media_buffer = []
@@ -158,7 +159,7 @@ def _initialize_output_path(
         self,
         output_dir: Optional[str],
         output_file: Optional[str],
-        provider: str,
+        provider: Optional[str],
         version: Optional[str] = None,
     ) -> str:
         """Creates the path for the tsv file.
```

openverse_catalog/dags/providers/factory_utils.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -147,7 +147,9 @@ def pull_media_wrapper(
 
 
 def date_partition_for_prefix(
-    schedule_interval: str | None, logical_date: datetime
+    schedule_interval: str | None,
+    logical_date: datetime,
+    reingestion_date: datetime,
 ) -> str:
     """
     Given a schedule interval and the logical date for a DAG run, determine an
@@ -158,6 +160,14 @@
     - Hourly -> `year=YYYY/month=MM/day=DD`
     - Daily -> `year=YYYY/month=MM`
     - None/yearly/monthly/weekly/other -> `year=YYYY`
+
+    If a reingestion_date is supplied, it is further partitioned by the reingestion
+    date itself to avoid filename collisions.
+
+    Example:
+    - Hourly -> `year=YYYY/month=MM/day=DD/reingestion=YYYY-MM-DD`
+    - Daily -> `year=YYYY/month=MM/reingestion=YYYY-MM-DD`
+    - None/yearly/monthly/weekly/other -> `year=YYYY/reingestion=YYYY-MM-DD`
     """
     hourly_airflow = "@hourly"
     hourly_cron = cron_presets[hourly_airflow]
@@ -175,4 +185,8 @@
     if schedule_interval in {hourly_airflow, hourly_cron}:
         prefix += f"/day={logical_date.day:02}"
 
+    # Further partition by reingestion date if supplied
+    if reingestion_date is not None:
+        prefix += f"/reingestion={reingestion_date}"
+
     return prefix
```
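
A minimal re-implementation of the documented partitioning rules, for illustration only; it checks the `@hourly`/`@daily` preset strings and skips the `cron_presets` equivalence handling that the real function performs:

```python
from datetime import date, datetime


def date_partition_for_prefix_sketch(
    schedule_interval: str | None,
    logical_date: datetime,
    reingestion_date: date | None,
) -> str:
    # Always partition by year; add month for daily and day for hourly schedules
    prefix = f"year={logical_date.year}"
    if schedule_interval in {"@hourly", "@daily"}:
        prefix += f"/month={logical_date.month:02}"
    if schedule_interval == "@hourly":
        prefix += f"/day={logical_date.day:02}"
    # Further partition by reingestion date to avoid tsv filename collisions
    if reingestion_date is not None:
        prefix += f"/reingestion={reingestion_date}"
    return prefix


assert (
    date_partition_for_prefix_sketch("@daily", datetime(2022, 9, 15), date(2022, 9, 1))
    == "year=2022/month=09/reingestion=2022-09-01"
)
```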
