Skip to content

Commit aef5a1b

Browse files
Add DAG to decode and deduplicate image tags with escaped literal unicode sequences (#4475)
* Add (unusable) DAG to decode and deduplicate tags * Only modify tags with escaped unicode sequences * Fix timeouts and pass param validation
1 parent 29c540c commit aef5a1b

File tree

4 files changed

+147
-3
lines changed

4 files changed

+147
-3
lines changed
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""
2+
See the issue for context and motivation: https://github.com/WordPress/openverse/issues/4452
3+
4+
This DAG triggers a run of the batched update DAG. It generates a new list of tags by
5+
decoding the escaped literal unicode sequences in tag names and re-inserting only the distinct tags of the resulting list of tags.
6+
7+
Only records before the CC Search -> Openverse transition are affected. As such, because all
8+
audio records are dated after that transition, we only need to scan images.
9+
"""
10+
11+
from datetime import datetime, timedelta
12+
from textwrap import dedent
13+
14+
from airflow.decorators import dag, task
15+
from airflow.models.abstractoperator import AbstractOperator
16+
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
17+
18+
from common.constants import DAG_DEFAULT_ARGS, POSTGRES_CONN_ID
19+
from common.sql import PostgresHook
20+
from database.batched_update.constants import DAG_ID as BATCHED_UPDATE_DAG_ID
21+
22+
23+
DAG_ID = "decode_and_deduplicate_image_tags"
24+
25+
HAS_RAW_ESCAPED_UNICODE = (
26+
r'(@.name like_regex "\\\\(x)([\\da-f]{2})|\\\\(u)([\\da-f]{4})" flag "i")'
27+
)
28+
29+
30+
@task
31+
def ensure_ov_unistr(
32+
postgres_conn_id: str = POSTGRES_CONN_ID,
33+
task: AbstractOperator = None,
34+
):
35+
"""
36+
Create a naïve implementation of Postgres 14+ ``unistr``.
37+
38+
We are on Postgres 13 and have to do without ``unistr``. For all intents and purposes,
39+
this implementation solves the problem for us.
40+
41+
The ``ov`` prefix prevents clashing with the built-in should we upgrade.
42+
"""
43+
44+
postgres = PostgresHook(
45+
postgres_conn_id=postgres_conn_id,
46+
default_statement_timeout=PostgresHook.get_execution_timeout(task),
47+
log_sql=True,
48+
)
49+
50+
return postgres.run(
51+
dedent(
52+
"""
53+
CREATE OR REPLACE FUNCTION ov_unistr (string text)
54+
RETURNS text
55+
AS $$
56+
return string.encode().decode("unicode_escape") if string else string
57+
$$ LANGUAGE plpython3u;
58+
"""
59+
)
60+
)
61+
62+
63+
@dag(
64+
dag_id=DAG_ID,
65+
schedule=None,
66+
start_date=datetime(2024, 6, 3),
67+
tags=["database"],
68+
doc_md=__doc__,
69+
max_active_runs=1,
70+
default_args=DAG_DEFAULT_ARGS,
71+
)
72+
def decode_and_deduplicate_image_tags():
73+
ensure_ov_unistr() >> TriggerDagRunOperator(
74+
task_id="trigger_batched_update",
75+
trigger_dag_id=BATCHED_UPDATE_DAG_ID,
76+
wait_for_completion=True,
77+
retries=0,
78+
conf={
79+
"query_id": DAG_ID,
80+
"table_name": "image",
81+
# jsonb_path_query_first will return null if the first argument is null,
82+
# and so is safe for tagless works
83+
"select_query": dedent(
84+
f"""
85+
WHERE jsonb_path_query_first(
86+
image.tags,
87+
'$[*] ? {HAS_RAW_ESCAPED_UNICODE}'
88+
) IS NOT NULL
89+
"""
90+
).strip(),
91+
"update_query": dedent(
92+
f"""
93+
SET updated_on = NOW(),
94+
tags = (
95+
SELECT jsonb_agg(deduplicated.tag) FROM (
96+
SELECT DISTINCT ON (all_tags.tag->'name', all_tags.tag->'provider')
97+
all_tags.tag tag
98+
FROM (
99+
SELECT
100+
jsonb_array_elements(
101+
separated_tags.no_escape || (
102+
SELECT jsonb_agg(
103+
jsonb_set(
104+
to_escape,
105+
'{{name}}',
106+
to_jsonb(ov_unistr(to_escape->>'name'))
107+
)
108+
) FROM jsonb_array_elements(separated_tags.needs_escape) AS to_escape
109+
)
110+
) tag
111+
FROM (
112+
SELECT
113+
jsonb_path_query_array(image.tags, '$[*] ? (!{HAS_RAW_ESCAPED_UNICODE})') no_escape,
114+
jsonb_path_query_array(image.tags, '$[*] ? {HAS_RAW_ESCAPED_UNICODE}') needs_escape
115+
) AS separated_tags
116+
) AS all_tags
117+
) AS deduplicated
118+
)
119+
"""
120+
).strip(),
121+
"update_timeout": int(timedelta(hours=10).total_seconds()),
122+
"dry_run": False,
123+
},
124+
)
125+
126+
127+
decode_and_deduplicate_image_tags()

catalog/dags/maintenance/trim_and_deduplicate_tags.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def trim_and_deduplicate_tags():
3232
task_id=DAG_ID,
3333
trigger_dag_id=BATCHED_UPDATE_DAG_ID,
3434
wait_for_completion=True,
35-
execution_timeout=timedelta(hours=5),
3635
max_active_tis_per_dag=2,
3736
map_index_template="""{{ task.conf['table_name'] }}""",
3837
retries=0,
@@ -56,7 +55,7 @@ def trim_and_deduplicate_tags():
5655
)
5756
),
5857
"update_query": (
59-
"SET updated_on = now(), "
58+
"SET updated_on = NOW(), "
6059
+ dedent(
6160
f"""
6261
tags = (
@@ -78,6 +77,7 @@ def trim_and_deduplicate_tags():
7877
"""
7978
)
8079
),
80+
"update_timeout": int(timedelta(hours=5).total_seconds()),
8181
"dry_run": False,
8282
}
8383
for media_type in MEDIA_TYPES

documentation/catalog/reference/DAGs.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ The following are DAGs grouped by their primary tag:
4848
| DAG ID | Schedule Interval |
4949
| -------------------------------------------------------------------------------------- | ----------------- |
5050
| [`batched_update`](#batched_update) | `None` |
51+
| [`decode_and_deduplicate_image_tags`](#decode_and_deduplicate_media_type_tags) | `None` |
5152
| [`delete_records`](#delete_records) | `None` |
5253
| [`recreate_full_staging_index`](#recreate_full_staging_index) | `None` |
5354
| [`recreate_audio_popularity_calculation`](#recreate_media_type_popularity_calculation) | `None` |
@@ -144,6 +145,7 @@ The following is documentation associated with each DAG (where available):
144145
1. [`create_new_production_es_index`](#create_new_environment_es_index)
145146
1. [`create_new_staging_es_index`](#create_new_environment_es_index)
146147
1. [`create_proportional_by_source_staging_index`](#create_proportional_by_source_staging_index)
148+
1. [`decode_and_deduplicate_image_tags`](#decode_and_deduplicate_media_type_tags)
147149
1. [`delete_records`](#delete_records)
148150
1. [`europeana_workflow`](#europeana_workflow)
149151
1. [`finnish_museums_workflow`](#finnish_museums_workflow)
@@ -575,6 +577,21 @@ However, it will fail immediately if any of the DAGs tagged as part of the
575577

576578
----
577579

580+
### `decode_and_deduplicate_{media_type}_tags`
581+
582+
See the issue for context and motivation:
583+
https://github.com/WordPress/openverse/issues/4452
584+
585+
This DAG triggers a run of the batched update DAG. It generates a new list of
586+
tags by decoding the escaped literal unicode sequences in tag names and re-inserting only the distinct tags of
587+
the resulting list of tags.
588+
589+
Only records before the CC Search -> Openverse transition are affected. As such,
590+
because all audio records are dated after that transition, we only need to scan
591+
images.
592+
593+
----
594+
578595
### `delete_records`
579596

580597
#### Delete Records DAG

sample_data/sample_image.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
identifier,created_on,updated_on,ingestion_type,provider,source,foreign_identifier,foreign_landing_url,url,thumbnail,width,height,filesize,license,license_version,creator,creator_url,title,meta_data,tags,watermarked,last_synced_with_source,removed_from_source,filetype,category,standardized_popularity
22
0e3315c5-3328-4a99-80ab-567ac32f685f,2022-12-21 17:29:54.000000+00,2022-12-21 17:29:54.000000+00,provider_api,flickr,flickr,51745822704,https://www.flickr.com/photos/54633257@N04/51745822704,https://live.staticflickr.com/65535/51745822704_ae97226e20_b.jpg,https://live.staticflickr.com/65535/51745822704_ae97226e20_m.jpg,433,1024,97150,by-nc-sa,2.0,mexicofist3,https://www.flickr.com/photos/54633257@N04,my dress is not very short,"{""views"": ""5991"", ""pub_date"": ""1639443671"", ""date_taken"": ""2021-07-20 18:19:25"", ""description"": ""dressed"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-sa/2.0/"", ""raw_license_url"": null}",,f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
33
b840de61-fb9d-4ec5-9572-8d778875869f,2022-03-08 14:32:25.000000+00,2022-03-08 14:32:25.000000+00,provider_api,flickr,flickr,51745349583,https://www.flickr.com/photos/129684398@N07/51745349583,https://live.staticflickr.com/65535/51745349583_cebf1fa6e0_b.jpg,https://live.staticflickr.com/65535/51745349583_cebf1fa6e0_m.jpg,1024,713,171956,by-nc,2.0,clairetresse,https://www.flickr.com/photos/129684398@N07,Barranco de las bodegas,"{""views"": ""4156"", ""pub_date"": ""1639440673"", ""date_taken"": ""2021-10-28 10:54:43"", ""description"": ""Descente dans un des Barranco, descente prudente parce que ça glisse. L’érosion des sols est extrêmement puissante, les reliefs sont sculptés par les forces de la nature telles que le vente les pluies . Descent in one of the Barranco, careful descent because it slips. The erosion of the soil is extremely powerful, the reliefs are sculpted by the forces of nature such as the rains."", ""license_url"": ""https://creativecommons.org/licenses/by-nc/2.0/"", ""raw_license_url"": null}","[{""name"": ""bardenas"", ""provider"": ""flickr""}, {""name"": ""barranco"", ""provider"": ""flickr""}, {""name"": ""desert"", ""provider"": ""flickr""}, {""name"": ""espagne"", ""provider"": ""flickr""}, {""name"": ""glaise"", ""provider"": ""flickr""}, {""name"": ""navarre"", ""provider"": ""flickr""}, {""name"": ""pyrennees"", ""provider"": ""flickr""}, {""name"": ""ravin"", ""provider"": ""flickr""}, {""name"": ""roches"", ""provider"": ""flickr""}, {""name"": ""water"", ""provider"": ""flickr""}, {""name"": ""lake"", ""provider"": ""machine_example"", ""accuracy"": 0.95}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
4-
aeba0547-61da-42ee-b561-27c8fc817d5a,2022-07-16 05:51:03.000000+00,2022-07-16 05:51:03.000000+00,provider_api,flickr,flickr,51745788239,https://www.flickr.com/photos/88123769@N02/51745788239,https://live.staticflickr.com/65535/51745788239_b645ce02fe_b.jpg,https://live.staticflickr.com/65535/51745788239_b645ce02fe_m.jpg,1024,602,281512,pdm,1.0,Bernard Spragg,https://www.flickr.com/photos/88123769@N02,Alone on the prairie.,"{""views"": ""1779"", ""pub_date"": ""1639441826"", ""date_taken"": ""2017-09-26 11:39:16"", ""description"": ""The Canadian Prairies (usually referred to as simply the Prairies in Canada) is a region in Western Canada. It includes the Canadian portion of the Great Plains and the Prairie Provinces, namely Alberta, Saskatchewan, and Manitoba.These provinces are partially covered by grasslands, plains, and lowlands, mostly in the southern regions."", ""license_url"": ""https://creativecommons.org/publicdomain/mark/1.0/"", ""raw_license_url"": null}","[{""name"": ""alberta"", ""provider"": ""flickr""}, {""name"": ""alone"", ""provider"": ""flickr""}, {""name"": ""canada"", ""provider"": ""flickr""}, {""name"": ""evening"", ""provider"": ""flickr""}, {""name"": ""house"", ""provider"": ""flickr""}, {""name"": ""landscape"", ""provider"": ""flickr""}, {""name"": ""lumixfz1000"", ""provider"": ""flickr""}, {""name"": ""old"", ""provider"": ""flickr""}, {""name"": ""outside"", ""provider"": ""flickr""}, {""name"": ""prairie"", ""provider"": ""flickr""}, {""name"": ""rural"", ""provider"": ""flickr""}, {""name"": ""scenery"", ""provider"": ""flickr""}, {""name"": ""sky"", ""provider"": ""flickr""}, {""name"": ""travel"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
4+
aeba0547-61da-42ee-b561-27c8fc817d5a,2022-07-16 05:51:03.000000+00,2022-07-16 05:51:03.000000+00,provider_api,flickr,flickr,51745788239,https://www.flickr.com/photos/88123769@N02/51745788239,https://live.staticflickr.com/65535/51745788239_b645ce02fe_b.jpg,https://live.staticflickr.com/65535/51745788239_b645ce02fe_m.jpg,1024,602,281512,pdm,1.0,Bernard Spragg,https://www.flickr.com/photos/88123769@N02,Alone on the prairie.,"{""views"": ""1779"", ""pub_date"": ""1639441826"", ""date_taken"": ""2017-09-26 11:39:16"", ""description"": ""The Canadian Prairies (usually referred to as simply the Prairies in Canada) is a region in Western Canada. It includes the Canadian portion of the Great Plains and the Prairie Provinces, namely Alberta, Saskatchewan, and Manitoba.These provinces are partially covered by grasslands, plains, and lowlands, mostly in the southern regions."", ""license_url"": ""https://creativecommons.org/publicdomain/mark/1.0/"", ""raw_license_url"": null}","[{""name"": ""alberta"", ""provider"": ""flickr""}, {""name"": ""alone"", ""provider"": ""flickr""}, {""name"": ""ciudaddelassiencias"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""recognition"", ""accuracy"": 0.96}, {""name"": ""uploaded by me"", ""provider"": ""flickr""}, {}, {""name"": ""unknown"", ""provider"": ""recognition"", ""accuracy"": 0.86}, {""name"": ""mus\\xe9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""musu00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00E9o"", ""provider"": ""flickr""}, {""name"": ""canada"", ""provider"": ""flickr""}, {""name"": ""evening"", ""provider"": ""flickr""}, {""name"": ""house"", ""provider"": ""flickr""}, {""name"": ""landscape"", ""provider"": ""flickr""}, {""name"": ""lumixfz1000"", ""provider"": ""flickr""}, {""name"": ""old"", ""provider"": ""flickr""}, {""name"": 
""outside"", ""provider"": ""flickr""}, {""name"": ""prairie"", ""provider"": ""flickr""}, {""name"": ""rural"", ""provider"": ""flickr""}, {""name"": ""scenery"", ""provider"": ""flickr""}, {""name"": ""sky"", ""provider"": ""flickr""}, {""name"": ""travel"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
55
3c98150c-51a8-4175-a47f-acef10e784f7,2022-06-10 09:14:13.000000+00,2022-06-10 09:14:13.000000+00,provider_api,flickr,flickr,51747927224,https://www.flickr.com/photos/151325871@N07/51747927224,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93.jpg,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93_m.jpg,318,500,53633,cc0,1.0,lyndawaybi3,https://www.flickr.com/photos/151325871@N07,Naughty Little Elf,"{""views"": ""1342"", ""pub_date"": ""1639526583"", ""date_taken"": ""2021-12-14 16:02:55"", ""license_url"": ""https://creativecommons.org/publicdomain/zero/1.0/""}","[{""name"": ""babe"", ""provider"": ""flickr""}, {""name"": ""bi"", ""provider"": ""flickr""}, {""name"": ""brunette"", ""provider"": ""flickr""}, {""name"": ""chick"", ""provider"": ""flickr""}, {""name"": ""christmas"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""elf"", ""provider"": ""flickr""}, {""name"": ""great"", ""provider"": ""flickr""}, {""name"": ""hot"", ""provider"": ""flickr""}, {""name"": ""hotwife"", ""provider"": ""flickr""}, {""name"": ""leggings"", ""provider"": ""flickr""}, {""name"": ""legs"", ""provider"": ""flickr""}, {""name"": ""lynda"", ""provider"": ""flickr""}, {""name"": ""married"", ""provider"": ""flickr""}, {""name"": ""milf"", ""provider"": ""flickr""}, {""name"": ""mini"", ""provider"": ""flickr""}, {""name"": ""mom"", ""provider"": ""flickr""}, {""name"": ""nylons"", ""provider"": ""flickr""}, {""name"": ""panyhose"", ""provider"": ""flickr""}, {""name"": ""season"", ""provider"": ""flickr""}, {""name"": ""sexy"", ""provider"": ""flickr""}, {""name"": ""short"", ""provider"": ""flickr""}, {""name"": ""skirt"", ""provider"": ""flickr""}, {""name"": ""stockings"", ""provider"": ""flickr""}, {""name"": ""sweater"", ""provider"": ""flickr""}, {""name"": ""wife"", ""provider"": ""flickr""}, {""name"": ""young"", ""provider"": ""flickr""}]",f,2021-12-15 22:19:02.971943+00,f,jpg,photograph,
66
cdbd3bf6-1745-45bb-b399-61ee149cd58a,2022-12-28 15:41:34.000000+00,2022-12-28 15:41:34.000000+00,provider_api,flickr,flickr,51745389858,https://www.flickr.com/photos/126744325@N07/51745389858,https://live.staticflickr.com/65535/51745389858_c10358e1a3_b.jpg,https://live.staticflickr.com/65535/51745389858_c10358e1a3_m.jpg,1024,683,157497,by,2.0,Kristoffer Trolle,https://www.flickr.com/photos/126744325@N07,Train area in Copenhagen South / Tog område i Syd København,"{""views"": ""1337"", ""pub_date"": ""1639441947"", ""date_taken"": ""2021-07-14 23:49:46"", ""description"": ""This old train area in Copenhagen South will soon be transformed into a residential area. I love to go there and take photos. I used a Tiffen Black Pro Mist 1/4 filter for this photo, it gives that diffused highlights look, read more about it on my blog here . The photo is Creative Commons license: Use it for free. Keywords: train, tog, DSB, område, syd, København, south, Copenhagen, Danmark, Denmark, Fujifilm X-H1, Fujifilm XF 35mm f2 R WR, Tiffen Black Pro-Mist 1/4 filter"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""copenhagen"", ""provider"": ""flickr""}, {""name"": ""danmark"", ""provider"": ""flickr""}, {""name"": ""denmark"", ""provider"": ""flickr""}, {""name"": ""dsb"", ""provider"": ""flickr""}, {""name"": ""fujifilmxf35mmf2rwr"", ""provider"": ""flickr""}, {""name"": ""fujifilmxh1"", ""provider"": ""flickr""}, {""name"": ""københavn"", ""provider"": ""flickr""}, {""name"": ""område"", ""provider"": ""flickr""}, {""name"": ""south"", ""provider"": ""flickr""}, {""name"": ""syd"", ""provider"": ""flickr""}, {""name"": ""tiffenblackpromist14filter"", ""provider"": ""flickr""}, {""name"": ""tog"", ""provider"": ""flickr""}, {""name"": ""train"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
77
a3583692-349d-4ab7-8649-dfb6ab25a9a6,2022-05-10 05:38:53.000000+00,2022-05-10 05:38:53.000000+00,provider_api,flickr,flickr,51748188420,https://www.flickr.com/photos/38959360@N07/51748188420,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_b.jpg,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_m.jpg,579,1024,131055,by,2.0,...Amame hasta con los dientes....,https://www.flickr.com/photos/38959360@N07,Say you'll see me again....,"{""views"": ""1188"", ""pub_date"": ""1639527704"", ""date_taken"": ""2021-12-14 19:21:36"", ""description"": ""♡ Click Here for Details., Credits ♡and More Photos ♡ ♡ My Facebook ♡ My Instagram"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""altier"", ""provider"": ""flickr""}, {""name"": ""blog"", ""provider"": ""flickr""}, {""name"": ""blogger"", ""provider"": ""flickr""}, {""name"": ""cute"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""event"", ""provider"": ""flickr""}, {""name"": ""gown"", ""provider"": ""flickr""}, {""name"": ""hollyhood"", ""provider"": ""flickr""}, {""name"": ""kaya"", ""provider"": ""flickr""}, {""name"": ""kimo"", ""provider"": ""flickr""}, {""name"": ""log"", ""provider"": ""flickr""}, {""name"": ""maitreya"", ""provider"": ""flickr""}, {""name"": ""nightdress"", ""provider"": ""flickr""}, {""name"": ""nitedress"", ""provider"": ""flickr""}, {""name"": ""thirsty"", ""provider"": ""flickr""}, {""name"": ""versa"", ""provider"": ""flickr""}]",f,2021-12-15 22:14:02.778385+00,f,jpg,photograph,

0 commit comments

Comments
 (0)