Skip to content

Commit 227569d

Browse files
committed
Decode and deduplicate tags during ingestion cleanup
Signed-off-by: Olga Bulat <obulat@gmail.com>
1 parent c94e0f3 commit 227569d

File tree

3 files changed

+82
-12
lines changed

3 files changed

+82
-12
lines changed

ingestion_server/ingestion_server/cleanup.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from ingestion_server.db_helpers import database_connect
1919
from ingestion_server.indexer import DB_BUFFER_SIZE
20+
from ingestion_server.strings import decode_data, deduplicate_tags
2021

2122

2223
# Number of records to buffer in memory at once
@@ -112,6 +113,22 @@ def cleanup_url(url, tls_support):
112113
else:
113114
return None
114115

116+
@staticmethod
117+
def deduplicate_tags(tags: list[dict]) -> list[dict]:
118+
"""
119+
Remove tags that have the same name and provider.
120+
Not comparing accuracy here: if the tags have different accuracy values,
121+
they will also have different provider values (e.g., `clarifai` vs `flickr`).
122+
"""
123+
seen = set()
124+
unique_tags = []
125+
for i, tag in enumerate(tags):
126+
tag_tuple = (tag["name"], tag.get("provider"))
127+
if tag_tuple not in seen:
128+
seen.add(tag_tuple)
129+
unique_tags.append(tag)
130+
return unique_tags
131+
115132
@staticmethod
116133
def cleanup_tags(tags):
117134
"""
@@ -125,19 +142,25 @@ def cleanup_tags(tags):
125142
if not tags:
126143
return None
127144
for tag in tags:
128-
below_threshold = False
129145
if "accuracy" in tag and float(tag["accuracy"]) < TAG_MIN_CONFIDENCE:
130-
below_threshold = True
131-
if "name" in tag and isinstance(tag["name"], str):
132-
lower_tag = tag["name"].lower()
133-
should_filter = _tag_denylisted(lower_tag) or below_threshold
134-
else:
135-
log.warning(f'Filtering malformed tag "{tag}" in "{tags}"')
136-
should_filter = True
137-
if should_filter:
138146
update_required = True
139-
else:
140-
tag_output.append(tag)
147+
continue
148+
if "name" not in tag or not isinstance(tag["name"], str):
149+
update_required = True
150+
continue
151+
decoded_tag_name = decode_data(tag["name"])
152+
if _tag_denylisted(decoded_tag_name.lower()):
153+
update_required = True
154+
continue
155+
if decoded_tag_name != tag["name"]:
156+
update_required = True
157+
tag["name"] = decoded_tag_name
158+
tag_output.append(tag)
159+
160+
deduplicated_tags = deduplicate_tags(tag_output)
161+
if len(deduplicated_tags) != len(tag_output):
162+
update_required = True
163+
tag_output = deduplicated_tags
141164

142165
if update_required:
143166
fragment = Json(tag_output)
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import logging
2+
import re
3+
from urllib.parse import quote, unquote
4+
5+
6+
# Matches a literal backslash followed by `xHH` or `uHHHH` (hex digits in
# either case). The first alternative fills groups 1-2, the second groups 3-4.
DOUBLE_BACKSLASH_ESCAPE = re.compile(
    r"\\(x)([\da-f]{2})|\\(u)([\da-f]{4})",
    flags=re.IGNORECASE,
)
# Matches `uHHHH` with no leading backslash; group 1 is the `u` prefix and
# group 2 holds the four hex digits.
NO_BACKSLASH_ESCAPE = re.compile(r"(u)([\da-f]{4})", flags=re.IGNORECASE)
10+
11+
12+
def convert_grp(grp: str) -> str | None:
13+
"""
14+
Convert a hex value into a character. Return None if the conversion results in
15+
a character that cannot be used as a URI component.
16+
"""
17+
try:
18+
converted = chr(int(grp, 16))
19+
# Decoded strings should be usable as URI components
20+
quote(converted)
21+
return converted
22+
except UnicodeEncodeError:
23+
return None
24+
25+
26+
def decode_data(data: str | None = "") -> str:
    r"""
    Decode escaped characters in ``data`` and return the cleaned-up string.

    Three passes are applied in order:

    1. If ``data`` contains a backslash escape (``\xHH`` or ``\uHHHH``), the
       whole string is run through the ``unicode_escape`` codec.
    2. ``uHHHH`` sequences that lost their backslash are replaced with the
       character they encode, when ``convert_grp`` accepts that character.
    3. Percent-encoded sequences are decoded with ``unquote``.

    :param data: the string to decode; ``None`` and ``""`` are accepted.
    :return: the decoded string, or ``""`` when ``data`` is empty or ``None``.
    """
    if not data:
        return ""

    def replace_func(match):
        """Replace the matched group with the converted character if possible, otherwise return the original string."""
        prefix, grp = match.groups()
        if converted := convert_grp(grp):
            return converted
        return f"{prefix}{grp}"

    # Handle characters encoded with double backslashes
    if DOUBLE_BACKSLASH_ESCAPE.search(data):
        try:
            # `unicode_escape` interprets every input byte as Latin-1, so the
            # text must not be encoded as UTF-8 first: that would turn an
            # already-decoded "é" into "Ã©". Encoding as Latin-1 keeps
            # characters below U+0100 intact, and `backslashreplace` rewrites
            # everything else as an escape that the codec decodes back.
            data = data.encode("latin-1", "backslashreplace").decode(
                "unicode_escape"
            )
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Best effort: keep the original text when it cannot be decoded.
            logging.debug(f"Failed to decode data with double backslash: {data}")

    # Handle characters encoded without backslashes
    # NOTE(review): the pattern also matches ordinary text in which a "u" is
    # followed by four hex digits — confirm this is acceptable for tag names.
    data = NO_BACKSLASH_ESCAPE.sub(replace_func, data)

    return unquote(data)

sample_data/sample_image.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ aeba0547-61da-42ee-b561-27c8fc817d5a,2022-07-16 05:51:03.000000+00,2022-07-16 05
55
3c98150c-51a8-4175-a47f-acef10e784f7,2022-06-10 09:14:13.000000+00,2022-06-10 09:14:13.000000+00,provider_api,flickr,flickr,51747927224,https://www.flickr.com/photos/151325871@N07/51747927224,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93.jpg,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93_m.jpg,318,500,53633,cc0,1.0,lyndawaybi3,https://www.flickr.com/photos/151325871@N07,Naughty Little Elf,"{""views"": ""1342"", ""pub_date"": ""1639526583"", ""date_taken"": ""2021-12-14 16:02:55"", ""license_url"": ""https://creativecommons.org/publicdomain/zero/1.0/""}","[{""name"": ""babe"", ""provider"": ""flickr""}, {""name"": ""bi"", ""provider"": ""flickr""}, {""name"": ""brunette"", ""provider"": ""flickr""}, {""name"": ""chick"", ""provider"": ""flickr""}, {""name"": ""christmas"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""elf"", ""provider"": ""flickr""}, {""name"": ""great"", ""provider"": ""flickr""}, {""name"": ""hot"", ""provider"": ""flickr""}, {""name"": ""hotwife"", ""provider"": ""flickr""}, {""name"": ""leggings"", ""provider"": ""flickr""}, {""name"": ""legs"", ""provider"": ""flickr""}, {""name"": ""lynda"", ""provider"": ""flickr""}, {""name"": ""married"", ""provider"": ""flickr""}, {""name"": ""milf"", ""provider"": ""flickr""}, {""name"": ""mini"", ""provider"": ""flickr""}, {""name"": ""mom"", ""provider"": ""flickr""}, {""name"": ""nylons"", ""provider"": ""flickr""}, {""name"": ""panyhose"", ""provider"": ""flickr""}, {""name"": ""season"", ""provider"": ""flickr""}, {""name"": ""sexy"", ""provider"": ""flickr""}, {""name"": ""short"", ""provider"": ""flickr""}, {""name"": ""skirt"", ""provider"": ""flickr""}, {""name"": ""stockings"", ""provider"": ""flickr""}, {""name"": ""sweater"", ""provider"": ""flickr""}, {""name"": ""wife"", ""provider"": ""flickr""}, {""name"": ""young"", ""provider"": ""flickr""}]",f,2021-12-15 22:19:02.971943+00,f,jpg,photograph,
66
cdbd3bf6-1745-45bb-b399-61ee149cd58a,2022-12-28 15:41:34.000000+00,2022-12-28 15:41:34.000000+00,provider_api,flickr,flickr,51745389858,https://www.flickr.com/photos/126744325@N07/51745389858,https://live.staticflickr.com/65535/51745389858_c10358e1a3_b.jpg,https://live.staticflickr.com/65535/51745389858_c10358e1a3_m.jpg,1024,683,157497,by,2.0,Kristoffer Trolle,https://www.flickr.com/photos/126744325@N07,Train area in Copenhagen South / Tog område i Syd København,"{""views"": ""1337"", ""pub_date"": ""1639441947"", ""date_taken"": ""2021-07-14 23:49:46"", ""description"": ""This old train area in Copenhagen South will soon be transformed into a residential area. I love to go there and take photos. I used a Tiffen Black Pro Mist 1/4 filter for this photo, it gives that diffused highlights look, read more about it on my blog here . The photo is Creative Commons license: Use it for free. Keywords: train, tog, DSB, område, syd, København, south, Copenhagen, Danmark, Denmark, Fujifilm X-H1, Fujifilm XF 35mm f2 R WR, Tiffen Black Pro-Mist 1/4 filter"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""copenhagen"", ""provider"": ""flickr""}, {""name"": ""danmark"", ""provider"": ""flickr""}, {""name"": ""denmark"", ""provider"": ""flickr""}, {""name"": ""dsb"", ""provider"": ""flickr""}, {""name"": ""fujifilmxf35mmf2rwr"", ""provider"": ""flickr""}, {""name"": ""fujifilmxh1"", ""provider"": ""flickr""}, {""name"": ""københavn"", ""provider"": ""flickr""}, {""name"": ""område"", ""provider"": ""flickr""}, {""name"": ""south"", ""provider"": ""flickr""}, {""name"": ""syd"", ""provider"": ""flickr""}, {""name"": ""tiffenblackpromist14filter"", ""provider"": ""flickr""}, {""name"": ""tog"", ""provider"": ""flickr""}, {""name"": ""train"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
77
a3583692-349d-4ab7-8649-dfb6ab25a9a6,2022-05-10 05:38:53.000000+00,2022-05-10 05:38:53.000000+00,provider_api,flickr,flickr,51748188420,https://www.flickr.com/photos/38959360@N07/51748188420,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_b.jpg,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_m.jpg,579,1024,131055,by,2.0,...Amame hasta con los dientes....,https://www.flickr.com/photos/38959360@N07,Say you'll see me again....,"{""views"": ""1188"", ""pub_date"": ""1639527704"", ""date_taken"": ""2021-12-14 19:21:36"", ""description"": ""♡ Click Here for Details., Credits ♡and More Photos ♡ ♡ My Facebook ♡ My Instagram"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""altier"", ""provider"": ""flickr""}, {""name"": ""blog"", ""provider"": ""flickr""}, {""name"": ""blogger"", ""provider"": ""flickr""}, {""name"": ""cute"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""event"", ""provider"": ""flickr""}, {""name"": ""gown"", ""provider"": ""flickr""}, {""name"": ""hollyhood"", ""provider"": ""flickr""}, {""name"": ""kaya"", ""provider"": ""flickr""}, {""name"": ""kimo"", ""provider"": ""flickr""}, {""name"": ""log"", ""provider"": ""flickr""}, {""name"": ""maitreya"", ""provider"": ""flickr""}, {""name"": ""nightdress"", ""provider"": ""flickr""}, {""name"": ""nitedress"", ""provider"": ""flickr""}, {""name"": ""thirsty"", ""provider"": ""flickr""}, {""name"": ""versa"", ""provider"": ""flickr""}]",f,2021-12-15 22:14:02.778385+00,f,jpg,photograph,
8-
94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""beatiful"", ""provider"": ""flickr""}, {""name"": ""gmaster"", ""provider"": ""flickr""}, {""name"": ""model"", ""provider"": ""flickr""}, {""name"": ""portrait"", ""provider"": ""flickr""}, {""name"": ""russia"", ""provider"": ""flickr""}, {""name"": ""russianmodel"", ""provider"": ""flickr""}, {""name"": ""sony"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
8+
94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""ciudaddelassiencias"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""recognition"", ""accuracy"": 0.96}, {""name"": ""uploaded by me"", ""provider"": ""flickr""}, {}, {""name"": ""unknown"", ""provider"": ""recognition"", ""accuracy"": 0.86}, {""name"": ""mus\\xe9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""musu00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00E9o"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
99
4e8fff2c-5e81-4f1f-8cda-fa29d3dcef6c,2022-09-21 15:36:29.000000+00,2022-09-21 15:36:29.000000+00,provider_api,flickr,flickr,51745392113,https://www.flickr.com/photos/23465276@N04/51745392113,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_b.jpg,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_m.jpg,1024,972,143647,by-nc-nd,2.0,Antonio Marín Segovia,https://www.flickr.com/photos/23465276@N04,Marilyn Monroe: la poeta que se convirtió en sex symbol,"{""views"": ""828"", ""pub_date"": ""1639442005"", ""date_taken"": ""2021-12-14 01:33:21"", ""description"": ""Marilyn Monroe: la poeta que se convirtió en sex symbol ¿Qué hizo de Marilyn Monroe un rostro perdurable tan conocido como La Gioconda, un icono transgeneracional, una leyenda viva? Por qué después de medio siglo, a diferencia de muchos de sus contemporáneos su imagen sigue siendo tan actual? Quizá porque Marilyn Monroe no sólo fue bella, ni sólo fue sexy, ni sólo fue inteligente. Quizá porque fue todo eso y una rubia boba en sus personajes y una mujer con intensa curiosidad crítica que resistió los embates del macartismo y su cacería de brujas, que quiso modificar y modificó su vida y su mundo que fue Hollywood (esa industria que devora y fabrica imágenes como ganado a decir de Hitchcock) y el circuito de la alta política que como en la época de los Kennedy inventa y desecha personajes. De que modificó su vida no cabe duda. Pasar su infancia en cinco o seis hogares de refugio y un orfanato y llegar a las fiestas de los Kennedy no es cosa fácil; y crear sus reglas y legislación propias en un mundo esclerotizado por las formas de la política donde caravanas y genuflexiones son el santo y seña de la sobrevivencia, tampoco es algo que resulte sencillo. Randdy Taraborrelli en La vida secreta de Marilyn Monroe rescata un momento que describe muy bien cómo se manejaba con los personajes de la Casa Blanca. 
Resumo su relato: En febrero de 1962 invitaron a Marilyn Monroe a una cena en honor del presidente Kennedy. La cena era a las ocho y a las siete treinta un automóvil pasó por ella. Marilyn, por supuesto no estaba lista. Según su mucama aún no sabía qué vestido ponerse y su estilista Kenneth Battelle estaba tratando de peinarla. A las ocho el asistente personal de Kennedy regresó a la fiesta y mandó una limusina por ella que llegó 15 minutos después. Milt Ebbins, el encargado de llevarla, a las 8:45 seguía esperando. Presionado telefónicamente por el asistente de Kennedy, a las nueve Ebbins entró a la habitación y encontró a Marilyn totalmente desnuda aunque con z"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}",,f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
1010
28d4a996-1c98-4a7e-893f-fed2aefdc6af,2022-06-03 19:28:02.000000+00,2022-06-03 19:28:02.000000+00,provider_api,flickr,flickr,51745124976,https://www.flickr.com/photos/109715245@N06/51745124976,https://live.staticflickr.com/65535/51745124976_0456ba00ee_b.jpg,https://live.staticflickr.com/65535/51745124976_0456ba00ee_m.jpg,768,1024,150344,by-nd,2.0,paaddor,https://www.flickr.com/photos/109715245@N06,A Narrow Alley Downtown Zurich,"{""views"": ""813"", ""pub_date"": ""1639440834"", ""date_taken"": ""2020-09-02 12:57:01"", ""description"": ""Where I come from is Zurich, the biggest city in Switzerland. The city was founded in its present form at the end of the Middle Ages, between 1000 and 1200, and gained imperial freedom in the 13th century. Many narrow alleys, like this one, bear witness to the medieval architecture."", ""license_url"": ""https://creativecommons.org/licenses/by-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""alley"", ""provider"": ""flickr""}, {""name"": ""architecture"", ""provider"": ""flickr""}, {""name"": ""blur"", ""provider"": ""flickr""}, {""name"": ""blurred"", ""provider"": ""flickr""}, {""name"": ""city"", ""provider"": ""flickr""}, {""name"": ""flickr"", ""provider"": ""flickr""}, {""name"": ""houses"", ""provider"": ""flickr""}, {""name"": ""icm"", ""provider"": ""flickr""}, {""name"": ""people"", ""provider"": ""flickr""}, {""name"": ""reflection"", ""provider"": ""flickr""}, {""name"": ""switzerland"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
1111
aedab569-c886-419e-aa36-0cae5489eff3,2022-03-18 07:40:46.000000+00,2022-03-18 07:40:46.000000+00,provider_api,flickr,flickr,51746011725,https://www.flickr.com/photos/120313817@N08/51746011725,https://live.staticflickr.com/65535/51746011725_0e43599f2e_b.jpg,https://live.staticflickr.com/65535/51746011725_0e43599f2e_m.jpg,1024,576,145225,by-nc-sa,2.0,IndiaTrue,https://www.flickr.com/photos/120313817@N08,♥ Hunt Dolls♥,"{""views"": ""682"", ""pub_date"": ""1639441112"", ""date_taken"": ""2021-12-13 16:15:26"", ""description"": ""✨ @MYBlog ✨@Cynful Winter Baby Lingerie @Equal 10 EVENT ✨@[Glam Affair] Lizy Skin [Lelutka EvoX] ✨@VELOUR: 'FRIDAY' LIPSTICK COLLECTION (EVO X BOM)GIFT ✨@LeLUTKA Kaia Head 3.1 GIFT GROUP ✨@Legacy Body ✨@Maria'S Shape Lelutka Evox ✨@TRUTH Adore Xmas - Essential"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-sa/2.0/"", ""raw_license_url"": null}","[{""name"": ""cynful"", ""provider"": ""flickr""}, {""name"": ""glamaffair"", ""provider"": ""flickr""}, {""name"": ""legacy"", ""provider"": ""flickr""}, {""name"": ""lelutkaevox"", ""provider"": ""flickr""}, {""name"": ""marias"", ""provider"": ""flickr""}, {""name"": ""truth"", ""provider"": ""flickr""}, {""name"": ""velour"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,

0 commit comments

Comments
 (0)