Skip to content

Commit

Permalink
Decode and deduplicate tags during ingestion cleanup
Browse files Browse the repository at this point in the history
Signed-off-by: Olga Bulat <obulat@gmail.com>
  • Loading branch information
obulat committed Apr 17, 2024
1 parent 797b32c commit 368e00d
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 12 deletions.
45 changes: 34 additions & 11 deletions ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from ingestion_server.db_helpers import database_connect
from ingestion_server.indexer import DB_BUFFER_SIZE
from ingestion_server.strings import decode_data, deduplicate_tags


# Number of records to buffer in memory at once
Expand Down Expand Up @@ -112,6 +113,22 @@ def cleanup_url(url, tls_support):
else:
return None

@staticmethod
def deduplicate_tags(tags: list[dict]) -> list[dict]:
    """
    Remove tags that have the same name and provider.

    Not comparing accuracy here: if the tags have different accuracy values,
    they will also have different provider values (e.g., `clarifai` vs `flickr`).

    The first occurrence of each ``(name, provider)`` pair is kept and the
    relative order of the remaining tags is preserved. A tag without a
    ``provider`` key is treated as having provider ``None``.

    :param tags: tag dictionaries; every tag must have a ``name`` key.
    :return: a new list with the unique tags; ``tags`` itself is not modified.
    :raises KeyError: if a tag has no ``name`` key.
    """
    seen = set()
    unique_tags = []
    # The position of a tag is irrelevant, so iterate over the tags directly.
    for tag in tags:
        tag_tuple = (tag["name"], tag.get("provider"))
        if tag_tuple not in seen:
            seen.add(tag_tuple)
            unique_tags.append(tag)
    return unique_tags

@staticmethod
def cleanup_tags(tags):
"""
Expand All @@ -125,19 +142,25 @@ def cleanup_tags(tags):
if not tags:
return None
for tag in tags:
below_threshold = False
if "accuracy" in tag and float(tag["accuracy"]) < TAG_MIN_CONFIDENCE:
below_threshold = True
if "name" in tag and isinstance(tag["name"], str):
lower_tag = tag["name"].lower()
should_filter = _tag_denylisted(lower_tag) or below_threshold
else:
log.warning(f'Filtering malformed tag "{tag}" in "{tags}"')
should_filter = True
if should_filter:
update_required = True
else:
tag_output.append(tag)
continue
if "name" not in tag or not isinstance(tag["name"], str):
update_required = True
continue
decoded_tag_name = decode_data(tag["name"])
if _tag_denylisted(decoded_tag_name.lower()):
update_required = True
continue
if decoded_tag_name != tag["name"]:
update_required = True
tag["name"] = decoded_tag_name
tag_output.append(tag)

deduplicated_tags = deduplicate_tags(tag_output)
if len(deduplicated_tags) != len(tag_output):
update_required = True
tag_output = deduplicated_tags

if update_required:
fragment = Json(tag_output)
Expand Down
47 changes: 47 additions & 0 deletions ingestion_server/ingestion_server/strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import logging
import re
from urllib.parse import quote, unquote


# Matches an escape that kept its backslash: a literal backslash followed by
# either `x` and two hex digits or `u` and four hex digits, in any letter case.
# Each alternative has its own (prefix, digits) pair of capture groups.
DOUBLE_BACKSLASH_ESCAPE = re.compile(
    r"\\(x)([\da-f]{2})|\\(u)([\da-f]{4})",
    flags=re.IGNORECASE,
)

# Matches an escape that lost its backslash: `u` followed by four hex digits,
# in any letter case. Captures the prefix and the digits separately.
NO_BACKSLASH_ESCAPE = re.compile(
    r"(u)([\da-f]{4})",
    flags=re.IGNORECASE,
)


def convert_grp(grp: str) -> str | None:
"""
Convert a hex value into a character. Return None if the conversion results in
a character that cannot be used as a URI component.
"""
try:
converted = chr(int(grp, 16))
# Decoded strings should be usable as URI components
quote(converted)
return converted
except UnicodeEncodeError:
return None


def decode_data(data: str | None = "") -> str:
if not data:
return ""

def replace_func(match):
"""Replace the matched group with the converted character if possible, otherwise return the original string."""
prefix, grp = match.groups()
if converted := convert_grp(grp):
return converted
return f"{prefix}{grp}"

# Handle characters encoded with double backslashes
if DOUBLE_BACKSLASH_ESCAPE.search(data):
try:
decoded_data = data.encode().decode("unicode_escape")
data = decoded_data
except (UnicodeDecodeError, UnicodeEncodeError):
logging.debug(f"Failed to decode data with double backslash: {data}")
# Handle characters encoded without backslashes
data = re.sub(NO_BACKSLASH_ESCAPE, replace_func, data)

return unquote(data)
2 changes: 1 addition & 1 deletion sample_data/sample_image.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ aeba0547-61da-42ee-b561-27c8fc817d5a,2022-07-16 05:51:03.000000+00,2022-07-16 05
3c98150c-51a8-4175-a47f-acef10e784f7,2022-06-10 09:14:13.000000+00,2022-06-10 09:14:13.000000+00,provider_api,flickr,flickr,51747927224,https://www.flickr.com/photos/151325871@N07/51747927224,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93.jpg,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93_m.jpg,318,500,53633,cc0,1.0,lyndawaybi3,https://www.flickr.com/photos/151325871@N07,Naughty Little Elf,"{""views"": ""1342"", ""pub_date"": ""1639526583"", ""date_taken"": ""2021-12-14 16:02:55"", ""license_url"": ""https://creativecommons.org/publicdomain/zero/1.0/""}","[{""name"": ""babe"", ""provider"": ""flickr""}, {""name"": ""bi"", ""provider"": ""flickr""}, {""name"": ""brunette"", ""provider"": ""flickr""}, {""name"": ""chick"", ""provider"": ""flickr""}, {""name"": ""christmas"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""elf"", ""provider"": ""flickr""}, {""name"": ""great"", ""provider"": ""flickr""}, {""name"": ""hot"", ""provider"": ""flickr""}, {""name"": ""hotwife"", ""provider"": ""flickr""}, {""name"": ""leggings"", ""provider"": ""flickr""}, {""name"": ""legs"", ""provider"": ""flickr""}, {""name"": ""lynda"", ""provider"": ""flickr""}, {""name"": ""married"", ""provider"": ""flickr""}, {""name"": ""milf"", ""provider"": ""flickr""}, {""name"": ""mini"", ""provider"": ""flickr""}, {""name"": ""mom"", ""provider"": ""flickr""}, {""name"": ""nylons"", ""provider"": ""flickr""}, {""name"": ""panyhose"", ""provider"": ""flickr""}, {""name"": ""season"", ""provider"": ""flickr""}, {""name"": ""sexy"", ""provider"": ""flickr""}, {""name"": ""short"", ""provider"": ""flickr""}, {""name"": ""skirt"", ""provider"": ""flickr""}, {""name"": ""stockings"", ""provider"": ""flickr""}, {""name"": ""sweater"", ""provider"": ""flickr""}, {""name"": ""wife"", ""provider"": ""flickr""}, {""name"": ""young"", ""provider"": ""flickr""}]",f,2021-12-15 22:19:02.971943+00,f,jpg,photograph,
cdbd3bf6-1745-45bb-b399-61ee149cd58a,2022-12-28 15:41:34.000000+00,2022-12-28 15:41:34.000000+00,provider_api,flickr,flickr,51745389858,https://www.flickr.com/photos/126744325@N07/51745389858,https://live.staticflickr.com/65535/51745389858_c10358e1a3_b.jpg,https://live.staticflickr.com/65535/51745389858_c10358e1a3_m.jpg,1024,683,157497,by,2.0,Kristoffer Trolle,https://www.flickr.com/photos/126744325@N07,Train area in Copenhagen South / Tog område i Syd København,"{""views"": ""1337"", ""pub_date"": ""1639441947"", ""date_taken"": ""2021-07-14 23:49:46"", ""description"": ""This old train area in Copenhagen South will soon be transformed into a residential area. I love to go there and take photos. I used a Tiffen Black Pro Mist 1/4 filter for this photo, it gives that diffused highlights look, read more about it on my blog here . The photo is Creative Commons license: Use it for free. Keywords: train, tog, DSB, område, syd, København, south, Copenhagen, Danmark, Denmark, Fujifilm X-H1, Fujifilm XF 35mm f2 R WR, Tiffen Black Pro-Mist 1/4 filter"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""copenhagen"", ""provider"": ""flickr""}, {""name"": ""danmark"", ""provider"": ""flickr""}, {""name"": ""denmark"", ""provider"": ""flickr""}, {""name"": ""dsb"", ""provider"": ""flickr""}, {""name"": ""fujifilmxf35mmf2rwr"", ""provider"": ""flickr""}, {""name"": ""fujifilmxh1"", ""provider"": ""flickr""}, {""name"": ""københavn"", ""provider"": ""flickr""}, {""name"": ""område"", ""provider"": ""flickr""}, {""name"": ""south"", ""provider"": ""flickr""}, {""name"": ""syd"", ""provider"": ""flickr""}, {""name"": ""tiffenblackpromist14filter"", ""provider"": ""flickr""}, {""name"": ""tog"", ""provider"": ""flickr""}, {""name"": ""train"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
a3583692-349d-4ab7-8649-dfb6ab25a9a6,2022-05-10 05:38:53.000000+00,2022-05-10 05:38:53.000000+00,provider_api,flickr,flickr,51748188420,https://www.flickr.com/photos/38959360@N07/51748188420,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_b.jpg,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_m.jpg,579,1024,131055,by,2.0,...Amame hasta con los dientes....,https://www.flickr.com/photos/38959360@N07,Say you'll see me again....,"{""views"": ""1188"", ""pub_date"": ""1639527704"", ""date_taken"": ""2021-12-14 19:21:36"", ""description"": ""♡ Click Here for Details., Credits ♡and More Photos ♡ ♡ My Facebook ♡ My Instagram"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""altier"", ""provider"": ""flickr""}, {""name"": ""blog"", ""provider"": ""flickr""}, {""name"": ""blogger"", ""provider"": ""flickr""}, {""name"": ""cute"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""event"", ""provider"": ""flickr""}, {""name"": ""gown"", ""provider"": ""flickr""}, {""name"": ""hollyhood"", ""provider"": ""flickr""}, {""name"": ""kaya"", ""provider"": ""flickr""}, {""name"": ""kimo"", ""provider"": ""flickr""}, {""name"": ""log"", ""provider"": ""flickr""}, {""name"": ""maitreya"", ""provider"": ""flickr""}, {""name"": ""nightdress"", ""provider"": ""flickr""}, {""name"": ""nitedress"", ""provider"": ""flickr""}, {""name"": ""thirsty"", ""provider"": ""flickr""}, {""name"": ""versa"", ""provider"": ""flickr""}]",f,2021-12-15 22:14:02.778385+00,f,jpg,photograph,
94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""beatiful"", ""provider"": ""flickr""}, {""name"": ""gmaster"", ""provider"": ""flickr""}, {""name"": ""model"", ""provider"": ""flickr""}, {""name"": ""portrait"", ""provider"": ""flickr""}, {""name"": ""russia"", ""provider"": ""flickr""}, {""name"": ""russianmodel"", ""provider"": ""flickr""}, {""name"": ""sony"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""ciudaddelassiencias"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""recognition"", ""accuracy"": 0.96}, {""name"": ""uploaded by me"", ""provider"": ""flickr""}, {}, {""name"": ""unknown"", ""provider"": ""recognition"", ""accuracy"": 0.86}, {""name"": ""mus\\xe9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""musu00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00E9o"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
4e8fff2c-5e81-4f1f-8cda-fa29d3dcef6c,2022-09-21 15:36:29.000000+00,2022-09-21 15:36:29.000000+00,provider_api,flickr,flickr,51745392113,https://www.flickr.com/photos/23465276@N04/51745392113,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_b.jpg,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_m.jpg,1024,972,143647,by-nc-nd,2.0,Antonio Marín Segovia,https://www.flickr.com/photos/23465276@N04,Marilyn Monroe: la poeta que se convirtió en sex symbol,"{""views"": ""828"", ""pub_date"": ""1639442005"", ""date_taken"": ""2021-12-14 01:33:21"", ""description"": ""Marilyn Monroe: la poeta que se convirtió en sex symbol ¿Qué hizo de Marilyn Monroe un rostro perdurable tan conocido como La Gioconda, un icono transgeneracional, una leyenda viva? Por qué después de medio siglo, a diferencia de muchos de sus contemporáneos su imagen sigue siendo tan actual? Quizá porque Marilyn Monroe no sólo fue bella, ni sólo fue sexy, ni sólo fue inteligente. Quizá porque fue todo eso y una rubia boba en sus personajes y una mujer con intensa curiosidad crítica que resistió los embates del macartismo y su cacería de brujas, que quiso modificar y modificó su vida y su mundo que fue Hollywood (esa industria que devora y fabrica imágenes como ganado a decir de Hitchcock) y el circuito de la alta política que como en la época de los Kennedy inventa y desecha personajes. De que modificó su vida no cabe duda. Pasar su infancia en cinco o seis hogares de refugio y un orfanato y llegar a las fiestas de los Kennedy no es cosa fácil; y crear sus reglas y legislación propias en un mundo esclerotizado por las formas de la política donde caravanas y genuflexiones son el santo y seña de la sobrevivencia, tampoco es algo que resulte sencillo. Randdy Taraborrelli en La vida secreta de Marilyn Monroe rescata un momento que describe muy bien cómo se manejaba con los personajes de la Casa Blanca. 
Resumo su relato: En febrero de 1962 invitaron a Marilyn Monroe a una cena en honor del presidente Kennedy. La cena era a las ocho y a las siete treinta un automóvil pasó por ella. Marilyn, por supuesto no estaba lista. Según su mucama aún no sabía qué vestido ponerse y su estilista Kenneth Battelle estaba tratando de peinarla. A las ocho el asistente personal de Kennedy regresó a la fiesta y mandó una limusina por ella que llegó 15 minutos después. Milt Ebbins, el encargado de llevarla, a las 8:45 seguía esperando. Presionado telefónicamente por el asistente de Kennedy, a las nueve Ebbins entró a la habitación y encontró a Marilyn totalmente desnuda aunque con z"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}",,f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
28d4a996-1c98-4a7e-893f-fed2aefdc6af,2022-06-03 19:28:02.000000+00,2022-06-03 19:28:02.000000+00,provider_api,flickr,flickr,51745124976,https://www.flickr.com/photos/109715245@N06/51745124976,https://live.staticflickr.com/65535/51745124976_0456ba00ee_b.jpg,https://live.staticflickr.com/65535/51745124976_0456ba00ee_m.jpg,768,1024,150344,by-nd,2.0,paaddor,https://www.flickr.com/photos/109715245@N06,A Narrow Alley Downtown Zurich,"{""views"": ""813"", ""pub_date"": ""1639440834"", ""date_taken"": ""2020-09-02 12:57:01"", ""description"": ""Where I come from is Zurich, the biggest city in Switzerland. The city was founded in its present form at the end of the Middle Ages, between 1000 and 1200, and gained imperial freedom in the 13th century. Many narrow alleys, like this one, bear witness to the medieval architecture."", ""license_url"": ""https://creativecommons.org/licenses/by-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""alley"", ""provider"": ""flickr""}, {""name"": ""architecture"", ""provider"": ""flickr""}, {""name"": ""blur"", ""provider"": ""flickr""}, {""name"": ""blurred"", ""provider"": ""flickr""}, {""name"": ""city"", ""provider"": ""flickr""}, {""name"": ""flickr"", ""provider"": ""flickr""}, {""name"": ""houses"", ""provider"": ""flickr""}, {""name"": ""icm"", ""provider"": ""flickr""}, {""name"": ""people"", ""provider"": ""flickr""}, {""name"": ""reflection"", ""provider"": ""flickr""}, {""name"": ""switzerland"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
aedab569-c886-419e-aa36-0cae5489eff3,2022-03-18 07:40:46.000000+00,2022-03-18 07:40:46.000000+00,provider_api,flickr,flickr,51746011725,https://www.flickr.com/photos/120313817@N08/51746011725,https://live.staticflickr.com/65535/51746011725_0e43599f2e_b.jpg,https://live.staticflickr.com/65535/51746011725_0e43599f2e_m.jpg,1024,576,145225,by-nc-sa,2.0,IndiaTrue,https://www.flickr.com/photos/120313817@N08,♥ Hunt Dolls♥,"{""views"": ""682"", ""pub_date"": ""1639441112"", ""date_taken"": ""2021-12-13 16:15:26"", ""description"": ""✨ @MYBlog ✨@Cynful Winter Baby Lingerie @Equal 10 EVENT ✨@[Glam Affair] Lizy Skin [Lelutka EvoX] ✨@VELOUR: 'FRIDAY' LIPSTICK COLLECTION (EVO X BOM)GIFT ✨@LeLUTKA Kaia Head 3.1 GIFT GROUP ✨@Legacy Body ✨@Maria'S Shape Lelutka Evox ✨@TRUTH Adore Xmas - Essential"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-sa/2.0/"", ""raw_license_url"": null}","[{""name"": ""cynful"", ""provider"": ""flickr""}, {""name"": ""glamaffair"", ""provider"": ""flickr""}, {""name"": ""legacy"", ""provider"": ""flickr""}, {""name"": ""lelutkaevox"", ""provider"": ""flickr""}, {""name"": ""marias"", ""provider"": ""flickr""}, {""name"": ""truth"", ""provider"": ""flickr""}, {""name"": ""velour"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,
Expand Down

0 comments on commit 368e00d

Please sign in to comment.