From d3e467091235069ab2f875a3d78d19f345a4b886 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Wed, 17 Apr 2024 11:19:28 +0300 Subject: [PATCH] Decode and deduplicate tags during ingestion cleanup Signed-off-by: Olga Bulat --- ingestion_server/ingestion_server/cleanup.py | 45 ++++++++++++++----- ingestion_server/ingestion_server/strings.py | 47 ++++++++++++++++++++ sample_data/sample_image.csv | 2 +- 3 files changed, 82 insertions(+), 12 deletions(-) create mode 100644 ingestion_server/ingestion_server/strings.py diff --git a/ingestion_server/ingestion_server/cleanup.py b/ingestion_server/ingestion_server/cleanup.py index d9ca160b4ab..5462d414c83 100644 --- a/ingestion_server/ingestion_server/cleanup.py +++ b/ingestion_server/ingestion_server/cleanup.py @@ -21,6 +21,7 @@ from ingestion_server.db_helpers import database_connect from ingestion_server.indexer import DB_BUFFER_SIZE +from ingestion_server.strings import decode_data, deduplicate_tags # Number of records to buffer in memory at once @@ -116,6 +117,22 @@ def cleanup_url(url, tls_support): else: return None + @staticmethod + def deduplicate_tags(tags: list[dict]) -> list[dict]: + """ + Remove tags that have the same name and provider. + Not comparing accuracy here: if the tags have different accuracy values, + they will also have different provider values (e.g., `clarifai` vs `flickr`). + """ + seen = set() + unique_tags = [] + for i, tag in enumerate(tags): + tag_tuple = (tag["name"], tag.get("provider")) + if tag_tuple not in seen: + seen.add(tag_tuple) + unique_tags.append(tag) + return unique_tags + @staticmethod def cleanup_tags(tags): """ @@ -129,19 +146,25 @@ def cleanup_tags(tags): if not tags: return None for tag in tags: - below_threshold = False if "accuracy" in tag and float(tag["accuracy"]) < TAG_MIN_CONFIDENCE: - below_threshold = True - if "name" in tag and isinstance(tag["name"], str): - lower_tag = tag["name"].lower() - should_filter = _tag_denylisted(lower_tag) or below_threshold - else: - log.warning(f'Filtering malformed tag "{tag}" in "{tags}"') - should_filter = True - if should_filter: update_required = True - else: - tag_output.append(tag) + continue + if "name" not in tag or not isinstance(tag["name"], str): + update_required = True + continue + decoded_tag_name = decode_data(tag["name"]) + if _tag_denylisted(decoded_tag_name.lower()): + update_required = True + continue + if decoded_tag_name != tag["name"]: + update_required = True + tag["name"] = decoded_tag_name + tag_output.append(tag) + + deduplicated_tags = deduplicate_tags(tag_output) + if len(deduplicated_tags) != len(tag_output): + update_required = True + tag_output = deduplicated_tags if update_required: fragment = Json(tag_output) diff --git a/ingestion_server/ingestion_server/strings.py b/ingestion_server/ingestion_server/strings.py new file mode 100644 index 00000000000..7a6e4554e71 --- /dev/null +++ b/ingestion_server/ingestion_server/strings.py @@ -0,0 +1,47 @@ +import logging +import re +from urllib.parse import quote, unquote + + +DOUBLE_BACKSLASH_ESCAPE = re.compile( + r"\\(x)([\da-f]{2})|\\(u)([\da-f]{4})", re.IGNORECASE +) +NO_BACKSLASH_ESCAPE = re.compile(r"(u)([\da-f]{4})", re.IGNORECASE) + + +def convert_grp(grp: str) -> str | None: + """ + Convert a hex value into a character. Return None if the conversion results in + a character that cannot be used as a URI component. + """ + try: + converted = chr(int(grp, 16)) + # Decoded strings should be usable as URI components + quote(converted) + return converted + except UnicodeEncodeError: + return None + + +def decode_data(data: str | None = "") -> str: + if not data: + return "" + + def replace_func(match): + """Replace the matched group with the converted character if possible, otherwise return the original string.""" + prefix, grp = match.groups() + if converted := convert_grp(grp): + return converted + return f"{prefix}{grp}" + + # Handle characters encoded with double backslashes + if DOUBLE_BACKSLASH_ESCAPE.search(data): + try: + decoded_data = data.encode().decode("unicode_escape") + data = decoded_data + except (UnicodeDecodeError, UnicodeEncodeError): + logging.debug(f"Failed to decode data with double backslash: {data}") + # Handle characters encoded without backslashes + data = re.sub(NO_BACKSLASH_ESCAPE, replace_func, data) + + return unquote(data) diff --git a/sample_data/sample_image.csv b/sample_data/sample_image.csv index 32295ae7724..591fdf656c4 100644 --- a/sample_data/sample_image.csv +++ b/sample_data/sample_image.csv @@ -5,7 +5,7 @@ aeba0547-61da-42ee-b561-27c8fc817d5a,2022-07-16 05:51:03.000000+00,2022-07-16 05 3c98150c-51a8-4175-a47f-acef10e784f7,2022-06-10 09:14:13.000000+00,2022-06-10 09:14:13.000000+00,provider_api,flickr,flickr,51747927224,https://www.flickr.com/photos/151325871@N07/51747927224,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93.jpg,https://live.staticflickr.com/65535/51747927224_3ca7ac2e93_m.jpg,318,500,53633,cc0,1.0,lyndawaybi3,https://www.flickr.com/photos/151325871@N07,Naughty Little Elf,"{""views"": ""1342"", ""pub_date"": ""1639526583"", ""date_taken"": ""2021-12-14 16:02:55"", ""license_url"": ""https://creativecommons.org/publicdomain/zero/1.0/""}","[{""name"": ""babe"", ""provider"": ""flickr""}, {""name"": ""bi"", ""provider"": ""flickr""}, {""name"": ""brunette"", ""provider"": ""flickr""}, {""name"": ""chick"", ""provider"": ""flickr""}, {""name"": ""christmas"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""elf"", ""provider"": ""flickr""}, {""name"": ""great"", ""provider"": ""flickr""}, {""name"": ""hot"", ""provider"": ""flickr""}, {""name"": ""hotwife"", ""provider"": ""flickr""}, {""name"": ""leggings"", ""provider"": ""flickr""}, {""name"": ""legs"", ""provider"": ""flickr""}, {""name"": ""lynda"", ""provider"": ""flickr""}, {""name"": ""married"", ""provider"": ""flickr""}, {""name"": ""milf"", ""provider"": ""flickr""}, {""name"": ""mini"", ""provider"": ""flickr""}, {""name"": ""mom"", ""provider"": ""flickr""}, {""name"": ""nylons"", ""provider"": ""flickr""}, {""name"": ""panyhose"", ""provider"": ""flickr""}, {""name"": ""season"", ""provider"": ""flickr""}, {""name"": ""sexy"", ""provider"": ""flickr""}, {""name"": ""short"", ""provider"": ""flickr""}, {""name"": ""skirt"", ""provider"": ""flickr""}, {""name"": ""stockings"", ""provider"": ""flickr""}, {""name"": ""sweater"", ""provider"": ""flickr""}, {""name"": ""wife"", ""provider"": ""flickr""}, {""name"": ""young"", ""provider"": ""flickr""}]",f,2021-12-15 22:19:02.971943+00,f,jpg,photograph, cdbd3bf6-1745-45bb-b399-61ee149cd58a,2022-12-28 15:41:34.000000+00,2022-12-28 15:41:34.000000+00,provider_api,flickr,flickr,51745389858,https://www.flickr.com/photos/126744325@N07/51745389858,https://live.staticflickr.com/65535/51745389858_c10358e1a3_b.jpg,https://live.staticflickr.com/65535/51745389858_c10358e1a3_m.jpg,1024,683,157497,by,2.0,Kristoffer Trolle,https://www.flickr.com/photos/126744325@N07,Train area in Copenhagen South / Tog område i Syd København,"{""views"": ""1337"", ""pub_date"": ""1639441947"", ""date_taken"": ""2021-07-14 23:49:46"", ""description"": ""This old train area in Copenhagen South will soon be transformed into a residential area. I love to go there and take photos. I used a Tiffen Black Pro Mist 1/4 filter for this photo, it gives that diffused highlights look, read more about it on my blog here . The photo is Creative Commons license: Use it for free. Keywords: train, tog, DSB, område, syd, København, south, Copenhagen, Danmark, Denmark, Fujifilm X-H1, Fujifilm XF 35mm f2 R WR, Tiffen Black Pro-Mist 1/4 filter"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""copenhagen"", ""provider"": ""flickr""}, {""name"": ""danmark"", ""provider"": ""flickr""}, {""name"": ""denmark"", ""provider"": ""flickr""}, {""name"": ""dsb"", ""provider"": ""flickr""}, {""name"": ""fujifilmxf35mmf2rwr"", ""provider"": ""flickr""}, {""name"": ""fujifilmxh1"", ""provider"": ""flickr""}, {""name"": ""københavn"", ""provider"": ""flickr""}, {""name"": ""område"", ""provider"": ""flickr""}, {""name"": ""south"", ""provider"": ""flickr""}, {""name"": ""syd"", ""provider"": ""flickr""}, {""name"": ""tiffenblackpromist14filter"", ""provider"": ""flickr""}, {""name"": ""tog"", ""provider"": ""flickr""}, {""name"": ""train"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph, a3583692-349d-4ab7-8649-dfb6ab25a9a6,2022-05-10 05:38:53.000000+00,2022-05-10 05:38:53.000000+00,provider_api,flickr,flickr,51748188420,https://www.flickr.com/photos/38959360@N07/51748188420,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_b.jpg,https://live.staticflickr.com/65535/51748188420_cac46bb1f3_m.jpg,579,1024,131055,by,2.0,...Amame hasta con los dientes....,https://www.flickr.com/photos/38959360@N07,Say you'll see me again....,"{""views"": ""1188"", ""pub_date"": ""1639527704"", ""date_taken"": ""2021-12-14 19:21:36"", ""description"": ""♡ Click Here for Details., Credits ♡and More Photos ♡ ♡ My Facebook ♡ My Instagram"", ""license_url"": ""https://creativecommons.org/licenses/by/2.0/"", ""raw_license_url"": null}","[{""name"": ""altier"", ""provider"": ""flickr""}, {""name"": ""blog"", ""provider"": ""flickr""}, {""name"": ""blogger"", ""provider"": ""flickr""}, {""name"": ""cute"", ""provider"": ""flickr""}, {""name"": ""dress"", ""provider"": ""flickr""}, {""name"": ""event"", ""provider"": ""flickr""}, {""name"": ""gown"", ""provider"": ""flickr""}, {""name"": ""hollyhood"", ""provider"": ""flickr""}, {""name"": ""kaya"", ""provider"": ""flickr""}, {""name"": ""kimo"", ""provider"": ""flickr""}, {""name"": ""log"", ""provider"": ""flickr""}, {""name"": ""maitreya"", ""provider"": ""flickr""}, {""name"": ""nightdress"", ""provider"": ""flickr""}, {""name"": ""nitedress"", ""provider"": ""flickr""}, {""name"": ""thirsty"", ""provider"": ""flickr""}, {""name"": ""versa"", ""provider"": ""flickr""}]",f,2021-12-15 22:14:02.778385+00,f,jpg,photograph, -94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""beatiful"", ""provider"": ""flickr""}, {""name"": ""gmaster"", ""provider"": ""flickr""}, {""name"": ""model"", ""provider"": ""flickr""}, {""name"": ""portrait"", ""provider"": ""flickr""}, {""name"": ""russia"", ""provider"": ""flickr""}, {""name"": ""russianmodel"", ""provider"": ""flickr""}, {""name"": ""sony"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph, +94fa6f0a-1819-4297-9fc5-758bf8e5c71d,2022-06-15 22:50:55.000000+00,2022-06-15 22:50:55.000000+00,provider_api,flickr,flickr,51746016845,https://www.flickr.com/photos/56830712@N03/51746016845,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_b.jpg,https://live.staticflickr.com/65535/51746016845_7fcae21f7d_m.jpg,1024,683,127991,by-nc-nd,2.0,shadobb,https://www.flickr.com/photos/56830712@N03,***,"{""views"": ""1095"", ""pub_date"": ""1639441180"", ""date_taken"": ""2016-07-22 12:53:54"", ""description"": ""Instagram: Street: instagram.com/moscow_and_the_people Portraits&arts: instagram.com/dances_with_arts MyLife&arts: instagram.com/imanishi17"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""ciudaddelassiencias"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""flickr""}, {""name"": ""muséo"", ""provider"": ""recognition"", ""accuracy"": 0.96}, {""name"": ""uploaded by me"", ""provider"": ""flickr""}, {}, {""name"": ""unknown"", ""provider"": ""recognition"", ""accuracy"": 0.86}, {""name"": ""mus\\xe9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""musu00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00e9o"", ""provider"": ""flickr""}, {""name"": ""mus\\u00E9o"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph, 4e8fff2c-5e81-4f1f-8cda-fa29d3dcef6c,2022-09-21 15:36:29.000000+00,2022-09-21 15:36:29.000000+00,provider_api,flickr,flickr,51745392113,https://www.flickr.com/photos/23465276@N04/51745392113,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_b.jpg,https://live.staticflickr.com/65535/51745392113_4e9af1dd55_m.jpg,1024,972,143647,by-nc-nd,2.0,Antonio Marín Segovia,https://www.flickr.com/photos/23465276@N04,Marilyn Monroe: la poeta que se convirtió en sex symbol,"{""views"": ""828"", ""pub_date"": ""1639442005"", ""date_taken"": ""2021-12-14 01:33:21"", ""description"": ""Marilyn Monroe: la poeta que se convirtió en sex symbol ¿Qué hizo de Marilyn Monroe un rostro perdurable tan conocido como La Gioconda, un icono transgeneracional, una leyenda viva? Por qué después de medio siglo, a diferencia de muchos de sus contemporáneos su imagen sigue siendo tan actual? Quizá porque Marilyn Monroe no sólo fue bella, ni sólo fue sexy, ni sólo fue inteligente. Quizá porque fue todo eso y una rubia boba en sus personajes y una mujer con intensa curiosidad crítica que resistió los embates del macartismo y su cacería de brujas, que quiso modificar y modificó su vida y su mundo que fue Hollywood (esa industria que devora y fabrica imágenes como ganado a decir de Hitchcock) y el circuito de la alta política que como en la época de los Kennedy inventa y desecha personajes. De que modificó su vida no cabe duda. Pasar su infancia en cinco o seis hogares de refugio y un orfanato y llegar a las fiestas de los Kennedy no es cosa fácil; y crear sus reglas y legislación propias en un mundo esclerotizado por las formas de la política donde caravanas y genuflexiones son el santo y seña de la sobrevivencia, tampoco es algo que resulte sencillo. Randdy Taraborrelli en La vida secreta de Marilyn Monroe rescata un momento que describe muy bien cómo se manejaba con los personajes de la Casa Blanca. Resumo su relato: En febrero de 1962 invitaron a Marilyn Monroe a una cena en honor del presidente Kennedy. La cena era a las ocho y a las siete treinta un automóvil pasó por ella. Marilyn, por supuesto no estaba lista. Según su mucama aún no sabía qué vestido ponerse y su estilista Kenneth Battelle estaba tratando de peinarla. A las ocho el asistente personal de Kennedy regresó a la fiesta y mandó una limusina por ella que llegó 15 minutos después. Milt Ebbins, el encargado de llevarla, a las 8:45 seguía esperando. Presionado telefónicamente por el asistente de Kennedy, a las nueve Ebbins entró a la habitación y encontró a Marilyn totalmente desnuda aunque con z"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-nd/2.0/"", ""raw_license_url"": null}",,f,2021-12-15 20:49:19.976193+00,f,jpg,photograph, 28d4a996-1c98-4a7e-893f-fed2aefdc6af,2022-06-03 19:28:02.000000+00,2022-06-03 19:28:02.000000+00,provider_api,flickr,flickr,51745124976,https://www.flickr.com/photos/109715245@N06/51745124976,https://live.staticflickr.com/65535/51745124976_0456ba00ee_b.jpg,https://live.staticflickr.com/65535/51745124976_0456ba00ee_m.jpg,768,1024,150344,by-nd,2.0,paaddor,https://www.flickr.com/photos/109715245@N06,A Narrow Alley Downtown Zurich,"{""views"": ""813"", ""pub_date"": ""1639440834"", ""date_taken"": ""2020-09-02 12:57:01"", ""description"": ""Where I come from is Zurich, the biggest city in Switzerland. The city was founded in its present form at the end of the Middle Ages, between 1000 and 1200, and gained imperial freedom in the 13th century. Many narrow alleys, like this one, bear witness to the medieval architecture."", ""license_url"": ""https://creativecommons.org/licenses/by-nd/2.0/"", ""raw_license_url"": null}","[{""name"": ""alley"", ""provider"": ""flickr""}, {""name"": ""architecture"", ""provider"": ""flickr""}, {""name"": ""blur"", ""provider"": ""flickr""}, {""name"": ""blurred"", ""provider"": ""flickr""}, {""name"": ""city"", ""provider"": ""flickr""}, {""name"": ""flickr"", ""provider"": ""flickr""}, {""name"": ""houses"", ""provider"": ""flickr""}, {""name"": ""icm"", ""provider"": ""flickr""}, {""name"": ""people"", ""provider"": ""flickr""}, {""name"": ""reflection"", ""provider"": ""flickr""}, {""name"": ""switzerland"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph, aedab569-c886-419e-aa36-0cae5489eff3,2022-03-18 07:40:46.000000+00,2022-03-18 07:40:46.000000+00,provider_api,flickr,flickr,51746011725,https://www.flickr.com/photos/120313817@N08/51746011725,https://live.staticflickr.com/65535/51746011725_0e43599f2e_b.jpg,https://live.staticflickr.com/65535/51746011725_0e43599f2e_m.jpg,1024,576,145225,by-nc-sa,2.0,IndiaTrue,https://www.flickr.com/photos/120313817@N08,♥ Hunt Dolls♥,"{""views"": ""682"", ""pub_date"": ""1639441112"", ""date_taken"": ""2021-12-13 16:15:26"", ""description"": ""✨ @MYBlog ✨@Cynful Winter Baby Lingerie @Equal 10 EVENT ✨@[Glam Affair] Lizy Skin [Lelutka EvoX] ✨@VELOUR: 'FRIDAY' LIPSTICK COLLECTION (EVO X BOM)GIFT ✨@LeLUTKA Kaia Head 3.1 GIFT GROUP ✨@Legacy Body ✨@Maria'S Shape Lelutka Evox ✨@TRUTH Adore Xmas - Essential"", ""license_url"": ""https://creativecommons.org/licenses/by-nc-sa/2.0/"", ""raw_license_url"": null}","[{""name"": ""cynful"", ""provider"": ""flickr""}, {""name"": ""glamaffair"", ""provider"": ""flickr""}, {""name"": ""legacy"", ""provider"": ""flickr""}, {""name"": ""lelutkaevox"", ""provider"": ""flickr""}, {""name"": ""marias"", ""provider"": ""flickr""}, {""name"": ""truth"", ""provider"": ""flickr""}, {""name"": ""velour"", ""provider"": ""flickr""}]",f,2021-12-15 20:49:19.976193+00,f,jpg,photograph,