From 2bb5385cdbf3e6f56bfecda8807d0008f891cd14 Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Mon, 23 Sep 2024 08:25:32 +0200 Subject: [PATCH 1/4] initial commit --- genai-features/dataset/recent_changes.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 genai-features/dataset/recent_changes.txt diff --git a/genai-features/dataset/recent_changes.txt b/genai-features/dataset/recent_changes.txt new file mode 100644 index 00000000..ff3a4a96 --- /dev/null +++ b/genai-features/dataset/recent_changes.txt @@ -0,0 +1 @@ +http://static.openfoodfacts.org/data/openfoodfacts_recent_changes.jsonl.gz From 0d60fe84e2d5b00ff7a0b33089d65094fec4b5dd Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Wed, 2 Oct 2024 07:31:41 +0200 Subject: [PATCH 2/4] update notebook --- .../notebooks/explore_recent_changes.ipynb | 319 ++++++++++++++++++ 1 file changed, 319 insertions(+) create mode 100644 genai-features/notebooks/explore_recent_changes.ipynb diff --git a/genai-features/notebooks/explore_recent_changes.ipynb b/genai-features/notebooks/explore_recent_changes.ipynb new file mode 100644 index 00000000..ff976056 --- /dev/null +++ b/genai-features/notebooks/explore_recent_changes.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-09-23T11:52:35.225763Z", + "start_time": "2024-09-23T11:52:35.182686Z" + } + }, + "source": [ + "import pandas as pd\n", + "import json\n", + "import requests" + ], + "outputs": [], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-23T11:46:09.321728Z", + "start_time": "2024-09-23T11:46:09.319571Z" + } + }, + "cell_type": "code", + "source": "n_sample = None", + "id": "5471642089fb0a62", + "outputs": [], + "execution_count": 2 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-23T11:50:45.968560Z", + "start_time": "2024-09-23T11:49:23.828286Z" + } + }, + "cell_type": "code", + "source": [ + "data_path = '/Users/baslad01/data_dump'\n", + "file_path = f'{data_path}/openfoodfacts_recent_changes.jsonl'\n", + "key_words = ['vandal']\n", + "\n", + "filtered_data = []\n", + "\n", + "with open(file_path, 'r') as file:\n", + " for line in file:\n", + " try:\n", + " json_obj = json.loads(line)\n", + " if 'comment' in json_obj and any(kw.lower() in json_obj['comment'].lower() for kw in key_words):\n", + " filtered_data.append(json_obj)\n", + " except json.JSONDecodeError as e:\n", + " print(f\"Error decoding JSON: {e}\")\n", + "\n", + "df_recent_changes_filtered = pd.DataFrame(filtered_data)\n", + "df_recent_changes_filtered" + ], + "id": "56b61ebb99017c3d", + "outputs": [ + { + "data": { + "text/plain": [ + " _id userid code \\\n", + "0 {'$oid': '5bbcdc0a4ade5fdf2732e301'} sebleouf 3596654383769 \n", + "1 {'$oid': '5bbce3b24ade5f069444b8ce'} sebleouf 8010059016480 \n", + "2 {'$oid': '5bc064534ade5fee8676c08a'} sebleouf 9789045548647 \n", + "3 {'$oid': '5bc373984ade5f613e7f885c'} sebleouf 6922572400030 \n", + "4 {'$oid': '5bc4674c4ade5fa734766af5'} sebleouf 3515450030899 \n", + "... ... ... ... \n", + "1578 {'$oid': '66d886631ec3700d69da1183'} charlesnepote 0810554026773 \n", + "1579 {'$oid': '66d8868637b2e30ab8da1180'} charlesnepote 3760144210563 \n", + "1580 {'$oid': '66d886a906b365812fda1180'} charlesnepote 3760282062437 \n", + "1581 {'$oid': '66d8871362cae1c7fcda1180'} charlesnepote 8052282080203 \n", + "1582 {'$oid': '66d887396aabbb9bc9da1181'} charlesnepote 4056489774877 \n", + "\n", + " comment countries_tags diffs t rev \n", + "0 Suppression du produit :Vandalisme [en:france] {} 1539103754 8 \n", + "1 Suppression du produit :Vandalisme [en:france] {} 1539105714 8 \n", + "2 Suppression du produit :Vandalisme [en:belgium] {} 1539335251 6 \n", + "3 Suppression du produit :Vandalisme [en:belgium] {} 1539535765 6 \n", + "4 Deleting product:Vandalisme [en:algeria] {} 1539598156 5 \n", + "... ... ... ... ... ... \n", + "1578 Deleting product:Vandalism [en:france] {} 1725466211 7 \n", + "1579 Deleting product:Vandalism [en:france] {} 1725466246 7 \n", + "1580 Deleting product:Vandalism [en:france] {} 1725466281 7 \n", + "1581 Deleting product:Vandalism [en:france] {} 1725466387 10 \n", + "1582 Deleting product:Vandalism [en:france] {} 1725466425 9 \n", + "\n", + "[1583 rows x 8 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_iduseridcodecommentcountries_tagsdiffstrev
0{'$oid': '5bbcdc0a4ade5fdf2732e301'}sebleouf3596654383769Suppression du produit :Vandalisme[en:france]{}15391037548
1{'$oid': '5bbce3b24ade5f069444b8ce'}sebleouf8010059016480Suppression du produit :Vandalisme[en:france]{}15391057148
2{'$oid': '5bc064534ade5fee8676c08a'}sebleouf9789045548647Suppression du produit :Vandalisme[en:belgium]{}15393352516
3{'$oid': '5bc373984ade5f613e7f885c'}sebleouf6922572400030Suppression du produit :Vandalisme[en:belgium]{}15395357656
4{'$oid': '5bc4674c4ade5fa734766af5'}sebleouf3515450030899Deleting product:Vandalisme[en:algeria]{}15395981565
...........................
1578{'$oid': '66d886631ec3700d69da1183'}charlesnepote0810554026773Deleting product:Vandalism[en:france]{}17254662117
1579{'$oid': '66d8868637b2e30ab8da1180'}charlesnepote3760144210563Deleting product:Vandalism[en:france]{}17254662467
1580{'$oid': '66d886a906b365812fda1180'}charlesnepote3760282062437Deleting product:Vandalism[en:france]{}17254662817
1581{'$oid': '66d8871362cae1c7fcda1180'}charlesnepote8052282080203Deleting product:Vandalism[en:france]{}172546638710
1582{'$oid': '66d887396aabbb9bc9da1181'}charlesnepote4056489774877Deleting product:Vandalism[en:france]{}17254664259
\n", + "

1583 rows × 8 columns

\n", + "
" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-23T11:53:24.687972Z", + "start_time": "2024-09-23T11:53:24.625426Z" + } + }, + "cell_type": "code", + "source": [ + "product_id = 875444\n", + "rev_id = 5\n", + "api_url = f\"https://world.openfoodfacts.org/api/v2/product/{product_id}?rev={rev_id}\"\n", + "# Get the product data\n", + "product_data = requests.get(api_url).json()\n", + "product_data" + ], + "id": "31a28bb96ee9ff2f", + "outputs": [ + { + "data": { + "text/plain": [ + "{'code': '875444', 'status': 0, 'status_verbose': 'product not found'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 8 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "669c29e99544240f" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 782463ea368c9be99825cdecadcb420c0382eefd Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Wed, 2 Oct 2024 07:39:43 +0200 Subject: [PATCH 3/4] add option to redownload new file --- .../notebooks/explore_recent_changes.ipynb | 54 +++++++++++++------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/genai-features/notebooks/explore_recent_changes.ipynb b/genai-features/notebooks/explore_recent_changes.ipynb index ff976056..48045947 100644 --- a/genai-features/notebooks/explore_recent_changes.ipynb +++ b/genai-features/notebooks/explore_recent_changes.ipynb @@ -6,8 +6,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-09-23T11:52:35.225763Z", - "start_time": "2024-09-23T11:52:35.182686Z" + "end_time": "2024-10-02T05:33:00.083862Z", + "start_time": "2024-10-02T05:32:59.258183Z" } }, "source": [ @@ -16,17 +16,20 @@ "import requests" ], "outputs": [], - "execution_count": 5 + "execution_count": 1 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-09-23T11:46:09.321728Z", - "start_time": "2024-09-23T11:46:09.319571Z" + "end_time": "2024-10-02T05:33:00.094035Z", + "start_time": "2024-10-02T05:33:00.092103Z" } }, "cell_type": "code", - "source": "n_sample = None", + "source": [ + "n_sample = None\n", + "re_download = False" + ], "id": "5471642089fb0a62", "outputs": [], "execution_count": 2 @@ -34,12 +37,33 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-09-23T11:50:45.968560Z", - "start_time": "2024-09-23T11:49:23.828286Z" + "end_time": "2024-10-02T05:34:22.364615Z", + "start_time": "2024-10-02T05:33:00.106532Z" } }, "cell_type": "code", "source": [ + "data_url = 'http://static.openfoodfacts.org/data/openfoodfacts_recent_changes.jsonl.gz'\n", + "\n", + "if re_download:\n", + " import gzip\n", + " import shutil\n", + " # Download the data\n", + " data_path = '/Users/baslad01/data_dump'\n", + " file_path = f'{data_path}/openfoodfacts_recent_changes.jsonl.gz'\n", + " response = requests.get(data_url)\n", + " with open(file_path, 'wb') as file:\n", + " file.write(response.content)\n", + " \n", + " compressed_file_path = f'{data_path}/openfoodfacts_recent_changes.jsonl.gz'\n", + " uncompressed_file_path = f'{data_path}/openfoodfacts_recent_changes.jsonl'\n", + "\n", + " \n", + " with gzip.open(compressed_file_path, 'rb') as f_in:\n", + " with open(uncompressed_file_path, 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + "\n", + "\n", "data_path = '/Users/baslad01/data_dump'\n", "file_path = f'{data_path}/openfoodfacts_recent_changes.jsonl'\n", "key_words = ['vandal']\n", @@ -248,24 +272,24 @@ "" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 4 + "execution_count": 3 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-09-23T11:53:24.687972Z", - "start_time": "2024-09-23T11:53:24.625426Z" + "end_time": "2024-10-02T05:39:14.162753Z", + "start_time": "2024-10-02T05:39:14.011972Z" } }, "cell_type": "code", "source": [ "product_id = 875444\n", - "rev_id = 5\n", + "rev_id = 3\n", "api_url = f\"https://world.openfoodfacts.org/api/v2/product/{product_id}?rev={rev_id}\"\n", "# Get the product data\n", "product_data = requests.get(api_url).json()\n", @@ -279,12 +303,12 @@ "{'code': '875444', 'status': 0, 'status_verbose': 'product not found'}" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 8 + "execution_count": 5 }, { "metadata": {}, From 86526ce107246158163bd1ae738f1a0f2c13016d Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Sun, 3 Nov 2024 17:40:29 +0100 Subject: [PATCH 4/4] update notebook --- .../notebooks/explore_recent_changes.ipynb | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/genai-features/notebooks/explore_recent_changes.ipynb b/genai-features/notebooks/explore_recent_changes.ipynb index 48045947..f1f16feb 100644 --- a/genai-features/notebooks/explore_recent_changes.ipynb +++ b/genai-features/notebooks/explore_recent_changes.ipynb @@ -310,13 +310,37 @@ ], "execution_count": 5 }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-02T05:43:52.597713Z", + "start_time": "2024-10-02T05:43:52.593995Z" + } + }, + "cell_type": "code", + "source": "api_url", + "id": "669c29e99544240f", + "outputs": [ + { + "data": { + "text/plain": [ + "'https://world.openfoodfacts.org/api/v2/product/875444?rev=3'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 6 + }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", - "id": "669c29e99544240f" + "id": "797b55bb518c5c03" } ], "metadata": {