Commit f3b1ddb

improve Jupyter Notebook
do-me committed Oct 22, 2023
1 parent e96c508 commit f3b1ddb
Showing 1 changed file with 16 additions and 46 deletions.
copernicus_services_miner.ipynb: 62 changes (16 additions & 46 deletions)
@@ -31,7 +31,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part 1 query all ~850 services URLs"
"## Part 1 query all 834 services URLs"
]
},
{
@@ -54,7 +54,7 @@
"data_list = []\n",
"\n",
"for page in tqdm(range(start_page, end_page + 1)):\n",
" time.sleep(2)\n",
" time.sleep(2) # be gentle and add 2 seconds of break for each iteration!\n",
" url = base_url + str(page)\n",
" response = requests.get(url)\n",
"\n",
@@ -85,8 +85,7 @@
"df[\"Service_URL\"] = \"https://www.copernicus.eu\" + df[\"Link\"]\n",
"del df[\"Link\"]\n",
"\n",
"print(\"Mining and extraction complete.\")\n",
"print(\"Links starting with '/en/access-data/' have been added to the DataFrame.\")"
"print(\"Done!\")"
]
},
{
@@ -105,23 +104,6 @@
"## Part 2 Query every individual service page and append title & description"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "from bs4 import BeautifulSoup\n",
- "import pandas as pd\n",
- "import re # Import the re module for text processing\n",
- "from tqdm import tqdm\n",
- "\n",
- "# Load the initial DataFrame from your CSV file\n",
- "df = pd.read_csv(\"copernicus_services_october-2023.csv.gz\")#.head(10)\n",
- "df"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -147,6 +129,8 @@
" return re.sub(r\"'''\", \"\", text)\n",
"\n",
"for index, row in tqdm(df.iterrows()):\n",
"\n",
" time.sleep(2) # be gentle and add 2 seconds of break for each iteration!\n",
" url = row[\"Service_URL\"]\n",
" response = requests.get(url)\n",
"\n",
@@ -172,8 +156,7 @@
" mx_auto_div = main_content_div.find(\"div\", class_=\"mx-auto mt-5\")\n",
" mx_auto_links_div = mx_auto_div.find_all(\"a\") if mx_auto_div else [\"\"]\n",
" mx_auto_links.append([link.get(\"href\") for link in mx_auto_links_div][0])\n",
"\n",
" #print(f\"Processed page {row['Page Number']} - Title: {title}\")\n",
" \n",
" else:\n",
" print(f\"No content <div> found on page {row['Page Number']}\")\n",
"\n",
@@ -183,16 +166,11 @@
"# Add the extracted data to the DataFrame\n",
"df[\"Title\"] = titles\n",
"df[\"Content\"] = content_text\n",
"df[\"Catalogue_URL\"] = mx_auto_links"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"copernicus_services_title_descr_url_october-2023.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
"df[\"Catalogue_URL\"] = mx_auto_links\n",
"\n",
"df.to_csv(\"copernicus_services_title_descr_url_october-2023.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)\n",
"\n",
"print(\"Done!\")"
]
},
{
@@ -320,15 +298,6 @@
"token_num(\"test this tokenizer\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "from haystack.nodes import PreProcessor # python 3.9.1. works"
- ]
- },
{
"cell_type": "code",
"execution_count": 21,
@@ -507,6 +476,7 @@
}
],
"source": [
"from haystack.nodes import PreProcessor # python 3.9.1. works\n",
"processor = PreProcessor(\n",
" clean_empty_lines=True,\n",
" clean_whitespace=True,\n",
@@ -1132,7 +1102,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[\"mean_emebdding\"] = df.Content.apply(lambda x: null_vector) # add null vector to all rows by default"
"df[\"mean_embedding\"] = df.Content.apply(lambda x: null_vector) # add null vector to all rows by default"
]
},
{
@@ -1978,7 +1948,7 @@
}
],
"source": [
"df.loc[df.Content != \" \", \"mean_emebdding\"] = df.loc[df.Content != \" \", \"Content\"].progress_apply(lambda x: text_to_embedding(x)) # 13 min!"
"df.loc[df.Content != \" \", \"mean_embedding\"] = df.loc[df.Content != \" \", \"Content\"].progress_apply(lambda x: text_to_embedding(x)) # 13 min!"
]
},
{
@@ -1987,7 +1957,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_emebdding']].to_csv(\"copernicus_services_embeddings.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_embedding']].to_csv(\"copernicus_services_embeddings.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
]
},
{
@@ -1996,7 +1966,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_emebdding']].to_json(\"copernicus_services_embeddings.json.gz\", compression=\"gzip\", orient=\"records\")"
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_embedding']].to_json(\"copernicus_services_embeddings.json.gz\", compression=\"gzip\", orient=\"records\")"
]
},
{
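The last four hunks fix a typo in the embedding column name (mean_emebdding becomes mean_embedding). The text_to_embedding function itself is defined earlier in the notebook and is not part of this diff; a mean-pooling helper in the same spirit could look like this (a sketch that assumes a sentence-transformers model, which may differ from what the notebook actually loads):

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not taken from the notebook

def text_to_embedding(text, chunk_size=1000):
    # Split long texts into chunks, embed each chunk, and mean-pool into one vector.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] or [""]
    vectors = model.encode(chunks)
    return np.mean(vectors, axis=0)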