Commit f3b1ddb

improve Jupyter Notebook
do-me committed Oct 22, 2023
1 parent e96c508 commit f3b1ddb
Showing 1 changed file with 16 additions and 46 deletions.
copernicus_services_miner.ipynb: 62 changes (16 additions & 46 deletions)
@@ -31,7 +31,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part 1 query all ~850 services URLs"
"## Part 1 query all 834 services URLs"
]
},
{
@@ -54,7 +54,7 @@
"data_list = []\n",
"\n",
"for page in tqdm(range(start_page, end_page + 1)):\n",
" time.sleep(2)\n",
" time.sleep(2) # be gentle and add 2 seconds of break for each iteration!\n",
" url = base_url + str(page)\n",
" response = requests.get(url)\n",
"\n",
@@ -85,8 +85,7 @@
"df[\"Service_URL\"] = \"https://www.copernicus.eu\" + df[\"Link\"]\n",
"del df[\"Link\"]\n",
"\n",
"print(\"Mining and extraction complete.\")\n",
"print(\"Links starting with '/en/access-data/' have been added to the DataFrame.\")"
"print(\"Done!\")"
]
},
{
@@ -105,23 +104,6 @@
"## Part 2 Query every individual service page and append title & description"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "from bs4 import BeautifulSoup\n",
- "import pandas as pd\n",
- "import re # Import the re module for text processing\n",
- "from tqdm import tqdm\n",
- "\n",
- "# Load the initial DataFrame from your CSV file\n",
- "df = pd.read_csv(\"copernicus_services_october-2023.csv.gz\")#.head(10)\n",
- "df"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -147,6 +129,8 @@
" return re.sub(r\"'''\", \"\", text)\n",
"\n",
"for index, row in tqdm(df.iterrows()):\n",
"\n",
" time.sleep(2) # be gentle and add 2 seconds of break for each iteration!\n",
" url = row[\"Service_URL\"]\n",
" response = requests.get(url)\n",
"\n",
@@ -172,8 +156,7 @@
" mx_auto_div = main_content_div.find(\"div\", class_=\"mx-auto mt-5\")\n",
" mx_auto_links_div = mx_auto_div.find_all(\"a\") if mx_auto_div else [\"\"]\n",
" mx_auto_links.append([link.get(\"href\") for link in mx_auto_links_div][0])\n",
"\n",
" #print(f\"Processed page {row['Page Number']} - Title: {title}\")\n",
" \n",
" else:\n",
" print(f\"No content <div> found on page {row['Page Number']}\")\n",
"\n",
@@ -183,16 +166,11 @@
"# Add the extracted data to the DataFrame\n",
"df[\"Title\"] = titles\n",
"df[\"Content\"] = content_text\n",
"df[\"Catalogue_URL\"] = mx_auto_links"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"copernicus_services_title_descr_url_october-2023.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
"df[\"Catalogue_URL\"] = mx_auto_links\n",
"\n",
"df.to_csv(\"copernicus_services_title_descr_url_october-2023.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)\n",
"\n",
"print(\"Done!\")"
]
},
{
@@ -320,15 +298,6 @@
"token_num(\"test this tokenizer\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "from haystack.nodes import PreProcessor # python 3.9.1. works"
- ]
- },
{
"cell_type": "code",
"execution_count": 21,
@@ -507,6 +476,7 @@
}
],
"source": [
"from haystack.nodes import PreProcessor # python 3.9.1. works\n",
"processor = PreProcessor(\n",
" clean_empty_lines=True,\n",
" clean_whitespace=True,\n",
@@ -1132,7 +1102,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[\"mean_emebdding\"] = df.Content.apply(lambda x: null_vector) # add null vector to all rows by default"
"df[\"mean_embedding\"] = df.Content.apply(lambda x: null_vector) # add null vector to all rows by default"
]
},
{
@@ -1978,7 +1948,7 @@
}
],
"source": [
"df.loc[df.Content != \" \", \"mean_emebdding\"] = df.loc[df.Content != \" \", \"Content\"].progress_apply(lambda x: text_to_embedding(x)) # 13 min!"
"df.loc[df.Content != \" \", \"mean_embedding\"] = df.loc[df.Content != \" \", \"Content\"].progress_apply(lambda x: text_to_embedding(x)) # 13 min!"
]
},
{
@@ -1987,7 +1957,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_emebdding']].to_csv(\"copernicus_services_embeddings.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_embedding']].to_csv(\"copernicus_services_embeddings.csv.gz\", compression=\"gzip\", encoding=\"utf-8\", index=False)"
]
},
{
@@ -1996,7 +1966,7 @@
"metadata": {},
"outputs": [],
"source": [
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_emebdding']].to_json(\"copernicus_services_embeddings.json.gz\", compression=\"gzip\", orient=\"records\")"
"df[['Service_URL','Catalogue_URL','Title', 'Content', 'mean_embedding']].to_json(\"copernicus_services_embeddings.json.gz\", compression=\"gzip\", orient=\"records\")"
]
},
{
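The last four hunks fix a typo in the embedding column name (mean_emebdding becomes mean_embedding). The text_to_embedding function itself is defined earlier in the notebook and is not part of this diff; a mean-pooling helper in the same spirit could look like this (a sketch that assumes a sentence-transformers model, which may differ from what the notebook actually loads):

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not taken from the notebook

def text_to_embedding(text, chunk_size=1000):
    # Split long texts into chunks, embed each chunk, and mean-pool into one vector.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] or [""]
    vectors = model.encode(chunks)
    return np.mean(vectors, axis=0)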