-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
58013f9
commit 43813ec
Showing
1 changed file
with
376 additions
and
0 deletions.
There are no files selected for viewing
376 changes: 376 additions & 0 deletions
376
bin/API_Migration_scripts_readme/WebScraper_Java_Code.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,376 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "361542ff", | ||
"metadata": {}, | ||
"source": [ | ||
"# Python Scripts - Text extraction (webscraping) and matching contents of Java file(s)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8a2f0458", | ||
"metadata": {}, | ||
"source": [ | ||
"## Created for API Migration" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ab88f421", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#imports\n", | ||
"import os\n", | ||
"import csv\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import csv" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "aaa1ab25", | ||
"metadata": {}, | ||
"source": [ | ||
"## Web scraper - saves the changed classes to a CSV file (a JSON file of the links is also written, but it is not required)" | ||
] | ||
}, | ||
# URL from the Android website - index of changes only.
url = "https://developer.android.com/sdk/api_diff/33/changes/alldiffs_index_changes"

# Only hrefs that start with this path are kept.
CHANGES_PREFIX = "/sdk/api_diff/33/changes/"

# Header (and only column) of the generated CSV file.
CSV_COLUMN = "Package Class"


def filter_change_links(hrefs, prefix=CHANGES_PREFIX):
    """Return the hrefs that start with ``prefix``, in their original order.

    ``None`` and empty hrefs are ignored.
    """
    return [href for href in hrefs if href and href.startswith(prefix)]


def class_name_from_link(link):
    """Return the last path segment of ``link`` (the text after the final '/')."""
    return link.split("/")[-1]


def fetch_change_links(page_url=url, prefix=CHANGES_PREFIX, timeout=30):
    """Download ``page_url`` and return the hrefs of its anchors that start with ``prefix``.

    Raises ``requests.HTTPError`` on a non-2xx response instead of silently
    parsing an error page, and gives up after ``timeout`` seconds instead of
    hanging indefinitely.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = [a["href"] for a in soup.find_all("a") if a.has_attr("href")]
    return filter_change_links(hrefs, prefix)


def write_classes_csv(links, csv_path="classes.csv"):
    """Write one row per link to ``csv_path`` under the ``Package Class`` header.

    Technically no link is written: each row holds only the last path
    segment of the link.
    """
    # `with` guarantees the file is flushed and closed before a later cell reads it.
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=[CSV_COLUMN])
        csv_writer.writeheader()
        for link in links:
            csv_writer.writerow({CSV_COLUMN: class_name_from_link(link)})


def write_links_json(links, json_path="classes.json"):
    """Write ``{"links": links}`` to ``json_path`` (optional output, JSON not required)."""
    # Local import: the notebook's import cell does not import json.
    import json

    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump({"links": links}, json_file)


# In a notebook __name__ is "__main__", so this runs when the cell is executed,
# but not when the helpers are imported from elsewhere.
if __name__ == "__main__":
    links = fetch_change_links(url)
    write_classes_csv(links)
    write_links_json(links)
{ | ||
"cell_type": "markdown", | ||
"id": "355d9124", | ||
"metadata": {}, | ||
"source": [ | ||
"## Python script to compare the CSV against the Java files in a given directory\n", | ||
"##### Uses the CSV file produced by the web scraper. " | ||
] | ||
}, | ||
# Header cell written at the top of the scraped CSV; it is not a class name.
CSV_HEADER = "Package Class"


def _read_class_names(csv_file):
    """Return the non-empty first-column values of ``csv_file``, skipping the header row."""
    class_names = []
    with open(csv_file, "r", newline="", encoding="utf-8") as csv_handle:
        for row in csv.reader(csv_handle):
            if not row:
                # Blank line in the CSV: nothing to match.
                continue
            name = row[0].strip()
            # An empty string is a substring of every text, so it must be skipped too.
            if name and name != CSV_HEADER:
                class_names.append(name)
    return class_names


def _read_java_sources(directory):
    """Return ``{file name: file text}`` for the ``.java`` files directly inside ``directory``."""
    sources = {}
    for entry in sorted(os.listdir(directory)):
        path = os.path.join(directory, entry)
        # isfile() is checked on the joined path so sub-directories are skipped
        # regardless of the current working directory.
        if entry.endswith(".java") and os.path.isfile(path):
            with open(path, "r", encoding="utf-8", errors="replace") as java_handle:
                sources[entry] = java_handle.read()
    return sources


def match_csv_java(csv_file, directory):
    """Report which classes listed in a CSV file occur in the Java files of a directory.

    Args:
        csv_file: path of a CSV file whose first column holds the package/class
            names; a ``Package Class`` header row and blank rows are ignored.
        directory: path of the directory whose ``.java`` files are searched
            (sub-directories are not descended into).

    Returns:
        A list of ``(class_name, java_file_name)`` tuples, one per match, in
        CSV order and then file-name order. Each match is also printed as
        ``Found <class> in <file>`` so we know which package/class is in which file.
    """
    # Each Java file is read once up front instead of once per CSV row.
    java_sources = _read_java_sources(directory)
    matches = []
    for class_name in _read_class_names(csv_file):
        for java_file, text in java_sources.items():
            if class_name in text:
                print(f"Found {class_name} in {java_file}")
                matches.append((class_name, java_file))
    return matches
# CSV file written by the web-scraper cell (it creates "classes.csv" in the
# working directory). The first argument of match_csv_java must be a CSV
# file, not a directory.
CLASSES_CSV = "classes.csv"

# Absolute path of the Java source directory to scan for each section.
# NOTE(review): every entry is the same "/tmp" placeholder - replace each one
# with the real directory of that section before running.
JAVA_SOURCE_DIRECTORIES = {
    "1. data collection - primary folder": "/tmp",
    "1. data collection - location": "/tmp",
    "1. data collection - location -> actions": "/tmp",
    "1. data collection - sensors": "/tmp",
    "1. data collection - verification": "/tmp",
    "1. data collection - wrapper": "/tmp",
    "2. OPcodeauth": "/tmp",
    "3. server communication": "/tmp",
    "4. Serversync": "/tmp",
    "5. settings": "/tmp",
    "6. unified logger": "/tmp",
    "7. usercache": "/tmp",
}


def scan_sections(csv_file, section_directories, matcher):
    """Run ``matcher(csv_file, directory)`` once per section, in mapping order.

    Args:
        csv_file: path of the CSV file handed to every ``matcher`` call.
        section_directories: mapping of section label -> directory path.
        matcher: callable taking ``(csv_file, directory)``.

    Returns:
        A dict mapping each section label to whatever ``matcher`` returned for it.
    """
    results = {}
    for section, directory in section_directories.items():
        # The label replaces the per-cell markdown header, so the output of
        # each section can still be told apart.
        print(f"--- {section} ({directory}) ---")
        results[section] = matcher(csv_file, directory)
    return results


if __name__ == "__main__":
    scan_sections(CLASSES_CSV, JAVA_SOURCE_DIRECTORIES, match_csv_java)
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c0c0837d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |