From 47e52f57765e02c87566ad967ca8e167d628b7c0 Mon Sep 17 00:00:00 2001
From: CaptainStabs
Date: Tue, 10 Aug 2021 14:31:47 -0400
Subject: [PATCH] Accidentally used the common scripts instead of the
 base_scripts

---
 .../crimegraphics/crimegraphics_bulletin.py  |  90 ++-----------
 .../crimegraphics/crimegraphics_clery.py     | 120 ++++--------------
 2 files changed, 36 insertions(+), 174 deletions(-)

diff --git a/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py b/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py
index e2579892..cdda1ca0 100644
--- a/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py
+++ b/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_bulletin.py
@@ -1,87 +1,21 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
+import CG_configs as configs
 from pathlib import Path
 
-# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
+from common.base_scrapers import crimegraphics_bulletin
 
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# import data_parser from common/crimegraphics/utils/data_parser.py
-from crimegraphics.utils import data_parser
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# configs = {
-#     "url": "",
-#     "department_code": "",
-# }
-
-# Stats default to False
-def crimegraphics_bulletin(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-
-    # Automatically have the BulletinMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$BulletinMenu",
-        "__EVENTARGUMENT": "BulletinMenu",
-    }
-
-    # Initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our payload
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    # with open("html.html", 'wb') as output:
-    #     output.write(str(soup).encode('utf-8'))
-    #     output.close()
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
+configs = {
+    "url": "",
+    "department_code": "",
+}
 
-    search_start = function_timer(stats)
+save_dir = "./data/"
+data = []
 
-    table = soup.find("span", id="Bull")
-    # Send "table" to page_update to be hashed and compared.
-    page_update(table)
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
 
-    # Import the parser
-    data_parser(configs, save_dir, table)
+crimegraphics_bulletin(configs, save_dir)
diff --git a/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py b/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py
index 64d3f8cb..7c3ed70c 100644
--- a/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py
+++ b/setup_gui/Base_Scripts/Scrapers/crimegraphics/crimegraphics_clery.py
@@ -1,101 +1,29 @@
 import sys
 import os
-import requests
-import json
-from pathlib import Path
-from bs4 import BeautifulSoup
-import pandas as pd
-from tqdm import tqdm
-import time
-from datetime import date
+import CG_configs as configs
 from pathlib import Path
 
-# This is a hack that loads the root common folder like a module (without you expressly needing to install it).
-# I'm going to be honest, I have no clue why it uses parents[1] while the list_pdf scrapers use parents[3]
-p = Path(__file__).resolve().parents[1]
+p = Path(__file__).resolve().parents[5]
 sys.path.insert(1, str(p))
-
-# import hash_comparer, page_hasher, and page_update from common/utils/website_hasher/page_update.py
-from common.utils import hash_comparer, page_hasher, page_update
-
-# this function is used for gathering time stats
-def function_timer(stats):
-    if stats != False:
-        return time.perf_counter()
-
-
-# this function simply calculates and prints the difference between the end and start times
-def time_dif(stats, string, start, end):
-    if stats != False:
-        print(f"{string}: {end - start} seconds")
-
-
-# stats default to False
-def crimegraphics_clery(configs, save_dir, stats=False, configs_file=False):
-    if not configs_file:  # Default setting
-        department_code = configs["department_code"]
-        url = configs["url"]
-        list_header = configs["list_header"]
-    else:
-        department_code = configs.department_code
-        url = configs.url
-        list_header = configs.list_header
-
-    # automatically have the CLERYMenu clicked for daily crime data
-    payload = {
-        "MYAGCODE": configs.department_code,
-        "__EVENTTARGET": "MainMenu$CLERYMenu",
-        "__EVENTARGUMENT": "CLERYMenu",
-    }
-
-    # initialize "data" table (a table called data, not a datatable)
-    data = []
-
-    print("Receiving Data... Please wait...")
-
-    # used for stats, mark beginning of request
-    request_start = function_timer(stats)
-
-    # Send a POST request to the url with our payload
-    response = requests.request("POST", configs.url, data=payload)
-    request_end = function_timer(stats)
-    time_dif(stats, "Request Time", request_start, request_end)
-
-    print("Data received.")
-    parse_start = function_timer(stats)
-
-    # Parse the response using bs4
-    soup = BeautifulSoup(response.text, "html.parser")
-    parse_end = function_timer(stats)
-    time_dif(stats, "Parse time", parse_start, parse_end)
-
-    search_start = function_timer(stats)
-    # this website has a bunch of empty tables with the same name
-    # the 6th index has the data we need
-    table = soup.find_all("table", {"class": "ob_gBody"})[6]
-    search_end = function_timer(stats)
-    time_dif(stats, "Search time", search_start, search_end)
-
-    hash_start = function_timer(stats)
-    # Checks if the page has been updated
-    page_update(table)
-
-    hash_end = function_timer(stats)
-    time_dif(stats, "Hash time", hash_start, hash_end)
-
-    # Use BeautifulSoup4 (bs4)'s find_all method to find all html table rows (tr)
-    rows = table.find_all("tr")
-    for row in tqdm(rows):
-        # Use BeautifulSoup4 (bs4)'s find_all method to find all html tags for table data (td)
-        td = row.find_all("td")
-        table_data = []
-        for actual_data in td:
-            table_data.append(actual_data.get_text())
-        data.append(table_data)
-
-    date_name = date.today()
-    file_name = "_" + str(date_name).replace("-", "_")  # + "_"
-
-    dataframe = pd.DataFrame(data=data, columns=configs.list_header)
-
-    dataframe.to_csv(save_dir + configs.department_code + file_name + "_daily_bulletin")
+from common.base_scrapers import crimegraphics_scraper
+
+configs = {
+    "url": "",
+    "department_code": "",
+    "list_header": [
+        "ChargeDescription",
+        "CaseNum",
+        "ReportDate",
+        "OffenseDate",
+        "Location",
+        "ChargeDisposition",
+    ],
+}
+
+save_dir = "./data/"
+data = []
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+crimegraphics_scraper(configs, save_dir)