From 0269c305c01c9230a1cd0a6311376cafaffacc02 Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Sun, 4 Feb 2024 14:07:48 -0800
Subject: [PATCH 1/5] Fixed issues with wayback machine scraper

---
 wayback_machine_scraper/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wayback_machine_scraper/__main__.py b/wayback_machine_scraper/__main__.py
index 3984dbb..79ec3f2 100644
--- a/wayback_machine_scraper/__main__.py
+++ b/wayback_machine_scraper/__main__.py
@@ -1,10 +1,10 @@
 import argparse
 
-from pkg_resources import get_distribution
+from importlib_metadata import distribution
 from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
-from .mirror_spider import MirrorSpider
+from mirror_spider import MirrorSpider
 
 
 def main():
@@ -21,7 +21,7 @@ def main():
         'USER_AGENT': (
             'Wayback Machine Scraper/{0} '
             '(+https://github.com/sangaline/scrapy-wayback-machine)'
-        ).format(get_distribution('wayback-machine-scraper').version),
+        ).format(distribution('wayback-machine-scraper').version),
         'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
         'DOWNLOADER_MIDDLEWARES': {
             'scrapy_wayback_machine.WaybackMachineMiddleware': 5,

From 84f71f9403a7e1b8b1201bb1cbaaad5866cba1f1 Mon Sep 17 00:00:00 2001
From: houxiru <158221690+houxiru@users.noreply.github.com>
Date: Tue, 20 Feb 2024 16:48:02 -0800
Subject: [PATCH 2/5] Create get_url_bs4.py

---
 wayback_machine_scraper/get_url_bs4.py | 63 ++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 wayback_machine_scraper/get_url_bs4.py

diff --git a/wayback_machine_scraper/get_url_bs4.py b/wayback_machine_scraper/get_url_bs4.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/get_url_bs4.py
@@ -0,0 +1,63 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import time
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_ = class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get time range
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert string date time to unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        time = get_time(file_path)
+
+        for link in links:
+            # get the file name without prefix
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
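
A note on PATCH 2/5 before the next patch: get_url_bs4.py builds its shell command by string concatenation and runs it with shell=True, so any href containing spaces, quotes, or shell metacharacters breaks the command (or injects into it), and each crawl is launched with subprocess.Popen without waiting, so a snapshot with many links spawns an unbounded number of concurrent scrapy processes. The loop also rebinds the name `time` over the imported time module. A minimal sketch of a safer invocation, assuming only the CLI flags the script already uses (-f, -t, -a); the helper name is illustrative, not from the patch:

    import subprocess

    def scrape_link(link, snapshot_time):
        # An argument list needs no shell quoting, and snapshot_time
        # avoids shadowing the imported time module.
        subprocess.run(
            ['wayback-machine-scraper',
             '-f', snapshot_time,
             '-t', snapshot_time,
             '-a', link + '$',
             link],
            check=True,  # surface a non-zero exit instead of ignoring it
        )

subprocess.run waits for each crawl to finish, which keeps the process count bounded; fire-and-forget Popen is only safe if the caller throttles it.
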
From ef3e9085fc4b1faee70a0dc41929adf5d7fb7ecb Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:36:36 -0800
Subject: [PATCH 3/5] Added Cici's code to scrape NYT

---
 .DS_Store      | Bin 0 -> 10244 bytes
 nyt_scraper.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 nyt_scraper.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..dc1a17f89e1a08ba01a1d73ee2a64b83ffdd82da
GIT binary patch
literal 10244
[base85 payload truncated in extraction; the blob is macOS Finder metadata]

diff --git a/nyt_scraper.py b/nyt_scraper.py
new file mode 100644
index 0000000..927ad4d
--- /dev/null
+++ b/nyt_scraper.py
@@ -0,0 +1,67 @@
[hunk truncated in extraction; the 67 added lines are identical to the file deleted in PATCH 5/5 below]

From [commit hash truncated in extraction] Mon Sep 17 00:00:00 2001
From: [author line truncated in extraction]
Date: Fri, 23 Feb 2024 14:38:15 -0800
Subject: [PATCH 4/5] Renamed files for consistency

---
 .../{get_url_bs4.py => scrape_nyt.py}     |  0
 wayback_machine_scraper/scrape_reuters.py | 63 +++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 rename wayback_machine_scraper/{get_url_bs4.py => scrape_nyt.py} (100%)
 create mode 100644 wayback_machine_scraper/scrape_reuters.py

diff --git a/wayback_machine_scraper/get_url_bs4.py b/wayback_machine_scraper/scrape_nyt.py
similarity index 100%
rename from wayback_machine_scraper/get_url_bs4.py
rename to wayback_machine_scraper/scrape_nyt.py
diff --git a/wayback_machine_scraper/scrape_reuters.py b/wayback_machine_scraper/scrape_reuters.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/scrape_reuters.py
@@ -0,0 +1,63 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import time
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_ = class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get time range
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert string date time to unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        time = get_time(file_path)
+
+        for link in links:
+            # get the file name without prefix
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
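
A note on PATCH 4/5 before the final patch: scrape_nyt.py and scrape_reuters.py are byte-for-byte identical (both blobs carry index f938a1f), so the Reuters copy still walks www.nytimes.com and matches the NYT-specific css-9mylee anchor class. The script also reassigns dir_path inside the os.walk loop, so the later os.path.join(dir_path, file_name) can resolve a file against the wrong directory once subdirectories exist. A sketch of a shared helper both scripts could import instead of duplicating the file; the function name is illustrative, not from the patch:

    import os

    def collect_snapshots(root):
        """Return the full path of every .snapshot file under root, sorted."""
        paths = []
        for current_dir, _subdirs, file_names in os.walk(root):
            for file_name in file_names:
                if file_name.endswith('.snapshot'):
                    # Join against the directory the file was found in,
                    # not a name that os.walk keeps rebinding.
                    paths.append(os.path.join(current_dir, file_name))
        return sorted(paths)

Each site-specific script then shrinks to calling the helper with its own root and anchor class, e.g. collect_snapshots('www.nytimes.com') paired with extract_links(path, 'css-9mylee').
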
From 83ed8a3025fe4be5b6e37fb4bba9deb5d271b074 Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:42:00 -0800
Subject: [PATCH 5/5] Delete nyt_scraper.py

---
 nyt_scraper.py | 67 --------------------------------------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 nyt_scraper.py

diff --git a/nyt_scraper.py b/nyt_scraper.py
deleted file mode 100644
index 927ad4d..0000000
--- a/nyt_scraper.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from bs4 import BeautifulSoup
-import datetime
-import subprocess
-import time
-import os
-
-def extract_links(file_path, class_name):
-    # Open and read the .snapshot file
-    with open(file_path, 'r', encoding='utf-8') as file:
-        html_content = file.read()
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'lxml')
-
-    links = []
-
-    for link in soup.find_all('a', class_ = class_name):
-        href = link.get('href')
-        if href:
-            links.append(href)
-    return links
-
-
-# get time range
-def get_time(file_path):
-    file_name = file_path.split('/')[-1]
-    time_str = file_name.split('.')[0]
-
-    return time_str
-
-
-# convert string date time to unix timestamp
-def convert_to_unix(date_time_str):
-    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
-    timestamp = str(int(date_time_obj.timestamp()))
-    return timestamp
-
-
-if __name__ == '__main__':
-    # Path to your .snapshot files
-    dir_path = '/Users/hou/GitHub/wayback-machine-scraper/www.nytimes.com'
-    file_ls = []
-    for (dir_path, dir_names, file_names) in os.walk(dir_path):
-        # file_path = 'nytimes/20240128031617.snapshot'
-        for file_name in file_names:
-            if file_name.endswith('.snapshot') and file_name not in file_ls:
-                file_ls.append(file_name)
-
-        # # don't look inside any subdirectory
-        # break
-    print(file_ls)
-
-    for file_name in file_ls[1:]:
-        file_path = os.path.join(dir_path, file_name)
-        # extract links from the .snapshot file
-        links = extract_links(file_path, 'css-9mylee')
-        time = get_time(file_path)
-        # print(links)
-
-        for link in links:
-            # get the file name without prefix
-            link = link.split('www.')[-1]
-            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
-            print(shell_command)
-            subprocess.Popen(shell_command, shell=True)
-
-        
\ No newline at end of file
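
A closing note on the scripts this series leaves behind: convert_to_unix is defined but never called, and the `for file_name in file_ls[1:]` loop silently drops one snapshot, an arbitrary one, since os.walk returns file names in no guaranteed order. For reference, a worked example of what the timestamp helpers compute, assuming a snapshot named as in the script's inline comment (20240128031617.snapshot):

    import datetime

    # get_time: a Wayback Machine snapshot name encodes its capture time.
    time_str = '20240128031617.snapshot'.split('.')[0]   # '20240128031617'

    # convert_to_unix parses it as a naive datetime and calls .timestamp(),
    # so the resulting epoch value depends on the machine's local timezone.
    dt = datetime.datetime.strptime(time_str, '%Y%m%d%H%M%S')
    print(dt)                   # 2024-01-28 03:16:17
    print(int(dt.timestamp()))  # unix seconds, local-time interpretation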