diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..dc1a17f
Binary files /dev/null and b/.DS_Store differ
diff --git a/wayback_machine_scraper/__main__.py b/wayback_machine_scraper/__main__.py
index 3984dbb..79ec3f2 100644
--- a/wayback_machine_scraper/__main__.py
+++ b/wayback_machine_scraper/__main__.py
@@ -1,10 +1,10 @@
 import argparse
-from pkg_resources import get_distribution
+from importlib_metadata import distribution
 
 from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
-from .mirror_spider import MirrorSpider
+from mirror_spider import MirrorSpider
 
 
 def main():
@@ -21,7 +21,7 @@ def main():
         'USER_AGENT': (
             'Wayback Machine Scraper/{0} '
             '(+https://github.com/sangaline/scrapy-wayback-machine)'
-        ).format(get_distribution('wayback-machine-scraper').version),
+        ).format(distribution('wayback-machine-scraper').version),
         'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
         'DOWNLOADER_MIDDLEWARES': {
             'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
diff --git a/wayback_machine_scraper/scrape_nyt.py b/wayback_machine_scraper/scrape_nyt.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/scrape_nyt.py
@@ -0,0 +1,62 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_=class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get the snapshot timestamp embedded in the file name
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert a string datetime to a unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        snapshot_time = get_time(file_path)
+
+        for link in links:
+            # strip the scheme and 'www.' prefix from the link
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + snapshot_time + ' -t ' + snapshot_time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
diff --git a/wayback_machine_scraper/scrape_reuters.py b/wayback_machine_scraper/scrape_reuters.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/scrape_reuters.py
@@ -0,0 +1,62 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_=class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get the snapshot timestamp embedded in the file name
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert a string datetime to a unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        snapshot_time = get_time(file_path)
+
+        for link in links:
+            # strip the scheme and 'www.' prefix from the link
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + snapshot_time + ' -t ' + snapshot_time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
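
Note on the version lookup in __main__.py: distribution('wayback-machine-scraper').version uses the third-party importlib_metadata backport. On Python 3.8+ the standard library provides the same lookup via importlib.metadata, so a dependency-free variant (a sketch of an alternative, not what this patch does) would be:

    from importlib.metadata import version

    # Build the same USER_AGENT string as in the settings above, but with
    # the stdlib version lookup instead of the importlib_metadata backport.
    user_agent = (
        'Wayback Machine Scraper/{0} '
        '(+https://github.com/sangaline/scrapy-wayback-machine)'
    ).format(version('wayback-machine-scraper'))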
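
Note on the re-crawl step shared by scrape_nyt.py and scrape_reuters.py: the command is assembled by string concatenation and launched with shell=True, so an extracted href containing shell metacharacters would be interpreted by the shell. A minimal sketch of the same invocation as an argument list (the -f/-t/-a flags are the ones used in the scripts above; the helper name recrawl_link is illustrative, not part of the patch):

    import subprocess

    def recrawl_link(link, snapshot_time):
        # Strip everything up to and including 'www.', as the scripts do.
        target = link.split('www.')[-1]
        # Same flags as the shell_command string: -f/-t pin the crawl to a
        # single snapshot time, -a anchors the allowed-URL pattern at the
        # end of the target URL.
        argv = ['wayback-machine-scraper',
                '-f', snapshot_time,
                '-t', snapshot_time,
                '-a', target + '$',
                target]
        # Passing a list avoids the shell entirely, so the link text
        # cannot be shell-interpreted.
        return subprocess.Popen(argv)

For example, recrawl_link('nytimes.com/some-article.html', '20240128031617') mirrors one iteration of the inner loop above (the URL is a made-up placeholder).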