Binary file added .DS_Store
Binary file not shown.
6 changes: 3 additions & 3 deletions wayback_machine_scraper/__main__.py
@@ -1,10 +1,10 @@
 import argparse
-from pkg_resources import get_distribution
+from importlib_metadata import distribution
 
 from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
-from .mirror_spider import MirrorSpider
+from mirror_spider import MirrorSpider
 
 
 def main():
@@ -21,7 +21,7 @@ def main():
         'USER_AGENT': (
             'Wayback Machine Scraper/{0} '
             '(+https://github.com/sangaline/scrapy-wayback-machine)'
-        ).format(get_distribution('wayback-machine-scraper').version),
+        ).format(distribution('wayback-machine-scraper').version),
         'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
         'DOWNLOADER_MIDDLEWARES': {
             'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
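This swaps setuptools' deprecated pkg_resources runtime API for the importlib_metadata backport. A minimal sketch of the lookup the new code performs, assuming the backport is installed (on Python 3.8+ the stdlib importlib.metadata exposes the same interface):

from importlib_metadata import distribution, version

dist = distribution('wayback-machine-scraper')
print(dist.version)                         # the installed version string
print(version('wayback-machine-scraper'))   # shorthand for the same lookup

Unlike pkg_resources, importing importlib_metadata does not scan every installed distribution at import time, which is the usual motivation for this migration.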
63 changes: 63 additions & 0 deletions wayback_machine_scraper/scrape_nyt.py
@@ -0,0 +1,63 @@
from bs4 import BeautifulSoup
import datetime
import subprocess
import os

def extract_links(file_path, class_name):
    # Open and read the .snapshot file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')

    links = []
    for link in soup.find_all('a', class_=class_name):
        href = link.get('href')
        if href:
            links.append(href)
    return links


# the snapshot filename is its Wayback Machine timestamp,
# e.g. 20240128031617.snapshot
def get_time(file_path):
    file_name = os.path.basename(file_path)
    time_str = file_name.split('.')[0]
    return time_str


# convert a YYYYmmddHHMMSS timestamp string to a unix timestamp
def convert_to_unix(date_time_str):
    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
    timestamp = str(int(date_time_obj.timestamp()))
    return timestamp


if __name__ == '__main__':
    # Path to your .snapshot files
    root_dir = 'www.nytimes.com'
    file_ls = []
    seen_names = set()
    for dir_path, dir_names, file_names in os.walk(root_dir):
        for file_name in file_names:
            # keep full paths, but deduplicate by basename so the same
            # snapshot mirrored in several subdirectories is only handled once
            if file_name.endswith('.snapshot') and file_name not in seen_names:
                seen_names.add(file_name)
                file_ls.append(os.path.join(dir_path, file_name))
    print(file_ls)

    # note: the first collected snapshot is skipped
    for file_path in file_ls[1:]:
        # extract article links from the .snapshot file
        links = extract_links(file_path, 'css-9mylee')
        snapshot_time = get_time(file_path)

        for link in links:
            # strip the scheme and 'www.' prefix so the link doubles as the
            # output directory name
            link = link.split('www.')[-1]
            # pin the crawl to this snapshot's timestamp and anchor the URL
            # pattern with '$' so only the article page itself is fetched
            subprocess.Popen([
                'wayback-machine-scraper',
                '-f', snapshot_time,
                '-t', snapshot_time,
                '-a', link + '$',
                link,
            ])
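One subprocess is launched per extracted link without waiting, so a snapshot with many article links fans out into that many concurrent crawls against web.archive.org. A small, hypothetical throttling helper (not part of this PR; run_batched and max_procs are illustrative names) that caps the fan-out:

import subprocess

def run_batched(commands, max_procs=4):
    """Run each command as a subprocess, at most max_procs at a time."""
    running = []
    for command in commands:
        running.append(subprocess.Popen(command))
        if len(running) >= max_procs:
            # block on the oldest process before starting another
            running.pop(0).wait()
    for proc in running:
        proc.wait()

The per-link argument lists built in the loop above could be collected into one list and passed to run_batched in a single call instead of invoking subprocess.Popen directly.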
63 changes: 63 additions & 0 deletions wayback_machine_scraper/scrape_reuters.py
@@ -0,0 +1,63 @@
from bs4 import BeautifulSoup
import datetime
import subprocess
import os

def extract_links(file_path, class_name):
    # Open and read the .snapshot file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'lxml')

    links = []
    for link in soup.find_all('a', class_=class_name):
        href = link.get('href')
        if href:
            links.append(href)
    return links


# the snapshot filename is its Wayback Machine timestamp,
# e.g. 20240128031617.snapshot
def get_time(file_path):
    file_name = os.path.basename(file_path)
    time_str = file_name.split('.')[0]
    return time_str


# convert a YYYYmmddHHMMSS timestamp string to a unix timestamp
def convert_to_unix(date_time_str):
    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
    timestamp = str(int(date_time_obj.timestamp()))
    return timestamp


if __name__ == '__main__':
    # Path to the Reuters .snapshot files; 'www.reuters.com' is an
    # assumption here (the committed file reused 'www.nytimes.com')
    root_dir = 'www.reuters.com'
    file_ls = []
    seen_names = set()
    for dir_path, dir_names, file_names in os.walk(root_dir):
        for file_name in file_names:
            # keep full paths, but deduplicate by basename so the same
            # snapshot mirrored in several subdirectories is only handled once
            if file_name.endswith('.snapshot') and file_name not in seen_names:
                seen_names.add(file_name)
                file_ls.append(os.path.join(dir_path, file_name))
    print(file_ls)

    # note: the first collected snapshot is skipped
    for file_path in file_ls[1:]:
        # NOTE: 'css-9mylee' is the NYT anchor class; Reuters snapshots
        # likely use a different class name
        links = extract_links(file_path, 'css-9mylee')
        snapshot_time = get_time(file_path)

        for link in links:
            # strip the scheme and 'www.' prefix so the link doubles as the
            # output directory name
            link = link.split('www.')[-1]
            # pin the crawl to this snapshot's timestamp and anchor the URL
            # pattern with '$' so only the article page itself is fetched
            subprocess.Popen([
                'wayback-machine-scraper',
                '-f', snapshot_time,
                '-t', snapshot_time,
                '-a', link + '$',
                link,
            ])
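scrape_reuters.py is otherwise line-for-line identical to scrape_nyt.py, so the shared logic could be factored into one driver parameterized by site. A hypothetical consolidation sketch (not in this PR; the module layout, function names, and the Reuters class name are assumptions):

import os
import subprocess

from scrape_nyt import extract_links, get_time  # reuse the helpers above

SITES = {
    'nyt': ('www.nytimes.com', 'css-9mylee'),
    'reuters': ('www.reuters.com', 'css-9mylee'),  # Reuters class assumed
}

def scrape_site(root_dir, link_class):
    # re-scrape every article linked from each snapshot under root_dir
    for dir_path, _dir_names, file_names in os.walk(root_dir):
        for file_name in file_names:
            if not file_name.endswith('.snapshot'):
                continue
            file_path = os.path.join(dir_path, file_name)
            snapshot_time = get_time(file_path)
            for link in extract_links(file_path, link_class):
                link = link.split('www.')[-1]
                subprocess.Popen([
                    'wayback-machine-scraper',
                    '-f', snapshot_time, '-t', snapshot_time,
                    '-a', link + '$', link,
                ])

if __name__ == '__main__':
    for root_dir, link_class in SITES.values():
        scrape_site(root_dir, link_class)

This keeps a single copy of the parsing and subprocess logic, so a fix (or a new site) only has to land in one place.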