From 0269c305c01c9230a1cd0a6311376cafaffacc02 Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Sun, 4 Feb 2024 14:07:48 -0800
Subject: [PATCH 1/5] Fixed issues with wayback machine scraper

---
 wayback_machine_scraper/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wayback_machine_scraper/__main__.py b/wayback_machine_scraper/__main__.py
index 3984dbb..79ec3f2 100644
--- a/wayback_machine_scraper/__main__.py
+++ b/wayback_machine_scraper/__main__.py
@@ -1,10 +1,10 @@
 import argparse
 
-from pkg_resources import get_distribution
+from importlib_metadata import distribution
 from scrapy.crawler import CrawlerProcess
 from scrapy.settings import Settings
 
-from .mirror_spider import MirrorSpider
+from mirror_spider import MirrorSpider
 
 
 def main():
@@ -21,7 +21,7 @@ def main():
         'USER_AGENT': (
             'Wayback Machine Scraper/{0} '
             '(+https://github.com/sangaline/scrapy-wayback-machine)'
-        ).format(get_distribution('wayback-machine-scraper').version),
+        ).format(distribution('wayback-machine-scraper').version),
         'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
         'DOWNLOADER_MIDDLEWARES': {
             'scrapy_wayback_machine.WaybackMachineMiddleware': 5,

From 84f71f9403a7e1b8b1201bb1cbaaad5866cba1f1 Mon Sep 17 00:00:00 2001
From: houxiru <158221690+houxiru@users.noreply.github.com>
Date: Tue, 20 Feb 2024 16:48:02 -0800
Subject: [PATCH 2/5] Create get_url_bs4.py

---
 wayback_machine_scraper/get_url_bs4.py | 63 ++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 wayback_machine_scraper/get_url_bs4.py

diff --git a/wayback_machine_scraper/get_url_bs4.py b/wayback_machine_scraper/get_url_bs4.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/get_url_bs4.py
@@ -0,0 +1,63 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import time
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_ = class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get time range
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert string date time to unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        time = get_time(file_path)
+
+        for link in links:
+            # get the file name without prefix
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
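
A note on PATCH 2/5 before the next patch: get_url_bs4.py builds its shell command by string concatenation and runs it with shell=True, so any href containing spaces, quotes, or shell metacharacters breaks the command (or injects into it), and each crawl is launched with subprocess.Popen without waiting, so a snapshot with many links spawns an unbounded number of concurrent scrapy processes. The loop also rebinds the name `time` over the imported time module. A minimal sketch of a safer invocation, assuming only the CLI flags the script already uses (-f, -t, -a); the helper name is illustrative, not from the patch:

    import subprocess

    def scrape_link(link, snapshot_time):
        # An argument list needs no shell quoting, and snapshot_time
        # avoids shadowing the imported time module.
        subprocess.run(
            ['wayback-machine-scraper',
             '-f', snapshot_time,
             '-t', snapshot_time,
             '-a', link + '$',
             link],
            check=True,  # surface a non-zero exit instead of ignoring it
        )

subprocess.run waits for each crawl to finish, which keeps the process count bounded; fire-and-forget Popen is only safe if the caller throttles it.
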
From ef3e9085fc4b1faee70a0dc41929adf5d7fb7ecb Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:36:36 -0800
Subject: [PATCH 3/5] Added Cici's code to scrape NYT

---
 .DS_Store      | Bin 0 -> 10244 bytes
 nyt_scraper.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 nyt_scraper.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..dc1a17f89e1a08ba01a1d73ee2a64b83ffdd82da
GIT binary patch
literal 10244
[base85 payload truncated in extraction; the blob is macOS Finder metadata]

diff --git a/nyt_scraper.py b/nyt_scraper.py
new file mode 100644
index 0000000..927ad4d
--- /dev/null
+++ b/nyt_scraper.py
@@ -0,0 +1,67 @@
[hunk truncated in extraction; the 67 added lines are identical to the file deleted in PATCH 5/5 below]

From [commit hash truncated in extraction] Mon Sep 17 00:00:00 2001
From: [author line truncated in extraction]
Date: Fri, 23 Feb 2024 14:38:15 -0800
Subject: [PATCH 4/5] Renamed files for consistency

---
 .../{get_url_bs4.py => scrape_nyt.py}     |  0
 wayback_machine_scraper/scrape_reuters.py | 63 +++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 rename wayback_machine_scraper/{get_url_bs4.py => scrape_nyt.py} (100%)
 create mode 100644 wayback_machine_scraper/scrape_reuters.py

diff --git a/wayback_machine_scraper/get_url_bs4.py b/wayback_machine_scraper/scrape_nyt.py
similarity index 100%
rename from wayback_machine_scraper/get_url_bs4.py
rename to wayback_machine_scraper/scrape_nyt.py
diff --git a/wayback_machine_scraper/scrape_reuters.py b/wayback_machine_scraper/scrape_reuters.py
new file mode 100644
index 0000000..f938a1f
--- /dev/null
+++ b/wayback_machine_scraper/scrape_reuters.py
@@ -0,0 +1,63 @@
+from bs4 import BeautifulSoup
+import datetime
+import subprocess
+import time
+import os
+
+def extract_links(file_path, class_name):
+    # Open and read the .snapshot file
+    with open(file_path, 'r', encoding='utf-8') as file:
+        html_content = file.read()
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(html_content, 'lxml')
+
+    links = []
+
+    for link in soup.find_all('a', class_ = class_name):
+        href = link.get('href')
+        if href:
+            links.append(href)
+    return links
+
+
+# get time range
+def get_time(file_path):
+    file_name = file_path.split('/')[-1]
+    time_str = file_name.split('.')[0]
+
+    return time_str
+
+
+# convert string date time to unix timestamp
+def convert_to_unix(date_time_str):
+    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
+    timestamp = str(int(date_time_obj.timestamp()))
+    return timestamp
+
+
+if __name__ == '__main__':
+    # Path to your .snapshot files
+    dir_path = 'www.nytimes.com'
+    file_ls = []
+    for (dir_path, dir_names, file_names) in os.walk(dir_path):
+        # file_path = 'nytimes/20240128031617.snapshot'
+        for file_name in file_names:
+            if file_name.endswith('.snapshot') and file_name not in file_ls:
+                file_ls.append(file_name)
+
+        # # don't look inside any subdirectory
+        # break
+    print(file_ls)
+
+    for file_name in file_ls[1:]:
+        file_path = os.path.join(dir_path, file_name)
+        # extract links from the .snapshot file
+        links = extract_links(file_path, 'css-9mylee')
+        time = get_time(file_path)
+
+        for link in links:
+            # get the file name without prefix
+            link = link.split('www.')[-1]
+            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
+            subprocess.Popen(shell_command, shell=True)
\ No newline at end of file
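
A note on PATCH 4/5 before the final patch: scrape_nyt.py and scrape_reuters.py are byte-for-byte identical (both blobs carry index f938a1f), so the Reuters copy still walks www.nytimes.com and matches the NYT-specific css-9mylee anchor class. The script also reassigns dir_path inside the os.walk loop, so the later os.path.join(dir_path, file_name) can resolve a file against the wrong directory once subdirectories exist. A sketch of a shared helper both scripts could import instead of duplicating the file; the function name is illustrative, not from the patch:

    import os

    def collect_snapshots(root):
        """Return the full path of every .snapshot file under root, sorted."""
        paths = []
        for current_dir, _subdirs, file_names in os.walk(root):
            for file_name in file_names:
                if file_name.endswith('.snapshot'):
                    # Join against the directory the file was found in,
                    # not a name that os.walk keeps rebinding.
                    paths.append(os.path.join(current_dir, file_name))
        return sorted(paths)

Each site-specific script then shrinks to calling the helper with its own root and anchor class, e.g. collect_snapshots('www.nytimes.com') paired with extract_links(path, 'css-9mylee').
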
From 83ed8a3025fe4be5b6e37fb4bba9deb5d271b074 Mon Sep 17 00:00:00 2001
From: Anika Fuloria <62360723+anikafuloria@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:42:00 -0800
Subject: [PATCH 5/5] Delete nyt_scraper.py

---
 nyt_scraper.py | 67 --------------------------------------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 nyt_scraper.py

diff --git a/nyt_scraper.py b/nyt_scraper.py
deleted file mode 100644
index 927ad4d..0000000
--- a/nyt_scraper.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from bs4 import BeautifulSoup
-import datetime
-import subprocess
-import time
-import os
-
-def extract_links(file_path, class_name):
-    # Open and read the .snapshot file
-    with open(file_path, 'r', encoding='utf-8') as file:
-        html_content = file.read()
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, 'lxml')
-
-    links = []
-
-    for link in soup.find_all('a', class_ = class_name):
-        href = link.get('href')
-        if href:
-            links.append(href)
-    return links
-
-
-# get time range
-def get_time(file_path):
-    file_name = file_path.split('/')[-1]
-    time_str = file_name.split('.')[0]
-
-    return time_str
-
-
-# convert string date time to unix timestamp
-def convert_to_unix(date_time_str):
-    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y%m%d%H%M%S')
-    timestamp = str(int(date_time_obj.timestamp()))
-    return timestamp
-
-
-if __name__ == '__main__':
-    # Path to your .snapshot files
-    dir_path = '/Users/hou/GitHub/wayback-machine-scraper/www.nytimes.com'
-    file_ls = []
-    for (dir_path, dir_names, file_names) in os.walk(dir_path):
-        # file_path = 'nytimes/20240128031617.snapshot'
-        for file_name in file_names:
-            if file_name.endswith('.snapshot') and file_name not in file_ls:
-                file_ls.append(file_name)
-
-        # # don't look inside any subdirectory
-        # break
-    print(file_ls)
-
-    for file_name in file_ls[1:]:
-        file_path = os.path.join(dir_path, file_name)
-        # extract links from the .snapshot file
-        links = extract_links(file_path, 'css-9mylee')
-        time = get_time(file_path)
-        # print(links)
-
-        for link in links:
-            # get the file name without prefix
-            link = link.split('www.')[-1]
-            shell_command = 'wayback-machine-scraper -f ' + time + ' -t ' + time + ' -a "' + link + '$" ' + link
-            print(shell_command)
-            subprocess.Popen(shell_command, shell=True)
-
-        
\ No newline at end of file
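
A closing note on the scripts this series leaves behind: convert_to_unix is defined but never called, and the `for file_name in file_ls[1:]` loop silently drops one snapshot, an arbitrary one, since os.walk returns file names in no guaranteed order. For reference, a worked example of what the timestamp helpers compute, assuming a snapshot named as in the script's inline comment (20240128031617.snapshot):

    import datetime

    # get_time: a Wayback Machine snapshot name encodes its capture time.
    time_str = '20240128031617.snapshot'.split('.')[0]   # '20240128031617'

    # convert_to_unix parses it as a naive datetime and calls .timestamp(),
    # so the resulting epoch value depends on the machine's local timezone.
    dt = datetime.datetime.strptime(time_str, '%Y%m%d%H%M%S')
    print(dt)                   # 2024-01-28 03:16:17
    print(int(dt.timestamp()))  # unix seconds, local-time interpretation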