Skip to content

Commit

Permalink
ROS URL fixes and changes to requirements.txt (#64)
Browse files Browse the repository at this point in the history
* updated readme

* fixed video scraper, pdf scraper, updates to general scraper

* comment changes

* changes to requirements.txt and scrape_header URL parsing
  • Loading branch information
terrianne-zhang authored Jul 19, 2024
1 parent 5f48a72 commit 010fdff
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ openai_whisper==20231117
packaging==24.0
pytest==8.1.1
python-dotenv==1.0.1
pytube==15.0.0
pytubefix==6.3.3
PyYAML==6.0.1
Requests==2.31.0
rst_to_myst==0.4.0
Expand Down
15 changes: 8 additions & 7 deletions rag/scraper/Scraper_master/scrape_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import time
import re
from termcolor import colored
from urllib.parse import urljoin
from urllib.parse import urljoin, unquote
from markdownify import markdownify as md
from rag.scraper.Scraper_master.base_scraper import BaseScraper
import yaml
Expand Down Expand Up @@ -70,6 +70,7 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
link = link[:-1]
filename = link.split('/')[-1]
filename = filename.split('.')[0]
filename = unquote(filename).replace(' ', '_')
cur_dir = os.getcwd()
create_and_enter_dir(filename)
# if not os.path.exists(filename):
Expand Down Expand Up @@ -227,13 +228,13 @@ def run_tasks(yaml_file):
scrapper.scrape()

if __name__ == "__main__":
    # Example configuration for a one-off scrape of the ROS tutorials wiki,
    # kept here for reference (the live entry point is the YAML task runner below).
    # url = "https://wiki.ros.org/ROS/Tutorials/"
    # root_regex = r"^https://wiki.ros.org/ROS/Tutorials/"
    # root = "https://wiki.ros.org/ROS/Tutorials/"
    # root_filename = "ROS"
    #
    # content_tags = match_tags(url)
    #
    # scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags)
    # scrapper.scrape()

    # Drive the scrape from a task-definition YAML file.
    run_tasks('106b_task.yaml')

0 comments on commit 010fdff

Please sign in to comment.