Skip to content

Commit

Permalink
ROS URL fixes and changes to requirements.txt (#64)
Browse files Browse the repository at this point in the history
* updated readme

* fixed video scraper, pdf scraper, updates to general scraper

* comment changes

* changes to requirements.txt and scrape_header URL parsing
  • Loading branch information
terrianne-zhang authored Jul 19, 2024
1 parent 5f48a72 commit 010fdff
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ openai_whisper==20231117
packaging==24.0
pytest==8.1.1
python-dotenv==1.0.1
pytube==15.0.0
pytubefix==6.3.3
PyYAML==6.0.1
Requests==2.31.0
rst_to_myst==0.4.0
Expand Down
15 changes: 8 additions & 7 deletions rag/scraper/Scraper_master/scrape_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import time
import re
from termcolor import colored
from urllib.parse import urljoin
from urllib.parse import urljoin, unquote
from markdownify import markdownify as md
from rag.scraper.Scraper_master.base_scraper import BaseScraper
import yaml
Expand Down Expand Up @@ -70,6 +70,7 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
link = link[:-1]
filename = link.split('/')[-1]
filename = filename.split('.')[0]
filename = unquote(filename).replace(' ', '_')
cur_dir = os.getcwd()
create_and_enter_dir(filename)
# if not os.path.exists(filename):
Expand Down Expand Up @@ -227,13 +228,13 @@ def run_tasks(yaml_file):
scrapper.scrape()

if __name__ == "__main__":
    # Example configuration for a one-off scrape of the ROS tutorials wiki,
    # kept here for reference (the live entry point is the YAML task runner below).
    # url = "https://wiki.ros.org/ROS/Tutorials/"
    # root_regex = r"^https://wiki.ros.org/ROS/Tutorials/"
    # root = "https://wiki.ros.org/ROS/Tutorials/"
    # root_filename = "ROS"
    #
    # content_tags = match_tags(url)
    #
    # scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags)
    # scrapper.scrape()

    # Drive the scrape from a task-definition YAML file.
    run_tasks('106b_task.yaml')

0 comments on commit 010fdff

Please sign in to comment.