Skip to content

Commit

Permalink
comment changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
terrianne-zhang committed Jul 18, 2024
1 parent 47ab43b commit b5bc1e2
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
8 changes: 4 additions & 4 deletions rag/scraper/Scraper_master/scrape_header.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -201,10 +201,10 @@ def scrape(self):
self.extract_unique_links(self.url,self.root,self.root_regex,self.root_filename,self.content_tags, self.delay)

if __name__ == "__main__":
url = "https://cs61a.org/"
root_regex = r"https://cs61a.org/"
root = "https://cs61a.org/"
root_filename = "61A_Website"
url = "https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html"
root_regex = r"^https://docs.opencv.org/4.x\/\w+\/\w+\/tutorial_py"
root = "https://docs.opencv.org/4.x/d6/d00/"
root_filename = "opencv"
content_tags = match_tags(url)

scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags)
Expand Down
6 changes: 3 additions & 3 deletions rag/scraper/Scraper_master/scrape_pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,6 +25,6 @@ def content_extract(self, filename, url, **kwargs):

# Example usage:
if __name__ == "__main__":
pdf_url = "https://ucb-ee106.github.io/106b-sp23site/assets/hw/Homework_5__Grasping.pdf" # Replace with the actual PDF URL
pdf_saver = ScrapePdf(pdf_url) # Specify the filename to save as
pdf_saver.content_extract("HW5", pdf_url) # Start the download process
pdf_url = "pdflink" # Replace with the actual PDF URL
pdf_saver = ScrapePdf(pdf_url)
pdf_saver.content_extract("filename", pdf_url) # Change filename to save as and start the download process

0 comments on commit b5bc1e2

Please sign in to comment.