comment changes

augcog · Jul 18, 2024 · b5bc1e2
1 parent 47ab43b
commit b5bc1e2
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 7 deletions.
diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py
@@ -201,10 +201,10 @@ def scrape(self):
             self.extract_unique_links(self.url,self.root,self.root_regex,self.root_filename,self.content_tags, self.delay)
 
 if __name__ == "__main__":
-    url = "https://cs61a.org/"
-    root_regex = r"https://cs61a.org/"
-    root = "https://cs61a.org/"
-    root_filename = "61A_Website"
+    url = "https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html"
+    root_regex = r"^https://docs.opencv.org/4.x\/\w+\/\w+\/tutorial_py"
+    root = "https://docs.opencv.org/4.x/d6/d00/"
+    root_filename = "opencv"
     content_tags = match_tags(url)
 
     scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags)

diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py
@@ -25,6 +25,6 @@ def content_extract(self, filename, url, **kwargs):
 
 # Example usage:
 if __name__ == "__main__":
-    pdf_url = "https://ucb-ee106.github.io/106b-sp23site/assets/hw/Homework_5__Grasping.pdf"  # Replace with the actual PDF URL
-    pdf_saver = ScrapePdf(pdf_url)  # Specify the filename to save as
-    pdf_saver.content_extract("HW5", pdf_url) # Start the download process
+    pdf_url = "pdflink"  # Replace with the actual PDF URL
+    pdf_saver = ScrapePdf(pdf_url)
+    pdf_saver.content_extract("filename", pdf_url) # Replace "filename" with the name to save the PDF as; this call starts the download