diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py index 53f7a78c..2fb613d0 100644 --- a/rag/scraper/Scraper_master/scrape_header.py +++ b/rag/scraper/Scraper_master/scrape_header.py @@ -201,10 +201,10 @@ def scrape(self): self.extract_unique_links(self.url,self.root,self.root_regex,self.root_filename,self.content_tags, self.delay) if __name__ == "__main__": - url = "https://cs61a.org/" - root_regex = r"https://cs61a.org/" - root = "https://cs61a.org/" - root_filename = "61A_Website" + url = "https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html" + root_regex = r"^https://docs\.opencv\.org/4\.x\/\w+\/\w+\/tutorial_py" # dots escaped so "." matches only a literal dot + root = "https://docs.opencv.org/4.x/d6/d00/" + root_filename = "opencv" content_tags = match_tags(url) scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags) diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py index d25b44e9..a41a6bb7 100644 --- a/rag/scraper/Scraper_master/scrape_pdf.py +++ b/rag/scraper/Scraper_master/scrape_pdf.py @@ -25,6 +25,6 @@ def content_extract(self, filename, url, **kwargs): # Example usage: if __name__ == "__main__": - pdf_url = "https://ucb-ee106.github.io/106b-sp23site/assets/hw/Homework_5__Grasping.pdf" # Replace with the actual PDF URL - pdf_saver = ScrapePdf(pdf_url) # Specify the filename to save as - pdf_saver.content_extract("HW5", pdf_url) # Start the download process + pdf_url = "pdflink" # Replace with the actual PDF URL + pdf_saver = ScrapePdf(pdf_url) + pdf_saver.content_extract("filename", pdf_url) # Change filename to save as and start the download process