From 4796a4edb0cabd08485a16752de9655cbe761adb Mon Sep 17 00:00:00 2001
From: gmanhas12
Date: Thu, 24 Apr 2025 13:03:23 -0700
Subject: [PATCH 1/5] Add technical documentation to README (task 17)

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index f6c551f..3362b30 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,27 @@
 # Cheaper.com
 Initial Landing page![Initial Landing page](https://github.com/user-attachments/assets/7d2a1c73-0d16-4965-a2f8-4402ef267ff4)
+
+
+
+
+## Technical documentation: how to run the application
+
+### How to run
+
+- To run the scraper, execute the main.py script with the command
+
+python main.py
+
+- Make sure you are in the src directory (the directory that contains main.py) when you run the command.
+
+### Where is the entry point?
+
+- The entry point is the main() function in main.py, specifically this block of code:
+if __name__ == "__main__":
+    main()
+
+
+### Which file needs to be located, and which variables need to change to scrape another website?
+
+- To scrape another website, locate main.py and change the "scraper" and "pages" variables to the new website and its URL paths. Also make sure the website allows scraping.

From a3b373e95d7f1868af84688f6675764bf42396b5 Mon Sep 17 00:00:00 2001
From: gmanhas12
Date: Thu, 24 Apr 2025 17:38:23 -0700
Subject: [PATCH 2/5] Add caching using functools.lru_cache

Caching uses the built-in functools.lru_cache and has been tested; a test
function is left commented out in main. Some sys/os path imports were added
because the folders were not being found at runtime; they are likely
environment-specific and can be removed if not needed.

---
 README.md                                     |  4 +-
 .../__pycache__/base_scraper.cpython-39.pyc   | Bin 0 -> 1964 bytes
 webscraper/output.json                        | 24 ++++++
 webscraper/src/Cheaper_Scraper.py             | 75 ++++++++++++------
 .../Cheaper_Scraper.cpython-39.pyc            | Bin 1939 -> 3488 bytes
 .../src/__pycache__/__init__.cpython-39.pyc   | Bin 0 -> 158 bytes
 .../src/__pycache__/main.cpython-39.pyc       | Bin 0 -> 878 bytes
 .../__pycache__/robot_check.cpython-39.pyc    | Bin 760 -> 778 bytes
 webscraper/src/main.py                        | 29 ++++++-
 9 files changed, 104 insertions(+), 28 deletions(-)
 create mode 100644 webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc
 create mode 100644 webscraper/output.json
 create mode 100644 webscraper/src/__pycache__/__init__.cpython-39.pyc
 create mode 100644 webscraper/src/__pycache__/main.cpython-39.pyc

diff --git a/README.md b/README.md
index 3362b30..e4a5606 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/
 - To run the scraper, execute the main.py script with the command
 
-python main.py
+python src/main.py
 
-- Make sure you are in the src directory (the directory that contains main.py) when you run the command.
+- Make sure you are in the webscraper directory when you run the command.
 
 ### Where is the entry point?
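For reference alongside the README instructions above, here is a minimal sketch of what the entry-point block in src/main.py looks like conceptually. The exact file contents are not reproduced in this patch; the base URL and page paths below are only example values (books.toscrape.com, as used elsewhere in this series).

```python
# Hypothetical sketch of the entry point in src/main.py; the real file may differ.
# CheaperScraper and its scrape() method come from this patch series.
from Cheaper_Scraper import CheaperScraper


def main():
    # Change these two variables to scrape a different website
    # (after checking that its robots.txt allows scraping).
    scraper = CheaperScraper("https://books.toscrape.com")
    pages = ["/", "/catalogue/page-2.html"]  # example URL paths

    results = scraper.scrape(pages)
    print(results)


if __name__ == "__main__":
    main()
```

Swapping the base URL and the pages list (and confirming the target site allows scraping) is all that is needed to point the scraper at another website.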
diff --git a/webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc b/webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd1704d439473037f51337091f46e993989391f5 GIT binary patch literal 1964 zcma)7OK;pZ5Z2qW*6Zfs7U-b>uL~4D7mJ`x>>y47cU$Cf5f%`*B4rWKDxwD7x~4*t71wLdDyNf?9fYDRD$Ycm`$U`JpnDS^x{=GP#FVQWvd`3%A zY?e>!v@O9SwY(7edAnMsImeVN*L<}_ZVBbKG$3p@PMb{^CW!!?lLYHYIY6w#>yxva zoSyCkpkTd*>Re7hCN+{M4FlJAW$@E8uG5@A+!{P1nXF4fiE4N%GLb_1biP;OP_R!L z<_kGQF+=48YM9`Gd10#Tf9Zez>X*yG7piut6d6%%nsTMGw)mbb>WY&Z>P@bN;)BM8 z!G7k*Da0!8AsLb(kAdxc)(A2By$QN`Iuw1KT03_{zJ;P z(4r}@Gr;}5fHIPEvWs$8VCJ3n&twSi%UEH{#%?V)q@+!Qox None: """Initialize the scraper with base parameters. - + Args: base_url: The base URL to scrape user_agent: User agent string to identify the scraper delay: Time in seconds to wait between requests """ - + self.base_url = base_url.rstrip('/') self.delay = delay self.user_agent = user_agent + #initialize session self.session = requests.Session() self.session.headers.update({"User-Agent": self.user_agent}) - # robot logic checks if there are instances not able to be + + # robot logic checks if there are instances not able to be self.robots = RoboCheck(base_url, user_agent) + + @staticmethod + @lru_cache(maxsize=128) # cache up to 128 unique URLs + def _cached_get(url: str, user_agent: str) -> Optional[str]: + print(f"[HTTP Request] Fetching from web: {url}") # <== ADD THIS + headers = {"User-Agent": user_agent} + try: + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + return response.text + except requests.RequestException as e: + logging.error(f"Error fetching {url}: {e}") + return None + + def fetch(self, path: str = "/") -> Optional[str]: """Fetch content from a specific path. - + Args: path: The URL path to fetch - + Returns: HTML content as string if successful, None otherwise """ @@ -42,43 +65,44 @@ def fetch(self, path: str = "/") -> Optional[str]: logging.warning(f"Disallowed by robots.txt: {path}") return None + url = self.base_url + path - - try: - response = self.session.get(url, timeout=10) - response.raise_for_status() - time.sleep(self.delay) # delay to simulate a user - return response.text - except requests.RequestException as e: - logging.error(f"Error fetching {url}: {e}") - return None - + cached_before = self._cached_get.cache_info().hits + html = self._cached_get(url, self.user_agent) + cached_after = self._cached_get.cache_info().hits + + if cached_after == cached_before: # No cache hit, so it was fetched + time.sleep(self.delay) + + return html + def parse(self, html: str) -> List[str]: """Parse HTML content. - + Args: html: The HTML content to parse - + Returns: List of parsed items from the HTML """ soup = BeautifulSoup(html, "html.parser") results = [] - + for book in soup.find_all("article", class_="product_pod"): title = book.h3.a["title"] results.append(title) - + return results - - + + + def scrape(self, paths: List[str]) -> Dict[str, List[str]]: """Scrape multiple paths. 
- + Args: paths: List of URL paths to scrape - + Returns: Dictionary mapping paths to their parsed results """ @@ -88,4 +112,5 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]: html = self.fetch(path) if html: results[path] = self.parse(html) - return results \ No newline at end of file + return results + diff --git a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc index ac8006a8cb604891a54e4c1b0a883ee5d364f6d7..2cdded46a744febff08c2cf4452bb7cfa41856f1 100644 GIT binary patch literal 3488 zcma)9OK;rP73Sq@I1fE6%aU!h01>o72T5e7DY_7h!ml`P;VOaUASkPVpm^_SIOYt; z_uhf50aX-`w*k6tU0}2+(Ef<7`wt3m+eP6&(1ln1&gG0OD*v>f%a-*Ys+@nCsNBY@{szM>&T=cNe_6y(w{v^wL=LOl+{i_nlY2uy@`sJ6 zF$|($-q(zpW_~MbiRPd^>xdTYlCjIiu87rB#@$o!Fy@7D9;p8tJ-s0^S?g=|#@X;pv z>Yk;SHeco|nAt*G+l*dC+ac|b7sL~pSFMQ>GENS}NLL;gd2&?wQs{{srEWdJSSr(d z53hOvQ?eOIa>`08W~Bpbmo}`U-I;f4q2!m=z#lYb0Z045$qdxGS+deTp^+_fYtzcY+y|SSQvr(u-T8y}& zsfS6X!+oI-g&2iWJevrm)!E$F?8fUBZTZO^^8HS?4ZqXAW7&awC1TW)OE6Wim>V^A zg;HR0)KGPyY5;N$$5j4g%oDh%K{I>_Mr#F}T_y5fy&$b+R{gsrucG|=H}|&p=m2W( zK{CwpqrLk@IvL_DYVV6oKb-9EAqt34dxv6wj>?{r>E4`Y@tkN|wuKz-f{(fsSUGuJQPU83-7~vghW*xiC zn*K!lvj+EQCGM9l)^kagzOjA>u{(DE0IA!Zmu^}&Q9`&icBUWv{^64+UxklrWWNm` z2%YwcHa%GkL-=evl;nB46^mOHODGM3)7L%s-cGwLnZw%&Xc^P9X z=RoLaRVEoiwO7bk=|oRdZ&f2N4i2z?&YKumD-nPZxBh1 zi;)r)6NUn{xkh`nYtiE|_WT(d)EbQCEVF>wY!xUT@FX&sY(cr*Xh8Z>~VHStf;KzJ6{P@uE&W22)N^y%S~-#$7!Num~7ll4M1NmIl%lGw-LPB9W;0X>w5fd6d8 z>Gk`WO7gro6g=EN3hO}M($DpFxYO;(4fr)`rpYKar;;B~y>Xbx5r|Riz)#UswG3>C z*en^Js6fMmG-xr{yz+^qlqt3iB=|O@nQp+<( z`6VVRqM_C5fJ&~@VjZ^3I(C3}x_*IHW(s2FBaD9sm2P2(Mmpl`#6GqUEKtNAFkT10 zec$@_I%wh!?196*Yt}JHGIL9h$noq)C1p;3`j4*?i9B;2nfIXsUA?Bm`I$rsWUOMYGg)E>)%Pf{I^3*KdspXP~D zapmbu=K@MVG8zTFY{X-*C$jsWRV=CeWIPrlz5tn=9k~i0P*np`G0C-x-2I|>DnB6@ zC~2Hi49}e5OEk{)zQtU74WRx2oMtGb0dsU;eIHO#_A_nRLM5Z9X8>#TR?{C5IL zAD#52DUZS-d`ro|EKi&Ol$l&WJa_Ne%Aoqk+0bNLh+cvwp#4K&ujvk`$)C4qw2eTm^qH@%cs_a75&bvWP zjbvXD9F;pLvQgFGnWU@H<$D=jI9k1P_uiKIfUKP*C1tIsZDcl1>3@X!3#-|ij8a_` mxvK9?7x zyxllV9zZt_K}aHLM#B0_Lke@2u`v%ht=B{-V2x+a*bQAuz9&*h=bT9AjD?=0eG<5@ z!4BCCXxVI<**X^xL0hh$lhBtN5Xp^l9h}8erTdA-&*jzpVQ%kk z_rClKd|`VK@Y3Ipjf!SEE1NT;bQBNN#Fma!S$vv^TCb8g#{LL;6>UM6fZ7EuNJI+` zk`)Xjw_@pB5E!`y8Mz~G=}TB2@f36^v5kTj>Y0NJe0_3d;L@A8Q z4r+sFtFrnB(VH;7{O0k!y%l16#;Q28!+vVj-m%&@HG+FaCs!)b3f129wDhAWouoF3 z(57jDAhbnY>hdPt;`3YY=An0`t&y5Q3^9%EFkyIX)sPRkS)u$J3;;Q@qy=DSx|S`u zgz-!AJ3}lHKiC%qy+sym{yB`Dky|jyB!9scyx>dkf&k|JvH^QdJX7$|+A}yOh;5D9 zr@vCN5R}YsgY9)G3ic8>T?p_XQ2QDA3A`-C&%plvYP3zZE6;(uJOA);YT_)*kCp81 zpLT1sd-lZM>%Ok<7@h2HLVg0E9%5*%tzRHT9f9< zNus6}IpZh{Bye;G_VpGjad-eB(92BoiBTn06)|dY13f!caHwlO06|!bcBo)Y#^CEi z&VBgar1Pz78Ly6EriXT%y_0F==bQKRF-$BesHA6XQRVNP6l_T)WZ^M6!}{Xv&ezaL zZyIZOD+$FgRt*`1%;`g~;u`=jo_7|&l@a7h(iij>b|kur|;SUHRxk|8MAI{^_-QRzdZ4vOI}1a~}p*M?yCIDqtET4E=+2 zA|v2Sz-#RB+MzH|C9gYmDubWEzCn-(ZT=VAO646)5ra3F_F$62yC?v`=UsRWWC`rQ zpbJq5?Zb=~fG*T5zC(`uf&n^o7ef6A>7ARtza27CSVh2lSX_)>Mqq82a^~-MU(Q z23vJ4U;=p2qyo4xfBza{wJtius5SgWf&aQNG_bBr;<1XNvKd9=T+T8ax1#817H6v+ z4p{_VW_fJ&duZZK;2QJvR&ZG4G}2fjI9OB-)+?WGwK|*YPPH2N)pOCv?vU@Bud9^l ln>by=|1TTC1(L(6u2v`h&q3+$)Y0BYdKxSN{_1(O*uU;l!%P4G diff --git a/webscraper/src/__pycache__/__init__.cpython-39.pyc b/webscraper/src/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d43fde4adb6f131a6895b08baaa604a0dfb64288 GIT binary patch literal 158 zcmYe~<>g`k0{bOg8P-7hF^Gc<7=auIATDMB5-AM944RC7D;bJF!U*D*v$Iu9XmM&$ zag1AHZe~tpj7xrUX>Mv>NpXyOW{F2>QcQA2YGOfZQA~MiQgLz-m|0wu91|a(nU`4- ZAFo$Xd5gm)H$SB`C)EyQ_-7zy003+mCr|(Y 
literal 0 HcmV?d00001 diff --git a/webscraper/src/__pycache__/main.cpython-39.pyc b/webscraper/src/__pycache__/main.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2f5041a36342fd5e75c133cc35f8a18e7604dca GIT binary patch literal 878 zcmYjPOKaRP5SHX;ybrc5l+sg9frZwaTtjI?655^uC6rR*V!SIQ>%^}}GGQ0{w7G^t zuceSZ_Fwc*^rCA|`3rj}9eIqoz(b1gv{t&DF!IZ@%imH(G9KvM?-3ILzd_eeR&a*#%N4wP`ft1BkKGx7J#!hA=Ym!xOZ!7K`^@5 zzu1x{-JsrreBEqpX`R_MscqV!-D?A*Pr%KDwE#OG>>zQq4J`Eftl1!7(>t0!pGsMZ z-O*@V6z9U1g=o3>lOmhn)7vlPXxra81rl2Jh9vMR!FUv(vRnohWobNH;nBSBttUY@ zPk51SuJG;(znpi9N|u%M&qR?ce0fWt#t;W~l2b?@I$U1__PVjM63H^5tdb|WRAy1K zT$ydENvo_JI)Sr($pfnd03vWW6ICjOGRs&_xgNSQbrpqAej+{qfA!{IH~a{YMff(( zl5`dx7LzK2_9A?j$oJJaoUA`(c*(|h^+UlYVHPL3U(S@NjV_cD&p;t!VuHzE+p-PM z7~lbRao^}-{upA)jUxRziiUmO(HDl82}KIQd)l+(5=c?y#ABfm1HB^2l|vJr#~IV< zCrQepD0l)0{JVcFT91~u(^95PGWNflPK%zqs#qugEXqB?pddNc0!Ue1@-XdM% J8tzjU{{t2=;-mlo literal 0 HcmV?d00001 diff --git a/webscraper/src/__pycache__/robot_check.cpython-39.pyc b/webscraper/src/__pycache__/robot_check.cpython-39.pyc index 480646465240180a64772a0da29aea83e5bace76..d1557b9a0c0a6f8523932e0a708a69b7e718f65d 100644 GIT binary patch delta 67 zcmeyt+Qr75$ji&c00jFqI5%=TF>1OwTg8MHrxq2*xFzOh=2XVGT~_ViY%Vv5IldNKGtAEs7~FNlnZt$q3FYN!?t=IE4`a DQaKMi diff --git a/webscraper/src/main.py b/webscraper/src/main.py index 700835b..354fa53 100644 --- a/webscraper/src/main.py +++ b/webscraper/src/main.py @@ -1,6 +1,12 @@ -from Cheaper_Scraper import CheaperScraper import json +#import time // for testing +# i added htese imports below becasue when i ran it it wasnt finding the folders +import sys +import os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from src.Cheaper_Scraper import CheaperScraper + def main(): # Set up the scraper for a simple legal-to-scrape website @@ -26,3 +32,24 @@ def main(): if __name__ == "__main__": main() + + + + +# For testing cache +# def main(): +# scraper = CheaperScraper("https://books.toscrape.com") + +# print("=== First Request ===") +# start = time.time() +# html1 = scraper.fetch("/") # should print: [HTTP Request] ... +# print("Time taken:", round(time.time() - start, 2), "seconds\n") + +# print("=== Second Request (Should Be Cached) ===") +# start = time.time() +# html2 = scraper.fetch("/") # should NOT print: [HTTP Request] ... +# print("Time taken:", round(time.time() - start, 2), "seconds\n") +# print("Cache stats:", scraper._cached_get.cache_info()) + +# if __name__ == "__main__": +# main() From 84264a7902c2eb3129c4df3bbcb0b44bf2a0e956 Mon Sep 17 00:00:00 2001 From: gmanhas12 Date: Thu, 24 Apr 2025 19:42:35 -0700 Subject: [PATCH 3/5] addressed comments on inital commit, creates a test file in the ./src/tests folder directory and used the unittest suite to explore the conditions when the function would return an exception, valid/invalid data. made a seperate file for cached_get,. 
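One further test that could be layered on top of this (it is not part of the patch): mocking requests.get so the exception path of cached_get is exercised without any network access. A sketch, assuming unittest.mock is used and the test is run from the webscraper directory like the other tests, so that the src package is importable:

```python
# Hypothetical extra test sketch, not included in this patch.
import unittest
from unittest.mock import patch

import requests

from src.fetch_utils import cached_get


class TestCachedGetFailure(unittest.TestCase):
    def setUp(self):
        # Clear the lru_cache so earlier tests cannot serve a cached result.
        cached_get.cache_clear()

    def test_request_exception_returns_none(self):
        # Force requests.get inside fetch_utils to fail, then check that
        # cached_get swallows the error and returns None as documented.
        with patch("src.fetch_utils.requests.get",
                   side_effect=requests.RequestException("boom")):
            self.assertIsNone(cached_get("https://books.toscrape.com/", "CheaperBot/0.1"))


if __name__ == "__main__":
    unittest.main()
```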
--- webscraper/src/Cheaper_Scraper.py | 33 +++----- .../Cheaper_Scraper.cpython-39.pyc | Bin 3488 -> 3163 bytes .../__pycache__/fetch_utils.cpython-39.pyc | Bin 0 -> 782 bytes webscraper/src/fetch_utils.py | 21 +++++ webscraper/src/main.py | 19 ----- .../tests/__pycache__/__init__.cpython-39.pyc | Bin 0 -> 164 bytes .../test_fetch_and_cache.cpython-39.pyc | Bin 0 -> 1892 bytes webscraper/src/tests/test_fetch_and_cache.py | 74 ++++++++++++++++++ 8 files changed, 107 insertions(+), 40 deletions(-) create mode 100644 webscraper/src/__pycache__/fetch_utils.cpython-39.pyc create mode 100644 webscraper/src/fetch_utils.py create mode 100644 webscraper/src/tests/__pycache__/__init__.cpython-39.pyc create mode 100644 webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc create mode 100644 webscraper/src/tests/test_fetch_and_cache.py diff --git a/webscraper/src/Cheaper_Scraper.py b/webscraper/src/Cheaper_Scraper.py index e9593ad..d853015 100644 --- a/webscraper/src/Cheaper_Scraper.py +++ b/webscraper/src/Cheaper_Scraper.py @@ -1,15 +1,16 @@ import requests import time from bs4 import BeautifulSoup +from urllib.parse import urlparse import logging from typing import Dict, List, Optional # i added these imports below becasue when i ran it it wasnt finding the folders, it is probably me can remove if you dont need import sys import os sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - +from src.fetch_utils import cached_get from ABC.base_scraper import BaseScraper -from robot_check import RoboCheck +from src.robot_check import RoboCheck from functools import lru_cache @@ -22,7 +23,10 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float user_agent: User agent string to identify the scraper delay: Time in seconds to wait between requests """ - + parsed_url = urlparse(base_url) + if not parsed_url.scheme or not parsed_url.netloc: + raise ValueError(f"Invalid base URL: {base_url}") + self.base_url = base_url.rstrip('/') self.delay = delay self.user_agent = user_agent @@ -37,19 +41,7 @@ def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float self.robots = RoboCheck(base_url, user_agent) - @staticmethod - @lru_cache(maxsize=128) # cache up to 128 unique URLs - def _cached_get(url: str, user_agent: str) -> Optional[str]: - print(f"[HTTP Request] Fetching from web: {url}") # <== ADD THIS - headers = {"User-Agent": user_agent} - try: - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - return response.text - except requests.RequestException as e: - logging.error(f"Error fetching {url}: {e}") - return None - + def fetch(self, path: str = "/") -> Optional[str]: """Fetch content from a specific path. 
@@ -65,13 +57,12 @@ def fetch(self, path: str = "/") -> Optional[str]: logging.warning(f"Disallowed by robots.txt: {path}") return None - url = self.base_url + path - cached_before = self._cached_get.cache_info().hits - html = self._cached_get(url, self.user_agent) - cached_after = self._cached_get.cache_info().hits + cached_before = cached_get.cache_info().hits + html = cached_get(url, self.user_agent) + cached_after = cached_get.cache_info().hits - if cached_after == cached_before: # No cache hit, so it was fetched + if cached_after == cached_before: time.sleep(self.delay) return html diff --git a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc index 2cdded46a744febff08c2cf4452bb7cfa41856f1..6a69690d1b69fdd27598d9e4dc8babd5d2d52181 100644 GIT binary patch delta 1443 zcmZux&2Jk;6yMpM{a)K~>bhx~HX%iTTg5cAr>a6~qXLysC5VdDLBd*l#$Q=`&CD9q z$dM(SGa}j(qLu6;2TnbgKV&L#N2qWp;s6|YGp0qVur)tF|K9hTw>v-gbgdW{3V{Ki zAD#!@XVFP<1O0e%d3$OlcwiG78PuRCafk!IIB^Fa@diHe2LTEG)rBO~HW7(f*e}eB zECMWPT+vu&s|!T!g}Fcrq`jQv)SyBtT6UAugDR&STD zpaBh!4H~lQ%p&I~+Ba&^&#LOXH6#nSSUk>p?Qyc7jz>UvW1fs+F4)Pl%`j-ituCXD z4$E>M?M)N#Vg^CfASNv{3%;0I6hFk&rp__+Ojm((n03~m-b0i6Gyv8G&eL27Tto}U znD|SE1ut}M6MTxGFVPjyhtz^XPs!q5m&GH-_gg&9e?uQlZKwgm*@Gjx0GTmCGjxO} zxQ}=R&=lL5MbQL(XFM`z=qr%sn8E-jM#G+1fX>7ObTe=6FAR`^iO~=G;XIZ~w96LY34ixfjK_Te9QhWR7&_vzv4o#7X8(xY3=Sg@^oAAfj#>reFdVQ^0{zH_s~ zhS}X(iCa*ebOelWz@#%|S(3J7@JXDE*=^2KE*&m1-W%~E81fYbCFSCA_W78JOh|X1 z3DHZ3(iJZR7)SopDqQt-aGo~POw??N1xwn?Kq`igCIfwwv;lYey}>v0624zv1+7qE zcpi#S(aZ(DSmKJQnG%oUtjo*!6Yo4~=HECwZ)Nt}S-?<^tjtzp>w7cv2)|_Q35@i> z!5;e zTvleg05yAA=sSwIqCl0*b>3IPz*2D7>QD2Z!uo2YtO@`%w7iBWzYy(CUsJTBKxa^U zuhVd~{rswe|DwK}1P?(`h9f)x4^=m+Yn9WQt!?l#%Ii%4Vl~AzZh|Bu7-`aL)^%rk zXZc0ox22Ok7^y4M`E+_!jkLrAA)z$uAbJDvpNa;9X?;j3K>QfnLm NR!k+oq5lD6{R3iKS9AaX delta 1725 zcmZux&2Jk;6rb6d{qWkc6UTMZ(pFX}6(%GkQ1wt%6`^Y3R1hgDT?E#~Gmh=rUNbXR z6SKiUB5oB_Du^cb^ z2P9Zxcnj72JAW-FWqdNZh~J!i3x#qNV1@#VRXKqlWrWeYl-bNVv;&(v++{BFZe!#1 zI5`S{cM0=Z{*WHv0}Q1vfiLsS-L+VemEfcgw!Dd+2ipQGAJU+ZS@_&z-x6wB0JjJM z6?Oyyii|*s%gICh$qfp%>H8Ag@(PF;PhnOw9HRz~2}mopKoUi#_6&jM#Hj0Z-KodG z-X$F~DV2+HY#r!G-=NdvSL^S|BmCMU{{zXz>gEm~sH^pS>IkVsXDAAgOBA)31Ce@y ze>&n)$-v#>Qg)(2;L1!%-8K&y7czB5Ll!D7Tz$e3(RQR{ou*XsUMmCCSO)bT63Zz0 zl75(90F%_&P%inCzBqaC1*mnLy8UohcE-F;QZf=fQ-&ykuW5mJa@nA0rkVMSK~XTB z82yNTvJNyutQdFjzBR5X+=cFAOE2~|*(WiXk{QA%CS7_RZNZQs!roI8jP@x;O9qjk_=W$xASsgK^dcI~~|``NTUu3h1(+3pN> zYAq4ZT^17&iCXK)sj+pwR`*hSC^`cr79lt# zJ6r|xBJ4=sXhotSRj5WXrHb!@{c+;UyG=e+u%*=PMLRoCKx%UV&~;m!0F-Q+COK?Y z4x@qODdw<~1~`{>#8I6r-)M%-HfIe;`Z5?~83ZCl>|qPfgY-aF@%Y#!;J}cJt)|eA zwP%irc{owGQaTJ(Ta=RH_6gih&fBYNKu>$>%uv^bb)c!OiQM&O);@j4W5;$(Zd!sR z-`M|zRlo{z&2Z2#BNC_dfA>}>2GC+gv6sM<`ljqgXV8jL+U_Wsv6$*Xr?%{IJ~T9} zJ7Njq#ThO1V3LQ<=~dJJ(;?U<*E#S?Ju&1LplM2cT);(KNH*P*&I*{b-tK?pE@B)c z-+3F3fnG^|^G;Tuha84N@^=%QJBv&A&*duk!mAo$0|czd9DH3H*0nIznH0~#IMU3U zC13k%Wz8s&*8n)1c`5nD-^f+8(KWg z_TMl>4jg);QhysfTwSt&1IG&0qm?7ZS6?u2@<-V-?_Ef&2>*1BZI+o zT=y3M(stxu@|9EXoH+5E0-7T~`#r~Bf4+9QvC${k9{lQ09$JL_G|AQGVDbdJe1U-z zP74yPM;cMgt->l<#HgvU!Y-Z2p+qolbLX527e7b1w+VN@G465yi+x5TkJCdE_ExA! 
z4nnGX1=LBFW)p#l|8l1CYMK_S&$x^QuT1AMZ$!v6tD(^SS_(+gv6w3Df>1R~GbI1t z46*UFi(NJt6jHb3EBS7nQRK8*n$xA#1d5(oEom)|wC8NeT6V!M2_-E%wU5amu?ha+ zEEFY6o08_PaxXmY7$xquHtN}zsCP`>qW7i!$>z9cJ=-SRI75%#L38Kb{?XB^;I;Tr z3#r}*L!q)sJ{<=msLJ4@IN1rB{u{J(@2Ro0GU%bDy@^P<02z5IFGW?WW^)gqf?)J- ze`DOmz3Gy`mbAkw-hi z=!XutNC2J*h9yE}Ijg2pXexdXxQ?~o!`+?OFw1zDmU*#=pH*31BEF2D=W4$`iSaSh znSdA#$_%=xge>mplUn74ygys$e)sA;L1c5FG~&d%Mty40EsQ?KTNG|$HS}SC18ic@ zy-_{QR8 Optional[str]: + print(f"[HTTP Request] Fetching from web: {url}") + headers = {"User-Agent": user_agent} + try: + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + return response.text + except requests.RequestException as e: + logging.error(f"Error fetching {url}: {e}") + return None + + + + diff --git a/webscraper/src/main.py b/webscraper/src/main.py index 354fa53..4a27839 100644 --- a/webscraper/src/main.py +++ b/webscraper/src/main.py @@ -34,22 +34,3 @@ def main(): main() - - -# For testing cache -# def main(): -# scraper = CheaperScraper("https://books.toscrape.com") - -# print("=== First Request ===") -# start = time.time() -# html1 = scraper.fetch("/") # should print: [HTTP Request] ... -# print("Time taken:", round(time.time() - start, 2), "seconds\n") - -# print("=== Second Request (Should Be Cached) ===") -# start = time.time() -# html2 = scraper.fetch("/") # should NOT print: [HTTP Request] ... -# print("Time taken:", round(time.time() - start, 2), "seconds\n") -# print("Cache stats:", scraper._cached_get.cache_info()) - -# if __name__ == "__main__": -# main() diff --git a/webscraper/src/tests/__pycache__/__init__.cpython-39.pyc b/webscraper/src/tests/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba417b4f25823975f9dd8e2fe78a99186cb9ae76 GIT binary patch literal 164 zcmYe~<>g`k0{bOg8LB|~F^Gc<7=auIATDMB5-AM944RC7D;bJF!U*D*r?XW|XmM&$ zag1AHZe~tpj7xrUX>Mv>NpXyOW{F2>QcQA2YGOfZQA~MiQgLz-m|0wu98;273{(>z epP83g5+AQuPY}F5GA?0(ngX4$3YP{Xo1u>6_tz@=_Lq)pl;&^D0FdrSqLE3ODbArKU9+I zG=g$Ueu#9;U*ffw<}bLH&X96qJ88@XhabCq!}mDDO}Dl>1j@hvbf!&2$lo})SV9;) zhN^!7BZ!~{NqkEa3S(BVGDreS4v7dv_?C!Jl3DmBNFv!1(F>*?NwR9M35B&sq~CfE zIAqkP_U6GRTfzht>@7(G?<)yKBw8?zL|eqrw!{t5 zfwnEW;Hzi9c_y{F?B|Iz+4LaIrZS@dd|t4R9uJ`EyI>6L_YGYV4s}LnY!x^T1aR4C zad&FWTtC>~KdI{%dT46xSPZke?1u`0Y}-#M+vOR$6a2X}w(I&lD`cubgiy<3@{Xt& zR-7FlJQyEqsr2|sTIR*8@vn8(l(I5cou^%vc0^5ffI+OF7-|cQ?Rd`5R3p_boZUf# zk#T|R9k<)O@@(g+nR?3G@Y&{gdu(z4dcvPVT;h#~jazg1FoTXEE3zhd*fY9a2l5GN}wG3x%;YOQqQFwyyF(>=64NYFkk2)#bn(g>J z%Bx9zF5DkdYrEq`JXA6@QvLR4lNL6d=0^7;7eSB>CRHYh5cEgi{G@gP@~YCiPHFE( zP9`!l`KheH$5-&N#3t?nx^6`8Cy2Qa6RUD#I-&LseHC zuRH|bgFS7sE5^HIcs=Kf%SUjeG3QlUO3rPE^RgCAf&CumcwYXE!vB)GjRpl*-=evP z=3_><9$o>%@2Yf`9G1r lzGKHKh06)C)puZQh%le?Z{vra#UqSazX=QuwzC)0{{X+&;Clc7 literal 0 HcmV?d00001 diff --git a/webscraper/src/tests/test_fetch_and_cache.py b/webscraper/src/tests/test_fetch_and_cache.py new file mode 100644 index 0000000..ff45a92 --- /dev/null +++ b/webscraper/src/tests/test_fetch_and_cache.py @@ -0,0 +1,74 @@ +import unittest +import time +import sys +import os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from src.fetch_utils import cached_get + +from src.Cheaper_Scraper import CheaperScraper + +#to test, be in the webscraper directory and use the following command in terminal +# python src/tests/test_fetch_and_cache.py -v + + +class TestCheaperScraperFetchCache(unittest.TestCase): + + def setUp(self): + self.scraper = CheaperScraper("https://books.toscrape.com") + cached_get.cache_clear() # 
Reset cache before each test + + def test_valid_fetch(self): + html = self.scraper.fetch("/") + self.assertIsInstance(html, str) + self.assertIn(" Date: Mon, 28 Apr 2025 10:48:06 -0700 Subject: [PATCH 4/5] fixed failing build --- .../api/__pycache__/interface.cpython-39.pyc | Bin 0 -> 638 bytes webscraper/src/Cheaper_Scraper.py | 9 +++++---- .../Cheaper_Scraper.cpython-39.pyc | Bin 3163 -> 3325 bytes .../src/__pycache__/main.cpython-39.pyc | Bin 878 -> 862 bytes .../test_fetch_and_cache.cpython-39.pyc | Bin 1892 -> 2601 bytes webscraper/src/tests/test_fetch_and_cache.py | 12 +++++------- 6 files changed, 10 insertions(+), 11 deletions(-) create mode 100644 webscraper/api/__pycache__/interface.cpython-39.pyc diff --git a/webscraper/api/__pycache__/interface.cpython-39.pyc b/webscraper/api/__pycache__/interface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4597f1eeb5a3455a46e54e675609dd0afa97dbcf GIT binary patch literal 638 zcmZuv%}N6?5YEr`NAV9ldE0}Rg7g7I#9FcTQUvj`EQB&2{AL@@0Ga=H^$QtXkxf6X$y04H8M5qXH+mAPK=} zfFzOh5lIU8fhJT&SE$o?cRNvsSlT=7+burUMsZ;ZFw>b7er?r$;R8yrL35j6G? zG$no_fg^grNvp=UyH`T-IjG*{a9A2p6E48b>NXM)m3hEcl7LaMH=#nYB&_t`W`;c`iJSY2Bp(; zUS#E*SIR>^DNE26?EqdyK@7EeX%mh4fiZMD51;! ze`I(69vF4nw#`_{3t-HyGgi!GmHT*uvD=F0^@^en*hXHqrS5@jBr{=@{|i5&>S)c^ o{GItD;quyq;7}QLB1`#LDBod#4CVh2Z2PG0 None: +class CheaperScraper(BaseScraper): + def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float=2.0) -> None: """Initialize the scraper with base parameters. Args: @@ -101,6 +101,7 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]: if html: results[path] = self.parse(html) return results + def get_scraped_data(self, paths: List[str]) -> Dict[str, List[str]]: return self.scrape(paths) diff --git a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc index 6a69690d1b69fdd27598d9e4dc8babd5d2d52181..036324aec02001c2b398a411a5aecf374ada903d 100644 GIT binary patch delta 1207 zcmZuwJ8u&~5Z?7Ye0FRnaU3g7!c*aK7%U+L8W0eo2tt97BJtS9I^VAG!P#eK&q5%i zAc&}FqN9w&qoYnwNk>I@lr$6+`~Z-cT_?ec*wfCLnVdCNnPWfiDeq+@N$GQbppo9}{hsAD*m9(A>$Ga13LhDXA*)6lO)^p)h zx56rvTq8Ws$2JHb6NQ%so8J7flw6>Pj!v!l;PnM8-CAAk#RA%%FI<}Deb4U*-n=j3 z+rt8d>=%g!{-4k&b`Dy;P7Y@VH$SLq0k3y`Tj=yMOrIO>KCQ7ek2Yo(?XVS8v$*_zg7S-$UqM5+S_ zKN6Tto}2gRc#@inr6~ltnbhoiaRb8@pX3%R#so4o)ISkW` zgRV#|84A(YZd@B`h=&6xSn|_8S)El&E#{OVt1V)`Q%_E1&)Ey=sGQ7O2ew~iSE#LT zE+k)^+T4k~TO(IBgXM(g7HN6=e6B(-9#gP73fCI?J*_rQDWM_&?O|U8SPj(*cavB7 z`nsP%*zQoefpZzBRJ5OwcjOs;PqCKtrc2=r?zx!}Py0+yU>`My=6MrK<2_K8vU?#kGJUwL_uVQ~)xD!An9-7ppkieO?2#~NHFUngAnOP@hzF@1Kj{a( z`$M-n3KbNadn{T*BG6d6y4+}aQtYGwAgd6wBM1q8L-%SEUxoircQ2b60eJnO5$rxv z$Pp5gbiV=JMa@ARgc5Xv=GudvA4d_c$Izpu_rH9k&c$xT2cft`5hfLVhdQ)?pF=D9 Iuk?cP7m`C1Hvj+t delta 1051 zcmZuwzi-n(6!yhGlQ?aYrfFJIwLk@!h(Kw(R279*D$oHWDi%}8a&i~q)``QNGr*F8 zZVafh7?2pq$iS4HKY%~s7&|dAAqEh?-d%)16}J58-MjC;_ujqdA2T1yRX>-@DDb@f zoN0Zs73Ekg{(aK0xDFqCcU0Z!qcH1cv4T-(yEe)0=U$Hy8*p6Wgv2~43=q}_sR0@w zqyXCpC)-80C~Y&GYM0!Sw5@QuU3SaTPKPt?S$7sGHx-=0*+T_qN$IKTp2BEXaqN#G zr#naQR|`*4wVH!?rr|eQ1bYujeDrHR*+PqpAyA0oBCNP6oF^*08dkCPO2axf4s}<@ zh2$B!WndK~Cx>6rca-J^1D!VE$xpS?9|H+GAz&PUTa-NN5f(FU?h?j=sKZV6$7m5B z6Uw;J>ta79RNNFdXjG44=IC5wBy3VCD72(1&vbptAnP6t-zan*{NHxA*7oi*5Zc@J zlb70UR7j4sm9YZsIVrAn{kTQ*NynTUZtC|Csw6Lr(;HIO6ah!~;02v##I;rsvr*<+ z96(Jv3km7U(2hapL>gzrq2!Bkp(LWq6zGER=}i(hTD4@^Ts1C=t^BZK*3p_gB^<8C zqF$Gql(1eHGgqre(E+_As3)almD@=sy@K+?jdT&MR|Rbez|lu}TosNh0z^hsMm!+{ zouPi0JV>5rs#8vi!gwk)2jCzoug6Z@>{9=4S?sJtrW;_+KQ8kwZ 
z-}LKDbdPKMQ2>L31KRPSDVFCogOGS0A7`{tlh^L`;vi&Hbkv==vAI>N`;2&_`Y9C| zx{@rX%Y%V!9E|2I-RyN5@!cqb)8a>#1^lmGZmqW?+zZJy1T;1aU?B_H@LH&t681T{ H0+0F|*_!sU diff --git a/webscraper/src/__pycache__/main.cpython-39.pyc b/webscraper/src/__pycache__/main.cpython-39.pyc index d2f5041a36342fd5e75c133cc35f8a18e7604dca..bd339199e596383c4f2c650179cbba61fecbfd2d 100644 GIT binary patch delta 84 zcmaFIc8`rOk(ZZ?0SH*$ab+m6OyrYcOqrpF^4;fJB1~MwS^&yCzUsaEt{z* zBb8@k7dsOp&*Vu=28?`@4>N_c-C`-p%uSsv#cU)Z05pe#5d_tkc$hetIm9>^0oN84 ACjbBd diff --git a/webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc b/webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc index 1994ec8c3949a87b1c92ad66170b7958ab95c8fd..c22e0aa30d2f9a06a0926943d85cf34990cd4768 100644 GIT binary patch literal 2601 zcmai0TW=dh6rS0Oy>XlnQbf}WWmPIsqsAc>AQW0uDYT^(@&idhSe8~B&!paTy=!L1 zEw$XI^qn7okeoOA!e8Kj>?=?G3w`1{vu+coq48?ZTy|#8`Mx<9#|sNJhTqCR@Adwk zW9(lV9Q}D1+`_9{5P}JwvUb_>HpkdWoxa<4InBAL*Z12#XLp(Kg#VlgU$VW*zT2+K zKvW(%>b_)?xdVpPe2X=zFL5SoHF&YO*^}`=s;#b~1~av8-0evbJ(Q+PtB(F03~u4o zHy|VYBQ_-4pjCw zNqv_?!RsION%JON{UL+_ihVw25jW1nwWt`*bXueHJ!1xXV{L6G&mZZg$+gAR?B@N3 zr|6KPT23kD(rDhIjk>9f)eByDTBf@%n8JmyR}VHfIuEo|x^pM)C+XA9cX@Z%mzmL> zyNUUJxYL29#9HTx+?k%*(W=`qK&GvXb_r4xXCktSnuDi>ucdh~ScC>mcg%Q=k1oH9 zjHcZ%LkS`A@mfl*p)+QCY{CUU;LkwJ#2Itr9ylEH6Hj0?@ede{1rg=8Uh%b8{*9jL zr;UoDg9_hTS}esHh|N8HFViN@y0UP!QAIGF&I&)xpGft>Q2}Iwwobv&chccMMM`~QJ|h7;cgF4&L_E?dy>jmPYOzkxkEw`O{YULC{_ z<*LY~Ud?i|Dt}9WaRGrJZyzy(o6I6_u@}CJj$spa!058~_yjRXD9gDqUt!OjQ@*>#op{*q!9o17 zJ9ehdQW<-DRbfX{S5b9ndr2|a2nx?6eR;@H8AD~ph>~nKe@)sCQtM*5B=D}1v61Tb zufsSkyk25-!?)gpNHDQ7g-?gBm(6u`4w{a$ykc1{LsjnX%C1Qs%M3a$VzH*nVLTw; zat`I_%hO3e;lvr%2{rk-W&JC3z!o#$`u7#E4c7O;`p*a|56`~KS+=y!^wrOiXX*I` zuVv9`wHM)OQ5=DbG#aY3;TH9ho*&~xOI_4{iqoOItyHdTo>En)JI=%VmWM3QBFc02 z9ISfnXMEH=9rDr5GgwIP_pmsie`uL*bGdFEoZH+KI( z<@wVkR!-REy|v57uFBXy+-1EH{wS&kVAE74RcNkYRWPpXm+1_>y*a8Ww8FAG-x^)1 z-Mo1-+@#VGZYTXDdl(|;hFc~!IwVp;d@ntjQs-#-PNFbrb6#jrD>lMW5Ng@YGoeRk zwrpqi^R<~t-OGom2zO+-8*6xXD25UTh2&AtKB||tz`>2)P?QZuSGIdn$?$MzY(kVB zn6YlR$q=#8s1^P|C7Dqx*mD?Zg|G5qCe(SFbxGJbxj;P%f=1Pbl==w$woh4kU0r(9 zzpg&EeQVBfr@COBiVT+#A`#ZxdS%zHVw)zNh%c97Ynd;-X&2E$?fUw3r#t!ZR%5OR zq9}{|GKz{?6!mj4OsQXwBFYx!7c~#{>H`u~{8dQe6B3`2I84a4M-@m>rlho`Nd!*d z1wNjIz_VXSX+KV~C~DN&!7xhHi}en(gz)7L`SQXp&Ty o88NVw#HgfXiMk3=c%JruaFETa%ije2KiMOeP5=M^ delta 839 zcmZuv&2AGh5Vm){8#md6{s4`Dh*Al}3RF}?r3yiX$RUD5Ikcw;VVPZ*By7^kb`U}- z1(g%x68r$P5|=&&N8Vvi95}#%C*ZMXB86TbfS*yuvwM^jo`>Wg; z2IJZ#`fz;X^7R%;{gizmDO04!_WgFC!Zdj4=`}?TN{57!lx~nX9KWq!OmP3n$Ag~- zVCK&F+X{oU=rdiNy8saj+xF>*2(TXQxfZ>9Ew6|~CLNWE7s}?9i7%2&NgdOi8I^4v z5zXBfDrw14?;~aIlecoS1axiIvHeVQa0{3MFQ2x`Sn*0`-BfHTn{+DfkN5RS1Zy$V z43n#vSC+;oZ8!B$=^2d9BK$`-KOyT53c4L>I|b*o`{6%@3huLJP}K6C#f=Iu{H6dg z&zbG^)$wsUaNpVLEtH)H#!7A50cI>0pWC{A%IS6YE~wYf!HPj~gmN0u$SW{%&w|#T z4`H+>m{|ZQdb~?V^nj+`A>H*xmF=Eq9d8L#@m(v(d?hx#jTu%p#DO5CGMd>6;~Y#%n+wfqNZB z{(8(^ICXUoFP)x4!0L1ZVFBT|{}T@ Date: Mon, 28 Apr 2025 15:57:34 -0700 Subject: [PATCH 5/5] task 35 SQL script to run in local postgres --- webscraper/database/init_db.sql | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 webscraper/database/init_db.sql diff --git a/webscraper/database/init_db.sql b/webscraper/database/init_db.sql new file mode 100644 index 0000000..5effbec --- /dev/null +++ b/webscraper/database/init_db.sql @@ -0,0 +1,16 @@ +-- Create users table +CREATE TABLE users ( + id SERIAL PRIMARY KEY, + username VARCHAR(255) NOT NULL UNIQUE, + password_hash VARCHAR(255) NOT NULL +); + +-- Create products table +CREATE TABLE products ( + id SERIAL 
PRIMARY KEY, + product_name VARCHAR(255) NOT NULL, + price NUMERIC(10,2) NOT NULL, + url TEXT NOT NULL, + user_id INTEGER NOT NULL, + FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE +);
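To try the new schema locally, the script can be run against a local Postgres instance, for example with `psql -d <your_database> -f webscraper/database/init_db.sql`. Below is a small usage sketch, assuming psycopg2 is installed and the script has already been run; the connection parameters, database name, and sample values are placeholders, not part of the patch.

```python
# Hypothetical usage sketch: store a scraped product against a user,
# using the users/products tables created by init_db.sql.
import psycopg2

conn = psycopg2.connect(dbname="cheaper", user="postgres",
                        password="postgres", host="localhost")
cur = conn.cursor()

# Insert a user and get its generated id back.
cur.execute(
    "INSERT INTO users (username, password_hash) VALUES (%s, %s) RETURNING id",
    ("demo_user", "not-a-real-hash"),
)
user_id = cur.fetchone()[0]

# Insert a product row that references the user via the foreign key.
cur.execute(
    "INSERT INTO products (product_name, price, url, user_id) VALUES (%s, %s, %s, %s)",
    ("A Light in the Attic", 51.77,
     "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
     user_id),
)

conn.commit()
cur.close()
conn.close()
```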