From 48c893a949fd544a93e69c04344d50cea345ee23 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Fri, 15 Dec 2023 13:31:50 +0200
Subject: [PATCH 1/3] finished initial work with restapi

---
 main.py                        |  5 +++
 requirements.txt               | 36 +++++++++++++++++
 server/__init__.py             |  0
 server/app.py                  | 11 ++++++
 server/database.py             | 71 ++++++++++++++++++++++++++++++++++
 server/models/__init__.py      |  0
 server/models/search_result.py | 36 +++++++++++++++++
 server/routes/__init__.py      |  0
 server/routes/search_result.py | 38 ++++++++++++++++++
 server/scraper.py              | 54 ++++++++++++++++++++++++++
 10 files changed, 251 insertions(+)
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 server/__init__.py
 create mode 100644 server/app.py
 create mode 100644 server/database.py
 create mode 100644 server/models/__init__.py
 create mode 100644 server/models/search_result.py
 create mode 100644 server/routes/__init__.py
 create mode 100644 server/routes/search_result.py
 create mode 100644 server/scraper.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..11f8970
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+import uvicorn
+
+
+if __name__ == '__main__':
+    uvicorn.run('server.app:app', host="0.0.0.0", port=8000, reload=False)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9425e7b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,36 @@
+annotated-types==0.6.0
+anyio==3.7.1
+attrs==23.1.0
+beautifulsoup4==4.12.2
+certifi==2023.11.17
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+dnspython==2.4.2
+fastapi==0.105.0
+flake8==6.1.0
+h11==0.14.0
+idna==3.6
+mccabe==0.7.0
+motor==3.3.2
+outcome==1.3.0.post0
+pycodestyle==2.11.1
+pycparser==2.21
+pydantic==2.5.2
+pydantic_core==2.14.5
+pyflakes==3.1.0
+pymongo==4.6.1
+PySocks==1.7.1
+requests==2.31.0
+selenium==4.16.0
+sniffio==1.3.0
+sortedcontainers==2.4.0
+soupsieve==2.5
+starlette==0.27.0
+trio==0.23.1
+trio-websocket==0.11.1
+typing_extensions==4.9.0
+urllib3==2.1.0
+uvicorn==0.24.0.post1
+wsproto==1.2.0
diff --git a/server/__init__.py b/server/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/app.py b/server/app.py
new file mode 100644
index 0000000..6589979
--- /dev/null
+++ b/server/app.py
@@ -0,0 +1,11 @@
+from fastapi import FastAPI
+from server.routes.search_result import router as SearchResultRouter
+
+
+app = FastAPI()
+app.include_router(SearchResultRouter, tags=["Search"], prefix="/news/search")
+
+
+@app.get('/', tags=['Root'])
+async def read_root():
+    return {'message': 'Welcome!'}
diff --git a/server/database.py b/server/database.py
new file mode 100644
index 0000000..2637b7c
--- /dev/null
+++ b/server/database.py
@@ -0,0 +1,71 @@
+import motor.motor_asyncio
+from bson import ObjectId
+from bson.errors import InvalidId
+
+
+MONGO_DETAILS = 'mongodb://localhost:27017'
+client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_DETAILS)
+db = client.news
+
+search_results_collection = db.get_collection('search_results')
+
+
+def search_results_helper(search_result):
+    return {
+        "id": str(search_result["_id"]),
+        "link": search_result["link"],
+        "title": search_result["title"],
+        "author": search_result["author"],
+        "image": search_result["image"],
+        "date": search_result["date"],
+        "tags": search_result["tags"],
+        "description": search_result["description"],
+        "content": search_result["content"]
+    }
+
+
+async def add_search_results(results: list[dict]):
+    new_results = []
+    for result in results:
+        if await search_results_collection.find_one({"link": result['link']}):
+            new_result = await search_results_collection.find_one({"link": result['link']})
+            new_result["tags"] = list(set(new_result["tags"] + result['tags']))
+            await search_results_collection.update_one({"_id": ObjectId(new_result["_id"])}, {"$set": new_result})
+        else:
+            result = await search_results_collection.insert_one(result)
+            new_result = await search_results_collection.find_one({"_id": result.inserted_id})
+        new_results.append(search_results_helper(new_result))
+    return new_results
+
+
+async def retrieve_search_result_by_id(id_: str):
+    try:
+        result = await search_results_collection.find_one({"_id": ObjectId(id_)})
+        if result:
+            return search_results_helper(result)
+    except InvalidId:
+        return
+
+
+async def retrieve_search_results_by_tags(tags: list[str]):
+    matched_result = []
+    results = search_results_collection.find()
+    search_tags = set(tags)
+    async for result in results:
+        common = search_tags.intersection(result["tags"])
+        if len(common) > len(search_tags) / 2:
+            matched_result.append(search_results_helper(result))
+    return matched_result
+
+
+async def retrieve_newest_search_results():
+    results = []
+    async for result in search_results_collection.find().sort('date', -1).limit(20):
+        results.append(search_results_helper(result))
+    return results
+
+
+async def update_content_of_article(id_: str, content: list[list]):
+    await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
+    article = await search_results_collection.find_one({'_id': ObjectId(id_)})
+    return search_results_helper(article)
diff --git a/server/models/__init__.py b/server/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/models/search_result.py b/server/models/search_result.py
new file mode 100644
index 0000000..f23f96f
--- /dev/null
+++ b/server/models/search_result.py
@@ -0,0 +1,36 @@
+from datetime import datetime
+
+from pydantic import BaseModel, HttpUrl
+
+
+class ArticleModel(BaseModel):
+    id: str
+    link: HttpUrl
+    title: str
+    author: str | None = None
+    image: HttpUrl | None = None
+    date: datetime | None = None
+    description: str = ""
+    tags: set[str] = set()
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "id": "657b4a8d9e6d5419e28aa3e1",
+                    "link": "https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound",
+                    "tags": ["acer", "aspire", "nvidia"],
+                    "image": "https://cdn.mos.cms.futurecdn.net/vzWy7ZzZy4rfZUESfUw4Lg.jpg",
+                    "title": "7 ways to improve sound on your MacBook",
+                    "author": "Alex Bracetti",
+                    "date": "2023-05-20T07:00:53Z",
+                    "description": "Unhappy with the MacBook’s sound quality? Here are some tips and tricks to enhance "
+                                   "the audio performance on your Apple laptop."
+                },
+            ]
+        }
+    }
+
+
+class ExtendArticleModel(ArticleModel):
+    content: list[list] = []
diff --git a/server/routes/__init__.py b/server/routes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/routes/search_result.py b/server/routes/search_result.py
new file mode 100644
index 0000000..c5a65ab
--- /dev/null
+++ b/server/routes/search_result.py
@@ -0,0 +1,38 @@
+from fastapi import APIRouter, status, HTTPException
+
+from server.scraper import scrap_from_search, scrap_content
+from server.models.search_result import ArticleModel, ExtendArticleModel
+from server.database import (
+    add_search_results,
+    retrieve_search_result_by_id,
+    retrieve_search_results_by_tags,
+    retrieve_newest_search_results,
+    update_content_of_article,
+)
+
+
+router = APIRouter()
+
+
+@router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
+async def get_search_results(find: str | None = None) -> list[ArticleModel]:
+    if find:
+        results = await retrieve_search_results_by_tags(find.split())
+        if len(results) < 20:
+            new_results = scrap_from_search(find)
+            new_results = await add_search_results(new_results)
+            results.extend(new_results)
+        return results[:20]
+    return await retrieve_newest_search_results()
+
+
+@router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
+async def get_article(id: str) -> ExtendArticleModel:
+    result = await retrieve_search_result_by_id(id)
+    if result:
+        if not result['content']:
+            content = scrap_content(result['link'])
+            result = await update_content_of_article(id, content)
+        return result
+    else:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Item not found")
diff --git a/server/scraper.py b/server/scraper.py
new file mode 100644
index 0000000..b98d87e
--- /dev/null
+++ b/server/scraper.py
@@ -0,0 +1,54 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+headers = {
+    'Accept':
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/'
+        'webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'User-Agent':
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
+}
+
+
+def scrap_from_search(search: str) -> list[dict]:
+    link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
+    page_src = requests.get(link_src, headers=headers)
+    soup_src = BeautifulSoup(page_src.content, 'html.parser')
+
+    laptops = soup_src.find_all('div', class_='listingResult')
+    laptops_data = []
+    for laptop in laptops:
+        dct = dict()
+        dct['link'] = laptop.find('a', class_='article-link').get('href')
+        dct['image'] = laptop.find('img').get('data-pin-media')
+        dct['title'] = laptop.find('h3', class_='article-name').text.strip()
+        dct['author'] = laptop.find('span', attrs={'style': 'white-space:nowrap'}).text.strip()
+        dct['date'] = laptop.find('time').get('datetime')
+        dct['description'] = laptop.find('p', class_='synopsis').text.strip()
+        dct['tags'] = search.split()
+        dct['content'] = []
+        laptops_data.append(dct)
+    return laptops_data
+
+
+def scrap_content(link: str) -> list[list]:
+    page_src = requests.get(link, headers=headers)
+    soup_src = BeautifulSoup(page_src.content, 'html.parser')
+
+    content = []
+    body = soup_src.find('div', id='article-body')
+    for block in body.children:
+        if block.name == 'p':
+            paragraph = block.text
+            if '\xa0' in paragraph:
+                paragraph = paragraph.replace('\xa0', ' ')
+            content.append(('paragraph', paragraph))
+        elif block.name == 'h2':
+            title = block.text
+            content.append(('title', title))
+        elif block.name == 'figure':
+            image = block.find('img').get('data-pin-media')
+            content.append(('image', image))
+    return content

From 1d98f8c2368ed8f76a02e7367826205abc4d2d49 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Sat, 16 Dec 2023 13:13:57 +0200
Subject: [PATCH 2/3] added docstrings

---
 server/app.py                  |  1 +
 server/database.py             | 26 ++++++++++++++++++++++++++
 server/routes/search_result.py | 13 +++++++++++++
 server/scraper.py              | 11 +++++++++++
 4 files changed, 51 insertions(+)

diff --git a/server/app.py b/server/app.py
index 6589979..b85effc 100644
--- a/server/app.py
+++ b/server/app.py
@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from server.routes.search_result import router as SearchResultRouter
 
 
diff --git a/server/database.py b/server/database.py
index 2637b7c..786fbd7 100644
--- a/server/database.py
+++ b/server/database.py
@@ -11,6 +11,7 @@
 
 
 def search_results_helper(search_result):
+    """Take an article and convert it to a JSON-serializable format"""
     return {
         "id": str(search_result["_id"]),
         "link": search_result["link"],
@@ -25,6 +26,14 @@ def search_results_helper(search_result):
 
 
 async def add_search_results(results: list[dict]):
+    """Add articles to the database
+
+    Check whether each article already exists in the database. If it
+    does, merge the search words into the article's 'tags' field;
+    otherwise, insert the article into the database.
+    :param results: List of new articles
+    :return: List of articles added to the database
+    """
     new_results = []
     for result in results:
         if await search_results_collection.find_one({"link": result['link']}):
@@ -39,6 +48,7 @@ async def add_search_results(results: list[dict]):
 
 
 async def retrieve_search_result_by_id(id_: str):
+    """Find a concrete article in the database by ID"""
     try:
         result = await search_results_collection.find_one({"_id": ObjectId(id_)})
         if result:
@@ -48,6 +58,15 @@ async def retrieve_search_result_by_id(id_: str):
 
 
 async def retrieve_search_results_by_tags(tags: list[str]):
+    """Find articles by tags
+
+    Take the search words and check whether the database contains
+    articles whose 'tags' field includes more than half of the
+    words in the search query. If such articles exist, return
+    them.
+    :param tags: List of search words
+    :return: List of articles
+    """
     matched_result = []
     results = search_results_collection.find()
     search_tags = set(tags)
@@ -59,6 +78,7 @@
 
 
 async def retrieve_newest_search_results():
+    """Get the 20 newest articles from the database"""
     results = []
     async for result in search_results_collection.find().sort('date', -1).limit(20):
         results.append(search_results_helper(result))
@@ -66,6 +86,12 @@
 
 
 async def update_content_of_article(id_: str, content: list[list]):
+    """Add content to an article
+
+    :param id_: ID of an existing article
+    :param content: List of content
+    :return: The article with its content
+    """
     await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
     article = await search_results_collection.find_one({'_id': ObjectId(id_)})
     return search_results_helper(article)
diff --git a/server/routes/search_result.py b/server/routes/search_result.py
index c5a65ab..1e57ce7 100644
--- a/server/routes/search_result.py
+++ b/server/routes/search_result.py
@@ -16,6 +16,13 @@
 
 @router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
 async def get_search_results(find: str | None = None) -> list[ArticleModel]:
+    """Find articles by search query
+
+    Get the articles from the database that match the search query.
+    If fewer than 20 are found, scrape new articles and add them to
+    the database. If the 'find' param is empty, return the 20 newest
+    articles.
+    """
     if find:
         results = await retrieve_search_results_by_tags(find.split())
         if len(results) < 20:
@@ -28,6 +35,12 @@ async def get_search_results(find: str | None = None) -> list[ArticleModel]:
 
 @router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
 async def get_article(id: str) -> ExtendArticleModel:
+    """Get a concrete article with its content
+
+    Find the article by ID in the database. If it exists, check
+    whether its 'content' field is filled; if not, scrape the
+    content. If the article does not exist in the database, return 404.
+    """
     result = await retrieve_search_result_by_id(id)
     if result:
         if not result['content']:
diff --git a/server/scraper.py b/server/scraper.py
index b98d87e..f856388 100644
--- a/server/scraper.py
+++ b/server/scraper.py
@@ -13,10 +13,16 @@
 
 
 def scrap_from_search(search: str) -> list[dict]:
+    """Take a search query and return a list of articles
+
+    :param search: Query string describing what the user wants to find
+    :return: Result of the search - a list of articles
+    """
     link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
     page_src = requests.get(link_src, headers=headers)
     soup_src = BeautifulSoup(page_src.content, 'html.parser')
 
+    # parse search results
     laptops = soup_src.find_all('div', class_='listingResult')
     laptops_data = []
     for laptop in laptops:
@@ -34,6 +40,11 @@ def scrap_from_search(search: str) -> list[dict]:
 
 
 def scrap_content(link: str) -> list[list]:
+    """Parse a concrete article's content
+
+    :param link: URL of the article
+    :return: List of content blocks, each holding the type of content and the content itself
+    """
     page_src = requests.get(link, headers=headers)
     soup_src = BeautifulSoup(page_src.content, 'html.parser')
 

From 4daeeb1e4993b75613f3c0e7d75d3f5eff73b7b2 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Sat, 16 Dec 2023 13:29:22 +0200
Subject: [PATCH 3/3] added readme.md

---
 README.md        | 73 +++++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt |  1 -
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 109fb80..3c619ba 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,72 @@
-# News about laptops (microservice)
\ No newline at end of file
+# Laptop News Microservice
+
+This is a microservice designed to fetch and serve news articles using FastAPI, Requests, and BeautifulSoup. It provides a simple and efficient way to retrieve news articles about laptops.
+
+## Features
+
+- **FastAPI:** Utilizes the FastAPI framework for creating a fast and modern API with automatic documentation generation.
+- **Requests:** Makes HTTP requests to external news sources to fetch the latest articles.
+- **Beautiful Soup:** Parses and extracts relevant information from the HTML content of news articles.
+- **Asynchronous:** Takes advantage of asynchronous programming for efficient handling of multiple requests.
+
+## Installation
+
+1. Clone the repository:
+
+   ```bash
+   git clone https://github.com/LLkaia/news-laptops-ms.git
+   cd news-laptops-ms
+   ```
+
+2. Create a virtual environment (optional but recommended):
+
+   ```bash
+   python -m venv venv
+   ```
+
+3. Activate the virtual environment:
+
+   - On Windows:
+
+     ```bash
+     .\venv\Scripts\activate
+     ```
+
+   - On Unix or MacOS:
+
+     ```bash
+     source venv/bin/activate
+     ```
+
+4. Install the dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+## Usage
+
+1. Run the FastAPI application:
+
+   ```bash
+   python main.py
+   ```
+
+2. Open your web browser and navigate to `http://localhost:8000/docs` to access the Swagger documentation. Here, you can test the different API endpoints and see example requests and responses.
+
+## API Endpoints
+
+- `/news/search`: Search articles about laptops.
+
+  - Example:
+
+    ```
+    http://localhost:8000/news/search?find=acer+aspire+7+review
+    ```
+- `/news/search/{id}`: Show a specific article.
+
+  - Example:
+
+    ```
+    http://localhost:8000/news/search/657c1690f253079b6f3ed074
+    ```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9425e7b..90c2627 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,6 @@ pyflakes==3.1.0
 pymongo==4.6.1
 PySocks==1.7.1
 requests==2.31.0
-selenium==4.16.0
 sniffio==1.3.0
 sortedcontainers==2.4.0
 soupsieve==2.5
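
A quick usage sketch (not part of the patches above): assuming the service has been started locally with `python main.py` and MongoDB is reachable at `mongodb://localhost:27017` as configured in `server/database.py`, a client could exercise the two endpoints documented in the README like this. The field names follow `ArticleModel`/`ExtendArticleModel` from `server/models/search_result.py`, and the search string reuses the README's own example query.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumes the service from main.py is running locally

# Search articles about laptops; the server scrapes laptopmag.com when
# fewer than 20 matching articles are already stored in MongoDB.
articles = requests.get(
    f"{BASE_URL}/news/search", params={"find": "acer aspire 7 review"}
).json()

for article in articles:
    # Each item follows ArticleModel: id, link, title, author, image,
    # date, description, tags.
    print(article["id"], article["title"])

# Fetch one article with its scraped content. ExtendArticleModel adds a
# `content` field: a list of (type, value) pairs such as ('paragraph', ...),
# ('title', ...) or ('image', ...).
if articles:
    detail = requests.get(f"{BASE_URL}/news/search/{articles[0]['id']}").json()
    for kind, value in detail["content"]:
        print(kind, value[:80])
```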