From 48c893a949fd544a93e69c04344d50cea345ee23 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Fri, 15 Dec 2023 13:31:50 +0200
Subject: [PATCH 1/3] finished initial work with restapi

---
 main.py                        |  5 +++
 requirements.txt               | 36 +++++++++++++++++
 server/__init__.py             |  0
 server/app.py                  | 11 ++++++
 server/database.py             | 71 ++++++++++++++++++++++++++++++++++
 server/models/__init__.py      |  0
 server/models/search_result.py | 36 +++++++++++++++++
 server/routes/__init__.py      |  0
 server/routes/search_result.py | 38 ++++++++++++++++++
 server/scraper.py              | 54 ++++++++++++++++++++++++++
 10 files changed, 251 insertions(+)
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 server/__init__.py
 create mode 100644 server/app.py
 create mode 100644 server/database.py
 create mode 100644 server/models/__init__.py
 create mode 100644 server/models/search_result.py
 create mode 100644 server/routes/__init__.py
 create mode 100644 server/routes/search_result.py
 create mode 100644 server/scraper.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..11f8970
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+import uvicorn
+
+
+if __name__ == '__main__':
+    uvicorn.run('server.app:app', host="0.0.0.0", port=8000, reload=False)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9425e7b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,36 @@
+annotated-types==0.6.0
+anyio==3.7.1
+attrs==23.1.0
+beautifulsoup4==4.12.2
+certifi==2023.11.17
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+dnspython==2.4.2
+fastapi==0.105.0
+flake8==6.1.0
+h11==0.14.0
+idna==3.6
+mccabe==0.7.0
+motor==3.3.2
+outcome==1.3.0.post0
+pycodestyle==2.11.1
+pycparser==2.21
+pydantic==2.5.2
+pydantic_core==2.14.5
+pyflakes==3.1.0
+pymongo==4.6.1
+PySocks==1.7.1
+requests==2.31.0
+selenium==4.16.0
+sniffio==1.3.0
+sortedcontainers==2.4.0
+soupsieve==2.5
+starlette==0.27.0
+trio==0.23.1
+trio-websocket==0.11.1
+typing_extensions==4.9.0
+urllib3==2.1.0
+uvicorn==0.24.0.post1
+wsproto==1.2.0
diff --git a/server/__init__.py b/server/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/app.py b/server/app.py
new file mode 100644
index 0000000..6589979
--- /dev/null
+++ b/server/app.py
@@ -0,0 +1,11 @@
+from fastapi import FastAPI
+from server.routes.search_result import router as SearchResultRouter
+
+
+app = FastAPI()
+app.include_router(SearchResultRouter, tags=["Search"], prefix="/news/search")
+
+
+@app.get('/', tags=['Root'])
+async def read_root():
+    return {'message': 'Welcome!'}
diff --git a/server/database.py b/server/database.py
new file mode 100644
index 0000000..2637b7c
--- /dev/null
+++ b/server/database.py
@@ -0,0 +1,71 @@
+import motor.motor_asyncio
+from bson import ObjectId
+from bson.errors import InvalidId
+
+
+MONGO_DETAILS = 'mongodb://localhost:27017'
+client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_DETAILS)
+db = client.news
+
+search_results_collection = db.get_collection('search_results')
+
+
+def search_results_helper(search_result):
+    return {
+        "id": str(search_result["_id"]),
+        "link": search_result["link"],
+        "title": search_result["title"],
+        "author": search_result["author"],
+        "image": search_result["image"],
+        "date": search_result["date"],
+        "tags": search_result["tags"],
+        "description": search_result["description"],
+        "content": search_result["content"]
+    }
+
+
+async def add_search_results(results: list[dict]):
+    new_results = []
+    for result in results:
+        if await search_results_collection.find_one({"link": result['link']}):
+            new_result = await search_results_collection.find_one({"link": result['link']})
+            new_result["tags"] = list(set(new_result["tags"] + result['tags']))
+            await search_results_collection.update_one({"_id": ObjectId(new_result["_id"])}, {"$set": new_result})
+        else:
+            result = await search_results_collection.insert_one(result)
+            new_result = await search_results_collection.find_one({"_id": result.inserted_id})
+        new_results.append(search_results_helper(new_result))
+    return new_results
+
+
+async def retrieve_search_result_by_id(id_: str):
+    try:
+        result = await search_results_collection.find_one({"_id": ObjectId(id_)})
+        if result:
+            return search_results_helper(result)
+    except InvalidId:
+        return
+
+
+async def retrieve_search_results_by_tags(tags: list[str]):
+    matched_result = []
+    results = search_results_collection.find()
+    search_tags = set(tags)
+    async for result in results:
+        common = search_tags.intersection(result["tags"])
+        if len(common) > len(search_tags) / 2:
+            matched_result.append(search_results_helper(result))
+    return matched_result
+
+
+async def retrieve_newest_search_results():
+    results = []
+    async for result in search_results_collection.find().sort('date', -1).limit(20):
+        results.append(search_results_helper(result))
+    return results
+
+
+async def update_content_of_article(id_: str, content: list[list]):
+    await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
+    article = await search_results_collection.find_one({'_id': ObjectId(id_)})
+    return search_results_helper(article)
diff --git a/server/models/__init__.py b/server/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/models/search_result.py b/server/models/search_result.py
new file mode 100644
index 0000000..f23f96f
--- /dev/null
+++ b/server/models/search_result.py
@@ -0,0 +1,36 @@
+from datetime import datetime
+
+from pydantic import BaseModel, HttpUrl
+
+
+class ArticleModel(BaseModel):
+    id: str
+    link: HttpUrl
+    title: str
+    author: str | None = None
+    image: HttpUrl | None = None
+    date: datetime | None = None
+    description: str = ""
+    tags: set[str] = set()
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "id": "657b4a8d9e6d5419e28aa3e1",
+                    "link": "https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound",
+                    "tags": ["acer", "aspire", "nvidia"],
+                    "image": "https://cdn.mos.cms.futurecdn.net/vzWy7ZzZy4rfZUESfUw4Lg.jpg",
+                    "title": "7 ways to improve sound on your MacBook",
+                    "author": "Alex Bracetti",
+                    "date": "2023-05-20T07:00:53Z",
+                    "description": "Unhappy with the MacBook’s sound quality? Here are some tips and tricks to enhance "
+                                   "the audio performance on your Apple laptop."
+                },
+            ]
+        }
+    }
+
+
+class ExtendArticleModel(ArticleModel):
+    content: list[list] = []
diff --git a/server/routes/__init__.py b/server/routes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/server/routes/search_result.py b/server/routes/search_result.py
new file mode 100644
index 0000000..c5a65ab
--- /dev/null
+++ b/server/routes/search_result.py
@@ -0,0 +1,38 @@
+from fastapi import APIRouter, status, HTTPException
+
+from server.scraper import scrap_from_search, scrap_content
+from server.models.search_result import ArticleModel, ExtendArticleModel
+from server.database import (
+    add_search_results,
+    retrieve_search_result_by_id,
+    retrieve_search_results_by_tags,
+    retrieve_newest_search_results,
+    update_content_of_article,
+)
+
+
+router = APIRouter()
+
+
+@router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
+async def get_search_results(find: str | None = None) -> list[ArticleModel]:
+    if find:
+        results = await retrieve_search_results_by_tags(find.split())
+        if len(results) < 20:
+            new_results = scrap_from_search(find)
+            new_results = await add_search_results(new_results)
+            results.extend(new_results)
+        return results[:20]
+    return await retrieve_newest_search_results()
+
+
+@router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
+async def get_article(id: str) -> ExtendArticleModel:
+    result = await retrieve_search_result_by_id(id)
+    if result:
+        if not result['content']:
+            content = scrap_content(result['link'])
+            result = await update_content_of_article(id, content)
+        return result
+    else:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Item not found")
diff --git a/server/scraper.py b/server/scraper.py
new file mode 100644
index 0000000..b98d87e
--- /dev/null
+++ b/server/scraper.py
@@ -0,0 +1,54 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+headers = {
+    'Accept':
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/'
+        'webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'User-Agent':
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
+}
+
+
+def scrap_from_search(search: str) -> list[dict]:
+    link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
+    page_src = requests.get(link_src, headers=headers)
+    soup_src = BeautifulSoup(page_src.content, 'html.parser')
+
+    laptops = soup_src.find_all('div', class_='listingResult')
+    laptops_data = []
+    for laptop in laptops:
+        dct = dict()
+        dct['link'] = laptop.find('a', class_='article-link').get('href')
+        dct['image'] = laptop.find('img').get('data-pin-media')
+        dct['title'] = laptop.find('h3', class_='article-name').text.strip()
+        dct['author'] = laptop.find('span', attrs={'style': 'white-space:nowrap'}).text.strip()
+        dct['date'] = laptop.find('time').get('datetime')
+        dct['description'] = laptop.find('p', class_='synopsis').text.strip()
+        dct['tags'] = search.split()
+        dct['content'] = []
+        laptops_data.append(dct)
+    return laptops_data
+
+
+def scrap_content(link: str) -> list[list]:
+    page_src = requests.get(link, headers=headers)
+    soup_src = BeautifulSoup(page_src.content, 'html.parser')
+
+    content = []
+    body = soup_src.find('div', id='article-body')
+    for block in body.children:
+        if block.name == 'p':
+            paragraph = block.text
+            if '\xa0' in paragraph:
+                paragraph = paragraph.replace('\xa0', ' ')
+            content.append(('paragraph', paragraph))
+        elif block.name == 'h2':
+            title = block.text
+            content.append(('title', title))
+        elif block.name == 'figure':
+            image = block.find('img').get('data-pin-media')
+            content.append(('image', image))
+    return content

From 1d98f8c2368ed8f76a02e7367826205abc4d2d49 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Sat, 16 Dec 2023 13:13:57 +0200
Subject: [PATCH 2/3] added docstrings

---
 server/app.py                  |  1 +
 server/database.py             | 26 ++++++++++++++++++++++++++
 server/routes/search_result.py | 13 +++++++++++++
 server/scraper.py              | 11 +++++++++++
 4 files changed, 51 insertions(+)

diff --git a/server/app.py b/server/app.py
index 6589979..b85effc 100644
--- a/server/app.py
+++ b/server/app.py
@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+
 from server.routes.search_result import router as SearchResultRouter
 
 
diff --git a/server/database.py b/server/database.py
index 2637b7c..786fbd7 100644
--- a/server/database.py
+++ b/server/database.py
@@ -11,6 +11,7 @@
 
 
 def search_results_helper(search_result):
+    """Take an article and convert it to a JSON-serializable format"""
     return {
         "id": str(search_result["_id"]),
         "link": search_result["link"],
@@ -25,6 +26,14 @@ def search_results_helper(search_result):
 
 
 async def add_search_results(results: list[dict]):
+    """Add articles to the database
+
+    Check whether each article already exists in the database. If it
+    does, merge the search words into the article's 'tags' field;
+    otherwise, insert the article into the database.
+    :param results: List of new articles
+    :return: List of articles added to the database
+    """
     new_results = []
     for result in results:
         if await search_results_collection.find_one({"link": result['link']}):
@@ -39,6 +48,7 @@ async def add_search_results(results: list[dict]):
 
 
 async def retrieve_search_result_by_id(id_: str):
+    """Find a concrete article in the database by ID"""
     try:
         result = await search_results_collection.find_one({"_id": ObjectId(id_)})
         if result:
@@ -48,6 +58,15 @@ async def retrieve_search_result_by_id(id_: str):
 
 
 async def retrieve_search_results_by_tags(tags: list[str]):
+    """Find articles by tags
+
+    Take the search words and check whether the database contains
+    articles whose 'tags' field includes more than half of the
+    words in the search query. If such articles exist, return
+    them.
+    :param tags: List of search words
+    :return: List of articles
+    """
     matched_result = []
     results = search_results_collection.find()
     search_tags = set(tags)
@@ -59,6 +78,7 @@
 
 
 async def retrieve_newest_search_results():
+    """Get the 20 newest articles from the database"""
     results = []
     async for result in search_results_collection.find().sort('date', -1).limit(20):
         results.append(search_results_helper(result))
@@ -66,6 +86,12 @@
 
 
 async def update_content_of_article(id_: str, content: list[list]):
+    """Add content to an article
+
+    :param id_: ID of an existing article
+    :param content: List of content
+    :return: The article with its content
+    """
     await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
     article = await search_results_collection.find_one({'_id': ObjectId(id_)})
     return search_results_helper(article)
diff --git a/server/routes/search_result.py b/server/routes/search_result.py
index c5a65ab..1e57ce7 100644
--- a/server/routes/search_result.py
+++ b/server/routes/search_result.py
@@ -16,6 +16,13 @@
 
 @router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
 async def get_search_results(find: str | None = None) -> list[ArticleModel]:
+    """Find articles by search query
+
+    Get the articles from the database that match the search query.
+    If fewer than 20 are found, scrape new articles and add them to
+    the database. If the 'find' param is empty, return the 20 newest
+    articles.
+    """
     if find:
         results = await retrieve_search_results_by_tags(find.split())
         if len(results) < 20:
@@ -28,6 +35,12 @@ async def get_search_results(find: str | None = None) -> list[ArticleModel]:
 
 @router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
 async def get_article(id: str) -> ExtendArticleModel:
+    """Get a concrete article with its content
+
+    Find the article by ID in the database. If it exists, check
+    whether its 'content' field is filled; if not, scrape the
+    content. If the article does not exist in the database, return 404.
+    """
     result = await retrieve_search_result_by_id(id)
     if result:
         if not result['content']:
diff --git a/server/scraper.py b/server/scraper.py
index b98d87e..f856388 100644
--- a/server/scraper.py
+++ b/server/scraper.py
@@ -13,10 +13,16 @@
 
 
 def scrap_from_search(search: str) -> list[dict]:
+    """Take a search query and return a list of articles
+
+    :param search: Query string describing what the user wants to find
+    :return: Result of the search - a list of articles
+    """
     link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
     page_src = requests.get(link_src, headers=headers)
     soup_src = BeautifulSoup(page_src.content, 'html.parser')
 
+    # parse search results
     laptops = soup_src.find_all('div', class_='listingResult')
     laptops_data = []
     for laptop in laptops:
@@ -34,6 +40,11 @@ def scrap_from_search(search: str) -> list[dict]:
 
 
 def scrap_content(link: str) -> list[list]:
+    """Parse a concrete article's content
+
+    :param link: URL of the article
+    :return: List of content blocks, each holding the type of content and the content itself
+    """
     page_src = requests.get(link, headers=headers)
     soup_src = BeautifulSoup(page_src.content, 'html.parser')
 

From 4daeeb1e4993b75613f3c0e7d75d3f5eff73b7b2 Mon Sep 17 00:00:00 2001
From: Illia Kaialainien
Date: Sat, 16 Dec 2023 13:29:22 +0200
Subject: [PATCH 3/3] added readme.md

---
 README.md        | 73 +++++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt |  1 -
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 109fb80..3c619ba 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,72 @@
-# News about laptops (microservice)
\ No newline at end of file
+# Laptop News Microservice
+
+This is a microservice designed to fetch and serve news articles using FastAPI, Requests, and BeautifulSoup. It provides a simple and efficient way to retrieve news articles about laptops.
+
+## Features
+
+- **FastAPI:** Utilizes the FastAPI framework for creating a fast and modern API with automatic documentation generation.
+- **Requests:** Makes HTTP requests to external news sources to fetch the latest articles.
+- **Beautiful Soup:** Parses and extracts relevant information from the HTML content of news articles.
+- **Asynchronous:** Takes advantage of asynchronous programming for efficient handling of multiple requests.
+
+## Installation
+
+1. Clone the repository:
+
+   ```bash
+   git clone https://github.com/LLkaia/news-laptops-ms.git
+   cd news-laptops-ms
+   ```
+
+2. Create a virtual environment (optional but recommended):
+
+   ```bash
+   python -m venv venv
+   ```
+
+3. Activate the virtual environment:
+
+   - On Windows:
+
+     ```bash
+     .\venv\Scripts\activate
+     ```
+
+   - On Unix or MacOS:
+
+     ```bash
+     source venv/bin/activate
+     ```
+
+4. Install the dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+## Usage
+
+1. Run the FastAPI application:
+
+   ```bash
+   python main.py
+   ```
+
+2. Open your web browser and navigate to `http://localhost:8000/docs` to access the Swagger documentation. Here, you can test the different API endpoints and see example requests and responses.
+
+## API Endpoints
+
+- `/news/search`: Search articles about laptops.
+
+  - Example:
+
+    ```
+    http://localhost:8000/news/search?find=acer+aspire+7+review
+    ```
+- `/news/search/{id}`: Show a specific article.
+
+  - Example:
+
+    ```
+    http://localhost:8000/news/search/657c1690f253079b6f3ed074
+    ```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9425e7b..90c2627 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,6 @@ pyflakes==3.1.0
 pymongo==4.6.1
 PySocks==1.7.1
 requests==2.31.0
-selenium==4.16.0
 sniffio==1.3.0
 sortedcontainers==2.4.0
 soupsieve==2.5
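
A quick usage sketch (not part of the patches above): assuming the service has been started locally with `python main.py` and MongoDB is reachable at `mongodb://localhost:27017` as configured in `server/database.py`, a client could exercise the two endpoints documented in the README like this. The field names follow `ArticleModel`/`ExtendArticleModel` from `server/models/search_result.py`, and the search string reuses the README's own example query.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumes the service from main.py is running locally

# Search articles about laptops; the server scrapes laptopmag.com when
# fewer than 20 matching articles are already stored in MongoDB.
articles = requests.get(
    f"{BASE_URL}/news/search", params={"find": "acer aspire 7 review"}
).json()

for article in articles:
    # Each item follows ArticleModel: id, link, title, author, image,
    # date, description, tags.
    print(article["id"], article["title"])

# Fetch one article with its scraped content. ExtendArticleModel adds a
# `content` field: a list of (type, value) pairs such as ('paragraph', ...),
# ('title', ...) or ('image', ...).
if articles:
    detail = requests.get(f"{BASE_URL}/news/search/{articles[0]['id']}").json()
    for kind, value in detail["content"]:
        print(kind, value[:80])
```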