Skip to content

Commit

Permalink
added cron weekly updating
Browse files Browse the repository at this point in the history
  • Loading branch information
LLkaia committed Jan 12, 2024
1 parent 2c5b0af commit 146ec21
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 23 deletions.
10 changes: 9 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
FROM python:3.11

WORKDIR /usr/local/etc/lappy/

# cron triggers the scheduled news update; supervisor runs cron and the
# server side by side. The apt lists are removed in the same layer so they
# do not end up in the image.
RUN apt-get update \
    && apt-get install -y cron supervisor \
    && rm -rf /var/lib/apt/lists/*
COPY ./update_news /etc/cron.d/update_news
COPY ./cron.py .
# Install the schedule as the crontab of the user running the build.
RUN crontab /etc/cron.d/update_news

COPY ./requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server ./server
COPY ./main.py .

COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Exec form: supervisord is started directly instead of through `/bin/sh -c`,
# so it receives container stop signals itself. The former
# `CMD python3 main.py` was dropped: a shell-form ENTRYPOINT ignored it, and
# with the exec form it would be appended to supervisord's arguments. The
# server is started by supervisord (see supervisord.conf).
ENTRYPOINT ["/usr/bin/supervisord"]
9 changes: 9 additions & 0 deletions cron.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import asyncio

from server.database import add_newest_news
from server.scraper import scrap_newest


def main() -> None:
    """Scrape the newest articles and store them in the database."""
    newest_articles = scrap_newest()
    asyncio.run(add_newest_news(newest_articles))


if __name__ == '__main__':
    main()
20 changes: 0 additions & 20 deletions fill_db.py

This file was deleted.

1 change: 0 additions & 1 deletion laptop_models.json

This file was deleted.

17 changes: 17 additions & 0 deletions server/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,23 @@ async def update_content_of_article(id_: str, content: list[list]):
return search_results_helper(article)


async def add_newest_news(articles: list):
    """Add the newest articles to the database.

    Each article is looked up by its 'link'. If a document with that link
    already exists, the article's tags are merged into the stored 'tags'
    array; otherwise the article is inserted as a new document.

    :param articles: List of the newest articles; every item is a dict
        that provides at least the 'link' and 'tags' keys.
    """
    for article in articles:
        # One atomic update replaces the previous find/find/update sequence:
        # $addToSet with $each appends only the tags that are not stored
        # yet and leaves every other field of the document untouched.
        outcome = await search_results_collection.update_one(
            {"link": article["link"]},
            {"$addToSet": {"tags": {"$each": article["tags"]}}},
        )
        if outcome.matched_count == 0:
            # No document with this link yet, so store the article as new.
            await search_results_collection.insert_one(article)


def resolve_period_expression(period: Period) -> dict:
"""Create expression based on Period from query"""
if period is Period.last_week:
Expand Down
24 changes: 23 additions & 1 deletion server/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def scrap_from_search(search: str) -> list[dict]:
return laptops_data


def scrap_content(link: str) -> list[list]:
def scrap_content(link: str) -> list:
"""Parse concrete article's content
:param link: URL of article
Expand All @@ -63,3 +63,25 @@ def scrap_content(link: str) -> list[list]:
image = block.find('img').get('data-pin-media')
content.append(('image', image))
return content


def scrap_newest(pages: int = 9, timeout: float = 30.0) -> list[dict]:
    """Scrap list of the newest articles.

    Walks the laptopmag.com search listing for 'laptops', sorted by
    publication date, and collects one dict per listed article.

    :param pages: Number of result pages to scan, starting from page 1.
    :param timeout: Seconds to wait for each HTTP response before
        `requests` raises a timeout error.
    :return: List of dicts with the keys 'link', 'image', 'title',
        'author', 'date', 'description', 'tags' (the words of the title)
        and 'content' (an empty list).
    """
    laptops_data = []
    for i in range(1, pages + 1):
        link_src = f'https://www.laptopmag.com/search/page/{i}?articleType=all&searchTerm=laptops&sortBy=publishedDate'
        # `headers` has to be passed by keyword: the second positional
        # parameter of requests.get is `params`, so the positional form
        # sent the values as query-string parameters instead of as HTTP
        # headers. NOTE(review): `headers` is assumed to be the
        # module-level request-header dict - confirm.
        page_src = requests.get(link_src, headers=headers, timeout=timeout)
        soup_src = BeautifulSoup(page_src.content, 'html.parser')
        for laptop in soup_src.find_all('div', class_='listingResult'):
            title = laptop.find('h3', class_='article-name').text.strip()
            laptops_data.append({
                'link': laptop.find('a', class_='article-link').get('href'),
                'image': laptop.find('img').get('data-pin-media'),
                'title': title,
                'author': laptop.find('span', attrs={'style': 'white-space:nowrap'}).text.strip(),
                'date': laptop.find('time').get('datetime'),
                'description': laptop.find('p', class_='synopsis').text.strip(),
                'tags': title.split(),
                'content': [],
            })
    return laptops_data
12 changes: 12 additions & 0 deletions supervisord.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
; Keep supervisord in the foreground instead of daemonizing.
[supervisord]
nodaemon=true

; Cron daemon, run in the foreground (-f) so supervisord can supervise it.
[program:cron]
command=cron -f

; Application server. Its output is forwarded to supervisord's own
; stdout/stderr; maxbytes=0 disables log rotation, which cannot be applied
; to the /dev/stdout and /dev/stderr devices.
[program:server]
command=python3 /usr/local/etc/lappy/main.py
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
1 change: 1 addition & 0 deletions update_news
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Refresh the news every Monday at 00:00.
# The interpreter is given by absolute path: cron runs jobs with a minimal
# PATH, so a bare `python3` may resolve to a different interpreter than the
# one in /usr/local/bin that has the project's requirements installed.
0 0 * * 1 /usr/local/bin/python3 /usr/local/etc/lappy/cron.py

0 comments on commit 146ec21

Please sign in to comment.