
API REWRITTEN #18

Merged: 9 commits, Jan 5, 2024
14 changes: 14 additions & 0 deletions .github/workflows/blacklint.yml
@@ -0,0 +1,14 @@
+name: 🎞 Lint
+
+on: [push, pull_request]
+
+jobs:
+  python-black:
+    name: Python Black
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Python Black
+        uses: cytopia/docker-black@0.8
+        with:
+          path: '.'
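This workflow runs cytopia/docker-black against the whole repository on every push and pull request. As a rough local equivalent (a sketch, not part of this PR, assuming the `black` package is installed via `pip install black`; the sample source string is hypothetical):

    # Check a source string the way the CI lint would.
    import black

    messy = "def f( a,b ):\n    return a+b\n"

    try:
        # format_file_contents raises black.NothingChanged when the input
        # already complies, so the except branch is the "lint passes" case.
        formatted = black.format_file_contents(messy, fast=False, mode=black.Mode())
        print("Black would reformat this code to:\n" + formatted)
    except black.NothingChanged:
        print("Already Black-formatted; the lint job would pass.")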
3 changes: 3 additions & 0 deletions FILMAN-CRAWLER/Dockerfile
@@ -1,5 +1,8 @@
 FROM python:3.11-slim-bullseye

+RUN apt update
+RUN apt install -y pkg-config default-libmysqlclient-dev build-essential
+
 WORKDIR /FILMAN-CRAWLER

 COPY requirements.txt requirements.txt
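(The added packages are presumably the build dependencies for the mysqlclient wheel: the slim Debian base image ships without a compiler, pkg-config, or the MySQL client headers, so pip cannot build the extension module without them.)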
178 changes: 101 additions & 77 deletions FILMAN-CRAWLER/main.py
@@ -6,9 +6,16 @@

 from fake_useragent import UserAgent

-from tasks.scrap_movie import Scraper as movie_scrapper
-from tasks.scrap_user_watched import Scraper as user_watched_scrapper
+from enum import Enum
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from tasks.scrap_movie import Scraper as movie_scrapper
+from tasks.scrap_series import Scraper as series_scrapper
+from tasks.scrap_user_watched_movies import Scraper as user_watched_movies_scrapper
+from tasks.scrap_user_watched_series import Scraper as user_watched_series_scrapper

 from concurrent.futures import ThreadPoolExecutor

@@ -40,8 +47,7 @@
 logger.addHandler(c_handler)
 logger.addHandler(f_handler)

-CORE_ENDPOINT = os.environ.get("CORE_ENDPOINT", "http://localhost:8000")
-# CORE_ENDPOINT = "http://localhost:8000"
+CORE_ENDPOINT = os.environ.get("CORE_ENDPOINT", "http://localhost:8001")

 HEADERS = {
     "User-Agent": UserAgent().random,
@@ -64,105 +70,123 @@
     "TE": "trailers",
 }

-# status: waiting, in_progress, done, failed
-# type: scrap_movie, scrape_series, -send_discord-, check_user_new_movies, check_user_new_series,
+class TaskTypes(str, Enum):
+    SCRAP_USER = "scrap_user"
+    SCRAP_FILMWEB_MOVIE = "scrap_filmweb_movie"
+    SCRAP_FILMWEB_SERIES = "scrap_filmweb_series"
+    SCRAP_FILMWEB_USER_WATCHED_MOVIES = "scrap_filmweb_user_watched_movies"
+    SCRAP_FILMWEB_USER_WATCHED_SERIES = "scrap_filmweb_user_watched_series"
+    SEND_DISCORD_NOTIFICATION = "send_discord_notification"
+
+
+class TaskStatus(str, Enum):
+    QUEUED = "queued"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"

-class Task:
-    def __init__(
-        self, id_task, status, type, job, unix_timestamp, unix_timestamp_last_update
-    ):
-        self.id_task = id_task
-        self.status = status
-        self.type = type
-        self.job = job
-        self.unix_timestamp = unix_timestamp
-        self.unix_timestamp_last_update = unix_timestamp_last_update
-
-    def __str__(self):
-        return (
-            f"{self.id_task} {self.status} {self.type} {self.job} {self.unix_timestamp}"
-        )
+class Task(BaseModel):
+    task_id: int
+    task_status: TaskStatus
+    task_type: TaskTypes
+    task_job: str
+    task_created: datetime
+    task_started: Optional[datetime] = None
+    task_finished: Optional[datetime] = None
+
+
+ALLOWED_TASKS = [
+    TaskTypes.SCRAP_FILMWEB_MOVIE,
+    TaskTypes.SCRAP_FILMWEB_SERIES,
+    TaskTypes.SCRAP_FILMWEB_USER_WATCHED_MOVIES,
+    TaskTypes.SCRAP_FILMWEB_USER_WATCHED_SERIES,
+]
+
+TASK_TYPES = [task for task in ALLOWED_TASKS]


-def fetch_tasks_from_endpoint():
-    # r = requests.get(
-    #     f'{CORE_ENDPOINT}/tasks/get?status=waiting&types=["scrap_movie","check_user_new_movies"]'
-    # )
-
-    allowed_tasks = [
-        "scrap_movie",
-        "check_user_new_movies",
-    ]
-
-    try:
-        r = requests.post(
-            f"{CORE_ENDPOINT}/tasks/get",
-            json={
-                "status": "waiting",
-                "types": allowed_tasks,
-            },
-            headers=HEADERS,
-        )
-    except Exception as e:
-        logging.error(f"Error fetching tasks: {e}")
-        return None
-
-    if r.status_code != 200:
-        logging.error(f"Error fetching tasks: HTTP {r.status_code}")
-        return None
-
-    return ujson.loads(r.text)
+def check_there_are_any_tasks():
+    try:
+        r = requests.head(
+            f"{CORE_ENDPOINT}/tasks/get/task/to_do",
+            params={"task_types": TASK_TYPES},
+            headers=HEADERS,
+        )
+
+        if r.status_code != 200:
+            return False
+
+        return True
+
+    except Exception as e:
+        logging.error(f"Error checking tasks: {e}")
+        return False
+
+
+def get_task_to_do() -> Task:
+    try:
+        r = requests.get(
+            f"{CORE_ENDPOINT}/tasks/get/task/to_do",
+            params={"task_types": TASK_TYPES},
+            headers=HEADERS,
+        )
+
+        if r.status_code != 200:
+            return None
+
+        return Task(**r.json())
+
+    except Exception as e:
+        logging.error(f"Error getting task to do: {e}")
+        return None
+
+
+def do_task(task: Task):
+    if task.task_type == TaskTypes.SCRAP_FILMWEB_MOVIE:
+        scraper = movie_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, movie_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_SERIES:
+        scraper = series_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, series_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_USER_WATCHED_MOVIES:
+        scraper = user_watched_movies_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, user_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_USER_WATCHED_SERIES:
+        scraper = user_watched_series_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, user_id=task.task_job)
+        scraper.scrap(task)
+
+    else:
+        logging.error(f"Unknown task type: {task.task_type}")

 def main():
     logging.info("Program started")

-    min_wait = 1  # Minimum wait time in seconds
+    min_wait = 2  # Minimum wait time in seconds
     max_wait = 60  # Maximum wait time in seconds
     wait_time = min_wait

     with ThreadPoolExecutor() as executor:
         while True:
-            logging.info("Fetching tasks from endpoint")
-            tasks = fetch_tasks_from_endpoint()
-
-            if tasks:
-                logging.info(f"Found {len(tasks)} tasks")
-                wait_time = min_wait
-
-                for task in tasks:
-                    task = Task(
-                        task["id_task"],
-                        task["status"],
-                        task["type"],
-                        task["job"],
-                        task["unix_timestamp"],
-                        task["unix_timestamp_last_update"],
-                    )
-
-                    if task.type == "scrap_movie":
-                        m_scraper = movie_scrapper(
-                            headers=HEADERS,
-                            movie_id=task.job,
-                            endpoint_url=CORE_ENDPOINT,
-                        )
-                        executor.submit(m_scraper.scrap, task)
-
-                    if task.type == "check_user_new_movies":
-                        m_scraper = user_watched_scrapper(
-                            headers=HEADERS,
-                            movie_id=task.job,
-                            endpoint_url=CORE_ENDPOINT,
-                        )
-                        executor.submit(m_scraper.scrap, task)
-
-            else:
-                logging.info("No tasks found")
-                wait_time = min(wait_time * 2, max_wait)
-
-            logging.info(f"Waiting for {wait_time} seconds")
-            time.sleep(wait_time)
+            if check_there_are_any_tasks():
+                task = get_task_to_do()
+
+                if task is not None:
+                    executor.submit(do_task, task)
+                    wait_time = min_wait
+            else:
+                logging.info("No tasks found")
+                wait_time = min_wait
+
+            time.sleep(wait_time)


 if __name__ == "__main__":
     main()
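With this rewrite the crawler deserializes the core API's JSON response directly into the pydantic Task model (`Task(**r.json())` in get_task_to_do). A minimal, self-contained sketch of that round trip; the payload values are hypothetical, and the enums are trimmed to one member each for brevity:

    # Sketch of the Task deserialization used by get_task_to_do(); the payload
    # below is a hypothetical example of what /tasks/get/task/to_do might return.
    from datetime import datetime
    from enum import Enum
    from typing import Optional

    from pydantic import BaseModel


    class TaskTypes(str, Enum):  # trimmed to one member for brevity
        SCRAP_FILMWEB_MOVIE = "scrap_filmweb_movie"


    class TaskStatus(str, Enum):  # trimmed to one member for brevity
        QUEUED = "queued"


    class Task(BaseModel):
        task_id: int
        task_status: TaskStatus
        task_type: TaskTypes
        task_job: str
        task_created: datetime
        task_started: Optional[datetime] = None
        task_finished: Optional[datetime] = None


    payload = {
        "task_id": 1,
        "task_status": "queued",
        "task_type": "scrap_filmweb_movie",
        "task_job": "628",  # for movie tasks this is the movie id passed to the scraper
        "task_created": "2024-01-05T12:00:00",
    }

    task = Task(**payload)  # pydantic coerces the strings to enums and datetime
    print(task.task_type.value, task.task_job, task.task_started)
    # -> scrap_filmweb_movie 628 None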
47 changes: 0 additions & 47 deletions FILMAN-CRAWLER/requirements.txt

This file was deleted.
