
API REWRITTEN #18

Merged: 9 commits, Jan 5, 2024
14 changes: 14 additions & 0 deletions .github/workflows/blacklint.yml
@@ -0,0 +1,14 @@
+name: 🎞 Lint
+
+on: [push, pull_request]
+
+jobs:
+  python-black:
+    name: Python Black
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Python Black
+        uses: cytopia/docker-black@0.8
+        with:
+          path: '.'
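This workflow runs cytopia/docker-black against the whole repository on every push and pull request. As a rough local equivalent (a sketch, not part of this PR, assuming the `black` package is installed via `pip install black`; the sample source string is hypothetical):

    # Check a source string the way the CI lint would.
    import black

    messy = "def f( a,b ):\n    return a+b\n"

    try:
        # format_file_contents raises black.NothingChanged when the input
        # already complies, so the except branch is the "lint passes" case.
        formatted = black.format_file_contents(messy, fast=False, mode=black.Mode())
        print("Black would reformat this code to:\n" + formatted)
    except black.NothingChanged:
        print("Already Black-formatted; the lint job would pass.")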
3 changes: 3 additions & 0 deletions FILMAN-CRAWLER/Dockerfile
@@ -1,5 +1,8 @@
 FROM python:3.11-slim-bullseye

+RUN apt update
+RUN apt install -y pkg-config default-libmysqlclient-dev build-essential
+
 WORKDIR /FILMAN-CRAWLER

 COPY requirements.txt requirements.txt
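(The added packages are presumably the build dependencies for the mysqlclient wheel: the slim Debian base image ships without a compiler, pkg-config, or the MySQL client headers, so pip cannot build the extension module without them.)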
178 changes: 101 additions & 77 deletions FILMAN-CRAWLER/main.py
@@ -6,9 +6,16 @@

 from fake_useragent import UserAgent

-from tasks.scrap_movie import Scraper as movie_scrapper
-from tasks.scrap_user_watched import Scraper as user_watched_scrapper
+from enum import Enum
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from tasks.scrap_movie import Scraper as movie_scrapper
+from tasks.scrap_series import Scraper as series_scrapper
+from tasks.scrap_user_watched_movies import Scraper as user_watched_movies_scrapper
+from tasks.scrap_user_watched_series import Scraper as user_watched_series_scrapper

 from concurrent.futures import ThreadPoolExecutor

@@ -40,8 +47,7 @@
 logger.addHandler(c_handler)
 logger.addHandler(f_handler)

-CORE_ENDPOINT = os.environ.get("CORE_ENDPOINT", "http://localhost:8000")
-# CORE_ENDPOINT = "http://localhost:8000"
+CORE_ENDPOINT = os.environ.get("CORE_ENDPOINT", "http://localhost:8001")

 HEADERS = {
     "User-Agent": UserAgent().random,
@@ -64,105 +70,123 @@
     "TE": "trailers",
 }

-# status: waiting, in_progress, done, failed
-# type: scrap_movie, scrape_series, -send_discord-, check_user_new_movies, check_user_new_series,
+class TaskTypes(str, Enum):
+    SCRAP_USER = "scrap_user"
+    SCRAP_FILMWEB_MOVIE = "scrap_filmweb_movie"
+    SCRAP_FILMWEB_SERIES = "scrap_filmweb_series"
+    SCRAP_FILMWEB_USER_WATCHED_MOVIES = "scrap_filmweb_user_watched_movies"
+    SCRAP_FILMWEB_USER_WATCHED_SERIES = "scrap_filmweb_user_watched_series"
+    SEND_DISCORD_NOTIFICATION = "send_discord_notification"
+
+
+class TaskStatus(str, Enum):
+    QUEUED = "queued"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"

-class Task:
-    def __init__(
-        self, id_task, status, type, job, unix_timestamp, unix_timestamp_last_update
-    ):
-        self.id_task = id_task
-        self.status = status
-        self.type = type
-        self.job = job
-        self.unix_timestamp = unix_timestamp
-        self.unix_timestamp_last_update = unix_timestamp_last_update
-
-    def __str__(self):
-        return (
-            f"{self.id_task} {self.status} {self.type} {self.job} {self.unix_timestamp}"
-        )
+class Task(BaseModel):
+    task_id: int
+    task_status: TaskStatus
+    task_type: TaskTypes
+    task_job: str
+    task_created: datetime
+    task_started: Optional[datetime] = None
+    task_finished: Optional[datetime] = None
+
+
+ALLOWED_TASKS = [
+    TaskTypes.SCRAP_FILMWEB_MOVIE,
+    TaskTypes.SCRAP_FILMWEB_SERIES,
+    TaskTypes.SCRAP_FILMWEB_USER_WATCHED_MOVIES,
+    TaskTypes.SCRAP_FILMWEB_USER_WATCHED_SERIES,
+]
+
+TASK_TYPES = [task for task in ALLOWED_TASKS]


-def fetch_tasks_from_endpoint():
-    # r = requests.get(
-    #     f'{CORE_ENDPOINT}/tasks/get?status=waiting&types=["scrap_movie","check_user_new_movies"]'
-    # )
-
-    allowed_tasks = [
-        "scrap_movie",
-        "check_user_new_movies",
-    ]
-
-    try:
-        r = requests.post(
-            f"{CORE_ENDPOINT}/tasks/get",
-            json={
-                "status": "waiting",
-                "types": allowed_tasks,
-            },
-            headers=HEADERS,
-        )
-    except Exception as e:
-        logging.error(f"Error fetching tasks: {e}")
-        return None
-
-    if r.status_code != 200:
-        logging.error(f"Error fetching tasks: HTTP {r.status_code}")
-        return None
-
-    return ujson.loads(r.text)
+def check_there_are_any_tasks():
+    try:
+        r = requests.head(
+            f"{CORE_ENDPOINT}/tasks/get/task/to_do",
+            params={"task_types": TASK_TYPES},
+            headers=HEADERS,
+        )
+
+        if r.status_code != 200:
+            return False
+
+        return True
+
+    except Exception as e:
+        logging.error(f"Error checking tasks: {e}")
+        return False
+
+
+def get_task_to_do() -> Task:
+    try:
+        r = requests.get(
+            f"{CORE_ENDPOINT}/tasks/get/task/to_do",
+            params={"task_types": TASK_TYPES},
+            headers=HEADERS,
+        )
+
+        if r.status_code != 200:
+            return None
+
+        return Task(**r.json())
+
+    except Exception as e:
+        logging.error(f"Error getting task to do: {e}")
+        return None
+
+
+def do_task(task: Task):
+    if task.task_type == TaskTypes.SCRAP_FILMWEB_MOVIE:
+        scraper = movie_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, movie_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_SERIES:
+        scraper = series_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, series_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_USER_WATCHED_MOVIES:
+        scraper = user_watched_movies_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, user_id=task.task_job)
+        scraper.scrap(task)
+
+    elif task.task_type == TaskTypes.SCRAP_FILMWEB_USER_WATCHED_SERIES:
+        scraper = user_watched_series_scrapper(headers=HEADERS, endpoint_url=CORE_ENDPOINT, user_id=task.task_job)
+        scraper.scrap(task)
+
+    else:
+        logging.error(f"Unknown task type: {task.task_type}")

 def main():
     logging.info("Program started")

-    min_wait = 1  # Minimum wait time in seconds
+    min_wait = 2  # Minimum wait time in seconds
     max_wait = 60  # Maximum wait time in seconds
     wait_time = min_wait

     with ThreadPoolExecutor() as executor:
         while True:
-            logging.info("Fetching tasks from endpoint")
-            tasks = fetch_tasks_from_endpoint()
-
-            if tasks:
-                logging.info(f"Found {len(tasks)} tasks")
-                wait_time = min_wait
-
-                for task in tasks:
-                    task = Task(
-                        task["id_task"],
-                        task["status"],
-                        task["type"],
-                        task["job"],
-                        task["unix_timestamp"],
-                        task["unix_timestamp_last_update"],
-                    )
-
-                    if task.type == "scrap_movie":
-                        m_scraper = movie_scrapper(
-                            headers=HEADERS,
-                            movie_id=task.job,
-                            endpoint_url=CORE_ENDPOINT,
-                        )
-                        executor.submit(m_scraper.scrap, task)
-
-                    if task.type == "check_user_new_movies":
-                        m_scraper = user_watched_scrapper(
-                            headers=HEADERS,
-                            movie_id=task.job,
-                            endpoint_url=CORE_ENDPOINT,
-                        )
-                        executor.submit(m_scraper.scrap, task)
-
-            else:
-                logging.info("No tasks found")
-                wait_time = min(wait_time * 2, max_wait)
-
-            logging.info(f"Waiting for {wait_time} seconds")
-            time.sleep(wait_time)
+            if check_there_are_any_tasks():
+                task = get_task_to_do()
+
+                if task is not None:
+                    executor.submit(do_task, task)
+                    wait_time = min_wait
+            else:
+                logging.info("No tasks found")
+                wait_time = min_wait
+
+            time.sleep(wait_time)


 if __name__ == "__main__":
     main()
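With this rewrite the crawler deserializes the core API's JSON response directly into the pydantic Task model (`Task(**r.json())` in get_task_to_do). A minimal, self-contained sketch of that round trip; the payload values are hypothetical, and the enums are trimmed to one member each for brevity:

    # Sketch of the Task deserialization used by get_task_to_do(); the payload
    # below is a hypothetical example of what /tasks/get/task/to_do might return.
    from datetime import datetime
    from enum import Enum
    from typing import Optional

    from pydantic import BaseModel


    class TaskTypes(str, Enum):  # trimmed to one member for brevity
        SCRAP_FILMWEB_MOVIE = "scrap_filmweb_movie"


    class TaskStatus(str, Enum):  # trimmed to one member for brevity
        QUEUED = "queued"


    class Task(BaseModel):
        task_id: int
        task_status: TaskStatus
        task_type: TaskTypes
        task_job: str
        task_created: datetime
        task_started: Optional[datetime] = None
        task_finished: Optional[datetime] = None


    payload = {
        "task_id": 1,
        "task_status": "queued",
        "task_type": "scrap_filmweb_movie",
        "task_job": "628",  # for movie tasks this is the movie id passed to the scraper
        "task_created": "2024-01-05T12:00:00",
    }

    task = Task(**payload)  # pydantic coerces the strings to enums and datetime
    print(task.task_type.value, task.task_job, task.task_started)
    # -> scrap_filmweb_movie 628 None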
47 changes: 0 additions & 47 deletions FILMAN-CRAWLER/requirements.txt

This file was deleted.
