diff --git a/.gitignore b/.gitignore index c41b3fc..3f3c3ef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ **/__pycache__ build dist -filmweb.egg-info +**/filmweb.egg-info +filmwebvenv .vscode diff --git a/README.md b/README.md index ee9e369..01bedae 100644 --- a/README.md +++ b/README.md @@ -40,19 +40,43 @@ filmweb ### Przykład ``` -$ filmweb -f csv pieca "canProfile=true_...tcKeywords=" -INFO:root:Fetching data... -100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00, 5.13it/s] -INFO:root:Parsing data... -100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00, 4.52it/s] -INFO:root:pieca_filmweb_20201031.csv written! -$ head -6 pieca_filmweb_20201031.csv -"timestamp","iso_date","user_comment","user_vote","global_rating","global_votes","original_title","pl_title","directors","countries","genres","link","duration_min","year" -"1580143639","2020-01-27T17:47:19","","3","7.2103400230407715","73632","What We Do in the Shadows","Co robimy w ukryciu","['Jemaine Clement']","['Nowa Zelandia', 'USA']","['Horror', 'Komedia', 'Dokumentalizowany']","https://www.filmweb.pl/film/Co+robimy+w+ukryciu-2014-707286","86","2015-02-27" -"1580143596","2020-01-27T17:46:36","","1","7.762599945068359","76768","","Jojo Rabbit","['Taika Waititi']","['Czechy', 'Niemcy', 'Nowa Zelandia', 'USA']","['Dramat', 'Komedia', 'Wojenny']","https://www.filmweb.pl/film/Jojo+Rabbit-2019-817417","108","2020-01-24" -"1580033558","2020-01-26T11:12:38","","6","6.284679889678955","966","Quick","Seryjny morderca","['Mikael Håfström']","['Szwecja']","['Thriller']","https://www.filmweb.pl/film/Seryjny+morderca-2019-832513","132","2020-09-03" -"1579429860","2020-01-19T11:31:00","","7","6.661180019378662","425","","Difret","['Zeresenay Mehari']","['USA', 'Etiopia']","['Dramat']","https://www.filmweb.pl/film/Difret-2014-700409","99","2015-03-27" -"1579354699","2020-01-18T14:38:19","","5","7.180500030517578","4471","Dylda","Wysoka dziewczyna","['Kantemir Balagov']","['Rosja']","['Dramat']","https://www.filmweb.pl/film/Wysoka+dziewczyna-2019-829460","130","2019-10-11" +$ filmweb -f all pieca "didomi_token=(...)==" +INFO:root:Checking args... +INFO:root:Fetching list of movies... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:06<00:00, 6.26it/s] +INFO:root:Parsing list of movies... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 12.79it/s] +INFO:root:User pieca has 926 movies... +INFO:root:Fetching movie details... +INFO:root:Fetching user ratings [1/3]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:39<00:00, 23.49it/s] +INFO:root:Fetching info about movies [2/3]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.22it/s] +INFO:root:Fetching global rating for movies [3/3]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.36it/s] +INFO:root:Writing data... +INFO:root:pieca_filmweb_20230121.json written! +INFO:root:pieca_filmweb_20230121.csv written! +$ cat pieca_filmweb_20230121.json | jq .[0] +{ + "timestamp": 1657484863818, + "favorite": false, + "user_rating": 8, + "global_rating": 7.36859, + "global_rating_count": 1579, + "original_title": "Tehran Taboo", + "pl_title": "Teheran tabu", + "year": 2017, + "movie_id": "786978", + "url": "https://www.filmweb.pl/film/Teheran+tabu-2017-786978" +} +$ cat pieca_filmweb_20230121.csv | xsv sample 5 | xsv table +timestamp favorite user_rating global_rating global_rating_count original_title pl_title year movie_id url +1464302814850 False 4 6.91279 1743 Pupendo Pupendo 2003 103930 https://www.filmweb.pl/film/Pupendo-2003-103930 +1581177494926 False 7 6.51905 210 Dukhtar Dukhtar 2014 727743 https://www.filmweb.pl/film/Dukhtar-2014-727743 +1601716769499 False 8 7.59777 179 Shah-re ziba Piękne miasto 2004 155344 https://www.filmweb.pl/film/Pi%C4%99kne+miasto-2004-155344 +1548505975360 False 8 7.12276 1784 Geu-mul W sieci 2016 766555 https://www.filmweb.pl/film/W+sieci-2016-766555 +1638616845248 False 5 6.59127 115166 Ida Ida 2013 546529 https://www.filmweb.pl/film/Ida-2013-546529 ``` ### Wszystkie opcje @@ -72,20 +96,18 @@ Options: ## Dostępne dane: -- duration_min: _długość w min_ -- year: _premiera_ -- global_votes: _ilość ocen filmu_ -- global_rating: _ocena filmweb_ -- directors: _reżyserzy (lista)_ -- countries: _kraje (lista)_ -- genres: _gatunki (lista)_ -- timestamp: _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_ -- iso_date: _[czas oceny (ISO)](https://pl.wikipedia.org/wiki/ISO_8601)_ -- user_vote: _ocena użytkownika_ -- user_comment: _komentarz użytkownika_ -- original_title: _tytuł oryginalny_ -- pl_title: _tytuł polski_ -- link: _strona filmu_ +Kolumna | Opis +--- | --- +year | _premiera_ +global\_rating\_count | _ilość ocen filmu_ +global\_rating | _ocena filmweb_ +timestamp | _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_ +user\_rating | _ocena użytkownika_ +favorite | _dodany do ulubionych_ +original\_title | _tytuł oryginalny_ +pl\_title | _tytuł polski_ +movie\_id | _id filmu_ +url | _strona filmu_ ## Znane problemy: diff --git a/filmweb/getter.py b/filmweb/getter.py index 11957dd..e9bd303 100644 --- a/filmweb/getter.py +++ b/filmweb/getter.py @@ -1,7 +1,10 @@ +import json import requests HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0', + # https://www.whatismybrowser.com/guides/the-latest-user-agent/firefox + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13.1; rv:109.0) Gecko/20100101 Firefox/109.0', + 'x-locale': 'pl_PL', 'Host': 'www.filmweb.pl', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', @@ -24,16 +27,65 @@ def get_films_page(args): response.raise_for_status() return response.text -def get_profile_page(user): +def auth_check(cookie): """ - Parse films page to extract total count of votes + Check if auth is OK (valid cookie after login) + """ + url = "https://www.filmweb.pl/api/v1/logged/info" + content = _get_json(url, cookie, "auth_check") + user = content["name"] + return user + +def get_votes_count(user): + """ + Get total count of votes Args: user: user to get ratings for """ - url = f'https://www.filmweb.pl/user/{user}' - response = requests.get(url, headers=HEADERS) + url = f'https://www.filmweb.pl/api/v1/user/{user}/votes/film/count' + return _get_json(url, "", "get_votes_count") + +def get_user_rating(args): + """ + Gets user rating + """ + (cookie, movie_id, user, friend_query) = args + if friend_query: + url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/film/{movie_id}/details" + else: + url = f"https://www.filmweb.pl/api/v1/logged/vote/film/{movie_id}/details" + data = _get_json(url, cookie, "get_user_rating") + data["movie_id"] = movie_id + return json.dumps(data) + +def get_global_info(movie_id): + """ + Get info about a movie (title etc) + """ + url = f"https://www.filmweb.pl/api/v1/title/{movie_id}/info" + data = _get_json(url, "", "get_global_info") + data["movie_id"] = movie_id + return json.dumps(data) + +def get_global_rating(movie_id): + """ + Get global rating for a movie + """ + url = f"https://www.filmweb.pl/api/v1/film/{movie_id}/rating" + data = _get_json(url, "", "get_global_rating") + data["movie_id"] = movie_id + data["global_rate"] = data.pop("rate") + return json.dumps(data) + +def _get_json(url, cookie, func_name): + """ + Wrapper for request and unified error + """ try: + response = requests.get(url, headers={'Cookie': cookie, **HEADERS}) response.raise_for_status() + content = response.json() except Exception as e: - raise ValueError(f'No user {user} found: {str(e)}') - return response.text + raise ValueError(f'Failure in {func_name}: {str(e)}') + else: + return content \ No newline at end of file diff --git a/filmweb/main.py b/filmweb/main.py index db7d955..cefada5 100644 --- a/filmweb/main.py +++ b/filmweb/main.py @@ -10,22 +10,18 @@ """ from docopt import docopt +import itertools +import json import re import logging +from math import ceil import multiprocessing import tqdm -from .getter import ( - get_films_page, - get_profile_page, -) -from .parser import ( - auth_check, - extract_movie_ratings, - get_pages_count, - write_data, -) +from . import getter +from . import parser PARALLEL_PROC = multiprocessing.cpu_count() +MOVIES_PER_PAGE = 25 def main(): args = docopt(__doc__) @@ -42,14 +38,30 @@ def main(): try: logging.info('Checking args...') cookie = re.sub('Cookie:', '', cookie).strip() - pages = get_pages_count(get_profile_page(user)) - auth_check(get_films_page((cookie, user, 1))) - logging.info('Fetching data...') + votes_total = getter.get_votes_count(user) + pages = ceil(votes_total/MOVIES_PER_PAGE) + logged_in_user = getter.auth_check(cookie) + friend_query = (user != logged_in_user) + logging.info('Fetching list of movies...') get_films_page_args = ((cookie, user, page) for page in range(1, pages+1)) - raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(get_films_page, get_films_page_args), total=pages)) - logging.info('Parsing data...') - movies = tuple(tqdm.tqdm(pool.imap_unordered(extract_movie_ratings, raw_responses), total=pages)) - write_data(movies, user, file_format) + raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages)) + logging.info('Parsing list of movies...') + ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages)) + ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids)))) + total_movies = len(ids) + logging.info(f'User {user} has {total_movies} movies...') + logging.info('Fetching movie details...') + logging.info('Fetching user ratings [1/3]...') + get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids) + user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies)) + # TODO make these 2 optional? + logging.info('Fetching info about movies [2/3]...') + global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies)) + logging.info('Fetching global rating for movies [3/3]...') + global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies)) + movies = parser.merge_data(ids, user_ratings, global_info, global_rating) + logging.info('Writing data...') + parser.write_data(movies, user, file_format) except Exception as e: logging.error(f'Program error: {str(e)}') finally: diff --git a/filmweb/parser.py b/filmweb/parser.py index b8b9f0b..aac9b08 100644 --- a/filmweb/parser.py +++ b/filmweb/parser.py @@ -1,134 +1,66 @@ -import re import csv -import itertools import logging import json from datetime import datetime -from math import ceil from bs4 import BeautifulSoup +from urllib.parse import quote_plus -MOVIES_PER_PAGE = 25 -ATTRS_MAPPING = { - 'global_votes': 'data-count', - 'global_rating': 'data-rate', - 'duration_min': 'data-duration', - 'year': 'data-release', +KEY_MAPPING = { + 'timestamp': 'timestamp', + 'favorite': 'favorite', + 'rate': 'user_rating', + 'global_rate': 'global_rating', + 'count': 'global_rating_count', + 'originalTitle': 'original_title', + 'title': 'pl_title', + 'year': 'year', + 'movie_id': 'movie_id', + 'url': 'url', } -LISTS_MAPPING = { - 'directors': 'filmPreview__info--directors', - 'countries': 'filmPreview__info--countries', - 'genres': 'filmPreview__info--genres', -} -CSV_ROWS = ( - 'timestamp', - 'iso_date', - 'user_comment', - 'user_vote', - 'global_rating', - 'global_votes', - 'original_title', - 'pl_title', - 'directors', - 'countries', - 'genres', - 'link', - 'duration_min', - 'year', -) -def get_pages_count(content): +def extract_movie_ids(content): """ - Parse profile page to extract pages count + Extract movie ids from films page Args: content: raw html """ soup = BeautifulSoup(content, 'html.parser') - try: - # TODO? future: other types than films are counted here as well - user_info_container = soup.find('div', attrs={'class': 'voteStatsBoxData'}) - user_info = json.loads(user_info_container.text) - ratings = int(user_info.get('votes').get('films')) - except Exception as e: - raise ValueError(f'No ratings count found on website: {str(e)}') - assert ratings > 0, 'no rating data available' - pages = ceil(ratings/MOVIES_PER_PAGE) - return pages + id_containers = soup.find_all('div', attrs={'data-film-id': True}) + ids = set(el['data-film-id'] for el in id_containers) + # necessary for multiprocessing pickle to work + return json.dumps(list(ids)) -def auth_check(content): +def merge_data(ids, user_ratings, global_info, global_rating): """ - Parse films page to check authorization - Args: - content: raw html + Merge all data into one """ - access_error = """ - Ratings for this user cannot be accessed. - Either auth cookie is incorrect or this user is not your friend + all_data = tuple(_movie_id_key(el) for el in (user_ratings, global_info, global_rating)) + merged = ({**all_data[0][id], **all_data[1][id], **all_data[2][id]} for id in ids) + return tuple(_rewrite_keys(entry) for entry in merged) + +def _movie_id_key(data): """ - soup = BeautifulSoup(content, 'html.parser') - no_rating_access = soup.find('div', attrs={'class': 'userVotesPage__limitedView'}) - assert not no_rating_access, access_error - return True + Reformat data into dict with movie_id as key + """ + data = (json.loads(el) for el in data) + return {entry["movie_id"]: entry for entry in data} -def extract_movie_ratings(content): +def _rewrite_keys(entry): """ - Parse films page to extract movie ratings - Args: - content: raw html + Fix keys names for data """ - soup = BeautifulSoup(content, 'html.parser') - user_data_container = soup.find('span', attrs={'data-source': 'userVotes'}) - raw_votes = tuple(json.loads(script.contents[0]) for script in user_data_container.find_all('script')) - movies = [] - for movie in raw_votes: - movie_id = movie.get('eId') - film_info_container = soup.find('div', attrs={'id': f'filmPreview_{movie_id}'}) - assert film_info_container, f'no container for {movie}' - film_data = {} - for el in film_info_container.find_all(): - for key, data_attr in ATTRS_MAPPING.items(): - try: - film_data[key] = el[data_attr] - except: - continue - for key, css_class in LISTS_MAPPING.items(): - data_container = film_info_container.find(re.compile('.*'), attrs={'class': css_class}) - try: - data = tuple(el.text for el in data_container.find_all('li')) - except: - data = tuple() - film_data[key] = data - try: - film_data['original_title'] = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__originalTitle'}).contents[0] - except: - pass - try: - film_data['pl_title'] = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__title'}).contents[0] - except: - pass - try: - film_data['link'] = 'https://www.filmweb.pl' + film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__link'})['href'] - except: - pass - timestamp = movie.get('t') - clean_movie = { - **film_data, - 'timestamp': timestamp, - 'iso_date': datetime.fromtimestamp(timestamp).isoformat(), - 'user_vote': movie.get('r'), - 'user_comment': movie.get('c'), - } - movies.append(clean_movie) - # necessary for multiprocessing pickle to work - movies = json.dumps(movies) - return movies + fixed = {new_key: entry.get(old_key) for old_key, new_key in KEY_MAPPING.items()} + if fixed.get("original_title") is None: + fixed["original_title"] = fixed["pl_title"] + path = quote_plus(f"{fixed['pl_title'].strip()}-{fixed['year']}-{fixed['movie_id']}") + fixed["url"] = f"https://www.filmweb.pl/film/{path}" + return fixed def write_data(movies, user, data_format='json'): """ """ assert movies, 'no data to write' date = datetime.now().strftime('%Y%m%d') - movies_clean = itertools.chain.from_iterable((json.loads(el) for el in movies)) - movies_clean = tuple(movies_clean) if data_format == 'all': file_formats = ('csv', 'json') else: @@ -136,14 +68,14 @@ def write_data(movies, user, data_format='json'): if 'json' in file_formats: file_name = f'{user}_filmweb_{date}.json' with open(file_name, 'w', encoding='utf-8') as out_file: - out_file.write(json.dumps(movies_clean)) + out_file.write(json.dumps(movies)) logging.info(f'{file_name} written!') if 'csv' in file_formats: file_name = f'{user}_filmweb_{date}.csv' with open(file_name, 'w', encoding='utf-8') as out_file: - writer = csv.DictWriter(out_file, fieldnames=CSV_ROWS, dialect='unix') + writer = csv.DictWriter(out_file, fieldnames=KEY_MAPPING.values(), dialect='unix') writer.writeheader() - for movie in movies_clean: + for movie in movies: writer.writerow(movie) logging.info(f'{file_name} written!') return file_name diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..118da74 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "filmweb" +version = "0.6" +description = "Export movie ratings from filmweb.pl" +authors = [ {name = "Piotr Patrzyk"}, ] +license = {text = "MIT"} +readme = "README.md" +requires-python = ">=3.10" +dynamic = ["dependencies", ] +keywords = ["filmweb", "movie", "crawler", "data"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Internet", +] + +[project.urls] +Source = "https://github.com/ppatrzyk/filmweb" + +[project.scripts] +filmweb = "filmweb.main:main" + +[build-system] +requires = ["setuptools", ] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["filmweb"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} diff --git a/requirements.txt b/requirements.txt index 40e74f5..f17f72c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -beautifulsoup4==4.10.0 -certifi==2021.10.8 -charset-normalizer==2.0.7 +beautifulsoup4==4.11.1 +certifi==2022.12.7 +charset-normalizer==3.0.1 docopt==0.6.2 -idna==3.3 -requests==2.26.0 -soupsieve==2.3 -tqdm==4.62.3 -urllib3==1.26.7 +idna==3.4 +requests==2.28.2 +soupsieve==2.3.2.post1 +tqdm==4.64.1 +urllib3==1.26.14 diff --git a/setup.py b/setup.py deleted file mode 100644 index dbd505e..0000000 --- a/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -from setuptools import setup - -with open("README.md", "r", encoding='utf-8') as fh: - long_description = fh.read() - -setup(name='filmweb', - version='0.5', - license='MIT', - description='Export movie ratings from filmweb.pl', - long_description=long_description, - long_description_content_type="text/markdown", - keywords=['filmweb', 'movie', 'crawler'], - author='Piotr Patrzyk', - url='https://github.com/ppatrzyk/filmweb-export', - packages=['filmweb'], - python_requires='>=3.6', - install_requires=[ - 'beautifulsoup4>=4.10.0', - 'docopt>=0.6.2', - 'requests>=2.26.0', - 'tqdm>=4.62.3', - ], - entry_points={ - 'console_scripts': [ - 'filmweb=filmweb.main:main', - ], - }, -) \ No newline at end of file