diff --git a/.gitignore b/.gitignore index 0b66ceb..c41b3fc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build dist filmweb.egg-info +.vscode diff --git a/README.md b/README.md index afe6ea2..8239637 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,14 @@ INFO:root:Fetching data... 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00, 5.13it/s] INFO:root:Parsing data... 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00, 4.52it/s] -INFO:root:pieca_filmweb_20201003.csv written! -$ head -3 pieca_filmweb_20201003.csv -"duration_min","year","global_votes","global_rating","directors","countries","genres","timestamp","iso_date","user_vote","original_title","pl_title","link" -"105","2013-03-15","19006","7.108230113983154","['Sławomir Fabicki']","['Polska']","['Dramat obyczajowy']","1594412058","2020-07-10T22:14:18","9","","Miłość","https://www.filmweb.pl/film/Mi%C5%82o%C5%9B%C4%87-2012-631551" -"113","2017-09-22","5418","5.435029983520508","['Krzysztof Krauze']","['Polska']","['Dramat społeczny']","1594304693","2020-07-09T16:24:53","7","","Ptaki śpiewają w Kigali","https://www.filmweb.pl/film/Ptaki+%C5%9Bpiewaj%C4%85+w+Kigali-2017-595615" +INFO:root:pieca_filmweb_20201030.csv written! +$ head -6 pieca_filmweb_20201030.csv +"timestamp","iso_date","user_comment","user_vote","global_rating","global_votes","original_title","pl_title","directors","countries","genres","link","duration_min","year" +"1570914666","2019-10-12T23:11:06","","7","6.438159942626953","3590","Play","Gra","['Ruben Östlund']","['Dania', 'Francja', 'Szwecja']","['Dramat', 'Akcja']","https://www.filmweb.pl/film/Gra-2011-508918","118","2011-11-11" +"1570914495","2019-10-12T23:08:15","","4","7.019690036773682","14935","Kraftidioten","Obywatel roku","['Hans Petter Moland']","['Norwegia', 'Szwecja']","['Komedia kryminalna']","https://www.filmweb.pl/film/Obywatel+roku-2014-684846","116","2014-05-16" +"1588403409","2020-05-02T09:10:09","","8","6.9715399742126465","773","Slava","Sława","['Kristina Grozeva']","['Grecja', 'Bułgaria']","['Dramat']","https://www.filmweb.pl/film/S%C5%82awa-2016-769511","101","2017-08-25" +"1570477126","2019-10-07T21:38:46","","5","6.0","4","","Důvěrný nepřítel","[]","['Czechy', 'Słowacja']","['Thriller']","https://www.filmweb.pl/film/D%C5%AFv%C4%9Brn%C3%BD+nep%C5%99%C3%ADtel-2018-819208","","2018-08-16" +"1570272939","2019-10-05T12:55:39","","6","6.264530181884766","5557","","Attenberg","['Athina Rachel Tsangari']","['Grecja']","['Dramat']","https://www.filmweb.pl/film/Attenberg-2010-591326","95","2011-11-25" ``` lub ocen innego użytkownika (musi być znajomym logującego się): @@ -72,11 +75,12 @@ Options: - timestamp: _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_ - iso_date: _[czas oceny (ISO)](https://pl.wikipedia.org/wiki/ISO_8601)_ - user_vote: _ocena użytkownika_ +- user_comment: _komentarz użytkownika_ - original_title: _tytuł oryginalny_ - pl_title: _tytuł polski_ - link: _strona filmu_ -## Znane ograniczenia: +## Znane problemy: - Logowanie tylko kontem filmweb, - Eksport tylko ocen filmów, inne (np. seriale) niedostępne, diff --git a/filmweb/main.py b/filmweb/main.py index 780a330..bc981e7 100644 --- a/filmweb/main.py +++ b/filmweb/main.py @@ -15,7 +15,7 @@ from math import ceil from copy import deepcopy import requests -from multiprocessing import Pool +import multiprocessing import tqdm from .utils import ( get_movie_ratings, @@ -27,7 +27,7 @@ write_data, ) -PARALLEL_PROC = 4 +PARALLEL_PROC = multiprocessing.cpu_count() MOVIES_PER_PAGE = 25 def main(): @@ -42,7 +42,7 @@ def main(): else: logging.basicConfig(level=logging.INFO) session = requests.session() - pool = Pool(processes=PARALLEL_PROC) + pool = multiprocessing.Pool(processes=PARALLEL_PROC) try: login(session, user, password) get_vote_count_kwargs = { diff --git a/filmweb/utils.py b/filmweb/utils.py index 6e26bd5..ca00f57 100644 --- a/filmweb/utils.py +++ b/filmweb/utils.py @@ -34,6 +34,22 @@ 'countries': 'filmPreview__info--countries', 'genres': 'filmPreview__info--genres', } +CSV_ROWS = ( + 'timestamp', + 'iso_date', + 'user_comment', + 'user_vote', + 'global_rating', + 'global_votes', + 'original_title', + 'pl_title', + 'directors', + 'countries', + 'genres', + 'link', + 'duration_min', + 'year', +) def login(session, user, password): """ @@ -155,23 +171,24 @@ def get_movie_ratings(content): data = tuple() film_data[key] = data try: - original_title = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__originalTitle'}).contents[0] + film_data['original_title'] = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__originalTitle'}).contents[0] except: - original_title = None + pass try: - pl_title = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__title'}).contents[0] + film_data['pl_title'] = film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__title'}).contents[0] except: - pl_title = None - link = 'https://www.filmweb.pl' + film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__link'})['href'] + pass + try: + film_data['link'] = 'https://www.filmweb.pl' + film_info_container.find(re.compile('.*'), attrs={'class': 'filmPreview__link'})['href'] + except: + pass timestamp = movie.get('t') clean_movie = { **film_data, 'timestamp': timestamp, 'iso_date': datetime.fromtimestamp(timestamp).isoformat(), 'user_vote': movie.get('r'), - 'original_title': original_title, - 'pl_title': pl_title, - 'link': link, + 'user_comment': movie.get('c'), } movies.append(clean_movie) # necessary for multiprocessing pickle to work @@ -191,9 +208,8 @@ def write_data(movies, user, data_format='json'): out_file.write(json.dumps(movies_clean)) elif data_format == 'csv': file_name = f'{user}_filmweb_{date}.csv' - field_names = tuple(movies_clean[0].keys()) with open(file_name, 'w') as out_file: - writer = csv.DictWriter(out_file, fieldnames=field_names, dialect='unix') + writer = csv.DictWriter(out_file, fieldnames=CSV_ROWS, dialect='unix') writer.writeheader() for movie in movies_clean: writer.writerow(movie) diff --git a/setup.py b/setup.py index fd161fe..177c5f2 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = fh.read() setup(name='filmweb', - version='0.2', + version='0.3', license='MIT', description='Export movie ratings from filmweb.pl', long_description=long_description, @@ -13,7 +13,7 @@ author='Piotr Patrzyk', url='https://github.com/ppatrzyk/filmweb-export', packages=['filmweb'], - python_requires='>=3.7', + python_requires='>=3.6', install_requires=[ 'beautifulsoup4>=4.9.1', 'docopt>=0.6.2',