diff --git a/README.md b/README.md index 01bedae..593d93a 100644 --- a/README.md +++ b/README.md @@ -40,43 +40,41 @@ filmweb ### Przykład ``` -$ filmweb -f all pieca "didomi_token=(...)==" +$ filmweb -f csv -f json pieca "didomi_token=(...)==" INFO:root:Checking args... -INFO:root:Fetching list of movies... -100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:06<00:00, 6.26it/s] -INFO:root:Parsing list of movies... -100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 12.79it/s] -INFO:root:User pieca has 926 movies... -INFO:root:Fetching movie details... -INFO:root:Fetching user ratings [1/3]... -100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:39<00:00, 23.49it/s] -INFO:root:Fetching info about movies [2/3]... -100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.22it/s] -INFO:root:Fetching global rating for movies [3/3]... -100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.36it/s] -INFO:root:Writing data... -INFO:root:pieca_filmweb_20230121.json written! -INFO:root:pieca_filmweb_20230121.csv written! -$ cat pieca_filmweb_20230121.json | jq .[0] +INFO:root:Fetching list of movies [1/6]... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:07<00:00, 4.98it/s] +INFO:root:Parsing list of movies [2/6]... +100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 15.47it/s] +INFO:root:User pieca has 938 movies... 
+INFO:root:Fetching user ratings [3/6]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:34<00:00, 27.34it/s] +INFO:root:Fetching info about movies [4/6]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:33<00:00, 27.63it/s] +INFO:root:Fetching global rating for movies [5/6]... +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:35<00:00, 26.57it/s] +INFO:root:Writing data [6/6]... +INFO:root:pieca_20230523.json written! +INFO:root:pieca_20230523.csv written! +$ cat pieca_20230523.json | jq .[0] { - "timestamp": 1657484863818, - "favorite": false, - "user_rating": 8, - "global_rating": 7.36859, - "global_rating_count": 1579, - "original_title": "Tehran Taboo", - "pl_title": "Teheran tabu", - "year": 2017, - "movie_id": "786978", - "url": "https://www.filmweb.pl/film/Teheran+tabu-2017-786978" + "timestamp": 1579354599456, + "favorite": null, + "user_rating": 5, + "global_rating": 6.03865, + "global_rating_count": 414, + "original_title": "Ejdeha Vared Mishavad!", + "pl_title": "Wejście smoka!", + "year": 2016, + "movie_id": "757318", + "url": "https://www.filmweb.pl/film/Wej%C5%9Bcie+smoka%21-2016-757318", + "date": "2020-01-18" } -$ cat pieca_filmweb_20230121.csv | xsv sample 5 | xsv table -timestamp favorite user_rating global_rating global_rating_count original_title pl_title year movie_id url -1464302814850 False 4 6.91279 1743 Pupendo Pupendo 2003 103930 https://www.filmweb.pl/film/Pupendo-2003-103930 -1581177494926 False 7 6.51905 210 Dukhtar Dukhtar 2014 727743 https://www.filmweb.pl/film/Dukhtar-2014-727743 -1601716769499 False 8 7.59777 179 Shah-re ziba Piękne miasto 2004 155344 https://www.filmweb.pl/film/Pi%C4%99kne+miasto-2004-155344 -1548505975360 False 8 7.12276 1784 Geu-mul W sieci 2016 766555 
https://www.filmweb.pl/film/W+sieci-2016-766555 -1638616845248 False 5 6.59127 115166 Ida Ida 2013 546529 https://www.filmweb.pl/film/Ida-2013-546529 +$ cat pieca_20230523.csv | xsv sample 3 | xsv table +timestamp favorite user_rating global_rating global_rating_count original_title pl_title year movie_id url date +1588407481213 9 7.91448 3777 Werckmeister harmóniák Harmonie Werckmeistera 2000 117108 https://www.filmweb.pl/film/Harmonie+Werckmeistera-2000-117108 2020-05-02 +1425511804375 4 6.69102 87448 Czas surferów Czas surferów 2005 137466 https://www.filmweb.pl/film/Czas+surfer%C3%B3w-2005-137466 2015-03-05 +1496177689168 6 7.16478 619 Kukačka v temném lese Kukułka w ciemnym lesie 1984 35947 https://www.filmweb.pl/film/Kuku%C5%82ka+w+ciemnym+lesie-1984-35947 2017-05-30 ``` ### Wszystkie opcje @@ -86,12 +84,12 @@ $ filmweb -h filmweb Usage: - filmweb [--format=] [--debug] + filmweb [--format=]... [--debug] Options: -h --help Show this screen - -f --format= Output file format: json (default), csv, all (writes both) - -d --debug Debug prints Debug prints + -f --format= Output file format: json (default), csv, letterboxd + -d --debug Debug prints ``` ## Dostępne dane: @@ -102,11 +100,12 @@ year | _premiera_ global\_rating\_count | _ilość ocen filmu_ global\_rating | _ocena filmweb_ timestamp | _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_ +date | _data oceny_ (yyyy-mm-dd) user\_rating | _ocena użytkownika_ favorite | _dodany do ulubionych_ original\_title | _tytuł oryginalny_ pl\_title | _tytuł polski_ -movie\_id | _id filmu_ +movie\_id | _id filmu_ (filmweb) url | _strona filmu_ ## Znane problemy: diff --git a/filmweb/main.py b/filmweb/main.py index 0d21320..92423a6 100644 --- a/filmweb/main.py +++ b/filmweb/main.py @@ -5,7 +5,7 @@ Options: -h --help Show this screen - -f --format= Output file format: json (default), csv, all (writes both) + -f --format= Output file format: json (default), csv, letterboxd -d --debug Debug prints """
@@ -22,7 +22,7 @@ PARALLEL_PROC = multiprocessing.cpu_count() MOVIES_PER_PAGE = 25 -FORMATS = {"csv", "json"} +FORMATS = {"csv", "json", "letterboxd"} def main(): args = docopt(__doc__) @@ -43,25 +43,24 @@ def main(): pages = ceil(votes_total/MOVIES_PER_PAGE) logged_in_user = getter.auth_check(cookie) friend_query = (user != logged_in_user) - logging.info("Fetching list of movies...") + logging.info("Fetching list of movies [1/6]...") get_films_page_args = ((cookie, user, page) for page in range(1, pages+1)) raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages)) - logging.info("Parsing list of movies...") + logging.info("Parsing list of movies [2/6]...") ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages)) ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids)))) total_movies = len(ids) logging.info(f"User {user} has {total_movies} movies...") - logging.info("Fetching movie details...") - logging.info("Fetching user ratings [1/3]...") + assert total_movies, "No movies available" + logging.info("Fetching user ratings [3/6]...") get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids) user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies)) - # TODO make these 2 optional? 
- logging.info("Fetching info about movies [2/3]...") + logging.info("Fetching info about movies [4/6]...") global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies)) - logging.info("Fetching global rating for movies [3/3]...") + logging.info("Fetching global rating for movies [5/6]...") global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies)) + logging.info("Writing data [6/6]...") movies = parser.merge_data(ids, user_ratings, global_info, global_rating) - logging.info("Writing data...") parser.write_data(movies, user, formats) except Exception as e: logging.error(f"Program error: {str(e)}") diff --git a/filmweb/parser.py b/filmweb/parser.py index 2f986da..1f77ecc 100644 --- a/filmweb/parser.py +++ b/filmweb/parser.py @@ -16,7 +16,9 @@ "year": "year", "movie_id": "movie_id", "url": "url", + "date": "date", } +# TODO? country/genre info not visible in api, would need to parse htmls def extract_movie_ids(content): """ @@ -36,16 +38,16 @@ def merge_data(ids, user_ratings, global_info, global_rating): """ all_data = tuple(_movie_id_key(el) for el in (user_ratings, global_info, global_rating)) merged = ({**all_data[0][id], **all_data[1][id], **all_data[2][id]} for id in ids) - return tuple(_rewrite_keys(entry) for entry in merged) + return tuple(_fix_keys(entry) for entry in merged) def _movie_id_key(data): """ - Reformat data into dict with movie_id as key + Parse and reformat data into dict with movie_id as key """ data = (json.loads(el) for el in data) return {entry["movie_id"]: entry for entry in data} -def _rewrite_keys(entry): +def _fix_keys(entry): """ Fix keys names for data """ @@ -54,6 +56,31 @@ def _rewrite_keys(entry): fixed["original_title"] = fixed["pl_title"] path = quote_plus(f"""{fixed["pl_title"].strip()}-{fixed["year"]}-{fixed["movie_id"]}""") fixed["url"] = f"https://www.filmweb.pl/film/{path}" + fixed["date"] = 
datetime.fromtimestamp(fixed["timestamp"]/1000).strftime("%Y-%m-%d") + return fixed + +def _write_csv(file_name, field_names, movies): + """ + Helper for writing csv + """ + with open(file_name, "w", encoding="utf-8") as out_file: + writer = csv.DictWriter(out_file, fieldnames=field_names, dialect="unix") + writer.writeheader() + for movie in movies: + writer.writerow(movie) + return True + +def _letterbox_entry(entry): + """ + Format letterboxd column names + https://letterboxd.com/about/importing-data/ + """ + fixed = { + "Title": entry.get("original_title"), + "Year": entry.get("year"), + "Rating10": entry.get("user_rating"), + "WatchedDate": entry.get("date"), + } return fixed def write_data(movies, user, formats): @@ -62,16 +89,17 @@ def write_data(movies, user, formats): assert movies, "no data to write" date = datetime.now().strftime("%Y%m%d") if "json" in formats: - file_name = f"{user}_filmweb_{date}.json" + file_name = f"{user}_{date}.json" with open(file_name, "w", encoding="utf-8") as out_file: out_file.write(json.dumps(movies)) logging.info(f"{file_name} written!") if "csv" in formats: - file_name = f"{user}_filmweb_{date}.csv" - with open(file_name, "w", encoding="utf-8") as out_file: - writer = csv.DictWriter(out_file, fieldnames=KEY_MAPPING.values(), dialect="unix") - writer.writeheader() - for movie in movies: - writer.writerow(movie) + file_name = f"{user}_{date}.csv" + _write_csv(file_name, KEY_MAPPING.values(), movies) + logging.info(f"{file_name} written!") + if "letterboxd" in formats: + file_name = f"{user}_{date}_letterboxd.csv" + movies_letterbox = tuple(_letterbox_entry(entry) for entry in movies) + _write_csv(file_name, movies_letterbox[0].keys(), movies_letterbox) logging.info(f"{file_name} written!") return file_name