Merge pull request #9 from ppatrzyk/fix23

Zmiana działania strony filmweb
ppatrzyk · Jan 21, 2023 · 183a67c
2 parents 4af574d + f889253
commit 183a67c
Show file tree

Hide file tree

Showing 8 changed files with 220 additions and 196 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 **/__pycache__
 build
 dist
-filmweb.egg-info
+**/filmweb.egg-info
+filmwebvenv
 .vscode
diff --git a/README.md b/README.md
@@ -40,19 +40,43 @@ filmweb <username> <cookie>
 ### Przykład
 
 ```
-$ filmweb -f csv pieca "canProfile=true_...tcKeywords="
-INFO:root:Fetching data...
-100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00,  5.13it/s]
-INFO:root:Parsing data...
-100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00,  4.52it/s]
-INFO:root:pieca_filmweb_20201031.csv written!
-$ head -6 pieca_filmweb_20201031.csv
-"timestamp","iso_date","user_comment","user_vote","global_rating","global_votes","original_title","pl_title","directors","countries","genres","link","duration_min","year"
-"1580143639","2020-01-27T17:47:19","","3","7.2103400230407715","73632","What We Do in the Shadows","Co robimy w ukryciu","['Jemaine Clement']","['Nowa Zelandia', 'USA']","['Horror', 'Komedia', 'Dokumentalizowany']","https://www.filmweb.pl/film/Co+robimy+w+ukryciu-2014-707286","86","2015-02-27"
-"1580143596","2020-01-27T17:46:36","","1","7.762599945068359","76768","","Jojo Rabbit","['Taika Waititi']","['Czechy', 'Niemcy', 'Nowa Zelandia', 'USA']","['Dramat', 'Komedia', 'Wojenny']","https://www.filmweb.pl/film/Jojo+Rabbit-2019-817417","108","2020-01-24"
-"1580033558","2020-01-26T11:12:38","","6","6.284679889678955","966","Quick","Seryjny morderca","['Mikael Håfström']","['Szwecja']","['Thriller']","https://www.filmweb.pl/film/Seryjny+morderca-2019-832513","132","2020-09-03"
-"1579429860","2020-01-19T11:31:00","","7","6.661180019378662","425","","Difret","['Zeresenay Mehari']","['USA', 'Etiopia']","['Dramat']","https://www.filmweb.pl/film/Difret-2014-700409","99","2015-03-27"
-"1579354699","2020-01-18T14:38:19","","5","7.180500030517578","4471","Dylda","Wysoka dziewczyna","['Kantemir Balagov']","['Rosja']","['Dramat']","https://www.filmweb.pl/film/Wysoka+dziewczyna-2019-829460","130","2019-10-11"
+$ filmweb -f all pieca "didomi_token=(...)=="
+INFO:root:Checking args...
+INFO:root:Fetching list of movies...
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:06<00:00,  6.26it/s]
+INFO:root:Parsing list of movies...
+100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 12.79it/s]
+INFO:root:User pieca has 926 movies...
+INFO:root:Fetching movie details...
+INFO:root:Fetching user ratings [1/3]...
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:39<00:00, 23.49it/s]
+INFO:root:Fetching info about movies [2/3]...
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.22it/s]
+INFO:root:Fetching global rating for movies [3/3]...
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.36it/s]
+INFO:root:Writing data...
+INFO:root:pieca_filmweb_20230121.json written!
+INFO:root:pieca_filmweb_20230121.csv written!
+$ cat pieca_filmweb_20230121.json | jq .[0]
+{
+  "timestamp": 1657484863818,
+  "favorite": false,
+  "user_rating": 8,
+  "global_rating": 7.36859,
+  "global_rating_count": 1579,
+  "original_title": "Tehran Taboo",
+  "pl_title": "Teheran tabu",
+  "year": 2017,
+  "movie_id": "786978",
+  "url": "https://www.filmweb.pl/film/Teheran+tabu-2017-786978"
+}
+$ cat pieca_filmweb_20230121.csv | xsv sample 5 | xsv table
+timestamp      favorite  user_rating  global_rating  global_rating_count  original_title  pl_title       year  movie_id  url
+1464302814850  False     4            6.91279        1743                 Pupendo         Pupendo        2003  103930    https://www.filmweb.pl/film/Pupendo-2003-103930
+1581177494926  False     7            6.51905        210                  Dukhtar         Dukhtar        2014  727743    https://www.filmweb.pl/film/Dukhtar-2014-727743
+1601716769499  False     8            7.59777        179                  Shah-re ziba    Piękne miasto  2004  155344    https://www.filmweb.pl/film/Pi%C4%99kne+miasto-2004-155344
+1548505975360  False     8            7.12276        1784                 Geu-mul         W sieci        2016  766555    https://www.filmweb.pl/film/W+sieci-2016-766555
+1638616845248  False     5            6.59127        115166               Ida             Ida            2013  546529    https://www.filmweb.pl/film/Ida-2013-546529
 ```
 
 ### Wszystkie opcje
@@ -72,20 +96,18 @@ Options:
 
 ## Dostępne dane:
 
-- duration_min: _długość w min_
-- year: _premiera_
-- global_votes: _ilość ocen filmu_
-- global_rating: _ocena filmweb_
-- directors: _reżyserzy (lista)_
-- countries: _kraje (lista)_
-- genres: _gatunki (lista)_
-- timestamp: _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_
-- iso_date: _[czas oceny (ISO)](https://pl.wikipedia.org/wiki/ISO_8601)_
-- user_vote: _ocena użytkownika_
-- user_comment: _komentarz użytkownika_
-- original_title: _tytuł oryginalny_
-- pl_title: _tytuł polski_
-- link: _strona filmu_
+Kolumna | Opis
+--- | ---
+year | _premiera_
+global\_rating\_count | _liczba ocen filmu_
+global\_rating | _ocena filmweb_
+timestamp | _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_
+user\_rating | _ocena użytkownika_
+favorite | _dodany do ulubionych_
+original\_title | _tytuł oryginalny_
+pl\_title | _tytuł polski_
+movie\_id | _id filmu_
+url | _strona filmu_
 
 ## Znane problemy:
 

diff --git a/filmweb/getter.py b/filmweb/getter.py
@@ -1,7 +1,10 @@
+import json
 import requests
 
 HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
+    # https://www.whatismybrowser.com/guides/the-latest-user-agent/firefox
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13.1; rv:109.0) Gecko/20100101 Firefox/109.0',
+    'x-locale': 'pl_PL',
     'Host': 'www.filmweb.pl',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
     'Accept-Language': 'en-US,en;q=0.5',
@@ -24,16 +27,65 @@ def get_films_page(args):
     response.raise_for_status()
     return response.text
 
-def get_profile_page(user):
+def auth_check(cookie):
     """
-    Parse films page to extract total count of votes
+    Check if auth is OK (valid cookie after login)
+    """
+    url = "https://www.filmweb.pl/api/v1/logged/info"
+    content = _get_json(url, cookie, "auth_check")
+    user = content["name"]
+    return user
+
+def get_votes_count(user):
+    """
+    Get total count of votes
     Args:
         user: user to get ratings for
     """
-    url = f'https://www.filmweb.pl/user/{user}'
-    response = requests.get(url, headers=HEADERS)
+    url = f'https://www.filmweb.pl/api/v1/user/{user}/votes/film/count'
+    return _get_json(url, "", "get_votes_count")
+
+def get_user_rating(args):
+    """
+    Gets user rating
+    """
+    (cookie, movie_id, user, friend_query) = args
+    if friend_query:
+        url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/film/{movie_id}/details"
+    else:
+        url = f"https://www.filmweb.pl/api/v1/logged/vote/film/{movie_id}/details"
+    data = _get_json(url, cookie, "get_user_rating")
+    data["movie_id"] = movie_id
+    return json.dumps(data)
+
+def get_global_info(movie_id):
+    """
+    Get info about a movie (title etc)
+    """
+    url = f"https://www.filmweb.pl/api/v1/title/{movie_id}/info"
+    data = _get_json(url, "", "get_global_info")
+    data["movie_id"] = movie_id
+    return json.dumps(data)
+
+def get_global_rating(movie_id):
+    """
+    Get global rating for a movie
+    """
+    url = f"https://www.filmweb.pl/api/v1/film/{movie_id}/rating"
+    data = _get_json(url, "", "get_global_rating")
+    data["movie_id"] = movie_id
+    data["global_rate"] = data.pop("rate")
+    return json.dumps(data)
+
+def _get_json(url, cookie, func_name):
+    """
+    Wrapper for request and unified error
+    """
     try:
+        response = requests.get(url, headers={'Cookie': cookie, **HEADERS})
         response.raise_for_status()
+        content = response.json()
     except Exception as e:
-        raise ValueError(f'No user {user} found: {str(e)}')
-    return response.text
+        raise ValueError(f'Failure in {func_name}: {str(e)}')
+    else:
+        return content
diff --git a/filmweb/main.py b/filmweb/main.py
@@ -10,22 +10,18 @@
 """
 
 from docopt import docopt
+import itertools
+import json
 import re
 import logging
+from math import ceil
 import multiprocessing
 import tqdm
-from .getter import (
-    get_films_page,
-    get_profile_page,
-)
-from .parser import (
-    auth_check,
-    extract_movie_ratings,
-    get_pages_count,
-    write_data,
-)
+from . import getter
+from . import parser
 
 PARALLEL_PROC = multiprocessing.cpu_count()
+MOVIES_PER_PAGE = 25
 
 def main():
     args = docopt(__doc__)
@@ -42,14 +38,30 @@ def main():
     try:
         logging.info('Checking args...')
         cookie = re.sub('Cookie:', '', cookie).strip()
-        pages = get_pages_count(get_profile_page(user))
-        auth_check(get_films_page((cookie, user, 1)))
-        logging.info('Fetching data...')
+        votes_total = getter.get_votes_count(user)
+        pages = ceil(votes_total/MOVIES_PER_PAGE)
+        logged_in_user = getter.auth_check(cookie)
+        friend_query = (user != logged_in_user)
+        logging.info('Fetching list of movies...')
         get_films_page_args = ((cookie, user, page) for page in range(1, pages+1))
-        raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(get_films_page, get_films_page_args), total=pages))
-        logging.info('Parsing data...')
-        movies = tuple(tqdm.tqdm(pool.imap_unordered(extract_movie_ratings, raw_responses), total=pages))
-        write_data(movies, user, file_format)
+        raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
+        logging.info('Parsing list of movies...')
+        ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages))
+        ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids))))
+        total_movies = len(ids)
+        logging.info(f'User {user} has {total_movies} movies...')
+        logging.info('Fetching movie details...')
+        logging.info('Fetching user ratings [1/3]...')
+        get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids)
+        user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies))
+        # TODO make these 2 optional?
+        logging.info('Fetching info about movies [2/3]...')
+        global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies))
+        logging.info('Fetching global rating for movies [3/3]...')
+        global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies))
+        movies = parser.merge_data(ids, user_ratings, global_info, global_rating)
+        logging.info('Writing data...')
+        parser.write_data(movies, user, file_format)
     except Exception as e:
         logging.error(f'Program error: {str(e)}')
     finally: