Skip to content

Commit

Permalink
refactor; letterboxd format
Browse files Browse the repository at this point in the history
  • Loading branch information
ppatrzyk committed May 23, 2023
1 parent cd1207c commit 832dd64
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 58 deletions.
75 changes: 37 additions & 38 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,43 +40,41 @@ filmweb <username> <cookie>
### Przykład

```
$ filmweb -f all pieca "didomi_token=(...)=="
$ filmweb -f csv -f json pieca "didomi_token=(...)=="
INFO:root:Checking args...
INFO:root:Fetching list of movies...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:06<00:00, 6.26it/s]
INFO:root:Parsing list of movies...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 12.79it/s]
INFO:root:User pieca has 926 movies...
INFO:root:Fetching movie details...
INFO:root:Fetching user ratings [1/3]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:39<00:00, 23.49it/s]
INFO:root:Fetching info about movies [2/3]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.22it/s]
INFO:root:Fetching global rating for movies [3/3]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 926/926 [00:43<00:00, 21.36it/s]
INFO:root:Writing data...
INFO:root:pieca_filmweb_20230121.json written!
INFO:root:pieca_filmweb_20230121.csv written!
$ cat pieca_filmweb_20230121.json | jq .[0]
INFO:root:Fetching list of movies [1/6]...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:07<00:00, 4.98it/s]
INFO:root:Parsing list of movies [2/6]...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:02<00:00, 15.47it/s]
INFO:root:User pieca has 938 movies...
INFO:root:Fetching user ratings [3/6]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:34<00:00, 27.34it/s]
INFO:root:Fetching info about movies [4/6]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:33<00:00, 27.63it/s]
INFO:root:Fetching global rating for movies [5/6]...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 938/938 [00:35<00:00, 26.57it/s]
INFO:root:Writing data [6/6]...
INFO:root:pieca_20230523.json written!
INFO:root:pieca_20230523.csv written!
$ cat pieca_20230523.json | jq .[0]
{
"timestamp": 1657484863818,
"favorite": false,
"user_rating": 8,
"global_rating": 7.36859,
"global_rating_count": 1579,
"original_title": "Tehran Taboo",
"pl_title": "Teheran tabu",
"year": 2017,
"movie_id": "786978",
"url": "https://www.filmweb.pl/film/Teheran+tabu-2017-786978"
"timestamp": 1579354599456,
"favorite": null,
"user_rating": 5,
"global_rating": 6.03865,
"global_rating_count": 414,
"original_title": "Ejdeha Vared Mishavad!",
"pl_title": "Wejście smoka!",
"year": 2016,
"movie_id": "757318",
"url": "https://www.filmweb.pl/film/Wej%C5%9Bcie+smoka%21-2016-757318",
"date": "2020-01-18"
}
$ cat pieca_filmweb_20230121.csv | xsv sample 5 | xsv table
timestamp favorite user_rating global_rating global_rating_count original_title pl_title year movie_id url
1464302814850 False 4 6.91279 1743 Pupendo Pupendo 2003 103930 https://www.filmweb.pl/film/Pupendo-2003-103930
1581177494926 False 7 6.51905 210 Dukhtar Dukhtar 2014 727743 https://www.filmweb.pl/film/Dukhtar-2014-727743
1601716769499 False 8 7.59777 179 Shah-re ziba Piękne miasto 2004 155344 https://www.filmweb.pl/film/Pi%C4%99kne+miasto-2004-155344
1548505975360 False 8 7.12276 1784 Geu-mul W sieci 2016 766555 https://www.filmweb.pl/film/W+sieci-2016-766555
1638616845248 False 5 6.59127 115166 Ida Ida 2013 546529 https://www.filmweb.pl/film/Ida-2013-546529
$ cat pieca_20230523.csv | xsv sample 3 | xsv table
timestamp favorite user_rating global_rating global_rating_count original_title pl_title year movie_id url date
1588407481213 9 7.91448 3777 Werckmeister harmóniák Harmonie Werckmeistera 2000 117108 https://www.filmweb.pl/film/Harmonie+Werckmeistera-2000-117108 2020-05-02
1425511804375 4 6.69102 87448 Czas surferów Czas surferów 2005 137466 https://www.filmweb.pl/film/Czas+surfer%C3%B3w-2005-137466 2015-03-05
1496177689168 6 7.16478 619 Kukačka v temném lese Kukułka w ciemnym lesie 1984 35947 https://www.filmweb.pl/film/Kuku%C5%82ka+w+ciemnym+lesie-1984-35947 2017-05-30
```

### Wszystkie opcje
Expand All @@ -86,12 +84,12 @@ $ filmweb -h
filmweb
Usage:
filmweb [--format=<fileformat>] [--debug] <username> <cookie>
filmweb [--format=<fileformat>]... [--debug] <username> <cookie>
Options:
-h --help Show this screen
-f --format=<fileformat> Output file format: json (default), csv, all (writes both)
-d --debug Debug prints Debug prints
-f --format=<fileformat> Output file format: json (default), csv, letterboxd
-d --debug Debug prints
```

## Dostępne dane:
Expand All @@ -102,11 +100,12 @@ year | _premiera_
global\_rating\_count | _ilość ocen filmu_
global\_rating | _ocena filmweb_
timestamp | _[czas oceny (unix)](https://pl.wikipedia.org/wiki/Czas_uniksowy)_
date | _data oceny_ (yyyy-mm-dd)
user\_rating | _ocena użytkownika_
favorite | _dodany do ulubionych_
original\_title | _tytuł oryginalny_
pl\_title | _tytuł polski_
movie\_id | _id filmu_
movie\_id | _id filmu_ (filmweb)
url | _strona filmu_

## Znane problemy:
Expand Down
19 changes: 9 additions & 10 deletions filmweb/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Options:
-h --help Show this screen
-f --format=<fileformat> Output file format: json (default), csv, all (writes both)
-f --format=<fileformat> Output file format: json (default), csv, letterboxd
-d --debug Debug prints
"""

Expand All @@ -22,7 +22,7 @@

PARALLEL_PROC = multiprocessing.cpu_count()
MOVIES_PER_PAGE = 25
FORMATS = {"csv", "json"}
FORMATS = {"csv", "json", "letterboxd"}

def main():
args = docopt(__doc__)
Expand All @@ -43,25 +43,24 @@ def main():
pages = ceil(votes_total/MOVIES_PER_PAGE)
logged_in_user = getter.auth_check(cookie)
friend_query = (user != logged_in_user)
logging.info("Fetching list of movies...")
logging.info("Fetching list of movies [1/6]...")
get_films_page_args = ((cookie, user, page) for page in range(1, pages+1))
raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
logging.info("Parsing list of movies...")
logging.info("Parsing list of movies [2/6]...")
ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages))
ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids))))
total_movies = len(ids)
logging.info(f"User {user} has {total_movies} movies...")
logging.info("Fetching movie details...")
logging.info("Fetching user ratings [1/3]...")
assert total_movies, "No movies available"
logging.info("Fetching user ratings [3/6]...")
get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids)
user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies))
# TODO make these 2 optional?
logging.info("Fetching info about movies [2/3]...")
logging.info("Fetching info about movies [4/6]...")
global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies))
logging.info("Fetching global rating for movies [3/3]...")
logging.info("Fetching global rating for movies [5/6]...")
global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies))
logging.info("Writing data [6/6]...")
movies = parser.merge_data(ids, user_ratings, global_info, global_rating)
logging.info("Writing data...")
parser.write_data(movies, user, formats)
except Exception as e:
logging.error(f"Program error: {str(e)}")
Expand Down
48 changes: 38 additions & 10 deletions filmweb/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
"year": "year",
"movie_id": "movie_id",
"url": "url",
"date": "date",
}
# TODO? country/genre info not visible in api, would need to parse htmls

def extract_movie_ids(content):
"""
Expand All @@ -36,16 +38,16 @@ def merge_data(ids, user_ratings, global_info, global_rating):
"""
all_data = tuple(_movie_id_key(el) for el in (user_ratings, global_info, global_rating))
merged = ({**all_data[0][id], **all_data[1][id], **all_data[2][id]} for id in ids)
return tuple(_rewrite_keys(entry) for entry in merged)
return tuple(_fix_keys(entry) for entry in merged)

def _movie_id_key(data):
    """
    Decode each JSON string in data and index the resulting dicts by their movie_id
    """
    by_movie_id = {}
    for raw in data:
        parsed = json.loads(raw)
        # a later entry with the same movie_id replaces the earlier one
        by_movie_id[parsed["movie_id"]] = parsed
    return by_movie_id

def _rewrite_keys(entry):
def _fix_keys(entry):
"""
Fix keys names for data
"""
Expand All @@ -54,6 +56,31 @@ def _rewrite_keys(entry):
fixed["original_title"] = fixed["pl_title"]
path = quote_plus(f"""{fixed["pl_title"].strip()}-{fixed["year"]}-{fixed["movie_id"]}""")
fixed["url"] = f"https://www.filmweb.pl/film/{path}"
fixed["date"] = datetime.fromtimestamp(fixed["timestamp"]/1000).strftime("%Y-%m-%d")
return fixed

def _write_csv(file_name, field_names, movies):
    """
    Helper for writing csv

    Writes a header row followed by one row per movie, using the csv "unix" dialect.

    Args:
        file_name: path of the output file (overwritten if it exists)
        field_names: column names, in output order
        movies: iterable of dicts whose keys are among field_names

    Returns:
        True once the header and all rows have been written
    """
    # newline="" leaves line endings to the csv writer, as the csv docs require;
    # otherwise "\n" (including newlines inside quoted fields) may be translated
    # to the platform's native line ending
    with open(file_name, "w", encoding="utf-8", newline="") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=field_names, dialect="unix")
        writer.writeheader()
        writer.writerows(movies)
    return True

def _letterbox_entry(entry):
    """
    Build a row keyed by letterboxd column names from a movie entry
    https://letterboxd.com/about/importing-data/
    """
    # (letterboxd column, key in entry); keys absent from entry yield None
    columns = (
        ("Title", "original_title"),
        ("Year", "year"),
        ("Rating10", "user_rating"),
        ("WatchedDate", "date"),
    )
    return {column: entry.get(key) for column, key in columns}

def write_data(movies, user, formats):
Expand All @@ -62,16 +89,17 @@ def write_data(movies, user, formats):
assert movies, "no data to write"
date = datetime.now().strftime("%Y%m%d")
if "json" in formats:
file_name = f"{user}_filmweb_{date}.json"
file_name = f"{user}_{date}.json"
with open(file_name, "w", encoding="utf-8") as out_file:
out_file.write(json.dumps(movies))
logging.info(f"{file_name} written!")
if "csv" in formats:
file_name = f"{user}_filmweb_{date}.csv"
with open(file_name, "w", encoding="utf-8") as out_file:
writer = csv.DictWriter(out_file, fieldnames=KEY_MAPPING.values(), dialect="unix")
writer.writeheader()
for movie in movies:
writer.writerow(movie)
file_name = f"{user}_{date}.csv"
_write_csv(file_name, KEY_MAPPING.values(), movies)
logging.info(f"{file_name} written!")
if "letterboxd" in formats:
file_name = f"{user}_{date}_letterboxd.csv"
movies_letterbox = tuple(_letterbox_entry(entry) for entry in movies)
_write_csv(file_name, movies_letterbox[0].keys(), movies_letterbox)
logging.info(f"{file_name} written!")
return file_name

0 comments on commit 832dd64

Please sign in to comment.