-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathhydrate.py
70 lines (57 loc) · 1.99 KB
/
hydrate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
#
# This script will walk through all the tweet id files and
# hydrate them with twarc. The gzip-compressed, line-oriented JSON
# files will be placed right next to each tweet id file.
#
# Note: you will need to install twarc, tqdm, and run twarc configure
# from the command line to tell it your Twitter API keys.
#
# Special thanks to GitHub users echen102, edsu and SamSamhuns for contributing to this file. This file was repurposed from another
# data repository on COVID-19 related tweets: https://github.com/echen102/COVID-19-TweetIDs
#
import gzip
import json
from tqdm import tqdm
from twarc import Twarc
from pathlib import Path
# Shared Twarc client used by hydrate(). It is constructed with no arguments;
# per the header note, credentials are expected to come from `twarc configure`.
twarc = Twarc()
# Directories (relative to the working directory) that main() scans for
# tweet id .csv files.
tweet_data_dirs = ["data/tweets/2020-10", "data/tweets/2020-11", "data/tweets/2020-12"]
def main():
    """
    Hydrate every tweet id CSV found in the configured data directories.

    Each directory listed in ``tweet_data_dirs`` is scanned (top level only)
    and every entry whose name ends in ".csv" is passed to ``hydrate``.
    """
    for data_dir in tweet_data_dirs:
        # Lazily select the CSV entries of this directory.
        csv_files = (
            entry
            for entry in Path(data_dir).iterdir()
            if entry.name.endswith(".csv")
        )
        for csv_file in csv_files:
            hydrate(csv_file)
def line_count(filepath):
    """
    Count the number of lines in a text file.

    The file is streamed line by line instead of being loaded with
    ``readlines()``, so memory use stays flat even for very large id files.

    Args:
        filepath: Path (str or path-like) of the file to read.

    Returns:
        int: Number of lines in the file. An empty file gives 0, and a final
        line without a trailing newline still counts as one line.
    """
    with open(filepath, "r") as f:
        # Iterating a file object yields one line at a time.
        return sum(1 for _ in f)
def extract_tweet_ids(filepath):
    """
    Yield the tweet ids stored in the first column of a CSV file.

    The first line of the file is treated as a header and skipped. Every
    following line contributes the text found before its first comma (or the
    whole line when it has no comma).

    Args:
        filepath: Path (str or path-like) of the CSV file to read.

    Yields:
        str: One tweet id per non-blank data line, with surrounding
        whitespace (including the line terminator) removed.
    """
    with open(filepath, "r") as f:
        # skip header
        f.readline()
        # Stream the remaining lines instead of loading them all at once.
        for line in f:
            # partition() returns the whole line when there is no comma.
            # Slicing with find() would use -1 as the end index and silently
            # drop the last character, truncating the id on a final line
            # that has no trailing newline.
            tweet_id = line.partition(",")[0].strip()
            # Blank lines carry no id; do not emit empty strings.
            if tweet_id:
                yield tweet_id
def hydrate(filepath):
    """
    Hydrate the tweet ids of one CSV file into a gzipped JSON-lines file.

    The output is written next to the input, with the input's suffix replaced
    by ".jsonl.gz". If that output file already exists the input is skipped.

    Tweets are first written to a temporary "<output>.part" file which is
    renamed to the final name only after hydration finishes. An interrupted
    run therefore never leaves a truncated ".jsonl.gz" behind that a later
    run would mistake for a complete result; a stale ".part" file is simply
    overwritten on the next attempt.

    Args:
        filepath: ``pathlib.Path`` of the tweet id CSV file (first line is a
            header, first column holds the ids).

    Returns:
        None
    """
    print("Hydrating {}".format(filepath))
    hydrated_path = filepath.with_suffix(".jsonl.gz")
    if hydrated_path.is_file():
        print("skipping json file already exists: {}".format(hydrated_path))
        return
    print(filepath.name)
    # Subtract header from line count; clamp so an empty file does not
    # produce a negative progress-bar total.
    num_ids = max(line_count(filepath) - 1, 0)
    print("Hydrating {} tweets".format(num_ids))
    # Write to a sibling temp file so the final name only ever refers to a
    # fully written archive.
    partial_path = hydrated_path.with_name(hydrated_path.name + ".part")
    with gzip.open(partial_path, "wb") as output:
        with tqdm(total=num_ids) as pbar:
            for tweet in twarc.hydrate(extract_tweet_ids(filepath)):
                # One JSON document per line, UTF-8 encoded.
                output.write(json.dumps(tweet).encode("utf8") + b"\n")
                pbar.update(1)
    # Reached only when every tweet was written and the gzip stream closed.
    partial_path.replace(hydrated_path)
# Run the hydration only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()