-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit_fetcher.py
88 lines (72 loc) · 3.88 KB
/
reddit_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import requests
import json
import emoji
from time import sleep
class RedditStoryFetcher:
    """Collect short text posts from the monthly top listing of subreddits.

    Subreddit names are read from ``subreddits_file`` (one per line). Posts
    whose cleaned title and body fit the length limits below are accumulated
    in ``self.stories``; ``save_stories`` then drops low-ratio and
    already-known posts and writes the remainder as JSON.

    Args:
        subreddits_file: Path to a text file with one subreddit name per line.
        existing_stories_file: Path to a JSON file holding a list of objects
            that each have an ``"id"`` key; posts with those ids are skipped
            when saving.
        max_num_tries: How many times each subreddit download is attempted.
        min_upvote_ratio: Minimum ``upvote_ratio`` a post needs to be saved.
    """

    # Length limits, in characters, applied to the cleaned text.
    MAX_TEXT_LENGTH = 1450  # keeps the narrated story under ~90 seconds
    MIN_TEXT_LENGTH = 20  # exclusive bound; filters out caption-like posts
    MAX_TITLE_LENGTH = 800  # keeps the narrated title under a minute
    # Seconds to wait for Reddit before counting the attempt as failed.
    REQUEST_TIMEOUT = 10
    # Single-character substitutions applied by process_text in one pass.
    _CHAR_REPLACEMENTS = str.maketrans({
        "\n": " ",
        "\u2019": "'",
        "\u201c": " ",
        "\u201d": " ",
    })

    def __init__(self, subreddits_file: str, existing_stories_file: str,
                 max_num_tries: int = 5, min_upvote_ratio: float = 0.9):
        self.subreddits_file = subreddits_file
        self.existing_stories_file = existing_stories_file
        self.max_num_tries = max_num_tries
        self.min_upvote_ratio = min_upvote_ratio
        self.stories = []

    @staticmethod
    def replace_emojis(text, replacement=""):
        """Return ``text`` with every emoji replaced by ``replacement``."""
        return emoji.replace_emoji(text, replacement)

    @staticmethod
    def process_text(text):
        """Normalize a title or body for narration.

        Newlines and curly double quotes become spaces, the curly apostrophe
        becomes ``'``, ``AITA`` is spelled out, and emojis are removed.
        """
        cleaned = text.translate(RedditStoryFetcher._CHAR_REPLACEMENTS)
        cleaned = cleaned.replace("AITA", "Am I the asshole")
        return RedditStoryFetcher.replace_emojis(cleaned)

    def _read_subreddits(self):
        """Return the non-blank, stripped lines of the subreddits file."""
        with open(self.subreddits_file, "r", encoding="utf-8") as f:
            # Blank lines (e.g. from a trailing newline) would otherwise be
            # requested as the empty subreddit "r/".
            names = [line.strip() for line in f if line.strip()]
        if not names:
            raise ValueError(
                f"No subreddits listed in {self.subreddits_file!r}"
            )
        return names

    def _download_top_posts(self, subreddit):
        """Return the raw post entries of the subreddit's monthly top listing.

        Retries up to ``max_num_tries`` times, pausing one second between
        attempts. Raises ``SystemExit`` when every attempt fails.
        """
        url = f"https://www.reddit.com/r/{subreddit}/top.json"
        for attempt in range(1, self.max_num_tries + 1):
            try:
                response = requests.get(
                    url,
                    # Passed as params so "t" and "limit" are joined with "&".
                    params={"t": "month", "limit": 99},
                    headers={'User-agent': 'Mozilla/5.0'},
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                failure = exc
            else:
                if response.status_code == 200:
                    print(f"Successfully downloaded stories from r/{subreddit}")
                    return response.json()["data"]["children"]
                failure = response.status_code
            if attempt < self.max_num_tries:
                print(f"Error downloading stories from r/{subreddit}: {failure}. Trying again...")
                sleep(1)
            else:
                print(f"Error downloading stories from r/{subreddit}: {failure}.")
        print(f"Failed to download stories from r/{subreddit} after {self.max_num_tries} tries.")
        print("Exiting to prevent ratelimits...")
        # Non-zero status so callers and shells can tell the run failed.
        raise SystemExit(1)

    def _fits_limits(self, title, text):
        """Return True when a cleaned title/body pair is within the limits."""
        return (
            bool(title)
            and len(title) <= self.MAX_TITLE_LENGTH
            and self.MIN_TEXT_LENGTH < len(text) <= self.MAX_TEXT_LENGTH
        )

    def fetch_stories(self):
        """Download every listed subreddit and append usable posts.

        Results are appended to ``self.stories``.

        Raises:
            ValueError: If the subreddits file contains no names.
            SystemExit: If a subreddit cannot be downloaded after all tries.
        """
        for subreddit in self._read_subreddits():
            print("")
            print(f"Getting stories from r/{subreddit}")
            for child in self._download_top_posts(subreddit):
                post = child["data"]
                clean_title = self.process_text(post["title"])
                clean_text = self.process_text(post["selftext"])
                if not self._fits_limits(clean_title, clean_text):
                    continue
                self.stories.append({
                    "title": clean_title,
                    "text": clean_text,
                    "ups": post["ups"],
                    "upvote_ratio": post["upvote_ratio"],
                    "subreddit": post["subreddit"],
                    "permalink": post["permalink"],
                    "created": post["created"],
                    "id": post["id"],
                })

    def save_stories(self, output_file):
        """Filter ``self.stories`` and write the result to ``output_file``.

        Keeps stories whose upvote ratio is at least ``min_upvote_ratio`` and
        whose id is not in the existing-stories file, ordered by upvote ratio
        (highest first). ``self.stories`` is replaced by the filtered list.
        """
        with open(self.existing_stories_file, "r", encoding="utf-8") as f:
            # A set makes the per-story membership check O(1).
            existing_ids = {story["id"] for story in json.load(f)}

        kept = [
            story for story in self.stories
            if story.get("upvote_ratio", 0) >= self.min_upvote_ratio
            and story["id"] not in existing_ids
        ]
        # Stable sort: stories with equal ratios keep their fetch order.
        kept.sort(key=lambda story: story.get("upvote_ratio", 0), reverse=True)
        self.stories = kept

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.stories, f, ensure_ascii=False, indent=4)

    def run(self, output_file):
        """Fetch stories from all subreddits, then save them to ``output_file``."""
        self.fetch_stories()
        self.save_stories(output_file)
if __name__ == "__main__":
    # Script entry point: read subreddit names from subreddits.txt, skip ids
    # already present in stories.json, and write the new stories to
    # to_be_processed.json.
    fetcher = RedditStoryFetcher(
        subreddits_file="subreddits.txt",
        existing_stories_file="stories.json",
    )
    fetcher.run(output_file="to_be_processed.json")