-
Notifications
You must be signed in to change notification settings - Fork 1
/
readersdigest.py
157 lines (127 loc) · 4.41 KB
/
readersdigest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from bs4 import BeautifulSoup
import requests
from requests.exceptions import ConnectionError
import pickle
import random
import os
from content import Content
from datetime import datetime
class JokesData(Content):
    """Text ``Content`` item describing a single joke.

    On top of what ``Content`` stores, a joke carries a title, a genre
    and a post id, each exposed through a getter.
    """

    def __init__(self):
        # The type is set before the base initialiser runs; this order is
        # kept deliberately because Content's requirements are not visible
        # from this file.
        self.set_type('text')
        super(JokesData, self).__init__()

    def set_content(self, message, title,
                    genre, post_id,
                    website, timestamp):
        """Populate the joke.

        ``message``, ``website`` and ``timestamp`` are forwarded to
        ``Content.set_content``; ``title``, ``genre`` and ``post_id`` are
        stored on this object.  The title (wrapped in ``*``) and the genre
        (wrapped in ``_``) are prepended to the stored message.
        """
        super(JokesData, self).set_content(
            message=message,
            website=website,
            timestamp=timestamp,
        )
        # NOTE(review): presumably the base class leaves a list in
        # self._message (it is concatenated with a list here) - confirm
        # against Content.
        heading = ['*' + title + '*', '_' + genre + '_']
        self._message = heading + self._message
        self._title, self._post_id, self._genre = title, post_id, genre

    def get_title(self):
        """Return the joke's title."""
        return self._title

    def get_id(self):
        """Return the joke's post id."""
        return self._post_id

    def get_genre(self):
        """Return the joke's genre."""
        return self._genre

    def save(self, file):
        """Placeholder: intended to write this item as JSON into ``file``.

        Not implemented yet; currently does nothing and returns ``None``.
        """
        pass

    def __str__(self):
        """Return the base description followed by title, genre and id."""
        base = super(JokesData, self).__str__()
        details = "Title: %s\nGenre: %s\nPost ID: %s" % (
            self._title,
            self._genre,
            self._post_id,
        )
        return base + details

    # repr() yields exactly the same text as str().
    __repr__ = __str__
# Connect to the URL and return BeautifulSoup object of the html
def get_soup(url, max_retries=None, retry_delay=1.0):
    """Download ``url`` and return its HTML as a ``BeautifulSoup`` object.

    The request is retried whenever it fails with ``ConnectionError``
    (the ``requests.exceptions`` class imported at the top of this file).

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of retries after the first failed
            attempt.  ``None`` (the default) retries indefinitely, which
            is what callers have always relied on.
        retry_delay: Seconds to wait before each retry so that a
            network outage does not turn into a tight busy loop.  Values
            of zero or less disable the pause.

    Returns:
        The response body parsed with the ``lxml`` parser.

    Raises:
        ConnectionError: Only when ``max_retries`` is given and has been
            exhausted; the last connection error is re-raised.
    """
    import time  # local import so the block does not rely on new file-level imports

    failures = 0
    while True:
        try:
            req = requests.get(url)
        except ConnectionError:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            if retry_delay > 0:
                time.sleep(retry_delay)
            continue
        break
    return BeautifulSoup(req.text, 'lxml')
# Download jokes, build a JokesData object for each joke, and return
# them as a list of JokesData
def download_jokes(url, joke_type, limit=-1):
    """Scrape one joke category and return it as a list of ``JokesData``.

    Args:
        url: Base address of the site; ``'/jokes' + joke_type`` is
            appended to it to form the page that is fetched.
        joke_type: Category path beginning with ``/`` (for example
            ``'/corny'``).  Its last ``/``-separated component is used as
            the genre of every joke returned.
        limit: Maximum number of jokes to return.  Any value below zero
            (the default ``-1``) means "no limit".

    Returns:
        A list of populated ``JokesData`` objects, in page order.
    """
    # Jokes whose text is this long or longer are dropped.
    char_limit = 1000

    soup = get_soup(url + '/jokes' + joke_type)
    # Each joke lives in an <article class="jokes-river"> element.
    articles = soup.find_all('article', attrs={'class': 'jokes-river'})

    records = []
    for article in articles:
        # Skip malformed articles instead of failing on a missing tag.
        if article.header is None or article.div is None:
            continue
        # Trim surrounding newlines first, then surrounding spaces.
        title = article.header.text.strip('\n').strip(' ')
        message = article.div.text.strip('\n').strip(' ')
        # The length limit applies to the joke text itself, not its title.
        if len(message) < char_limit:
            records.append((article.get('id'), title, message))

    if limit > -1:
        records = records[:limit]

    website = 'readersdigest'
    genre = joke_type.split('/')[-1]  # identical for every joke on the page

    jokes = []
    for post_id, title, message in records:
        item = JokesData()
        item.set_content(
            title=title,
            genre=genre,
            message=message,
            timestamp=datetime.now().ctime(),
            website=website,
            post_id=post_id,
        )
        jokes.append(item)
    return jokes
# Return a joke from the web or the off-line storage and refill
# the off-line storage with new jokes as needed
def get_content():
    """Return one joke, refilling and persisting the local joke store.

    State is kept in ``readersdigest.pkl`` in the current working
    directory as a dict with three lists: ``JokesSeen``,
    ``JokesNotSeen`` and ``JokeTypesNotFetched``.  While fewer than 100
    unseen jokes are stored and unfetched categories remain, another
    category is downloaded.  One unseen joke is then picked at random,
    moved to the seen list, and the state is written back to disk.

    When every stored joke has already been served and no category is
    left to fetch, the seen jokes are recycled so the function keeps
    returning content instead of failing.

    Returns:
        The selected joke object.

    Raises:
        IndexError: If no joke is available at all (nothing stored and
            nothing could be downloaded).
    """
    meta_file = 'readersdigest.pkl'
    url = 'https://www.rd.com'

    # Load the metadata about the jokes retrieved so far, or create it
    # on the first run.
    if os.path.isfile(meta_file):
        # NOTE: pickle can execute arbitrary code while loading; this file
        # must only ever be the one written by this function.
        with open(meta_file, 'rb') as meta:
            data = pickle.load(meta)
    else:
        data = {
            'JokesSeen': [],
            'JokesNotSeen': [],
            'JokeTypesNotFetched': ['/knock-knock',
                                    '/corny',
                                    '/one-liners',
                                    '/riddles'],
        }

    # Top up the unseen pool from the web while it is small and there
    # are still categories that have not been scraped.
    while len(data['JokesNotSeen']) < 100 and data['JokeTypesNotFetched']:
        joke_type = data['JokeTypesNotFetched'].pop()
        data['JokesNotSeen'] += download_jokes(url, joke_type)

    if not data['JokesNotSeen']:
        # Everything has been served already: start over with the jokes
        # that were seen before rather than popping from an empty list.
        data['JokesNotSeen'] = data['JokesSeen']
        data['JokesSeen'] = []
    if not data['JokesNotSeen']:
        raise IndexError('no jokes available to return')

    # Shuffle so a different kind of joke can come back on every call.
    random.shuffle(data['JokesNotSeen'])
    joke = data['JokesNotSeen'].pop()
    data['JokesSeen'].append(joke)

    with open(meta_file, 'wb') as meta:
        pickle.dump(data, meta)
    return joke