-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmeme_scrape.py
82 lines (60 loc) · 2.89 KB
/
meme_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import praw
from psaw import PushshiftAPI
from datetime import datetime, timedelta
import requests
import urllib.request, urllib.error
# Build a PRAW client from the (placeholder) credentials and wrap it in the
# Pushshift search API. The wrapped object is published as the module-level
# name `reddit`, which download_memes() uses for its submission searches.
reddit = PushshiftAPI(
    praw.Reddit(
        client_id='YOUR-ID',
        client_secret='YOUR_SECRET_ID',
        user_agent='USER_AGENT',
    )
)
def download_memes(start, end, save_path='./output/meme_pics',
                   csv_path='./output/memes1921.csv', subreddit='memes',
                   limit=250, timeout=30):
    """Download image posts from a subreddit, one day-long window at a time.

    For every 24-hour window beginning at ``start`` and strictly before
    ``end``, the module-level ``reddit`` search client is queried for up to
    ``limit`` submissions. Each submission whose URL ends in a known image
    suffix is downloaded into ``save_path`` and described by one
    semicolon-separated row appended to ``csv_path``.

    Args:
        start: ``datetime`` of the first window to scrape (inclusive).
        end: ``datetime`` at which scraping stops (exclusive). Nothing is
            scraped when ``end <= start``.
        save_path: Directory receiving the image files; created if missing.
        csv_path: CSV file that rows are appended to; its parent directory
            is created if missing.
        subreddit: Name of the subreddit to search.
        limit: Maximum number of submissions requested per window.
        timeout: Seconds to wait for an image host before giving up.

    Returns:
        None. Results are written to disk.

    Notes:
        Files are named ``<YYYY.MM.DD>_<n>.png`` regardless of the source
        format; the name is kept so that existing outputs and the CSV ids
        stay consistent. CSV rows have the form
        ``"<id>";"<title>";"<selftext>";<score>`` with ``;`` inside the text
        fields replaced by ``,``.
    """
    # str.endswith accepts a tuple, so the four-letter suffixes really match
    # (a fixed three-character slice could never equal 'jpeg' or 'tiff').
    image_suffixes = ('jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg')
    # Final URLs some hosts redirect to once the image has been taken down.
    removed_urls = ('https://i.imgur.com/removed.png',
                    'http://www.noelshack.com/')
    one_day = timedelta(days=1)

    # makedirs also creates missing parents such as './output'.
    os.makedirs(save_path, exist_ok=True)
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    current_day = start
    # '<' instead of an equality test guarantees termination even when `end`
    # is not a whole number of days after `start`.
    while current_day < end:
        day_label = datetime.strftime(current_day, "%Y.%m.%d")
        saved = 0  # per-day counter used in the file name / CSV id
        generator = reddit.search_submissions(
            limit=limit,
            subreddit=subreddit,
            after=int(current_day.timestamp()),
            before=int((current_day + one_day).timestamp()))
        for post in generator:
            try:
                url = post.url
                # Cheap checks first: no network traffic for non-image posts.
                if not url.endswith(image_suffixes):
                    continue
                selftext = post.selftext
                if selftext == '[deleted]':
                    continue

                img_data = _fetch_image(url, removed_urls, timeout)
                if img_data is None:
                    continue

                # Build the CSV row before touching the disk so that a
                # failure here cannot leave an image without a row.
                record_id = f'{day_label}_{saved}'
                title = post.title.replace(";", ",")
                body = selftext.replace(";", ",")
                score = str(post.score).strip()
                row = f'"{record_id}";"{title}";"{body}";{score}\n'

                image_file = os.path.join(save_path, f'{record_id}.png')
                with open(image_file, 'wb') as handler:
                    handler.write(img_data)
                # Explicit UTF-8 so non-ASCII titles do not fail on platforms
                # whose default encoding cannot represent them.
                with open(csv_path, 'a', encoding='utf-8') as fd:
                    fd.write(row)
                saved += 1
            except Exception as exc:
                # Best effort per post: report and move on. Unlike a bare
                # `except:`, this lets Ctrl-C / SystemExit stop the scrape.
                print(f'Skipping post from {day_label}: {exc!r}')
                continue
        current_day += one_day


def _fetch_image(url, removed_urls, timeout):
    """Return the bytes served at ``url``, or None if the image is unavailable.

    A single request is made. The result is None when the server answers
    with an error status or when the request was redirected to one of
    ``removed_urls``. Connection problems and timeouts raise
    ``requests.RequestException`` for the caller to handle.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return None
    if response.url in removed_urls:
        return None
    return response.content
# Script entry: scrape day by day starting at 2019-01-01 and stopping once
# the running date reaches 2021-10-17.
scrape_start = datetime.strptime('2019.01.01.', '%Y.%m.%d.')
scrape_end = datetime.strptime('2021.10.17.', '%Y.%m.%d.')
download_memes(scrape_start, scrape_end)