-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdownl_meme1118.py
76 lines (57 loc) · 2.52 KB
/
downl_meme1118.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import praw
from psaw import PushshiftAPI
from datetime import datetime, timedelta
import requests
import urllib.request, urllib.error
import pandas as pd
df=pd.read_csv('./output/memes1118.csv')
df=df[['id','url']]
keep=[]
for ind, row in df.iterrows():
if row.url.split(".")[-1] in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg'] or 'imgur' in row.url or 'imgflip' in row.url:
keep.append(True)
else:
keep.append(False)
df=df[keep]
df.url=df.url.apply(lambda x: x.replace('/i/','/').replace('://','://i.')+'.jpg' if 'imgflip' in x and x.split(".")[-1] not in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg', 'gif']
else x.replace('/gallery/','/').replace('://m.','://i.')+'.jpg' if '/m.' in x and 'imgur' in x and x.split(".")[-1] not in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg', 'gif']
else x.replace('/gallery/','/').replace('://www.','://i.')+'.jpg' if '/www.' in x and 'imgur' in x and x.split(".")[-1] not in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg', 'gif']
else x.replace('/gallery/','/').replace('://www.','://i.') if '/www.' in x and 'imgur' in x
else x.replace('/gallery/','/').replace('://','://i.')+'.jpg' if '//i.' not in x and 'imgur' in x and x.split(".")[-1] not in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg', 'gif']
else x.replace('/gallery/','/')+'.jpg' if 'imgur' in x and x.split(".")[-1] not in ['jpg', 'png', 'bmp', 'jpeg', 'tiff', 'svg', 'gif']
else x)
df2=df
url=df2.copy()
url.columns=['image_name','image_url']
for ind, row in url.iterrows():
row['image_name']=row['image_name']+'.'+row['image_url'].split('.')[-1]
downloaded=[]
failed=[]
import concurrent.futures
import os
import requests
def save_image_from_url(url, output_folder):
image = requests.get(url.image_url)
output_path = os.path.join(
output_folder, url.image_name
)
with open(output_path, "wb") as f:
f.write(image.content)
def load(df, output_folder):
with concurrent.futures.ThreadPoolExecutor(
) as executor:
future_to_url = {
executor.submit(save_image_from_url, url, output_folder): url
for _, url in df.iterrows()
}
for future in concurrent.futures.as_completed(
future_to_url
):
url = future_to_url[future]
try:
future.result()
except Exception as exc:
print(
"%r generated an exception: %s" % (url, exc)
)
load(url, "./output/meme_pics/")