-
Notifications
You must be signed in to change notification settings - Fork 26
/
scrape-wikiart-by-artist.py
99 lines (77 loc) · 3.94 KB
/
scrape-wikiart-by-artist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import urllib
import re
from bs4 import BeautifulSoup
import time
file_path = "art/wikiart"
base_url = "https://www.wikiart.org"
# iterate through all artists by last name alphabetically
for c in range(ord('a'), ord('z')+1):
char = chr(c)
artist_list_url = base_url + '/en/Alphabet/' + char + '/text-list'
genre_soup = BeautifulSoup(urllib.request.urlopen(artist_list_url), "lxml")
artist_list_main = genre_soup.find("main")
lis = artist_list_main.find_all("li")
# for each list element
for li in lis:
born = 0
died = 0
# get the date range
for line in li.text.splitlines():
if line.startswith(",") and "-" in line:
parts = line.split('-')
if len(parts) == 2:
born = int(re.sub("[^0-9]", "",parts[0]))
died = int(re.sub("[^0-9]", "",parts[1]))
# look for artists who may have created work that could in public domain
if born>1850 and died>0 and (born<1900 or died<1950):
link = li.find("a")
artist = link.attrs["href"]
if artist == "/en/salvador-dali": # skip Dali
continue
# get the artist's main page
artist_url = base_url + artist
artist_soup = BeautifulSoup(urllib.request.urlopen(artist_url), "lxml")
# only look for artists with the word abstract on their main page
if "Abstract" in artist_soup.text or "abstract" in artist_soup.text or "Avant-garde" \
in artist_soup.text or "avant-garde" in artist_soup.text:
print(artist + " " + str(born) + " - " + str(died))
# get the artist's web page for the artwork
url = base_url + artist + '/all-works/text-list'
artist_work_soup = BeautifulSoup(urllib.request.urlopen(url), "lxml")
# get the main section
artist_main = artist_work_soup.find("main")
image_count = 0
artist_name = artist.split("/")[2]
# get the list of artwork
lis = artist_main.find_all("li")
# for each list element
for li in lis:
link = li.find("a")
if link != None:
painting = link.attrs["href"]
# get the painting
url = base_url + painting
print(url)
try:
painting_soup = BeautifulSoup(urllib.request.urlopen(url), "lxml")
except:
print("error retreiving page")
continue
# check the copyright
if "Public domain" in painting_soup.text:
#check the genre
genre = painting_soup.find("span", {"itemprop":"genre"})
if genre != None and genre.text == "abstract":
# get the url
og_image = painting_soup.find("meta", {"property":"og:image"})
image_url = og_image["content"].split("!")[0] # ignore the !Large.jpg at the end
print(image_url)
save_path = file_path + "/" + artist_name + "_" + str(image_count) + ".jpg"
#download the file
try:
print("downloading to " + save_path)
time.sleep(0.2) # try not to get a 403
urllib.request.urlretrieve(image_url, save_path)
image_count = image_count + 1
except Exception as e:
print("failed downloading " + image_url, e)