-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpunk_scraper.py
139 lines (105 loc) · 4 KB
/
punk_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
### IMPORTS
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from typing import List, Dict
import re
import itertools
### CLASS
class Concert:
    """A single concert listing parsed from a date string and a title string.

    Attributes:
        id: Sequential number drawn from the class-wide ``id_counter``.
        date: Concert day as a ``datetime`` (time component is midnight).
        band: List of band descriptions; the first entry is treated as the
            headliner by ``shorthand``.
        time: Start time text, e.g. ``'21 Uhr'``.
        venue: Venue name with any bracketed address removed.
        region: Text that follows the venue in the listing.
        url: Link supplied by the caller.
    """

    id = None
    date = None
    band = None
    time = None
    venue = None
    region = None
    url = None
    # Shared by all instances so that every concert receives a distinct id.
    id_counter = itertools.count()

    def __init__(self, date: str, concert: str, url: str) -> None:
        """Parse ``date`` and ``concert`` and assign the next free id.

        Args:
            date: Text whose last ten characters are the date as dd.mm.yyyy.
            concert: Full listing title (bands, time, venue, region).
            url: Link associated with the listing; stored unchanged.

        Raises:
            ValueError: If the date or the title cannot be parsed.
        """
        self.date = self._date_from_string(date)
        self.band, self.time, self.venue, self.region = self._parse_concert_title(concert)
        # The id is taken only after parsing succeeded, so a failed parse
        # does not consume a number.
        self.id = next(self.id_counter)
        self.url = url

    def __repr__(self) -> str:
        """Return a multi-line, markdown-style description of the concert."""
        date_str = self.date.strftime('%d/%m/%y')
        band_lines = ''.join(f'- {self._close_parens(b)}\n' for b in self.band)
        return (
            f'**{date_str}**\n\n'
            f'{band_lines}'
            f'\n*{self.time} @ {self.venue}*\n'
            f'{self.region}'
        )

    @staticmethod
    def _close_parens(band: str) -> str:
        """Return ``band`` with a ')' appended only if a '(' is left open.

        Splitting the band list on '),' strips the closing parenthesis from
        every entry except the last one, so it must be restored selectively
        rather than appended to every entry.
        """
        if band.count('(') > band.count(')'):
            return band + ')'
        return band

    def _date_from_string(self, string) -> datetime:
        """Parse the trailing ``dd.mm.yyyy`` of ``string`` into a datetime."""
        date_string = string[-10:]  # extracting only the date as dd.mm.yyyy
        return datetime.strptime(date_string, '%d.%m.%Y')

    def _parse_concert_title(self, concert_title: str) -> tuple:
        """Split a listing title into ``(band, time, venue, region)``.

        Example title:
            HEMELBESTORMER (Post-Rock, Sludge, Black Metal aus Belgien),
            NOORVIK (Rock, Metal)
            ab 21 Uhr im Sonic Ballroom (Oskar-Jäger-Str. 190)
            / Köln-Ehrenfeld

        Args:
            concert_title: The full title text of one listing.

        Returns:
            A tuple ``(band, time, venue, region)`` where ``band`` is a list
            of strings and the other three items are strings.

        Raises:
            ValueError: If the part after the time marker does not contain
                at least a time, a venue and a region.
        """
        # 1. separate the band list from the "ab <time> ..." remainder
        split_str = re.split(r'ab\s', concert_title)

        # 2. getting the bands; entries are separated by '),'
        if '),' in split_str[0]:
            band = split_str[0].split('),')
        else:
            band = [split_str[0]]
        # Removing surrounding whitespace and fixing umlauts.
        # NOTE(review): as written, each replace below maps a character to
        # itself; presumably the original search strings were mis-decoded
        # umlaut sequences - confirm against the upstream file.
        band = [
            b.strip()
            .replace('ö', 'ö')
            .replace('ü', 'ü')
            for b in band
        ]

        # 3. getting the time, venue, region
        venue_data = [
            part.strip()
            for part in re.split(r'\sim\s|in der|auf dem|/', split_str[-1])
        ]
        if len(venue_data) < 3:
            raise ValueError(
                f'cannot extract time, venue and region from {concert_title!r}'
            )
        time, venue, region = venue_data[:3]

        # 3.1 fixing the umlauts (see the note above)
        venue = venue.replace('ö', 'ö')
        region = region.replace('ö', 'ö')
        region = region.replace('ü', 'ü')

        # 3.2 removing the address in brackets, plus the whitespace that
        # preceded it so the venue has no trailing blank
        venue = re.sub(r"[\(\[].*?[\)\]]", "", venue).strip()

        return band, time, venue, region

    @property
    def shorthand(self) -> str:
        """Succinct version of the concert info: id, date and headliner.

        When more than one band is listed, the number of additional bands
        is appended as ``[+N]``.
        """
        date_str = self.date.strftime('%d/%m/%y')
        s = f'[{self.id}] **{date_str}** - {self._close_parens(self.band[0])}'
        other_bands = len(self.band) - 1
        if other_bands > 0:
            s += f' [+{other_bands}]'
        return s
### SCRAPING WEB DATA
def fetch_concerts() -> Dict[int, Concert]:
    """Download the punkstelle.de calendar and return the upcoming concerts.

    Returns:
        A dictionary mapping each concert's ``id`` to its ``Concert``
        object. Only concerts dated today or later are included. The
        dictionary is empty when the page has no ``div#calendar`` element.

    Raises:
        requests.RequestException: If the page cannot be downloaded within
            the timeout or the server answers with an error status.
        ValueError: If a listing's date or title cannot be parsed.
    """
    # A timeout keeps the call from blocking forever on a stalled server.
    response = requests.get('http://www.punkstelle.de/', timeout=30)
    response.raise_for_status()

    # Hand the raw bytes to the parser so it can determine the encoding from
    # the document itself rather than from requests' header-based guess.
    soup = BeautifulSoup(response.content, 'lxml')

    calendar = soup.find('div', {'id': 'calendar'})
    if calendar is None:
        return {}

    # Compare calendar days, not timestamps: a concert's date is midnight of
    # its day, so comparing against the current time would drop concerts
    # that take place today.
    today = datetime.today().date()

    concerts = {}
    for event in calendar.find_all('div', class_='event'):
        # skip events that contain an element with class "month" or "full"
        if event.find(class_='month') or event.find(class_='full'):
            continue

        date = event.find('div', class_='date').text
        concert_title = event.find('div', class_='concert-title').text
        link = event.find('a')
        url = link.get('href') if link is not None else None

        concert = Concert(date, concert_title, url)
        # filtering out past concerts
        if concert.date.date() >= today:
            concerts[concert.id] = concert

    return concerts
# Script entry point: print the short form of every upcoming concert.
if __name__ == "__main__":
    for upcoming in fetch_concerts().values():
        print(upcoming.shorthand)