-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_wingedwords.py
72 lines (55 loc) · 2.32 KB
/
extract_wingedwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import csv
import string
# Prefix of the source pages and the per-letter suffixes (A-Z) used to walk
# through the input files one letter at a time.
base_url = 'https://de.wikipedia.org/wiki/Liste_gefl%C3%BCgelter_Worte/'
suffix = [*string.ascii_uppercase]
# Running number that gets embedded in each tweet text.
start_number = 1

# Column accumulators; the main loop appends to all three together.
dates, time, tweet_content = [], [], []

# The first scheduled date is the day after the script is run.
today = datetime.date.today()
day1 = today + datetime.timedelta(days=1)
# Percent-encodings for punctuation that, as the last character of a link,
# would not be included when Twitter auto-links the URL.
_TRAILING_PUNCTUATION_ESCAPES = {".": "%2E", "!": "%21", "?": "%3F"}


def _escape_trailing_punctuation(link):
    """Return *link* with a trailing '.', '!' or '?' percent-encoded.

    Links ending in any other character (and the empty string) are returned
    unchanged.
    """
    escaped = _TRAILING_PUNCTUATION_ESCAPES.get(link[-1:])
    if escaped is None:
        return link
    return link[:-1] + escaped


# Every tweet is scheduled for 12:00, so format the time once up front.
_TWEET_TIME = datetime.time(12).isoformat(timespec='minutes')

# Walk the saved table of contents of each letter page and queue one tweet
# per top-level entry, each one day after the previous one.
for letter in suffix:
    # Only the read needs the file handle; parsing happens after it is closed.
    with open('html/input_' + letter + '.html', 'r', encoding='utf-8') as f:
        html_doc = f.read()
    parent_path = base_url + letter
    soup = BeautifulSoup(html_doc, 'html.parser')
    toc_listings = soup.find_all("li", class_="toclevel-1")
    for listing in toc_listings:
        toc_link = parent_path + listing.find_next("a").get('href')
        # get_text() instead of .string: .string is None as soon as the span
        # contains nested markup, which would break the concatenation below.
        word_name = listing.find("span", class_="toctext").get_text()
        # The "Einzelnachweise" heading (presumably the references section)
        # is not a saying, so no tweet is created for it.
        if word_name == "Einzelnachweise":
            continue
        toc_link = _escape_trailing_punctuation(toc_link)
        # TODO: also check links for ', ’, " and () and replace them
        content = f'#WingedWord des Tages: "{word_name}" (# {start_number}). Zur Entstehung: {toc_link}'
        tweet_content.append(content)
        dates.append(day1)
        time.append(_TWEET_TIME)
        start_number += 1
        day1 = day1 + datetime.timedelta(days=1)
# Assemble the schedule table: the three collected columns first, followed by
# three placeholder columns that are left empty.
columns = ["Date", "time", "Tweet content", "image attachment", "latitude", "longitude"]
df = pd.DataFrame(data=zip(dates, time, tweet_content), columns=columns[:3])
for empty_column in columns[3:]:
    df[empty_column] = ""

# Randomise which text lands on which row; only this column is reassigned, so
# dates and times keep their original order.
# NOTE(review): each text already embeds its running number, so after the
# shuffle those numbers no longer follow the date order - confirm intended.
shuffled = df["Tweet content"].sample(frac=1)
df['Tweet content'] = shuffled.values
print(df)

# Export as tab-separated values, without the index and without quoting.
df.to_csv('wingedwords_tweets.tsv', sep="\t", index=False, quoting=csv.QUOTE_NONE)