-
-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathextract_content.py
105 lines (95 loc) · 3.48 KB
/
extract_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
def get_content(soup):
    """Return the article body text extracted from a parsed page.

    A fixed, ordered list of site-specific container lookups is tried first;
    the first lookup that matches has the text of its ``<p>`` descendants
    gathered by ``collect_content`` and returned.  If no lookup matches, a
    generic fallback joins every ``<p>`` of the page that does not look like
    markup or script.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The extracted text as a ``str`` (possibly empty).  Container matches
        yield one paragraph per line; the fallback yields a single line with
        whitespace runs collapsed.
    """
    # (find_all positional args, find_all keyword args, skip_if_empty).
    # Order is significant: earlier entries take priority.  Lookups run
    # lazily so the document is not searched more often than necessary.
    heuristics = (
        (('div',), {'id': 'articleContentBody'}, False),
        (('div',), {'class_': 'ArticleText'}, False),
        (('div',), {'id': 'ArticleText'}, False),
        (('div',), {'id': 'article_content'}, False),
        (('div',), {'class_': 'articleBodyText'}, False),
        (('div',), {'class_': 'story-container'}, False),
        # Loose match on any div whose id contains "article"; it is only
        # trusted when it actually produced paragraph text.
        (('div',), {'id': re.compile('article')}, True),
        (('div',), {'class_': 'kizi-honbun'}, False),
        (('div',), {'class_': 'main-text'}, False),
        ((), {'id': 'articleText'}, False),
    )
    for args, kwargs, skip_if_empty in heuristics:
        containers = soup.find_all(*args, **kwargs)
        if not containers:
            continue
        # Always collect from the containers that were just matched (the
        # div#ArticleText case previously collected from an empty result).
        collected = collect_content(containers)
        if collected or not skip_if_empty:
            return collected

    # Contingency: no known container matched, fall back to every <p>.
    banned_words = ('<', 'javascript')
    # Keep only paragraphs in which fewer than a third of the characters are
    # ASCII letters.  NOTE(review): presumably this targets non-Latin (e.g.
    # Japanese) articles and drops leftover markup/script -- confirm.
    max_ascii_letter_ratio = 0.33
    kept = []
    for paragraph in soup.find_all('p'):
        text = paragraph.text
        if not text:
            continue
        lowered = text.lower()
        if any(word in lowered for word in banned_words):
            continue
        letter_ratio = len(re.findall('[a-z]', lowered)) / (len(text) + 1)
        if letter_ratio < max_ascii_letter_ratio:
            kept.append(text)

    content = ' '.join(kept).replace('\n', ' ')
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s\s+', ' ', content)  # collapse whitespace runs
def collect_content(parent_tag):
    """Collect the text of all ``<p>`` descendants of the given tags.

    Args:
        parent_tag: Iterable of tags (despite the singular name), each
            exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A ``str`` with the text of every paragraph followed by a newline, in
        the order encountered; ``''`` when there are no paragraphs.  A
        paragraph reachable through several of the given tags (nested
        containers) is emitted only once.
    """
    seen_ids = set()
    paragraphs = []
    for container in parent_tag:
        for p_tag in container.find_all('p'):
            # De-duplicate by object identity, not equality: two distinct
            # paragraphs with identical text must both be kept.  The tags
            # stay referenced in `paragraphs`, so ids cannot be recycled.
            if id(p_tag) in seen_ids:
                continue
            seen_ids.add(id(p_tag))
            paragraphs.append(p_tag)
    return ''.join(p_tag.text + '\n' for p_tag in paragraphs)
def get_title(soup):
    """Return the article title extracted from a parsed page.

    Lookups are tried in priority order and the first hit wins:

    1. any element with ``itemprop="headline"``;
    2. the first non-empty ``<h1>`` inside a ``div`` with class ``Title``;
    3. ``h1.article-headline``, ``h2#main_title``, any class containing
       ``headline``, the first ``<h1>``, any element with class ``Title``,
       then any class containing ``title`` / ``Title``.

    NOTE: the original author's docstring advises using the "Google
    truncated title trick" instead of this function; that mechanism is not
    part of this block.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        The text of the first matching element, or ``""`` when nothing
        matches.  Never ``None``.
    """
    item_prop_hl = soup.find_all(itemprop='headline')
    if item_prop_hl:
        return item_prop_hl[0].text

    for title_div in soup.find_all('div', class_='Title'):
        for h1_tag in title_div.find_all('h1'):
            if h1_tag.text:
                return h1_tag.text
    # A div.Title without a usable <h1> must not end the search: fall
    # through to the remaining heuristics instead of returning None.

    # (find_all positional args, find_all keyword args), by priority.
    fallbacks = (
        (('h1',), {'class_': 'article-headline'}),
        (('h2',), {'id': 'main_title'}),
        ((), {'class_': re.compile('headline')}),
        (('h1',), {}),
        ((), {'class_': 'Title'}),
        ((), {'class_': re.compile('title')}),
        ((), {'class_': re.compile('Title')}),
    )
    for args, kwargs in fallbacks:
        matches = soup.find_all(*args, **kwargs)
        if matches:
            return matches[0].text
    return ""