html_proc.py
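"""Fetch Il Post's RSS feed, pull the text of each linked article, and record
every word that is not yet in the Mongo `words` collection together with a
short context snippet. Each new word is then announced via the Telegram bot
(`utils.telegramBot.updateStatus`)."""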
import os
import time
import urllib.request
import asyncio
from datetime import datetime

import pymongo
import lxml.etree as ET
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer

from utils.telegramBot import updateStatus
# Local development only: uncomment to load environment variables from a .env file
# from dotenv import load_dotenv
# load_dotenv()
# Connects to the Mongo instance
client = pymongo.MongoClient(os.environ['MONGO'])
db = client.ilpost
words = db.words
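# Documents in `words` have the shape written by the upsert below:
# { word, context, url, date_added }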
# Parses the newspaper's RSS feed
opener = urllib.request.build_opener()
tree = ET.parse(opener.open('https://www.ilpost.it/feed')) # https://rss.draghetti.it/ilpost.xml alternative
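# Each feed <item> provides the three fields used below: title, link and pubDate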
for item in tree.findall('channel/item'):
    try:
        title = item.find('title')
        link = item.find('link')
        date = item.find('pubDate')
        pub_date = datetime.strptime(date.text, '%a, %d %b %Y %H:%M:%S %z').strftime('%Y-%m-%dT%H:%M:%S.%f%z')
        innerHtml = urllib.request.urlopen(link.text).read()
        innerSoup = BeautifulSoup(innerHtml, features="lxml")
        # Ignore all scripts and CSS
        for script in innerSoup(["script", "style"]):
            script.extract()
        # Ignore embedded tweets and Instagram posts
        for div in innerSoup.find_all("blockquote", {'class': 'twitter-tweet'}):
            div.decompose()
        for div in innerSoup.find_all("blockquote", {'class': 'instagram-media'}):
            div.decompose()
        # Ignore tag links
        for div in innerSoup.find_all("a", {'rel': 'tag'}):
            div.decompose()
        # Get the title
        title = title.text
        # Get and clean the article text
        text = innerSoup.find('article').get_text(separator=" ")
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headline lines apart on double spaces
        chunks = (phrase.strip()
                  for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        # Get tokens - ignore punctuation and skip @mentions and #hashtags
        tokenizer = RegexpTokenizer(r'(?<![@#])\b\w+\b')
        tokens = tokenizer.tokenize(text)
        # Remove duplicates
        tokens = list(set(tokens))
        # Check which tokens already exist in the db
        database_tokens = list(words.find({"word": {"$in": tokens}}))
        # Keep only the tokens that are not in the db yet
        known_words = [doc['word'] for doc in database_tokens]
        different_tokens = list(set(tokens) - set(known_words))
        for token in different_tokens:
            # Heuristic filter: skip tokens containing digits or capitals
            # (likely names or acronyms) and very short tokens
            if any(str.isdigit(c) or str.isupper(c) for c in token) or len(token) <= 3:
                continue
            print('new token!', token)
            # Define a text snippet of ~50 characters around the token
            range_snippet = 50
            start_index = text.find(token)
            end_index = start_index + len(token)
            # Clamp to the text bounds to avoid reading past the edges
            snippet = text[max(0, start_index - range_snippet):min(len(text), end_index + range_snippet)]
            # Drop the (likely partial) first and last words
            finalsnippet = ' '.join(snippet.split()[1:-1])
            # Upserts the word into Mongo
            words.update_one(
                {'word': token},
                {
                    '$set': {
                        'word': token,
                        'context': finalsnippet,
                        'url': link.text,
                        'date_added': pub_date
                    }
                },
                upsert=True
            )
            # Announces the new word via the Telegram bot
            loop = asyncio.get_event_loop()
            loop.run_until_complete(updateStatus(token, link.text, title, finalsnippet))
            # Throttle between announcements
            time.sleep(5)
    except Exception as e:
        print(e)
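# Note: the script exits after one pass over the feed; running it periodically
# (e.g. via cron) is assumed to be handled outside this file.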