-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNews-scraper.py
135 lines (116 loc) · 4.55 KB
/
News-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import tweepy
import time
import requests
from bs4 import BeautifulSoup
from transformers import *
from itertools import islice
import os, dotenv
from datetime import datetime
# Local time of day at the moment the script starts; the script compares it
# against the window boundaries defined below.
currentTime = datetime.now().time()

# Start and end of the morning (09:00-09:20) and evening (18:00-18:20)
# windows, parsed from 12-hour clock strings into datetime.time values.
(
    givenTimePreRangeMorning,
    givenTimePostRangeMorning,
    givenTimePreRangeEvening,
    givenTimePostRangeEvening,
) = (
    datetime.strptime(clock, "%I:%M%p").time()
    for clock in ("09:00AM", "09:20AM", "06:00PM", "06:20PM")
)
def get_var_value(filename="/Users/parambhatia/News/varstore.dat"):
    """Increment the counter stored in *filename* and return the new value.

    Despite the name this is a read-modify-write: the stored integer is read
    (a missing, empty or whitespace-only file counts as 0), increased by one,
    written back as decimal text, and returned.

    Args:
        filename: Path of the counter file. It is created if it does not exist.

    Returns:
        The incremented counter as an int.

    Raises:
        ValueError: If the file holds text that is not an integer.
    """
    # "a+" creates the file on first use without discarding existing content.
    with open(filename, "a+") as f:
        f.seek(0)
        # Strip first so a file holding only a newline (e.g. after a manual
        # edit) is treated as empty instead of making int() raise.
        stored = f.read().strip()
        val = int(stored or 0) + 1
        # Empty the file before writing so only the new value remains.
        f.seek(0)
        f.truncate()
        f.write(str(val))
    return val
def set_var_value(
    filename="/Users/parambhatia/News/varstore.dat",
    data_filename="/Users/parambhatia/News/data.txt",
):
    """Reset the stored counter to 1 and empty the headline queue file.

    Both files are created when they do not exist yet, so a reset that
    happens before any headlines were ever written no longer fails.

    Args:
        filename: Path of the counter file; its content becomes the text "1".
        data_filename: Path of the headline queue file; truncated to zero
            length.
    """
    # NOTE(review): get_var_value() adds one before returning, so the first
    # read after this reset yields 2, while job() only refetches headlines
    # when the counter equals 1 - confirm whether 0 was intended here.
    with open(filename, "w") as f:
        f.write(str(1))
    # Opening in "w" mode truncates the queue (and creates it if missing).
    with open(data_filename, "w"):
        pass
# Inside either daily window the stored counter and the headline queue are
# reset before the counter is read for this run. The windows do not overlap,
# so at most one reset happens here.
if (
    givenTimePreRangeMorning <= currentTime < givenTimePostRangeMorning
    or givenTimePreRangeEvening <= currentTime < givenTimePostRangeEvening
):
    set_var_value()

# get_var_value() increments the stored counter and returns the new value;
# job() reads this module-level name.
counterExecute = get_var_value()
print("The current article count is :" + " " + str(counterExecute))

# NOTE(review): the nesting below is inferred (the original indentation was
# lost). Everything up to the second reset is assumed to belong to the
# ">= 27" branch - confirm against the author's copy.
if counterExecute >= 27:
    set_var_value()
    print()
    print('---------------------------')
    print()
    # Show what is left in the queue file after the reset.
    with open("/Users/parambhatia/News/data.txt", "r+") as file:
        lines = file.readlines()
        print(lines)
    # NOTE(review): repeats the reset performed at the top of this branch.
    set_var_value()
print("This script has been run {} times.".format(str(counterExecute)))
# Load a .env file (if any) so the credentials below can be supplied
# through it as well as through the real process environment.
dotenv.load_dotenv()

# All four credentials are mandatory: a missing variable raises KeyError.
consumerKey, consumerSecret, accessToken, accessTokenSecret = (
    os.environ[variable]
    for variable in (
        "CONSUMER_KEY",
        "CONSUMER_SECRET",
        "ACCESS_TOKEN",
        "SECRET_ACCESS_TOKEN",
    )
)

# Authenticated client used by job() to post status updates.
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
    """Generate paraphrases of *sentence* with *model* and decode them.

    Args:
        model: Object whose ``generate`` accepts the encoded inputs plus
            ``num_beams`` and ``num_return_sequences`` keyword arguments.
        tokenizer: Callable that encodes a list of sentences and also
            provides ``batch_decode``.
        sentence: The text to paraphrase.
        num_return_sequences: Passed through to ``model.generate``.
        num_beams: Passed through to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated output,
        with special tokens skipped.
    """
    # Encode the single sentence as a one-element batch.
    encoded = tokenizer(
        [sentence],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )
    generation_options = {
        "num_beams": num_beams,
        "num_return_sequences": num_return_sequences,
    }
    generated = model.generate(**encoded, **generation_options)
    # Turn the generated output back into text.
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
def printNews(feed_url='http://feeds.bbci.co.uk/news/rss.xml?edition=uk', max_articles=36, timeout=10):
    """Download an RSS feed and return its leading items as dictionaries.

    Args:
        feed_url: Address of the RSS feed to download.
        max_articles: Upper bound on the number of items returned; ``None``
            returns every item in the feed.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``link``
        and ``pubDate``, in feed order.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        AttributeError: If an item lacks one of the four expected tags.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(feed_url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as an empty feed.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, features='xml')
    return [
        {
            'title': item.title.text,
            'description': item.description.text,
            'link': item.link.text,
            'pubDate': item.pubDate.text,
        }
        for item in soup.find_all('item')[:max_articles]
    ]
def _refill_headline_queue(queue_path):
    """Fetch the feed, paraphrase every title and overwrite the queue file.

    Each article is written as two lines: the paraphrased headline in double
    quotes, then a "Source : <link>" line.
    """
    newsStack = printNews()
    print(len(newsStack))
    model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
    tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")
    with open(queue_path, 'w') as file:
        for article in newsStack:
            amendedSentence = get_paraphrased_sentences(
                model, tokenizer, article['title'], num_beams=10, num_return_sequences=1
            )
            headline = f'"{amendedSentence[0]}"'
            source = "Source : " + article['link']
            final_string = headline + "\n" + source
            print(final_string)
            file.write(final_string + "\n")


def _post_next_headline(queue_path):
    """Post the first queued entry and remove it from the queue file.

    Returns:
        True when an entry was posted, False when the queue held fewer than
        the two lines that make up one entry.
    """
    with open(queue_path, "r+") as file:
        lines = file.readlines()
        head = lines[:2]
        if len(head) < 2:
            # Nothing complete to post; leave the file untouched.
            print("No queued headline to post")
            return False
        print("the current headline is:")
        print(head)
        print("Hope you enjoyed reading that headline")
        current_time = datetime.now().strftime("%H:%M")
        api.update_status(head[0] + head[1] + "- " + current_time)
        # Only reached when posting succeeded: drop the posted entry.
        file.seek(0)
        file.truncate()
        file.writelines(lines[2:])
    return True


def job():
    """Run one cycle: refill the queue on run 1, then post one headline.

    Reads the module-level ``counterExecute``. When it equals 1 the feed is
    fetched and the queue file is rebuilt; in every case the first queued
    entry (two lines) is then posted with the current HH:MM time appended
    and removed from the file. If the queue holds no complete entry, a
    notice is printed and nothing is posted.
    """
    # NOTE(review): the original indentation was lost; posting is assumed to
    # run on every call, not only when the queue was just refilled.
    queue_path = "/Users/parambhatia/News/data.txt"
    if counterExecute == 1:
        _refill_headline_queue(queue_path)
    _post_next_headline(queue_path)
# Run a single cycle immediately whenever this file is executed.
job()