news.py
import json
from io import BytesIO
from urllib.parse import urljoin

import nltk  # type: ignore
import requests
from bs4 import BeautifulSoup  # type: ignore
from flask import Flask, render_template, send_file
from newspaper import Article  # type: ignore

# Download NLTK resources if not already present ('punkt' is required by newspaper's nlp()).
nltk.download('punkt')

app = Flask(__name__)
def fetch_article_urls():
    """Collect unique article links from each Hindustan Times category page."""
    base_url = 'https://www.hindustantimes.com/'
    categories = ['technology', 'sports', 'education', 'latest-news']  # Categories to crawl
    all_matching_links = []
    seen_article_ids = set()  # Track article IDs already seen, across all categories
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for category in categories:
        url = base_url + category
        response = requests.get(url, headers=headers)
        matching_links = []
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            a_tags = soup.find_all('a', href=True)
            for a_tag in a_tags:
                href = a_tag['href']
                # Article links on these pages carry a data-articleid attribute.
                if a_tag.has_attr('data-articleid'):
                    article_id = a_tag['data-articleid']
                    if article_id not in seen_article_ids:
                        seen_article_ids.add(article_id)
                        # urljoin resolves relative hrefs against the site root
                        # without doubling the slash after the domain.
                        full_url = urljoin(base_url, href)
                        matching_links.append({'article_id': article_id, 'url': full_url})
        else:
            print(f'Failed to retrieve the page for {category}. Status code: {response.status_code}')
        if matching_links:
            all_matching_links.append({'category': category, 'links': matching_links})
    return all_matching_links
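
# For reference, fetch_article_urls() returns a list shaped roughly like the
# sketch below; the IDs and slugs are illustrative placeholders, not real data:
#
# [
#     {'category': 'technology',
#      'links': [{'article_id': '101',
#                 'url': 'https://www.hindustantimes.com/technology/example-story-101.html'}]},
#     {'category': 'sports', 'links': [...]},
# ]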
def scrape_articles():
    """Download, parse, and summarize every article found by fetch_article_urls()."""
    all_urls = fetch_article_urls()
    articles = {}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for category in all_urls:
        for article in category['links']:
            article_id = article['article_id']
            url = article['url']
            # Confirm the URL is reachable before handing it to newspaper,
            # which fetches the page again internally via download().
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                article_data = Article(url)
                article_data.download()
                article_data.parse()
                article_data.nlp()  # Builds the summary; needs the 'punkt' tokenizer
                publish_date = str(article_data.publish_date) if article_data.publish_date else 'N/A'
                articles[article_id] = {
                    'category': category['category'],
                    'url': url,
                    'title': article_data.title,
                    'summary': article_data.summary,
                    'publish_date': publish_date,
                    'image_url': article_data.top_image
                }
            else:
                print(f'Failed to retrieve article from {url}. Status code: {response.status_code}')
    return articles
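
# scrape_articles() keys its result by article ID; one entry looks roughly like
# this (field values are illustrative placeholders, not scraped data):
#
# {'101': {'category': 'technology',
#          'url': 'https://www.hindustantimes.com/technology/example-story-101.html',
#          'title': 'Example headline',
#          'summary': 'A few sentences produced by newspaper\'s nlp().',
#          'publish_date': '2024-01-01 00:00:00',
#          'image_url': 'https://example.com/lead-image.jpg'}}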
@app.route('/')
def index():
    # Scrape on every request; acceptable for a demo, but worth caching in production.
    articles = scrape_articles()
    return render_template('news.html', articles=articles)

@app.route('/download')
def download():
    # Serialize the scraped articles to JSON in memory and stream it as an attachment.
    articles = scrape_articles()
    json_data = json.dumps(articles, indent=4)
    json_file = BytesIO()
    json_file.write(json_data.encode('utf-8'))
    json_file.seek(0)
    return send_file(json_file, as_attachment=True,
                     download_name='articles.json', mimetype='application/json')

if __name__ == '__main__':
    app.run(debug=True)
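
# Run with `python news.py` and open http://127.0.0.1:5000/ (Flask's default
# development address); /download serves the same data as an articles.json file.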