news_scraper.py
import requests
import json
import time
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

NEWS_API_KEY = os.getenv("NEWS_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
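
# The script reads its credentials from a .env file next to it. A minimal
# sketch of that file (all values are placeholders, not real keys):
#
#   NEWS_API_KEY=your_newsapi_key
#   GOOGLE_API_KEY=your_google_api_key
#   GOOGLE_CX_ID=your_custom_search_engine_id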

DELAY = 1.5  # Delay between requests to avoid overwhelming servers
SCHOLAR_NAME = "Professor Dr Jodie Rummer"
RESULTS_FILE = f"{SCHOLAR_NAME.replace(' ', '_')}_portfolio.json"
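
# Hypothetical target for the example scrape in main(); the original passed
# the search terms where a URL is expected. Replace with a real page whose
# markup matches what scrape_web_content() assumes.
SCRAPE_URL = "https://example.com/news"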


def fetch_news_api(query, api_key):
    """Return articles mentioning the query from NewsAPI's /v2/everything endpoint."""
    url = "https://newsapi.org/v2/everything"
    # Pass the query via params so requests URL-encodes it (it contains spaces)
    response = requests.get(url, params={"q": query, "apiKey": api_key})
    response.raise_for_status()  # Check for HTTP errors
    return response.json().get("articles", [])


def fetch_google_search(query, api_key, cx):
    """Return results for the query from the Google Custom Search JSON API."""
    url = "https://www.googleapis.com/customsearch/v1"
    response = requests.get(url, params={"q": query, "key": api_key, "cx": cx})
    response.raise_for_status()
    # "items" is omitted from the response when there are no results
    return response.json().get("items", [])


def scrape_web_content(url):
    """Scrape title/link/description triples from a page that wraps each
    story in a <div class="article"> block."""
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for article in soup.find_all("div", class_="article"):
        title = article.find("h2")
        link = article.find("a")
        description = article.find("p")
        # Skip malformed blocks rather than crashing on a missing element
        if not (title and link and description):
            continue
        articles.append({
            "title": title.get_text(strip=True),
            "link": link["href"],
            "description": description.get_text(strip=True),
        })
    return articles
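
# For reference, scrape_web_content() assumes markup shaped roughly like the
# following (a hypothetical example, inferred from the selectors above):
#
#   <div class="article">
#       <h2>Headline</h2>
#       <a href="https://example.com/story">Read more</a>
#       <p>One-paragraph summary.</p>
#   </div>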


def main(search_terms):
    portfolio_data = {"news_articles": [], "interviews": [], "podcasts": []}

    # Fetch articles from NewsAPI
    print("Fetching news articles...")
    news_articles = fetch_news_api(search_terms, NEWS_API_KEY)
    portfolio_data["news_articles"].extend(news_articles)
    time.sleep(DELAY)

    # Fetch data from Google Custom Search API
    print("Fetching Google search results...")
    try:
        google_results = fetch_google_search(search_terms, GOOGLE_API_KEY, GOOGLE_CX_ID)
        portfolio_data["interviews"].extend(google_results)
    except requests.HTTPError as e:
        print(f"Failed to fetch Google results: {e}")
    time.sleep(DELAY)

    # Example scrape (customised for a specific site structure); note this
    # takes a URL, not the search terms
    print("Scraping additional sources...")
    additional_articles = scrape_web_content(SCRAPE_URL)
    portfolio_data["news_articles"].extend(additional_articles)
    time.sleep(DELAY)

    # Save results to a JSON file
    with open(RESULTS_FILE, "w") as file:
        json.dump(portfolio_data, file, indent=4)

    print("Data collection complete. Results saved to", RESULTS_FILE)


if __name__ == "__main__":
    main(SCHOLAR_NAME)
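
# Sketch of the saved JSON's shape. NewsAPI article fields and Custom Search
# item fields are passed through unmodified, so the exact keys depend on those
# APIs; "podcasts" is declared but never populated by the current fetchers.
#
# {
#     "news_articles": [{"title": ..., "description": ..., "url": ..., ...}],
#     "interviews": [{"title": ..., "link": ..., "snippet": ..., ...}],
#     "podcasts": []
# }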