-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfansidedScrape.py
69 lines (56 loc) · 2.49 KB
/
fansidedScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import requests
from bs4 import BeautifulSoup
def scrape_website(URL):
try:
# Send a GET request to the URL and check if the response is successful
page = requests.get(URL)
page.raise_for_status()
# Create soup object, set results to desired elements
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find_all("a", {"class":"articleLink_1iblg8d"})
#print(results)
# Set up lists to store scraped data
titles = []
urls = []
imageURLS = []
blurbs = []
# Loop through the sub elements, extract relevant data, add to arrays
for result in results:
title_element = result.get("title")
if title_element:
titles.append(title_element)
print(title_element)
link = result.get('href')
if link:
urls.append(link)
print(link)
article_page = requests.get(link)
article_soup = BeautifulSoup(article_page.content, "html.parser")
# Find the picture tag with the specific class
picture_tag = article_soup.find("picture", class_="base_1emrqjj")
image_url = None
if picture_tag:
# Find the nested <img> tag within <picture>
img_tag = picture_tag.find("img")
if img_tag and img_tag.has_attr('src'):
image_url = img_tag['src']
print(image_url)
imageURLS.append(image_url)
# Extract the blurb (article description)
blurb_tag = article_soup.find("meta", {"name": "description"})
blurb = blurb_tag['content'] if blurb_tag else "No blurb found"
print(blurb)
blurbs.append(blurb)
return titles, urls, imageURLS, blurbs
except requests.exceptions.RequestException as e:
print(f"An error occured: {e}")
return None
URL1 = "https://insidetheiggles.com/philadelphia-eagles-news/"
URL2 = "https://thesixersense.com/philadelphia-76ers-news/"
URL3 = "https://thatballsouttahere.com/philadelphia-phillies-news/"
URL4 = "https://broadstreetbuzz.com/philadelphia-flyers-news/"
titles5, urls5, imageURLS5, blurbs5 = scrape_website(URL1)
titles5a, urls5a, imageURLS5a, blurbs5a = scrape_website(URL2)
titles5b, urls5b, blurbs5b, imageURLS5b = scrape_website(URL3)
titles5c, urls5c, imageURLS5c, blurbs5c = scrape_website(URL4)
# print(titles5[0])