-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeedback_channels_scraper.py
91 lines (68 loc) · 2.93 KB
/
feedback_channels_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
def scrape_feedback_channels(url):
    """Fetch a portal page and report which feedback channels it offers.

    Parameters
    ----------
    url : str
        Portal address, with or without an ``http(s)://`` scheme.

    Returns
    -------
    dict | None
        Mapping of channel name ("Email", "Feedback Form", "Social Media",
        "Discussion Forum", "Like/Rate/Fav") to a 1/0 presence flag, or
        ``None`` when the HTTP request fails.
    """
    # BUG FIX: the original always prepended "http://", which mangled URLs
    # that already carried a scheme (producing "http://https://...").
    if not url.startswith(("http://", "https://")):
        url = f"http://{url}"
    try:
        # Keep the try body minimal: only the network call raises
        # RequestException; parsing errors should propagate, as before.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    return _detect_feedback_channels(response.text)


def _detect_feedback_channels(html):
    """Scan raw HTML for the five supported feedback channels.

    Pure function over the page text — no network access — so it can be
    exercised independently of ``requests``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    feedback_channels = {
        "Email": 0,
        "Feedback Form": 0,
        "Social Media": 0,
        "Discussion Forum": 0,
        "Like/Rate/Fav": 0
    }
    # BUG FIX: the original TLD class [A-Z|a-z] also matched a literal '|';
    # corrected to [A-Za-z].
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    if email_pattern.search(html):
        feedback_channels["Email"] = 1
    # Any <form> whose visible text mentions feedback/contact counts.
    for form in soup.find_all('form'):
        if any(keyword in form.get_text().lower() for keyword in ['feedback', 'contact']):
            feedback_channels["Feedback Form"] = 1
            break
    links = soup.find_all('a', href=True)
    # Links to well-known social platforms.
    social_media_domains = ['twitter.com', 'facebook.com', 'linkedin.com', 'instagram.com']
    for link in links:
        if any(domain in link['href'] for domain in social_media_domains):
            feedback_channels["Social Media"] = 1
            break
    # Links whose href hints at a discussion space.
    for link in links:
        if any(keyword in link['href'].lower() for keyword in ['forum', 'discussion', 'community']):
            feedback_channels["Discussion Forum"] = 1
            break
    # Buttons or anchors labelled like/rate/favorite.
    for button in soup.find_all(['button', 'a']):
        if any(keyword in button.get_text().lower() for keyword in ['like', 'rate', 'favorite']):
            feedback_channels["Like/Rate/Fav"] = 1
            break
    return feedback_channels
def main():
    """Read portal URLs from a spreadsheet, scrape each one, and save a summary."""
    source_path = "open_data_portals.xlsx"  # Adjust to point at your spreadsheet
    portals = pd.read_excel(source_path)

    # Collect one summary row per successfully scraped portal.
    summary_rows = []
    for portal_url in portals['Open Data Portal URL']:
        print(f"Scraping {portal_url}...")
        channels = scrape_feedback_channels(portal_url)
        if channels:
            row = {"URL": portal_url}
            row.update(channels)  # per-channel 1/0 flags, in a fixed order
            row["Total Feedback Channels"] = sum(channels.values())
            summary_rows.append(row)

    pd.DataFrame(summary_rows).to_excel("feedback_channels_summary.xlsx", index=False)
    print("Scraping complete! Results saved to 'feedback_channels_summary.xlsx'.")


if __name__ == "__main__":
    main()