# dataprocessing.py
import json
from random import randrange  # reserved for the random charity matching noted in call_charity_api
from urllib.parse import urlparse

import flask
import requests
from bs4 import BeautifulSoup

app = flask.Flask(__name__)

# most recent charity search keyword, shared between the /search and /charities routes
keyword = ""
def gen_scraper(url):
    """Fetch a page and return a (BeautifulSoup, domain) tuple, or None on failure."""
    HEADERS = {'User-Agent':
               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
               'Accept-Language': 'en-US, en;q=0.5'}
    # HTTP request for the site
    webpage = requests.get(url, headers=HEADERS)
    if webpage.status_code >= 400:  # any 4xx/5xx response means the page was blocked or unavailable
        print("Page was blocked.")
        return None
    soup = BeautifulSoup(webpage.content, "lxml")
    # obtain the domain of the website
    domain = urlparse(url).netloc
    return (soup, domain)
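

# Usage sketch (illustrative; assumes network access and that the URL resolves):
#   result = gen_scraper("https://www.amazon.com/dp/B000NPYY04")
#   if result is not None:
#       soup, domain = result  # domain would be "www.amazon.com" here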
def company_scraper(soup, domain):
    """Extract the product and company names from a parsed product page.

    Returns a dict with 'product', 'company', and 'domain' keys ('NA' for any
    field that could not be scraped), or None if the site is not supported.
    """
    amazon_attributes = {"product_attr": ["span", {"id": 'productTitle'}],
                         "company_attr": ["a", {"id": 'bylineInfo'}]}
    walmart_attributes = {"product_attr": ["h1", {"itemprop": 'name'}],
                          "company_attr": ["a", {"class": 'bg-transparent bn lh-solid pa0 sans-serif tc underline inline-button mid-gray pointer f6'}]}
    if domain == "www.amazon.com":
        attributes_to_use = amazon_attributes
    elif domain == "www.walmart.com":
        attributes_to_use = walmart_attributes
    else:
        print("Website not yet supported.")
        return None
    product = {}
    # get name of the product
    try:
        prod_title = soup.find(attributes_to_use["product_attr"][0],
                               attrs=attributes_to_use["product_attr"][1])
        prod_title_val = prod_title.string
        prod_title_string = prod_title_val.strip().replace(',', '')
    except AttributeError:
        prod_title_string = "NA"
    # get name of the company
    try:
        company_title = soup.find(attributes_to_use["company_attr"][0],
                                  attrs=attributes_to_use["company_attr"][1])
        company_title_val = company_title.string
        company_title_string = company_title_val.strip().replace(",", '')
    except AttributeError:
        company_title_string = "NA"
    # get metadata about the product
    # try:
    #     metadata_formatted_list = []
    #     metadata_list = soup.find_all(attributes_to_use["metadata"][0],
    #                                   attrs=attributes_to_use["metadata"][1])
    #     for m in metadata_list:
    #         m.string
    #         metadata_formatted_list.append(m)
    # except AttributeError:
    #     metadata_formatted_list = []
    product.update({"product": prod_title_string})
    product.update({"company": company_title_string})
    product.update({"domain": domain})
    # product.update({"metadata": metadata_formatted_list})
    return product
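

# The returned dict has this shape (values are illustrative, not real output):
#   {"product": "Fish Oil 1200 mg", "company": "Nature's Bounty", "domain": "www.amazon.com"}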
def api_request_verification(url):
    """GET a JSON API endpoint and return the parsed response, or None on an HTTP error."""
    try:
        request = requests.get(url)
        # requests does not raise HTTPError on its own; ask it to check the status code
        request.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e.response.text)
        return None
    response_json = request.json()
    # formatted_response = json.dumps(response_json, indent=2)
    return response_json
def call_disaster_api(json_obj):
    if json_obj is None:
        print("No JSON object.")
        return None
    formatted_json = json.dumps(json_obj, indent=2)
    print(formatted_json)
def call_charity_api(json_obj):
    # printing to test for values
    formatted_json = json.dumps(json_obj, indent=2)
    print(formatted_json)
    # add links, urls, etc.
    # choose at random which should be matched


def search_charities(url):
    ...  # stub: not yet implemented
def call_esg_api(domain):
    """Return hard-coded ESG risk scores for the supported domains (Amazon, Walmart)."""
    esg_scores = {}
    if domain == "www.amazon.com":
        esg_scores = {'total_risk': 30,
                      'environment_risk': 5.6,
                      'social_risk': 14.8,
                      'governance_risk': 9.9,
                      'string': 'high'}
        return esg_scores
    elif domain == "www.walmart.com":
        esg_scores = {'total_risk': 24.6,
                      'environment_risk': 4.4,
                      'social_risk': 13.9,
                      'governance_risk': 6.3,
                      'string': 'medium'}
        return esg_scores
    else:
        print("Company not supported yet.")
        return None
    # esg_reports = {}
    # domain_clean = domain.replace('www.', '')  # not general enough
    # domain_clean = domain_clean.replace('.com', '')  # not general enough
    # # making a call to the api
    # esg_request_url1 = "https://tf689y3hbj.execute-api.us-east-1.amazonaws.com/prod/authorization/search?q="
    # esg_request_url2 = "&token=908167892ef856ccda3872036d98ce10"
    # # first trying with the specific vendor of the product
    # try:
    #     r = requests.get(esg_request_url1 + company + esg_request_url2)
    # except requests.exceptions.HTTPError as e:
    #     print(e.response.text)
    # # trying with the e-commerce host (i.e. amazon) if the company has no ESG rating
    # try:
    #     r = requests.get(esg_request_url1 + domain_clean + esg_request_url2)
    # except requests.exceptions.HTTPError as e:
    #     print(e.response.text)
    #     return None
    # # if a successful call is made
    # response_json = r.json()
    # esg_reports.add()  # total, env, social, gov
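

# Usage sketch (the scores are the hard-coded placeholders defined above):
#   esg = call_esg_api("www.walmart.com")  # -> {'total_risk': 24.6, ..., 'string': 'medium'}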
# implementing routes
@app.route("/products", methods=["POST"])
def create_product_info():
    with open("/Users/karfania/Downloads/user-info.json", 'r') as file1:
        text = json.load(file1)
    url = text["url"]
    scrape_tuple = gen_scraper(url)
    product = company_scraper(scrape_tuple[0], scrape_tuple[1])
    print(product['product'])
    print(product['company'])
    print(product['domain'])
    return flask.jsonify({'product': product['product'],
                          'company': product['company'],
                          'domain': product['domain']})
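

# Example call (hypothetical; assumes the server from main() is running and the
# hard-coded user-info.json path above exists):
#   requests.post("http://localhost:8080/products")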
@app.route("/disasters", methods=["POST"])
def create_disaster_info():
natural_disasters = api_request_verification("https://api.reliefweb.int/v1/reports?appname=apidoc&limit=10")
return natural_disasters
@app.route("/search", methods=["GET"])
def get_charity_search():
keyword = flask.request.args.get('keyword')
@app.route("/charities", methods=["POST"])
def create_charity_info():
if not keyword == "":
charity_list = api_request_verification("https://partners.every.org/v0.2/browse/" + keyword + "?apiKey=420cbc91eca55c118a8c839f968c7af9")
if charity_list == None:
charity_list = api_request_verification("https://partners.every.org/v0.2/browse/environment?apiKey=420cbc91eca55c118a8c839f968c7af9")
else:
charity_list = api_request_verification("https://partners.every.org/v0.2/browse/environment?apiKey=420cbc91eca55c118a8c839f968c7af9")
return charity_list
@app.route("/esg", methods=["POST"])
def generate_esg_risk():
file1 = open("/Users/karfania/Downloads/user-info.json", 'r')
text = json.load(file1)
url = text["url"]
domain = urlparse(url).netloc
esg = call_esg_api(domain)
return flask.jsonify({'total_risk': esg['total_risk'],
'environment_risk': esg['environment_risk'],
'social_risk': esg['social_risk'],
'governance_risk': esg['governance_risk'],
'string': esg['string']})
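

# Example calls to the remaining routes (hypothetical; the server from main()
# must be running):
#   requests.get("http://localhost:8080/search", params={"keyword": "water"})
#   requests.post("http://localhost:8080/charities")
#   requests.post("http://localhost:8080/esg")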
def main():
    # esg_rating = call_esg_api(product.get(company), product.get(domain))
    # # upon opening the extension, call the donation and disaster APIs
    # natural_disasters = api_request_verification("https://api.reliefweb.int/v1/reports?appname=apidoc&limit=10")
    # call_disaster_api(natural_disasters)
    # # disaster_matching(natural_disasters) -- later
    charity_list = api_request_verification("https://partners.every.org/v0.2/browse/environment?apiKey=420cbc91eca55c118a8c839f968c7af9")
    call_charity_api(charity_list)
    # after the above 'pre-processing':
    # imagine the extension is at a url for a product on amazon
    scrape_tuple = gen_scraper("https://www.amazon.com/Natures-Bounty-Supplement-Supporting-Cardiovascular/dp/B000NPYY04/ref=zg-bs_6943343011_sccl_2/147-4502840-2384645?pd_rd_w=xMGu7&content-id=amzn1.sym.7cc5d8c9-0275-49fe-af63-f984f7dc4330&pf_rd_p=7cc5d8c9-0275-49fe-af63-f984f7dc4330&pf_rd_r=R7923MPZV4JSFGXK22YK&pd_rd_wg=lBV2i&pd_rd_r=e91f94bd-f44a-41c4-9ce5-48cd3b0a0e70&pd_rd_i=B000NPYY04&psc=1")
    product = company_scraper(scrape_tuple[0], scrape_tuple[1])
    # start the server last: app.run() blocks, so anything placed after it never executes
    app.run(host="localhost", port=8080)


if __name__ == '__main__':
    main()