"""Scraper that pulls company overview and balance sheet data from WSJ market-data pages."""
import logging
from typing import Optional

import requests
from bs4 import BeautifulSoup

from common import BadResponse, parse_to_int, to_file

# Browser-like request headers sent with every call.
headers = {
    "Host": "www.wsj.com",
    "Referer": "https://www.wsj.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:45.0) Gecko/20100101 Firefox/45.0",
}

class CompanyData:
    url: str = ""
    name: str = ""
    # Overview data
    market_value = None
    shares_outstanding = None
    public_float = None
    # Balance sheet data
    assets_currency_type = None  # Will hold something like "CAD"
    net_property_plant_and_equipment = None
    total_assets = None
    total_liabilities = None
    net_goodwill = None

    def __init__(self, name: str, url: str):
        self.name = name
        self.url = url

    def get_fair_value(self) -> Optional[float]:
        """Net asset value per share: (total assets - PP&E - goodwill - total liabilities) / shares outstanding."""
        res = None
        try:
            net_asset_value = (self.total_assets
                               - self.net_property_plant_and_equipment
                               - self.net_goodwill
                               - self.total_liabilities)
            res = net_asset_value / self.shares_outstanding
        except (TypeError, ZeroDivisionError):
            # Any missing datapoint is still None, which raises TypeError in the arithmetic above.
            logging.warning("Cannot calculate fair_value of %s", self.name)
        return res

    def to_str(self):
        return str(self.__dict__)
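
# A minimal sketch of the fair-value arithmetic above, using made-up numbers
# (the figures are hypothetical, not real WSJ data):
#
#     cd = CompanyData("Example Corp", "https://www.wsj.com/market-data/quotes/XMPL")
#     cd.total_assets = 1_000_000
#     cd.net_property_plant_and_equipment = 200_000
#     cd.net_goodwill = 100_000
#     cd.total_liabilities = 400_000
#     cd.shares_outstanding = 50_000
#     cd.get_fair_value()  # (1_000_000 - 200_000 - 100_000 - 400_000) / 50_000 = 6.0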

def get_company_data(name: str, url: str) -> Optional[CompanyData]:
    """Creates a CompanyData object and tries to fill all its datapoints; returns None if essential data is not found."""
    company_data = CompanyData(name, url)
    overview_data = get_overview_data(url)
    # Check for essential overview data
    if (not overview_data["market_value"] or
            (not overview_data["shares_outstanding"] and not overview_data["public_float"])):
        logging.debug("Requested data not found on company overview %s, aborting", url)
        return None
    company_data.market_value = overview_data["market_value"]
    company_data.shares_outstanding = overview_data["shares_outstanding"]
    company_data.public_float = overview_data["public_float"]
    # Prefer shares_outstanding when both are present; public_float is only a fallback.
    if company_data.shares_outstanding is not None and company_data.public_float is not None:
        company_data.public_float = None
    balance_sheet_url = url + "/financials/annual/balance-sheet"
    balance_sheet_data = get_balance_sheet_data(balance_sheet_url)
    company_data.assets_currency_type = balance_sheet_data["assets_currency_type"]
    company_data.net_property_plant_and_equipment = balance_sheet_data["net_property_plant_and_equipment"]
    company_data.total_assets = balance_sheet_data["total_assets"]
    company_data.total_liabilities = balance_sheet_data["total_liabilities"]
    company_data.net_goodwill = balance_sheet_data["net_goodwill"]
    # Check for essential balance sheet data
    if (not company_data.net_goodwill or not company_data.total_assets or
            not company_data.net_property_plant_and_equipment or not company_data.total_liabilities):
        logging.debug("Requested data not found on balance sheet %s, aborting", balance_sheet_url)
        return None
    return company_data
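
# Hedged usage sketch; the quote URL shape is an assumption for illustration
# (real links would come from get_links_from_company_list_page below):
#
#     data = get_company_data("Example Corp", "https://www.wsj.com/market-data/quotes/XMPL")
#     if data is not None:
#         print(data.to_str(), data.get_fair_value())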

def get_company_list_page(page: int = 1):
    """Returns the raw HTML of one page of the WSJ United States company list."""
    base = "https://www.wsj.com/market-data/quotes/company-list/country/united-states/"
    url = base + str(page)
    return __get_html(url)

def get_overview_data(url: str):
    """Returns dict with shares_outstanding, public_float and market_value."""
    html = __get_html(url)
    soup = BeautifulSoup(html, features="html.parser")
    output = {
        "shares_outstanding": None,
        "public_float": None,
        "market_value": None,
    }
    # The market value sits in the second child of the WSJTheme--cr_num element.
    market_value_elems = soup.select("[class*=WSJTheme--cr_num] *")
    if len(market_value_elems) > 1:
        output["market_value"] = parse_to_int(market_value_elems[1].decode_contents())
    key_stock_data = soup.select("[class*=cr_data_field]")
    for entry in key_stock_data:
        data_label_html = entry.select_one("[class*=data_lbl]")
        data_value_html = entry.select_one("[class*=data_data]")
        if not data_label_html or not data_value_html:
            continue
        data_label = data_label_html.decode_contents().strip()
        data_value = data_value_html.decode_contents()
        if data_label == "Shares Outstanding":
            output["shares_outstanding"] = parse_to_int(data_value)
        elif data_label == "Public Float":
            output["public_float"] = parse_to_int(data_value)
    return output
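
# Expected shape of the overview dict (values hypothetical; any field that is
# missing from the page stays None):
#
#     {"shares_outstanding": 50_000_000, "public_float": 48_000_000, "market_value": 300_000_000}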

def get_balance_sheet_data(url: str):
    """Returns dict with assets_currency_type, net_property_plant_and_equipment, total_assets, total_liabilities and net_goodwill"""
    html = __get_html(url)
    soup = BeautifulSoup(html, features="html.parser")
    output = {
        "assets_currency_type": None,
        "net_property_plant_and_equipment": None,
        "total_assets": None,
        "total_liabilities": None,
        "net_goodwill": None,
    }
    # The first data table holds assets, the second holds liabilities;
    # both are needed, so bail out if either is missing.
    tables = soup.select(".cr_dataTable")
    if len(tables) < 2:
        logging.debug("Failed to get balance_sheet_data from %s", url)
        return output
    assets_table = tables[0]
    table_header = assets_table.find(class_="fiscalYr")
    if table_header is None:
        logging.debug("No fiscalYr header found on %s", url)
        return output
    # Something like "Fiscal year is November-October. All values CAD Thousands."
    table_header_str = table_header.decode_contents()
    dot_pos = table_header_str.find(".")
    # Keep the second sentence minus its trailing period: "All values CAD Thousands"
    currency_value_str = table_header_str[dot_pos + 1 : len(table_header_str) - 1]
    arr = currency_value_str.strip().split(" ")
    if len(arr) != 4:
        logging.info("Failed to get assets_currency_type and assets_amount_type from %s", url)
        return output
    output["assets_currency_type"] = arr[2]  # e.g. "CAD"
    assets_amount_type = arr[3]  # e.g. "Thousands", passed on to parse_to_int
    for row in assets_table.select("tr"):
        table_data = row.find_all("td")
        if len(table_data) == 0:
            continue
        elif "Net Property, Plant" in table_data[0].decode_contents():
            output["net_property_plant_and_equipment"] = parse_to_int(table_data[1].decode_contents(), assets_amount_type)
        elif table_data[0].decode_contents().strip() == "Total Assets":
            output["total_assets"] = parse_to_int(table_data[1].decode_contents(), assets_amount_type)
        elif table_data[0].decode_contents().strip() == "Net Goodwill":
            net_goodwill = table_data[1].decode_contents().strip()
            output["net_goodwill"] = parse_to_int(net_goodwill, assets_amount_type) if net_goodwill != "-" else None
    liabilities_table = tables[1]
    for row in liabilities_table.select("tr"):
        table_data = row.find_all("td")
        if len(table_data) == 0:
            continue
        elif "Total Liabilities" in table_data[0].decode_contents():
            output["total_liabilities"] = parse_to_int(table_data[1].decode_contents(), assets_amount_type)
            break
    return output
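
# A worked example of the fiscalYr header parsing above (header text taken from
# the comment inside the function; the indices follow from it):
#
#     s = "Fiscal year is November-October. All values CAD Thousands."
#     s[s.find(".") + 1 : len(s) - 1].strip().split(" ")
#     # -> ["All", "values", "CAD", "Thousands"]; arr[2] is the currency, arr[3] the scale.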

def get_links_from_company_list_page(page: int):
    """Returns list of dicts, each containing a name and url."""
    html = get_company_list_page(page)
    soup = BeautifulSoup(html, features="html.parser")
    company_links = soup.select(".cl-table a")
    results = [{"name": a.find(class_="cl-name").decode_contents(), "url": a["href"]}
               for a in company_links]
    return results
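
# Hypothetical result shape (name and URL invented for illustration):
#
#     [{"name": "Example Corp", "url": "https://www.wsj.com/market-data/quotes/XMPL"}, ...]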

def get_company_list_page_count():
    """Returns the total number of pages in the company list, read from the pagination widget."""
    html = get_company_list_page()
    soup = BeautifulSoup(html, features="html.parser")
    pagination_list_items = soup.select(".cl-pagination li a")
    # @todo Check if final item content == "next" for sanity
    last_index = len(pagination_list_items) - 2  # -2 because the final entry will contain "next"
    return int(pagination_list_items[last_index].decode_contents().split("-")[-1])

def __get_html(url: str):
    res = requests.get(url, allow_redirects=True, headers=headers)
    if res.status_code != 200:
        raise BadResponse(res.status_code, url)
    return res.text
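
# A minimal driver sketch, assuming the network calls succeed and the list-page
# hrefs are absolute quote URLs; nothing here is hard-coded real data:
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    for link in get_links_from_company_list_page(1):
        company = get_company_data(link["name"], link["url"])
        if company is not None:
            print(company.to_str())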