-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scrapper.py
68 lines (41 loc) · 1.79 KB
/
Scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import project_helper
from bs4 import BeautifulSoup
import pandas as pd
import tqdm
cik_lookup = {
'AMZN': '0001018724',
'BMY': '0000014272',
'CNP': '0001130310',
'CVX': '0000093410',
'FL': '0000850209',
'FRT': '0000034903',
'HON': '0000773840'}
sec_api = project_helper.SecAPI()
def get_sec_data(cik, doc_type, start=0, count=60):
newest_pricing_data = pd.to_datetime('2018-01-01')
rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
'&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
.format(cik, doc_type, start, count)
sec_data = sec_api.get(rss_url)
feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
entries = [
(
entry.content.find('filing-href').getText(),
entry.content.find('filing-type').getText(),
entry.content.find('filing-date').getText())
for entry in feed.find_all('entry', recursive=False)
if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
return entries
example_ticker = 'AMZN'
sec_data = {}
for ticker, cik in cik_lookup.items():
sec_data[ticker] = get_sec_data(cik, '10-K')
print(sec_data[example_ticker][:5])
raw_fillings_by_ticker = {}
for ticker, data in sec_data.items():
raw_fillings_by_ticker[ticker] = {}
for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
if (file_type == '10-K'):
file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
raw_fillings_by_ticker[ticker][file_date] = sec_api.get(file_url)
print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))