-
Notifications
You must be signed in to change notification settings - Fork 0
/
FCA scrape.py
66 lines (49 loc) · 2.21 KB
/
FCA scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import pandas as pd
import requests
from io import BytesIO
import sqlite3
from pandas.tseries.offsets import BDay
# Function to download the Excel file from the FCA website
def download_fca_short_positions():
url = "https://www.fca.org.uk/publication/data/short-positions-daily-update.xlsx"
response = requests.get(url)
if response.status_code == 200:
return BytesIO(response.content)
else:
raise ValueError("Failed to download the data from the FCA website.")
#read the file into a pandas dataframe and format the dates
df = pd.read_excel(download_fca_short_positions(), sheet_name=1)
df['Position Date'] = pd.to_datetime(df['Position Date'], format='%d/%m/%Y')
#set dates as index
df.set_index('Position Date', inplace=True)
# Group by 'Position Holder' and 'ISIN', and resample to business days
df = df.groupby(['Position Holder', 'ISIN']).resample('B').first()
#Insert rows for dates between disclosures
df=df.drop(['Position Holder', 'ISIN'], axis=1)
df.reset_index(inplace=True)
#fill new rows with last known disclosed value up until the position is considered closed
df['Net Short Position (%)'] = df.groupby(['Position Holder', 'ISIN'])['Net Short Position (%)'].fillna(method='ffill')
# Sort the DataFrame by 'Position Holder', 'ISIN', and 'Position Date'
#df.sort_values(by=['Position Holder', 'ISIN', 'Position Date'], inplace=True)
# Group by 'ISIN' and 'Position Date' and aggregate to get total % short and total # of funds disclosed
result_df = df.groupby(['ISIN', 'Position Date']).agg({
'Net Short Position (%)': 'sum',
'Position Holder': 'nunique' # Count unique funds disclosed
}).reset_index()
# Rename columns for clarity
result_df.rename(columns={
'Net Short Position (%)': 'Total % Short',
'Position Holder': 'Total # of Funds Disclosed'
}, inplace=True)
# Store the result_df DataFrame in a SQLite database table
database_file = 'fca_short_scrape.db'
table_name = 'short_positions_summary'
# Connect to the SQLite database
conn = sqlite3.connect(database_file)
# Store the DataFrame in the SQLite database
result_df.to_sql(table_name, conn, if_exists='replace', index=False)
# Close the database connection
conn.close()