ENH: Use IEX Trading data instead of pandas-datareader (quantopian#2031)
* ENH: Use IEX Trading data instead of pandas-datareader

* MAINT: Add attribution for IEX
Freddie Vargus authored and tibkiss committed Apr 7, 2018
1 parent 6d25653 commit 2e3cecb
Showing 3 changed files with 65 additions and 192 deletions.
2 changes: 1 addition & 1 deletion zipline/algorithm.py
@@ -548,7 +548,7 @@ def _create_benchmark_source(self):
         else:
             benchmark_asset = None
             # get benchmark info from trading environment, which defaults to
-            # downloading data from Yahoo.
+            # downloading data from IEX Trading.
             benchmark_returns = self.trading_environment.benchmark_returns
         return BenchmarkSource(
             benchmark_asset=benchmark_asset,
37 changes: 20 additions & 17 deletions zipline/data/benchmarks.py
@@ -12,29 +12,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
+
 import pandas as pd
+import requests

-from zipline.utils.calendars import get_calendar
-import pandas_datareader.data as web
-

-def get_benchmark_returns(symbol, start_date, end_date):
+def get_benchmark_returns(symbol):
     """
-    Get a Series of benchmark returns from Google finance.
+    Get a Series of benchmark returns from IEX associated with `symbol`.
+    Default is `SPY`.

-    Returns a Series with returns from (start_date, end_date].
+    Parameters
+    ----------
+    symbol : str
+        Benchmark symbol for which we're getting the returns.

-    start_date is **not** included because we need the close from day N - 1 to
-    compute the returns for day N.
+    The data is provided by IEX (https://iextrading.com/), and we can
+    get up to 5 years worth of data.
     """
-    df = web.DataReader(symbol, 'google', start_date, end_date)
-    df.index = df.index.tz_localize('UTC')
+    r = requests.get(
+        'https://api.iextrading.com/1.0/stock/{}/chart/5y'.format(symbol)
+    )
+    data = json.loads(r.text)

-    calendar = get_calendar("NYSE")
-    start_index = calendar.all_sessions.searchsorted(start_date)
-    end_index = calendar.all_sessions.searchsorted(end_date)
+    df = pd.DataFrame(data)

-    # fill price data for missing dates
-    df = df["Close"].reindex(calendar.all_sessions[start_index:end_index],
-                             method='ffill')
+    df.index = pd.DatetimeIndex(df['date'])
+    df = df['close']

-    return df.pct_change(1).iloc[1:]
+    return df.sort_index().tz_localize('UTC').pct_change(1).iloc[1:]
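Read end to end, the additions above amount to a short, self-contained fetch path. A runnable consolidation of the new code (assuming the IEX `chart/5y` endpoint is reachable and returns a JSON list of daily bars with `date` and `close` fields, as the commit expects):

```python
import json

import pandas as pd
import requests


def get_benchmark_returns(symbol):
    # Pull up to five years of daily bars for `symbol` from IEX Trading.
    r = requests.get(
        'https://api.iextrading.com/1.0/stock/{}/chart/5y'.format(symbol)
    )
    data = json.loads(r.text)

    # Index the close prices by date and keep only the close column.
    df = pd.DataFrame(data)
    df.index = pd.DatetimeIndex(df['date'])
    df = df['close']

    # Daily percent returns; drop the first row, which has no prior close.
    return df.sort_index().tz_localize('UTC').pct_change(1).iloc[1:]


returns = get_benchmark_returns('SPY')  # the default benchmark symbol
```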
218 changes: 44 additions & 174 deletions zipline/data/loader.py
@@ -13,13 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from collections import OrderedDict

 import logbook
 import pandas as pd
-from pandas_datareader.data import DataReader
-import pytz
-from six import iteritems
 from six.moves.urllib_error import HTTPError

 from .benchmarks import get_benchmark_returns
@@ -28,9 +24,9 @@
     cache_root,
     data_root,
 )
-from ..utils.deprecate import deprecated
 from zipline.utils.calendars import get_calendar


 logger = logbook.Logger('Loader')

# Mapping from index symbol to appropriate bond data
@@ -97,7 +93,7 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
     Load benchmark returns and treasury yield curves for the given calendar and
     benchmark symbol.

-    Benchmarks are downloaded as a Series from Yahoo Finance. Treasury curves
+    Benchmarks are downloaded as a Series from IEX Trading. Treasury curves
     are US Treasury Bond rates and are downloaded from 'www.federalreserve.gov'
     by default. For Canadian exchanges, a loader for Canadian bonds from the
     Bank of Canada is also available.
@@ -115,8 +111,8 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
         A calendar of trading days. Also used for determining what cached
         dates we should expect to have cached. Defaults to the NYSE calendar.
     bm_symbol : str, optional
-        Symbol for the benchmark index to load. Defaults to 'SPY', the Google
-        ticker for the SPDR S&P 500 ETF.
+        Symbol for the benchmark index to load. Defaults to 'SPY', the ticker
+        for the S&P 500, provided by IEX Trading.

     Returns
     -------
@@ -139,21 +135,8 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
     first_date = trading_days[0]
     now = pd.Timestamp.utcnow()

-    # We expect to have benchmark and treasury data that's current up until
-    # **two** full trading days prior to the most recently completed trading
-    # day.
-    # Example:
-    # On Thu Oct 22 2015, the previous completed trading day is Wed Oct 21.
-    # However, data for Oct 21 doesn't become available until the early morning
-    # hours of Oct 22. This means that there are times on the 22nd at which we
-    # cannot reasonably expect to have data for the 21st available. To be
-    # conservative, we instead expect that at any time on the 22nd, we can
-    # download data for Tuesday the 20th, which is two full trading days prior
-    # to the date on which we're running a test.
-
-    # We'll attempt to download new data if the latest entry in our cache is
-    # before this date.
-    last_date = trading_days[trading_days.get_loc(now, method='ffill') - 2]
+    # we will fill missing benchmark data through latest trading date
+    last_date = trading_days[trading_days.get_loc(now, method='ffill')]

     br = ensure_benchmark_data(
         bm_symbol,
@@ -172,6 +155,12 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
         now,
         environ,
     )
+
+    # combine dt indices and reindex using ffill then bfill
+    all_dt = br.index.union(tc.index)
+    br = br.reindex(all_dt, method='ffill').fillna(method='bfill')
+    tc = tc.reindex(all_dt, method='ffill').fillna(method='bfill')
+
     benchmark_returns = br[br.index.slice_indexer(first_date, last_date)]
     treasury_curves = tc[tc.index.slice_indexer(first_date, last_date)]
     return benchmark_returns, treasury_curves
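The union/reindex step added above puts the benchmark and treasury series on one shared index before slicing. A small illustration of the same recipe on made-up dates and values:

```python
import pandas as pd

# Stand-ins for benchmark returns (br) and treasury curves (tc) whose
# date indices do not line up exactly.
br = pd.Series([0.01, -0.02],
               index=pd.to_datetime(['2018-01-02', '2018-01-04']))
tc = pd.Series([2.4, 2.5],
               index=pd.to_datetime(['2018-01-03', '2018-01-04']))

# Union the indices, forward-fill gaps, then back-fill anything that
# precedes a series' first observation, as the diff does.
all_dt = br.index.union(tc.index)
br = br.reindex(all_dt, method='ffill').fillna(method='bfill')
tc = tc.reindex(all_dt, method='ffill').fillna(method='bfill')

print(tc['2018-01-02'])  # 2.4, back-filled from the first observation
```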
@@ -215,20 +204,28 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day,

     # If no cached data was found or it was missing any dates then download the
     # necessary data.
-    logger.info('Downloading benchmark data for {symbol!r}.', symbol=symbol)
+    logger.info(
+        ('Downloading benchmark data for {symbol!r} '
+         'from {first_date} to {last_date}'),
+        symbol=symbol,
+        first_date=first_date - trading_day,
+        last_date=last_date
+    )

     try:
-        data = get_benchmark_returns(
-            symbol,
-            first_date - trading_day,
-            last_date,
-        )
+        data = get_benchmark_returns(symbol)
         data.to_csv(get_data_filepath(filename, environ))
     except (OSError, IOError, HTTPError):
-        logger.exception('failed to cache the new benchmark returns')
+        logger.exception('Failed to cache the new benchmark returns')
+        raise
     if not has_data_for_dates(data, first_date, last_date):
-        logger.warn("Still don't have expected data after redownload!")
+        logger.warn(
+            ("Still don't have expected benchmark data for {symbol!r} "
+             "from {first_date} to {last_date} after redownload!"),
+            symbol=symbol,
+            first_date=first_date - trading_day,
+            last_date=last_date
+        )
     return data
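The expanded log calls rely on logbook's format-style messages: the keyword arguments are interpolated into the template when the record is handled. A minimal standalone example of that pattern (the handler setup and the date values are illustrative):

```python
import sys

import logbook

# Send records to stdout so the example is visible when run directly.
logbook.StreamHandler(sys.stdout).push_application()
logger = logbook.Logger('Loader')

logger.info(
    ('Downloading benchmark data for {symbol!r} '
     'from {first_date} to {last_date}'),
    symbol='SPY',
    first_date='2013-04-06',
    last_date='2018-04-06',
)
```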


Expand Down Expand Up @@ -271,15 +268,27 @@ def ensure_treasury_data(symbol, first_date, last_date, now, environ=None):

     # If no cached data was found or it was missing any dates then download the
     # necessary data.
-    logger.info('Downloading treasury data for {symbol!r}.', symbol=symbol)
+    logger.info(
+        ('Downloading treasury data for {symbol!r} '
+         'from {first_date} to {last_date}'),
+        symbol=symbol,
+        first_date=first_date,
+        last_date=last_date
+    )

     try:
         data = loader_module.get_treasury_data(first_date, last_date)
         data.to_csv(get_data_filepath(filename, environ))
     except (OSError, IOError, HTTPError):
         logger.exception('failed to cache treasury data')
     if not has_data_for_dates(data, first_date, last_date):
-        logger.warn("Still don't have expected data after redownload!")
+        logger.warn(
+            ("Still don't have expected treasury data for {symbol!r} "
+             "from {first_date} to {last_date} after redownload!"),
+            symbol=symbol,
+            first_date=first_date,
+            last_date=last_date
+        )
     return data
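Both `ensure_benchmark_data` and `ensure_treasury_data` follow the same cache-or-refetch shape that these hunks adjust. A condensed sketch of that pattern under simplifying assumptions: `fetch` stands in for `get_benchmark_returns` or `get_treasury_data`, the index comparison stands in for `has_data_for_dates`, and `first_date`/`last_date` are assumed to be tz-aware UTC timestamps:

```python
import os

import pandas as pd


def ensure_data(fetch, path, first_date, last_date):
    # Serve from the CSV cache when it already covers the requested range.
    if os.path.exists(path):
        cached = pd.read_csv(path, index_col=0, header=None).iloc[:, 0]
        cached.index = pd.to_datetime(cached.index, utc=True)
        if cached.index[0] <= first_date and cached.index[-1] >= last_date:
            return cached

    # Otherwise refetch, refresh the cache, and flag short coverage
    # (the real code logs a warning instead of printing).
    data = fetch()
    data.to_csv(path, header=False)
    if data.index[0] > first_date or data.index[-1] < last_date:
        print("Still don't have expected data after redownload!")
    return data
```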


@@ -297,7 +306,8 @@ def _load_cached_data(filename, first_date, last_date, now, resource_name,
     # yet, so don't try to read from 'path'.
     if os.path.exists(path):
         try:
-            data = from_csv(path).tz_localize('UTC')
+            data = from_csv(path)
+            data.index = data.index.to_datetime().tz_localize('UTC')
             if has_data_for_dates(data, first_date, last_date):
                 return data
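The `_load_cached_data` change above splits the cache read into two steps because a CSV round trip loses the index's datetime type: the dates come back as plain strings and must be rebuilt before `tz_localize`. A small demonstration of that round trip (the file name is arbitrary, and `pd.DatetimeIndex` is used here in place of the older `Index.to_datetime` the diff calls):

```python
import pandas as pd

# Cache a series keyed by date strings, as a CSV on disk stores it.
s = pd.Series([0.01, -0.02], index=['2018-01-02', '2018-01-03'])
s.to_csv('benchmark_cache.csv', header=False)

# Reloaded, the index is plain strings; rebuild it as UTC datetimes
# before any date-range checks run.
data = pd.read_csv('benchmark_cache.csv', index_col=0, header=None).iloc[:, 0]
data.index = pd.DatetimeIndex(data.index).tz_localize('UTC')
assert data.index.tz is not None
```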

@@ -331,146 +341,6 @@ def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
return None


-def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
-    """Load closing prices from yahoo finance.
-    :Optional:
-        indexes : dict (Default: {'SPX': 'SPY'})
-            Financial indexes to load.
-        stocks : list (Default: ['AAPL', 'GE', 'IBM', 'MSFT',
-                  'XOM', 'AA', 'JNJ', 'PEP', 'KO'])
-            Stock closing prices to load.
-        start : datetime (Default: datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc))
-            Retrieve prices from start date on.
-        end : datetime (Default: datetime(2002, 1, 1, 0, 0, 0, 0, pytz.utc))
-            Retrieve prices until end date.
-    :Note:
-        This is based on code presented in a talk by Wes McKinney:
-        http://wesmckinney.com/files/20111017/notebook_output.pdf
-    """
-    assert indexes is not None or stocks is not None, """
-must specify stocks or indexes"""

-    if start is None:
-        start = pd.datetime(1990, 1, 1, 0, 0, 0, 0, pytz.utc)

-    if start is not None and end is not None:
-        assert start < end, "start date is later than end date."

-    data = OrderedDict()

-    if stocks is not None:
-        for stock in stocks:
-            logger.info('Loading stock: {}'.format(stock))
-            stock_pathsafe = stock.replace(os.path.sep, '--')
-            cache_filename = "{stock}-{start}-{end}.csv".format(
-                stock=stock_pathsafe,
-                start=start,
-                end=end).replace(':', '-')
-            cache_filepath = get_cache_filepath(cache_filename)
-            if os.path.exists(cache_filepath):
-                stkd = pd.DataFrame.from_csv(cache_filepath)
-            else:
-                stkd = DataReader(stock, 'yahoo', start, end).sort_index()
-                stkd.to_csv(cache_filepath)
-            data[stock] = stkd

-    if indexes is not None:
-        for name, ticker in iteritems(indexes):
-            logger.info('Loading index: {} ({})'.format(name, ticker))
-            stkd = DataReader(ticker, 'yahoo', start, end).sort_index()
-            data[name] = stkd

-    return data


-def load_from_yahoo(indexes=None,
-                    stocks=None,
-                    start=None,
-                    end=None,
-                    adjusted=True):
-    """
-    Loads price data from Yahoo into a dataframe for each of the indicated
-    assets. By default, 'price' is taken from Yahoo's 'Adjusted Close',
-    which removes the impact of splits and dividends. If the argument
-    'adjusted' is False, then the non-adjusted 'close' field is used instead.
-    :param indexes: Financial indexes to load.
-    :type indexes: dict
-    :param stocks: Stock closing prices to load.
-    :type stocks: list
-    :param start: Retrieve prices from start date on.
-    :type start: datetime
-    :param end: Retrieve prices until end date.
-    :type end: datetime
-    :param adjusted: Adjust the price for splits and dividends.
-    :type adjusted: bool
-    """
-    data = _load_raw_yahoo_data(indexes, stocks, start, end)
-    if adjusted:
-        close_key = 'Adj Close'
-    else:
-        close_key = 'Close'
-    df = pd.DataFrame({key: d[close_key] for key, d in iteritems(data)})
-    df.index = df.index.tz_localize(pytz.utc)
-    return df


-@deprecated(
-    'load_bars_from_yahoo is deprecated, please register a'
-    ' yahoo_equities data bundle instead',
-)
-def load_bars_from_yahoo(indexes=None,
-                         stocks=None,
-                         start=None,
-                         end=None,
-                         adjusted=True):
-    """
-    Loads data from Yahoo into a panel with the following
-    column names for each indicated security:
-        - open
-        - high
-        - low
-        - close
-        - volume
-        - price
-    Note that 'price' is Yahoo's 'Adjusted Close', which removes the
-    impact of splits and dividends. If the argument 'adjusted' is True, then
-    the open, high, low, and close values are adjusted as well.
-    :param indexes: Financial indexes to load.
-    :type indexes: dict
-    :param stocks: Stock closing prices to load.
-    :type stocks: list
-    :param start: Retrieve prices from start date on.
-    :type start: datetime
-    :param end: Retrieve prices until end date.
-    :type end: datetime
-    :param adjusted: Adjust open/high/low/close for splits and dividends.
-        The 'price' field is always adjusted.
-    :type adjusted: bool
-    """
-    data = _load_raw_yahoo_data(indexes, stocks, start, end)
-    panel = pd.Panel(data)
-    # Rename columns
-    panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price']
-    panel.major_axis = panel.major_axis.tz_localize(pytz.utc)
-    # Adjust data
-    if adjusted:
-        adj_cols = ['open', 'high', 'low', 'close']
-        for ticker in panel.items:
-            ratio = (panel[ticker]['price'] / panel[ticker]['close'])
-            ratio_filtered = ratio.fillna(0).values
-            for col in adj_cols:
-                panel[ticker][col] *= ratio_filtered
-    return panel


 def load_prices_from_csv(filepath, identifier_col, tz='UTC'):
     data = pd.read_csv(filepath, index_col=identifier_col)
     data.index = pd.DatetimeIndex(data.index, tz=tz)
