ENH: Use IEX Trading data instead of pandas-datareader (quantopian#2031)
* ENH: Use IEX Trading data instead of pandas-datareader

* MAINT: Add attribution for IEX
Freddie Vargus authored and tibkiss committed Apr 7, 2018
1 parent 6d25653 commit 2e3cecb
Showing 3 changed files with 65 additions and 192 deletions.
2 changes: 1 addition & 1 deletion zipline/algorithm.py
@@ -548,7 +548,7 @@ def _create_benchmark_source(self):
         else:
             benchmark_asset = None
             # get benchmark info from trading environment, which defaults to
-            # downloading data from Yahoo.
+            # downloading data from IEX Trading.
             benchmark_returns = self.trading_environment.benchmark_returns
         return BenchmarkSource(
             benchmark_asset=benchmark_asset,
37 changes: 20 additions & 17 deletions zipline/data/benchmarks.py
@@ -12,29 +12,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
+
 import pandas as pd
+import requests

-from zipline.utils.calendars import get_calendar
-import pandas_datareader.data as web
-

-def get_benchmark_returns(symbol, start_date, end_date):
+def get_benchmark_returns(symbol):
     """
-    Get a Series of benchmark returns from Google finance.
+    Get a Series of benchmark returns from IEX associated with `symbol`.
+    Default is `SPY`.

-    Returns a Series with returns from (start_date, end_date].
+    Parameters
+    ----------
+    symbol : str
+        Benchmark symbol for which we're getting the returns.

-    start_date is **not** included because we need the close from day N - 1 to
-    compute the returns for day N.
+    The data is provided by IEX (https://iextrading.com/), and we can
+    get up to 5 years worth of data.
     """
-    df = web.DataReader(symbol, 'google', start_date, end_date)
-    df.index = df.index.tz_localize('UTC')
+    r = requests.get(
+        'https://api.iextrading.com/1.0/stock/{}/chart/5y'.format(symbol)
+    )
+    data = json.loads(r.text)

-    calendar = get_calendar("NYSE")
-    start_index = calendar.all_sessions.searchsorted(start_date)
-    end_index = calendar.all_sessions.searchsorted(end_date)
+    df = pd.DataFrame(data)

-    # fill price data for missing dates
-    df = df["Close"].reindex(calendar.all_sessions[start_index:end_index],
-                             method='ffill')
+    df.index = pd.DatetimeIndex(df['date'])
+    df = df['close']

-    return df.pct_change(1).iloc[1:]
+    return df.sort_index().tz_localize('UTC').pct_change(1).iloc[1:]
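Read end to end, the additions above amount to a short, self-contained fetch path. A runnable consolidation of the new code (assuming the IEX `chart/5y` endpoint is reachable and returns a JSON list of daily bars with `date` and `close` fields, as the commit expects):

```python
import json

import pandas as pd
import requests


def get_benchmark_returns(symbol):
    # Pull up to five years of daily bars for `symbol` from IEX Trading.
    r = requests.get(
        'https://api.iextrading.com/1.0/stock/{}/chart/5y'.format(symbol)
    )
    data = json.loads(r.text)

    # Index the close prices by date and keep only the close column.
    df = pd.DataFrame(data)
    df.index = pd.DatetimeIndex(df['date'])
    df = df['close']

    # Daily percent returns; drop the first row, which has no prior close.
    return df.sort_index().tz_localize('UTC').pct_change(1).iloc[1:]


returns = get_benchmark_returns('SPY')  # the default benchmark symbol
```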
218 changes: 44 additions & 174 deletions zipline/data/loader.py
@@ -13,13 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from collections import OrderedDict

 import logbook
 import pandas as pd
-from pandas_datareader.data import DataReader
-import pytz
-from six import iteritems
 from six.moves.urllib_error import HTTPError

 from .benchmarks import get_benchmark_returns
@@ -28,9 +24,9 @@
     cache_root,
     data_root,
 )
-from ..utils.deprecate import deprecated
 from zipline.utils.calendars import get_calendar


 logger = logbook.Logger('Loader')

# Mapping from index symbol to appropriate bond data
@@ -97,7 +93,7 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
     Load benchmark returns and treasury yield curves for the given calendar and
     benchmark symbol.

-    Benchmarks are downloaded as a Series from Yahoo Finance. Treasury curves
+    Benchmarks are downloaded as a Series from IEX Trading. Treasury curves
     are US Treasury Bond rates and are downloaded from 'www.federalreserve.gov'
     by default. For Canadian exchanges, a loader for Canadian bonds from the
     Bank of Canada is also available.
@@ -115,8 +111,8 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
         A calendar of trading days. Also used for determining what cached
         dates we should expect to have cached. Defaults to the NYSE calendar.
     bm_symbol : str, optional
-        Symbol for the benchmark index to load. Defaults to 'SPY', the Google
-        ticker for the SPDR S&P 500 ETF.
+        Symbol for the benchmark index to load. Defaults to 'SPY', the ticker
+        for the S&P 500, provided by IEX Trading.

     Returns
     -------
@@ -139,21 +135,8 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
     first_date = trading_days[0]
     now = pd.Timestamp.utcnow()

-    # We expect to have benchmark and treasury data that's current up until
-    # **two** full trading days prior to the most recently completed trading
-    # day.
-    # Example:
-    # On Thu Oct 22 2015, the previous completed trading day is Wed Oct 21.
-    # However, data for Oct 21 doesn't become available until the early morning
-    # hours of Oct 22. This means that there are times on the 22nd at which we
-    # cannot reasonably expect to have data for the 21st available. To be
-    # conservative, we instead expect that at any time on the 22nd, we can
-    # download data for Tuesday the 20th, which is two full trading days prior
-    # to the date on which we're running a test.
-
-    # We'll attempt to download new data if the latest entry in our cache is
-    # before this date.
-    last_date = trading_days[trading_days.get_loc(now, method='ffill') - 2]
+    # we will fill missing benchmark data through latest trading date
+    last_date = trading_days[trading_days.get_loc(now, method='ffill')]

     br = ensure_benchmark_data(
         bm_symbol,
@@ -172,6 +155,12 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
         now,
         environ,
     )
+
+    # combine dt indices and reindex using ffill then bfill
+    all_dt = br.index.union(tc.index)
+    br = br.reindex(all_dt, method='ffill').fillna(method='bfill')
+    tc = tc.reindex(all_dt, method='ffill').fillna(method='bfill')
+
     benchmark_returns = br[br.index.slice_indexer(first_date, last_date)]
     treasury_curves = tc[tc.index.slice_indexer(first_date, last_date)]
     return benchmark_returns, treasury_curves
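The union/reindex step added above puts the benchmark and treasury series on one shared index before slicing. A small illustration of the same recipe on made-up dates and values:

```python
import pandas as pd

# Stand-ins for benchmark returns (br) and treasury curves (tc) whose
# date indices do not line up exactly.
br = pd.Series([0.01, -0.02],
               index=pd.to_datetime(['2018-01-02', '2018-01-04']))
tc = pd.Series([2.4, 2.5],
               index=pd.to_datetime(['2018-01-03', '2018-01-04']))

# Union the indices, forward-fill gaps, then back-fill anything that
# precedes a series' first observation, as the diff does.
all_dt = br.index.union(tc.index)
br = br.reindex(all_dt, method='ffill').fillna(method='bfill')
tc = tc.reindex(all_dt, method='ffill').fillna(method='bfill')

print(tc['2018-01-02'])  # 2.4, back-filled from the first observation
```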
@@ -215,20 +204,28 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day,

     # If no cached data was found or it was missing any dates then download the
     # necessary data.
-    logger.info('Downloading benchmark data for {symbol!r}.', symbol=symbol)
+    logger.info(
+        ('Downloading benchmark data for {symbol!r} '
+         'from {first_date} to {last_date}'),
+        symbol=symbol,
+        first_date=first_date - trading_day,
+        last_date=last_date
+    )

     try:
-        data = get_benchmark_returns(
-            symbol,
-            first_date - trading_day,
-            last_date,
-        )
+        data = get_benchmark_returns(symbol)
         data.to_csv(get_data_filepath(filename, environ))
     except (OSError, IOError, HTTPError):
-        logger.exception('failed to cache the new benchmark returns')
+        logger.exception('Failed to cache the new benchmark returns')
+        raise
     if not has_data_for_dates(data, first_date, last_date):
-        logger.warn("Still don't have expected data after redownload!")
+        logger.warn(
+            ("Still don't have expected benchmark data for {symbol!r} "
+             "from {first_date} to {last_date} after redownload!"),
+            symbol=symbol,
+            first_date=first_date - trading_day,
+            last_date=last_date
+        )
     return data
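The expanded log calls rely on logbook's format-style messages: the keyword arguments are interpolated into the template when the record is handled. A minimal standalone example of that pattern (the handler setup and the date values are illustrative):

```python
import sys

import logbook

# Send records to stdout so the example is visible when run directly.
logbook.StreamHandler(sys.stdout).push_application()
logger = logbook.Logger('Loader')

logger.info(
    ('Downloading benchmark data for {symbol!r} '
     'from {first_date} to {last_date}'),
    symbol='SPY',
    first_date='2013-04-06',
    last_date='2018-04-06',
)
```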


Expand Down Expand Up @@ -271,15 +268,27 @@ def ensure_treasury_data(symbol, first_date, last_date, now, environ=None):

     # If no cached data was found or it was missing any dates then download the
     # necessary data.
-    logger.info('Downloading treasury data for {symbol!r}.', symbol=symbol)
+    logger.info(
+        ('Downloading treasury data for {symbol!r} '
+         'from {first_date} to {last_date}'),
+        symbol=symbol,
+        first_date=first_date,
+        last_date=last_date
+    )

     try:
         data = loader_module.get_treasury_data(first_date, last_date)
         data.to_csv(get_data_filepath(filename, environ))
     except (OSError, IOError, HTTPError):
         logger.exception('failed to cache treasury data')
     if not has_data_for_dates(data, first_date, last_date):
-        logger.warn("Still don't have expected data after redownload!")
+        logger.warn(
+            ("Still don't have expected treasury data for {symbol!r} "
+             "from {first_date} to {last_date} after redownload!"),
+            symbol=symbol,
+            first_date=first_date,
+            last_date=last_date
+        )
     return data
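Both `ensure_benchmark_data` and `ensure_treasury_data` follow the same cache-or-refetch shape that these hunks adjust. A condensed sketch of that pattern under simplifying assumptions: `fetch` stands in for `get_benchmark_returns` or `get_treasury_data`, the index comparison stands in for `has_data_for_dates`, and `first_date`/`last_date` are assumed to be tz-aware UTC timestamps:

```python
import os

import pandas as pd


def ensure_data(fetch, path, first_date, last_date):
    # Serve from the CSV cache when it already covers the requested range.
    if os.path.exists(path):
        cached = pd.read_csv(path, index_col=0, header=None).iloc[:, 0]
        cached.index = pd.to_datetime(cached.index, utc=True)
        if cached.index[0] <= first_date and cached.index[-1] >= last_date:
            return cached

    # Otherwise refetch, refresh the cache, and flag short coverage
    # (the real code logs a warning instead of printing).
    data = fetch()
    data.to_csv(path, header=False)
    if data.index[0] > first_date or data.index[-1] < last_date:
        print("Still don't have expected data after redownload!")
    return data
```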


@@ -297,7 +306,8 @@ def _load_cached_data(filename, first_date, last_date, now, resource_name,
     # yet, so don't try to read from 'path'.
     if os.path.exists(path):
         try:
-            data = from_csv(path).tz_localize('UTC')
+            data = from_csv(path)
+            data.index = data.index.to_datetime().tz_localize('UTC')
             if has_data_for_dates(data, first_date, last_date):
                 return data
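The `_load_cached_data` change above splits the cache read into two steps because a CSV round trip loses the index's datetime type: the dates come back as plain strings and must be rebuilt before `tz_localize`. A small demonstration of that round trip (the file name is arbitrary, and `pd.DatetimeIndex` is used here in place of the older `Index.to_datetime` the diff calls):

```python
import pandas as pd

# Cache a series keyed by date strings, as a CSV on disk stores it.
s = pd.Series([0.01, -0.02], index=['2018-01-02', '2018-01-03'])
s.to_csv('benchmark_cache.csv', header=False)

# Reloaded, the index is plain strings; rebuild it as UTC datetimes
# before any date-range checks run.
data = pd.read_csv('benchmark_cache.csv', index_col=0, header=None).iloc[:, 0]
data.index = pd.DatetimeIndex(data.index).tz_localize('UTC')
assert data.index.tz is not None
```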

@@ -331,146 +341,6 @@ def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
return None


-def _load_raw_yahoo_data(indexes=None, stocks=None, start=None, end=None):
-    """Load closing prices from yahoo finance.
-    :Optional:
-        indexes : dict (Default: {'SPX': 'SPY'})
-            Financial indexes to load.
-        stocks : list (Default: ['AAPL', 'GE', 'IBM', 'MSFT',
-                  'XOM', 'AA', 'JNJ', 'PEP', 'KO'])
-            Stock closing prices to load.
-        start : datetime (Default: datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc))
-            Retrieve prices from start date on.
-        end : datetime (Default: datetime(2002, 1, 1, 0, 0, 0, 0, pytz.utc))
-            Retrieve prices until end date.
-    :Note:
-        This is based on code presented in a talk by Wes McKinney:
-        http://wesmckinney.com/files/20111017/notebook_output.pdf
-    """
-    assert indexes is not None or stocks is not None, """
-must specify stocks or indexes"""

-    if start is None:
-        start = pd.datetime(1990, 1, 1, 0, 0, 0, 0, pytz.utc)

-    if start is not None and end is not None:
-        assert start < end, "start date is later than end date."

-    data = OrderedDict()

-    if stocks is not None:
-        for stock in stocks:
-            logger.info('Loading stock: {}'.format(stock))
-            stock_pathsafe = stock.replace(os.path.sep, '--')
-            cache_filename = "{stock}-{start}-{end}.csv".format(
-                stock=stock_pathsafe,
-                start=start,
-                end=end).replace(':', '-')
-            cache_filepath = get_cache_filepath(cache_filename)
-            if os.path.exists(cache_filepath):
-                stkd = pd.DataFrame.from_csv(cache_filepath)
-            else:
-                stkd = DataReader(stock, 'yahoo', start, end).sort_index()
-                stkd.to_csv(cache_filepath)
-            data[stock] = stkd

-    if indexes is not None:
-        for name, ticker in iteritems(indexes):
-            logger.info('Loading index: {} ({})'.format(name, ticker))
-            stkd = DataReader(ticker, 'yahoo', start, end).sort_index()
-            data[name] = stkd

-    return data


-def load_from_yahoo(indexes=None,
-                    stocks=None,
-                    start=None,
-                    end=None,
-                    adjusted=True):
-    """
-    Loads price data from Yahoo into a dataframe for each of the indicated
-    assets. By default, 'price' is taken from Yahoo's 'Adjusted Close',
-    which removes the impact of splits and dividends. If the argument
-    'adjusted' is False, then the non-adjusted 'close' field is used instead.
-    :param indexes: Financial indexes to load.
-    :type indexes: dict
-    :param stocks: Stock closing prices to load.
-    :type stocks: list
-    :param start: Retrieve prices from start date on.
-    :type start: datetime
-    :param end: Retrieve prices until end date.
-    :type end: datetime
-    :param adjusted: Adjust the price for splits and dividends.
-    :type adjusted: bool
-    """
-    data = _load_raw_yahoo_data(indexes, stocks, start, end)
-    if adjusted:
-        close_key = 'Adj Close'
-    else:
-        close_key = 'Close'
-    df = pd.DataFrame({key: d[close_key] for key, d in iteritems(data)})
-    df.index = df.index.tz_localize(pytz.utc)
-    return df


-@deprecated(
-    'load_bars_from_yahoo is deprecated, please register a'
-    ' yahoo_equities data bundle instead',
-)
-def load_bars_from_yahoo(indexes=None,
-                         stocks=None,
-                         start=None,
-                         end=None,
-                         adjusted=True):
-    """
-    Loads data from Yahoo into a panel with the following
-    column names for each indicated security:
-        - open
-        - high
-        - low
-        - close
-        - volume
-        - price
-    Note that 'price' is Yahoo's 'Adjusted Close', which removes the
-    impact of splits and dividends. If the argument 'adjusted' is True, then
-    the open, high, low, and close values are adjusted as well.
-    :param indexes: Financial indexes to load.
-    :type indexes: dict
-    :param stocks: Stock closing prices to load.
-    :type stocks: list
-    :param start: Retrieve prices from start date on.
-    :type start: datetime
-    :param end: Retrieve prices until end date.
-    :type end: datetime
-    :param adjusted: Adjust open/high/low/close for splits and dividends.
-        The 'price' field is always adjusted.
-    :type adjusted: bool
-    """
-    data = _load_raw_yahoo_data(indexes, stocks, start, end)
-    panel = pd.Panel(data)
-    # Rename columns
-    panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price']
-    panel.major_axis = panel.major_axis.tz_localize(pytz.utc)
-    # Adjust data
-    if adjusted:
-        adj_cols = ['open', 'high', 'low', 'close']
-        for ticker in panel.items:
-            ratio = (panel[ticker]['price'] / panel[ticker]['close'])
-            ratio_filtered = ratio.fillna(0).values
-            for col in adj_cols:
-                panel[ticker][col] *= ratio_filtered
-    return panel


 def load_prices_from_csv(filepath, identifier_col, tz='UTC'):
     data = pd.read_csv(filepath, index_col=identifier_col)
     data.index = pd.DatetimeIndex(data.index, tz=tz)
