-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcryptocoinscraper.py
147 lines (118 loc) · 4.28 KB
/
cryptocoinscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
This module implements a scraper for CoinMarketCap.com.
"""
import os
import sys
import abc
import collections
import json
import logging
import datetime
import pandas as pd
from pyquery import PyQuery
import requests
# Configure the root logger to emit INFO-level records and above, and create
# the module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Record type for a single coin in one scrape. The field names double as the
# column names when a list of these records is loaded into a DataFrame.
CryptoCoin = collections.namedtuple(
    'CryptoCoin',
    (
        'name',
        'symbol',
        'price',
        'circulatingSupply',
        'percentChange24h',
        'percentChange7d',
        'marketCap',
        'volume24h',
        'datetime',
    ),
)
class Scraper(abc.ABC):
    """
    Abstract base class for a web scraper.

    This class fetches each configured URL; subclasses implement
    ``parse_html`` to turn a fetched document into a DataFrame.
    """

    # Seconds to wait on each HTTP request. Without a timeout, requests
    # will wait indefinitely on a server that never responds.
    DEFAULT_TIMEOUT = 30.0

    # Class-level fallback so subclasses that bypass ``__init__`` still
    # get a sane timeout in ``pages``.
    timeout = DEFAULT_TIMEOUT

    def __init__(self, urls: list, timeout: float = DEFAULT_TIMEOUT):
        """
        Initialize a scraper object.

        Args:
            urls (list): a list of the URLs that will be scraped.
            timeout (float): seconds to wait for each HTTP request before
                giving up. Defaults to ``DEFAULT_TIMEOUT``.
        """
        self.urls = urls
        self.timeout = timeout

    def pages(self):
        """
        Generator that iterates over all pages and yields each HTML document.

        Yields:
            The raw response body (``resp.content``) of each URL, in the
            order the URLs were given.

        Raises:
            RuntimeError: if a response comes back with a non-OK status.
        """
        for url in self.urls:
            resp = requests.get(url, timeout=self.timeout)
            if not resp.ok:
                raise RuntimeError("response code: {}".format(resp.status_code))
            yield resp.content

    @abc.abstractmethod
    def parse_html(self, html: str) -> pd.DataFrame:
        """
        For implementation by subclasses: parse an HTML document.

        Args:
            html (str): the HTML, as a string, that contains our desired data.

        Returns:
            A Pandas DataFrame with the desired entries.
        """
        raise NotImplementedError()
class CoinMarketCap(Scraper):
    """
    Scraper for the CoinMarketCap.com front page.

    The listing is read from the JSON embedded in the page's
    ``<script id="__NEXT_DATA__">`` element rather than from the rendered
    HTML table.
    """

    URL = "https://coinmarketcap.com"

    def __init__(self):
        """Initialize the scraper with the single CoinMarketCap URL."""
        super().__init__([self.URL])

    def parse_html(self, html: str) -> pd.DataFrame:
        """
        Extract the coin listing from a CoinMarketCap HTML document.

        Args:
            html (str): the HTML document to parse.

        Returns:
            A Pandas DataFrame with one row per listed coin.

        Raises:
            ValueError: if the ``__NEXT_DATA__`` script element is missing
                or empty, or its content is not valid JSON.
            KeyError: if the embedded JSON lacks the expected nesting.
        """
        pq = PyQuery(html)
        elems = pq('script#__NEXT_DATA__')
        if len(elems) == 0:
            raise ValueError("unrecognized HTML structure")
        payload = elems[0].text
        if not payload:
            # An empty <script> has text None; report it the same way as a
            # missing element instead of letting json.loads raise TypeError.
            raise ValueError("unrecognized HTML structure")
        outer_data = json.loads(payload)
        try:
            listing = outer_data['props']['initialState']['cryptocurrency']
            data = listing['listingLatest']['data']
        except KeyError as exc:
            logger.error("unrecognized structure: %s", exc)
            raise
        return self._parse_json_data(data)

    def _parse_json_data(self, data: list) -> pd.DataFrame:
        """
        Parse a list of dictionary objects, each with a structure as seen on
        CoinMarketCap.com.

        Price, percent changes, market cap and volume are taken from each
        entry's USD quote.

        Args:
            data (list): a list of dict objects (e.g. loaded from JSON)

        Returns:
            A Pandas DataFrame with one row per entry and one column per
            ``CryptoCoin`` field. Empty input yields an empty frame that
            still carries those columns.

        Raises:
            KeyError: if an entry lacks one of the expected keys.
        """
        # Take the timestamp once so every row of this scrape shares it.
        scraped_at = datetime.datetime.now()
        coins = []
        for coin in data:
            usd = coin['quote']['USD']
            coins.append(CryptoCoin(
                name=coin['name'],
                symbol=coin['symbol'],
                price=usd['price'],
                circulatingSupply=coin['circulatingSupply'],
                percentChange24h=usd['percentChange24h'],
                percentChange7d=usd['percentChange7d'],
                marketCap=usd['marketCap'],
                volume24h=usd['volume24h'],
                datetime=scraped_at,
            ))
        # Explicit columns keep the schema stable even when there are no rows.
        return pd.DataFrame(coins, columns=list(CryptoCoin._fields))

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convenience method to return a Pandas DataFrame in one line.

        Returns:
            A Pandas DataFrame of the current market data, with a fresh
            0..n-1 index across all scraped pages.
        """
        dfs = [self.parse_html(page) for page in self.pages()]
        # ignore_index avoids duplicate row labels if several pages are merged.
        return pd.concat(dfs, ignore_index=True)
# tests
# Smoke test: fetch the live page, parse it, and serialize the result to CSV.
# Exits with a "software error" status on any failure and "OK" otherwise.
if __name__ == "__main__":
    # The os.EX_* constants are only defined on Unix; fall back to their
    # conventional numeric values so the script also runs elsewhere.
    exit_ok = getattr(os, "EX_OK", 0)
    exit_software = getattr(os, "EX_SOFTWARE", 70)
    try:
        scraper = CoinMarketCap()
        for page in scraper.pages():
            df = scraper.parse_html(page)
            # path=None makes to_csv return the CSV text; the result is
            # discarded, this only checks that serialization succeeds.
            df.to_csv(None, index=False)
    except Exception as exc:
        # Top-level boundary: log with traceback, then signal failure.
        logger.exception("tests failed: %s", exc)
        sys.exit(exit_software)
    logger.info("tests passed")
    sys.exit(exit_ok)