-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmixpandas.py
216 lines (174 loc) · 7.18 KB
/
mixpandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
mixpandas.py
A library to request data from Mixpanel's Raw Data Export API
"""
import datetime
import hashlib
import urllib.request, urllib.parse, urllib.error
import time
try:
import json
except ImportError:
import simplejson as json
import pandas as pd
VERSION = '2.0' # Mixpanel API version; request() inserts it as a path segment of the URL
date_format = '%Y-%m-%d' # strftime pattern read_events() uses for the from_date/to_date parameters
def read_events(keys, events=None, start=None, end=None,
                where=None, bucket=None, columns=None, exclude_mp=True):
    """
    Download events from Mixpanel's Raw Data Export API and return them as a
    pandas DataFrame whose 'time' column holds datetime values.

    Parameters
    ----------
    keys : tuple containing (Mixpanel API Key, Mixpanel Secret Key)
    events : event name or list of event names to get, optional
        A single name passed as a string is wrapped in a one-element list.
        If None, no event filter is sent, so all events are downloaded.
    start : start date. String or datetime-like, default 2011-07-10
        Converted with pandas.to_datetime, which accepts a variety of
        inputs, e.g., '5/6/2013', '2013-05-06', 'May 6, 2013', or a
        datetime object. The default is the earliest start date allowed
        by the API.
    end : end date. String or datetime-like, default yesterday
        Converted in the same manner as start. The latest this date can be
        is yesterday's date, which is the default value.
    where : string, Mixpanel filter expression, optional
        See the documentation:
        https://mixpanel.com/docs/api-documentation/data-export-api#segmentation-expressions
    bucket : string, optional
        See the documentation:
        https://mixpanel.com/docs/api-documentation/displaying-mixpanel-data-to-your-users
    columns : string or list of strings, optional
        Returned DataFrame will only contain the specified parameters.
    exclude_mp : Filter out Mixpanel-specific data. default True
        Filter out event properties that begin with '$' or 'mp_'.
        These properties are automatically inserted by Mixpanel and indicate
        things like region, OS, etc.
        For more information, see:
        https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel

    Returns
    -------
    pandas DataFrame produced by _export_to_df from the raw API response.
    """
    if start is None:
        # The API's own error message reports 7/10/2011 as the earliest
        # start date it will accept.
        start = datetime.date(2011, 7, 10)
    from_date = pd.to_datetime(start).strftime(date_format)

    if end is None:
        # Yesterday is the latest end date the API allows.
        end = datetime.date.today() - datetime.timedelta(days=1)
    to_date = pd.to_datetime(end).strftime(date_format)

    # A lone event name is sent as a one-element list.
    event_filter = [events] if isinstance(events, str) else events

    payload = {'from_date': from_date, 'to_date': to_date}
    # Only parameters the caller actually supplied go into the request.
    optional = (('event', event_filter), ('where', where), ('bucket', bucket))
    payload.update(
        (name, value) for name, value in optional if value is not None
    )

    raw = request(keys, ['export'], payload, data_api=True)
    return _export_to_df(raw, columns, exclude_mp)
def _export_to_df(data, columns, exclude_mp):
    """
    Convert a raw export response into a pandas DataFrame.

    The export endpoint does not return one JSON document; it returns
    newline-separated records, each of which is valid JSON. Every record's
    'properties' dict becomes one row, with the record's 'event' name added
    under the 'event' key. Lines that are not valid JSON (including blank
    lines) are skipped.

    Parameters
    ----------
    data : bytes (or str) holding the newline-delimited response body
    columns : None, string, or iterable of strings
        Columns to keep. A single string is treated as one column name.
        The caller's object is never modified. 'time' is always included
        because it is converted below.
        If None, every property seen is used, in first-seen order.
    exclude_mp : bool
        Only consulted when columns is None: if true, properties whose
        names start with '$' or 'mp_' are left out.

    Returns
    -------
    pandas DataFrame whose 'time' column has been passed through
    datetime.datetime.fromtimestamp (i.e. local-time datetimes).

    Raises
    ------
    KeyError / TypeError if a line is valid JSON but is not a record with
    'properties' and 'event' entries.
    """
    if isinstance(data, str):
        data = data.encode('utf-8')

    # A dict is used as an insertion-ordered set so the resulting column
    # order is deterministic (first-seen) rather than arbitrary set order.
    seen = {}
    events = []
    for line in data.split(b'\n'):
        try:
            record = json.loads(line)
        except ValueError:  # Not valid JSON (e.g. the trailing blank line)
            continue
        props = record['properties']
        props['event'] = record['event']
        seen.update(dict.fromkeys(props))
        events.append(props)

    if columns is None:
        if exclude_mp:
            # '$...' and 'mp_...' properties are inserted automatically by
            # Mixpanel and clutter the real data.
            columns = [p for p in seen if not p.startswith(('$', 'mp_'))]
        else:
            # A list, not a set: DataFrame does not accept a set here.
            columns = list(seen)
    elif isinstance(columns, str):
        columns = [columns]
    else:
        # Copy so that adding 'time' never alters the caller's sequence.
        columns = list(columns)
    if 'time' not in columns:
        # Also covers empty input, where no property was ever seen.
        columns.append('time')

    df = pd.DataFrame(events, columns=columns)
    # Make time a datetime.
    df['time'] = df['time'].map(datetime.datetime.fromtimestamp)
    return df
# The code below is from Mixpanel's Python client for the data export API.
# There are only a few modifications:
# * Added the optional data_api argument
# * Changed the base URL to http://data.mixpanel.com if data_api is set
# * Made it a function that takes keys instead of a class initialized
# with the keys (this is just a personal preference)
# * Changed the maximum line width
# Mixpanel, Inc. -- http://mixpanel.com/
#
# Python API client library to consume mixpanel.com analytics data.
# https://mixpanel.com/site_media//api/v2/mixpanel.py
def request(keys, methods, params, format='json', data_api=False):
    """
    Send a signed request to the Mixpanel API and return the response.

    keys - (api_key, api_secret) pair
    methods - Sequence of methods to be joined,
    e.g. ['events', 'properties', 'values']
    will give us
    http://mixpanel.com/api/2.0/events/properties/values/
    params - Extra parameters associated with method. This dict is
    modified in place: 'api_key', 'expire', 'format' and 'sig' are
    (re)written on every call.
    format - value sent as the 'format' request parameter
    data_api - if true, use the data.mixpanel.com host and return the raw
    response bytes; otherwise parse the response body as JSON.

    Raises whatever urllib.request.urlopen raises on failure, and
    ValueError from json.loads if a non-data_api response is not JSON.
    """
    api_key, api_secret = keys
    params['api_key'] = api_key
    params['expire'] = int(time.time()) + 600  # Grant this request 10 minutes.
    params['format'] = format
    # A signature left over from an earlier call must not be hashed into
    # the new one.
    params.pop('sig', None)
    params['sig'] = hash_args(params, api_secret)

    # NOTE(review): both endpoints are plain HTTP, so the API key and the
    # signature travel unencrypted - consider HTTPS if the API supports it.
    if data_api:
        url_base = r'http://data.mixpanel.com/api'
    else:
        url_base = r'http://mixpanel.com/api'

    request_url = ('/'.join([url_base, str(VERSION)] + list(methods)) + '/?' +
                   unicode_urlencode(params))

    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request_url) as response:
        data = response.read()

    if data_api:
        return data
    return json.loads(data)
def unicode_urlencode(params):
    """
    Build a URL query string from params.

    params - dict, or any iterable of (key, value) pairs. It is only read,
    never modified.

    List values are sent as JSON-encoded strings. Unicode text is
    percent-encoded as UTF-8, which is what urllib.parse.urlencode does
    for str values by default.

    Returns the encoded query string (without a leading '?').
    """
    pairs = params.items() if isinstance(params, dict) else params
    encoded = []
    for key, value in pairs:
        if isinstance(value, list):
            value = json.dumps(value)
        encoded.append((key, value))
    return urllib.parse.urlencode(encoded)
def hash_args(args, api_secret):
    """
    Compute the request signature for a set of arguments.

    List values in args are first replaced, in place, by their JSON
    encoding. The signature is the MD5 hex digest of every 'key=value'
    pair, concatenated in sorted key order with no separator, followed by
    the secret. Everything is encoded as UTF-8; keys and values that are
    not strings are converted with str() first.
    """
    # Lists are signed in their JSON form; the caller's dict is updated so
    # that the same text is what gets sent afterwards.
    for name, value in args.items():
        if isinstance(value, list):
            args[name] = json.dumps(value)

    pieces = []
    for name in sorted(args):
        value = args[name]
        key_text = name if isinstance(name, str) else str(name)
        value_text = value if isinstance(value, str) else str(value)
        pieces.append(key_text.encode('utf-8'))
        pieces.append(b'=')
        pieces.append(value_text.encode('utf-8'))

    digest = hashlib.md5(b''.join(pieces))
    digest.update(api_secret.encode('utf-8'))
    return digest.hexdigest()