forked from RyanZotti/Self-Driving-Car
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_pysparkling_h2o_gbm.py
83 lines (60 loc) · 2.39 KB
/
train_pysparkling_h2o_gbm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from pysparkling import *
from h2o.estimators.gbm import H2OGradientBoostingEstimator as GBM
'''
This link might be helpful
http://learn.h2o.ai/content/tutorials/pysparkling/Chicago_Crime_Demo.html
'''
def remove_pandas_index_column(line):
    """Split a CSV record on commas and drop its first field.

    The first field is the index column that Pandas wrote into the CSV.

    Args:
        line: One raw CSV record. It is passed through ``str()`` before
            being split, so non-string input is tolerated.

    Returns:
        list: The remaining fields as strings, in their original order.
        A record without any comma yields an empty list.
    """
    return str(line).split(",")[1:]
def contains_target(row):
    """Report whether a row holds at least one target label.

    Used to filter out CSV header rows, which carry no target value.

    Args:
        row: Iterable of hashable field values (e.g. the list of strings
            produced by splitting a CSV line).

    Returns:
        bool: True if any element of ``row`` is exactly 'Up', 'Left' or
        'Right' (case-sensitive), otherwise False.
    """
    # isdisjoint answers the overlap question directly instead of building
    # an intersection set only to measure its length.
    return not {'Up', 'Left', 'Right'}.isdisjoint(row)
def make_float_predictors(old_line):
    """Cast every predictor in a row to float, leaving the target as-is.

    The target is the last element of the row; it is a class label (a
    string), so it is carried over unchanged.

    Args:
        old_line: Sequence whose final element is the target and whose
            preceding elements are predictor values convertible to float.

    Returns:
        list: A new list of the float predictors followed by the original
        target element. ``old_line`` itself is not modified.

    Raises:
        IndexError: If ``old_line`` is empty.
        ValueError: If a predictor cannot be parsed as a float.
    """
    predictors, target = old_line[:-1], old_line[-1]
    return [float(str(value)) for value in predictors] + [target]
# Create the H2O context that is used later to hand data over to H2O.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext provided by the PySpark shell/launcher; confirm.
hc = H2OContext(sc)

# Pull the data from S3 as an RDD of raw text lines, then:
#   1. split each line on commas and drop the leading Pandas index field,
#   2. drop the header rows, which are easy to find because they carry no
#      target value,
#   3. convert the predictor values from strings to floats (the trailing
#      target stays a string).
rdd = (
    sc.textFile("s3n://self-driving-car/data/*/predictors_and_targets.csv")
    .map(remove_pandas_index_column)
    .filter(contains_target)
    .map(make_float_predictors)
)

# Column names: one stringified position per predictor, then the target.
# NOTE(review): 230400 presumably equals the flattened image size
# (e.g. 240 x 320 x 3) -- confirm against the code that wrote the CSVs.
column_names = list(map(str, range(230400))) + ['target']

# Use the programmatically generated column names to make a DataFrame.
df = rdd.toDF(column_names)

# Optional inspection of the column list. As a bare expression this only
# echoes a value when run in an interactive shell.
df.columns

# Optional inspection of the column data types (same interactive-only
# caveat). Because every predictor was already converted to a float in the
# RDD, Spark is expected to infer the predictor columns as double, which
# avoids writing explicit Spark casting code.
df.schema.fields

# Convert the Spark DataFrame to something that H2O can ingest.
df_h2o = hc.as_h2o_frame(df,"df_h2o")
# Every column except the last is a predictor; the last one is the target.
predictors, response = column_names[:-1], column_names[-1]

# Split the H2O frame using the given ratios and a fixed seed so the split
# is repeatable. Indexes 0-2 are read below, so three frames are expected
# back (presumably the share not covered by `ratios` forms the last one).
ratios = [0.6,0.2]
h2o_frame_splits = df_h2o.split_frame(ratios,seed=12345)

# NOTE(review): validation takes index 2 and test takes index 1 -- confirm
# that this ordering is intentional.
train = h2o_frame_splits[0]
valid = h2o_frame_splits[2]
test = h2o_frame_splits[1]

# Give each split a readable frame id.
train.frame_id = "Train"
valid.frame_id = "Validation"
test.frame_id = "Test"

# Fit a multinomial gradient boosting model on the training split, using
# the validation split for validation metrics.
# NOTE(review): a multinomial distribution presumably needs a categorical
# response column -- verify how `target` is typed after the conversion.
model = GBM(
    ntrees=50,
    max_depth=6,
    learn_rate=0.1,
    distribution="multinomial",
)
model.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)