-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
67 lines (67 loc) · 1.76 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Run-level settings: project/experiment names, the steps to execute,
# and the global random seed.
main:
  project_name: genre_classification_prod
  experiment_name: dev
  # Steps to execute
  execute_steps:
    - download
    - preprocess
    - check_data
    - segregate
    - random_forest
    - evaluate
  # This seed will be used to seed the random number generator
  # to ensure repeatability of the data splits and other
  # pseudo-random operations
  random_seed: 42
# Input data location plus the parameters used for the data checks
# and the train/validation/test splits.
data:
  # URL of the input parquet file (`?raw=true` requests the raw file
  # rather than the GitHub HTML page)
  file_url: "https://github.com/udacity/nd0821-c2-build-model-workflow-exercises/blob/master/lesson-2-data-exploration-and-preparation/exercises/exercise_4/starter/genres_mod.parquet?raw=true"
  # NOTE(review): looks like a "<name>:<version>" artifact reference - confirm
  reference_dataset: "exercise_14/preprocessed_data.csv:latest"
  # Threshold for Kolmogorov-Smirnov test
  ks_alpha: 0.05
  # NOTE(review): presumably the fractions held out for the test and
  # validation splits - confirm against the steps that read them
  test_size: 0.3
  val_size: 0.3
  # Stratify according to the target when splitting the data
  # in train/test or in train/val
  stratify: genre
# Settings for the random forest pipeline: the classifier's parameters,
# the TF-IDF settings, the feature columns grouped by type, and the name
# of the exported artifact.
random_forest_pipeline:
  # Parameters of the random forest itself.
  # NOTE(review): the key names match scikit-learn's RandomForestClassifier
  # arguments - presumably forwarded to it as-is; confirm with the consumer
  random_forest:
    n_estimators: 100
    criterion: 'gini'
    max_depth: 13
    min_samples_split: 2
    min_samples_leaf: 1
    min_weight_fraction_leaf: 0.0
    max_features: 10
    max_leaf_nodes: null
    min_impurity_decrease: 0.0
    # NOTE(review): if this is passed to scikit-learn, min_impurity_split
    # was deprecated in 0.19 and removed in 1.0 - confirm the pinned
    # version still accepts it
    min_impurity_split: null
    bootstrap: true
    oob_score: false
    n_jobs: null
    # This seed is separate from main.random_seed (even though the value
    # is currently the same), because it is used only within the
    # RandomForest
    random_state: 42
    verbose: 0
    warm_start: false
    class_weight: "balanced"
    ccp_alpha: 0.0
    max_samples: null
  # A separate setting from random_forest.max_features above; the two
  # must live in different mappings or they collide as duplicate keys
  tfidf:
    max_features: 10
  # Input columns, grouped by the kind of feature they hold
  features:
    numerical:
      - "danceability"
      - "energy"
      - "loudness"
      - "speechiness"
      - "acousticness"
      - "instrumentalness"
      - "liveness"
      - "valence"
      - "tempo"
      - "duration_ms"
    categorical:
      - "time_signature"
      - "key"
    nlp:
      - "text_feature"
  # NOTE(review): placed under random_forest_pipeline - confirm this is
  # where the consumer looks it up
  export_artifact: "model_export"