"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plotting top 10 Features from Feature Importance of DT Base Model for Multiclass Classification\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x=feat_imp_tuned_dt['column'][:10], y=feat_imp_tuned_dt['weight'][:10],data=feat_imp_tuned_dt)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Multiclass Base Model\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Tree Grid Search Multiclass Classification"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Initializing DT Grid Pipeline \n",
+ "\n",
+ "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n",
+ "\n",
+ "# Creating pipeline for DT Grid Model \n",
+ "\n",
+ "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n",
+ "\n",
+ "# Creating Grid Search for Hyper Parameter Tuning for DT Model\n",
+ "\n",
+ "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n",
+ "\n",
+ "# Cross Validator Pipeline with 5 fold cv to fit the training data\n",
+ "\n",
+ "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator_dt,seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting the training data using the Cross Validator Pipeline \n",
+ "\n",
+ "dtModel_t = cv1_dt.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Transform Test data using Cross Validation Pipeline Built earlier for prediction of Test data\n",
+ "\n",
+ "pred_dtt = dtModel_t.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy is 0.6133068955674265\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Evaluation of Testing Data using Multiclass Evaluator \n",
+ "\n",
+ "print(\"Accuracy is\",evaluator_dt.evaluate(pred_dtt))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='DecisionTreeClassifier_b6336355b38a', name='featuresCol', doc='features column name.'): 'features',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='labelCol', doc='label column name.'): 'label',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='predictionCol', doc='prediction column name.'): 'prediction',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='seed', doc='random seed.'): 42,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 35,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 30,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 500,\n",
+ " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0}"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Decision Tree Hyper Parameter Values from Best Model\n",
+ "\n",
+ "dtModel_t.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SparseVector(46, {0: 0.0004, 1: 0.0001, 11: 0.0046, 14: 0.0, 16: 0.1281, 17: 0.0045, 18: 0.002, 19: 0.0092, 21: 0.0006, 22: 0.0002, 23: 0.001, 25: 0.0049, 27: 0.0056, 32: 0.0071, 34: 0.0834, 35: 0.5412, 36: 0.1411, 37: 0.0011, 38: 0.0093, 39: 0.0142, 40: 0.019, 41: 0.0001, 44: 0.0006, 45: 0.0218})"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Feature importances of the best DT model (returned as a SparseVector indexed by feature position)\n",
+ "\n",
+ "dtModel_t.bestModel.stages[-1].featureImportances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Prediction output from the model to pandas.\n",
+    "# Select the single needed column before toPandas() so the remaining columns of the\n",
+    "# transformed frame are not converted and shipped to the driver.\n",
+    "\n",
+    "prediction_dtt = pred_dtt.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# True labels from the test data for the target variable.\n",
+    "# Only the Severity column is needed, so project it before converting to pandas.\n",
+    "\n",
+    "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.88 0.55 0.68 131724\n",
+ " 1 0.52 0.73 0.61 58339\n",
+ " 2 0.17 0.90 0.29 6121\n",
+ "\n",
+ " accuracy 0.61 196184\n",
+ " macro avg 0.52 0.73 0.52 196184\n",
+ "weighted avg 0.75 0.61 0.64 196184\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_dtt,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating Pandas Dataframe for Features and their Importance of DT Grid Model for Multiclass Classification\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "feat_imp_tuned_dtt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n",
+ " columns = ['column', 'weight']).sort_values('weight',ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Multiclass tuned')"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plotting top 10 Features from Feature Importance of DT Grid Model for Multiclass Classification\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x=feat_imp_tuned_dtt['column'][:10], y=feat_imp_tuned_dtt['weight'][:10],data=feat_imp_tuned_dtt)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Multiclass tuned\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ },
+ "latex_envs": {
+ "LaTeX_envs_menu_present": true,
+ "autoclose": false,
+ "autocomplete": true,
+ "bibliofile": "biblio.bib",
+ "cite_by": "apalike",
+ "current_citInitial": 1,
+ "eqLabelWithNumbers": true,
+ "eqNumInitial": 1,
+ "hotkeys": {
+ "equation": "Ctrl-E",
+ "itemize": "Ctrl-I"
+ },
+ "labels_anchors": false,
+ "latex_user_defs": false,
+ "report_style_numbering": false,
+ "user_envs_cfg": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/RF_DT_GBT_Binary.ipynb b/RF_DT_GBT_Binary.ipynb
new file mode 100644
index 0000000..a6110d6
--- /dev/null
+++ b/RF_DT_GBT_Binary.ipynb
@@ -0,0 +1,1596 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import the required libraries\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql import Row\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pyspark.sql.types import *\n",
+ "from pyspark.sql.functions import *\n",
+ "import matplotlib.pyplot as plt\n",
+ "from pyspark.sql import functions as fn\n",
+ "from pyspark.ml import feature, regression, evaluation, Pipeline\n",
+ "import seaborn as sns\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "from pyspark.ml import Pipeline\n",
+ "from pyspark.ml.regression import LinearRegression\n",
+ "from pyspark.ml.classification import GBTClassifier\n",
+ "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
+ "from pyspark.ml.stat import Correlation\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "from pyspark.ml.classification import DecisionTreeClassifier\n",
+ "from pyspark.ml import Pipeline\n",
+ "from sklearn.metrics import classification_report\n",
+ "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer\n",
+ "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n",
+ "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
+ "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
+ "spark = SparkSession.builder.getOrCreate()\n",
+ "sc = spark.sparkContext\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Do not delete or change this cell\n",
+ "\n",
+ "import os\n",
+ "\n",
+ "# Define a function to determine if we are running on data bricks\n",
+ "# Return true if running in the data bricks environment, false otherwise\n",
+ "def is_databricks():\n",
+ " # get the databricks runtime version\n",
+ " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n",
+ " \n",
+ " # if running on data bricks\n",
+ " if db_env != None:\n",
+ " return True\n",
+ " else:\n",
+ " return False\n",
+ "\n",
+ "# Define a function to read the data file. The full path data file name is constructed\n",
+ "# by checking runtime environment variables to determine if the runtime environment is \n",
+ "# databricks, or a student's personal computer. The full path file name is then\n",
+ "# constructed based on the runtime env.\n",
+ "# \n",
+ "# Params\n",
+ "# data_file_name: The base name of the data file to load\n",
+ "# \n",
+ "# Returns the full path file name based on the runtime env\n",
+ "#\n",
+ "def get_training_filename(data_file_name): \n",
+ " # if running on data bricks\n",
+ " if is_databricks():\n",
+ " # build the full path file name assuming data brick env\n",
+ " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n",
+ " # else the data is assumed to be in the same dir as this notebook\n",
+ " else:\n",
+ " # Assume the student is running on their own computer and load the data\n",
+ " # file from the same dir as this notebook\n",
+ " full_path_name = data_file_name\n",
+ " \n",
+ " # return the full path file name to the caller\n",
+ " return full_path_name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reads the train data\n",
+ "us_train_cat = spark.read.csv(get_training_filename('USAccident_train_categorical.csv'), header = True, inferSchema = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# reads the held-out validation file, which is used as the test set in this notebook\n",
+ "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_categorical.csv'), header = True, inferSchema = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# creates a vector assembler\n",
+ "va = VectorAssembler().setInputCols([i for i in us_train_cat.columns if i!='Severity']).setOutputCol('features')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# creates a string indexer\n",
+ "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below we convert the multiclass data into binary data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "us_train_cat=us_train_cat.withColumn(\"Severity\",when(((us_train_cat[\"Severity\"]==4) | (us_train_cat[\"Severity\"]==3)),1).otherwise(0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "us_test_cat=us_test_cat.withColumn(\"Severity\",when(((us_test_cat[\"Severity\"]==4) | (us_test_cat[\"Severity\"]==3)),1).otherwise(0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating the evaluator for our binary classification\n",
+ "evaluator_rfb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RF Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create an initial RandomForest model.\n",
+ "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n",
+ "\n",
+    "# Assemble the pipeline (label indexer -> vector assembler -> classifier); it is not fitted here\n",
+ "rfModel = Pipeline(stages=[label_stringIdx,va, rf])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rf_fit = rfModel.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC ROC score: 0.7634901876492165\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"AUC ROC score:\",evaluator_rfb.evaluate(rf_fit.transform(us_test_cat)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.6892473173146102\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Prints the accuracy of our binary classification.\n",
+    "# Score the test set once and compare the indexed `label` with `prediction` inside Spark;\n",
+    "# this avoids pairing two separately collected row lists by position and comparing raw\n",
+    "# Severity values against StringIndexer indices.\n",
+    "rf_base_scored = rf_fit.transform(us_test_cat)\n",
+    "print(\"Accuracy:\", rf_base_scored.filter(rf_base_scored[\"label\"] == rf_base_scored[\"prediction\"]).count() / rf_base_scored.count())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rf_fit.stages[-1].getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "20"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rf_fit.stages[-1].getNumTrees"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'gini'"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rf_fit.stages[-1].getImpurity()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# transforming the test data for predictions; only the prediction column is\n",
+    "# selected before toPandas() so the rest of the scored frame is not converted\n",
+    "prediction_rfb = rf_fit.transform(us_test_cat).select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# storing the true labels for evaluation purpose below (only the Severity column is converted)\n",
+    "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.69 0.98 0.81 131571\n",
+ " 1 0.73 0.09 0.16 64408\n",
+ "\n",
+ " micro avg 0.69 0.69 0.69 195979\n",
+ " macro avg 0.71 0.54 0.48 195979\n",
+ "weighted avg 0.70 0.69 0.59 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_pred=prediction_rfb,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a dataframe to print the feature importance\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "feat_imp_tuned_rf = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], rf_fit.stages[-1].featureImportances)),\n",
+ " columns = ['column', 'weight']).sort_values('weight',ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x=feat_imp_tuned_rf['column'][:10], y=feat_imp_tuned_rf['weight'][:10],data=feat_imp_tuned_rf)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from Random Forest\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RF Grid Search "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create an initial RandomForest model.\n",
+ "rf_new = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n",
+ "\n",
+    "# Assemble the pipeline (label indexer -> vector assembler -> classifier); it is not fitted here\n",
+ "rfModel_new = Pipeline(stages=[label_stringIdx,va, rf_new])\n",
+ "\n",
+ "#paramGrid_rft = ParamGridBuilder().addGrid(rf_new.numTrees, [10, 30, 60]).addGrid(rf_new.maxDepth, [3, 5, 10]).addGrid(rf_new.impurity,[\"entropy\", \"gini\"]).build()\n",
+ "paramGrid_rft = ParamGridBuilder().addGrid(rf_new.numTrees, [60]).addGrid(rf_new.maxDepth, [10]).addGrid(rf_new.impurity,[\"gini\"]).build()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Cross-validate the pipeline over the parameter grid (5 folds) and fit it on the training data.\n",
+    "# seed=42 fixes the fold assignment so repeated runs split the data identically.\n",
+    "# Note: cv_rf holds the result of .fit(), i.e. the fitted model, not the CrossValidator itself.\n",
+    "cv_rf = CrossValidator(estimator=rfModel_new, estimatorParamMaps=paramGrid_rft, evaluator=evaluator_rfb, numFolds=5, seed=42).fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# store the predictions from our test set\n",
+ "pred_rft = cv_rf.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+      "ROC AUC score: 0.7880870240899686\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"ROC AUC score:\",evaluator_rfb.evaluate(pred_rft))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7257971517356452\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Printing the accuracy of our binary predictions.\n",
+    "# pred_rft already carries both the indexed `label` and `prediction`, so compare them in Spark\n",
+    "# instead of pairing two separately collected row lists by position and comparing raw\n",
+    "# Severity values against StringIndexer indices.\n",
+    "print(\"Accuracy:\", pred_rft.filter(pred_rft[\"label\"] == pred_rft[\"prediction\"]).count() / pred_rft.count())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv_rf.bestModel.stages[-1].getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "60"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv_rf.bestModel.stages[-1].getNumTrees"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'gini'"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv_rf.bestModel.stages[-1].getImpurity()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='RandomForestClassifier_deb569e64636', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='featuresCol', doc='features column name'): 'features',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='labelCol', doc='label column name'): 'label',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='numTrees', doc='Number of trees to train (>= 1)'): 60,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='predictionCol', doc='prediction column name'): 'prediction',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction',\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='seed', doc='random seed'): 42,\n",
+ " Param(parent='RandomForestClassifier_deb569e64636', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cv_rf.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# storing the predictions of our test set (only the prediction column is converted to pandas)\n",
+    "prediction_rft = pred_rft.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# storing the true labels of our test set (only the Severity column is converted to pandas)\n",
+    "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.74 0.91 0.82 131571\n",
+ " 1 0.66 0.34 0.45 64408\n",
+ "\n",
+ " micro avg 0.73 0.73 0.73 195979\n",
+ " macro avg 0.70 0.63 0.63 195979\n",
+ "weighted avg 0.71 0.73 0.70 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_pred=prediction_rft,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates a Dataframe of feature importances from our model\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "feat_imp_tuned_rfg = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cv_rf.bestModel.stages[-1].featureImportances)),\n",
+ " columns = ['column', 'weight']).sort_values('weight',ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x=feat_imp_tuned_rfg['column'][:10], y=feat_imp_tuned_rfg['weight'][:10],data=feat_imp_tuned_rfg)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from Random Forest Grid\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GBT Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Declares the gbt classifier model\n",
+ "gbt = GBTClassifier(seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates a pipeline for our model\n",
+ "gbt_pipe = Pipeline(stages=[label_stringIdx, va, gbt])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fits the pipeline on our train data\n",
+ "gbtModel = gbt_pipe.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ROC AUC Score: 0.787408211157604\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"ROC AUC Score:\",evaluator_rfb.evaluate(gbtModel.transform(us_test_cat)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7289046275366238\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Calculates the accuracy of our model.\n",
+    "# Score the test set once and compare the indexed `label` with `prediction` inside Spark;\n",
+    "# this avoids pairing two separately collected row lists by position and comparing raw\n",
+    "# Severity values against StringIndexer indices.\n",
+    "gbt_base_scored = gbtModel.transform(us_test_cat)\n",
+    "print(\"Accuracy:\", gbt_base_scored.filter(gbt_base_scored[\"label\"] == gbt_base_scored[\"prediction\"]).count() / gbt_base_scored.count())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gbtModel.stages[-1].getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.1"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gbtModel.stages[-1].getStepSize()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "20"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gbtModel.stages[-1].getMaxIter()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Predict on test data (only the prediction column is converted to pandas)\n",
+    "prediction_gbtn = gbtModel.transform(us_test_cat).select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Stores the true labels from our test data (only the Severity column is converted to pandas)\n",
+    "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.76 0.88 0.81 131571\n",
+ " 1 0.63 0.43 0.51 64408\n",
+ "\n",
+ " micro avg 0.73 0.73 0.73 195979\n",
+ " macro avg 0.69 0.65 0.66 195979\n",
+ "weighted avg 0.72 0.73 0.71 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Per-class precision / recall / F1 of the base GBT predictions against the true test labels\n",
+ "gbt_base_report = classification_report(true_labels, prediction_gbtn)\n",
+ "print(gbt_base_report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature-importance table for the base GBT model, sorted from highest to lowest weight\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "# NOTE(review): pairs importances with us_train_cat.columns (minus 'Severity') by position;\n",
+ "# confirm this matches the order of the assembled feature vector\n",
+ "gbt_base_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "gbt_base_importances = gbtModel.stages[-1].featureImportances\n",
+ "gbt_base_pairs = list(zip(gbt_base_feature_names, gbt_base_importances))\n",
+ "feat_imp_tuned_gtbb = pd.DataFrame(gbt_base_pairs, columns=['column', 'weight'])\n",
+ "feat_imp_tuned_gtbb = feat_imp_tuned_gtbb.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the ten highest-weighted features of the base GBT model\n",
+ "gbt_base_top10 = feat_imp_tuned_gtbb.head(10)\n",
+ "fig, ax = plt.subplots(figsize=(10, 10))\n",
+ "sns.barplot(x='column', y='weight', data=gbt_base_top10, ax=ax)\n",
+ "# Feature names are long, so draw the tick labels vertically\n",
+ "for tick_label in ax.get_xticklabels():\n",
+ "    tick_label.set_rotation(90)\n",
+ "ax.set_xlabel(\"Features\")\n",
+ "ax.set_ylabel(\"Weights\")\n",
+ "ax.set_title(\"Top 10 Features based on Importance from GBT Base Model\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GBT Binary Tuned Best Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# GBT classifier for the tuned binary model (55 boosting iterations, fixed seed)\n",
+ "gbt_t_new = GBTClassifier(maxIter=55, seed=42)\n",
+ "\n",
+ "# Pipeline: the label_stringIdx and va stages followed by the GBT estimator\n",
+ "gbt_pipe_t_new = Pipeline(stages=[label_stringIdx, va, gbt_t_new])\n",
+ "\n",
+ "# Cross-validation scores each candidate by area under the ROC curve\n",
+ "evaluator = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC')\n",
+ "\n",
+ "# Hyper-parameter grid. A wider grid is kept here for reference only:\n",
+ "#   stepSize in [0.1, 0.3, 0.01], maxDepth in [3, 5, 8]\n",
+ "# The grid actually used holds a single combination (stepSize=0.3, maxDepth=8).\n",
+ "gbt_grid_builder = ParamGridBuilder()\n",
+ "gbt_grid_builder = gbt_grid_builder.addGrid(gbt_t_new.stepSize, [0.3])\n",
+ "gbt_grid_builder = gbt_grid_builder.addGrid(gbt_t_new.maxDepth, [8])\n",
+ "grid_gbt_t_new = gbt_grid_builder.build()\n",
+ "\n",
+ "# 5-fold cross validator wrapped around the whole pipeline\n",
+ "cv1_gbt_t_new = CrossValidator(\n",
+ "    estimator=gbt_pipe_t_new,\n",
+ "    estimatorParamMaps=grid_gbt_t_new,\n",
+ "    evaluator=evaluator,\n",
+ "    numFolds=5,\n",
+ "    seed=42,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run the cross validation on the training set; the fitted result exposes the best pipeline as .bestModel\n",
+ "cvModel_gbt_t_new = cv1_gbt_t_new.fit(dataset=us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ROC AUC Score: 0.8032377340351009\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Score the test set with the cross-validated GBT model and print the evaluator's metric\n",
+ "# NOTE(review): evaluator_rfb is defined outside this section; confirm it is configured for areaUnderROC\n",
+ "gbt_tuned_test_scored = cvModel_gbt_t_new.transform(us_test_cat)\n",
+ "print(\"ROC AUC Score:\", evaluator_rfb.evaluate(gbt_tuned_test_scored))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7407885538756703\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculates the accuracy of the binary model.\n",
+ "# The test set is scored once and label + prediction are read from the same rows, so every\n",
+ "# pair is guaranteed to belong together (two independent collect() calls give no such guarantee).\n",
+ "gbt_tuned_scored_rows = cvModel_gbt_t_new.transform(us_test_cat).select(\"Severity\", \"prediction\").collect()\n",
+ "binary_true_labels = [row[\"Severity\"] for row in gbt_tuned_scored_rows]\n",
+ "binary_prediction = [row[\"prediction\"] for row in gbt_tuned_scored_rows]\n",
+ "# true_labels stays a pandas Series of the Severity column, built without a second Spark job\n",
+ "true_labels = pd.Series(binary_true_labels, name=\"Severity\")\n",
+ "n_correct = sum(int(label == pred) for label, pred in zip(binary_true_labels, binary_prediction))\n",
+ "# Report NaN for an empty test set instead of dividing by zero\n",
+ "gbt_tuned_accuracy = n_correct / len(binary_true_labels) if binary_true_labels else float(\"nan\")\n",
+ "print(\"Accuracy:\", gbt_tuned_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Maximum tree depth of the last stage in the best cross-validated pipeline\n",
+ "gbt_tuned_stage = cvModel_gbt_t_new.bestModel.stages[-1]\n",
+ "gbt_tuned_stage.getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.3"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Step size (learning rate) of the last stage in the best cross-validated pipeline\n",
+ "gbt_tuned_stage = cvModel_gbt_t_new.bestModel.stages[-1]\n",
+ "gbt_tuned_stage.getStepSize()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='GBTClassifier_34f6d1b395a8', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'all',\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='featuresCol', doc='features column name'): 'features',\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='labelCol', doc='label column name'): 'label',\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic'): 'logistic',\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='maxIter', doc='maximum number of iterations (>= 0)'): 55,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='predictionCol', doc='prediction column name'): 'prediction',\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='seed', doc='random seed'): 42,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.3,\n",
+ " Param(parent='GBTClassifier_34f6d1b395a8', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Full parameter map of the last stage in the best cross-validated pipeline\n",
+ "gbt_tuned_stage = cvModel_gbt_t_new.bestModel.stages[-1]\n",
+ "gbt_tuned_stage.extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stores the predictions for our test set\n",
+ "# Select the prediction column before toPandas() so only that column is brought to the driver\n",
+ "prediction_gbt_t_new = cvModel_gbt_t_new.transform(us_test_cat).select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stores the true labels from our test set\n",
+ "# Only the Severity column is needed, so select it before converting to pandas\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.78 0.86 0.82 131571\n",
+ " 1 0.63 0.50 0.56 64408\n",
+ "\n",
+ " micro avg 0.74 0.74 0.74 195979\n",
+ " macro avg 0.71 0.68 0.69 195979\n",
+ "weighted avg 0.73 0.74 0.73 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Per-class precision / recall / F1 of the tuned GBT predictions against the true test labels\n",
+ "gbt_tuned_report = classification_report(true_labels, prediction_gbt_t_new)\n",
+ "print(gbt_tuned_report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature-importance table for the best cross-validated GBT model, sorted from highest to lowest weight\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "# NOTE(review): pairs importances with us_train_cat.columns (minus 'Severity') by position;\n",
+ "# confirm this matches the order of the assembled feature vector\n",
+ "gbt_tuned_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "gbt_tuned_importances = cvModel_gbt_t_new.bestModel.stages[-1].featureImportances\n",
+ "gbt_tuned_pairs = list(zip(gbt_tuned_feature_names, gbt_tuned_importances))\n",
+ "feat_imp_tuned_gbt_t_new = pd.DataFrame(gbt_tuned_pairs, columns=['column', 'weight'])\n",
+ "feat_imp_tuned_gbt_t_new = feat_imp_tuned_gbt_t_new.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAK/CAYAAAA244rdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3Xu8tfWc//HXW3cnoRxySKUQTQ7jkFANQ5iMITPKFEYM+vmN0GBmMn5EzIFBjuMwYnIYnQwTIgwyolSK5DBuOaUiOqmk4vP743ttrXZ73/e+u/e613et+/V8PPZjr+uw1vpca6+19vv6Xt/re6WqkCRJUp9uMukCJEmStDjDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGvSeiLJjkmunXANmySpJFtPsg41SQ5K8rMklyfZbNL1aDx6+Oxr7RjWtOyGL/65n98m+dXI9JOX+bmenOTLw3N8coHlD0hyZpIrk3wlyT1X8VgnJ7lqXv33Xcv6/JKcgN5CYZILkuw+6TpGDeHs1cAfVNXNquqKCdSwcZJDk3x3+Iyem+RjSR4+ss4Fw7LLk1yU5LgkWw3LPjvyWb0mya9Hpt+wwPM9O8m1I+t8L8kzl2E7np3kM6tZ5+QkT1nb59L6ybCmZTd88d+sqm4G/Ah47Mi8Dyzz0/0CeB3w+vkLkmwK/BfwTuCWwDHAh5OsWMXjPXO0/qo6Y5nrXSNJbpLEz+mUWs17bdLuAGxQVd9ZaOG4a08S4DjgUcB+wObAXYC3AY+Zt/qjhu+TOwK/ZPi8V9XDR75rPgS8cuSze9AiT/35kfvsB7wxyU7LvX3ScvKfgNa5JJsmeWuS84c96X9JsuGwbM8kK5O8YtiLPifJPos9VlV9sqqOBc5fYPEjgauq6l+r6te0UHdzYI1bOJLcc9iLvzjJt5I8fmTZnyb5WpLLkvwwyd+P3PULwAajLXVJ/jnJu0buf73Wt2EP/NAkpwBXAlsluVWS9w6tDD9OcshciBvu/8Uklya5MMl7V7Mtzx5e+/OSPHdk/m5JThke57wkh839w06yQZK3DI9/6bC9dx+WbZrkDUNdFyR5c5KNRx73JUl+muRcYJUtC0m2TXL88Lf/3yT7jyz75yQfSPLBJL9M8vUk91nV4y1w36OGv8OZSbYfXsefJ/lBkofN+xu8Msnpw/Z+KMnmI8ufkOSbSS5J8pkkO4wsuyDJi5KcDVyW5BjgtsCnhud+XpIVw2P+dHiMz829nsNjHDm8picM23pSkjuNLP/9kffjBUleOPJ3eunwufn5sM1bLPB63Av4Gte9Nz+R61oj/2+S7wHfGNZ9aJKvDq/DyUkeMO91enlaq/XlSf4zya2THJ32eTg5i7duPgbYFdirqk6rqmuq6tdV9fGqeuFCd6iqX9FC2bKEq6r6CvA9YMeRbfqD4XNwybDdu40se9bwXvnl8Brvk9b6/gbgD4fX4IL5z5PkdcADgHcN67wuC7S6Z6T1Le1z+t9J3jTU8r0kjxhZd1XfCSuSvDHJL5KspH0XappVlT/+jO0H+AHwiHnzXgP8D3Ab4HbAqcBLhmV7AtcC/wRsBDyCFli2X83zHAh8ct68FwMfnjfvM8BzFnmMk4GnLDD/FrQw+GRgA9qX7kXAXYflewD3oO383G9YtuewbEfg2nmP98/Au0amr7fOUMc5wN2BDYEVwCeANwM3pbWInAHsP6z/YeBFQIBNgd0W2b4dgQKOGNa771Dr7sPyXYZt24DWwrESePawbC/gy8NrcZNhe287LHs7cCywBa115ATgkGHZ44GfDM891/pRwNaL1HgKcBiwMbDzUN9uI6/blbR/PBsM631+kcfZZPR5Ru77sOH1PAr4/vC6rQCeC3xr3t/ghyN1f3Tubwbck9a684e09+hLgW8BK4blF9De01sBm47M233k8VcA+w+PvQmtNenkkeVHAj+jvZ82
HF7ffx+W3RK4kPae33j4mzxgWHYw7bO11fC4/w68ZxXvh2sXeM0+PvwtN6WFzMuAJw41P2147s1HXqdvAdsBtwK+C3wbeOjI6/y2RZ7/Dcz7zC6y3u9eu+H1+iDwzgXWOxL4f6t5rGcDnxmZ3hW4FNhumN6O1lr/CNr7/I+H7b3l8HMJcJdh3TsCv7fQ4y7l+2X+6z9/neExrwGeSnu//zXwg5F1V/WdcBBw1vA+2BL44vzn8me6fiZegD+z/cPCYe0nwMNHpvcCvj3c3hO4CthkZPlxwN+s5nkWCmv/wPAPbmTeh4CDF3mMk4Erhi/kS4AvDfP3Bz49b90jgL9b5HHeDvzTcPvGhrW/H5m+01DXhiPzng58Yrh9NPAW4A6reY3mwtp2I/PeBLx1kfUPBj443P5j4GxaoMvIOiuAq4E7jsx7GEPwAf4DePnIsnuzSFgDdhj+9puOzDsMePvI6/axkWX3Ay5ZpPaFwtpHR5bvQ/unnGF6y2H9uXB18ry67wdcMfK+eu/Isg1o/9AfNExfADxpXj3XC2sL1Ht74LcM73ta8HjLyPI/A84c+dt/eZHH+T4jYR3YnhZSs8C6i4W1XUfmPQv4wrz7nQHsO/I6vXBk2VsZ2UEaXueTF6n1/Yx8PmnB4hJaeLpkZP4FtHB8CW1H7scMIWne4y01rF0zPNblw/a+dmT5IcC/zbvPicCfc11Y24uR76eRxx1HWPvGyLJbDfVuweq/E74EPG1k2ePmP5c/0/XjYVCtU0lC+8f0w5HZP6Ttpc65sKqumrd8qxvxdJfTWh1G3YL2xb+Y/1NVWww/uw7z7gQ8ZDgUcUmSS4An0PZm5w4fnpjhECGt9eE2N6LeUT8euX0n2j/SC0ee/420Vkloe9w3Bc4YDg2urhPz6GP/7rVNstNwOOynSS4DXjayHZ8ADgfeAfw0yb8mudlw3w2Bs0dq+witRYZh+fznW8xWtL/9r+atP/reGD3EdCWtpWWpfjpy+1fDc9XINMDoGZHz677pcCh0K0a2o6p+Q9sBueMi972B4TDVa4dDaZfRWqMC3HpktcW2dRvaobv5j5lh2fEjf4szaC1Et56//iqM1n69bR3M/5vMf13nTy/2N/oFw2cIoKrOq6otgN1o7/dRjx6WbQL8LfCFJGuyTaNOHD7fc+/fByY5ZFh2J+Ap8z7rOwNbVdXFtNb15wEXpJ3ocNcbWcNSzX8PQHs9V/edsCafO00Bw5rWqeGf4wW0L5s529L+2c25TZJN5i0/70Y83dnA789NDP057jnMXxM/Bj41EuK2qOt3YD6adrhnm6ranHboKcOyuuHDcQUtXM25/QLrjN7vx7TgecuR579FVd0PoKp+UlV/SfvH9zzg3Um2XcX2bDNye/S1/Tfgq7TDPLcADp3bjmpeX1X3pbWO/T7wfNrh4WuH+8zVtnlVzf0jPX+B51vMecCWaSeGjK7/k0XWH7f5dV9ZVZfS6vzd+zfJBrTwMlrn/L/7/Omn0zrWP4x26Hiuz1RYvR/TDlNf/wnaZ2uu1Xr0vbpJVf18CY+7UK3X29bBcv1N/hvYNcntVrvmXGFV11bVB2lB5cFrW0BVnU/buXjsMOvHtFbv0ddvs6o6bFj/41W1By0M/Yh2+BoW/pzf4OnmTV9B6zO48ci8hb4LFrLK7wTW7HOnKWBY0yR8EDhk6Ih8W+AltEMiczYEXppko7RT+B9JO3x5A0OH6k1oh+NuMnSSnjuL7dPApkNH3Y1pLVBX0PpvrImPAPdN8udJNhzqelCSuw2tGTcDflFVVyXZlXboZ87PaF/Io1+WZwIPS3LHJLcE/m5VT15V36cdHnlNkpunnSG6Q4ahIIa6thr+WV8y3G1Vw4UcknZSwO8Df0ELmtBOvri0qi5Pcg/aITCG53hQkp2H1/YK2qHP31TVNcC7aWfU3SbNNknmOjQfDTxzeK1uRmutW8xK4OvAq9KGdLgf7RD0cp9BvFRPG6n75Vz3Oh0F/GmS
h6SdGHMwrZXotFU81k+BO49M35x2yPcXtNa8V61BXR8B7pp2IsBGSW6R6zr9vx345yTbACS5bZLHLvpIq3cc7b2/99Aa+FTaP/4bDJNzI3yM1kfxv4b31oZJNgIeuNgdhvf+PrT+dN9e2wKSbEk7rDm3A3cEsE+SPYbvlk2H27cfPq+PSXJT4Ne0sPSb4X4/BbYZ3g+Lmf8eOI92+PzJw3P9FddvsVzU6r4TaJ+7v05yhyS3obVGaooZ1jQJLwO+SfuCPBM4iXbSwZwf0MLGBbQg8PSqOmeRx3oW7VDLYbRQ9yta/y2Gw2l70fp+XALsCzy+qtZo3LPh8Mcf0VpDzqd9yb6K1l+khsd/bZJf0r4Uj5l339cApw+HK+5D68D9seE1OJn2z3d19qP1Vfk2rdP9UVx3yOPBw+NfPjz3AVW1WEvkb2j/IL9P+4d7aFV9YVj217RgdTmt79FRI/fbgtZieAnt5Icf0vq7QevMfB4trFw6PO5dh+3/MG3olP8Zaj9hsQ0cXssn0s70u2B4/r+pqv9Z/GUZq/fRdix+QutP9sKhzq8Dz6AdEr6QdoLJXqt5X/0D8A/De+BA2iHlC2nbeRZrsAMxvKceSXs//wz4Dted4fwa2kk0nx3ej1+i9be7Uarqp7T+Ti+hBcsDgT+pqktWecelPfZvgT8Z6j2K9t75Hq1/3h/PW/1Tw/vyUtoJHU+qqpU38qnnztq8nHbG64+AFww1nUPr4vAK4Oe09/nzaf8rN6CdtHQB7bV4AO3EFGjv+R8AP0s763khhwFPTTuD9zXD4fNn0vrJ/ZzWEnb6GmzHqr4T3kL7zJ1N+7wfvQaPqw7Nda6VupBkT1rH6nH3BZEWleRk2vvw/atdWZLGzJY1SZKkjo01rKUNcPqdtEFOD15g+UPSBh28Nsne85btn3YJku9mZGBMSZKk9cnYDoMOZ0j9L61vxbm0QSL3q6pvjqyzHW0ohRcBx1UbiZ4kt6L1f9mZdgbN6cD9h74akiRJ641xtqztAqysqnOq6mragIV7ja5QVT8YOuv+dt59/4g2COlFQ0D7NG2wVEmSpPXKOMPaHbn+oHznssTTktfyvpIkSTNjxepXudEWGtxxqcdcl3TfJAcABwBsttlm999xxx1vcCdJkqTenH766T+vqi2Xsu44w9q5XH8E5a1Z+ij059Iukjx638/PX6mq3kkbw4mdd965TjttVWNSSpIk9SHJki8DNs7DoKcCOyTZfhiVel/aaNhLcQLwqCS3HEZ4fxSrGExTkiRpVo0trA2jeR9IC1nfAo6uqrOTHJrkcQBJHjCM9rwP8I4kZw/3vQh4JS3wnUobZf2icdUqSZLUq5m5goGHQSVJ0rRIcnpV7byUdb2CgSRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUsRWTLmDc7v837510CWvs9H956qRLkCRJnbBlTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJ
kjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpYysmXYDWzo8OvdekS1gj277srEmXIEnSVLFlTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY2MNa0n2TPKdJCuTHLzA8o2THDUsPyXJdsP8DZMckeSsJN9K8uJx1ilJktSrsYW1JBsAbwUeDewE7Jdkp3mrPQO4uKruChwGvHqYvw+wcVXdC7g/8H/mgpwkSdL6ZJwta7sAK6vqnKq6GjgS2GveOnsBRwy3jwX2SBKggM2SrAA2Ba4GLhtjrZIkSV0aZ1i7I/Djkelzh3kLrlNV1wKXAremBbcrgPOBHwGvraqLxlirJElSl8YZ1rLAvFriOrsAvwG2ArYHXpjkzjd4guSAJKclOe3CCy9c23olSZK6M86wdi6wzcj01sB5i60zHPLcHLgIeBLwyaq6pqp+BpwE7Dz/CarqnVW1c1XtvOWWW45hEyRJkiZrnGHtVGCHJNsn2QjYFzhu3jrHAfsPt/cGPltVRTv0+fA0mwEPAr49xlolSZK6NLawNvRBOxA4AfgWcHRVnZ3k0CSPG1Y7HLh1kpXAC4C54T3eCtwM+AYt9L2nqr4+rlolSZJ6tWKcD15VxwPHz5v3spHbV9GG6Zh/v8sXmi9JkrS+8QoGkiRJHTOsSZIkdWysh0GltbXbm3ebdAlr5KTnnjTpEiRJM8aWNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6tiKSRcgra9OfMhDJ13CGnvoF06cdAmStN6xZU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjrmtUEljcVbXvjRSZewxg583WMnXYIk3YAta5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DGvDSpJN8I/PGXvSZewxl7y/mMnXYKkG8GWNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6tiKSRcgSerPt/7hs5MuYY383ksePukSpLGxZU2SJKljYw1rSfZM8p0kK5McvMDyjZMcNSw/Jcl2I8vuneTLSc5OclaSTcZZqyRJUo/GFtaSbAC8FXg0sBOwX5Kd5q32DODiqrorcBjw6uG+K4D3A8+uqnsAfwhcM65aJUmSejXOlrVdgJVVdU5VXQ0cCew1b529gCOG28cCeyQJ8Cjg61X1NYCq+kVV/WaMtUqSJHVpnGHtjsCPR6bPHeYtuE5VXQtcCtwauBtQSU5I8tUkf7vQEyQ5
IMlpSU678MILl30DJEmSJm2cYS0LzKslrrMC2B148vD7T5PscYMVq95ZVTtX1c5bbrnl2tYrSZLUnXGGtXOBbUamtwbOW2ydoZ/a5sBFw/wTq+rnVXUlcDxwvzHWKkmS1KVxhrVTgR2SbJ9kI2Bf4Lh56xwH7D/c3hv4bFUVcAJw7yQ3HULcQ4FvjrFWSZKkLo1tUNyqujbJgbTgtQHw7qo6O8mhwGlVdRxwOPC+JCtpLWr7Dve9OMnraYGvgOOr6uPjqlWSJKlXY72CQVUdTzuEOTrvZSO3rwL2WeS+76cN3yFJkrTe8goGkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHVsjcNaklsmufc4ipEkSdL1LSmsJfl8klskuRXwNeA9SV4/3tIkSZK01Ja1zavqMuDPgPdU1f2BR4yvLEmSJMHSw9qKJHcAngh8bIz1SJIkacRSw9orgBOAlVV1apI7A98dX1mSJEkCWLHE9c6vqt+dVFBV59hnTZIkafyW2rL25iXOkyRJ0jJaZctakgcDuwJbJnnByKJbABuMszBJkiSt/jDoRsDNhvVuPjL/MmDvcRUlSZKkZpVhrapOBE5M8u9V9cN1VJMkSZIGSz3BYOMk7wS2G71PVT18HEVJkiSpWWpYOwZ4O/Au4DfjK0eSJEmjlhrWrq2qt421EkmSJN3A6s4GvdVw86NJ/gr4MPDrueVVddEYa5MkSVrvra5l7XSggAzTfzOyrIA7j6MoSZIkNas7G3T7dVWIJEmSbmhJfdaS/NkCsy8Fzqqqny1vSZIkSZqz1BMMngE8GPjcMP2HwMnA3ZIcWlXvG0NtkiRJ672lhrXfAr9XVT8FSHI74G3AA4EvAIY1SZKkMVjqhdy3mwtqg58BdxvOBr1m+cuSJEkSLL1l7X+SfIw2OC7AE4AvJNkMuGQslUmSJGnJYe05tIC2G20Yj/cCH6qqAh42ptokSZLWe0sKa0MoO3b4kSRJ0jqyuisYfLGqdk/yS9oguL9bRMtwtxhrdZIkSeu51Q2Ku/vw++brphxJkiSNWurZoCTZPcnTh9u3SeLVDSRJksZsSWEtySHA3wEvHmZtBLx/XEVJkiSpWWrL2p8CjwOuAKiq8wAPjUqSJI3ZUsPa1cMZoQUwjK8mSZKkMVvqOGtHJ3kHsEWSZwF/Cfzb+MqSJGl8Xv7yl0+6hDUybfVqea1u6I6DgJOAN9AGv70MuDvwsqr69PjLkyRJWr+trmVta+CNwI7A14Ev0cLb6WOuS5IkSax+nLUXASTZCNgZ2JXhEGiSS6pqp/GXKEmStP5aap+1TYFbAJsPP+cBZ42rKEmSJDWr67P2TuAewC+BU2iHQV9fVRevg9okSZLWe6sbumNbYGPgAuAnwLnAJeMuSpIkSc3q+qztmSS01rVdgRcC90xyEfDlqjpkHdQoSZK03lptn7VhMNxvJLkEuHT4+RNgF8CwJkmSNEar67P2PFqL2m7ANbRhO74MvBtPMJAkSRq71bWsbQccC/x1VZ0//nIkSZI0anV91l6wrgqRJEnSDS31Qu6SJEmaAMOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdW92F3CVJ0hQ5+phdJl3CGnviPl9Z8rq/f+wJY6xkPL629x+t1f1tWZMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJ
kjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6NtawlmTPJN9JsjLJwQss3zjJUcPyU5JsN2/5tkkuT/KicdYpSZLUq7GFtSQbAG8FHg3sBOyXZKd5qz0DuLiq7gocBrx63vLDgE+Mq0ZJkqTejbNlbRdgZVWdU1VXA0cCe81bZy/giOH2scAeSQKQ5PHAOcDZY6xRkiSpa+MMa3cEfjwyfe4wb8F1qupa4FLg1kk2A/4OeMUY65MkSereOMNaFphXS1znFcBhVXX5Kp8gOSDJaUlOu/DCC29kmZIkSf1aMcbHPhfYZmR6a+C8RdY5N8kKYHPgIuCBwN5JXgNsAfw2yVVV9ZbRO1fVO4F3Auy8887zg6AkSdLUG2dYOxXYIcn2wE+AfYEnzVvnOGB/4MvA3sBnq6qAP5hbIcnLgcvnBzVJkqT1wdjCWlVdm+RA4ARgA+DdVXV2kkOB06rqOOBw4H1JVtJa1PYdVz2SJEnTaJwta1TV8cDx8+a9bOT2VcA+q3mMl4+lOEmSpCngFQwkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZLngenLAAAgAElEQVQ6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjo01rCXZM8l3kqxMcvACyzdOctSw/JQk2w3zH5nk9CRnDb8fPs46JUmSejW2sJZkA+CtwKOBnYD9kuw0b7VnABdX1V2Bw4BXD/N/Djy2qu4F7A+8b1x1SpIk9WycLWu7ACur6pyquho4Ethr3jp7AUcMt48F9kiSqjqjqs4b5p8NbJJk4zHWKkmS1KVxhrU7Aj8emT53mLfgOlV1LXApcOt56zwBOKOqfj2mOiVJkrq1YoyPnQXm1Zqsk+QetEOjj1rwCZIDgAMAtt122xtXpSRJUsfG2bJ2LrDNyPTWwHmLrZNkBbA5cNEwvTXwYeCpVfW9hZ6gqt5ZVTtX1c5bbrnlMpcvSZI0eeMMa6cCOyTZPslGwL7AcfPWOY52AgHA3sBnq6qSbAF8HHhxVZ00xholSZK6NrawNvRBOxA4AfgWcHRVnZ3k0CSPG1Y7HLh1kpXAC4C54T0OBO4KvDTJmcPPbcdVqyRJUq/G2WeNqjoeOH7evJeN3L4K2GeB+70KeNU4a5MkSZoGXsFAkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnq
mGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6thYw1qSPZN8J8nKJAcvsHzjJEcNy09Jst3IshcP87+T5I/GWackSVKvxhbWkmwAvBV4NLATsF+Sneat9gzg4qq6K3AY8OrhvjsB+wL3APYE/nV4PEmSpPXKOFvWdgFWVtU5VXU1cCSw17x19gKOGG4fC+yRJMP8I6vq11X1fWDl8HiSJEnrlVTVeB442RvYs6qeOUz/BfDAqjpwZJ1vDOucO0x/D3gg8HLg5Kp6/zD/cOATVXXsvOc4ADhgmLw78J2xbMzCbgP8fB0+37rm9k23Wd6+Wd42cPumnds3vdb1tt2pqrZcyoorxlhEFpg3Pxkuts5S7ktVvRN455qXtvaSnFZVO0/iudcFt2+6zfL2zfK2gds37dy+6dXzto3zMOi5wDYj01sD5y22TpIVwObARUu8ryRJ0swbZ1g7FdghyfZJNqKdMHDcvHWOA/Yfbu8NfLbacdnjgH2Hs0W3B3YAvjLGWiVJkro0tsOgVXVtkgOBE4ANgHdX1dlJDgVOq6rjgMOB9yVZSWtR23e479lJjga+CVwLPKeqfjOuWm+kiRx+XYfcvuk2y9s3y9sGbt+0c/umV7fbNrYTDCRJkrT2vIKBJElSxwxrkiRJHTOsSZIkdcywtgaSbJ3kYcPtjZNsNumapPVBkmfMm94gySGTqme5eTk9aTKSbLLAvNtMopZVMawtUZK/pA0p8q5h1p2A/5pcReOR5E5JHjHc3jTJzSdd03JJcrskhyf5xDC90/wQMK2S3GS4Isis2iPJ8UnukOSewMnAzLw3gZVJ/mWB6yfPhCSnJXlOkltOupbllOSjSY5b7GfS9S2XJLddYN7dJ1HLGJya5EFzE0meAHxpgvUsyLC2dM8DHgRcBlBV/wvc4A08zZI8i3aN1ncMs7YGPjK5ipbdv9OGktlqmP5f4KCJVbOMquq3wNeSbDvpWsahqp5Eu47wWcDxwEFV9aLJVrWs7k17P74ryclJDkhyi0kXtYz2pX3uTk1yZJI/Gq4DPe1eC7wO+D7wK+Dfhp/LgVnaefqfJE+cm0jyQuDDE6xnOT0JePOws/QB4FnAwydc0w04dMcSJTm5qh6U5Iyquu9w2OLMqrrXpGtbLknOBHYBTqmq+w7zzpqVbUxyalU9YO5vOMw7s6ruM+nalkOSzwIPoA0gfcXc/Kp63MSKWiZJduC6sPZ7tDEYX1BVV060sDFI8hDgg8AWtJ2nV1bVyslWtTyS3AT4E+BtwG+BdwNvrKqLJlrYWkryhap6yOrmTask
d6CNQXYVcDvgW8ALq+ryiRa2TJI8Hngf8EvgIT1+3sZ5bdBZc1KSvwU2GfqtPQf42IRrWm6/rqqr53Z4h0uAzVKavyLJrRm2aWj6vnSyJS2rV0y6gDH6KHBgVX1maJF5Ae0qKfeYbFnLY9j5ewzwdGA7WmvNB4A/oLUk3m1ixS2TJPembd8fAx+ibd/uwGeBad9h2jLJnavqHIDhyjtLukD3NKiq85N8EngxLWS/eIaC2uHAXWit23cDPprkLVX11slWdn2GtaX7W+AA4NvA82mH096xyntMnxOT/D2waZJHAn9F+yc5K15A63d4lyQn0b5M955sScunqk6cdA1jtEtVzXVBKOB1s9QnCPgu8DngX6pqtL/MsUNL21RLcjpwCe2qNQdX1a+HRack2W1ylS2bvwY+n+ScYXo74P9MrpzlleTTwPnAPWndY949tBzOQleEbwDPHL5Xvj/sxL9+wjXdgIdBl2DY6313Ve2/2pWn2HCI4hnAo4DQAum7agbeJMO2PYh2iPDutO37TlVdM9HCltHwJfNm2mHCjWiXebuiqqa+71OSmwIvBLatqmcNh0XvXlUz0bqdZPeq+uK8ebtV1UmTqmk5jbY6zaokGwM7DpPfHgmkUy/J46vqIyPTK2ita6+cYFnLJsmmtO+W70y6lsUY1pYoyaeAx8zSP/f5hqFIrpq7DusQUjeelX5BSb5cVQ+edB3jkuQ0WkfuY4CdgacCO1TV30+0sGWQ5CjgdOCpVXXP4cv1yzPU3/CrVXW/1c2bZkkeQzts/buhEqrq0MlVtHyGnYkXAHeaxZ0JaCMF0L5PPjN8/lZU1S8nXdfaSvJY2okiG1XV9knuAxzaW19fD4Mu3Tm0M2L+i+t33n7T5Epadv8NPIJ2JhPApsCngF0nVtHy+tRwWvZ/zkJr4UKqamWSDYbA/Z4k3Z2CfiPdpar+PMl+AFX1q1k4mzDJg2mfry2TvGBk0S1oLaMzIcnbgZsCD6MNf7Q3rZV7VryHtjMxtzN4Lm2naSbC2jBSwAHArWj9u7YG3g7sMcm6lsnLaSfWfR6gqs4c+hx2xbC2dBcCn6Z94dx0wrWMyyajnUar6vJhj3FWvADYDLg2yVW0Q6E1C4cJB1cm2Qg4M8lraH1MZmXg5quHvfm5k0PuAszCYaaNgJvRvotHx427jBnqTwnsWlX3TvL1qnpFktcB/znpopbRTO5MjHgOw0gBAFX13YXGXptS11bVpfP+XN3tzBvWlqiqXjrpGtaBK5Lcr6q+CpDk/rSxg2ZCVc3SIKoL+Qva2IkH0jo8bwM8YaIVLZ9DgE8C2wxjIe0GPG2iFS2D4aSQE5P8e1X9cNL1jNHc98iVSbYCfgF013qxFmZ1Z2LOLI8U8I0kTwI2GA5fP48OB8U1rC3RcDbMDd6cVfWoCZQzLgcBxyQ5b5i+A/DnE6xnWS12Vl1VfWFd1zIOVfXD4R/GHapqpobxqKpPJ/kq7SSRAM+vqp9PuKy1luQNVXUQ8JYkC32/dNVvZi18LMkWwL8AX6V9l75r1XeZKjO5MzFilkcKeC7wElq4/iDtxLruTpzwBIMlSvLAkclNaC0Wv66qv5lQSWORZEOuO1vy27N0QkWS0S+XTWjN+qdXVXejVd8Y09JRdk0kWWUH+7lW4GmV5P5VdXqShy60fBaHYxnOmtykqmZpjEOGMRzndiZOnoWdiTmzPFLAtDCsrYUkJ1bVgl+y0yrJrrQxgn7X6lpV751YQWOUZBvgNVW136RrWQ7DWFYPBz4/coWGr1fVvSdb2Y2X5HPDzU1oZ7h+jfbP4t60K23sPqnatHpJ/mxVy6tqJvqtDf3TngzcuaoOTbvs2+2rapZOopgpw877ogGot51cD4MuUa5/nb6bAPenHSacGUneRzvT50zgN8PsAmYyrNHO2LrnpItYRgt1lJ1qVfUwgCRHAgdU1VnD9D2BqR+QM8lZrPofxtQG7cFjh9+3pZ31+tlh+mG0s+9mIqwB/0ob
2f/hwKG0yxZ9iHb5t6k14+/P1w6//wy4PfD+YXo/4AeTKGhVDGtLdzbtTRvgWtqFe5810YqW387ATrPatJ3kzVz3xXMT2iVuvja5ipbdVHSUvZF2nAtqAFX1jeEw77T7k+H3c4bf7xt+PxmY+vENq+rpAEk+RvtuOX+YvgPQ1eV81tIDq+p+Sc4AqKqLhzOzp93Mvj/nuhgkeeW8a7h+NEl3/ZgNa0t35/n9t4YzYmbJN2h7GOdPupAxOW3k9rXAB2dlhPjBaEfZ/6D1K3nVRCtaPt9K8i7a3m8BT6FdTHqqzZ0BOlytYPSySwenXRJtJgaNBbabC2qDnzID1zsdcc0wiPjc2aBb0lraptp68v6ciuu6zlrYGKdTgPmdnb+ywLxpdhvgm0m+wshp570du7+xquqIudtJbkkb2mKW3LmqXkILbLPm6cD/pV2XF+ALwNsmV86y22z0klND39FZGSMP2nUzT6CdbVe0K218btV3mSpvAj4M3C7JP9DGyPt/ky1pWc3y+3MqruvqCQarMQz8dwfgSOCJtMOg0EYYf1dV7bjYfafNrJ+RluTzwONoOyln0gY6PrGqXrCq+02LJF+kDbL678B/VNUlk61ISzWMafhuYPNh1iXAX0772a6jkvwpMHe46QtV9eFJ1rPckuzIdSP6f7aqpr7ld86svz8zBdd1NaytRpKnA39J69905siiXwLvqapjJlKY1liSM6rqvkmeCWxTVYdM+9mS8yW5G60Vah9ay+97qurTk61q7SXZjXZZmDtx/TOV7zypmsZhOJEpszasxepkBq7bOwwzszut5fCkWQkyo2b1/TkNoyAY1pYoyROr6uhJ1zFOSR4EvBn4PVoLzQbAFbNyOabhzKZHAUcAL6mqU2ctrAEMfWceTzs0cxmtNfjvp3mYhCTfph2uOJ3rzlSmqn4xsaKWQZKnVNX7c/3rgv5OVb1+Xdc0CXM7UpOu48ZK8jLaDtKHaJ+3xwPHVNVM9BkdWp6ewA0DzdT3WVtsFISqet7kqroh+6wtUVUdneSPgHvQxnyam/+Pk6tq2b2F1pfkGNqZoU8FdphoRcvrUFqn+y8OQe3OwHcnXNOySXJvWqvaY2jXsX1sVX11uLzPl5nuYRIurapPTLqIMZjr9zPrl0JbnWlvNdgPuG9VXQWQ5J9pV2qYibAG/BdwKW1nqbtDhGtpKkZBMKwtUZJ/Bbag9bl4D20v4+SJFjUGVbUyyQZV9RvgPUlmZegHhkPWx4xMn8PsXDsTWtj+N1or2u+u6VpV5yWZ9s7On0vyL7TAOXryy1Qfaqqqdwy/Z+ryYOuhH9B24q8apjcGvjexapbf1lW156SLGJOpGAXBsLZ0u1fVvZN8rapemuQ1tCbvWXLlMDbQmcP2nc/snPHDsE2vol1U+pPA7wMHVdX7V3nHKTFvrKD5y96X5ENVNa3hdO5ybzuPzCvaIKRTbxgu4Lnc8DDTTJyJvQTTPpLzr4GzR64h/Ujgi0neBNDbIbUb4UtJ7jU61uEMmYpREAxrSze3x3RVktsDv6B9sc6Sv6ANFnsgrX/QNrTRnWfFo6rqb4ez0s6l9TH5HNeNXD3rprYz/tyVDGbYR4DDaRfHnvrxuUYNfShPqKpHrGK1v1hX9YzJh4efOZ+fUB3jsjvwtCTfpwWa0Pp1zUJ/35dPuoClMKwt3fFJtqBdomKuI+IRq77L1Hl8Vb2RFkxfAZDk+cAbJ1rV8tlw+P3HtAFxL5qlSzMtQdd9MhayWMf7OTPUAf+qqnrTpIsYh6r6TZIrk2y+2FmEVfWNdV3XcpobwzHJhrRL2P2kqn422aqW1aMnXcC4TMvQVIa1JUhyE+ATw7hVxwyXTtm0qi6acGnLbX9uGMyetsC8afXR4azCXwF/NYwyftVq7qPJWl863r8xySHAp5ihPnkjrgLOGg4TXjE3c9oPDyZ5O/Dmqjo7yea0E3l+A9wqyYuq6oOTrXDtJLnVcPOXEy1kDJL8koV3YOda
DbsaBcGhO5YoyclV9aBJ1zEOSfYDnkRr6v6fkUW3oF0cfFWHL6bKcOWCy4a9/c2Am1fVBZOua12Y9uERViXJi6vqnyZdx42V5J9ohwK/x3WHQauqZqVP3v4LzR+9qsg0SnJ2Vd1juH0Q8IdV9fihq8wnpv3zNhz2nLsm9nw1a+McLiTJLavq4knXYcva0n06yV5V9V+TLmQMvkQ7meA2wOtG5v8S+PpEKhqDJDelXZB4W+AAYCvg7sDHJlnX2kry31W1R5JXV9XfrWLVVS2bdvsAUxvWgD+lXS7s6kkXMg7THspWYfTv9UiGs82r6oJZ6GJRVdsvZb0k96iqs8ddz4T8Nx1cVtKwtnQHApsn+TXtMNpcU+mtVn23/g0X6/1hkkcAv6qq3w4j4e8IzNLZP++hjRO06zB9Lu3LdarDGnCH4VJhj0tyJPP2gucOpVXVpyZR3Doy7f8Zv0YbGmiW+jn9zkgLzfXMQMvMJUn+BPgJsBvwDIAkK4BNJ1nYOvY+Ogg0Y9LFd4thbeluM+kC1oEvAH8wHCr8b+A04M+BJ0+0quVzl6r68+GwL1X1q8zC7i+8DDgY2BqY3+F+Zoa3WI1p789xO+DbSU6l4+ED1sLokCub0FpCp35Hl3bB7zfRxuk6aKRLxR7AxydW1bo3C9+ji+niu8WwtkRDH6d9aYcq/jHJ1rQv2NMnXNpySlVdmeQZtE6zr0lyxqSLWkZXJ9mU4cOX5C7MwGjcVXUscGySl1bVKyddz4RM+z+LQyZdwDgtcFmwNyT5Im1HY2pV1f8CNxgstqpOoF0tBZj+PpVL0EWgmWWGtSVK8hba0A8PAf4RuBJ4O/CASda1zJLkwbSWtGcM82bpPXIIbTDcbZJ8gHbY4mkTrWgZVdUrkzyO9h4F+HxVTfsh3qU6ZvWr9Gtahg+4sYaLnM+5Ca2lbX050xemv0/l+qyLHcFZ+kc8brtW1f3mWpqGMbo2mnRRy+wg4MXAh4dT0e9MGzR26g2HO79NG+T3QbQP4POr6ucTLWwZDWcU7gJ8YJj1/CS7VdWLJ1jWWhmuOnFOVb193vy/Bm4/d0LFtF+jd94wAhvRdgyv6G34gLUweuLStbTLMz1xMqVMRBf/8Mdoqk+MGQZuvh3Xv3rIj4abe0ykqHkcumOJkpwCPBg4bQhttwY+M+2nZq9PkpxeVfefdB3jkuTrwH2q6rfD9AbAGdM8yniSbwL3nNumkfk3Ab5eVfecTGXjleTxwC5V9feTrkVrL8lXq2pqO+AnObSqXjYyvQHw3qqa+v7MSZ5LO+ryU64/bE5X35u2rC3dW2nXAt0yyStoe4UzcfHlJG+oqoOSfJSFz9ialU7OJyd5QFWdOulCxmgLYG6w5s0nWcgyqflBbZj52xk5OWRBVfWRJAdPuo7lMgwYewjXHaI/ETh0sSsazKBpf69uO9fvLsnGtG4HszJg8/OBuy/Qr7IrhrUlqqr3JjkdmBsgdp9pv0TKiPcNv1870SrG72HAs5P8gDaK+ixd3w5an5gzknyOtm0PoR3WnmZXJtmhqr47OjPJDrQhdGZCktFr8M716Zqlwx7vBr7BdYc+/4I2lM4sXXt4Vaa6TyXwdOADSV5M+x79RFUdNuGalsuPge53GjwMugaS3Js2yn8BJ1XVzAwYO2e4BBNVdeGka1luSe600PxhnLmZkOQOtJNeApwyenWGaRy4MsmjgTcDr+K6M693poXQg6rq+EnVtpySvGdkcq5P17/NyvUlk5xZVfdZ3bxpk+TNrCJUz8DltEYP3W4IvAM4CTgcZuNyaEkOpw2O/nGuP2xOV9cdtmVtiZK8hHZJpg/T/hH+R5IPzMLp2MPhpENoA/8GuEmSa2nDdxw60eKWQZJNgGcDd6UN8nt4VV072arGo6rOB45bZPHUDVxZVZ8Y+m/9DfDcYfbZwBOqamYGbK6qp0+6hjH7VZLdq+qLAEl2YzZaRk8bfu8G7AQcNUzvw2wM6/S6edMX07bzdczO
GI4/Gn42Gn66ZMvaEiX5FnD/qrpymL4pcHpV/d5kK1t7w5l1fwwcUFXfH+bdGXgb8Mlpb+5OchRwDe26p48GflhVz59sVeveLF8bdFrNesvMnCT3AY7gun6UFwP7z8rRiaHrwaOq6pphekPgU1X1sMlWpqVKcnNat5jLJ13LQmxZW7ofcv3XawVwzoRqWW5PBR45OoxFVZ2T5CnAp4CpDmvATlV1L/hdk/dX/n979x4sZ13fcfz9IVwLGOQ+FLmlAoY0oUDKJQiIFXW0XIoRwRaotko7FmumndLWgtSOQhnpAE47gyggg0BA2iKWyzQCwUBQIIGECoXhUkEZEQhELpLAp388zyHnnOzuOUk257ns5zVz5uw+u3v2u8lJ9re/5/f7fCuupyqN+2QmqdssIdCKzS/3Drt8Nu0Nx/0J8M/AFIpNMC8Bx9Ke3sM7UeTGDW3u2aI81gqSdqDIF93J9oclTQUOtv3NiktbZ5KmUZx12Lq8/kvg5LotGclgbfxeBR6SdAvFm95RwA8lnQ9ge06Vxa2jjTrljdl+rvyE2HQrhi7YXtniTYRtdDDFAuCrgHto/q66EYY3OJf0ly1ueP6fwDKKHYTPVFzL+nAOqzb3ABwOfKm6cvruMooNIX9fXv9filO+jR+sARcDc2zfBiDpCOAbrOohXQsZrI3f9xnZ621hVYWsB70CDRsddliaIenl8rKAzcrrQ7tB2xI8OpYm/l3uCHwAOJFizej3gavq9qm3Txo387kGdra9WlumtrB9qaSbgAPLQ2cM39zTAtvanlvuBh360Ptm1UX1yeZDAzUA27dL2rzKgjrJYG2c2jDd28Pwwcxwomi63Gi2J1Vdw0QYK7jS9kGVFbeWbL9J0SLs5jLf6UTg9vK1XlRtdbEG7pL0223aFNLBJOA5ivfVPSXtaXt+xTX1yytlEPxQX+WDaEDcxTg9LukfWBVh9YfAExXW01EGa+Mk6UPAl4FdKf7chmZltq60sD4YlMHMAGhlcGX5Wj5CMVDbDbgQuL7KmvplVJup3xg1A9z4WV9JSyhe34bAH0t6nCIeoVUZh5LOBU6g2Kn8dgo+0JbB2hyKXeZTJC0AtgM+Vm1JffMpivWi11P8Xs6nyJWrlewGHSdJj1EEOi5h1T/GoU/+EZUrI1iupPgdbUVwpaTLgWnATcDVLQqiHgjdsg2HtCXjUNIjwHTbvx7zzg0laUOKPDIBjwztfI2JkcHaOEm6HTiyU+ubiCq1ObhS0lsU3SZg5JquVsw8RTuU69Vm1zX2YV2VUVVzgF1t/2nZQWQv2zdWXNpaa1qbxQzWxknS71Jsq7+dkSnHF1ZVUwS8nfHUjW03Nrgy2XDRBJK+C8wA5jHy/aEtOXnXUIT8nmx7mqTNgLub3IFC0v6275N0eKfbbd8x0TX1kjVr43c2RQTEVgw7DRpRtZYHb+bTZDTBDXTvHNIGU2yfIOlEANuvqeEZSLaHOkzsa/uC4bdJ+jyQwVpDbW97/6qLiOimpcGV20vqmmFYt/59MZhanI835I1yNm1oN+gUhs0gNtwpwAWjjp3a4VilMlgbv3mSjrT9g6oLiejiMtoXXDmJIg2+0Z/io50kzbX98WG7Xkdoy25XioDfm4F3SbqSohfqqVUWtK7KWcKTgN1HdUrZEni+mqq6y5q1cZL0IkVfu1cpwkVbE90R7SDpx7ZnDl/nJWlxw9eV3G+7Uc3nY3CUIbH/TdHrdLXdkW3Z7QpQ5qwdRPHet7BT15smKXcq7w58FThj2E3LgQdtr6yksC4yszZ+21ZdQMQY2hhcmRm1qLNtKE6X7U3R5/Quip3Yd9t+odcDm0TSFRT5Y3fafrjqevqhHEg/JemTwM9svw5Qnu7dGXiywvJWk5m1NSDpE8Aetr8iaWdgh2GLFCMqVUZ4XESRS7aUMrjSdmObZUvauk1vetFOkjYGDqDoJ3lw+bXM9tRKC+sTSUcChwLvBfYAFgPzRy/MbyJJ9wKH2H6jvL4x
sMD2zGorGymDtXGS9HWKDKvDbL9H0tbALXX7C43BluDKiIknaTLFAG1W+X0rYInt2iXhr62yfXbpWBcAAAm+SURBVN1MisDt04DXbO9dbVXrrtNSEUkP2J5RVU2d5DTo+B1iez9JiwBsv1COwCNqoVNwpaRGB1dG1Jmki4F9KNY53UNxGvR82y9WWlifSZoHbA7cDdwJzLT9i2qr6pvnJB1t+wYASccAtVuPt0HVBTTICkkbsGo90DYkby3q5VKKzS8Hl9efBv6punIiWm8XYBPgWeAZin9zyyqtaP14kOL/lmnAdGAoGLcNTgP+TtJPJf0f8DfAZyuuaTU5DToGSRvaXinpZOA4inUJ36LoE3q27asrLTCiJOle2weM2g1au+n8iDYpw2H3oVivdgjFgOYFik0GZ1VZW79J2oKiyflfATva3qTikvqmfG2yvbzqWjrJadCx/QjYz/a3Jd0H/B7FeqDZaSodNdPm4MqIWnIx47FU0jKK3dcvAR8FhloUNp6kz1FsLtgfeIpiwuLOSovqk6aEiWewNra3owNsPwQ8VGEtEb2cRcuCKyPqTNLpFLNpsyhy1hZQrOv6FrCkwtL6bTPgfOC+Tvljkt7Z4HV6l9GAMPGcBh2DpKcpfkk7SrubqIsyC2kJ8BrwOHBP04MrI+pM0vmU2Wq2f151PVVpcnh1U8LEM7M2trS7iaa4lCIL6QOUWUiSWpGFFFFHtrv2rR0wTX5/bESYeGbWxtDkTwwxeNqahRQR9dXk98mmhIlnZm1sTf7EEAOk5VlIERF9VcZxbQocTs3DxJOzNrb3V11AxDi1OQspIiaYpN3He9f1Wsh6Yvst4Gu2V9p+yPbSOg7UIIO1MaUvYTSF7S/YPowiD/B5ijVsbQzojIiJcR28PWvfS5MnNW6VdHyZl1dbWbMW0RIdspDmA3fa/kGlhUVEI5XtFf8D+BPgX0bf3oY0BEnLKZaPrARep5gltO13VFrYKFmzFtEePbOQIiLW0CeAYynGCltWXMt6YbsRryszaxEREdGVpA/bvqnqOvpJ0t62Hy53g67G9v0TXVMvGaxFREREV5ImU3RIOaw8dAfwj7Zrl0c2XpIutv0ZSbd1uNm2j5zwonrIYC0iIiK6kvRdigyyy8tDfwTMsP0H1VU1WDJYi4iIiK46tV+qY0umNVV2LjgJGAoO/wnwnTqmQCS6IyIiInp5TdKhQ1ckzaLoQdxYkt5DMVu4P0Xz9kcpur8slVS7ri+ZWYuIiIiuJM0Avg1MLg+9CJxSt5ZMa0LSdcBc23NHHT8eOMn28dVU1lkGaxERETEmSe8AsP3yqOOn2L6886PqSdIjtvda09uqktOgERERMSbbL48eqJU+P+HFrLtX1vK2SiQUNyIiItZFrVs1dbG9pDkdjgvYbqKLGUsGaxEREbEumrie6ht078pwyUQWMh5ZsxYRERFrTdIi279TdR3rg6S/tf3VquvImrWIiIjoStKkMe6yYEIKqcbsqguADNYiIiKit8cknSdpaqcbbX9uoguaQLVYj5fBWkRERPQynSI49hJJCyV9ZijGYwDUYq1Y1qxFRETEuEg6DLgK2Aq4Dviy7ceqrWr9qct6vMysRURERFeSJkk6WtK/AxcAXwP2AL4H/Felxa1/11ZdAGRmLSIiInqQ9DhwG/BN23eNuu1C26dXU9nak3QRPU5x1u01JWctIiIieplu+1edbqjboGYN3Ft+nwVMBa4pr88G7qukoh4ysxYRERFdSdoU+DSwD7Dp0HHbn6qsqD6RdBtwlO0V5fWNgFttv6/aykbKmrWIiIjo5QpgR+CDwB3AzsDySivqn50Y2clgi/JYreQ0aERERPTyW7ZnSzrG9uWSvgPcUnVRfXIOsKicYQM4HPhSdeV0lsFaRERE9LKi/L5M0jTgWWC36srpH9uXSroJOLA8dIbtZ6usqZOcBo2IiIheLpb0TuCLwA3A/wDnVltSX00CngNeBPYss+RqJRsMIiIi
YjWS5nQ6XH637fMnsp71QdK5wAnAQ8Bb5WHbPrq6qlaX06ARERHRydDC+72AmRSzagC/D8yvpKL+OxbYy/avqy6kl8ysRURERFeSbgWOt728vL4lcK3tD1Vb2bor16vN7pYjVxeZWYuIiIhedgHeGHb9DVqywQB4FVgsaR7w9uxa3cJ+M1iLiIiIXq4AflT2BjVwHHB5tSX1zQ2sOr1bWzkNGhERET1J2g94b3l1vu1FVdYzaDJYi4iIiIEiaa7tj0taQoeG7ranV1BWVzkNGhEREYNmkaSZFKd0V4x156plsBYRERGDZhvgAmBv4EHgLmABcLftF6osrJOcBo2IiIiBJGlj4ADgEODg8muZ7amVFjZKZtYiIiJiUG0GvAOYXH79DFhSaUUdZGYtIiIiBoqki4F9gOXAPcBCYKHtFystrIs0co+IiIhBswuwCfAs8AzwNLCs0op6yMxaREREDBxJophdO6T8mga8QLHJ4Kwqaxstg7WIiIgYWJJ2BmZRDNg+Cmxje6tqqxopg7WIiIgYKJJOpxiczaLIWVsA3F1+X2L7rQrLW012g0ZERMSg2Q24DviC7Z9XXMuYMrMWERERUWPZDRoRERFRYxmsRURERNRYBmsR0RqS3pS0eNjXbmvxM7aS9Of9ry4iYu1kzVpEtIakX9neYh1/xm7AjbanreHjJtl+c12eOyKik8ysRUSrSZok6TxJP5b0oKTPlse3kDRP0v2Slkg6pnzIOcCUcmbuPElHSLpx2M/7uqRTy8tPSjpT0g+B2ZKmSLpZ0n2S7pS0d3m/2ZKWSnpA0vyJ/ROIiKZLdEdEtMlmkhaXl5+wfRzwaeAl2zMlbQIskHQr8FPgONsvS9oWWCjpBuAMYJrtfQEkHTHGc75u+9DyvvOA02w/KulA4F+BI4EzgQ/afkZSrcI2I6L+MliLiDZ5bWiQNcxRwHRJHyuvTwbeTdEL8CuSDgPeAn4T2GEtnvMaKGbqKEI2ry262ABF70EogjYvkzQXuH4tniMiBlgGaxHRdgL+wvYtIw4WpzK3A/a3vULSk8CmHR6/kpFLRkbf55Xy+wbAsg6DRWyfVs60fQRYLGlf28+vzYuJiMGTNWsR0Xa3AH8maSMASXtK2pxihu0X5UDtfcCu5f2XA1sOe/xTwFRJm0iaDLy/05PYfhl4QtLs8nkkaUZ5eYrte2yfCfwSeFf/X2ZEtFVm1iKi7S6haC1zv4rzk88BxwJXAt+TdC+wGHgYwPbzkhZIWgrcZPuvy9OXDwKPAot6PNcngX+T9EVgI+Bq4AHgPEnvppjlm1cei4gYl0R3RERERNRYToNGRERE1FgGaxERERE1lsFaRERERI1lsBYRERFRYxmsRURERNRYBmsRERERNZbBWkRERESNZbAWERERUWP/D5vaGBW2b61dAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the first ten rows of the GBT feature-importance table (column vs. weight)\n",
+ "gbt_top_features = feat_imp_tuned_gbt_t_new.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10, 10))\n",
+ "sns.barplot(x='column', y='weight', data=gbt_top_features)\n",
+ "\n",
+ "# Feature names are long, so the tick labels are drawn vertically\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from GBT Best tuned\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Trees Binary Classification Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Baseline decision tree: only the label/feature column names and the seed are set,\n",
+ "# every other hyper-parameter keeps its library default\n",
+ "dt = DecisionTreeClassifier(\n",
+ "    featuresCol=\"features\",\n",
+ "    labelCol=\"label\",\n",
+ "    seed=42\n",
+ ")\n",
+ "\n",
+ "# Stages run in list order; the classifier is the last stage\n",
+ "dt_pipe = Pipeline(stages=[label_stringIdx, va, dt])\n",
+ "\n",
+ "# Fit all pipeline stages on the training data\n",
+ "dtModel = dt_pipe.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Makes prediction from our test set\n",
+ "pred_dt = dtModel.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ROC AUC: 0.6639903304724255\n"
+ ]
+ }
+ ],
+ "source": [
+ "# prints the ROC AUC score\n",
+ "print(\"ROC AUC:\",evaluator.evaluate(pred_dt))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 203,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5"
+ ]
+ },
+ "execution_count": 203,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel.stages[-1].getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 204,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1"
+ ]
+ },
+ "execution_count": 204,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel.stages[-1].getMinInstancesPerNode()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 205,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "32"
+ ]
+ },
+ "execution_count": 205,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel.stages[-1].getMaxBins()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7156021818664244\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Accuracy of the base model on the test set.\n",
+ "# Labels and predictions are read from the same scored DataFrame in a single collect,\n",
+ "# so each prediction is compared with the label of its own row, and the row count no\n",
+ "# longer depends on `true_labels`, which is only assigned in a later cell.\n",
+ "binary_rows = pred_dt.select(\"Severity\", \"prediction\").collect()\n",
+ "\n",
+ "binary_true_labels = [row[\"Severity\"] for row in binary_rows]\n",
+ "binary_prediction = [row[\"prediction\"] for row in binary_rows]\n",
+ "\n",
+ "# 1 where the prediction equals the label, 0 otherwise\n",
+ "binary_matches = [int(actual == predicted) for actual, predicted in zip(binary_true_labels, binary_prediction)]\n",
+ "\n",
+ "print(\"Accuracy:\", np.sum(binary_matches) / len(binary_matches))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Project the prediction column before converting, so the remaining columns of\n",
+ "# pred_dt are not shipped to the driver just to be discarded\n",
+ "prediction_dtb = pred_dt.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Only the Severity column is needed, so select it before converting to pandas\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.74 0.89 0.81 131571\n",
+ " 1 0.61 0.37 0.46 64408\n",
+ "\n",
+ " micro avg 0.72 0.72 0.72 195979\n",
+ " macro avg 0.68 0.63 0.63 195979\n",
+ "weighted avg 0.70 0.72 0.69 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_pred=prediction_dtb,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Show every row when a DataFrame is displayed\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "# Pair each non-target column name with the importance reported by the fitted tree\n",
+ "# (last pipeline stage). Assumes the feature vector was assembled from these columns\n",
+ "# in this order - TODO confirm against the VectorAssembler definition.\n",
+ "dtb_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "dtb_importances = dtModel.stages[-1].featureImportances\n",
+ "dtb_pairs = list(zip(dtb_feature_names, dtb_importances))\n",
+ "\n",
+ "# Largest weights first\n",
+ "feat_imp_tuned_dtb = pd.DataFrame(dtb_pairs, columns=['column', 'weight'])\n",
+ "feat_imp_tuned_dtb = feat_imp_tuned_dtb.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the ten largest feature-importance weights of the base decision tree\n",
+ "# (feat_imp_tuned_dtb is sorted by weight, descending)\n",
+ "dtb_top_features = feat_imp_tuned_dtb.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10, 10))\n",
+ "sns.barplot(x='column', y='weight', data=dtb_top_features)\n",
+ "\n",
+ "# Feature names are long, so the tick labels are drawn vertically\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Binary Base Model\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Tree Binary Classification Grid Search"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Decision tree to be tuned; seeded like the base model so reruns are repeatable\n",
+ "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\", seed=42)\n",
+ "\n",
+ "# Same stage order as the base pipeline, with the new tree as the final stage\n",
+ "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n",
+ "\n",
+ "# Candidate models are scored by area under the ROC curve\n",
+ "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n",
+ "\n",
+ "# Wider search grid, currently disabled:\n",
+ "#grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n",
+ "# Grid in use: a single parameter combination\n",
+ "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10]).addGrid(dt_new.minInstancesPerNode, [500]).addGrid(dt_new.maxBins,[50]).build()\n",
+ "\n",
+ "# 5-fold cross validation; the seed fixes how rows are assigned to folds\n",
+ "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator, seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dtModel_t = cv1_dt.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pred_dtt = dtModel_t.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC ROC: 0.5687880524415173\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"AUC ROC:\",evaluator.evaluate(pred_dtt))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7307313538695472\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Accuracy of the grid-search model on the test set.\n",
+ "# Labels and predictions are read from the same scored DataFrame in a single collect,\n",
+ "# so each prediction is compared with the label of its own row, and the row count no\n",
+ "# longer depends on `true_labels`, which is only assigned in a later cell.\n",
+ "binary_rows = pred_dtt.select(\"Severity\", \"prediction\").collect()\n",
+ "\n",
+ "binary_true_labels = [row[\"Severity\"] for row in binary_rows]\n",
+ "binary_prediction = [row[\"prediction\"] for row in binary_rows]\n",
+ "\n",
+ "# 1 where the prediction equals the label, 0 otherwise\n",
+ "binary_matches = [int(actual == predicted) for actual, predicted in zip(binary_true_labels, binary_prediction)]\n",
+ "\n",
+ "print(\"Accuracy:\", np.sum(binary_matches) / len(binary_matches))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='DecisionTreeClassifier_6cf0199ad377', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='featuresCol', doc='features column name'): 'features',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='labelCol', doc='label column name'): 'label',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 50,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 500,\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='predictionCol', doc='prediction column name'): 'prediction',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction',\n",
+ " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='seed', doc='random seed'): -3198175077911245588}"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel_t.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10"
+ ]
+ },
+ "execution_count": 179,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel_t.bestModel.stages[-1].getMaxDepth()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 180,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "500"
+ ]
+ },
+ "execution_count": 180,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel_t.bestModel.stages[-1].getMinInstancesPerNode()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "50"
+ ]
+ },
+ "execution_count": 181,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dtModel_t.bestModel.stages[-1].getMaxBins()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Project the prediction column before converting, so the remaining columns of\n",
+ "# pred_dtt are not shipped to the driver just to be discarded\n",
+ "prediction_dtbt = pred_dtt.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Only the Severity column is needed, so select it before converting to pandas\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.77 0.85 0.81 131571\n",
+ " 1 0.61 0.49 0.54 64408\n",
+ "\n",
+ " micro avg 0.73 0.73 0.73 195979\n",
+ " macro avg 0.69 0.67 0.68 195979\n",
+ "weighted avg 0.72 0.73 0.72 195979\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_pred=prediction_dtbt,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Show every row when a DataFrame is displayed\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "# Pair each non-target column name with the importance reported by the best\n",
+ "# cross-validated tree (last stage of bestModel). Assumes the feature vector was\n",
+ "# assembled from these columns in this order - TODO confirm against the VectorAssembler.\n",
+ "dtbt_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "dtbt_importances = dtModel_t.bestModel.stages[-1].featureImportances\n",
+ "dtbt_pairs = list(zip(dtbt_feature_names, dtbt_importances))\n",
+ "\n",
+ "# Largest weights first\n",
+ "feat_imp_tuned_dtbt = pd.DataFrame(dtbt_pairs, columns=['column', 'weight'])\n",
+ "feat_imp_tuned_dtbt = feat_imp_tuned_dtbt.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the ten largest feature-importance weights of the grid-search decision tree\n",
+ "# (feat_imp_tuned_dtbt is sorted by weight, descending)\n",
+ "dtbt_top_features = feat_imp_tuned_dtbt.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10, 10))\n",
+ "sns.barplot(x='column', y='weight', data=dtbt_top_features)\n",
+ "\n",
+ "# Feature names are long, so the tick labels are drawn vertically\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Binary Grid Model\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.4"
+ },
+ "latex_envs": {
+ "LaTeX_envs_menu_present": true,
+ "autoclose": false,
+ "autocomplete": true,
+ "bibliofile": "biblio.bib",
+ "cite_by": "apalike",
+ "current_citInitial": 1,
+ "eqLabelWithNumbers": true,
+ "eqNumInitial": 1,
+ "hotkeys": {
+ "equation": "Ctrl-E",
+ "itemize": "Ctrl-I"
+ },
+ "labels_anchors": false,
+ "latex_user_defs": false,
+ "report_style_numbering": false,
+ "user_envs_cfg": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/RF_DT_GBT_LR_Binary_Bal.ipynb b/RF_DT_GBT_LR_Binary_Bal.ipynb
new file mode 100644
index 0000000..bd8fd9a
--- /dev/null
+++ b/RF_DT_GBT_LR_Binary_Bal.ipynb
@@ -0,0 +1,2548 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 214,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:62046)\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 958, in _get_connection\n",
+ " connection = self.deque.pop()\n",
+ "IndexError: pop from an empty deque\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1096, in start\n",
+ " self.socket.connect((self.address, self.port))\n",
+ "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n"
+ ]
+ },
+ {
+ "ename": "Py4JNetworkError",
+ "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:62046)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 958\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1095\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1096\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1097\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mevaluation\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mBinaryClassificationEvaluator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeature\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOneHotEncoder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOneHotEncoderModel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mStringIndexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mVectorAssembler\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[0mspark\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 27\u001b[0m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msparkContext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\sql\\session.py\u001b[0m in \u001b[0;36mgetOrCreate\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[0msession\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_options\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jsparkSession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msessionState\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetConfString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 189\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1282\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1283\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1284\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1285\u001b[0m return_value = get_return_value(\n\u001b[0;32m 1286\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \"\"\"\n\u001b[1;32m-> 1012\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1014\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 958\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 960\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 961\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 962\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 964\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 965\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 966\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 967\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 968\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1106\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1107\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1108\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1109\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1110\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:62046)"
+ ]
+ }
+ ],
+ "source": [
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql import Row\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from pyspark.sql.types import *\n",
+ "from pyspark.sql.functions import *\n",
+ "import matplotlib.pyplot as plt\n",
+ "from pyspark.sql import functions as fn\n",
+ "from pyspark.ml import feature, regression, evaluation, Pipeline\n",
+ "import seaborn as sns\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "from pyspark.ml import Pipeline\n",
+ "from pyspark.ml.regression import LinearRegression\n",
+ "from pyspark.ml.stat import Correlation\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "from pyspark.ml.classification import RandomForestClassifier\n",
+ "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n",
+ "from pyspark.ml.classification import GBTClassifier\n",
+ "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
+ "from pyspark.ml.classification import DecisionTreeClassifier\n",
+ "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
+ "from pyspark.ml.classification import LogisticRegression\n",
+ "from pyspark.ml import Pipeline\n",
+ "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
+ "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, VectorAssembler\n",
+ "spark = SparkSession.builder.getOrCreate()\n",
+ "sc = spark.sparkContext\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Do not delete or change this cell\n",
+ "\n",
+ "import os\n",
+ "\n",
+ "# Define a function to determine if we are running on data bricks\n",
+ "# Return true if running in the data bricks environment, false otherwise\n",
+ "def is_databricks():\n",
+ "    \"\"\"Return True when DATABRICKS_RUNTIME_VERSION is set in the environment, else False.\"\"\"\n",
+ "    # os.getenv returns None for an unset variable; `is not None` is the identity\n",
+ "    # test for that case, so a variable set to an empty string still counts as set\n",
+ "    return os.getenv(\"DATABRICKS_RUNTIME_VERSION\") is not None\n",
+ "\n",
+ "# Build the path of a data file for the current runtime environment.\n",
+ "#\n",
+ "# Params\n",
+ "#   data_file_name: The base name of the data file to load\n",
+ "#\n",
+ "# Returns \"/FileStore/tables/<data_file_name>\" when is_databricks() is true;\n",
+ "# otherwise returns data_file_name unchanged, i.e. a path relative to the\n",
+ "# directory the notebook runs in.\n",
+ "#\n",
+ "def get_training_filename(data_file_name):\n",
+ "    if not is_databricks():\n",
+ "        # Local run: the data file is assumed to sit in the same dir as this notebook\n",
+ "        return data_file_name\n",
+ "\n",
+ "    # Databricks run: prefix the file name with the FileStore tables directory\n",
+ "    return \"/FileStore/tables/%s\" % data_file_name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Loading Train data\n",
+ "\n",
+ "us_train_cat = spark.read.csv(get_training_filename('USAccident_train_bal_bin.csv'), header = True, inferSchema = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------------+\n",
+ "|count(Severity)|\n",
+ "+---------------+\n",
+ "| 2|\n",
+ "+---------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of unique labels in Severity column\n",
+ "\n",
+ "us_train_cat.agg(countDistinct(\"Severity\")).show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+--------+------+\n",
+ "|Severity| count|\n",
+ "+--------+------+\n",
+ "| 1|258836|\n",
+ "| 0|263700|\n",
+ "+--------+------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Checking the balance of data in training dataset\n",
+ "\n",
+ "us_train_cat.groupBy('Severity').count().show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Loading the test data\n",
+ "\n",
+ "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_new.csv'), header = True, inferSchema = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+--------+------+\n",
+ "|Severity| count|\n",
+ "+--------+------+\n",
+ "| 3| 58617|\n",
+ "| 4| 5993|\n",
+ "| 2|131790|\n",
+ "+--------+------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Rows per Severity value in the testing dataset (class balance check)\n",
+ "\n",
+ "(\n",
+ "    us_test_cat\n",
+ "    .groupBy('Severity')\n",
+ "    .count()\n",
+ "    .show()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assemble every column except the Severity target into a single 'features' vector column for the model\n",
+ "\n",
+ "va = VectorAssembler(\n",
+ "    inputCols=[name for name in us_train_cat.columns if name != 'Severity'],\n",
+ "    outputCol='features',\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# String Indexer that derives the numeric 'label' column the model trains on from the Severity target.\n",
+ "#\n",
+ "# stringOrderType=\"alphabetAsc\" pins the mapping to Severity 0 -> label 0.0 and Severity 1 -> label 1.0.\n",
+ "# The default (\"frequencyDesc\") gives index 0 to whichever class is most frequent, so labels -- and the\n",
+ "# predictions that are later compared directly against Severity -- would silently flip if class 1 ever\n",
+ "# outnumbered class 0 in the training data.\n",
+ "\n",
+ "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\", stringOrderType=\"alphabetAsc\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Combine severity categories 3 and 4 into label 1 (everything else becomes 0) for train data.\n",
+ "#\n",
+ "# The mapping is applied only while raw severity levels are still present. If Severity is already 0/1\n",
+ "# (the stored value counts for this training file show only 0 and 1), re-applying it would turn every\n",
+ "# label into 0, because neither 0 nor 1 equals 3 or 4. The guard also makes the cell safe to re-run.\n",
+ "\n",
+ "train_severity_levels = {row[\"Severity\"] for row in us_train_cat.select(\"Severity\").distinct().collect()} - {None}\n",
+ "\n",
+ "if not train_severity_levels <= {0, 1}:\n",
+ "    us_train_cat = us_train_cat.withColumn(\n",
+ "        \"Severity\",\n",
+ "        when((us_train_cat[\"Severity\"] == 4) | (us_train_cat[\"Severity\"] == 3), 1).otherwise(0),\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Combine severity categories 3 and 4 into label 1 (everything else becomes 0) for test data.\n",
+ "#\n",
+ "# The mapping is applied only while raw severity levels are still present, which makes the cell\n",
+ "# idempotent: running it a second time on already-binarised data would otherwise turn every label\n",
+ "# into 0, because neither 0 nor 1 equals 3 or 4.\n",
+ "\n",
+ "test_severity_levels = {row[\"Severity\"] for row in us_test_cat.select(\"Severity\").distinct().collect()} - {None}\n",
+ "\n",
+ "if not test_severity_levels <= {0, 1}:\n",
+ "    us_test_cat = us_test_cat.withColumn(\n",
+ "        \"Severity\",\n",
+ "        when((us_test_cat[\"Severity\"] == 4) | (us_test_cat[\"Severity\"] == 3), 1).otherwise(0),\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Binary-classification evaluator: scores predictions against 'label' by area under the ROC curve\n",
+ "\n",
+ "evaluator_rfb = BinaryClassificationEvaluator(\n",
+ "    metricName='areaUnderROC',\n",
+ "    labelCol='label',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RF Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Base RandomForest estimator (library-default hyper-parameters, fixed seed)\n",
+ "rf = RandomForestClassifier(seed=42, featuresCol=\"features\", labelCol=\"label\")\n",
+ "\n",
+ "# Un-fitted pipeline: index the label, assemble the features, then train the forest\n",
+ "\n",
+ "rfModel = Pipeline(\n",
+ "    stages=[label_stringIdx, va, rf],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fit the training data using RF pipeline\n",
+ "\n",
+ "rf_fit = rfModel.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict the test data using fitted train pipeline\n",
+ "\n",
+ "pred_rfbb = rf_fit.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score is 0.7516960347127739\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Report the area under the ROC curve obtained on the test-data predictions\n",
+ "\n",
+ "print(\n",
+ "    \"AUC Score is\",\n",
+ "    evaluator_rfb.evaluate(pred_rfbb),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction output from the model to pandas.\n",
+ "#\n",
+ "# Only the 'prediction' column is selected before converting, so the driver does not have to\n",
+ "# materialise every input column plus the assembled feature vectors just to read one column.\n",
+ "\n",
+ "prediction_rfbb = pred_rfbb.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True Labels from test data for Target Variable.\n",
+ "#\n",
+ "# Only the 'Severity' column is selected before converting, so the whole test table is not\n",
+ "# pulled to the driver just to read one column.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.86 0.59 0.70 131790\n",
+ " 1 0.49 0.80 0.61 64610\n",
+ "\n",
+ " accuracy 0.66 196400\n",
+ " macro avg 0.68 0.70 0.66 196400\n",
+ "weighted avg 0.74 0.66 0.67 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print precision, recall, f1-score and support for each class in a single report\n",
+ "\n",
+ "print(\n",
+ "    classification_report(\n",
+ "        y_true=true_labels,\n",
+ "        y_pred=prediction_rfbb,\n",
+ "    )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SparseVector(119, {2: 0.0001, 3: 0.0001, 4: 0.0, 5: 0.0, 7: 0.0, 9: 0.0006, 13: 0.0003, 14: 0.0001, 15: 0.0, 17: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 23: 0.0228, 24: 0.0088, 25: 0.0142, 26: 0.0025, 27: 0.006, 28: 0.0, 30: 0.0268, 32: 0.0003, 34: 0.0164, 36: 0.0976, 38: 0.0, 40: 0.0007, 42: 0.0017, 44: 0.0025, 48: 0.0, 49: 0.0, 50: 0.0014, 51: 0.0508, 52: 0.0274, 53: 0.0, 54: 0.0, 55: 0.0, 58: 0.0001, 59: 0.0, 60: 0.0001, 61: 0.0, 62: 0.0001, 63: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 0.0, 70: 0.0, 71: 0.0001, 73: 0.0, 75: 0.0001, 77: 0.0, 78: 0.0, 81: 0.1403, 82: 0.0434, 83: 0.0004, 84: 0.0001, 85: 0.0, 87: 0.0, 89: 0.0, 92: 0.0, 93: 0.0, 94: 0.0, 95: 0.0112, 97: 0.0, 98: 0.0005, 100: 0.05, 101: 0.0002, 102: 0.0003, 103: 0.0014, 104: 0.0, 105: 0.0209, 106: 0.0001, 107: 0.0121, 109: 0.1052, 110: 0.0, 111: 0.0209, 113: 0.001, 115: 0.0088, 116: 0.013, 117: 0.0, 118: 0.288})"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Feature Importance from RF model \n",
+ "\n",
+ "rf_fit.stages[-1].featureImportances"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pair every feature name with its importance from the RF base model (binary classification)\n",
+ "# in a pandas DataFrame ordered from most to least important\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "feat_imp_tuned_rfbb = pd.DataFrame(\n",
+ "    list(zip(\n",
+ "        [name for name in us_train_cat.columns if name != 'Severity'],\n",
+ "        rf_fit.stages[-1].featureImportances,\n",
+ "    )),\n",
+ "    columns=['column', 'weight'],\n",
+ ").sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest binary balanced')"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the 10 most important features of the RF base model (binary classification)\n",
+ "\n",
+ "plt.figure(figsize=(10, 10))\n",
+ "sns.barplot(\n",
+ "    x=feat_imp_tuned_rfbb['column'].head(10),\n",
+ "    y=feat_imp_tuned_rfbb['weight'].head(10),\n",
+ "    data=feat_imp_tuned_rfbb,\n",
+ ")\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.title(\"Top 10 Features based on Importance from Random Forest binary balanced\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6618991853360489"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Accuracy calculation for RF Base Model\n",
+ "#\n",
+ "# pred_rfbb keeps the Severity column next to the model's prediction, so every row is compared with\n",
+ "# its own prediction inside Spark. This avoids pairing two separately collected DataFrames by list\n",
+ "# position (row order is not guaranteed to match across independent actions) and avoids pulling\n",
+ "# every row to the driver.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]\n",
+ "\n",
+ "scored_rfbb = pred_rfbb.select(\"Severity\", \"prediction\")\n",
+ "\n",
+ "n_scored_rfbb = scored_rfbb.count()\n",
+ "\n",
+ "n_correct_rfbb = scored_rfbb.filter(scored_rfbb[\"Severity\"] == scored_rfbb[\"prediction\"]).count()\n",
+ "\n",
+ "# Fraction of correctly predicted rows (cell output); NaN when there are no rows to score\n",
+ "n_correct_rfbb / n_scored_rfbb if n_scored_rfbb else float(\"nan\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RF Grid Search "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 210,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:62046)\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1174, in send_command\n",
+ " self.socket.sendall(command.encode(\"utf-8\"))\n",
+ "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1014, in send_command\n",
+ " response = connection.send_command(command)\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1178, in send_command\n",
+ " \"Error while sending\", e, proto.ERROR_ON_SEND)\n",
+ "py4j.protocol.Py4JNetworkError: Error while sending\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 958, in _get_connection\n",
+ " connection = self.deque.pop()\n",
+ "IndexError: pop from an empty deque\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1096, in start\n",
+ " self.socket.connect((self.address, self.port))\n",
+ "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n"
+ ]
+ },
+ {
+ "ename": "Py4JNetworkError",
+ "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:62046)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mConnectionResetError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1173\u001b[0m \u001b[1;31m# if it sent a RST packet (SO_LINGER)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1174\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"utf-8\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1175\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mConnectionResetError\u001b[0m: [WinError 10054] An existing connection was forcibly closed by the remote host",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1014\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1015\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1177\u001b[0m raise Py4JNetworkError(\n\u001b[1;32m-> 1178\u001b[1;33m \"Error while sending\", e, proto.ERROR_ON_SEND)\n\u001b[0m\u001b[0;32m 1179\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m: Error while sending",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 958\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1095\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1096\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1097\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Create an initial RandomForest model.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mrf_new\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mRandomForestClassifier\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabelCol\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"label\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeaturesCol\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"features\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m42\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# Train model with Training Data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mrfModel_new\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPipeline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstages\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlabel_stringIdx\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrf_new\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\__init__.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 109\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Method %s forces keyword arguments.\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 110\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_input_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 111\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 112\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\classification.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, featuresCol, labelCol, predictionCol, probabilityCol, rawPredictionCol, maxDepth, maxBins, minInstancesPerNode, minInfoGain, maxMemoryInMB, cacheNodeIds, checkpointInterval, impurity, numTrees, featureSubsetStrategy, seed, subsamplingRate, leafCol, minWeightFractionPerNode)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mRandomForestClassifier\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1425\u001b[0m self._java_obj = self._new_java_obj(\n\u001b[1;32m-> 1426\u001b[1;33m \"org.apache.spark.ml.classification.RandomForestClassifier\", self.uid)\n\u001b[0m\u001b[0;32m 1427\u001b[0m self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,\n\u001b[0;32m 1428\u001b[0m \u001b[0mmaxMemoryInMB\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m256\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcacheNodeIds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheckpointInterval\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_new_java_obj\u001b[1;34m(java_class, *args)\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0mjava_obj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_jvm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mjava_class\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 67\u001b[1;33m \u001b[0mjava_obj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjava_obj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 68\u001b[0m \u001b[0mjava_args\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0m_py2java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0marg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mjava_obj\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mjava_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 1676\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mREFLECTION_COMMAND_NAME\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1677\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mREFL_GET_UNKNOWN_SUB_COMMAND_NAME\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mname\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"\\n\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_id\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1678\u001b[1;33m \"\\n\" + proto.END_COMMAND_PART)\n\u001b[0m\u001b[0;32m 1679\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSUCCESS_PACKAGE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1680\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mJavaPackage\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_gateway_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjvm_id\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1027\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_should_retry\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mretry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpne\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1028\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Exception while sending command.\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc_info\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1029\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1030\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1031\u001b[0m logging.exception(\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \"\"\"\n\u001b[1;32m-> 1012\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1014\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 958\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 960\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 961\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 962\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 964\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 965\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 966\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 967\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 968\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1106\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1107\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1108\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1109\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1110\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:62046)"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# Fresh RandomForest estimator whose hyper-parameters will be tuned\n",
+ "\n",
+ "rf_new = RandomForestClassifier(seed=42, featuresCol=\"features\", labelCol=\"label\")\n",
+ "\n",
+ "# Pipeline for the grid search: index the label, assemble the features, then train the forest\n",
+ "\n",
+ "rfModel_new = Pipeline(\n",
+ "    stages=[label_stringIdx, va, rf_new],\n",
+ ")\n",
+ "\n",
+ "# Hyper-parameter grid: 3 tree counts x 3 depths x 2 impurity measures\n",
+ "\n",
+ "paramGrid_rft = (\n",
+ "    ParamGridBuilder()\n",
+ "    .addGrid(rf_new.numTrees, [10, 25, 60])\n",
+ "    .addGrid(rf_new.maxDepth, [3, 6, 10])\n",
+ "    .addGrid(rf_new.impurity, [\"entropy\", \"gini\"])\n",
+ "    .build()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 209,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "Py4JJavaError",
+ "evalue": "An error occurred while calling o124422.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 51187.0 failed 1 times, most recent failure: Lost task 7.0 in stage 51187.0 (TID 368623, DESKTOP-TT8TT9T.fios-router.home, executor driver): java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat 
org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n\nDriver stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1989)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1977)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1976)\r\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\r\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1976)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:956)\r\n\tat scala.Option.foreach(Option.scala:407)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2206)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2155)\r\n\tat 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2144)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:758)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2116)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2137)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2156)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2181)\r\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1003)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:588)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:226)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:144)\r\n\tat org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)\r\n\tat scala.util.Try$.apply(Try.scala:213)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:122)\r\n\tat 
org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:48)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:152)\r\n\tat sun.reflect.GeneratedMethodAccessor2715.invoke(Unknown Source)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)\r\n\tat java.lang.reflect.Method.invoke(Unknown Source)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Unknown Source)\r\nCaused by: java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat 
org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mPy4JJavaError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcv_rf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCrossValidator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrfModel_new\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mestimatorParamMaps\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparamGrid_rft\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mevaluator_rfb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnumFolds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m42\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mus_train_cat\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m raise ValueError(\"Params must be either a param map or a list/tuple of param maps, \"\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 351\u001b[0m \u001b[0mtasks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_parallelFitTasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepm\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 352\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msubModel\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpool\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimap_unordered\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 353\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mj\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mmetric\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mnFolds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 354\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\multiprocessing\\pool.py\u001b[0m in \u001b[0;36mnext\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 746\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msuccess\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 748\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 749\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 750\u001b[0m \u001b[0m__next__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m \u001b[1;31m# XXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\lib\\multiprocessing\\pool.py\u001b[0m in \u001b[0;36mworker\u001b[1;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[0;32m 119\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtask\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 120\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 121\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 122\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 123\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mwrap_exception\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0m_helper_reraises_exception\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36m\u001b[1;34m(f)\u001b[0m\n\u001b[0;32m 350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 351\u001b[0m \u001b[0mtasks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_parallelFitTasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepm\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 352\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msubModel\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpool\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimap_unordered\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 353\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mj\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mmetric\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mnFolds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 354\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36msingleTask\u001b[1;34m()\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0msingleTask\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 52\u001b[1;33m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodelIter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 53\u001b[0m \u001b[0mmetric\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepm\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModel\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36m__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"No models remaining.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcounter\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 62\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfitSingleModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfitSingleModel\u001b[1;34m(index)\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mfitSingleModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 105\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparamMaps\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 106\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_FitMultipleIterator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfitSingleModel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparamMaps\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 127\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 129\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# must be an Estimator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 109\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 110\u001b[0m \u001b[0mtransformers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mindexOfLastEstimator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m raise ValueError(\"Params must be either a param map or a list/tuple of param maps, \"\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m \u001b[0mjava_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 322\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjava_model\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_copyValues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit_java\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 316\u001b[0m \"\"\"\n\u001b[0;32m 317\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transfer_params_to_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 318\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_java_obj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 319\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1284\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1285\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1286\u001b[1;33m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m 1287\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1288\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 98\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 99\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[0mconverted\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[0;32m 327\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m 329\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 330\u001b[0m raise Py4JError(\n",
+ "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o124422.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 51187.0 failed 1 times, most recent failure: Lost task 7.0 in stage 51187.0 (TID 368623, DESKTOP-TT8TT9T.fios-router.home, executor driver): java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat 
org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n\nDriver stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1989)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1977)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1976)\r\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\r\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1976)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:956)\r\n\tat scala.Option.foreach(Option.scala:407)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2206)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2155)\r\n\tat 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2144)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:758)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2116)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2137)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2156)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2181)\r\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1003)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:588)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:226)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:144)\r\n\tat org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)\r\n\tat scala.util.Try$.apply(Try.scala:213)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:122)\r\n\tat 
org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:48)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:152)\r\n\tat sun.reflect.GeneratedMethodAccessor2715.invoke(Unknown Source)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)\r\n\tat java.lang.reflect.Method.invoke(Unknown Source)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Unknown Source)\r\nCaused by: java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat 
org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ERROR:root:Exception while sending command.\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1181, in send_command\n",
+ " answer = smart_decode(self.stream.readline()[:-1])\n",
+ " File \"C:\\Users\\hites\\Anaconda3\\lib\\socket.py\", line 589, in readinto\n",
+ " return self._sock.recv_into(b)\n",
+ "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1014, in send_command\n",
+ " response = connection.send_command(command)\n",
+ " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1193, in send_command\n",
+ " \"Error while receiving\", e, proto.ERROR_ON_RECEIVE)\n",
+ "py4j.protocol.Py4JNetworkError: Error while receiving\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Tune the RF pipeline with 5-fold cross validation over paramGrid_rft.\n",
+ "# Note: cv_rf holds the fitted cross-validation model (result of .fit), not the CrossValidator itself.\n",
+ "\n",
+ "rf_cross_validator = CrossValidator(\n",
+ "    estimator=rfModel_new,\n",
+ "    estimatorParamMaps=paramGrid_rft,\n",
+ "    evaluator=evaluator_rfb,\n",
+ "    numFolds=5,\n",
+ "    seed=42,\n",
+ ")\n",
+ "cv_rf = rf_cross_validator.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predicting the test data using fitted cv pipeline\n",
+ "\n",
+ "pred_rft = cv_rf.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score is 0.7819275566312943\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC Score of tuned RF model\n",
+ "\n",
+ "print(\"AUC Score is\", evaluator_rfb.evaluate(pred_rft))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 216,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='RandomForestClassifier_8713e549bc1f', name='featuresCol', doc='features column name.'): 'features',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='labelCol', doc='label column name.'): 'label',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='predictionCol', doc='prediction column name.'): 'prediction',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='seed', doc='random seed.'): 42,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='featureSubsetStrategy', doc=\"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'\"): 'auto',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='numTrees', doc='Number of trees to train (>= 1).'): 60,\n",
+ " Param(parent='RandomForestClassifier_8713e549bc1f', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}"
+ ]
+ },
+ "execution_count": 216,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Best Model Hyper Parameters after tuning\n",
+ "\n",
+ "cv_rf.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "60"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Best model number of trees parameter from Grid Search\n",
+ "\n",
+ "cv_rf.bestModel.stages[-1].getNumTrees"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6788441955193483"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Accuracy of the tuned RF model on test data.\n",
+ "#\n",
+ "# Label and prediction are compared row by row inside pred_rft, so the pairing does not\n",
+ "# rely on two separate collect() calls returning rows in the same order, and no column\n",
+ "# has to be pulled to the driver just to be counted.\n",
+ "# Assumes pred_rft still carries the input Severity column - TODO confirm.\n",
+ "# NOTE(review): prediction is in the classifier's label space; comparing it with Severity\n",
+ "# presumes the label indexer maps every Severity value to itself - confirm.\n",
+ "\n",
+ "rf_test_rows = pred_rft.count()\n",
+ "rf_correct_rows = pred_rft.filter(pred_rft[\"Severity\"] == pred_rft[\"prediction\"]).count()\n",
+ "\n",
+ "rf_correct_rows / rf_test_rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction output from the model to pandas.\n",
+ "# Only the prediction column is selected before converting, so the remaining columns\n",
+ "# of pred_rft are never materialised on the driver.\n",
+ "\n",
+ "prediction_rft = pred_rft.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True Labels from test data for Target Variable.\n",
+ "# The single column is selected first so the whole test set is not converted to pandas.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.87 0.61 0.72 131790\n",
+ " 1 0.51 0.81 0.62 64610\n",
+ "\n",
+ " accuracy 0.68 196400\n",
+ " macro avg 0.69 0.71 0.67 196400\n",
+ "weighted avg 0.75 0.68 0.69 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_rft,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame of feature names and their importance for the RF Grid Model (Binary Classification).\n",
+ "# NOTE(review): names are paired with weights purely by position; this presumes the feature vector\n",
+ "# follows the order of us_train_cat.columns without 'Severity' - confirm against the assembler.\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "rfg_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "rfg_feature_weights = cv_rf.bestModel.stages[-1].featureImportances\n",
+ "\n",
+ "feat_imp_tuned_rfg = pd.DataFrame(\n",
+ "    list(zip(rfg_feature_names, rfg_feature_weights)),\n",
+ "    columns=['column', 'weight'],\n",
+ ")\n",
+ "feat_imp_tuned_rfg = feat_imp_tuned_rfg.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest Grid')"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the 10 highest-weighted features of the RF Grid Model (Binary Classification)\n",
+ "\n",
+ "# The first 10 rows of the importance frame are the ones plotted\n",
+ "top10_rfg = feat_imp_tuned_rfg.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x='column', y='weight', data=top10_rfg)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.title(\"Top 10 Features based on Importance from Random Forest Grid\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GBT Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the GBT Base model\n",
+ "\n",
+ "gbt = GBTClassifier(seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pipeline with stages to be used to fit the train data\n",
+ "\n",
+ "gbt_pipe = Pipeline(stages=[label_stringIdx, va, gbt])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting the training data using the pipeline above\n",
+ "\n",
+ "gbtModel = gbt_pipe.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score is 0.7842374103722184\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC Score from the model on the test data\n",
+ "\n",
+ "print(\"AUC Score is\", evaluator_rfb.evaluate(gbtModel.transform(us_test_cat)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction output from the model to pandas.\n",
+ "# Only the prediction column is selected before converting, so the other columns of the\n",
+ "# scored test set are never materialised on the driver.\n",
+ "\n",
+ "prediction_gbtn = gbtModel.transform(us_test_cat).select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True labels from the test data for the target variable.\n",
+ "# The single column is selected first so the whole test set is not converted to pandas.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.88 0.60 0.72 131790\n",
+ " 1 0.51 0.83 0.63 64610\n",
+ "\n",
+ " accuracy 0.68 196400\n",
+ " macro avg 0.69 0.72 0.67 196400\n",
+ "weighted avg 0.75 0.68 0.69 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_gbtn,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame of feature names and their importance for the GBT Base Model (Binary Classification).\n",
+ "# NOTE(review): names are paired with weights purely by position; this presumes the feature vector\n",
+ "# follows the order of us_train_cat.columns without 'Severity' - confirm against the assembler.\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "gbt_base_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "gbt_base_feature_weights = gbtModel.stages[-1].featureImportances\n",
+ "\n",
+ "feat_imp_tuned_gtbb = pd.DataFrame(\n",
+ "    list(zip(gbt_base_feature_names, gbt_base_feature_weights)),\n",
+ "    columns=['column', 'weight'],\n",
+ ")\n",
+ "feat_imp_tuned_gtbb = feat_imp_tuned_gtbb.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from GBT Base Model')"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the 10 highest-weighted features of the GBT Base Model (Binary Classification)\n",
+ "\n",
+ "# The first 10 rows of the importance frame are the ones plotted\n",
+ "top10_gbt_base = feat_imp_tuned_gtbb.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x='column', y='weight', data=top10_gbt_base)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.title(\"Top 10 Features based on Importance from GBT Base Model\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GBT Binary Classification Grid Search"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# GBT Binary Tuned Best Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# GBT estimator used for the grid search (55 boosting iterations, fixed seed)\n",
+ "gbt_t_new = GBTClassifier(maxIter=55, seed=42)\n",
+ "\n",
+ "# Pipeline for the GBT grid model, reusing the shared label_stringIdx and va stages\n",
+ "gbt_pipe_t_new = Pipeline(stages=[label_stringIdx, va, gbt_t_new])\n",
+ "\n",
+ "# Area under the ROC curve is the metric used to compare parameter combinations\n",
+ "evaluator = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC')\n",
+ "\n",
+ "# Hyper parameter grid: 3 step sizes x 3 tree depths = 9 candidate parameter maps\n",
+ "grid_gbt_t_new = (\n",
+ "    ParamGridBuilder()\n",
+ "    .addGrid(gbt_t_new.stepSize, [0.2, 0.4, 0.01])\n",
+ "    .addGrid(gbt_t_new.maxDepth, [3, 5, 8])\n",
+ "    .build()\n",
+ ")\n",
+ "\n",
+ "# 5-fold cross validator over the grid; it is only configured here, not fitted\n",
+ "cv1_gbt_t_new = CrossValidator(\n",
+ "    estimator=gbt_pipe_t_new,\n",
+ "    estimatorParamMaps=grid_gbt_t_new,\n",
+ "    evaluator=evaluator,\n",
+ "    numFolds=5,\n",
+ "    seed=42,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting train data using 5-fold cross validator pipeline\n",
+ "\n",
+ "cvModel_gbt_t_new = cv1_gbt_t_new.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score is 0.8118245222266022\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC Score from the fitted pipeline for the test data\n",
+ "\n",
+ "print(\"AUC Score is\", evaluator.evaluate(cvModel_gbt_t_new.transform(us_test_cat)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.7997203339021297"
+ ]
+ },
+ "execution_count": 173,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# AUC Score from the Cross validator Pipeline\n",
+ "\n",
+ "np.max(cvModel_gbt_t_new.avgMetrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='GBTClassifier_48357a426a79', name='featuresCol', doc='features column name.'): 'features',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='labelCol', doc='label column name.'): 'label',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='predictionCol', doc='prediction column name.'): 'prediction',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='seed', doc='random seed.'): 42,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='featureSubsetStrategy', doc=\"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'\"): 'all',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: variance'): 'variance',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic'): 'logistic',\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='maxIter', doc='max number of iterations (>= 0).'): 55,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.2,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0,\n",
+ " Param(parent='GBTClassifier_48357a426a79', name='validationTol', doc='Threshold for stopping early when fit with validation is used. If the error rate on the validation input changes by less than the validationTol, then learning will stop early (before `maxIter`). This parameter is ignored when fit without validation is used.'): 0.01}"
+ ]
+ },
+ "execution_count": 217,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Hyper parameters from the best model\n",
+ "\n",
+ "cvModel_gbt_t_new.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 177,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction output from the model to pandas.\n",
+ "# Only the prediction column is selected before converting, so the other columns of the\n",
+ "# scored test set are never materialised on the driver.\n",
+ "\n",
+ "prediction_gbt_t_new = cvModel_gbt_t_new.transform(us_test_cat).select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 178,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True Labels from test data for Target Variable.\n",
+ "# The single column is selected first so the whole test set is not converted to pandas.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 180,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.88 0.65 0.75 131790\n",
+ " 1 0.53 0.82 0.65 64610\n",
+ "\n",
+ " accuracy 0.70 196400\n",
+ " macro avg 0.71 0.73 0.70 196400\n",
+ "weighted avg 0.77 0.70 0.71 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_gbt_t_new,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 181,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame of feature names and their importance for the GBT Grid Model (Binary Classification).\n",
+ "# NOTE(review): names are paired with weights purely by position; this presumes the feature vector\n",
+ "# follows the order of us_train_cat.columns without 'Severity' - confirm against the assembler.\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "gbt_tuned_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "gbt_tuned_feature_weights = cvModel_gbt_t_new.bestModel.stages[-1].featureImportances\n",
+ "\n",
+ "feat_imp_tuned_gbt_t_new = pd.DataFrame(\n",
+ "    list(zip(gbt_tuned_feature_names, gbt_tuned_feature_weights)),\n",
+ "    columns=['column', 'weight'],\n",
+ ")\n",
+ "feat_imp_tuned_gbt_t_new = feat_imp_tuned_gbt_t_new.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from GBT Best tuned')"
+ ]
+ },
+ "execution_count": 182,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the 10 highest-weighted features of the GBT Grid Model (Binary Classification)\n",
+ "\n",
+ "# The first 10 rows of the importance frame are the ones plotted\n",
+ "top10_gbt_tuned = feat_imp_tuned_gbt_t_new.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x='column', y='weight', data=top10_gbt_tuned)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.title(\"Top 10 Features based on Importance from GBT Best tuned\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Tree Binary Classification Base Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Base Decision Tree classifier: only the label/feature columns and the seed are set\n",
+ "dt = DecisionTreeClassifier(featuresCol=\"features\", labelCol=\"label\", seed=42)\n",
+ "\n",
+ "# Pipeline for the DT Model, reusing the shared label_stringIdx and va stages\n",
+ "dt_stages = [label_stringIdx, va, dt]\n",
+ "dt_pipe = Pipeline(stages=dt_stages)\n",
+ "\n",
+ "# Fit the whole pipeline on the training data\n",
+ "dtModel = dt_pipe.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Binary Class Evaluator Initialize\n",
+ "\n",
+ "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Transform the test data to get prediction from the model for the test data\n",
+ "\n",
+ "pred_dt = dtModel.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score is 0.7039599188458128\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC Score from the evaluator for the test data\n",
+ "\n",
+ "print(\"AUC Score is\",evaluator.evaluate(pred_dt))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6643584521384929"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Accuracy Calculation for DT Base Model.\n",
+ "#\n",
+ "# Label and prediction are compared row by row inside pred_dt, so the result no longer\n",
+ "# depends on a true_labels variable left over from an earlier cell, nor on two separate\n",
+ "# collect() calls returning rows in the same order.\n",
+ "# Assumes pred_dt still carries the input Severity column - TODO confirm.\n",
+ "# NOTE(review): prediction is in the classifier's label space; comparing it with Severity\n",
+ "# presumes the label indexer maps every Severity value to itself - confirm.\n",
+ "\n",
+ "dt_test_rows = pred_dt.count()\n",
+ "dt_correct_rows = pred_dt.filter(pred_dt[\"Severity\"] == pred_dt[\"prediction\"]).count()\n",
+ "\n",
+ "dt_correct_rows / dt_test_rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction output from the model to pandas.\n",
+ "# Only the prediction column is selected before converting, so the remaining columns\n",
+ "# of pred_dt are never materialised on the driver.\n",
+ "\n",
+ "prediction_dtb = pred_dt.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True Labels from test data for Target Variable.\n",
+ "# The single column is selected first so the whole test set is not converted to pandas.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.86 0.60 0.70 131790\n",
+ " 1 0.49 0.81 0.61 64610\n",
+ "\n",
+ " accuracy 0.66 196400\n",
+ " macro avg 0.68 0.70 0.66 196400\n",
+ "weighted avg 0.74 0.66 0.67 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_dtb,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame of feature names and their importance for the DT Base Model (Binary Classification).\n",
+ "# NOTE(review): names are paired with weights purely by position; this presumes the feature vector\n",
+ "# follows the order of us_train_cat.columns without 'Severity' - confirm against the assembler.\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "\n",
+ "dtb_feature_names = [name for name in us_train_cat.columns if name != 'Severity']\n",
+ "dtb_feature_weights = dtModel.stages[-1].featureImportances\n",
+ "\n",
+ "feat_imp_tuned_dtb = pd.DataFrame(\n",
+ "    list(zip(dtb_feature_names, dtb_feature_weights)),\n",
+ "    columns=['column', 'weight'],\n",
+ ")\n",
+ "feat_imp_tuned_dtb = feat_imp_tuned_dtb.sort_values('weight', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Binary Base Model')"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plotting top 10 Features from Feature Importance of DT Base Model for Binary Classification\n",
+ "\n",
+ "# feat_imp_tuned_dtb is sorted by weight (descending), so its first 10 rows are the top 10 features.\n",
+ "# Slice once and plot by column name so x, y and data all refer to the same 10 rows.\n",
+ "top10_dtb = feat_imp_tuned_dtb.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x='column', y='weight', data=top10_dtb)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "# Trailing semicolon suppresses the Text(...) repr of the last call in the cell output\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Binary Base Model\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Tree Binary Classification Grid Search"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing DT Grid Pipeline\n",
+ "\n",
+ "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n",
+ "\n",
+ "# Creating pipeline for DT Grid Model \n",
+ "\n",
+ "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n",
+ "\n",
+ "# Binary Evaluator Initializing\n",
+ "\n",
+ "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n",
+ "\n",
+ "# Creating Grid Search for Hyper Parameter Tuning for DT Model\n",
+ "\n",
+ "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n",
+ "\n",
+ "# Cross Validator Pipeline with 5 fold cv to fit the training data\n",
+ "\n",
+ "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator,seed=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fitting the train data using the 5-fold Cross validator pipeline\n",
+ "\n",
+ "dtModel_t = cv1_dt.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predicting the test data using the fitted pipeline\n",
+ "\n",
+ "pred_dtt = dtModel_t.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AUC Score 0.6710458592255817\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AUC Score for the fitted pipeline for test data\n",
+ "\n",
+ "print(\"AUC Score\", evaluator.evaluate(pred_dtt))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 218,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='featuresCol', doc='features column name.'): 'features',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='labelCol', doc='label column name.'): 'label',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='predictionCol', doc='prediction column name.'): 'prediction',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='seed', doc='random seed.'): 42,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 50,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1500,\n",
+ " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0}"
+ ]
+ },
+ "execution_count": 218,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Best Model Hyper parameters for the DT Grid Search Model\n",
+ "\n",
+ "dtModel_t.bestModel.stages[-1].extractParamMap()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predictions of the grid-search DT model as a pandas Series.\n",
+ "# Select the prediction column in Spark first so only that column is collected to the driver;\n",
+ "# the transformed DataFrame also carries features / rawPrediction / probability columns.\n",
+ "\n",
+ "prediction_dtbt = pred_dtt.select(\"prediction\").toPandas()[\"prediction\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# True labels (Severity) from the test data for the classification report.\n",
+ "# Select the column in Spark first so only that column is collected to the driver,\n",
+ "# instead of converting the whole test DataFrame to pandas.\n",
+ "\n",
+ "true_labels = us_test_cat.select(\"Severity\").toPandas()[\"Severity\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initializing Classification Report from sklearn\n",
+ "\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.88 0.57 0.70 131790\n",
+ " 1 0.49 0.85 0.62 64610\n",
+ "\n",
+ " accuracy 0.66 196400\n",
+ " macro avg 0.69 0.71 0.66 196400\n",
+ "weighted avg 0.76 0.66 0.67 196400\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Classification Report Generation for all metrics display at once\n",
+ "\n",
+ "print(classification_report(y_pred=prediction_dtbt,y_true=true_labels))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating Pandas Dataframe for Features and their Importance of DT Grid Model for Binary Classification\n",
+ "\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "feat_imp_tuned_dtbt= pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n",
+ " columns = ['column', 'weight']).sort_values('weight',ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Binary Grid Model')"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plotting top 10 Features from Feature Importance of DT Grid Model for Binary Classification\n",
+ "\n",
+ "# feat_imp_tuned_dtbt is sorted by weight (descending), so its first 10 rows are the top 10 features.\n",
+ "# Slice once and plot by column name so x, y and data all refer to the same 10 rows.\n",
+ "top10_dtbt = feat_imp_tuned_dtbt.head(10)\n",
+ "\n",
+ "plt.figure(figsize=(10,10))\n",
+ "sns.barplot(x='column', y='weight', data=top10_dtbt)\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.xlabel(\"Features\")\n",
+ "plt.ylabel(\"Weights\")\n",
+ "# Trailing semicolon suppresses the Text(...) repr of the last call in the cell output\n",
+ "plt.title(\"Top 10 Features based on Importance from DT Binary Grid Model\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Logistic Regression "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Logistic Base Model Binary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# StandardScaler configured to mean-center only (withMean=True, withStd=False) the vector assembler output before feeding it to Logistic Regression\n",
+ "\n",
+ "center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create initial LogisticRegression model\n",
+ "lr = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")\n",
+ "\n",
+ "# Pipeline for training data \n",
+ "\n",
+ "lrModel = Pipeline(stages=[label_stringIdx,va, center, lr])\n",
+ "\n",
+ "# Fit the train data using LR model\n",
+ "\n",
+ "lr_fit = lrModel.fit(us_train_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Transform test data to predict Severity by using the pipeline fitted on the training data\n",
+ "\n",
+ "pred_lrb = lr_fit.transform(us_test_cat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test Area Under ROC for Logistic Base Model 0.7619402091983631\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Evaluator to get AUC Score for test data.\n",
+ "# Reuses pred_lrb (the test-set predictions from the fitted LR pipeline) instead of\n",
+ "# transforming us_test_cat a second time, so the score is for the same predictions DataFrame.\n",
+ "\n",
+ "evaluator_lrb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n",
+ "print('Test Area Under ROC for Logistic Base Model ', evaluator_lrb.evaluate(pred_lrb))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3deXxU5b3H8c8PSEjCFggQdgIYRKyIGEBc41bXXrW1datWW6vWWm17b6u3tXq7XrWby9W6XesO7kq97ktcUdlk38IethCWQBIg2+/+MYPGmIRJzMlk5nzfr1dezJnzzMz3IXB+c7bnMXdHRETCq0O8A4iISHypEIiIhJwKgYhIyKkQiIiEnAqBiEjIqRCIiIScCoGISMipEEhSMbNVZrbLzMrMbKOZPWhmXeu1OdzM3jKznWZWamb/MrPR9dp0N7NbzWxN9L0Ko8u9G/lcM7OrzWy+mZWbWZGZPWVmBwXZX5HWoEIgyegb7t4VGAscAvzn3hVmNgl4DXgBGAAMA+YAH5jZ8GibVOBN4EDgZKA7cDiwBZjQyGfeBlwDXA30AkYCzwOnNTe8mXVq7mtEvgrTncWSTMxsFXCpu78RXb4FONDdT4suvwfMc/cr673uZWCzu19kZpcCfwRGuHtZDJ+ZCywGJrn7J420KQAedff7o8sXR3MeGV124Crgp0An4FWgzN3/o857vAC84+5/M7MBwB3A0UAZ8Hd3vz2GvyKRL9EegSQtMxsEnAIURpcziHyzf6qB5k8CJ0YfnwC8EksRiDoeKGqsCDTDmcBEYDTwOHCOmRmAmfUEvg5MMbMOwL+I7MkMjH7+T83spK/4+RJSKgSSjJ43s53AWqAYuDH6fC8i/+Y3NPCaDcDe4/9ZjbRpTHPbN+a/3X2ru+8C3gMcOCq67mxgmruvB8YDfdz9d+5e6e4rgPuAc1shg4SQCoEkozPdvRuQD4zi8w38NqAW6N/Aa/oDJdHHWxpp05jmtm/M2r0PPHLMdgpwXvSp84HHoo+HAgPMbPveH+BXQHYrZJAQUiGQpOXu7wAPAn+JLpcD04BvN9D8O0ROEAO8AZxkZl1i/Kg3gUFmltdEm3Igo85yv4Yi11ueDJxtZkOJHDJ6Jvr8WmClu2fW+enm7qfGmFfkC1QIJNndCpxoZmOjy9cB34te6tnNzHqa2R+AScBvo20eIbKxfcbMRplZBzPLMrNfmdmXNrbuvgy4C5hsZvlmlmpmaWZ2rpldF232KfBNM8sws/2AH+wruLvPBjYD9wOvuvv26KpPgB1mdq2ZpZtZRzP7mpmNb8lfkIgKgSQ1d98MPAz8Jrr8PnAS8E0ix/VXE7nE9MjoBh1330PkhPFi4HVgB5GNb2/g40Y+6mrgf4A7ge3AcuAsIid1Af4OVAKbgIf4/DDPvkyOZnm8Tp9qgG8QuTx2JZFDWvcDPWJ8T5Ev0OWjIiIhpz0CEZGQUyEQEQk5FQIRkZBTIRARCbmEG9yqd+/enpOT06LXlpeX06VLrJeGJwf1ORzU53D4Kn2eOXNmibv3aWhdwhWCnJwcZsyY0aLXFhQUkJ+f37qB2jn1ORzU53D4Kn02s9WNrdOhIRGRkFMhEBEJORUCEZGQUyEQEQk5FQIRkZALrBCY2QNmVmxm8xtZb2Z2e3RS8LlmNi6oLCIi0rgg9wgeJDLxd2NOAXKjP5cB/wgwi4iINCKw+wjc/V0zy2miyRnAw9GZmD4ys0wz6+/urTHln4hIQqquqWXH7mq2V1SyfVcV2ysq2bxzD8U79mDbasgP4DPjeUPZQOpMzQcURZ/7UiEws8uI7DWQnZ1NQUFBiz6wrKysxa9NVOpzOKjP7VN1rVNW6ZRXQ0WVU17lVEQf76x0duz92fP5413Vjb/fiYM8kD7HsxBYA881ODmCu98L3AuQl5fnLb2zTncihoP6HA7x7HNNrbNpx27Wb9/F+tLon9t3sWnHbraU
VbK1vJKSsj3s2N34Vt0Memak0rtrKn2yOjOqa2eyuqSSmZFCZnoKmRmp9Ig+7tOtM326dWba++8F0ud4FoIiYHCd5UHA+jhlERH5gtpaZ33pLlaWlLOqpJyVJRWs3lLOyi3lrN1aQVXNF7+3dk/rRHb3NHp37czoAd3J6pJKVtfO9OySSmZ6Ct3TU+ie1oke6Sl0S0uhZ0YKnTq2jws341kIpgJXmdkUIhNzl+r8gIi0td1VNazYXM6y4p0s3bST5cXlkY3/lnL2VNd+1i49pSNDszIY2bcbXx/dj8G90hmYGfnpn5lO184JN3TbZwJLbmaTgXygt5kVATcCKQDufjfwEnAqUAhUAJcElUVEpKqmllUl5SzZtJOlm8pYujGy4V+1pZza6Jf7jh2Mob0yGN6nC0eP7M2w3l0Z1rsLw3p3Ibt7Z8waOqKd+IK8aui8fax34MdBfb6IhNPuqhpWbSlnxeZylheXsbQ4stFfUVL22eGcDgY5WV3Ize7KaWP6k5vdjZHZkY1+504d49yDtpe4+zIiElq7Kmso2lnLaws2smZrBau3VHy28V9fuguvc/h+UM90RmZ3I39UH0b160Zu327s17craSnh2+A3RoVARNqd2lqneOce1m6rYO3WCtbs/dkS+bN4555Iww9mApETtUOzunDo0J58u88ghvfpyvDoIZ0uCXzsvq3ob0hE2lxVTS0btu9mXfSyy8hlmLtYt303RVsrKNq+i8o6J2rNoH/3NAb3yuCYkX0Y0iuD8uLVnHJkHkOzMsjMSI1jbxKfCoGItLrdVTXRDfxu1m2v+HwDv20XRdsq2Lhj92cnaPfq3TWV/j3SGdW/GyeOzmZQrwwG90xncK8MBmamf+lQTkHBOg4enNmGvUpeKgQi0my1tc6mnbtZVVLv0M3WCoq2VVBSVvmF9h0M+vdIZ2DPdA4bkcWgnhkMyowsD8hMp3+PNB2zjyMVAhFpUE2ts6F0F6tKIidiV28pZ9WWyE1Vq7dUfOEa+44djIGZ6QzplcEJB2RHrq+PbuQHZqbTr0caKe3k5in5MhUCkZDbVVnD8s1lLN9cxrJNZSwr3snyzeWs2VJBZc3nG/vOnTqQk9WFnKwu5O/fl6FZGQzt1YWhWRn075HWbu6SleZTIRAJidpaZ/XWChZt2MHijTtZsnEHSzbuZPXWis8ut+zUwRialcGIPl05/oC+n234c3pnkN0tjQ4dkvOGqrBTIRBJUhtLd/Pp2m3MKSplbtF25haVsjM6CNreG6pGD+jOmYcMZGR2N3L7dmVoVhdSO+mbfdioEIgkgcoaZ8aqrcxes53Za7cxe812NpTuBiLf8kf178Y3Dh7AwYN6MLp/D3KzdUOVfE6FQCQB7dxdxfRVW/l45VZmrtrGp2sqqH59GgBDemUwYVgvDhmcycGDMzmgf3dt9KVJKgQiCWBPdQ0zVm3jg8ISpq3YwtyiUmpqnZSOxkEDe3DC0BTOOmoM44b0pE+3zvGOKwlGhUCknVpVUs47SzfzztLNTFu+hV1VNXTqYBw8OJMfHTOCw0dkMW5oT9JSOkYmaTmwX7wjS4JSIRBpJ/ZU1/Dxiq28vaSYtxcXs2pLBQA5WRl8J28QR4/sw8ThWQk97r20T/oXJRJHu6tqKFhSzL/mbuDtxcVUVNbQuVMHJo3I4uLDc8jfvy85vbvEO6YkORUCkTZWUVnNu0s38/L8jbyxcBPllTVkdUnlzEMGcsIBfZk0vDfpqTq5K21HhUCkDWwrr+S1hRt5bcEm3issobK6lsyMFL5x8ABOHzOAw4b30p25EjcqBCIBqa6p5Z2lm3l6ZhFvLiqmsqaWgZnpXDBxCF8f3Y/xOT218Zd2QYVApBW5OwvW7+DZWeuYOmc9JWV76NUllQsOG8I3DxnE1wZ2T9p5byVxqRCItIL123fx3Ox1PDd7HYXFZaR0NI4b1ZdvjRtE/v59NWyDtGsqBCIttLuqhtcWbuKpGWt5v7AEd5iQ04s/
nXUQpx7UT7NmScJQIRBppsLiMh7/eA3PzCqidFcVAzPT+clxuZw9bhBDsjLiHU+k2VQIRGJQVVPLK/M38uhHq/l45VZSOhonHdiPc8cP4fARWRqeWRKaCoFIE0rK9jD54zU8+vFqNu3Yw+Be6Vx78ii+nTeI3l01po8kBxUCkQYUFu/kvndX8tzsdVTW1HJUbm/+dNZB5O/fl4769i9JRoVApI7pq7ZyzzvLeWNRMWkpHfjO+EFcfPgw9uvbNd7RRAKjQiCh5+68u6yEO98q5JNVW+mZkcI1x+dy0aShZOnwj4SACoGElrvz5qJibn9rGXOLSunfI40bvzGac8cP0Vg/EioqBBI6ewvArW8uZf66HQzNyuCmbx7EN8cN0o1fEkoqBBIqHy4v4eaXFzOnqJQhvTL489ljOOuQgRrzR0JNhUBCYeH6Hdz8ymLeWbqZAT3SuPlbkT2AFBUAkWALgZmdDNwGdATud/eb6q3vATwKDIlm+Yu7/zPITBIuG0t385fXlvDMrCK6p6Xwq1NHcdGkHE3mLlJHYIXAzDoCdwInAkXAdDOb6u4L6zT7MbDQ3b9hZn2AJWb2mLtXBpVLwqGispq731nBve8up7YWfnjUcH6cvx89MlLiHU2k3Qlyj2ACUOjuKwDMbApwBlC3EDjQzSLj8nYFtgLVAWaSJOfufLi+mmv/UsCmHXs4bUx/rjt5FIN7aQwgkcaYuwfzxmZnAye7+6XR5QuBie5+VZ023YCpwCigG3COu/9fA+91GXAZQHZ29qFTpkxpUaaysjK6dg3XjUFh6vPK0hoeW1RJ4fZacrp34IIDUsntGY5DQGH6Pe+lPjfPscceO9Pd8xpaF+QeQUP34devOicBnwLHASOA183sPXff8YUXud8L3AuQl5fn+fn5LQpUUFBAS1+bqMLQ5+Idu7nl1SU8PbOI3l0784OvOb8+/4RQDQQXht9zfepz6wmyEBQBg+ssDwLW12tzCXCTR3ZLCs1sJZG9g08CzCVJoqqmlv99fyV3vLmMqhrn8mOGc9Wx+zHzow9CVQREvqogC8F0INfMhgHrgHOB8+u1WQMcD7xnZtnA/sCKADNJkpi+aiu/fm4eSzeVccIBfbn+tNHk9O4S71giCSmwQuDu1WZ2FfAqkctHH3D3BWZ2RXT93cDvgQfNbB6RQ0nXuntJUJkk8W0rr+SmlxfzxIy1DOiRxr0XHsrXD+wX71giCS3Q+wjc/SXgpXrP3V3n8Xrg60FmkOTxyvwNXP/8fLZXVHH50cO5+vhcunTWPZEiX5X+F0m7t7W8khtemM+Lczdw4IDuPPz9iYwe0D3esUSShgqBtFvuztQ56/ntvxayc3cV/37iSK7IH6FhIURamQqBtEsbSndx/XPzeXNxMQcPzuTmbx3EqH7aCxAJggqBtDtT56zn+ufmUVXjXH/aAVxyxDBNDykSIBUCaTdKd1Vx4wvzef7T9YwbksnfzxnL0CxdEioSNBUCaRfeX1bCL56eQ/HOPfz8xJFcmT9CcwSItBEVAomrXZU13PzKYh78cBXD+3ThmR8dztjBmfGOJRIqKgQSN8s27eTKx2axrLiMS47I4ZcnjdJcwSJxsM9CYGYZwL8DQ9z9h2aWC+zv7i8Gnk6S1lMz1nLDCwvo0rkjj/xgAkfl9ol3JJHQimWP4J/ATGBSdLkIeApQIZBmK9tTzQ0vzOfZWeuYNDyL284dS9/uafGOJRJqsRSCEe5+jpmdB+Duu6ITyYg0y7yiUq6eMpvVW8r56Qm5/OS4XF0WKtIOxFIIKs0snehcAmY2AtgTaCpJKu7O/76/kptfWUzvrp2Z/MPDmDg8K96xRCQqlkLwX8ArwGAzeww4gsg8AiL7tK28kl88PYc3FhVzwgHZ/PnsMfTskhrvWCJSxz4Lgbu/ZmYzgcOIDBV9jYaKlljMWrONqx6bxeayPdxw+mguOSIHHVUUaX9iuWroTXc/Hvi/Bp4T+RJ359GPVvO7FxfSr0ca
z/zocMYM0r0BIu1Vo4XAzNKADKC3mfXk8zmIuwMD2iCbJKDdVTX86rl5PDtrHcfu34dbzzmEHhkp8Y4lIk1oao/gcuCnRDb6M/m8EOwA7gw4lySgDaW7uPyRmcwtKuWnJ+Ry9XG5mjtYJAE0Wgjc/TbgNjP7ibvf0YaZJAHNWLWVKx6dxe6qGu67KI8TR2fHO5KIxCiWk8V3mNnXgNFAWp3nHw4ymCQGd+eRj1bzu38tZGDPdCb/cCK52d3iHUtEmiGWk8U3AvlECsFLwCnA+4AKQcjtrqrh18/N55lZRRw/qi9/+85YnQ8QSUCx3EdwNnAwMNvdLzGzbOD+YGNJe7ehdBeXPTyTeetKueb4XK45XucDRBJVLIVgl7vXmlm1mXUHioHhAeeSdmzm6m1c/shMnQ8QSRKxFIIZZpYJ3Efk6qEy4JNAU0m79fTMIn717Dz6Z6bpfIBIkojlZPGV0Yd3m9krQHd3nxtsLGlvamqdW15dzD3vrOCI/bK48/xxZGZoqAiRZNCsuQDdfRWwx8zuCyaOtEfle6q5/JGZ3PPOCr572BAevGSCioBIEmm0EJjZGDN7zczmm9kfzCzbzJ4B3gQWtl1EiaeNpbv5zj3TeGvxJn77bwfyhzMPIkVzCYsklaYODd0H/AOYBpwMzAIeBy5w991tkE3ibMH6Un7w4Ax27q7ify8ez7H79413JBEJQFOFoLO7Pxh9vMTM/gO4zt1rgo8l8TZt+RYufWg63dNTeOqKwxk9oHu8I4lIQJoqBGlmdgifjzFUBozZOzuZu88KOpzExxsLN3Hl47MY2iuDR34wkX49NJWkSDJrqhBsAP5WZ3ljnWUHjgsqlMTPC5+u4+dPzuHAAd156JIJmkRGJASaGnTu2K/65mZ2MnAb0BG4391vaqBNPnArkAKUuPsxX/VzpWWenL6Wa5+dy4ScXtz/vTy6pWm4CJEwiOWGshYxs45Ehqs+ESgCppvZVHdfWKdNJnAXcLK7rzEznY2Mk4c+XMWNUxdw9Mg+3PPdQ0lP7RjvSCLSRoK8DnACUOjuK9y9EpgCnFGvzfnAs+6+BsDdiwPMIw1wd25/cxk3Tl3AiaOzue8iFQGRsDF3D+aNzc4m8k3/0ujyhcBEd7+qTpu9h4QOBLoBtzU0vLWZXQZcBpCdnX3olClTWpSprKyMrl27tui1iaqpPte6M3lxJa+vrubwAZ34/tdS6ZQEA8fp9xwO6nPzHHvssTPdPa+hdbEMQ23ABcBwd/+dmQ0B+rn7vsYbamiLUr/qdAIOBY4H0oFpZvaRuy/9wovc7wXuBcjLy/P8/Px9xW5QQUEBLX1tomqszzW1znXPzOX11UVcckQOvzltdNKMHqrfczioz60nlkNDdwGTgPOiyzuJbarKImBwneVBwPoG2rzi7uXuXgK8S2TIawlQVU0t10yZzVMzi7jm+FxuOD15ioCINF8shWCiu/8Y2A3g7tuAWK4pnA7kmtkwM0sFzgWm1mvzAnCUmXUyswxgIrAo5vTSbFU1tVz1+CxenLuB604Zxc9OHEn01hARCalYrhqqil4B5ABm1geo3deL3L3azK4CXiVy+egD7r7AzK6Irr/b3RdFRzSdG33P+919fgv7IvtQU+v87IlPeXXBJm44fTTfP3JYvCOJSDsQSyG4HXgO6GtmfyQyY9n1sby5u79EZHrLus/dXW/5z8CfY0orLVZb61z7zFxenLuB/zxllIqAiHwmlvkIHjOzmURO6Bpwprvr8E0Cqa11fvPCfJ6eWcTVx+dy+TEj4h1JRNqRWK4aug14wt1jOUEs7UytR4rAYx+v4YpjRvCzE3LjHUlE2plYDg3NAq43s5FEDhE94e4zgo0lrcHdeWRhJW+vXcOV+SP4xUn768SwiHzJPq8acveH3P1UIncKLwVuNrNlgSeTr8TduXHqAt5eW82PVAREpAnNGWJiP2AUkAMsDiSNtAp353cvLuThaas5OSeFX6oIiEgTYjlHcDPw
TWA58CTwe3ffHnQwabm/vraUf36wikuOyOHorsUqAiLSpFjOEawEJkXv/JV27r53V/A/bxdy7vjB3HD6aN55Z3O8I4lIO9doITCzUe6+GPgEGBIdY+gzmqGs/Xly+lr++NIiTjuoP3886yDtCYhITJraI/g5kRE//9rAOs1Q1s68Mn8D1z07l6Nye/P3c8bSUWMHiUiMmpqh7LLow1PcfXfddWamSWzbkQ8LS7h68qccPDiTey48lNROQU4zISLJJpYtxocxPidxMK+olB8+PIOc3hn88+LxZKQGNumciCSpps4R9AMGAulmdgifzy/QHchog2yyDytLyrn4n5+QmZHKw9+fSGaGJpoXkeZr6uvjScDFROYR+Fud53cCvwowk8Rg8849XPTAxzjwyA8m0K+HjtaJSMs0dY7gIeAhM/uWuz/ThplkHyoqq7n0oels3rmHKZdNYnifcE3XJyKtq6lDQ99190eBHDP7ef317v63Bl4mAaupda6e/Cnz1pVyz4V5jB2cGe9IIpLgmjo01CX6p75utiN//L9FvLFoE78740BOHJ0d7zgikgSaOjR0T/TP37ZdHGnKIx+t5oEPVnLJETlcNCkn3nFEJEns8/JRM7vFzLqbWYqZvWlmJWb23bYIJ597b9lm/mvqAo4b1ZfrTxsd7zgikkRiuY/g6+6+AzgdKAJGAr8INJV8QWFxGVc+Novcvl25/bxDdNewiLSqWApBSvTPU4HJ7r41wDxSz/aKSi59aDqdO3Xg/u/l0bWzbhgTkdYVy1blX2a2GNgFXGlmfYDd+3iNtIKaWucnk2ezfvtuJl82kUE9dR+fiLS+WGYouw6YBOS5exVQDpwRdDCBv762hPeWlfD7Mw/k0KG94h1HRJJULBPTpAAXAkdHhzV+B7g74Fyh98r8jdxVsJzzJgzhnPFD9v0CEZEWiuXQ0D+InCe4K7p8YfS5S4MKFXYrNpfxH0/N4eDBmfzXv+kKIREJViyFYLy7H1xn+S0zmxNUoLDbVVnDlY/NIqWjcdcF4+jcqWO8I4lIkovlqqEaMxuxd8HMhgM1wUUKL3fn18/PY8mmndx27iEMzEyPdyQRCYFY9gh+AbxtZiuIDEU9FLgk0FQh9cT0tTw7ax0/PSGXo0f2iXccEQmJfRYCd3/TzHKB/YkUgsXuvifwZCEzf10pN0xdwFG5vfnJcbnxjiMiIdLooSEzyzWzF8xsPvAgsMXd56gItL6yPdVc9fgsemWkcqvmGxaRNtbUOYIHgBeBbwGzgDvaJFEI3fjCAtZsreC2c8eS1bVzvOOISMg0dWiom7vfF338ZzOb1RaBwuaFT9fxzKwirjk+l4nDs+IdR0RCqKk9gjQzO8TMxpnZOKJzF9dZ3iczO9nMlphZoZld10S78WZWY2ZnN7cDiWz1lnJ+/dx8xuf05CfH7RfvOCISUk3tEWzgi3MVb6yz7MBxTb2xmXUE7gROJDJq6XQzm+ruCxtodzPwavOiJ7bK6lqunjybDga3nnsInTrGciWviEjra2pimmO/4ntPAArdfQWAmU0hMkbRwnrtfgI8A4z/ip+XUP76+hLmFJXyjwvG6X4BEYmrIMc0HgisrbNcBEys28DMBgJnEdm7aLQQmNllwGUA2dnZFBQUtChQWVlZi1/bmhaU1HDPjN3kD+pE+pYlFBQsCeyz2kuf25L6HA7qc+sJshA0dA2k11u+FbjW3WuiA9o1yN3vBe4FyMvL8/z8/BYFKigooKWvbS1byyv55a3vMqJPF/5x2VGkpwY7hER76HNbU5/DQX1uPUEWgiJgcJ3lQcD6em3ygCnRItAbONXMqt39+QBzxdX1z89je0UV/7xkfOBFQEQkFrHMWWxm9l0zuyG6PMTMJsTw3tOBXDMbZmapwLnA1LoN3H2Yu+e4ew7wNHBlMheBl+Zt4KV5G7nmhFwOHNAj3nFERIDYBp27i8jENOdFl3cSuRqoSe5eDVxF5GqgRcCT7r7AzK4wsyta
mDdhbdqxm189N48xg3pw+dHD4x1HROQzsRwamuju48xsNoC7b4t+w98nd38JeKnecw1OauPuF8fynomottb59yfnsKeqlr+fM1aXiopIuxLLFqkqeq2/A0TnLK4NNFWSefDDVbxfWMJvTh/NiD5d4x1HROQLYikEtwPPAX3N7I/A+8CfAk2VRFaVlHPLq4s5blRfzpsweN8vEBFpY7EMQ/2Ymc0EjidySeiZ7r4o8GRJoLbW+eUzc0np2IE/nXUQTV0iKyISL7FcNTQCWOnudwLzgRPNLDPwZEng0Y9X88nKrfzm9NH065EW7zgiIg2K5dDQM0Smq9wPuB8YBjweaKoksHZrBTe9vJijR/bh24cOinccEZFGxVIIaqOXgn4TuM3dfwb0DzZWYnN3rnt2Lh3M+O9v6pCQiLRvsV41dB5wEZGJagBSgouU+J6eWcQHhVv4z1NHaUA5EWn3YikElxC5oeyP7r7SzIYBjwYbK3FtK6/kv19ezKFDe3Le+CHxjiMisk+xXDW0ELi6zvJK4KYgQyWyW15dTOmuKv5w5tfooLmHRSQBNFoIzGweXx4t9DPuPiaQRAls1pptTP5kLZceOYwD+nePdxwRkZg0tUdwepulSAI1tc5vnp9PdvfO/PTEkfGOIyISs6ZmKFvdlkES3eMfr2bB+h3ccd4hdO0c5OjeIiKtK5Ybyg4zs+lmVmZmldFJ5ne0RbhEsb2ikr++vpRJw7M4fYyurBWRxBLLVUP/Q2QI6mVAOnApcEeQoRLNbW8uY8euKm74xmjdMyAiCSemYxjuXmhmHd29BvinmX0YcK6EsWJzGY9MW80544foBLGIJKRYCkFFdP6BT83sFmAD0CXYWInjb68vJbVTB36uE8QikqBiOTR0YbTdVUA5kXmIvxVkqESxYH0pL87dwPePGEafbp3jHUdEpEWauo9giLuvqXP10G7gt20TKzH89bWldE/rxA819aSIJLCm9gg+m0TezJ5pgywJZcaqrby1uJjLjxlBj3QNvSQiiaupQlD38hd95a3D3bnllSX06daZ7x8xLN5xRES+kqYKgTfyOPTeWbqZT1Zt5erj9iM9tWO844iIfCVNXTV0cPTGMQPS69xEZoC7eyivlXR3buGAxtcAAA1pSURBVH1jGQMz0zlHo4uKSBJoaogJfdVtwLvLSvh07Xb+dNZBpHaK5aIrEZH2TVuyZnB3bn9zGQN6pHG2pp8UkSShQtAM05ZvYebqbfwof4T2BkQkaWhr1gy3v7WMvt068+28wfGOIiLSalQIYjRj1VY+WrGVy48ZQVqKTp+ISPJQIYjRHW8VktUllfMmaG9ARJKLCkEM5hWV8s7SzXz/yGFkpGrSGRFJLioEMbiroJBuaZ24cNLQeEcREWl1gRYCMzvZzJaYWaGZXdfA+gvMbG7050MzOzjIPC2xdmsFry7YyHcPG0r3NI0pJCLJJ7BCYGYdgTuBU4DRwHlmNrpes5XAMe4+Bvg9cG9QeVrqvvdW0LGD8b1JOfGOIiISiCD3CCYAhe6+wt0rgSnAGXUbuPuH7r4tuvgR0K7u0tpStocnZ6zlrEMG0q9HWrzjiIgEIsgznwOBtXWWi4CJTbT/AfByQyvM7DLgMoDs7GwKCgpaFKisrKxZr31uWSW7q2oZm7alxZ8Zb83tczJQn8NBfW49QRaChmZxb3AUUzM7lkghOLKh9e5+L9HDRnl5eZ6fn9+iQAUFBcT62t1VNfz7e29xwgF9Of/08S36vPagOX1OFupzOKjPrSfIQlBEZFrLvQYB6+s3MrMxwP3AKe6+JcA8zfLCp+vYUl7J94/UfAMiktyCPEcwHcg1s2FmlgqcC0yt28DMhgDPAhe6+9IAszSLu/PA+6s4oH93Jg3PinccEZFABbZH4O7VZnYV8CrQEXjA3ReY2RXR9XcDNwBZwF1mBlDt7nlBZYrVtOVbWLJpJ7ecPYZoLhGRpBXobbLu/hLwUr3n7q7z+FLg0iAztMQ/
P1xFry6p/NvBA+IdRUQkcLqzuJ61Wyt4c9EmzpswWIPLiUgoqBDUM/mTNQBcMFHDSYhIOKgQ1FFVU8uTM4o4blRfBmSmxzuOiEibUCGo442Fmygp28P5EzUpvYiEhwpBHZOnr2VAjzSOGdk33lFERNqMCkHUuu27eG/ZZs7OG0zHDrpkVETCQ4Ug6qkZkWGRvn1ouxr3TkQkcCoEQG2t89SMIo4Y0ZvBvTLiHUdEpE2pEAAfLC9h3fZdnDNe8xGLSPioEABPTF9LZkYKXz8wO95RRETaXOgLQemuKl5bsIkzxw6kcyfdSSwi4RP6QvDyvA1U1tTyrXE6SSwi4RT6QvDc7HUM79OFrw3sHu8oIiJxEepCsG77Lj5euZUzxw7UcNMiElqhLgT/mhOZMO3MsQPjnEREJH5CXQimfrqesYMzGZKlewdEJLxCWwgKi3eycMMOzhiryWdEJNxCWwhenLsBMzjtoP7xjiIiElehLQQvzdvA+Jxe9O2eFu8oIiJxFcpCUFi8k6WbyrQ3ICJCSAvBG4uKATjpwH5xTiIiEn+hLATvLt3MqH7d6NdDh4VEREJXCCoqq5mxahtHj+wT7ygiIu1C6ArBxyu3UllTy5H79Y53FBGRdiF0hWDa8i2kduzA+Jxe8Y4iItIuhLIQjB2SSXqqhpwWEYGQFYLyKmf++lImDc+KdxQRkXYjVIVg9Y5a3CEvp2e8o4iItBuhKgQrS2sA+NqAHnFOIiLSfoSqEKzaUcvgXun07JIa7ygiIu1GqArB2p21jO6vmchEROoKtBCY2clmtsTMCs3sugbWm5ndHl0/18zGBZWluqaWzRXO8D5dg/oIEZGEFFghMLOOwJ3AKcBo4DwzG12v2SlAbvTnMuAfQeVZt30XNQ7DencJ6iNERBJSkHsEE4BCd1/h7pXAFOCMem3OAB72iI+ATDMLZEjQFSXlgAqBiEh9nQJ874HA2jrLRcDEGNoMBDbUbWRmlxHZYyA7O5uCgoJmh1m2rYYxvZwNS+dQsCo8E9WXlZW16O8rkanP4aA+t54gC0FDW1tvQRvc/V7gXoC8vDzPz89vdph8YGRBAS15bSIrUJ9DQX0Oh6D6HOShoSJgcJ3lQcD6FrQREZEABVkIpgO5ZjbMzFKBc4Gp9dpMBS6KXj10GFDq7hvqv5GIiAQnsEND7l5tZlcBrwIdgQfcfYGZXRFdfzfwEnAqUAhUAJcElUdERBoW5DkC3P0lIhv7us/dXeexAz8OMoOIiDQtVHcWi4jIl6kQiIiEnAqBiEjIqRCIiIScRc7XJg4z2wysbuHLewMlrRgnEajP4aA+h8NX6fNQd+/T0IqEKwRfhZnNcPe8eOdoS+pzOKjP4RBUn3VoSEQk5FQIRERCLmyF4N54B4gD9Tkc1OdwCKTPoTpHICIiXxa2PQIREalHhUBEJOSSshCY2clmtsTMCs3sugbWm5ndHl0/18zGxSNna4qhzxdE+zrXzD40s4PjkbM17avPddqNN7MaMzu7LfMFIZY+m1m+mX1qZgvM7J22ztjaYvi33cPM/mVmc6J9TuhRjM3sATMrNrP5jaxv/e2XuyfVD5Ehr5cDw4FUYA4wul6bU4GXicyQdhjwcbxzt0GfDwd6Rh+fEoY+12n3FpFRcM+Od+42+D1nAguBIdHlvvHO3QZ9/hVwc/RxH2ArkBrv7F+hz0cD44D5jaxv9e1XMu4RTAAK3X2Fu1cCU4Az6rU5A3jYIz4CMs2sf1sHbUX77LO7f+ju26KLHxGZDS6RxfJ7BvgJ8AxQ3JbhAhJLn88HnnX3NQDunuj9jqXPDnQzMwO6EikE1W0bs/W4+7tE+tCYVt9+JWMhGAisrbNcFH2uuW0SSXP78wMi3ygS2T77bGYDgbOAu0kOsfyeRwI9zazAzGaa2UVtli4YsfT5f4ADiExzOw+4xt1r2yZeXLT69ivQiWnixBp4rv41srG0SSQx98fMjiVSCI4MNFHwYunzrcC17l4T+bKY
8GLpcyfgUOB4IB2YZmYfufvSoMMFJJY+nwR8ChwHjABeN7P33H1H0OHipNW3X8lYCIqAwXWWBxH5ptDcNokkpv6Y2RjgfuAUd9/SRtmCEkuf84Ap0SLQGzjVzKrd/fm2idjqYv23XeLu5UC5mb0LHAwkaiGIpc+XADd55AB6oZmtBEYBn7RNxDbX6tuvZDw0NB3INbNhZpYKnAtMrddmKnBR9Oz7YUCpu29o66CtaJ99NrMhwLPAhQn87bCuffbZ3Ye5e4675wBPA1cmcBGA2P5tvwAcZWadzCwDmAgsauOcrSmWPq8hsgeEmWUD+wMr2jRl22r17VfS7RG4e7WZXQW8SuSKgwfcfYGZXRFdfzeRK0hOBQqBCiLfKBJWjH2+AcgC7op+Q672BB65McY+J5VY+uzui8zsFWAuUAvc7+4NXoaYCGL8Pf8eeNDM5hE5bHKtuyfs8NRmNhnIB3qbWRFwI5ACwW2/NMSEiEjIJeOhIRERaQYVAhGRkFMhEBEJORUCEZGQUyEQEQk5FQJpl8wsKzqC5qdmttHM1tVZTm3FzznBzEqj77vIzH7dgvfoaGbvRR8PN7Nz66ybaGZ/b+Wci83sphheM87MTv6qny3JT4VA2iV33+LuY919LJGxgv6+dzk6+Nje4Xhb49/w29HPGQ/8oLlDdLt7jbsfFV0cTuSmp73rPnb3n7VCxro5xwHfMrOJ+2g/DlAhkH1SIZCEYmb7mdl8M7sbmAUMNrPtddafa2b3Rx9nm9mzZjbDzD6J3oXZKHcvi77nCDNLN7OHzGyemc0ys6Oj73mQmU2PfjOfG90D6FQnw03AsdH1V0e/yT8f3WtYbWbdo+9jZrbCzHq3IGcFkeGYB0bf6zAzm2Zms83sAzPLNbN0IjcRXhDNcraZdTWzB6OfMdvMvtH834AkIxUCSUSjgf9190OAdU20ux24JXoH9XeIjLPUKDPrQ2TY4wXA1UClux8EXAg8Ej0kdSXwlzp7EPXHeLmO6Dd3d79975PuXgO8yOdDKB8OLI3eAdvcnL2I7Hm8H31qEXBk9O/j98Af3H0X8DvgsWiWp4kUhlfcfQKRAdr+amZpTX2WhEPSDTEhobDc3afH0O4EYH/7fOTRnmaWHt1I1nWsmc0mMiTD7919iZkdCfwZIDqkwXpgP+BD4HozG0pk3P9CM4v1/9ETwC+BR4gcPnqiBTnnEhlQ7fd15hrIBB42sxH7+PyvA6fY57N8pQFDSNwB6aSVqBBIIiqv87iWLw7LW/cbrgET9p5TaMLb7n5mvecaHLfa3R8xs2nAaUSGO/4ekeIQi/eIjImTBfwb8JuW5DSzUcB7Zva8u88D/gi86u53mdl+wCuNvN6AM919eYx5JSR0aEgSWnQCkm3R4+IdiExEs9cbwI/3LpjZ2Ga89bvABdHXHQD0JzLE8XB3L3T324D/A8bUe91OoFsjWZ3I6KC3AnPcfe95hWbldPfFwC1E9i4AevD5IbKLm8jyKpFDXns/55CmPkfCQ4VAksG1RL4Fv0lkrPa9fgwcET2puxD4YTPe8w4gPTqi5WPARdFv7OdbZIL0T4kcp3+03utmAx0tMpH61XzZE8B3+fywUEtz3gUcb5HhxW8G/mxmH9Rr8xZwcPTE8NnAb4GM6AnwBcB/xfA5EgIafVREJOS0RyAiEnIqBCIiIadCICIScioEIiIhp0IgIhJyKgQiIiGnQiAiEnL/D5w4G8QO1cizAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ "