diff --git a/Data_Cleaning_EDA.ipynb b/Data_Cleaning_EDA.ipynb new file mode 100644 index 0000000..4f8aa6f --- /dev/null +++ b/Data_Cleaning_EDA.ipynb @@ -0,0 +1,952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. 
The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading the csv file in us_acci dataframe\n", + "us_acci = spark.read.csv(get_training_filename('US_Accidents.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape is (981531, 49)\n" + ] + } + ], + "source": [ + "# Shape of the spark dataframe\n", + "\n", + "print('Shape is ',(us_acci.count(),len(us_acci.columns)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Renaming the columns names which contain brackets for ease of usage\n", + "\n", + "us_acci = us_acci.select('*').withColumnRenamed('Distance(mi)','Distance')\\\n", + " .withColumnRenamed('Temperature(F)', 'Temperature').withColumnRenamed('Wind_Chill(F)', 'Wind_Chill')\\\n", + " .withColumnRenamed('Humidity(%)', 'Humidity').withColumnRenamed('Pressure(in)', 'Pressure')\\\n", + " .withColumnRenamed('Visibility(mi)', 
'Visibility').withColumnRenamed('Wind_Speed(mph)', 'Wind_Speed')\\\n", + " .withColumnRenamed('Precipitation(in)', 'Precipitation')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- ID: string (nullable = true)\n", + " |-- Source: string (nullable = true)\n", + " |-- TMC: double (nullable = true)\n", + " |-- Severity: integer (nullable = true)\n", + " |-- Start_Time: timestamp (nullable = true)\n", + " |-- End_Time: timestamp (nullable = true)\n", + " |-- Start_Lat: double (nullable = true)\n", + " |-- Start_Lng: double (nullable = true)\n", + " |-- End_Lat: double (nullable = true)\n", + " |-- End_Lng: double (nullable = true)\n", + " |-- Distance: double (nullable = true)\n", + " |-- Description: string (nullable = true)\n", + " |-- Number: double (nullable = true)\n", + " |-- Street: string (nullable = true)\n", + " |-- Side: string (nullable = true)\n", + " |-- City: string (nullable = true)\n", + " |-- County: string (nullable = true)\n", + " |-- State: string (nullable = true)\n", + " |-- Zipcode: string (nullable = true)\n", + " |-- Country: string (nullable = true)\n", + " |-- Timezone: string (nullable = true)\n", + " |-- Airport_Code: string (nullable = true)\n", + " |-- Weather_Timestamp: timestamp (nullable = true)\n", + " |-- Temperature: double (nullable = true)\n", + " |-- Wind_Chill: double (nullable = true)\n", + " |-- Humidity: double (nullable = true)\n", + " |-- Pressure: double (nullable = true)\n", + " |-- Visibility: double (nullable = true)\n", + " |-- Wind_Direction: string (nullable = true)\n", + " |-- Wind_Speed: double (nullable = true)\n", + " |-- Precipitation: double (nullable = true)\n", + " |-- Weather_Condition: string (nullable = true)\n", + " |-- Amenity: boolean (nullable = true)\n", + " |-- Bump: boolean (nullable = true)\n", + " |-- Crossing: boolean (nullable = true)\n", + " |-- Give_Way: boolean 
(nullable = true)\n", + " |-- Junction: boolean (nullable = true)\n", + " |-- No_Exit: boolean (nullable = true)\n", + " |-- Railway: boolean (nullable = true)\n", + " |-- Roundabout: boolean (nullable = true)\n", + " |-- Station: boolean (nullable = true)\n", + " |-- Stop: boolean (nullable = true)\n", + " |-- Traffic_Calming: boolean (nullable = true)\n", + " |-- Traffic_Signal: boolean (nullable = true)\n", + " |-- Turning_Loop: boolean (nullable = true)\n", + " |-- Sunrise_Sunset: string (nullable = true)\n", + " |-- Civil_Twilight: string (nullable = true)\n", + " |-- Nautical_Twilight: string (nullable = true)\n", + " |-- Astronomical_Twilight: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "# Schema of dataframe with datatypes of all columns\n", + "\n", + "us_acci.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary of all numerical variables in the dataset to find the skewness, outliers, mean, median\n", + "\n", + "us_acci.describe().toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Columns to be dropped initially as a list\n", + "drop_col = ['ID','End_Lat','End_Lng','Description','Number','Street','Zipcode','Airport_Code','Country','Weather_Timestamp','Wind_Chill','Turning_Loop']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping columns from the original dataset\n", + "\n", + "us_acci = us_acci.drop(*(drop_col))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+\n", + "|Source| TMC|Severity|Start_Time|End_Time|Start_Lat|Start_Lng|Distance|Side|City|County|State|Timezone|Temperature|Humidity|Pressure|Visibility|Wind_Direction|Wind_Speed|Precipitation|Weather_Condition|Amenity|Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|\n", + "+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+\n", + "| 0|240595| 0| 0| 0| 0| 0| 0| 0| 30| 0| 0| 1069| 18494| 19485| 15870| 21643| 14882| 145268| 658958| 21750| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 37| 37| 37| 37|\n", + "+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking null values in all the columns\n", + "\n", + "from pyspark.sql.functions import isnan, isnull, when, count, col\n", + "\n", + "us_acci.select([count(when(isnull(c), c)).alias(c) for c in us_acci.columns]).show()" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Replacing same Wind_Direction named with different string N and North same\n", + "\n", + "us_acci = us_acci.withColumn('Wind_Direction', when(us_acci.Wind_Direction == 'E', 'East')\\\n", + " .when(us_acci.Wind_Direction == 'W', 'West')\\\n", + " .when(us_acci.Wind_Direction == 'N', 'North')\\\n", + " .when(us_acci.Wind_Direction == 'S', 'South')\\\n", + " .when(us_acci.Wind_Direction == 'VAR', 'Variable')\\\n", + " .when(us_acci.Wind_Direction == 'CALM', 'Calm')\\\n", + " .otherwise(us_acci.Wind_Direction))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Replacing same Weather_Condition named with different string Light Rain Shower and Light Rain Showers same\n", + "\n", + "us_acci = us_acci.withColumn('Weather_Condition', when(us_acci.Weather_Condition == 'Light Rain Shower', 'Light Rain Showers')\\\n", + " .when(us_acci.Weather_Condition == 'Light Snow Shower', 'Light Snow Showers')\\\n", + " .when(us_acci.Weather_Condition == 'Rain Shower', 'Rain Showers')\\\n", + " .otherwise(us_acci.Weather_Condition))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping Null rows from City Column as there are only 30 rows with City = Null\n", + "\n", + "us_acci = us_acci.where(col(\"city\").isNotNull())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping Null rows from Timezone Column \n", + "\n", + "us_acci = us_acci.where(col(\"Timezone\").isNotNull())" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Clipping Temperature extreme values to suppress outliers\n", + "\n", + "lower = -30\n", + "upper = 115\n", + "us_acci = us_acci.withColumn('Temperature', when(us_acci.Temperature > upper, 
upper)\\\n", + " .when(us_acci.Temperature < lower, lower).otherwise(us_acci.Temperature).alias('Temperature'))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Distance > 100 rows dropped\n", + "us_acci = us_acci.where(us_acci.Distance <100)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Clipping Visibility extreme values to suppress outliers \n", + "\n", + "upper = 20\n", + "us_acci = us_acci.withColumn('Visibility', when(us_acci.Visibility > upper, upper)\\\n", + " .otherwise(us_acci.Visibility).alias('Visibility'))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Clipping Wind_Speed extreme values to suppress outliers \n", + "upper = 40\n", + "us_acci = us_acci.withColumn('Wind_Speed', when(us_acci.Wind_Speed > upper, upper)\\\n", + " .otherwise(us_acci.Wind_Speed).alias('Wind_Speed'))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Replacing Null values in Precipitation with 0\n", + "\n", + "us_acci = us_acci.withColumn('Precipitation', when(us_acci.Precipitation.isNull(), 0).otherwise(us_acci.Precipitation))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "#replacing missing values in categorical attributes with the mode of the corresponding variables\n", + "\n", + "for col_name in ['Wind_Direction', 'Weather_Condition']:\n", + " common = us_acci.dropna().groupBy(col_name).agg(fn.count('*')).orderBy('count(1)', ascending = False).first()[col_name]\n", + " us_acci = us_acci.withColumn(col_name, when(isnull(col_name), common).otherwise(us_acci[col_name]))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "#replacing missing values in numerical attributes with the median 
of the corresponding variables\n", + "\n", + "for col_name in ['Temperature', 'Humidity', 'Pressure', 'Visibility', 'Wind_Speed']:\n", + " median = us_acci.dropna().approxQuantile(col_name, [0.5], 0.00)[0]\n", + " us_acci = us_acci.withColumn(col_name, when(isnull(col_name), median).otherwise(us_acci[col_name]))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Removing Null values in last 4 columns \n", + "\n", + "us_acci = us_acci.filter(us_acci.Sunrise_Sunset.isNotNull())\n", + "\n", + "us_acci = us_acci.filter(us_acci.Civil_Twilight.isNotNull())\n", + "\n", + "us_acci = us_acci.filter(us_acci.Nautical_Twilight.isNotNull())\n", + "\n", + "us_acci = us_acci.filter(us_acci.Astronomical_Twilight.isNotNull())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Adding Month of Year, Day of Week , Week of Year and Hour of the Day Column from Start Time\n", + "\n", + "us_acci = us_acci.withColumn(\"Start_Time\",to_timestamp(col(\"Start_Time\"))).withColumn(\"month_of_year\", date_format(col(\"Start_Time\"), \"MMMM\")).withColumn(\"day_of_week\", date_format(col(\"Start_Time\"), \"EEEE\")).withColumn(\"hour_day\", date_format(col(\"Start_Time\"), \"H\")).withColumn(\"week_of_year\", date_format(col(\"Start_Time\"), \"w\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+-------------+-----------+--------+------------+\n", + "|Source| 
TMC|Severity|Start_Time|End_Time|Start_Lat|Start_Lng|Distance|Side|City|County|State|Timezone|Temperature|Humidity|Pressure|Visibility|Wind_Direction|Wind_Speed|Precipitation|Weather_Condition|Amenity|Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|month_of_year|day_of_week|hour_day|week_of_year|\n", + "+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+-------------+-----------+--------+------------+\n", + "| 0|240120| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n", + "+------+------+--------+----------+--------+---------+---------+--------+----+----+------+-----+--------+-----------+--------+--------+----------+--------------+----------+-------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+--------------+--------------+-----------------+---------------------+-------------+-----------+--------+------------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking null values in all the columns\n", + "\n", + "from pyspark.sql.functions import isnan, isnull, when, count, col\n", + "\n", + "us_acci.select([count(when(isnull(c), c)).alias(c) for c in us_acci.columns]).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape is (980416, 41)\n" + ] + } + ], + "source": [ + "# Shape of the spark dataframe\n", + "\n", + "print('Shape is 
',(us_acci.count(),len(us_acci.columns)))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "us_acci.toPandas().to_csv(\"Us_clean.csv\",header=True,index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Converting spark dataframe to Pandas DF for EDA\n", + "\n", + "Us_acci_pd = us_acci.toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Monthly Accident count showing how distribution of accidents on 12 months of the year\n", + "\n", + "month_lst = ['January','February','March','April','May','June','July','August','September','October','November','December']\n", + "Us_acci_pd.groupby(['Severity', 'month_of_year']).size().reset_index().pivot(columns='Severity', index='month_of_year', values=0).reindex(month_lst).plot(kind='bar', stacked=True, title='Monthly Accident count for the year for each severity',)\n", + "display()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# Daily Accident count showing how distribution of accidents on 7 days of the week\n", + "\n", + "weekday_lst = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']\n", + "Us_acci_pd.groupby(['Severity', 'day_of_week']).size().reset_index().pivot(columns='Severity', index='day_of_week',values=0).reindex(weekday_lst).plot(kind='bar', stacked=True,title='Daily Accident count for the week for each severity')\n", + "display()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Day light savings in USA bi-annually for the weeks 11 and 45 rise is seen as compared to previous weeks 10 & 44 for March & November respectively\n", + "\n", + "col = ['10','11','12','44','45','46']\n", + "newcol = Us_acci_pd[Us_acci_pd.week_of_year.isin(col)]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# For Day light Savings checking\n", + "day_acci = newcol.groupby(['Severity', 'week_of_year']).size().reset_index().pivot(columns='Severity', index='week_of_year', values=0).plot(kind='bar', stacked=True,title='Day Light Savings Week Comparison of Accidents')\n", + "display(day_acci)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "subset=['Amenity', 'Bump', 'Crossing',\n", + " 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',\n", + " 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',\n", + " 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',\n", + " 'Astronomical_Twilight']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "subset.remove(\"Turning_Loop\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Severity as a percentage on y axis showing how severe the accidents can be considering the various variables \n", + "\n", + "fig,ax=plt.subplots(4,4,figsize=(20,30))\n", + "for i in range(len(subset)):\n", + " df_plot = Us_acci_pd.groupby([subset[i], \"Severity\"]).size().reset_index().pivot(columns=subset[i], index=\"Severity\", values=0)\n", + " df_plot.div(df_plot.sum(axis=1), axis=0).plot(kind='bar', stacked=True,ax=ax[int(i/4),i-int(i/4)*4])\n", + "display()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# Sunrise Sunset for Day & Night comparison of Severity \n", + "\n", + "df_plot = Us_acci_pd.groupby([\"Sunrise_Sunset\",\"Civil_Twilight\"]).size().reset_index().pivot(columns=\"Sunrise_Sunset\", index= \"Civil_Twilight\", values=0)\n", + "display(df_plot.div(df_plot.sum(axis=1), axis=0).plot(kind='bar', stacked=True,title='Sunrise Sunset for Day & Night comparison of Severity'))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fig,ax=plt.subplots(1,figsize=(20,10))\n", + "sns.scatterplot(x='Start_Lng', y='Start_Lat', data=Us_acci_pd, hue='Severity',palette='RdYlGn_r',ax=ax)\n", + "ax.xlabel('Longitude')\n", + "ax.ylabel('Latitude)')\n", + "ax.set_title('US Map showing severity of accidents by their categories')\n", + "plt.show()\n", + "display()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Accident count showing how distribution of accidents on each hour days of the day for the whole 24 hours\n", + "\n", + "hour_lst = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23']\n", 
+ "\n", + "\n", + "display(Us_acci_pd.groupby(['Severity', 'hour_day']).size().reset_index().pivot(columns='Severity', index='hour_day', values=0).reindex(hour_lst).plot(kind='bar', stacked=True,figsize=(10,7),title='Hourly accident count for each severity throughout 24 hours of a day'))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# Accidents count by each state and for each state count by severity of accidents\n", + "\n", + "display(pd.crosstab(columns=Us_acci_pd['Severity'],\n", + " index=Us_acci_pd['State']).plot(kind='bar',stacked=True,figsize=(16,8),color=['purple','orange','blue','red','green'],title='State Wise Accident count for each severity along with total count'))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Percent of severity\n", + "f,ax=plt.subplots(1,2,figsize=(12,8))\n", + "Us_acci_pd['Severity'].value_counts().plot.pie(explode=[0,0,0,0.2],autopct='%1.1f%%',ax=ax[0],shadow=False)\n", + "ax[0].set_title('Percentage Severity Distribution')\n", + "ax[0].set_ylabel('Count')\n", + "display(sns.countplot(x='Severity',data=Us_acci_pd,ax=ax[1],order=Us_acci_pd['Severity'].value_counts().index))\n", + "ax[1].set_title('Count of Severity')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# Bar plot of the ten states with the highest number of accidents (tick styling goes through plt because ax still refers to the previous cell's 1x2 axes array)\n", + "top_ten_states = Us_acci_pd['State'].value_counts().head(10)\n", + "display(top_ten_states.plot.bar(width=0.5,edgecolor='k',align='center',linewidth=2,title='10 US States with the Highest Number of Accidents',figsize=(16,7)))\n", + "plt.xlabel('States',fontsize=20)\n", + "plt.ylabel('Number of Accidents',fontsize=20)\n", + "plt.tick_params(labelsize=20)\n", + "plt.grid()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs":
[], + "source": [ + "# Bar plot of the ten cities with the highest number of accidents (tick styling goes through plt because ax is not the axes of this plot)\n", + "top_ten_cities = Us_acci_pd['City'].value_counts().head(10)\n", + "display(top_ten_cities.plot.bar(width=0.5,edgecolor='k',align='center',linewidth=2,title='10 US Cities with the Highest Number of Accidents',figsize=(16,7)))\n", + "plt.xlabel('Cities',fontsize=20)\n", + "plt.ylabel('Number of Accidents',fontsize=20)\n", + "plt.tick_params(labelsize=20)\n", + "plt.grid()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "# Accidents based on the side of the driving Left & Right USA Right side driving \n", + "\n", + "Side_lst = ['R','L']\n", + "display(Us_acci_pd.groupby(['Severity', 'Side']).size().reset_index().pivot(columns='Severity', index='Side',values=0).reindex(Side_lst).plot(kind='bar', stacked=True,title='Accidents Severity based on Right or Left Side of Driving'))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Us_acci_pd.boxplot(column=['Distance']))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "display(Us_acci_pd.boxplot(column=['Temperature']))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "display(Us_acci_pd.boxplot(column=['Wind_Speed']))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "display(Us_acci_pd.boxplot(column=['Humidity']))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "display(Us_acci_pd.boxplot(column=['Pressure']))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "display(Us_acci_pd.boxplot(column=['Visibility']))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# Univariate histogram of Temp \n", + "\n", + "var = 'Temperature'\n", + "x = Us_acci_pd[var]\n", + "\n", + "bins = np.arange(-30, 120,10.0)\n", + "\n", + "plt.hist(x, bins, alpha=0.8, histtype='bar', color='gold',\n", + " ec='black')\n", + "\n", + "plt.xlabel(var)\n", + "plt.ylabel('count')\n", + "plt.xticks(bins)\n", + "display(plt.show())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# Univariate Analysis of Wind_Speed\n", + "\n", + "var = 'Wind_Speed'\n", + "x = Us_acci_pd[var]\n", + "\n", + "bins = np.arange(0, 100,5.0)\n", + "\n", + "plt.hist(x, bins, alpha=0.8, histtype='bar', color='gold',\n", + " ec='black')\n", + "\n", + "plt.xlabel(var)\n", + "plt.ylabel('count')\n", + "plt.xticks(bins)\n", + "display(plt.show())" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "name": "Data_Cleaning_Project_IST718 (5) (1)", + "notebookId": 1278832175434352 + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/IST718_USAccidents_Project_Report.pdf b/IST718_USAccidents_Project_Report.pdf new file mode 100644 index 0000000..236e178 Binary files /dev/null and b/IST718_USAccidents_Project_Report.pdf differ diff --git a/LR_Binary.ipynb b/LR_Binary.ipynb new file mode 100644 index 0000000..0bc509d --- /dev/null +++ b/LR_Binary.ipynb @@ -0,0 +1,725 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# importing required libraries\n", + "\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from 
pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "spark = SparkSession.builder.getOrCreate()\n", + "from sklearn.metrics import classification_report\n", + "sc = spark.sparkContext\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. 
The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading the training csv file into the us_train dataframe\n", + "us_train = spark.read.csv(get_training_filename('USAccident_train_OHE.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading the validation csv file into the us_test dataframe\n", + "us_test = spark.read.csv(get_training_filename('USAccident_validation_OHE.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Converts the train data with Severity 3 and 4 to 1 and rest to 0\n", + "us_train=us_train.withColumn(\"Severity\",when(((us_train[\"Severity\"]==4) | (us_train[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Converts the test data with Severity 3 and 4 to 1 and rest to 0\n", + "us_test=us_test.withColumn(\"Severity\",when(((us_test[\"Severity\"]==4) |
(us_test[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Declaring the vector assembler\n", + "va = VectorAssembler().setInputCols([i for i in us_train.columns if i!='Severity']).setOutputCol('features')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Centering the data\n", + "center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Indexing the Severity target column into the numerical 'label' column\n", + "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LR Binary Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Create initial LogisticRegression model\n", + "lr_w = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")\n", + "\n", + "# Build the pipeline: label indexer, vector assembler, centering, logistic regression\n", + "lrModel_w = Pipeline(stages=[label_stringIdx,va, center, lr_w])\n", + "\n", + "# Fits the model\n", + "lr_fit_w = lrModel_w.fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Performs the prediction on the test set\n", + "pred_lrb = lr_fit_w.transform(us_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7213747454175152\n" + ] + } + ], + "source": [ + "# Calculates the accuracy for binary data\n", + "true_labels=us_test.toPandas()[\"Severity\"]\n", + "binary_prediction=lr_fit_w.transform(us_test).select(\"prediction\").collect()\n", +
"binary_true_labels=us_test.select(\"Severity\").collect()\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Area Under ROC 0.7621852339529867\n" + ] + } + ], + "source": [ + "# Using the evaluator to calculate the AUC ROC\n", + "evaluator_lrb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "print('Test Area Under ROC', evaluator_lrb.evaluate(pred_lrb))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set areaUnderROC: 0.7621682343009717\n" + ] + } + ], + "source": [ + "trainingSummary = lr_fit_w.stages[-1].summary\n", + "roc = trainingSummary.roc.toPandas()\n", + "plt.plot(roc['FPR'],roc['TPR'])\n", + "plt.ylabel('False Positive Rate')\n", + "plt.xlabel('True Positive Rate')\n", + "plt.title('ROC Curve')\n", + "plt.grid(True)\n", + "plt.show()\n", + "print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_fit_w.stages[-1].getRegParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_fit_w.stages[-1].getElasticNetParam()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_lrb=(pred_lrb).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.91 0.81 131790\n", + " 1 0.65 0.34 0.44 64610\n", + "\n", + " micro avg 0.72 0.72 0.72 196400\n", + " macro avg 0.69 0.62 0.63 196400\n", + "weighted avg 0.71 0.72 0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification report for calculating the metrics\n", + "print(classification_report(y_pred=prediction_lrb,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# stores all the co-efficients of the logistic regression\n", + "coef_L1_m=lr_fit_w.stages[-1].coefficients.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# makes a datafram of the coefficients\n", + "feat_imp_tuned_lrt = pd.DataFrame(list(zip([i for i in us_train.columns if i!='Severity'], coef_L1_m)),\n", + " columns = ['column', 'weight']).sort_values('weight')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features are 120\n", + "Eliminated features out of 120 are 0\n" + ] + } + ], + "source": [ + "# Printing the number of feature eliminated\n", + "coef_L1_m = np.absolute(coef_L1_m)\n", + "print('Total number of features are',len(coef_L1_m))\n", + "sorted_abs = np.sort(coef_L1_m)\n", + "weights_notzero = sorted_abs[sorted_abs == 0]\n", + "nonzero_weights = len(sorted_abs[sorted_abs == 0])\n", + 
"print('Eliminated features out of ' + str(len(coef_L1_m)) + ' are', len(weights_notzero))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LR Binary Grid Search Model" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "lr_new = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a grid for tuning the algorithm\n", + "#paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01, 0.8,0.03]).addGrid(lr_new.elasticNetParam, [0.1,0.4,0.7]).build()\n", + "paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01]).addGrid(lr_new.elasticNetParam, [0.1]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Making the pipeline for prediction\n", + "cvModel_lrmu = Pipeline(stages=[label_stringIdx,va,center,lr_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining a evaluator for evaluating our prediction\n", + "evaluator_lrbt = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Defining the cross validator for model\n", + "cv = CrossValidator(estimator=cvModel_lrmu, estimatorParamMaps=paramGrid_lr, evaluator=evaluator_lrbt, numFolds=5).fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Makes prediction on test set\n", + "pred_lrbt = cv.transform(us_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Area Under ROC 0.7615211426502622\n" + ] + } + ], + 
"source": [ + "evaluator_lrb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "print('Test Area Under ROC', evaluator_lrb.evaluate(pred_lrbt))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set areaUnderROC: 0.7616980707233938\n" + ] + } + ], + "source": [ + "trainingSummary_t = cv.bestModel.stages[-1].summary\n", + "roc = trainingSummary_t.roc.toPandas()\n", + "plt.plot(roc['FPR'],roc['TPR'])\n", + "plt.ylabel('False Positive Rate')\n", + "plt.xlabel('True Positive Rate')\n", + "plt.title('ROC Curve')\n", + "plt.show()\n", + "print('Training set areaUnderROC: ' + str(trainingSummary_t.areaUnderROC))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.01" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv.bestModel.stages[-1].getRegParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv.bestModel.stages[-1].getElasticNetParam()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_lrbt=(pred_lrbt).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + 
"execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.92 0.81 131790\n", + " 1 0.66 0.31 0.42 64610\n", + "\n", + " micro avg 0.72 0.72 0.72 196400\n", + " macro avg 0.69 0.62 0.62 196400\n", + "weighted avg 0.71 0.72 0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_lrbt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7195977596741344\n" + ] + } + ], + "source": [ + "# calculates the accuracy\n", + "true_labels=us_test.toPandas()[\"Severity\"]\n", + "evaluator_lrbt.evaluate(pred_lrbt)\n", + "binary_prediction=pred_lrbt.select(\"prediction\").collect()\n", + "binary_true_labels=us_test.select(\"Severity\").collect()\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# stores all coefficients of the LR model\n", + "coef_L1_m=cv.bestModel.stages[-1].coefficients.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Makes a dataframe of the variables and their respective coefficients\n", + "feat_imp_tuned_lrt = pd.DataFrame(list(zip([i for i in us_train.columns if i!='Severity'], coef_L1_m)),\n", + " columns = ['column', 'weight']).sort_values('weight')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Total number of features are 120\n", + "Eliminated features out of 120 are 28\n" + ] + } + ], + "source": [ + "# Shows the number of features that our LR model eliminated\n", + "coef_L1_m = np.absolute(coef_L1_m)\n", + "print('Total number of features are',len(coef_L1_m))\n", + "sorted_abs = np.sort(coef_L1_m)\n", + "weights_notzero = sorted_abs[sorted_abs == 0]\n", + "nonzero_weights = len(sorted_abs[sorted_abs == 0])\n", + "print('Eliminated features out of ' + str(len(coef_L1_m)) + ' are', len(weights_notzero))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/LR_Multiclass.ipynb b/LR_Multiclass.ipynb new file mode 100644 index 0000000..cac68a9 --- /dev/null +++ b/LR_Multiclass.ipynb @@ -0,0 +1,583 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import the requied libraries\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", 
+ "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import StringIndexer\n", + "from sklearn.metrics import classification_report\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "from pyspark.ml.classification import LogisticRegression\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "from pyspark.mllib.evaluation import MulticlassMetrics\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. 
The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# importing the train data\n", + "us_train = spark.read.csv(get_training_filename('USAccident_train_OHE.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# importing the test data\n", + "us_test = spark.read.csv(get_training_filename('USAccident_validation_OHE.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we remap the Severity classes 2, 3 and 4 to 0, 1 and 2 (in both the train and test sets) to avoid errors while calculating the evaluation metrics." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==2,0).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==2,0).otherwise(us_train[\"Severity\"]))"
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==3,1).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==3,1).otherwise(us_train[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==4,2).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==4,2).otherwise(us_train[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Declaring the vector assembler\n", + "va = VectorAssembler().setInputCols([i for i in us_train.columns if i!='Severity']).setOutputCol('features')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Centering our data for logistic regression model\n", + "center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features',)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Converting the labels from string to integers\n", + "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Create initial LogisticRegression model\n", + "lr = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")\n", + "\n", + "# Train model with Training Data\n", + "lrModel = 
Pipeline(stages=[label_stringIdx,va, center, lr])\n", + "\n", + "lr_fit = lrModel.fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a multiclass evaluator\n", + "evaluator_mul = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7213798370672098\n" + ] + } + ], + "source": [ + "print(\"Accuracy:\",evaluator_mul.evaluate(lr_fit.transform(us_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_fit.stages[-1].getElasticNetParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_fit.stages[-1].getRegParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_lrm=(lr_fit.transform(us_test)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.91 0.82 131790\n", + " 1 0.64 0.36 0.46 58617\n", + " 2 0.54 0.11 0.18 5993\n", + "\n", + " micro avg 0.72 0.72 0.72 196400\n", + " macro avg 0.64 0.46 0.49 196400\n", + "weighted avg 0.70 0.72 
0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "# Prints the classification report for evaluating our model\n", + "print(classification_report(y_pred=prediction_lrm,y_true=true_labels))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LR Multiclass Grid Search Model " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "lr_new = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a grid for tuning our model\n", + "#paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01, 0.04,0.07]).addGrid(lr_new.elasticNetParam, [0.2,0.5,0.8]).build()\n", + "paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01]).addGrid(lr_new.elasticNetParam, [0.2]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the pipeline for the model\n", + "cvModel_lrmu = Pipeline(stages=[label_stringIdx,va,center,lr_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# define the evaluator for checking the accuracy of our model\n", + "evaluator_mul = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a 5-fold cross validator (seed=42) and fit it on the training data\n", + "cv = CrossValidator(estimator=cvModel_lrmu, estimatorParamMaps=paramGrid_lr, evaluator=evaluator_mul, numFolds=5,seed=42).fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cv.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [
+ { + "data": { + "text/plain": [ + "0.2" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv.bestModel.stages[-1].getElasticNetParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.01" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv.bestModel.stages[-1].getRegParam()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7184317718940937\n" + ] + } + ], + "source": [ + "print(\"Accuracy:\",evaluator_mul.evaluate(cv.bestModel.transform(us_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# stores the LR co-efficients of all the variable\n", + "coeft_L1_m=cv.bestModel.stages[-1].coefficientMatrix.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features are 120\n", + "Eliminated features out of 120 are 105\n" + ] + } + ], + "source": [ + "# prints the total no of features eliminated\n", + "coef_L1_mul=cv.bestModel.stages[-1].coefficientMatrix.toArray()\n", + "coeft_L1_mb = np.squeeze(coeft_L1_m)\n", + "coef_one_b = coeft_L1_m[:][0]\n", + "coef_two_b = coeft_L1_m[:][1]\n", + "coef_three_b = coeft_L1_m[:][2]\n", + "coef_one_b = np.absolute(coef_one_b)\n", + "coef_two_b = np.absolute(coef_two_b)\n", + "coef_three_b = np.absolute(coef_three_b)\n", + "\n", + "print('Total number of features are',len(coef_three_b))\n", + "\n", + "sorted_abs = np.sort(coef_three_b)\n", + "weights_notzero = sorted_abs[sorted_abs == 0]\n", + "nonzero_weights = len(sorted_abs[sorted_abs == 0])\n", + "\n", + "print('Eliminated features out of ' + 
str(len(coef_three_b)) +' are', nonzero_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# gets the prediction by running on our test set\n", + "prediction_lrt=(cv.bestModel.transform(us_test)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# stores the true label for using it to print the classification report below\n", + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.91 0.82 131790\n", + " 1 0.64 0.36 0.46 58617\n", + " 2 0.54 0.11 0.18 5993\n", + "\n", + " micro avg 0.72 0.72 0.72 196400\n", + " macro avg 0.64 0.46 0.49 196400\n", + "weighted avg 0.70 0.72 0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_lrm,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + 
"nbformat": 4, + "nbformat_minor": 2 +} diff --git a/LR_Multiclass_Bal .ipynb b/LR_Multiclass_Bal .ipynb new file mode 100644 index 0000000..de405bb --- /dev/null +++ b/LR_Multiclass_Bal .ipynb @@ -0,0 +1,1059 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "from pyspark.ml.classification import LogisticRegression\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = 
os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#Loading Train Data\n", + "\n", + "us_train = spark.read.csv(get_training_filename('USAccident_train_bal_cat.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Loading Test Data\n", + "\n", + "us_test = spark.read.csv(get_training_filename('USAccident_val_bal_cate.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| 
count|\n", + "+--------+------+\n", + "| 3|234445|\n", + "| 4|219519|\n", + "| 2|263497|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of data in training dataset\n", + "\n", + "us_train.groupBy('Severity').count().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| count|\n", + "+--------+------+\n", + "| 3| 58339|\n", + "| 4| 6121|\n", + "| 2|131724|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of data in testing dataset\n", + "\n", + "us_test.groupBy('Severity').count().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for test dataset\n", + "\n", + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==2,0).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for train dataset\n", + "\n", + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==2,0).otherwise(us_train[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for test dataset\n", + "\n", + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==3,1).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for train dataset\n", + "\n", + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==3,1).otherwise(us_train[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# Assigning label 2 to severity 4 label for test dataset\n", + "\n", + "us_test=us_test.withColumn(\"Severity\",when(us_test[\"Severity\"]==4,2).otherwise(us_test[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 2 to severity 4 label for train dataset\n", + "\n", + "us_train=us_train.withColumn(\"Severity\",when(us_train[\"Severity\"]==4,2).otherwise(us_train[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Vector Assembler to combine all columns except Severity into a single 'features' vector column used as model input\n", + "\n", + "va = VectorAssembler().setInputCols([i for i in us_train.columns if i!='Severity']).setOutputCol('features')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard Scaler to mean-center the features (withStd=False, so no variance scaling) for the Logistic Regression\n", + "\n", + "center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# String Indexer to index the target variable Severity into the 'label' column that the model predicts\n", + "\n", + "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic Regression Multiclass Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Create initial LogisticRegression model\n", + "lr = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")\n", + "\n", + "# LR model pipeline \n", + "\n", + "lrModel = Pipeline(stages=[label_stringIdx,va, center, lr])\n", + "\n", + "# Fit the training data using the LR model \n", + 
"\n", + "lr_fit = lrModel.fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluator for Evaluating the model performance\n", + "\n", + "evaluator_mul = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy is 0.5680891408065898\n" + ] + } + ], + "source": [ + "# Accuracy calculation for the model on test data\n", + "\n", + "print(\"Accuracy is\",evaluator_mul.evaluate(lr_fit.transform(us_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_lrm=(lr_fit.transform(us_test)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.89 0.48 0.62 131724\n", + " 1 0.51 0.74 0.61 58339\n", + " 2 0.14 0.93 0.24 6121\n", + "\n", + " accuracy 0.57 196184\n", + " macro avg 0.51 0.71 0.49 196184\n", + "weighted avg 0.75 0.57 0.60 196184\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + 
"print(classification_report(y_pred=prediction_lrm,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "# Weights/coefficients for All variables assigned by LR Model \n", + "\n", + "coef_L1_mul=lr_fit.stages[-1].coefficientMatrix.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "# Squeeze the coefficient matrix (removes length-1 axes only). NOTE(review): this reads coeft_L1_m, which is assigned in the grid-search section below, not coef_L1_mul from the base model above -- confirm which matrix is intended\n", + "\n", + "coeft_L1_mb = np.squeeze(coeft_L1_m)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract 1st array of coefficients with features equal to number of columns\n", + "\n", + "coef_one_b = coeft_L1_m[:][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract 2nd array of coefficients with features equal to number of columns\n", + "\n", + "coef_two_b = coeft_L1_m[:][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract 3rd array of coefficients with features equal to number of columns\n", + "\n", + "coef_three_b = coeft_L1_m[:][2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Number of Features Eliminated by L1 Regularization for Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features for 1st class are 119\n", + "Total number of features for 2nd class are 119\n", + "Total number of features for 3rd class are 119\n", + "Eliminated features for 1st class out of 119 are 60\n", + "Eliminated features for 2nd class out of 119 are 84\n", + "Eliminated features for 3rd class out of 119 are 64\n" + ] + } + ], + "source": [ + "# Taking the absolute value of the weights and calculating how many 
features were eliminated by the model for each class each array\n", + "\n", + "coef_one_b = np.absolute(coef_one_b)\n", + "coef_two_b = np.absolute(coef_two_b)\n", + "coef_three_b = np.absolute(coef_three_b)\n", + "\n", + "print('Total number of features for 1st class are',len(coef_one_b))\n", + "print('Total number of features for 2nd class are',len(coef_two_b))\n", + "print('Total number of features for 3rd class are',len(coef_three_b))\n", + "\n", + "sorted_abs_1 = np.sort(coef_one_b)\n", + "sorted_abs_2 = np.sort(coef_two_b)\n", + "sorted_abs_3 = np.sort(coef_three_b)\n", + "\n", + "weights_notzero_1 = sorted_abs_1[sorted_abs_1 == 0]\n", + "nonzero_weights_1 = len(sorted_abs_1[sorted_abs_1 == 0])\n", + "\n", + "weights_notzero_2 = sorted_abs_2[sorted_abs_2 == 0]\n", + "nonzero_weights_2 = len(sorted_abs_2[sorted_abs_2 == 0])\n", + "\n", + "weights_notzero_3 = sorted_abs_3[sorted_abs_3 == 0]\n", + "nonzero_weights_3 = len(sorted_abs_3[sorted_abs_3 == 0])\n", + "\n", + "print('Eliminated features for 1st class out of ' + str(len(coef_one_b)) +' are', nonzero_weights_1)\n", + "print('Eliminated features for 2nd class out of ' + str(len(coef_two_b)) +' are', nonzero_weights_2)\n", + "print('Eliminated features for 3rd class out of ' + str(len(coef_three_b)) +' are', nonzero_weights_3)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas dataframe of weights of variables with variable names to find which variables are eliminated for 1st class\n", + "\n", + "feat_imp_tuned_lrb1 = pd.DataFrame(list(zip([i for i in us_train.columns if i!='Severity'], coef_one_b)),\n", + " columns = ['column', 'weight']).sort_values('weight')" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columnweight
0Hour_Index_80.0
85clear0.0
49day_of_week_Index_30.0
48day_of_week_Index_40.0
47day_of_week_Index_10.0
86whirl0.0
88light0.0
84cloud0.0
89heavy0.0
90thunderstorm0.0
\n", + "
" + ], + "text/plain": [ + " column weight\n", + "0 Hour_Index_8 0.0\n", + "85 clear 0.0\n", + "49 day_of_week_Index_3 0.0\n", + "48 day_of_week_Index_4 0.0\n", + "47 day_of_week_Index_1 0.0\n", + "86 whirl 0.0\n", + "88 light 0.0\n", + "84 cloud 0.0\n", + "89 heavy 0.0\n", + "90 thunderstorm 0.0" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample of 10 features eliminated by the Logistic Regression Model after L1 Regularization for class 1\n", + "\n", + "feat_imp_tuned_lrb1[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LR Multiclass Grid Search Model " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Logistic Regression Pipeline initialization\n", + "\n", + "lr_new = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Grid Search for tuning the hyper parameters of Logistic Regression Model\n", + "\n", + "paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01, 0.04,0.07]).addGrid(lr_new.elasticNetParam, [0.2,0.5,0.8]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating pipeline to be used for fitting the training data\n", + "\n", + "cvModel_lrmu = Pipeline(stages=[label_stringIdx,va,center,lr_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Multiclass Evaluator for evaluating the model performance\n", + "\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "evaluator_mul = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + 
"metadata": {}, + "outputs": [], + "source": [ + "# Cross validator pipeline initialization for 5-fold cross validation and fitting the train data\n", + "\n", + "cv = CrossValidator(estimator=cvModel_lrmu, estimatorParamMaps=paramGrid_lr, evaluator=evaluator_mul, numFolds=5,seed=42).fit(us_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='LogisticRegression_63e671446ece', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,\n", + " Param(parent='LogisticRegression_63e671446ece', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.2,\n", + " Param(parent='LogisticRegression_63e671446ece', name='featuresCol', doc='features column name.'): 'centered_features',\n", + " Param(parent='LogisticRegression_63e671446ece', name='fitIntercept', doc='whether to fit an intercept term.'): True,\n", + " Param(parent='LogisticRegression_63e671446ece', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='LogisticRegression_63e671446ece', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='LogisticRegression_63e671446ece', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='LogisticRegression_63e671446ece', name='rawPredictionCol', doc='raw prediction (a.k.a. 
confidence) column name.'): 'rawPrediction',\n", + " Param(parent='LogisticRegression_63e671446ece', name='standardization', doc='whether to standardize the training features before fitting the model.'): True,\n", + " Param(parent='LogisticRegression_63e671446ece', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.5,\n", + " Param(parent='LogisticRegression_63e671446ece', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',\n", + " Param(parent='LogisticRegression_63e671446ece', name='maxIter', doc='max number of iterations (>= 0).'): 100,\n", + " Param(parent='LogisticRegression_63e671446ece', name='regParam', doc='regularization parameter (>= 0).'): 0.01,\n", + " Param(parent='LogisticRegression_63e671446ece', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-06}" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper Parameters \n", + "\n", + "cv.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy is 0.5427251967540676\n" + ] + } + ], + "source": [ + "# Accuracy of the model on the testing data\n", + "\n", + "print(\"Accuracy is\",evaluator_mul.evaluate(cv.bestModel.transform(us_test)))" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# Coefficient matrix from Logistic Regression for each variable weight\n", + "\n", + "coeft_L1_m=cv.bestModel.stages[-1].coefficientMatrix.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": 
{}, + "outputs": [], + "source": [ + "# Combining the 3 arrays of coefficient matrix to 1 array\n", + "\n", + "coeft_L1_m = np.squeeze(coeft_L1_m)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "#Extract 1st array of coefficients with features equal to number of columns\n", + "\n", + "coef_one = coeft_L1_m[:][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract 2nd array of coefficients with features equal to number of columns\n", + "\n", + "coef_two = coeft_L1_m[:][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract 3rd array of coefficients with features equal to number of columns\n", + "\n", + "coef_three = coeft_L1_m[:][2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Number of Features Eliminated by L1 Regularization for Grid Model" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features for 1st class are 119\n", + "Total number of features for 2nd class are 119\n", + "Total number of features for 3rd class are 119\n", + "Eliminated features for 1st class out of 119 are 60\n", + "Eliminated features for 2nd class out of 119 are 84\n", + "Eliminated features for 3rd class out of 119 are 64\n" + ] + } + ], + "source": [ + "# Taking the absolute value of the weights and calculating how many features were eliminated by the model for each class each array\n", + "\n", + "coef_one = np.absolute(coef_one)\n", + "coef_two = np.absolute(coef_two)\n", + "coef_three = np.absolute(coef_three)\n", + "\n", + "print('Total number of features for 1st class are',len(coef_one))\n", + "print('Total number of features for 2nd class are',len(coef_two))\n", + "print('Total number of features for 3rd class 
are',len(coef_three))\n", + "\n", + "sorted_abs_1 = np.sort(coef_one)\n", + "sorted_abs_2 = np.sort(coef_two)\n", + "sorted_abs_3 = np.sort(coef_three)\n", + "\n", + "weights_notzero_1 = sorted_abs_1[sorted_abs_1 == 0]\n", + "nonzero_weights_1 = len(sorted_abs_1[sorted_abs_1 == 0])\n", + "\n", + "weights_notzero_2 = sorted_abs_2[sorted_abs_2 == 0]\n", + "nonzero_weights_2 = len(sorted_abs_2[sorted_abs_2 == 0])\n", + "\n", + "weights_notzero_3 = sorted_abs_3[sorted_abs_3 == 0]\n", + "nonzero_weights_3 = len(sorted_abs_3[sorted_abs_3 == 0])\n", + "\n", + "print('Eliminated features for 1st class out of ' + str(len(coef_one)) +' are', len(weights_notzero_1))\n", + "print('Eliminated features for 2nd class out of ' + str(len(coef_two)) +' are', len(weights_notzero_2))\n", + "print('Eliminated features for 3rd class out of ' + str(len(coef_three)) +' are', len(weights_notzero_3))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_lrmt=(cv.bestModel.transform(us_test)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.43 0.58 131724\n", + " 1 0.51 0.76 0.61 58339\n", + " 2 0.13 0.95 0.22 6121\n", + "\n", + " accuracy 0.54 196184\n", + " macro avg 0.51 0.71 0.47 196184\n", + "weighted avg 0.76 0.54 0.58 
196184\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_lrmt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas dataframe of weights of variables with variable names to find which variables are eliminated for 3rd class for Grid\n", + "\n", + "feat_imp_tuned_lrt3 = pd.DataFrame(list(zip([i for i in us_train.columns if i!='Severity'], coef_three)),\n", + " columns = ['column', 'weight']).sort_values('weight')" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columnweight
59month_of_year_Index_20.0
79Wind_Direction_Index_160.0
78Wind_Direction_Index_90.0
77Wind_Direction_Index_50.0
76Wind_Direction_Index_150.0
75Wind_Direction_Index_60.0
74Wind_Direction_Index_130.0
73Wind_Direction_Index_100.0
71Wind_Direction_Index_20.0
68Wind_Direction_Index_110.0
\n", + "
" + ], + "text/plain": [ + " column weight\n", + "59 month_of_year_Index_2 0.0\n", + "79 Wind_Direction_Index_16 0.0\n", + "78 Wind_Direction_Index_9 0.0\n", + "77 Wind_Direction_Index_5 0.0\n", + "76 Wind_Direction_Index_15 0.0\n", + "75 Wind_Direction_Index_6 0.0\n", + "74 Wind_Direction_Index_13 0.0\n", + "73 Wind_Direction_Index_10 0.0\n", + "71 Wind_Direction_Index_2 0.0\n", + "68 Wind_Direction_Index_11 0.0" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample of 10 features eliminated by the Logistic Regression Model after L1 Regularization for class 3\n", + "\n", + "feat_imp_tuned_lrt3[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RFDT_Multiclass.ipynb b/RFDT_Multiclass.ipynb new file mode 100644 index 0000000..fd665cc --- /dev/null +++ b/RFDT_Multiclass.ipynb @@ -0,0 +1,2168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# imports the required library\n", + "from pyspark.sql import SparkSession\n", + "from 
pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, VectorAssembler\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. 
The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the training data\n", + "us_train_cat = spark.read.csv(get_training_filename('USAccident_train_categorical.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the testing data\n", + "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_categorical.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for test dataset\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==2,0).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for train dataset\n", + "\n", + 
"us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==2,0).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for test dataset\n", + "\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==3,1).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for train dataset\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==3,1).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 2 to severity 4 label for test dataset\n", + "\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==4,2).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 2 to severity 4 label for train dataset\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==4,2).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Vector Assembler to convert all features except Severity to a single column features for feeding it to input of model\n", + "\n", + "va = VectorAssembler().setInputCols([i for i in us_train_cat.columns if i!='Severity']).setOutputCol('features')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# String Indexer to assign target Variable Severity name Label needed for the model to predict\n", + "\n", + "label_stringIdx = 
StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Multiclass Evaluator to evaluate the performance of the model with 3 class prediction \n", + "\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiclass RF Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml.classification import RandomForestClassifier\n", + "\n", + "# Create an initial RandomForest model.\n", + "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for RF Base Model \n", + "\n", + "rfModel = Pipeline(stages=[label_stringIdx,va, rf])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Train RF base model with Training Data\n", + "\n", + "rf_fit = rfModel.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.6858234810872593\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator on Test data\n", + "\n", + "print(\"Accuracy:\",evaluator.evaluate(rf_fit.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_rfmu=(rf_fit.transform(us_test_cat)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + 
"execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.69 0.98 0.81 131571\n", + " 1 0.68 0.09 0.16 58293\n", + " 2 0.00 0.00 0.00 6115\n", + "\n", + " micro avg 0.69 0.69 0.69 195979\n", + " macro avg 0.45 0.36 0.32 195979\n", + "weighted avg 0.66 0.69 0.59 195979\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\Anaconda\\envs\\tensorflow\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n", + "D:\\Anaconda\\envs\\tensorflow\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n", + "D:\\Anaconda\\envs\\tensorflow\\lib\\site-packages\\sklearn\\metrics\\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_rfmu,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Base Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rfm = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], 
rf_fit.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Base Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rfm['column'][:10], y=feat_imp_tuned_rfm['weight'][:10],data=feat_imp_tuned_rfm)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiclass RF Grid Search Model" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an initial RandomForest model.\n", + "\n", + "rf_new = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for RF Grid Model \n", + "\n", + "rfModel_new = Pipeline(stages=[label_stringIdx,va, rf_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Grid Search for Hyper Parameter Tuning\n", + "\n", + "#paramGrid_rf = ParamGridBuilder().addGrid(rf_new.numTrees, [10, 25, 60]).addGrid(rf_new.maxDepth, [3, 5, 10]).addGrid(rf_new.impurity,[\"entropy\", \"gini\"]).build()\n", + "\n", + "paramGrid_rf = ParamGridBuilder().addGrid(rf_new.numTrees, [60]).addGrid(rf_new.maxDepth, [10]).addGrid(rf_new.impurity,[\"entropy\"]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Cross Validator with 5 fold and Grid Search to fit the training data\n", + "\n", + "cv_rf = 
CrossValidator(estimator=rfModel_new, estimatorParamMaps=paramGrid_rf, evaluator=evaluator, numFolds=5).fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using Cross Validation Pipeline Built earlier for prediction of Test data\n", + "\n", + "pred_rft = cv_rf.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7203679986120962\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator on Test data\n", + "\n", + "print(\"Accuracy:\",evaluator.evaluate(pred_rft))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_rf=cv_rf.transform(us_test_cat).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.73 0.92 0.82 131571\n", + " 1 0.65 0.33 0.44 58293\n", + " 2 0.62 0.07 0.13 6115\n", + "\n", + " micro avg 0.72 0.72 0.72 195979\n", + " macro avg 0.67 0.44 0.46 195979\n", + "weighted avg 0.71 0.72 0.68 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_rf,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='RandomForestClassifier_d70a92d04e27', name='cacheNodeIds', doc='If 
false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='featuresCol', doc='features column name'): 'features',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='labelCol', doc='label column name'): 'label',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='maxDepth', doc='Maximum depth of the tree. 
(>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='numTrees', doc='Number of trees to train (>= 1)'): 60,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='predictionCol', doc='prediction column name'): 'prediction',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='rawPredictionCol', doc='raw prediction (a.k.a. 
confidence) column name'): 'rawPrediction',\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='seed', doc='random seed'): 42,\n", + " Param(parent='RandomForestClassifier_d70a92d04e27', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper Parameters after tuning\n", + "\n", + "cv_rf.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Grid Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rft = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cv_rf.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5,1,'Top 10 Features based on Importance from Random Forest Grid Model')" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Grid Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rft['column'][:10], y=feat_imp_tuned_rft['weight'][:10],data=feat_imp_tuned_rft)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest Grid Model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DecisionTreeClassificationModel (uid=dtc_eb02160664ee) of depth 10 with 729 nodes\n", + " If (feature 35 <= 3.5)\n", + " If (feature 36 <= 34.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 39 <= 13.5)\n", + " If (feature 16 <= 0.0105)\n", + " If (feature 16 <= 0.009999999888240001)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 18 <= 77.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 77.5)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.009999999888240001)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 19 <= 29.095)\n", + " If (feature 40 <= 9.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 9.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.095)\n", + " If (feature 40 <= 6.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 6.5)\n", + " Predict: 1.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 35 <= 2.5)\n", + " Predict: 1.0\n", + " Else (feature 35 > 2.5)\n", + " If (feature 36 <= 12.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 22 <= 0.045)\n", + " If (feature 40 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 8.5)\n", + " If (feature 17 <= 59.1)\n", + " Predict: 0.0\n", + " Else (feature 17 > 59.1)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.045)\n", + " If (feature 14 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.0105)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 22 <= 0.005)\n", + " If (feature 20 <= 8.5)\n", + " If (feature 17 <= 43.35)\n", + " Predict: 1.0\n", + " Else (feature 17 > 
43.35)\n", + " Predict: 0.0\n", + " Else (feature 20 > 8.5)\n", + " If (feature 18 <= 99.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 99.5)\n", + " Predict: 0.0\n", + " Else (feature 22 > 0.005)\n", + " If (feature 4 <= 0.5)\n", + " If (feature 40 <= 4.5)\n", + " Predict: 2.0\n", + " Else (feature 40 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 4 > 0.5)\n", + " If (feature 40 <= 10.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 10.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 35 <= 2.5)\n", + " If (feature 0 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 21 <= 4.05)\n", + " Predict: 1.0\n", + " Else (feature 21 > 4.05)\n", + " Predict: 0.0\n", + " Else (feature 35 > 2.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 41 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 18 <= 44.5)\n", + " If (feature 42 <= 1.5)\n", + " If (feature 23 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 23 > 0.5)\n", + " If (feature 16 <= 0.0165)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0165)\n", + " Predict: 2.0\n", + " Else (feature 42 > 1.5)\n", + " If (feature 35 <= 1.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 44.5)\n", + " If (feature 39 <= 11.5)\n", + " If (feature 9 <= 0.5)\n", + " If (feature 23 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 23 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 9 > 0.5)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 2.0\n", + " Else (feature 21 > 3.25)\n", + " Predict: 0.0\n", + " Else (feature 39 > 11.5)\n", + " If (feature 21 <= 14.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + 
" Predict: 2.0\n", + " Else (feature 21 > 14.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 13.5)\n", + " If (feature 28 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 14 <= 0.5)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 39 <= 15.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 15.5)\n", + " If (feature 42 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 2.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 36 <= 2.5)\n", + " If (feature 18 <= 83.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 83.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 2.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " Predict: 2.0\n", + " Else (feature 14 > 0.5)\n", + " If (feature 36 <= 14.5)\n", + " If (feature 36 <= 6.5)\n", + " If (feature 39 <= 16.5)\n", + " Predict: 2.0\n", + " Else (feature 39 > 16.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 6.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " Predict: 2.0\n", + " Else (feature 36 > 14.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 18 <= 56.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 56.5)\n", + " Predict: 1.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 19 <= 29.795)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.795)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 26 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 26 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 19 <= 29.795)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.795)\n", + " Predict: 0.0\n", + 
" Else (feature 28 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 46 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 46 > 0.5)\n", + " If (feature 40 <= 7.5)\n", + " If (feature 47 <= 21.5)\n", + " If (feature 47 <= 20.5)\n", + " If (feature 36 <= 17.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 17.5)\n", + " Predict: 0.0\n", + " Else (feature 47 > 20.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 40 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 5.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 47 > 21.5)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 21 <= 5.9)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 21 > 5.9)\n", + " Predict: 0.0\n", + " Else (feature 40 > 7.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 36 <= 14.5)\n", + " If (feature 36 <= 12.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 14.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 1 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 36 <= 14.5)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 19 <= 30.205)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.205)\n", + " Predict: 1.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 36 > 14.5)\n", + " Predict: 0.0\n", + " Else (feature 36 
> 34.5)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 26 <= 0.5)\n", + " If (feature 21 <= 0.6)\n", + " If (feature 18 <= 33.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 33.5)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 16 <= 0.6205)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.6205)\n", + " If (feature 35 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 36 <= 43.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 43.5)\n", + " If (feature 18 <= 99.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 99.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 0.6)\n", + " If (feature 39 <= 2.5)\n", + " If (feature 19 <= 29.985)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 18 <= 61.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 61.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.985)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 20 <= 2.25)\n", + " Predict: 0.0\n", + " Else (feature 20 > 2.25)\n", + " Predict: 1.0\n", + " Else (feature 41 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 2.5)\n", + " If (feature 35 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 35 > 0.5)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 16 <= 0.0105)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0105)\n", + " Predict: 1.0\n", + " Else (feature 41 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 26 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 42 <= 2.5)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 17 <= 65.9)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " If (feature 41 <= 1.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 1.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 65.9)\n", + " If (feature 29 <= 0.5)\n", + " If 
(feature 16 <= 0.0105)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0105)\n", + " Predict: 1.0\n", + " Else (feature 29 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 2.5)\n", + " If (feature 18 <= 86.5)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 0.5)\n", + " If (feature 42 <= 8.5)\n", + " Predict: 1.0\n", + " Else (feature 42 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 86.5)\n", + " If (feature 18 <= 89.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 89.5)\n", + " Predict: 1.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 16 <= 0.0015)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 14 <= 0.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 19 <= 30.125)\n", + " If (feature 21 <= 0.6)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 0.6)\n", + " If (feature 18 <= 67.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 67.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.125)\n", + " Predict: 1.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 19 <= 29.875)\n", + " If (feature 37 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 1.5)\n", + " If (feature 36 <= 41.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 41.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.875)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 20 <= 11.0)\n", + " Predict: 0.0\n", + " Else (feature 20 > 11.0)\n", + " Predict: 1.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 21 <= 0.6)\n", + " Predict: 1.0\n", + " Else (feature 21 > 0.6)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " If (feature 40 <= 1.5)\n", + " If (feature 39 <= 11.5)\n", + " If (feature 19 <= 30.125)\n", + " If (feature 21 <= 7.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 7.5)\n", + " Predict: 0.0\n", + " Else (feature 
19 > 30.125)\n", + " Predict: 0.0\n", + " Else (feature 39 > 11.5)\n", + " Predict: 2.0\n", + " Else (feature 40 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 42 <= 1.5)\n", + " If (feature 20 <= 7.5)\n", + " If (feature 36 <= 38.5)\n", + " Predict: 2.0\n", + " Else (feature 36 > 38.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 7.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0015)\n", + " If (feature 14 <= 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 22 <= 0.045)\n", + " If (feature 35 <= 2.5)\n", + " If (feature 36 <= 47.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " Predict: 2.0\n", + " Else (feature 36 > 47.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 2.5)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 22 > 0.045)\n", + " Predict: 1.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 37 <= 1.5)\n", + " If (feature 19 <= 29.095)\n", + " If (feature 35 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 2.5)\n", + " If (feature 41 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 5.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.095)\n", + " If (feature 41 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 5.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 1.5)\n", + " If (feature 0 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 0 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " If (feature 36 <= 36.5)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 17 <= 62.25)\n", + " If (feature 47 <= 22.5)\n", + " Predict: 1.0\n", + " Else (feature 47 > 22.5)\n", + " Predict: 2.0\n", + " Else 
(feature 17 > 62.25)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 16 <= 0.1915)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.1915)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 18 <= 83.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 83.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 36.5)\n", + " If (feature 16 <= 0.018500000000000003)\n", + " If (feature 21 <= 7.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 7.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 19 <= 29.744999999999997)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.744999999999997)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 0.018500000000000003)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 16 <= 1.825000047685)\n", + " If (feature 31 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 1.825000047685)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 19 <= 30.015)\n", + " If (feature 19 <= 29.744999999999997)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.744999999999997)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.015)\n", + " Predict: 2.0\n", + " Else (feature 35 > 3.5)\n", + " If (feature 23 <= 0.5)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 16 <= 0.3325)\n", + " If (feature 46 <= 0.5)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 42 <= 3.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 3.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 
0.0\n", + " Else (feature 46 > 0.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 35 <= 11.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 35 > 11.5)\n", + " If (feature 41 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " If (feature 41 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 5.5)\n", + " If (feature 19 <= 30.305)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.305)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.3325)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 16 <= 1.825000047685)\n", + " If (feature 45 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 0.5)\n", + " If (feature 42 <= 5.5)\n", + " If (feature 26 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 26 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 5.5)\n", + " If (feature 19 <= 30.645)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.645)\n", + " Predict: 0.0\n", + " Else (feature 16 > 1.825000047685)\n", + " If (feature 19 <= 29.744999999999997)\n", + " If (feature 36 <= 12.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 12.5)\n", + " If (feature 35 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 8.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.744999999999997)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 20 <= 4.5)\n", + " If (feature 13 <= 0.5)\n", + " If (feature 41 <= 3.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 3.5)\n", + " Predict: 2.0\n", + " Else (feature 13 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 4.5)\n", + " If (feature 18 <= 83.5)\n", + " If (feature 16 <= 1.825000047685)\n", + " Predict: 0.0\n", + " Else (feature 16 > 
1.825000047685)\n", + " Predict: 1.0\n", + " Else (feature 18 > 83.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 47 <= 14.5)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 47 > 14.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 29 <= 0.5)\n", + " If (feature 39 <= 1.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 13 <= 0.5)\n", + " If (feature 19 <= 30.645)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.645)\n", + " Predict: 1.0\n", + " Else (feature 13 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 1.5)\n", + " If (feature 16 <= 0.46349999999999997)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.46349999999999997)\n", + " If (feature 42 <= 5.5)\n", + " If (feature 17 <= 57.1)\n", + " If (feature 20 <= 4.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 57.1)\n", + " Predict: 0.0\n", + " Else (feature 42 > 5.5)\n", + " Predict: 1.0\n", + " Else (feature 29 > 0.5)\n", + " If (feature 19 <= 30.105)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 35 <= 17.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 17.5)\n", + " Predict: 1.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 19 <= 29.795)\n", + " If (feature 18 <= 44.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 44.5)\n", + " If (feature 36 <= 25.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 25.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.795)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", 
+ " Else (feature 34 > 0.5)\n", + " If (feature 18 <= 83.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 83.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.105)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 18 <= 66.5)\n", + " If (feature 17 <= 46.2)\n", + " Predict: 0.0\n", + " Else (feature 17 > 46.2)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 66.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 17 <= 48.8)\n", + " Predict: 0.0\n", + " Else (feature 17 > 48.8)\n", + " Predict: 1.0\n", + " Else (feature 44 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 20 <= 17.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 17 <= 57.1)\n", + " Predict: 0.0\n", + " Else (feature 17 > 57.1)\n", + " If (feature 36 <= 32.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 32.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 16 <= 0.0199999997765)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0199999997765)\n", + " If (feature 39 <= 12.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 17.5)\n", + " If (feature 21 <= 6.95)\n", + " Predict: 1.0\n", + " Else (feature 21 > 6.95)\n", + " If (feature 18 <= 20.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 20.5)\n", + " If (feature 41 <= 3.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 3.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " If (feature 19 <= 29.325)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 35 <= 5.5)\n", + " Predict: 1.0\n", + " Else (feature 35 > 5.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.325)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " If 
(feature 32 <= 0.5)\n", + " If (feature 22 <= 0.015)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 16 <= 0.1915)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.1915)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.015)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 47 <= 13.5)\n", + " If (feature 20 <= 12.5)\n", + " If (feature 14 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " If (feature 37 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 1.5)\n", + " If (feature 39 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " If (feature 16 <= 1.825000047685)\n", + " If (feature 41 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 1.825000047685)\n", + " If (feature 32 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 20 > 12.5)\n", + " If (feature 19 <= 29.325)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.325)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " If (feature 36 <= 10.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 10.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 47 > 13.5)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 46 <= 0.5)\n", + " If (feature 16 <= 0.053500000000000006)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.053500000000000006)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 46 > 0.5)\n", + " If (feature 36 <= 40.5)\n", + " If (feature 18 <= 33.5)\n", + " Predict: 
2.0\n", + " Else (feature 18 > 33.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 40.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 47 <= 15.5)\n", + " If (feature 42 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 47 > 15.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 40 <= 3.5)\n", + " If (feature 16 <= 0.014499999999999999)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.014499999999999999)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 36 <= 51.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 51.5)\n", + " Predict: 2.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 3.5)\n", + " If (feature 40 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 4.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 23 > 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 16 <= 0.0199999997765)\n", + " If (feature 17 <= 76.4)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 39 <= 2.5)\n", + " If (feature 41 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 2.5)\n", + " If (feature 42 <= 6.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 6.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 17 <= 55.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 55.5)\n", + " If (feature 35 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 8.5)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 36 <= 2.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 2.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 76.4)\n", + " If (feature 42 <= 
0.5)\n", + " If (feature 19 <= 29.875)\n", + " If (feature 17 <= 80.8)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 80.8)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.875)\n", + " If (feature 41 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 5.5)\n", + " If (feature 40 <= 4.5)\n", + " If (feature 40 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 39 <= 8.5)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 40 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 8.5)\n", + " If (feature 40 <= 9.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 9.5)\n", + " Predict: 0.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0199999997765)\n", + " If (feature 18 <= 63.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 47 <= 11.5)\n", + " If (feature 19 <= 28.735)\n", + " If (feature 39 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 28.735)\n", + " If (feature 19 <= 29.795)\n", + " If (feature 39 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 5.5)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.795)\n", + " Predict: 0.0\n", + " Else (feature 47 > 11.5)\n", + " If (feature 36 <= 31.5)\n", + " If (feature 18 <= 54.5)\n", + " If (feature 16 <= 0.46349999999999997)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.46349999999999997)\n", + " Predict: 0.0\n", + " Else (feature 18 > 54.5)\n", + " Predict: 2.0\n", + " Else (feature 36 > 31.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 47 <= 16.5)\n", + " If (feature 18 <= 44.5)\n", + " If 
(feature 21 <= 5.4)\n", + " Predict: 0.0\n", + " Else (feature 21 > 5.4)\n", + " Predict: 2.0\n", + " Else (feature 18 > 44.5)\n", + " If (feature 41 <= 1.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 1.5)\n", + " Predict: 0.0\n", + " Else (feature 47 > 16.5)\n", + " Predict: 1.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 63.5)\n", + " If (feature 22 <= 0.11499999999999999)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 35 <= 5.5)\n", + " If (feature 16 <= 0.9285000035765)\n", + " If (feature 32 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.9285000035765)\n", + " If (feature 20 <= 9.5)\n", + " Predict: 0.0\n", + " Else (feature 20 > 9.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 5.5)\n", + " If (feature 41 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 1 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 22 > 0.11499999999999999)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 47 <= 7.5)\n", + " If (feature 41 <= 4.5)\n", + " If (feature 35 <= 7.5)\n", + " If (feature 16 <= 0.6205)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.6205)\n", + " If (feature 21 <= 0.6)\n", + " Predict: 0.0\n", + " Else (feature 21 > 0.6)\n", + " Predict: 1.0\n", + " Else (feature 35 > 7.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 4.5)\n", + " If (feature 40 <= 8.5)\n", + " If (feature 35 <= 16.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 16.5)\n", + " If (feature 18 <= 49.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 49.5)\n", + " If (feature 4 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 4 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 8.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 
1.0\n", + " Else (feature 47 > 7.5)\n", + " If (feature 17 <= 51.25)\n", + " Predict: 0.0\n", + " Else (feature 17 > 51.25)\n", + " If (feature 36 <= 19.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 19.5)\n", + " If (feature 19 <= 29.915)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.915)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 14 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 5 <= 0.5)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 19 <= 29.975)\n", + " If (feature 18 <= 63.5)\n", + " If (feature 21 <= 6.95)\n", + " If (feature 39 <= 4.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 4.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 6.95)\n", + " Predict: 0.0\n", + " Else (feature 18 > 63.5)\n", + " If (feature 35 <= 7.5)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 35 > 7.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.975)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 8 <= 0.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 19 <= 29.665)\n", + " If (feature 40 <= 7.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 7.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.665)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 47 <= 7.5)\n", + " If (feature 36 <= 2.5)\n", + " Predict: 1.0\n", + " Else (feature 36 > 2.5)\n", + " Predict: 0.0\n", + " Else (feature 47 > 7.5)\n", + " If (feature 16 <= 0.0205)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0205)\n", + " Predict: 2.0\n", + " Else (feature 8 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 5 > 0.5)\n", + " If (feature 17 <= 65.9)\n", + " Predict: 0.0\n", + " Else (feature 17 > 65.9)\n", + " Predict: 1.0\n", + "\n" + ] + } + ], + "source": [ + "# Tree from the best Model printing it \n", + "\n", 
+ "print(cv_rf.bestModel.stages[-1].trees[3].toDebugString)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Base Model Multiclass Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "\n", + "# Create initial Decision Tree Model\n", + "\n", + "dt = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\")\n", + "\n", + "# Creating pipeline for DT Base Model \n", + "\n", + "dt_pipe = Pipeline(stages=[label_stringIdx, va, dt])\n", + "\n", + "# Train DT Base model with Training Data\n", + "\n", + "dtModel = dt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# Multiclass Evaluator to evaluate the performance of the model \n", + "\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "evaluator_dt = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using Fitted Pipeline Built earlier for prediction of Test data\n", + "\n", + "pred_dt = dtModel.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7120507809510203\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator on Test data\n", + "\n", + "print(\"Accuracy:\",evaluator_dt.evaluate(pred_dt))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_dt=pred_dt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + 
"outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.89 0.81 131571\n", + " 1 0.60 0.36 0.45 58293\n", + " 2 0.50 0.22 0.31 6115\n", + "\n", + " micro avg 0.71 0.71 0.71 195979\n", + " macro avg 0.61 0.49 0.52 195979\n", + "weighted avg 0.69 0.71 0.69 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_dt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of DT Base Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Base Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dt['column'][:10], y=feat_imp_tuned_dt['weight'][:10],data=feat_imp_tuned_dt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Multiclass Base Model\");" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "# Decision Tree Grid Search Multiclass Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Initializing DT Grid Pipeline \n", + "\n", + "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\")\n", + "\n", + "# Creating pipeline for DT Grid Model \n", + "\n", + "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n", + "\n", + "# Creating Grid Search for Hyper Parameter Tuning for DT Model\n", + "\n", + "#grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n", + "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth,[30]).addGrid(dt_new.minInstancesPerNode, [500]).addGrid(dt_new.maxBins,[20]).build()\n", + "# Cross Validator Pipeline with 5 fold \n", + "\n", + "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator_dt)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "# Fitting the training data using the Cross Validator Pipeline \n", + "\n", + "dtModel_t = cv1_dt.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using Cross Validation Pipeline Built earlier for prediction of Test data\n", + "\n", + "pred_dtt = dtModel_t.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7259502293613092\n" + ] + } + ], + "source": [ + "# Evaluation of Testing Data using Multiclass Evaluator \n", + "\n", + "print(\"Accuracy:\",evaluator_dt.evaluate(pred_dtt))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { 
+ "text/plain": [ + "{Param(parent='DecisionTreeClassifier_90ff862f7958', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='featuresCol', doc='features column name'): 'features',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='labelCol', doc='label column name'): 'label',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 20,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 30,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. 
If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 500,\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='predictionCol', doc='prediction column name'): 'prediction',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction',\n", + " Param(parent='DecisionTreeClassifier_90ff862f7958', name='seed', doc='random seed'): 3427287019239861576}" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel_t.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'DecisionTreeClassificationModel' object has no attribute 'getMaxBins'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Decision Tree Hyper Parameter Value Max Bins from Best Model\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m 
\u001b[0mdtModel_t\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbestModel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstages\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetMaxBins\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'DecisionTreeClassificationModel' object has no attribute 'getMaxBins'" + ] + } + ], + "source": [ + "# Decision Tree Hyper Parameter Value Max Bins from Best Model\n", + "\n", + "dtModel_t.bestModel.stages[-1].getMaxBins()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "30" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Decision Tree Hyper Parameter Value Max Depth from Best Model\n", + "\n", + "dtModel_t.bestModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "500" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Decision Tree Hyper Parameter Value Minimum Instances Per Node from Best Model\n", + "\n", + "dtModel_t.bestModel.stages[-1].getMinInstancesPerNode()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_dtt=pred_dtt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.77 0.86 0.81 131571\n", + " 1 0.61 0.47 0.53 58293\n", + " 2 0.51 0.22 0.31 6115\n", + "\n", + " micro avg 0.73 0.73 0.73 195979\n", + " macro avg 0.63 0.52 0.55 195979\n", + "weighted avg 0.71 0.73 0.71 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_dtt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of DT Grid Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Grid Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtt['column'][:10], y=feat_imp_tuned_dtt['weight'][:10],data=feat_imp_tuned_dtt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Multiclass tuned\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RFDT_Multiclass_Bal.ipynb b/RFDT_Multiclass_Bal.ipynb new file mode 100644 index 0000000..01918b8 --- /dev/null +++ b/RFDT_Multiclass_Bal.ipynb @@ -0,0 +1,2770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, VectorAssembler\n", + "spark = SparkSession.builder.getOrCreate()\n", 
+ "sc = spark.sparkContext\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the training data\n", + "\n", + "us_train_cat = spark.read.csv(get_training_filename('USAccident_balanced_train_cat.csv'), header = True, inferSchema 
= True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the testing data\n", + "\n", + "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_cate.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+\n", + "|count(Severity)|\n", + "+---------------+\n", + "| 3|\n", + "+---------------+\n", + "\n" + ] + } + ], + "source": [ + "# Counting the Distinct Severity of Accidents in training data\n", + "\n", + "us_train_cat.agg(countDistinct(\"Severity\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+\n", + "|count(Severity)|\n", + "+---------------+\n", + "| 3|\n", + "+---------------+\n", + "\n" + ] + } + ], + "source": [ + "# Counting the Distinct Severity of Accidents in testing data\n", + "\n", + "us_test_cat.agg(countDistinct(\"Severity\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| count|\n", + "+--------+------+\n", + "| 3|234445|\n", + "| 4|219519|\n", + "| 2|263497|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of data in training dataset\n", + "\n", + "us_train_cat.groupBy('Severity').count().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| count|\n", + "+--------+------+\n", + "| 3| 58339|\n", + "| 4| 6121|\n", + "| 2|131724|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of 
data in testing dataset\n", + "\n", + "us_test_cat.groupBy('Severity').count().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for test dataset\n", + "\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==2,0).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 0 to severity 2 label for train dataset\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==2,0).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for test dataset\n", + "\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==3,1).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 1 to severity 3 label for train dataset\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==3,1).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 2 to severity 4 label for test dataset\n", + "\n", + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(us_test_cat[\"Severity\"]==4,2).otherwise(us_test_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Assigning label 2 to severity 4 label for train dataset\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(us_train_cat[\"Severity\"]==4,2).otherwise(us_train_cat[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Vector Assembler to convert all features except Severity to a single column features for feeding it to input of model\n", + "\n", + "va = VectorAssembler().setInputCols([i for i in us_train_cat.columns if i!='Severity']).setOutputCol('features')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# String Indexer to assign target Variable Severity name Label needed for the model to predict\n", + "\n", + "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Multiclass Evaluator to evaluate the performance of the model with 3 class prediction \n", + "\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiclass RF Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Create an initial RandomForest model\n", + "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for RF Base Model \n", + "\n", + "rfModel = Pipeline(stages=[label_stringIdx,va, rf])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Train RF base model with Training Data\n", + "\n", + "rf_fit = rfModel.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy is 0.5453502834074134\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator 
on Test data for accuracy as metric\n", + "\n", + "print(\"Accuracy is\", evaluator.evaluate(rf_fit.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_rfmu=(rf_fit.transform(us_test_cat)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.42 0.57 131724\n", + " 1 0.49 0.80 0.61 58339\n", + " 2 0.13 0.94 0.24 6121\n", + "\n", + " accuracy 0.55 196184\n", + " macro avg 0.52 0.72 0.47 196184\n", + "weighted avg 0.77 0.55 0.57 196184\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_rfmu,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Base Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rfm = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], rf_fit.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 
29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest')" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Base Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rfm['column'][:10], y=feat_imp_tuned_rfm['weight'][:10],data=feat_imp_tuned_rfm)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiclass RF Grid Search Model" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an initial RandomForest model.\n", + "\n", + "rf_new = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for RF Grid Model \n", + "\n", + "rfModel_new = Pipeline(stages=[label_stringIdx,va, rf_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Grid Search for Hyper Parameter Tuning\n", + "\n", + "paramGrid_rf = ParamGridBuilder().addGrid(rf_new.numTrees, [10, 25, 60]).addGrid(rf_new.maxDepth, [3, 6, 10]).addGrid(rf_new.impurity,[\"entropy\", \"gini\"]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# Cross Validator with 5 fold and Grid Search to fit the training data\n", + "\n", + "cv_rf = CrossValidator(estimator=rfModel_new, estimatorParamMaps=paramGrid_rf, evaluator=evaluator, numFolds=5,seed=42).fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using Cross Validation Pipeline Built earlier for prediction of Test data\n", + "\n", + "pred_rft = cv_rf.transform(us_test_cat)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy is 0.5714176487379196\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator on Test data for Accuracy calculation\n", + "\n", + "print(\"Accuracy is \",evaluator.evaluate(pred_rft))" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "# Reuse pred_rft (the test set already transformed by cv_rf) instead of transforming again,\n", + "# and collect the true label and the prediction in ONE pass so the two columns stay row-aligned\n", + "# and only the two needed columns are brought to the driver.\n", + "\n", + "pred_rf_pdf = pred_rft.select(\"Severity\", \"prediction\").toPandas()\n", + "prediction_rf = pred_rf_pdf[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "# Taken from the same collected frame as the predictions so that y_true and y_pred line up row by row.\n", + "\n", + "true_labels = pred_rf_pdf[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.46 0.61 131724\n", + " 1 0.50 0.79 0.61 58339\n", + " 2 0.15 0.94 0.25 6121\n", + "\n", + " accuracy 0.57 196184\n", + " macro avg 0.52 0.73 0.49 196184\n", + "weighted avg 0.77 0.57 0.60 196184\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_rf,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='RandomForestClassifier_19a2a830fa51', name='featuresCol', doc='features column name.'): 'features',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', 
name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='seed', doc='random seed.'): 42,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='featureSubsetStrategy', doc=\"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. 
If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'\"): 'auto',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. 
Should be >= 1.'): 1,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='numTrees', doc='Number of trees to train (>= 1).'): 60,\n", + " Param(parent='RandomForestClassifier_19a2a830fa51', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper Parameters after tuning\n", + "\n", + "cv_rf.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SparseVector(46, {0: 0.0004, 1: 0.0008, 3: 0.0001, 4: 0.0002, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0003, 9: 0.0002, 10: 0.0, 11: 0.0052, 13: 0.0001, 14: 0.0004, 15: 0.0, 16: 0.2309, 17: 0.0028, 18: 0.0021, 19: 0.0052, 20: 0.0007, 21: 0.0035, 22: 0.0006, 23: 0.0017, 24: 0.0, 25: 0.0183, 26: 0.0001, 27: 0.0069, 28: 0.0, 29: 0.0005, 31: 0.001, 32: 0.004, 33: 0.0, 34: 0.0539, 35: 0.3209, 36: 0.0916, 37: 0.0008, 38: 0.0049, 39: 0.0127, 40: 0.1946, 41: 0.0031, 42: 0.0032, 43: 0.0048, 44: 0.0071, 45: 0.0162})" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Feature importances of the best model, returned as a SparseVector (one weight per feature index)\n", + "\n", + "cv_rf.bestModel.stages[-1].featureImportances" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Grid Model for Multiclass Classification\n", + 
"\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rft = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cv_rf.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest Grid Model')" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Grid Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rft['column'][:10], y=feat_imp_tuned_rft['weight'][:10],data=feat_imp_tuned_rft)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest Grid Model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DecisionTreeClassificationModel: uid=dtc_c75d2dd73064, depth=10, numNodes=1039, numClasses=3, numFeatures=46\n", + " If (feature 35 <= 0.5)\n", + " If (feature 36 <= 0.5)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 29 <= 0.5)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 45 <= 18.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 18.5)\n", + " Predict: 2.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 18 <= 41.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 41.5)\n", + " If (feature 16 <= 1.2955)\n", + " Predict: 1.0\n", + " Else (feature 16 > 1.2955)\n", + " Predict: 2.0\n", + " Else (feature 21 > 3.25)\n", + " Predict: 1.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 16 <= 0.966)\n", + " If (feature 9 <= 0.5)\n", + " If (feature 37 <= 11.5)\n", + " If (feature 32 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 11.5)\n", + " 
Predict: 1.0\n", + " Else (feature 9 > 0.5)\n", + " If (feature 37 <= 10.5)\n", + " If (feature 18 <= 50.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 50.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 10.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.966)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 40 <= 8.5)\n", + " If (feature 10 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 10 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 8.5)\n", + " If (feature 37 <= 12.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 22 <= 0.045)\n", + " If (feature 19 <= 29.744999999999997)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.744999999999997)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.045)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 16 <= 0.966)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.966)\n", + " Predict: 2.0\n", + " Else (feature 21 > 3.25)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 26 <= 0.5)\n", + " If (feature 9 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 9 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 26 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " If (feature 38 <= 2.5)\n", + " If (feature 21 <= 0.6)\n", + " Predict: 2.0\n", + " Else (feature 21 > 0.6)\n", + " If (feature 13 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 13 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 2.5)\n", + " If (feature 37 <= 4.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 4.5)\n", + 
" Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 23 <= 0.5)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 21 <= 6.45)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 6.45)\n", + " If (feature 22 <= 0.10500000000000001)\n", + " Predict: 0.0\n", + " Else (feature 22 > 0.10500000000000001)\n", + " Predict: 1.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 16 <= 1.2955)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 1.2955)\n", + " If (feature 21 <= 5.4)\n", + " Predict: 0.0\n", + " Else (feature 21 > 5.4)\n", + " Predict: 1.0\n", + " Else (feature 23 > 0.5)\n", + " If (feature 18 <= 99.5)\n", + " If (feature 21 <= 5.4)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 5.4)\n", + " Predict: 0.0\n", + " Else (feature 18 > 99.5)\n", + " Predict: 1.0\n", + " Else (feature 29 > 0.5)\n", + " If (feature 17 <= 67.9)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 16 <= 1.8465)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 20 <= 11.0)\n", + " Predict: 0.0\n", + " Else (feature 20 > 11.0)\n", + " Predict: 1.0\n", + " Else (feature 16 > 1.8465)\n", + " Predict: 2.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 21 <= 10.2)\n", + " If (feature 37 <= 4.5)\n", + " If (feature 17 <= 51.2)\n", + " If (feature 45 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 17 > 51.2)\n", + " Predict: 1.0\n", + " Else (feature 37 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 10.2)\n", + " If (feature 44 <= 0.5)\n", + " If 
(feature 37 <= 12.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 44 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 17 > 67.9)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 37 <= 9.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 16 <= 0.2135)\n", + " If (feature 17 <= 69.9)\n", + " Predict: 0.0\n", + " Else (feature 17 > 69.9)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.2135)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 9.5)\n", + " If (feature 14 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 14 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 18 <= 21.5)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 21.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 19 <= 29.785)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 21 <= 5.4)\n", + " If (feature 0 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 21 > 5.4)\n", + " If (feature 29 <= 0.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 16 <= 0.4845)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.4845)\n", + " If (feature 37 <= 6.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 6.5)\n", + " Predict: 
1.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 29 > 0.5)\n", + " If (feature 39 <= 2.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 21 <= 19.0)\n", + " Predict: 0.0\n", + " Else (feature 21 > 19.0)\n", + " Predict: 1.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 2.5)\n", + " If (feature 18 <= 33.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 33.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 0.5)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 18 <= 66.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 66.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 21 <= 5.4)\n", + " Predict: 0.0\n", + " Else (feature 21 > 5.4)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.785)\n", + " If (feature 29 <= 0.5)\n", + " If (feature 43 <= 0.5)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 45 <= 5.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 16 <= 0.2895)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.2895)\n", + " Predict: 1.0\n", + " Else (feature 45 > 5.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 43 > 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 21 <= 9.1)\n", + " Predict: 0.0\n", + " Else (feature 21 > 9.1)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 
16 > 5.0E-4)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 40 <= 4.5)\n", + " If (feature 20 <= 0.225)\n", + " Predict: 1.0\n", + " Else (feature 20 > 0.225)\n", + " Predict: 0.0\n", + " Else (feature 40 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 19 <= 30.255000000000003)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.255000000000003)\n", + " Predict: 0.0\n", + " Else (feature 29 > 0.5)\n", + " If (feature 45 <= 1.5)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 21 <= 11.75)\n", + " If (feature 45 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 0.5)\n", + " If (feature 17 <= 57.1)\n", + " Predict: 1.0\n", + " Else (feature 17 > 57.1)\n", + " Predict: 0.0\n", + " Else (feature 21 > 11.75)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " If (feature 37 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 5.5)\n", + " If (feature 18 <= 85.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 85.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 1.5)\n", + " If (feature 38 <= 7.5)\n", + " If (feature 19 <= 30.165)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.165)\n", + " If (feature 21 <= 7.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 7.5)\n", + " If (feature 17 <= 71.15)\n", + " Predict: 0.0\n", + " Else (feature 17 > 71.15)\n", + " Predict: 1.0\n", + " Else (feature 38 > 7.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 16 <= 0.0105)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.0105)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 45 <= 12.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 12.5)\n", + " If (feature 0 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 0 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 36 > 0.5)\n", + " If (feature 16 <= 0.053500000000000006)\n", + " If 
(feature 25 <= 0.5)\n", + " If (feature 19 <= 30.165)\n", + " If (feature 45 <= 1.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 40 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 40 <= 3.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 3.5)\n", + " If (feature 22 <= 0.035)\n", + " Predict: 0.0\n", + " Else (feature 22 > 0.035)\n", + " Predict: 1.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 4 <= 0.5)\n", + " If (feature 37 <= 7.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 7.5)\n", + " Predict: 0.0\n", + " Else (feature 4 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 1.5)\n", + " If (feature 5 <= 0.5)\n", + " If (feature 43 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 43 > 0.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 21 <= 6.45)\n", + " Predict: 0.0\n", + " Else (feature 21 > 6.45)\n", + " Predict: 1.0\n", + " Else (feature 5 > 0.5)\n", + " If (feature 17 <= 77.1)\n", + " If (feature 19 <= 29.545)\n", + " If (feature 21 <= 13.4)\n", + " Predict: 1.0\n", + " Else (feature 21 > 13.4)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.545)\n", + " Predict: 0.0\n", + " Else (feature 17 > 77.1)\n", + " If (feature 40 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 21 <= 13.9)\n", + " Predict: 0.0\n", + " Else (feature 21 > 13.9)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.165)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 20 <= 11.0)\n", + " If (feature 3 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 3 > 0.5)\n", + " If (feature 37 <= 3.5)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 3.5)\n", + " Predict: 0.0\n", + " Else (feature 20 > 11.0)\n", + " If (feature 19 <= 30.185000000000002)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.185000000000002)\n", + " If (feature 38 <= 5.5)\n", + " If (feature 
42 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 5.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 18 <= 55.5)\n", + " If (feature 18 <= 38.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 38.5)\n", + " If (feature 17 <= 33.2)\n", + " If (feature 16 <= 5.0E-4)\n", + " Predict: 1.0\n", + " Else (feature 16 > 5.0E-4)\n", + " Predict: 0.0\n", + " Else (feature 17 > 33.2)\n", + " Predict: 0.0\n", + " Else (feature 18 > 55.5)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 40 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 37 <= 2.5)\n", + " If (feature 21 <= 7.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 7.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 2.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 18 <= 38.5)\n", + " If (feature 19 <= 29.905)\n", + " If (feature 17 <= 53.5)\n", + " If (feature 19 <= 29.825)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.825)\n", + " If (feature 1 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 53.5)\n", + " If (feature 17 <= 71.15)\n", + " If (feature 21 <= 20.85)\n", + " Predict: 0.0\n", + " Else (feature 21 > 20.85)\n", + " Predict: 1.0\n", + " Else (feature 17 > 71.15)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.905)\n", + " If (feature 45 <= 6.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 6.5)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 39 <= 1.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 1.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 38.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 45 <= 13.5)\n", + " If (feature 26 <= 0.5)\n", + " If (feature 38 <= 8.5)\n", + " If (feature 18 <= 38.5)\n", + " If (feature 
37 <= 7.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 7.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 38.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 26 > 0.5)\n", + " If (feature 17 <= 77.1)\n", + " Predict: 0.0\n", + " Else (feature 17 > 77.1)\n", + " Predict: 1.0\n", + " Else (feature 45 > 13.5)\n", + " If (feature 18 <= 50.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 50.5)\n", + " If (feature 45 <= 14.5)\n", + " If (feature 18 <= 93.5)\n", + " If (feature 17 <= 43.1)\n", + " Predict: 0.0\n", + " Else (feature 17 > 43.1)\n", + " Predict: 1.0\n", + " Else (feature 18 > 93.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 14.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.053500000000000006)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 45 <= 12.5)\n", + " If (feature 8 <= 0.5)\n", + " If (feature 16 <= 0.2135)\n", + " If (feature 19 <= 30.095)\n", + " If (feature 19 <= 29.785)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.785)\n", + " If (feature 39 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 2.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.095)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.2135)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 37 <= 5.5)\n", + " If (feature 40 <= 16.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 16.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 5.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 37 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 8.5)\n", + " Predict: 1.0\n", + " Else (feature 8 > 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 
38 <= 6.5)\n", + " If (feature 45 <= 9.5)\n", + " If (feature 45 <= 1.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 9.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 6.5)\n", + " If (feature 40 <= 8.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 12.5)\n", + " If (feature 45 <= 17.5)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 17 <= 67.9)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 23 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 23 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 18 <= 44.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 44.5)\n", + " Predict: 1.0\n", + " Else (feature 17 > 67.9)\n", + " If (feature 16 <= 0.966)\n", + " If (feature 17 <= 77.1)\n", + " Predict: 0.0\n", + " Else (feature 17 > 77.1)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.966)\n", + " If (feature 39 <= 3.5)\n", + " Predict: 2.0\n", + " Else (feature 39 > 3.5)\n", + " Predict: 0.0\n", + " Else (feature 42 > 0.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 18 <= 58.5)\n", + " If (feature 18 <= 44.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 44.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 58.5)\n", + " If (feature 17 <= 65.85)\n", + " Predict: 0.0\n", + " Else (feature 17 > 65.85)\n", + " Predict: 1.0\n", + " Else (feature 45 > 17.5)\n", + " If (feature 19 <= 29.115000000000002)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 18 <= 88.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 88.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 38 <= 6.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 6.5)\n", + " If (feature 38 <= 8.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 8.5)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.115000000000002)\n", + " If 
(feature 16 <= 1.8465)\n", + " If (feature 38 <= 7.5)\n", + " If (feature 39 <= 2.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 2.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 7.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 1.8465)\n", + " If (feature 16 <= 5.1419999999999995)\n", + " If (feature 38 <= 4.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 4.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 5.1419999999999995)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 21 <= 19.0)\n", + " If (feature 37 <= 3.5)\n", + " If (feature 45 <= 20.5)\n", + " If (feature 39 <= 5.5)\n", + " If (feature 19 <= 30.255000000000003)\n", + " If (feature 38 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 2.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.255000000000003)\n", + " If (feature 45 <= 9.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 9.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 5.5)\n", + " If (feature 18 <= 53.5)\n", + " If (feature 21 <= 15.5)\n", + " Predict: 1.0\n", + " Else (feature 21 > 15.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 53.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 20.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 3.5)\n", + " If (feature 28 <= 0.5)\n", + " If (feature 45 <= 3.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 3.5)\n", + " If (feature 18 <= 62.5)\n", + " If (feature 19 <= 28.695)\n", + " Predict: 0.0\n", + " Else (feature 19 > 28.695)\n", + " Predict: 1.0\n", + " Else (feature 18 > 62.5)\n", + " Predict: 1.0\n", + " Else (feature 28 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 19.0)\n", + " If (feature 37 <= 11.5)\n", + " If (feature 37 <= 8.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 8.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 11.5)\n", + " If (feature 3 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 3 > 0.5)\n", + " If (feature 25 <= 
0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 0.5)\n", + " If (feature 36 <= 0.5)\n", + " If (feature 45 <= 13.5)\n", + " If (feature 38 <= 4.5)\n", + " If (feature 4 <= 0.5)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 13 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 13 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 3 <= 0.5)\n", + " If (feature 20 <= 0.625)\n", + " Predict: 2.0\n", + " Else (feature 20 > 0.625)\n", + " If (feature 16 <= 0.0165)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0165)\n", + " Predict: 2.0\n", + " Else (feature 3 > 0.5)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 19 <= 28.695)\n", + " Predict: 2.0\n", + " Else (feature 19 > 28.695)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 17 <= 84.4)\n", + " Predict: 0.0\n", + " Else (feature 17 > 84.4)\n", + " Predict: 1.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 38 <= 2.5)\n", + " If (feature 45 <= 8.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 8.5)\n", + " If (feature 39 <= 3.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 3.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 2.5)\n", + " If (feature 45 <= 11.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 11.5)\n", + " If (feature 18 <= 21.5)\n", + " Predict: 0.0\n", + " Else (feature 18 > 21.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 27 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 16 <= 0.966)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.966)\n", + " Predict: 2.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 16 <= 
0.0035)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0035)\n", + " Predict: 2.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 4 > 0.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 17 <= 46.75)\n", + " If (feature 19 <= 29.355)\n", + " If (feature 9 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 9 > 0.5)\n", + " If (feature 17 <= 37.2)\n", + " Predict: 0.0\n", + " Else (feature 17 > 37.2)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.355)\n", + " If (feature 45 <= 4.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 4.5)\n", + " If (feature 15 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 15 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 17 > 46.75)\n", + " If (feature 18 <= 87.5)\n", + " If (feature 20 <= 5.25)\n", + " Predict: 0.0\n", + " Else (feature 20 > 5.25)\n", + " If (feature 22 <= 0.035)\n", + " Predict: 2.0\n", + " Else (feature 22 > 0.035)\n", + " Predict: 0.0\n", + " Else (feature 18 > 87.5)\n", + " If (feature 38 <= 0.5)\n", + " If (feature 20 <= 1.9)\n", + " Predict: 0.0\n", + " Else (feature 20 > 1.9)\n", + " Predict: 2.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 20 <= 1.1)\n", + " If (feature 45 <= 9.5)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 1.0\n", + " Else (feature 21 > 3.25)\n", + " Predict: 0.0\n", + " Else (feature 45 > 9.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 1.1)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 41 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 44 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 4.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 4.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 19 <= 30.134999999999998)\n", + " If (feature 20 <= 3.5)\n", + " If (feature 8 <= 0.5)\n", + " If (feature 22 
<= 0.08499999999999999)\n", + " If (feature 31 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.08499999999999999)\n", + " Predict: 2.0\n", + " Else (feature 8 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 20 > 3.5)\n", + " If (feature 19 <= 29.545)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 45 <= 12.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.545)\n", + " Predict: 2.0\n", + " Else (feature 19 > 30.134999999999998)\n", + " If (feature 16 <= 0.966)\n", + " If (feature 45 <= 4.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 4.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 37 <= 14.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 14.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.966)\n", + " If (feature 45 <= 4.5)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 43 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 43 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 4.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 25 <= 0.5)\n", + " If (feature 22 <= 0.08499999999999999)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.08499999999999999)\n", + " If (feature 22 <= 0.095)\n", + " If (feature 4 <= 0.5)\n", + " If (feature 38 <= 5.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 5.5)\n", + " Predict: 0.0\n", + " Else (feature 4 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 22 > 0.095)\n", + " If (feature 21 <= 13.9)\n", + " If (feature 6 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 6 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 13.9)\n", + " If (feature 37 <= 8.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 8.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " If (feature 45 <= 1.5)\n", + " 
If (feature 31 <= 0.5)\n", + " If (feature 38 <= 5.5)\n", + " If (feature 37 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 5.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 5.5)\n", + " Predict: 0.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 45 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 1.5)\n", + " If (feature 14 <= 0.5)\n", + " If (feature 45 <= 12.5)\n", + " If (feature 37 <= 10.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 10.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 12.5)\n", + " Predict: 1.0\n", + " Else (feature 14 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 13.5)\n", + " If (feature 11 <= 0.5)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 39 <= 1.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 17 <= 65.85)\n", + " If (feature 19 <= 29.115000000000002)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.115000000000002)\n", + " Predict: 2.0\n", + " Else (feature 17 > 65.85)\n", + " If (feature 19 <= 29.825)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.825)\n", + " Predict: 1.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 45 <= 14.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 14.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 1.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 16 <= 0.0045000000000000005)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.0045000000000000005)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 19 <= 28.695)\n", + " Predict: 0.0\n", + " Else (feature 19 > 28.695)\n", + " If (feature 5 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 5 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 20 <= 9.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 18 <= 81.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 81.5)\n", 
+ " Predict: 2.0\n", + " Else (feature 20 > 9.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 38 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 4.5)\n", + " Predict: 2.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 38 <= 3.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 3.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 5.0E-4)\n", + " If (feature 45 <= 15.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 15.5)\n", + " If (feature 37 <= 7.5)\n", + " If (feature 40 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 7.5)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 45 <= 16.5)\n", + " If (feature 22 <= 0.295)\n", + " If (feature 19 <= 30.005000000000003)\n", + " If (feature 18 <= 65.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 65.5)\n", + " If (feature 35 <= 1.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 30.005000000000003)\n", + " If (feature 42 <= 0.5)\n", + " If (feature 23 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 23 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 42 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 22 > 0.295)\n", + " Predict: 2.0\n", + " Else (feature 45 > 16.5)\n", + " If (feature 35 <= 1.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 39 > 0.5)\n", + " If (feature 18 <= 58.5)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 58.5)\n", + " If (feature 38 <= 8.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 11 > 0.5)\n", + " If (feature 18 <= 65.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 19 <= 29.785)\n", + " If (feature 37 <= 15.5)\n", + " If (feature 37 <= 14.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 14.5)\n", + " If (feature 
17 <= 53.5)\n", + " Predict: 1.0\n", + " Else (feature 17 > 53.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 15.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.785)\n", + " If (feature 17 <= 71.15)\n", + " If (feature 44 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " Predict: 1.0\n", + " Else (feature 17 > 71.15)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 37 <= 3.5)\n", + " If (feature 39 <= 4.5)\n", + " If (feature 17 <= 80.69999999999999)\n", + " Predict: 2.0\n", + " Else (feature 17 > 80.69999999999999)\n", + " Predict: 0.0\n", + " Else (feature 39 > 4.5)\n", + " If (feature 37 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 2.5)\n", + " If (feature 44 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 3.5)\n", + " If (feature 21 <= 3.25)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 19 <= 28.695)\n", + " Predict: 1.0\n", + " Else (feature 19 > 28.695)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 31 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 31 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 45 <= 14.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 14.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 65.5)\n", + " If (feature 38 <= 3.5)\n", + " If (feature 45 <= 14.5)\n", + " If (feature 37 <= 8.5)\n", + " If (feature 19 <= 29.665)\n", + " If (feature 16 <= 0.1375)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.1375)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.665)\n", + " If (feature 16 <= 0.1375)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.1375)\n", + " Predict: 2.0\n", + " Else (feature 37 > 8.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 14.5)\n", + " If (feature 39 <= 
5.5)\n", + " If (feature 17 <= 72.95)\n", + " Predict: 2.0\n", + " Else (feature 17 > 72.95)\n", + " If (feature 20 <= 9.5)\n", + " Predict: 0.0\n", + " Else (feature 20 > 9.5)\n", + " Predict: 2.0\n", + " Else (feature 39 > 5.5)\n", + " If (feature 37 <= 5.5)\n", + " If (feature 16 <= 0.053500000000000006)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.053500000000000006)\n", + " Predict: 2.0\n", + " Else (feature 37 > 5.5)\n", + " If (feature 40 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 3.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 20 <= 8.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 8.5)\n", + " If (feature 39 <= 5.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 5.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 18 <= 77.5)\n", + " Predict: 1.0\n", + " Else (feature 18 > 77.5)\n", + " If (feature 39 <= 1.5)\n", + " Predict: 1.0\n", + " Else (feature 39 > 1.5)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " If (feature 41 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 22 <= 0.025)\n", + " Predict: 2.0\n", + " Else (feature 22 > 0.025)\n", + " Predict: 0.0\n", + " Else (feature 36 > 0.5)\n", + " If (feature 16 <= 5.0E-4)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 21 <= 17.65)\n", + " If (feature 45 <= 12.5)\n", + " If (feature 41 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 0 <= 0.5)\n", + " If (feature 20 <= 2.65)\n", + " Predict: 1.0\n", + " Else (feature 20 > 2.65)\n", + " Predict: 0.0\n", + " Else (feature 0 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 12.5)\n", + " If (feature 37 <= 2.5)\n", + " If (feature 38 <= 2.5)\n", + " If (feature 20 <= 9.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 9.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 2.5)\n", + " Predict: 1.0\n", + 
" Else (feature 37 > 2.5)\n", + " If (feature 32 <= 0.5)\n", + " If (feature 19 <= 29.744999999999997)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.744999999999997)\n", + " Predict: 0.0\n", + " Else (feature 32 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 17.65)\n", + " If (feature 45 <= 3.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 3.5)\n", + " If (feature 38 <= 7.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 7.5)\n", + " If (feature 19 <= 29.925)\n", + " If (feature 39 <= 2.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 2.5)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.925)\n", + " Predict: 1.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 38 <= 3.5)\n", + " If (feature 21 <= 0.6)\n", + " If (feature 9 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 9 > 0.5)\n", + " If (feature 19 <= 29.935000000000002)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.935000000000002)\n", + " Predict: 1.0\n", + " Else (feature 21 > 0.6)\n", + " If (feature 19 <= 29.994999999999997)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.994999999999997)\n", + " If (feature 21 <= 9.6)\n", + " Predict: 2.0\n", + " Else (feature 21 > 9.6)\n", + " Predict: 0.0\n", + " Else (feature 38 > 3.5)\n", + " If (feature 17 <= 37.2)\n", + " If (feature 1 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 43 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 43 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 17 > 37.2)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 19 <= 29.785)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.785)\n", + " Predict: 0.0\n", + " Else (feature 41 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 20 <= 2.65)\n", + " If (feature 18 <= 66.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 66.5)\n", + " Predict: 0.0\n", + " Else (feature 20 > 
2.65)\n", + " If (feature 38 <= 3.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 19 <= 29.115000000000002)\n", + " Predict: 1.0\n", + " Else (feature 19 > 29.115000000000002)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 3.5)\n", + " If (feature 25 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 25 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 38 <= 3.5)\n", + " If (feature 39 <= 1.5)\n", + " If (feature 19 <= 30.115000000000002)\n", + " If (feature 45 <= 21.5)\n", + " If (feature 19 <= 30.025)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.025)\n", + " If (feature 21 <= 9.6)\n", + " Predict: 0.0\n", + " Else (feature 21 > 9.6)\n", + " Predict: 2.0\n", + " Else (feature 45 > 21.5)\n", + " If (feature 20 <= 0.9)\n", + " Predict: 2.0\n", + " Else (feature 20 > 0.9)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 39 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.115000000000002)\n", + " If (feature 19 <= 30.255000000000003)\n", + " If (feature 37 <= 0.5)\n", + " If (feature 21 <= 0.6)\n", + " Predict: 0.0\n", + " Else (feature 21 > 0.6)\n", + " Predict: 2.0\n", + " Else (feature 37 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 30.255000000000003)\n", + " Predict: 2.0\n", + " Else (feature 39 > 1.5)\n", + " If (feature 19 <= 29.925)\n", + " If (feature 1 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 1 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 19 > 29.925)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 45 <= 17.5)\n", + " Predict: 2.0\n", + " Else (feature 45 > 17.5)\n", + " If (feature 45 <= 20.5)\n", + " Predict: 1.0\n", + " Else (feature 45 > 20.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 3.5)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 14 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 14 > 0.5)\n", + " Predict: 1.0\n", + " Else 
(feature 40 > 0.5)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 37 <= 13.5)\n", + " If (feature 39 <= 0.5)\n", + " If (feature 37 <= 2.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 2.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 37 > 13.5)\n", + " If (feature 17 <= 60.9)\n", + " If (feature 21 <= 4.05)\n", + " Predict: 0.0\n", + " Else (feature 21 > 4.05)\n", + " Predict: 2.0\n", + " Else (feature 17 > 60.9)\n", + " Predict: 0.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 18 <= 72.5)\n", + " If (feature 19 <= 29.355)\n", + " If (feature 41 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.355)\n", + " If (feature 17 <= 46.75)\n", + " Predict: 1.0\n", + " Else (feature 17 > 46.75)\n", + " Predict: 0.0\n", + " Else (feature 18 > 72.5)\n", + " If (feature 38 <= 4.5)\n", + " Predict: 2.0\n", + " Else (feature 38 > 4.5)\n", + " If (feature 19 <= 30.115000000000002)\n", + " Predict: 2.0\n", + " Else (feature 19 > 30.115000000000002)\n", + " Predict: 0.0\n", + " Else (feature 16 > 5.0E-4)\n", + " If (feature 1 <= 0.5)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 19 <= 28.695)\n", + " If (feature 45 <= 12.5)\n", + " If (feature 16 <= 0.2135)\n", + " If (feature 38 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 0.5)\n", + " If (feature 17 <= 78.9)\n", + " Predict: 0.0\n", + " Else (feature 17 > 78.9)\n", + " Predict: 2.0\n", + " Else (feature 16 > 0.2135)\n", + " Predict: 2.0\n", + " Else (feature 45 > 12.5)\n", + " If (feature 18 <= 67.5)\n", + " If (feature 3 <= 0.5)\n", + " If (feature 38 <= 4.5)\n", + " Predict: 0.0\n", + " Else (feature 38 > 4.5)\n", + " Predict: 2.0\n", + " Else (feature 3 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 67.5)\n", + " If (feature 21 <= 5.4)\n", + " If (feature 21 <= 0.6)\n", + " Predict: 2.0\n", + " Else (feature 21 > 0.6)\n", + " Predict: 0.0\n", + " Else (feature 21 > 5.4)\n", + " 
Predict: 2.0\n", + " Else (feature 19 > 28.695)\n", + " If (feature 40 <= 0.5)\n", + " If (feature 20 <= 7.5)\n", + " Predict: 0.0\n", + " Else (feature 20 > 7.5)\n", + " If (feature 16 <= 0.2895)\n", + " If (feature 22 <= 0.005)\n", + " Predict: 0.0\n", + " Else (feature 22 > 0.005)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.2895)\n", + " Predict: 0.0\n", + " Else (feature 40 > 0.5)\n", + " If (feature 27 <= 0.5)\n", + " If (feature 21 <= 20.85)\n", + " Predict: 2.0\n", + " Else (feature 21 > 20.85)\n", + " If (feature 11 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 11 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 27 > 0.5)\n", + " If (feature 16 <= 0.053500000000000006)\n", + " Predict: 0.0\n", + " Else (feature 16 > 0.053500000000000006)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " If (feature 44 <= 0.5)\n", + " If (feature 35 <= 1.5)\n", + " If (feature 17 <= 62.7)\n", + " If (feature 29 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 29 > 0.5)\n", + " If (feature 19 <= 29.884999999999998)\n", + " Predict: 0.0\n", + " Else (feature 19 > 29.884999999999998)\n", + " Predict: 2.0\n", + " Else (feature 17 > 62.7)\n", + " If (feature 3 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 3 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 35 > 1.5)\n", + " If (feature 37 <= 3.5)\n", + " If (feature 37 <= 0.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 37 > 3.5)\n", + " If (feature 21 <= 8.55)\n", + " Predict: 1.0\n", + " Else (feature 21 > 8.55)\n", + " If (feature 16 <= 0.0105)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.0105)\n", + " Predict: 0.0\n", + " Else (feature 44 > 0.5)\n", + " If (feature 31 <= 0.5)\n", + " If (feature 13 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 13 > 0.5)\n", + " If (feature 39 <= 0.5)\n", + " Predict: 0.0\n", + " Else (feature 39 > 0.5)\n", + " If (feature 21 <= 5.9)\n", + " Predict: 1.0\n", + " Else (feature 21 > 5.9)\n", + " Predict: 
2.0\n", + " Else (feature 31 > 0.5)\n", + " If (feature 38 <= 10.5)\n", + " If (feature 4 <= 0.5)\n", + " If (feature 20 <= 3.5)\n", + " Predict: 1.0\n", + " Else (feature 20 > 3.5)\n", + " Predict: 2.0\n", + " Else (feature 4 > 0.5)\n", + " Predict: 1.0\n", + " Else (feature 38 > 10.5)\n", + " If (feature 21 <= 9.6)\n", + " If (feature 37 <= 13.5)\n", + " Predict: 1.0\n", + " Else (feature 37 > 13.5)\n", + " Predict: 0.0\n", + " Else (feature 21 > 9.6)\n", + " Predict: 2.0\n", + " Else (feature 1 > 0.5)\n", + " If (feature 26 <= 0.5)\n", + " If (feature 45 <= 12.5)\n", + " If (feature 21 <= 3.25)\n", + " Predict: 0.0\n", + " Else (feature 21 > 3.25)\n", + " If (feature 17 <= 82.30000000000001)\n", + " If (feature 34 <= 0.5)\n", + " If (feature 19 <= 28.695)\n", + " Predict: 0.0\n", + " Else (feature 19 > 28.695)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 17 > 82.30000000000001)\n", + " If (feature 17 <= 90.95)\n", + " Predict: 2.0\n", + " Else (feature 17 > 90.95)\n", + " If (feature 23 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 23 > 0.5)\n", + " Predict: 0.0\n", + " Else (feature 45 > 12.5)\n", + " If (feature 18 <= 58.5)\n", + " If (feature 41 <= 0.5)\n", + " If (feature 34 <= 0.5)\n", + " Predict: 2.0\n", + " Else (feature 34 > 0.5)\n", + " If (feature 16 <= 0.1375)\n", + " Predict: 1.0\n", + " Else (feature 16 > 0.1375)\n", + " Predict: 2.0\n", + " Else (feature 41 > 0.5)\n", + " Predict: 2.0\n", + " Else (feature 18 > 58.5)\n", + " If (feature 16 <= 0.0155)\n", + " If (feature 45 <= 18.5)\n", + " If (feature 17 <= 51.2)\n", + " Predict: 2.0\n", + " Else (feature 17 > 51.2)\n", + " Predict: 0.0\n", + " Else (feature 45 > 18.5)\n", + " Predict: 2.0\n", + " Else (feature 16 > 0.0155)\n", + " Predict: 2.0\n", + " Else (feature 26 > 0.5)\n", + " Predict: 1.0\n", + "\n" + ] + } + ], + "source": [ + "# Tree from the best Model printing it \n", + "\n", + 
"print(cv_rf.bestModel.stages[-1].trees[3].toDebugString)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Base Model Multiclass Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Create initial Decision Tree Model\n", + "\n", + "dt = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for DT Base Model \n", + "\n", + "dt_pipe = Pipeline(stages=[label_stringIdx, va, dt])\n", + "\n", + "# Train DT Base model with Training Data\n", + "\n", + "dtModel = dt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "# Multiclass Evaluator to evaluate the performance of the model \n", + "\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "\n", + "evaluator_dt = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using Fitted Pipeline Built earlier for prediction of Test data\n", + "\n", + "pred_dt = dtModel.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy 0.5456816050238552\n" + ] + } + ], + "source": [ + "# Evaluation of model using Multiclass Evaluator on Test data\n", + "\n", + "print(\"Accuracy\",evaluator_dt.evaluate(pred_dt))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_dt=pred_dt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing classification_report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.93 0.42 0.58 131724\n", + " 1 0.48 0.79 0.60 58339\n", + " 2 0.14 0.95 0.25 6121\n", + "\n", + " accuracy 0.55 196184\n", + " macro avg 0.52 0.72 0.47 196184\n", + "weighted avg 0.77 0.55 0.57 196184\n", + "\n" + ] + } + ], + "source": [ + "# Generating the Classification Report to display all metrics at once\n", + "\n", + "print(classification_report(y_pred=prediction_dt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of DT Base Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Multiclass Base Model')" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Base Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dt['column'][:10], y=feat_imp_tuned_dt['weight'][:10],data=feat_imp_tuned_dt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Multiclass Base Model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Grid Search Multiclass Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Initializing the Decision Tree classifier for the Grid Search \n", + "\n", + "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for DT Grid Model \n", + "\n", + "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n", + "\n", + "# Creating Grid Search for Hyper Parameter Tuning for DT Model\n", + "\n", + "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n", + "\n", + "# Cross Validator (5 fold cv) wrapping the pipeline, used to fit the training data\n", + "\n", + "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator_dt,seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "# Fitting the training data using the Cross Validator Pipeline \n", + "\n", + "dtModel_t = cv1_dt.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform Test data using the fitted Cross Validation model to get predictions for Test data\n", + "\n", +
"pred_dtt = dtModel_t.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy is 0.6133068955674265\n" + ] + } + ], + "source": [ + "# Evaluation of Testing Data using Multiclass Evaluator \n", + "\n", + "print(\"Accuracy is\",evaluator_dt.evaluate(pred_dtt))" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='DecisionTreeClassifier_b6336355b38a', name='featuresCol', doc='features column name.'): 'features',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='seed', doc='random seed.'): 42,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). 
E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 35,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 30,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 500,\n", + " Param(parent='DecisionTreeClassifier_b6336355b38a', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. 
If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0}" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Decision Tree Hyper Parameter Values from Best Model\n", + "\n", + "dtModel_t.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SparseVector(46, {0: 0.0004, 1: 0.0001, 11: 0.0046, 14: 0.0, 16: 0.1281, 17: 0.0045, 18: 0.002, 19: 0.0092, 21: 0.0006, 22: 0.0002, 23: 0.001, 25: 0.0049, 27: 0.0056, 32: 0.0071, 34: 0.0834, 35: 0.5412, 36: 0.1411, 37: 0.0011, 38: 0.0093, 39: 0.0142, 40: 0.019, 41: 0.0001, 44: 0.0006, 45: 0.0218})" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Feature Importance Sparse Vector\n", + "\n", + "dtModel_t.bestModel.stages[-1].featureImportances" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_dtt=pred_dtt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing classification_report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.88 0.55 0.68 131724\n",
+ " 1 0.52 0.73 0.61 58339\n", + " 2 0.17 0.90 0.29 6121\n", + "\n", + " accuracy 0.61 196184\n", + " macro avg 0.52 0.73 0.52 196184\n", + "weighted avg 0.75 0.61 0.64 196184\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_dtt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of DT Grid Model for Multiclass Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Multiclass tuned')" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAmEAAAK9CAYAAABsCbsfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzde7xt9bz/8de7nRSVS22SrpJI5JKQcHI5vzjIpUPhuEvHSbke4RC5nOR+0tEJuR8pt1PkmktEVCoVIrmUxC66SKqdz++PMVbNVnPvvfbec+7vmnO/no/Heqw1xhxzrM+YlzHf8zu+4ztSVUiSJGnVWqN1AZIkSasjQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiT5rkkayepJJs0ruOiJDu3rEGdJE9O8rskf0lyt9b1zBdJjkzyH0u5/Q1J3ruy65kmSQ5K8oHWdayuDGG6Xr9Dn/n5e5KrBqafNuL/9bQk3+//x5eH3H6/JKcn+WuSHybZbinrOjLJ1bPqf/xK1jcvgs/qaD6FvSQnJXl66zqGeBfwnKpat6p+uir/8cB748r+vXZxkq8leeLAMr8ceC9el+RvA9MvHbLOg/p17jVr/v79/P1XoM5dk5w7OK+qDqiqfZZ3XeOU5K5JFreuQ20YwnS9foe+blWtC/wWeOzAvE+M+N9dArwDeOfsG5KsA/wfcDhwG+Bo4HNJ1lzK+t44WH9VfX7E9S63JAta16AVk2SNJPNy/5jkZsDGwNlLuH1p75NR2qbfV9wN+CTwgSSvBKiqrQb2JScDzxt4b97kPd/7OfDMWfP+pZ8vTaV5uZPR/JRknSSHJvl9kguSvK3/QLj+W2ff3P+nJOcl+eclrauqvlxVnwZ+P+TmRwJ/q6r/rqqr6cLaesByt44k2TTJ//Xf1s9LsvfAbQ9K8oMklyW5MMm7Bj7ATuh/nzPTspZk7yRfH7j/jVrL+ha5/0ry1SRXAg/sH7N3Jzm/b+E5JMnN++U3SvLlJJcmuSTJN5axOY9P8uski5K8OUn69dw1ybf6x31Rko8kWW+gztf2z9nlSX6a5MH9/AX9bef1j88nktx64H7PTfLbfp2vWMbjfNsk/9sv+6sk/z5Q395Jju8fm0v7VpJHLGNbZ9a7d5JvJHlv/zz9IskOSfZKdzjuD0n2GFj+yP4x/maSK/r/e8eB2x+a5Ef9uk5Kcr+B205KcmCSHwB/Bd4P3I8uXPwlyTv65d7Xv/4vT9dK+4CBdRzUP46f7P//j5Pca+D2LQZejxfPrLO/7QVJzumfxy8O1j2wzK2AP/eT5yQ5u59/UZKX99OX9/PukeQ7/WP+4ySPmvU4vSddC9aV/evndkn+u1/+7CT3mMtzVFWLquoIYF/ggCTrz+V+Q3wXuEOSrfoadwCuAc4cqHup78GB+RsAnwPulBta4DbIrENvSf6hf94v61/rT51dVJKFSb7Uv7b/1D9/dxi4/fnp3pdXZGC/l+59+d1+3YuSfHQJ230CsGCgznsPqfNGrWV9zQf0vy9PclyS2wzc/uB0+7ZL+9f7gwZuu3OSE/t6v0T3RVeNGMK0PN4A3BO4B3Bf4B+Afx+4fQtgLWAjYC/gI0m2XIH/c3fgjJmJqvo7cFY/f87StUQdB3yPruVgV+DVSR7aL3ItsA9wW+DBwGOB5/W3PaT/vc1ytqw9HXgtXWg8me6w0SZ0j9k2wF2AmUMrrwTOATYE7gC8fhnrfixwL2BHYE9g8BDxgXSP+8z/eQ1Aku2BZ/f3uxXwT8AF/X1eAfwjXbjdhO7xeFd/v3sB7wae0t+2RV/nkhwG3AzYki5E/ysw+IH2EOAUYAPgvcDy9EF5MN1zuAHweeAzdK0vWwLPB96XZO2B5f8FeDWwEPgF8JF+m24HHAsc1K/rMOC4PtjMeDrwDLrn7wXcuBXnZf0y36d7nDega7E9Ov2Xkd4TgCOAWwPH0z2OMy1YXwJ+Cmw
GbNpvC32QfDHdc3x74DTg47MfiKq6jBueh22qavA98RS6x36D/vH4Qv94LaR7ro+e9X58CvDyfn1rAicB3+636zjg4Nn/fxk+B6xDt29YEUW3zc/op58BLCm4LH1FVZfQPQ/nDbTAXTK4TJI70z1Gb6Pb5vsyvHVxDbrXymZ0rzm44X1ym/7+D6+q9eheq2f1y/wn3eN/6/6+/7OEch8CXDdQ52lz3Myn0u0D7tD/j/36mrbo/+9r6PZt/wF8fiCkHUUX/DYA3k73flEjhjAtj6cBB1TVxVX1B+BN3PgNvBh4Q1VdU1VfB74O7L4C/2dd4LJZ8y6j+2Bcktf03/ouTTITMnYG1q6qt/Y1/Rz4ELAHQFX9sKpOrqrrquqXdMHgocNXP2efrqof9MHxOuA5wH5VdWn/AXrQzP+nCz0bA5v19Z0wfJXX+89+Pb+iCzJ79tvxs6r6Rr+Oi+g+9Ge2YzHdB+O2wIKqOq+/P3QhY/+qurCq/kYXsp+SJMCTgc9U1ff71shXs4T9RbqWvScBr6yqv1TVuX0Ng6+Nc6rqo1V1HV0o2jwDrW7L8LOq+t+qWkz3AbIZ8Pp+e4+hC/5bDCz/+b7uv/V1PzzJQmA34PSqOqqqFlfVh+kC6aMG7vuBqjqnqq7t/99N9Nvx56q6FngL3YfZnQYW+UZVfa3f1o/RBWDoXo/rA6+uqr9W1VVV9b3+thcAb6qqn/frfQOwc5Lbz/ExAnhX/1xeRRcGAN7Zb8tXgK/RBa8ZR1fVGf3y/wdcVlWf6us+Crj3cvxvqupKuvfpbZfnfrN8FHh6krXo9h3/uxLrWpZ/AY6tqs/0r4dFVXXG7IWq6g9V9X/983UZXbiavZ/YLsnaVfW7gT5619K9Ljfq73viiOt/f1X9sn/cP80Nr7NnAp+tqq9X1d+r6jjgJ8A/JrkL3b5gZj99PHCTPrladQxhmpP+g3kj4DcDs38DDB4yWdR/8A3evvEK/Lu/0H1YDVofuGIp93lzVd26/5k5NLE5sMVAOLsUeGm/HSTZtj/M8IcklwOvY+mtPXNx/sDfG9O1Dp098P8/D9xupmbgQuCb6Q7l3qTD8lLWff1jm2TjJEenOzx3OV2Y3BCgqs6ma3l7M/DHdIfKbt8/n5vStQTN1HYa3T5hg37d1/+//sNndjCesVF/v9/Oqm/wtXHRwN9/7X+vu4ztnfGHgb+vAq7u6xmcN7iuwbr/RPd62rj/GXz9DqvzfJYhyavSHTa8jO7Q4Nrc+HUze1tnatsU+FUf0GfbHDhs4LlYRBegl+fEkNmvvd9WVQ3Mm72tsx/X2dNzfX4ASHJLutbWPy3P/Qb1Af4PdK/X0/ove+OyKfDLZS2UZL0kR/SHKy8HvsoN768/03053Re4KMkxfQsbwEuAWwCnpTscPOoTPJb0OtucLsgO7vd24Ib3wLD9tBoxhGlO+p35RXRv8BmbAb8bmN5w1mGhzehCxvI6G9h+ZiJdB+ntWEJH5KU4n64V5dYDP+tV1RP6298P/AjYqqrWpzukl/62GrK+K+l2qjM2GrLM4P1+T/dButXA/79VVW0AXbCpqv2qanO6lqT/GOy7McSmA38PPrZv62vbrt+O5w1sB1X1karaia61Zm26Fpeie+4eNuvxWbuqLu5rv/7/9YfsBg/bDboI+Htf02B9vxu++NgN1n1bug+n39M9XpvPWnZ2nbOf9xtNJ3kk8CK6Q123pmv1uYqBx3spzqf7UjBsv3s+8KxZz8U6VXXqHNY7rNYLufHzAeN/Tp5A91gsT83DfBR4GcMPRc7lPThj2Ht40PnAVnOoZ3+6MHy//v31j9z4/fXFqno4ffAF3tfP/11VPYfucOG+wBFJZj8nS6pzebZztvPpWnQHX0u3rKp30b0Phu2n1YghTMvjk3Qdbzfo+9e8hhv3W7kZ8NokayV5GF3/lM8MW1G6TuFr0/VFWSNdB9uZTvFfA9ZJ1wn35nTfKK+k67i7PL7b/68Xz6w/yT2
T3Ke/fT26QzB/SXJ3uv5FAPSH4C7jxoeZTgfuneTuSW5B13K2RP1hpSOA9yTZMJ1N+w9ykjwuyZZ9q9RldIcvr1vKKl+Z5FZ9n499gE8NbMdfgMv7nfz1LWp9a99D+8fxqv5n5n8cBhyUZNN+2dsleWx/21HAE5Pcv7/vm+iC1rDtvJquP9BbktwyXcfq/RjSp2kV2W1W3d+sqj8Cx9A9f7v3r4Vn0H0ALe1wzB+48WtgPbrDTIvoDoMeSBds5+K7dK25b0xyi3QnbezU33YYXQjfBrq+RkmeNMf1DvMduvfVi/ttfSRdeDh6JdY5VL8/eCbdIeg3VdXlK7nKj9HVOqwf5vK8B/8A3C7Jklr0Pgo8JskT+v3RwiT3HLLcenQtTZcm2ZCujxUASe6Y5J/6Wq6mex9e19/2lCQb9194Lu3vMuwQ9x/pOuYPhqHTgV369d+Grv/oXH0E+OckD++3a53+743ozjT9GTfsp3eh6yurRgxhWh6vo+tbcDbdTuJEbtx599d0O5mL6MLHs6vqvCWs6/l0geBddGHtKrp+TvR9VHYD9qbbee0BPH5JfXSWpA9BjwZ2omtyX0T3LXVmp/wS4HlJ/gIcyg2hZnB7j+6b9B9XVWf22/sduh3Zt+ZQxovpWiVOoQtaXwZmDlfcrV/HFXQdZd9eVSctZV1fpDth4RS6D9OZkPM6uv5Gl9GFocHguw7d2aUzrVvrcsMH18F0/fa+keQKus7v9wGornPwy+j6mlxA9w3/4qXU9oL+92+Ab9AdEh31sCZz9XG6vncX0z3Gz4Subw/wOLovD5fQBdnHVNWlS1gPdK/PZyT5c5KD6Tr2n0B3GOu8/n8smktRA6/H7bnhMX1if9sn6V7/n+0PeZ1O975YIf3hpsfQ9au6hG4omKdU1/dxVM7p3zs/p+tE/8KqesvKrrSqruz7M1095LbleQ+eQRe8f9O/h2/UV61/LHaj6zf4Z7r31bCTf95Od/jxErogfdzAbQuAV9Ht8y6hO5v2Rf1tDwRO7R+jo4G9quomRwb6Q5oH98temu6kmC/SnTTwE7oTJuY85E6/z30SXb/Ci+nek/sBa/SB8CnALnSHjf+ddl+WBOTGXQakFZNkV+C9VXXnZS4sjUmSI4GzqupNrWuRpGWxJUySJKkBQ5gkSVIDHo6UJElqwJYwSZKkBlbVhV5HZsMNN6wtttiidRmSJEnLdOqpp15cVQuH3TZxIWyLLbbglFNOaV2GJEnSMiVZ4lUJPBwpSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqYM3WBYzKfV/x0dYlLJdT3/aM1iVIkqSGbAmTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgNjDWFJdk1yTpJzk+w/5PZ/SHJZktP7n9eNsx5JkqT5Ys1xrTjJAuBQ4JHABcDJSY6pqp/MWvQ7VfWYcdUhSZI0H42zJWxH4NyqOq+qrgGOBHYb4/+TJEmaGOMMYXcEzh+YvqCfN9sDk5yR5EtJ7j5sRUn2SnJKklMWLVo0jlolSZJWqXGGsAyZV7OmfwRsXlXbA4cAnx+2oqo6vKp2qKodFi5cOOI
yJUmSVr1xhrALgE0HpjcBLhxcoKour6q/9H8fB9wsyYZjrEmSJGleGGcIOxnYOsmWSdYC9gCOGVwgyUZJ0v+9Y1/PJWOsSZIkaV4Y29mRVbU4yT7AV4AFwBFVdXaSvfvbDwN2B/41yWLgKmCPqpp9yFKSJGnqjC2EwfWHGI+bNe+wgb/fC7x3nDVIkiTNR46YL0mS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaGGsIS7JrknOSnJtk/6Usd78k1yXZfZz1SJIkzRdjC2FJFgCHAo8CtgX2TLLtEpZ7K/CVcdUiSZI034yzJWxH4NyqOq+qrgGOBHYbstyLgM8AfxxjLZIkSfPKOEPYHYHzB6Yv6OddL8kdgScAhy1tRUn2SnJKklMWLVo08kIlSZJWtXGGsAyZV7Om3w28sqquW9qKqurwqtqhqnZYuHDhyAqUJElqZc0xrvsCYNOB6U2AC2ctswNwZBKADYFHJ1lcVZ8fY12SJEnNjTOEnQxsnWRL4HfAHsBTBxeoqi1n/k7yYeALBjBJkrQ6GFsIq6rFSfahO+txAXBEVZ2dZO/+9qX2A5MkSZpm42wJo6qOA46bNW9o+KqqZ42zFkmSpPnEEfMlSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAaWO4QluU2Se85
x2V2TnJPk3CT7D7l9tyQ/TnJ6klOS7Ly89UiSJE2iOYWwJN9Ksn6S2wJnAB9K8s5l3GcBcCjwKGBbYM8k285a7Hhg+6q6F/Ac4APLuwGSJEmTaK4tYbeqqsuBJwIfqqr7Ao9Yxn12BM6tqvOq6hrgSGC3wQWq6i9VVf3kLYFCkiRpNTDXELZmkjsATwa+MMf73BE4f2D6gn7ejSR5QpKfAV+kaw27iSR79YcrT1m0aNEc/70kSdL8NdcQ9gbgK3QtWycnuRPwi2XcJ0Pm3aSlq6o+V1V3BR4PvHHYiqrq8Kraoap2WLhw4RxLliRJmr/WnONyv6+q6zvjV9V5y+oTRtfytenA9CbAhUtauKpOSLJVkg2r6uI51iVJkjSR5toSdsgc5w06Gdg6yZZJ1gL2AI4ZXCDJnZOk//s+wFrAJXOsSZIkaWIttSUsyQOBnYCFSV46cNP6wIKl3beqFifZh+4w5gLgiKo6O8ne/e2HAU8CnpHkWuAq4CkDHfUlSZKm1rIOR64FrNsvt97A/MuB3Ze18qo6Djhu1rzDBv5+K/DWuRYrSZI0LZYawqrq28C3k3y4qn6zimqSJEmaenPtmH/zJIcDWwzep6oeNo6iJEmSpt1cQ9jRwGF0I9pfN75yJEmSVg9zDWGLq+p9Y61EkiRpNbKssyNv2/95bJIXAp8Drp65var+NMbaJEmSptayWsJOpRvlfmb0+1cM3FbAncZRlCRJ0rRb1tmRW66qQiRJklYnc+oTluSJQ2ZfBpxZVX8cbUmSJEnTb64d858LPBD4Zj/9D8BJwF2SHFhVHxtDbZIkSVNrriHs78DdquoPAEluD7wPuD9wAmAIkyRJWg5zvYD3FjMBrPdH4C792ZHXjr4sSZKk6TbXlrDvJPkC3aCt0F14+4QktwQuHUtlkiRJU2yuIezf6ILXg+iGq/go8JmqKmCXMdUmSZI0teYUwvqw9en+R5IkSStpWSPmf7eqdk5yBd3grNffRJfN1h9rdZIkSVNqWYO17tz/Xm/VlCNJkrR6mOvZkSTZOcmz+783TOJo+pIkSStoTiEsyQHAK4FX9bPWAj4+rqIkSZKm3Vxbwp4APA64EqCqLgQ8RClJkrSC5hrCrunPkCyAfnwwSZIkraC5hrCjkvwPcOskzwe+Drx/fGVJkiRNt2UNUfFi4ETg3XSDsl4ObAO8rqq+Nv7yJEmSptOyBmvdBHgPcFfgx8D36ELZqWOuS5Ikaaota5ywlwMkWQvYAdgJeA7w/iSXVtW24y9RkiRp+sz12pHrAOsDt+p/LgTOHFdRkiRJ025ZfcIOB+4OXAH8gO5w5Dur6s+roDZJkqSptayzIzcDbg5cBPwOuAC4dNxFSZIkTbtl9QnbNUnoWsN2Al4GbJfkT8D3q+qAVVCjJEnS1Flmn7B+kNazklwKXNb/PAbYETCESZIkrYBl9Qnbl64F7EHAtXTDU3wfOAI75kuSJK2wZbWEbQF8GnhJVf1+/OVIkiStHpbVJ+ylq6oQSZKk1clcrx0pSZKkETKESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgNjDWFJdk1yTpJzk+w/5PanJflx//O9JNuPsx5
JkqT5YmwhLMkC4FDgUcC2wJ5Jtp212K+Ah1bVPYE3AoePqx5JkqT5ZJwtYTsC51bVeVV1DXAksNvgAlX1var6cz95ErDJGOuRJEmaN8YZwu4InD8wfUE/b0meC3xpjPVIkiTNG2uOcd0ZMq+GLpjsQhfCdl7C7XsBewFsttlmo6pPkiSpmXG2hF0AbDowvQlw4eyFktwT+ACwW1VdMmxFVXV4Ve1QVTssXLhwLMVKkiStSuMMYScDWyfZMslawB7AMYMLJNkM+CzwL1X18zHWIkmSNK+M7XBkVS1Osg/wFWABcERVnZ1k7/72w4DXARsA/50EYHFV7TCumiRJkuaLcfYJo6qOA46bNe+wgb+fBzxvnDVIkiTNR46YL0mS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgNrti5Ac/PbA+/RuoTlstnrzmxdgiRJ85otYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwAt4q7kHHfKg1iUstxNfdGLrEiRJE86WMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIk56bgR0AACAASURBVCRJasAQJkmS1MBYQ1iSXZOck+TcJPsPuf2uSb6f5OokLx9nLZIkSfPJmuNacZIFwKHAI4ELgJOTHFNVPxlY7E/AvsDjx1WHJEnSfDTOlrAdgXOr6ryqugY4EthtcIGq+mNVnQxcO8Y6JEmS5p1xhrA7AucPTF/Qz1tuSfZKckqSUxYtWjSS4iRJkloaZwjLkHm1IiuqqsOraoeq2mHhwoUrWZYkSVJ74wxhFwCbDkxvAlw4xv8nSZI0McYZwk4Gtk6yZZK1gD2AY8b4/yRJkibG2M6OrKrFSfYBvgIsAI6oqrOT7N3ffliSjYBTgPWBvyd5MbBtVV0+rrokSZLmg7GFMICqOg44bta8wwb+vojuMKUkSdJqxRHzJUmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDUw1ssWSYJvP+ShrUtYbg894dutS5CkqWdLmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkmS1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQGDGGSJEkNGMIkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgCFMkiSpAUOYJElSA4YwSZKkBgxhkiRJDRjCJEmSGjCESZIkNWAIkyRJasAQJkm
S1IAhTJIkqQFDmCRJUgOGMEmSpAYMYZIkSQ0YwiRJkhowhEmSJDVgCJMkSWrAECZJktSAIUySJKkBQ5gkSVIDhjBJkqQG1mxdgKTJ9t6XHdu6hOW2zzse27oESbIlTJIkqQVDmCRJUgMejpSkpXjz03dvXcJyec3HP926BElzZEuYJElSA4YwSZKkBgxhkiRJDdgnTJJWYz998zdal7Bc7vaah7UuQRoZW8IkSZIaMIRJkiQ1YAiTJElqwBAmSZLUgB3zJUlT6fWvf33rEpbb8tR81NE7jq+QMXnyP/+wdQnzii1hkiRJDdgSJkmS5p3tP/2V1iUstzN2/3/LtbwtYZIkSQ0YwiRJkhowhEmSJDUw1hCWZNck5yQ5N8n+Q25Pkv/qb/9xkvuMsx5JkqT5YmwhLMkC4FDgUcC2wJ5Jtp212KOArfufvYD3jaseSZKk+WScLWE7AudW1XlVdQ1wJLDbrGV2Az5anZOAWye5wxhrkiRJmhdSVeNZcbI7sGtVPa+f/hfg/lW1z8AyXwAOqqrv9tPHA6+sqlNmrWsvupYygG2Ac8ZS9HAbAhevwv+3qrl9k22at2+atw3cvknn9k2uVb1tm1fVwmE3jHOcsAyZNzvxzWUZqupw4PBRFLW8kpxSVTu0+N+rgts32aZ5+6Z528Dtm3Ru3+SaT9s2zsORFwCbDkxvAly4AstIkiRNnXGGsJOBrZNsmWQtYA/gmFnLHAM8oz9L8gHAZVX1+zHWJEmSNC+M7XBkVS1Osg/wFWABcERVnZ1k7/72w4DjgEcD5wJ/BZ49rnpWQpPDoKuQ2zfZpnn7pnnbwO2bdG7f5Jo32za2jvmSJElaMkfMlyRJasAQJkmS1IAhTJIkqQFDmKR5JcnaQ+Zt2KKWUUry3FnTC5Ic0KoeSe0ZwoZI8ogh857ZopZxSPLGJGsOTK+f5EMtaxq1JJvPPI9J1kmyXuuaVkaS05L8aMjPaUl+1Lq+ETu5H7IGgCRPAr7XsJ5ReXiS45LcIcl2wEnARL8uZ0tyuyHztmlRyzhM8/YluX2SDyb5Uj+97ewvDpMsyQZJDun3m6cmeU+SDZrX5dmRN5XkBOBs4OXAusAHgKuravemhY1Ikv8E/pFuSJCNgEOAQ6rqvU0LG5Ekz6e7zNVtq2qrJFsDh1XVwxuXtsKSbLW026vql6uqlnFLcg/gCOBbwMbABsDzquqClnWNQpKnAIfSDcmzZ1Wd2LikkUpyDvDaqjqqn34Z8Nyq2rZtZaMxzdvXh68PAa+pqu37L+qnVdU9Gpc2Ekm+BpwAfLyf9TTgH6rqJo0uq5IhbIgkAV4GvKCf9bqq+mTDkkaubyU6Fvgz8JCqOrdxSSOT5HS6C8j/oKru3c87c1p2JquDJI8HPgZcwZS8PvsvAx8BzgTuBvwEeGlV/bVpYSOU5A50YzD9Dbg98FPgZVX1l6aFjcg0b1+Sk6vqfklOG9hvnl5V92pd2ygkObWq7jtrXvPLF3k4crjbAPcHfglcDWzeB7OpkOQhwHuAA+laG96bZOOmRY3W1VV1zcxE/41uKr5tJLlfkpOSXJbkb0muTnJ567pGKckHgRcD96RrrT02yb+1rWokjqX7QvcC4KHAL+iuLDI1+iuefBl4ILAF8NFpCCgzpnz7ruwPzxXAzFVs2pY0Ut9MskeSNfqfJwNfbF2ULWFDJPk5cFBVHZFkHeCtwA5VtVPj0kYiyQ+BZ1XVT/rpJwJvqaq7tq1sNJIcDFwKPAN4EfBC4CdV9ZqmhY1AkpOBpwNH0rX2PQvYtKpe17KuUUryEuDd1e+cktwKeGdVTXT/lCTrV9Xls+ZtXVW/aFXTqPWHfH4P7Et3LeAjgBOq6uVNCxuRad6+JPeh65qyHXAWsBDYvap+3LSwEUlyBXBL4O/9rDWAK/u/q6rWb1KXIeymkmxWVb+dNe8hVXVCq5pGKcmCqrpu1rwNquqSVjWNUpI1gOfS9XsL3aWzPlBT8GKfaVIfPLya5Hv
T8gVhRv/lZ7OqOqd1LaOS5BZ03Rw2q6rn94cnt6mqLzQubWSSPL6qPj8wvSbwqqp6Y8OyRmZat6/fZz4A+CGwDd1+85yqurZpYasBQ9gQ/aHHpwF3qqoDk2wGbFRVP2xc2kgkuT3wFuCOVbVrkm2BB1bVBxuXNhJJbgn8bSZoJlkA3Hwa+t70J408gu4b+G/pvpU/v6ru2bSwEUryWODtwFpVtWWSewEHVtXjGpe2UpJ8CjgVeEZVbdcHze9PS5+bGUk2B7auqq/327hmVV3Ruq5RmdbtS/L9qnpg6zrGKcnjgIf0k9+aD1+A7BM23H/THfPfs5++gu6MpmnxYbrWoTv00z+n64MzLY4H1hmYXgf4eqNaRu1ZdO/bfYDrgK2BqThrd8Dr6Q61XgpQVacDW7YsaES2qqqDgWsBquoquhaHqdGfmfxp4H/6WZsAn1/yPSbLlG/fV5M8aZr6Pw9KchCwH90JMT8B9uvnNbXmshdZLd2/qu6T5DSAqvpzkrVaFzVCG1bVUUleBVBVi5Nct6w7TZC1BzvLVtVf+kNBE6+qzuv//Bvw2pa1jNHiqrps1mfBNDTZX9O3nMz0dduK7sSfafJv9GcmA1TVL4aNrTXBpnn7XkrXZ2pxkr/RfUFo1ldqDB4N3Kuq/g6Q5CPAacD+LYsyhA13bX8Ia2ZnuZAbOvNNg2k/C+bKJPepqh8BJLkvcFXjmkaif64OADZn4P1bVXdpVtTonZXkqcCCvt/UvkzHYK0H0J1Zt2mSTwAPomvZnCZXV9U1MwF6ms5M7k3t9lXVVA0cvAS3Bv7U/32rloXMMIQN91/A54DbJXkz3eGe/2hb0ki9FDgG2CrJifRnwbQtaaReDByd5MJ++g7AUxrWM0ofAv6drm/RNLVeDnoR8Bq6VqJP0h06n+iOzwBV9bV0Vzd4AF0rw35VdXHjskbt20leDayT5JF0ZyYf27imUZra7euHLrqJaTkhDfhP4LQk36R7/z0EeHXbkuyYv0RJ7go8nO7JOr6qftq4pJHqv8FN7VkwSW7GDdv3s2nZviQ/qKr7t65Dc9ef+r9EMy2202Caz0yG6d6+JINhcm26w66nVtXDGpU0cv1gu/eje+5+UFUXNS7JEDYoyW2XdntV/Wlpt893/XhgS1RVn11VtYxbkp3oBlMcPGT30WYFjUi6S04BfJaB/kTTMJZP/yGwxB3SpJ4d2X/zhu6DbQfgDLoPgXvSfRDs3Ko2aUmSbAocXFV7LnPhCZDk+NmXrhs2b1XzcOSNnUr3IRBgM7pL+oTuOPJvmfwztB7b/74dsBPwjX56F7qR86cihCX5GLAVcDo3HLIrYOJDGLDzrN/QbdvQQwkT5u397yfSXdN05hpvewK/blHQKFTVLgBJjgT2qqoz++nt6K5PO/GSnMnSA/RED6Ey7du3BBfQDdw60ZKsDdwC2DDJbbjhjOT16a5N25QhbEBVbQmQ5DDgmKo6rp9+FN3YTBOtqp4NkOQLwLb9JThmmminaQiOHei2b+qaeavqwa1rGJeq+jZAkjdW1WCoPLYfH23S3XUmgAFU1Vn9GGjT4DH975nLS32s//00uouVT7pp3z6SHMINQXMN4F50rbaT7gV0/YQ3pmtomTEvhp7ycOQQmacX+hyVJGdV1XYD02sAPx6cN8mSHA3sOxMyp0mSfYfMvoyu78ZZq7qecUjyU+CfZobjSLIlcFxV3a1tZSsnySfpLpPycboPu6cD607L4R6AJCdW1YOWNW9STfP2JXnmwORi4NdVdWKrekYlyf3oWvV2r6pD+u18El3r+utbdzOyJWy4i5P8BzfeWU7FJX1630ryFbozzwrYA/jm0u8yUTYEfpLuGpmD/aYmsk/RLDvRdSydGen50XSXGtkvySeq6h3NKhudl9C9RmfGRNuC7tvspHs28K90A0YCnAC8r105Y3HLJDtX1Xfh+r6Zt2xc0yhN7fZV1Udm/u4P223asJxR+h/gEX0AewjdWZIvomv
pO5zGIwPYEjZE30H/AG7oZ3MC8IbWiXmU+k76M4e2Tqiqz7WsZ5SSPHTY/JnDXZOsD8+7z1wmJcl6wFF03+xOqaptW9Y3KkluDsxcUP5nVTVtg5pOpX5MviO4YQymS4HnTMsZoNO8fUm+BTyOrnHmdGAR8O2qemnLulZWkjOqavv+70OBRVX1+n769NaXDTOESROkP1R3j6pa3E+vBZxRVXdLclpV3btthaMxjWe3JnkQ3SWZZg+0e6dWNY1LkvXpPl+maRDo603j9s3sP5I8D9i0qg5I8uNJP+kgyVl0I+UvTvIzupNjTpi5rXU3HA9HDpHkLnRnLW3BjXeWUzFeSt8K9la6syTDlF2eoh9V/hDgbsBawALgyinZvqOA7yeZuV7d44Cj0l20/Jx2ZY3OFJ/d+kG6Q61TO9Bu34L5JPp958zI8lV1YMOyRmbKt2/N/iStJ9MNljwtPkk3yO7FdFdO+Q5AkjszD64UYwgb7mjgMOADTOfO8mDgsdM2AO2A99L1czua7kzJZ9Bd6Hri9d9Oj6MbomJm1PWT+pv3aFfZSE3r2a2XVdWXWhcxZv9Hf6II03ddTJju7TuQbvDZ71bVyUnuBPyicU0rrarenOR4uiunfHVgv7IGXd+wpjwcOcSwsyOnybSczbMkM2eyDjalJ/leVe3UurYVleSWVXVlfxjkJqrq8lVd07hM69mtSQ6ia5WdPdDuxPcnmjEfDu+M07Rvn1Y9W8KGOzbJC+muHzm4s5yWjvmnJPkU8HluvH1TMVgr8Ne+r9TpSQ4Gfs/kn8H0aeBRwNnceNDI9NObtShqTKb17NaZy00NDnVTwFR0c+h9L8k9BsdDmzJTu339vvJNdIfsvgxsD7y4qj6+1DtqpdgSNkSSXw2ZXdPSgTbJh4bMrqp6ziovZgySbA78ga4/2EvozmQ6tKp+2bQwzck0n9067ZL8BLgz8Cu6AD3T33SiO3fPmObtmzlTMMkTgMfT7Tu/OXNmocbDEKapk2S/qnrPsuZNkv46bpfNHHbsx7vZjW7AwcNqSi5QPo2SLPUU/6p656qqZdz6L0A3UVW/WdW1jMM0b1+Ss6vq7kneD3ymqr48OLyDxsPDkQMy5Re4nnVZipuoqmGjsU+iZwKzA9ezhsybJEfTDSp4eZLt6Q6VH0x3iOsewF4NaxuJJFcw/PU56Wfvrte6gHHrx1aE7lIwU2fat693bD+Ew1XAC5MsBP7WuKapZ0vYgCUcppsx8YfrZl2W4iYGR0yeREn2BJ5Kd+bgdwZuWh9YXFUTe/3PWScZvA2gql7RX3LqjKq6R9MCV6Ekt6mqP7euY9SSvKqq/rN1HSui78JR3HBx5EET35Vj2rdvRj9S/uVVdV0/7M16VXVR67qmmSFsBSR55qQHlqVJckhVNT91d3n1hwq2pLssxf4DN11Bd23MxU0KG4EkZ84ErSSnAq+pqi/30xM/oOLySPKjqrpP6zpGbVq3a1CSu1fV2a3rGJdJ3r4ktwBeCmxWVXsl2RrYpqq+sIy7aiWs0bqACbXfsheZaBM5fEVV/aaqvgU8AvhO35H798AmDP8GO0m+neR/k7wD2AD4BkCSjYDVrT/YpD+XSzKt2zXoY60LGLNJ3r4PAdfQXZ8Wuotev6ldOasHQ9iKWR12lpPsBGDtJHcEjqe7cPKHm1a08vYFjgMuAh5cVdf08zcGXtusqjamtfl+Wrdr0LTvOyd5+7aqqoPpv9RV1VVM9vZMBDvmr5jVYWc5yVJVf03yXOCQqjo4yWmti1oZVfV34Cbj9cwe6DPJd6tq51VWmEZpdfjAm/Z95yRv3zVJ1qHfhiRbMX1XBZh3DGErZtp3lpO+fUnyQOBpwHP7eavLa33SB6Wdi0l/fS7J0a0L0GrtALpBWjdN8gm6binPalrRasDDkSvmxNYFjEJ/9sswkzyUA8CLgVcBn6uqs/troH2zcU2ryiR/EyfJGknOWsZiD18lxYxIkoOT7D1k/kuSvHVmuqr
esmora+KaZS8yP6Wz6TIWm8jtS3cl8p8BT6QLXp8Eduj72GqMPDtyiCS3B94CbFxVj0qyLfDAqvpg49JGIslOdBcnX7eqNuvHnXpBVb2wcWlaSdNwhl3/LfxVVfXb1rWMQj/K+nb9IeXB+WvQnbU7NdciTHJgVb1uYHoB8NGqelrDskZmmq8rPM3bNp/ZEjbch+muJr9xP/1zutaVafEu4P8BlwBU1RnAQ5pWNAJJ3t3/PjbJMbN/Wte3ikzDobo7AGcnOX5Knr+aHcD6mX9nOp6vQZsleRVAkpvTDSr8i7YljdRJSe7XuogxmeZtm7dWl34yy2vDqjpqZmdSVYuTXNe6qFGqqvO7FujrTcP2zZwe/vamVYxRks2AP1bV3/rpdeher+f3izyrVW0j9IbWBYzYX5NsXVU3CiP9OExXNappXJ4NfKLfd+4CfKmq3tW4plHaBXhBkt8AVzJF146k27a9k/ya6du2ecsQNtyVSTbghrNEHgBc1rakkTq/PyRZSdaiG/7gp41rWmlVdWr/+9v9JTeoqkVtqxq5z3LDOD4Afwc+A+wI17dqTrQpvFD364AvJXkTcGo/bwe6fotT0cKeZPAQ+HuA/6HrO/vtJPeZfRbvBHtU6wLGaJq3bd6yT9gQ/Q7lEGA74CxgIbB7Vf24aWEjkmRDuh3lI+i+7XwV2K+qLmla2ErqO5ceAOxDt11rAIvphqk4sGVto5Lk9Kq616x5U3WR3VnXkFwLuBlw5QRfO5Ik2wGvoNunAJwNvK2qzmxX1egkWdqJL1VVD1tlxawCSW4HrD0zPcn9F5OsDewN3Bk4E/jgJF9dZNIYwpYgyZrANnQf5udU1eo2KvnESfIS4NHAXlX1q37enYD3AV+ehsMiSY4H3lFVx/XTjwFeVlW7tK1sfJI8Htixql7duhat3pI8DngHXX/hPwKbAz+tqrs3LWwlJPkU3QCt36FrDftNVU37VWHmDUPYEEn+DfhEVV3aT98G2LOq/rttZSsnySEsZQiDqtp3FZYzcv2ArI+sqotnzV8IfLWq7t2mstFJchfgf+kuXQSwCHh6Vf28XVXjl+SkqnpA6zpWxLJOKqiqx62qWsZtNTiz/AzgYcDXq+reSXah+2zYq3FpK2zWdWnXBH446WdYTxL7hA33/Ko6dGaiqv6c5PnARIcw4JT+94OAbYFP9dP/zA19VSbZzWYHMOj6hSW5WYuCRq0PWzskuXU/fWnjkkYuyRMHJteg6z81yd8WHwicTzf20g+YvjMiB32Y7hqEr+mnf063n5mKEAZcW1WX9OPZrVFV3xwc621CXX+Upz8JrWUtqx1D2HBrJEn1zYT9WDdrNa5ppVXVRwCSPAvYZeYQa5LD6PqFTbqlDZQ4kYMozkiyZ1V9Msm+s+YDUFX/1aSw8XjswN+LgV8Du7UpZSQ2Ah4J7Ak8Ffgi8MmqOrtpVeMx7WeWX5pkXbpDd59I8ke61+gk2z7J5f3fAdbpp2fOjpzYvpiTwBA23FeBo/pwUnSdFr/ctqSR2hhYD/hTP70uN4yJNskGdyaDwkAn2gl16/73wqZVrAJV9ezWNYxSVV1Ht//4cj921p7At/qBTQ9pW93ITfuZ5bvRDSvyYrrLot0KmOiTfqpqQesaVmeGsOH+HdgL+FduOHvwA00rGq2DgNMGzmh6KPD6duWMxpTvTGYul3JaVX22aSVjlmQTurOTH0T3Yf5durN3L2ha2Erow9c/0QWwLYD/ohtuZNq8FDgG2CrJifRnlrctaXSq6sokmwNbV9VHktwCmOb9jsbMjvmz9IceP1JVT29dyzgl2Qi4fz/5g6q6qGU9Wrr/397dB9tVlXcc//4SY7AkEBgdcVoMEpBIKaHBlAqIQkT6MliopaUDioCtaLW2iC3TUkboTK1lxKHTF0EqhYyVlpd2JJYCk0ICAUFIQkwLFiQgtLS8KBBsKhp//WPtC+deTpJLsnPX3fv+PjNnztl7n3PPc5hLznPXetazJH0
DOAj4et+LZiXdRFl8MNJ892TgJNtH14tq20m6nNKa4nrgSttb2xuz0/q8srypDf5NYHfb85qGu5+33an9TGPySBI2hKQbgGNtd7qOaCxJ823fP6ax4ot61FCxdyRdCJwO7AwMTrmO1G3sXiWwHWAzvdBedq4rJP2I0oEcRi8w6F3NTTMydCYw1/ZvNEnKfraXVg6tFZLWUBoj3zmy2npwdWHEK5XpyOEeBlY2S8tH/vHE9oXVImrHmZS/4j7bHI/NwHvVULFnfg/4BLAU6E1Lg814StLJlNWEUKbwutxI+N4+tEcZp8soK63f1hw/BlxF+b3tg+/bfmFkQUwz6peRjNhm2cB7uP+i/KMxjVLAPnLruksl7WH7yKa55+XA85RdAXpTt9FTdzardZ+0vWnsrXZwLTsN+FXgv4HHKb+bp1WNaPtMpS/pebb/jKbtge2N9Kslx3JJf0BZQXg0JcG8rnJM0WGZjpxCJK0C3mX7O5KOAK4EPkapNXqL7SRik5SkdcCnKSuxfnfsddtbbAga9Uh6DNjsKHoPRthfJOl2YDGw0vZCSfMo7Th+pnJorZA0jVIW8G5KcnkDcKnzRRrbKNORQzSrBl/2P1UP9j+bbnukLcWvAZfYvga4pql1iMnrtygF6nMozXUHmbIirdN6vKPDdEobmD6NCG3OpyjtOPaU9CXKCtcP1AyoZb9A2VvxC7UDiX5IEjbcWQOPdwLeS/cb8gFMl/SqZnPWxZT6sBH5XZjEbC+nTIXcbfvi2vHsIHcPPD6Pshl7Hzzelw3kt8b2jZLuAX6WknR+fNguFh12InCRpGuAy2zfVzug6LZMR46TpOW231E7ju0h6Q8pf8k9BbwRWGjbkvahtOU4rGqAMS6S5lO2nXqxAa3tv6sXUfskre5LMXufPsvWSFoCrAButX1/7Xh2BEm7UBaLnEoZub2MMuW6oWpg0UlJwoaQNLjcfxpwMPDntverFFJrmg7Wb6BsaP295tybgVlpUTH5STqHUo8yn1KPcgxwm+1f3uILO0bSqr70Q5O0+0AZQK9JOgo4HHg7sDewBlhh+6KqgbVM0msp5QG/A9wH7EP5jujbDgixgyUJG0LSespfOKJMQ64Hzrd9W9XAYsobaNq6yvYCSW8ALrbdq7YVfUrCppqm4fUi4EjKlm8bbc+vG1U7JB1LWak7j9JM+HLbTzT90e6zPbdqgNE5qQMawvabascQsRkbbW+S9ENJsyltHPauHVQbJG3gpcL8HxuzqXCvmpr2laRllIbCd1A2uV5k+4m6UbXqBOBztlcMnrT9v5K63EYlKkkSNoSkGZR9I49oTt1CGW3ozfYb0VmrJc0BvkgpZH8O6MU0su0+9OKb6tZSyjcOoGzc/YykO5p+YZ1n+/1buLas+axv29xzIsbKdOQQki4FZlCamQK8D9hk+4P1ooqpTqVN9x62H2+O9wF2SS1fTDaSZlEK18+i/M7OrBzShJhKizCiHRkJG26R7QUDx/8q6d5q0URQ5uMkLaWMNGD7wcohRYwi6aOUovyDgUcoI7a3Vg1qYmVUI16RJGHDbZI0z/a3ACTtDfRta5joprskLczoV0xSr6HsDnBP049wFEm72f7uxIcVMTllOnIISYspvV8eak7tBZxq++ZqQcWUNtJkt1kd+RbgW5TN5UeK1rOSMCa9rq56lTTT9vfH8bxMR8YrkpGwAZIWAY82BZb7Ah8C3gXcCGQ6Mmq6C1gIHFc7kIjt0NWtm+4AFkpaYvt9W3jelq5FvEySsNEupiRdAIcAZ/PSBteXANngOmoRwMgUeURHdXXq5dWSTgEOlfSyxsi2r23u1014ZNFpScJGywbXMVm9TtKZm7to+8KJDCZiijkDOAmYAxw75pqBayc8ouiFJGGjZYPrmKymA7Po7nRO9JikN9leP56n7vBgdoBmt5TbJN1t+29qxxP9kcRitC8DyyU9BWykWVrd9GN6tmZgMeU9bvv82kFEbMbVwMGSltlevIX
nbelaFyyR9Nu81Mh7OfD5NPKObZXVkWNkg+uYjLLqKiYzSauBfwI+CHxuHOyt1QAABu5JREFU7PW+TJenkXe0LSNhY9j+2pBz/1EjlogBXR9BiH47kbJy91VAn7efSiPvaFWSsIgOGFgwEjHp2P4m8BlJa21fXzueHSiNvKNVScIiIqItt0u6kNE1U+fb7ktN7SeBmyU9RFlkMJeyR2bENklNWEREtELSNcA6RtdMLbD9st5aXSVpJrAfJQm7f7CTvqSjbd9ULbjonCRhERHRCklrbB+0tXN91dVtmaKeabUDiIiI3tgo6fCRA0mHUdr9TBWd7IMW9aQmLCIi2nIGcIWkXZvj7wKnVIxnomVqKV6RJGEREdEK2/cCCyTt0hw/N3hd0im2Lx/64ogpKNORERHRKtvPjU3AGh+f8GAm1sO1A4huyUhYRERMlM7XTEk6FNiLge9P21c0971ZBRoTI0lYRERMlE7XTElaAswD1vBSk1YDV1QLKjotSVhEREyUro+EvRXY3+ntFC1JTVhERLRC0vStPGXlhASy46wD9qgdRPRHmrVGREQrJK0HrgYus/3vteNpi6TrKNOOs4GDgLuAFzvl235PpdCi45KERUREKyTNBk6k7Kc4DfgicOVmVkp2hqR3bOm67eUTFUv0S5KwiIhonaQjgC8DcyijY39s+8G6UW0fSZ+x/ftbOxcxXqkJi4iIVkiaLuk9kv4RuAj4LLA3cB3wz1WDa8fRQ879/IRHEb2R1ZEREdGWB4CbgQts3z5w/upmZKyTJH0Y+Aiwt6S1A5dmA7cPf1XE1mU6MiIiWiFplu3na8fRtmYvzN2ATwNnD1zaYPs7daKKPkgSFhERrZC0E3A68JPATiPnbZ9WLaiWNW04Xs/ojvnfrhdRdFlqwiIioi1LKH20jgGWAz8BbKgaUYskfRT4H+Am4KvNbWnVoKLTMhIWERGtkLTa9k9LWmv7QEkzgBtsH1U7tjZIehA4xPbTtWOJfshIWEREtOUHzf0zkg4AdqVsdt0XjwLP1g4i+iOrIyMioi2XSNoNOAf4CjAL+KO6IbXqIeAWSV9ldMf8C+uFFF2WJCwiIraLpDMHDk9t7v+yud95gsPZkb7d3F7d3CK2S5KwiIjYXrOb+/2ARZRRMIBjgRVVItoBbJ8HL27P5D6244iJlcL8iIhohaQbgffa3tAczwausv1zdSNrR1PntgTYvTn1FPB+2/9WL6roshTmR0REW94IvDBw/AL9Ksy/BDjT9lzbc4FPAF+oHFN0WKYjIyKiLUuAu5q9Iw0cD1xeN6RW7Wz75pED27dI6lPNW0ywTEdGRERrJC0E3t4crrC9umY8bWqSy1WUZBPgZOCtto+rF1V0WZKwiIiIcWjab5wHHAaIsujgU7afqRpYdFZqwiIiIsZnHrAn5btzBrCYHq3+jImXkbCIiIhxkPRN4CxgHfCjkfO2H6kWVHRaCvMjIiLG50nb19UOIvojI2ERERHjIGkx8OvAMkZvW3RttaCi0zISFhERMT6nAvMp9WAj05EGkoTFNkkSFhERMT4LbP9U7SCiP7I6MiIiYny+Jmn/2kFEf6QmLCIiYhwk3UdpU7GeUhMmykbeB1YNLDorSVhERMQ4SJo77HxaVMS2ShIWERERUUFqwiIiIiIqSBIWERERUUGSsIjoHEmbJK0ZuO21DT9jjqSPtB9dRMT4pCYsIjpH0vO2Z23nz9gLWGr7gFf4uum2N23Pe0dEQEbCIqInJE2XdIGkr0taK+lDzflZkpZJWiXpG5J+qXnJnwLzmpG0CyS9U9LSgZ/3F5I+0Dx+WNK5km4DTpA0T9K/SLpH0q2S5jfPO0HSOkn3Sloxsf8FIqJr0jE/IrroNZLWNI/X2z4eOB141vYiSTOBlZJuBB4Fjrf9nKTXUhpufgU4GzjA9kEAkt65lff8P9uHN89dBpxh+wFJhwB/BRwFnAscY/s/Jc1p9yNHRN8kCYuILto4kjwNeDdwoKR
faY53BfYFHgP+RNIRlP3+fhx4/Ta8599DGVkDDgWukjRybWZzvxL4W0n/QPYTjIitSBIWEX0h4GO2bxh1skwpvg442PYPJD0M7DTk9T9kdInG2Od8r7mfBjwzJAnE9hnNyNgvAmskHWT76W35MBHRf6kJi4i+uAH4sKQZAJLeLGlnyojYE00CdiQw0vV8AzB74PWPAPtLmilpV2DxsDex/RywXtIJzftI0oLm8Tzbd9o+F3gK2LP9jxkRfZGRsIjoi0uBvYBVKvOETwLHAV8CrpN0N7AGuB/A9tOSVkpaB1xv+5PNNOJa4AFg9Rbe6yTgryWdA8wArgTuBS6QtC9lVG5Zcy4iYqi0qIiIiIioINORERERERUkCYuIiIioIElYRERERAVJwiIiIiIqSBIWERERUUGSsIiIiIgKkoRFREREVPD/G1zGNi12ttsAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Grid Model for Multiclass Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtt['column'][:10], y=feat_imp_tuned_dtt['weight'][:10],data=feat_imp_tuned_dtt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Multiclass tuned\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RF_DT_GBT_Binary.ipynb b/RF_DT_GBT_Binary.ipynb new file mode 100644 index 0000000..a6110d6 --- /dev/null +++ b/RF_DT_GBT_Binary.ipynb @@ -0,0 +1,1596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import the required libraries\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import 
*\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.classification import GBTClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml import Pipeline\n", + "from sklearn.metrics import classification_report\n", + "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. 
The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# reads the train data\n", + "us_train_cat = spark.read.csv(get_training_filename('USAccident_train_categorical.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# reads the test data\n", + "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_categorical.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# creates a vector assembler\n", + "va = VectorAssembler().setInputCols([i for i in us_train_cat.columns if i!='Severity']).setOutputCol('features')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# creates a string indexer\n", + 
"label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we convert the multiclass data into binary data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(((us_train_cat[\"Severity\"]==4) | (us_train_cat[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "us_test_cat=us_test_cat.withColumn(\"Severity\",when(((us_test_cat[\"Severity\"]==4) | (us_test_cat[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating the evaluator for our binary classification\n", + "evaluator_rfb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RF Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an initial RandomForest model.\n", + "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Train model with Training Data\n", + "rfModel = Pipeline(stages=[label_stringIdx,va, rf])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "rf_fit = rfModel.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC ROC score: 0.7634901876492165\n" + ] + } + ], + "source": [ + "print(\"AUC ROC score:\",evaluator_rfb.evaluate(rf_fit.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Accuracy: 0.6892473173146102\n" + ] + } + ], + "source": [ + "# Prints the accuracy of our binary classification\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "evaluator_rfb.evaluate(rf_fit.transform(us_test_cat))\n", + "binary_prediction=rf_fit.transform(us_test_cat).select(\"prediction\").collect()\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_fit.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_fit.stages[-1].getNumTrees" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gini'" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_fit.stages[-1].getImpurity()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# tranforming the test data for predictions\n", + "prediction_rfb=(rf_fit.transform(us_test_cat)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# storing the true labels for evaluation purpose below\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.69 0.98 0.81 131571\n", + " 1 0.73 0.09 0.16 64408\n", + "\n", + " micro avg 0.69 0.69 0.69 195979\n", + " macro avg 0.71 0.54 0.48 195979\n", + "weighted avg 0.70 0.69 0.59 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_rfb,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# create a dataframe to print the feature importance\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rf = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], rf_fit.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAALhCAYAAAAAZ+NaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3Xe8bFV9///XG64gBikKMSJSjFgQGyIWLF9iw+83ikaMEv2KxkiMwW6+sUQwqImaYozdREysiKj5oUEIsWFD6QgiilhAiigCCpaAn98fex0YDufeew535s6aua/n4zGPM7P3lM+Us+c9a6+1dqoKSZIk9WmjaRcgSZKk1TOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZtYJLcJck1U67h5kkqyfbTrEODJC9I8qMkP0/yW9OuZ5KSvC7Jv067DmklDGuamLbhXzj9JskvRi4/ZcyP9ZQkX2mPccwS6++b5LQkVyf5WpLd1nBfJyT55aL6772O9U09IG2IeguFSS5O8qBp1zGqhbPXAw+uqs2r6qr1/Ph3ae/Rwv/aeUletD5rmIQk+7Tt3uh25CPruQaD6ZxYNe0CNL+qavOF80m+B/xJVf33hB7uJ8A/APcG7jO6IslmwP8HvAZ4N/A84ONJ7lpVqwtQf1JV759QrSuWZCOAqvrNtGvRyiVZtYbP2rTdFti4qs5ZauV6qv3ahe1FkgcAn0lyYlV9YcKPO2nnVdUd1+UOOv/saD2xZU1Tk2SzJG9NclGSC5L8XZKbtXX7JDk3yV8nuaz92n7i6u6rqo6pqiOBi5ZY/Qjgl1X1tqr6FUOouyWw4haOJLsl+UySnyY5O8njRtY9PsnpSa5M8v0kLx+56fHAxqMtdYt/9S5ufWstfIcm+SpwNbBdklsleW9roTk/ySELQa7d/otJrkhyaZL3ruW5PLu99hcmee7I8r2SfLXdz4VJ3phkVVu3cZK3tPu/oj3fO7d1myX5p1bXxUnenGTTkft9RZJLklwAPHUtte
2Q5Oj23n8ryQEj616X5ANJPpTkZ0nOSHKvNd3fErf9cHsfTkuyc3sdf5zke0n2XvQevDrJye35fjTJliPrn5DkG0kuT/LfSXYZWXdxkpckOQu4srWq/DbwX+2xn5dkVbvPS9p9fHbh9Wz3cXh7TY9tz/VLSXYcWX/Pkc/jxUlePPI+vbL93/y4Peetlng97g6czvWfzU/l+tbIP0vyHeDMdt2HJjmlvQ4nJLnvotfpVRlarX+e5GNJbp3kiAz/Dydkma2bVfUV4NvAde9pkoOTfLe9Bmcm+T8j656d5NNJ/rm9ht9J8vCR9Xdsr9vPknwK2HrRa7C29/BFSc5qz+vtSW6b5Lj2vI5JssVynteix1zOtu+VSS4B3t6WP7591i9P8oUku47c3yvbfV2ZYbv04AzbphcBB7Tav7bSOtWRqvLkaeIn4HvAwxctewPwBWAb4DbAicAr2rp9gGuAvwU2AR7OEFh2XsvjHAQcs2jZy4CPL1r238Cfr+Y+TgCeusTyLRjC4FOAjYH7ApcBd2zrHwbcjeFH0O5t3T5t3V2Aaxbd3+uAfx25fIPrtDrOA+4M3IyhJfxTwJuBWzC0iJwKHNCu/3HgJUCAzYC9VvP87gIU8O/tevdutT6ord+zPbeNgd8FzgWe3dbtC3ylvRYbtef7223dO4Ajga2ALYFjgUPauscBP2yPvTnw0VbD9qup8avAG4FNgT1afXuNvG5XM4Twjdv1Prea+7n56OOM3Hbv9np+GPhue91WAc8Fzl70Hnx/pO5PLLxnwG7Az4D/xfAZfSVwNrCqrb+Y4TO9HbDZyLIHjdz/KuCAdt83Z/hiPmFk/eHAjxg+Tzdrr++/tXVbA5cyfOY3be/Jfdu6lzL8b23X7vffgPes4fNwzRKv2X+293IzhpB5JfCHreant8fecuR1OhvYCbgVQ9j6JvDQkdf57Wt7fIbP7oOBXwKPHrnOkxg+7xsB/7e97tu0dc8G/gd4Wvs8vBD43shtT+H67cjD2vu/kvdwYRu1A/BT4GvA3dvr8kXgL1fzvPYBzl3NuuVs+w5tNW0G3J9h23Of9hwPBL7VXtt7MmwnbtNevzvQtpMs2sZ4mt3T1AvwtGGcWDqs/RD4vZHL+wLfbOf3aRvsm4+sPwr4i7U8zlJh7bW0L7iRZR8FXrqa+zgBuAq4vJ2+3JYfABy36Lr/voaN9TuAv23nb2pYe/nI5R1bXTcbWfYM4FPt/BHAW4DbruU1WghrO40s+2fgrau5/kuBD7Xz/xs4iyHQZeQ6q4BfA7cbWbY3LfgAHwReNbLuHqwmrAG7tPd+s5FlbwTeMfK6fXJk3e7A5aupfamw9omR9U9k2IWednnbdv2FcHXCorp3B64a+Vy9d2TdxgwB5v7t8sXAHy2q5wZhbYl6fwf4De1zzxDW3jKy/g+A00be+6+s5n6+y0hYB3ZmCClZ4rqrC2sPHFn2LOD4Rbc7FXjyyOv04pF1b2XkB1J7nU9YTa0Ln8fLgV+0869dy2f4m8Cj2vlnA2eOrLtVu4+tgDtx4+3Ix7g+rC3nPXzCyPr/BN44cvkvgMNXU+M+wLVcvx25HHhsW7e2bd/i//P30MLcyLLvA/dj+MF0Ee0HyKLrGNbm5ORuUE1FkjB8MX1/ZPH3gduNXL60qn65aP12N+Hhfs7Q6jBqC4Zf1Kvzp1W1VTs9sC3bEXhI2w1xeZLLgScw/OJf2H34+bRdhAytD9vchHpHnT9yfkeGL9JLRx7/TQy/qGFoUbgFcGrbXbLGXY2L7vu61zbJrm132CVJrgQOHnken2Lo9/dO4JIkb0uyebvtzYCzRmr7D4YWGdr6xY+3OtsxvPe/WHT90c/GxSPnr2ZomVquS0bO/6I9Vo1cBhgdEbm47ltk2BW6HSPPo6quZfgSvt1qbnsjbTfo37fdlVcyhJAAtx652uqe6+2B7yxxn2nrjh55L05laJW69eLrr8Fo7Td4rs3i92Tx67r48preo2uraiuG7g
mvAPZO2/UOkOSZI7sALwfuyA3/txa/RrTHW/gsLd6OLPm8VvMersvz+u7IdmSrqjpqmdu+i6vqf0Yu7wi8fNG2Z1uGH0dnMfygei3wo7bL+zZorhjWNBXty/Fiho3Qgh0YNpQLtkly80XrL7wJD3cWw64C4LrO+ru15StxPvBfiza+m1fVC9r6Ixh299y+qrZk2PWUtq5ufHdcxRCuFvzOEtcZvd35DMFz65HH36Kqdgeoqh9W1R8zhMfnAYcl2WENz+f2I+dHX9t/Ydh19LtVtQXD7pi0x6iq+sequjdD69g9gecz/LK/pt1mobYtq2ohHFy0xOOtzoXAthkGhoxe/4eruf6kLa776qq6gqHO6z6/STZm+MIdrXPx+7748jOARzK0imzJ0MoE139u1uR8ht3UN3yA4X9roeVm9LN686r68TLud6lab/Bcm7G/JzV0pF/YZfknAEnuxLDr/0DgVi3UncvyXqOLWHo7smA57+FYLXPbt/hzcj5w8KL38xZV9bF2n//eflTegeEH3WtWcz+aUYY1TdOHgENaR+TfZvhFPToC82bAK5NskuT3GPoofXSpO8rQofrmDLvjNsrQSXrhl/lxwGatI/KmDC1QVzH0N1mJ/wDuneRJSW7W6rp/kju1X8ubAz+pql8meSDDrp8FP2LoxD36RXEaQwvC7ZJsDfzlmh68qr7LsLvpDUlumWSjJLukTQXR6tqufRlc3m62plFkh7SOzvdk6Af04bb8lsAVVfXzJHdj2AVGe4z7J9mjvbZXMez6vLa1AhwGvCnJNhncPskj2k2PAP6kvVabM7TWrc65wBnAa5JsmmR3hl3QH1jT6zNBTx+p+1Vc/zp9GHh8koe0zuEvZdiletIa7usShi/UBbdk2E33E4bWvNcsdaPV+A/gjhkGAmySZItc3+n/HcDrktweIMlvJ3nMCu57saMYPvv7tdbApzEEjBtNk7Ou2uf3dcDL2uu6OcOu4UsZ/refzdCythzfYmitXNiO7M2wm3HBTXkPx2Ft277F3gU8t/3vJcnmSR6b5BatJfyhbdv2i3a6tt3uEmDntn3SDDOsaZoOBr7B0MJ1GvAlho63C77HEDYuZggCz6iq81ZzX89i2Ei9kSHU/YKh/xZtd9q+DH1bLgeeDDyuVjgcvqp+CjyKoTXkIoZf5a9h6FtS7f7/PsnPgP8HfGTRbd8AnNx2Y9yLof/LJ9trcALDl+/a7M/QF+ebDJ3uP8z1u0Ef0O7/5+2xD6yq1bVEXsvQif+7DF+4h1bV8W3dCxmC1c8Z+h59eOR2WzG0GF7O0Kn5+wz93QBe0F6Tk4Ar2v3esT3/jzN84Xyh1X7s6p5gey3/ENiV4b3/MENfxWlN4/A+hi/XHzKEhhe3Os8AnsmwS/hShs7r+67lc/Va4LXtM3AQwy7lSxme59dZwQ+I9pl6BMPn+UfAOVw/wvkNDINoPtM+j19m6G93k1TVJcBjGULFTxj6hv5+VV2+xhvedB9jGDTw9Ko6hSF8nsTwf7czywxT7bP0JIaWy8sY/i/fP7L+pryH47C2bd8NVNWXGFrL38nwv/ct4I9o/SsZRrj/mOH1Gf0xdDhD6/1lSb48iSei9WOhU63UlST7MHSsXqc5iqR1keQEhs9hN3PuSdrw2LImSZLUMcOaJElSx9wNKkmS1DFb1iRJkjpmWJMkSerYqrVfZTZss802tdNOO027DEmSpLU6+eSTf1xV2y7nunMT1nbaaSdOOmnS8xhKkiStuyRrOuzeDbgbVJIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6tiqaRcwaff5i/dOu4QVO/nvnjbtEiRJUi
dsWZMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6NtGwlmSfJOckOTfJS5dY/6Ik30hyRpJPJ9lxZN0BSb7dTgdMsk5JkqReTSysJdkYeCvwaGBXYP8kuy662qnAHlV1D+BI4A3ttrcCDgHuB+wJHJJk60nVKkmS1KtJtqztCZxbVedV1a+Bw4F9R69QVZ+tqqvbxROA7dv5RwHHVdVlVfVT4DhgnwnWKkmS1KVJhrXbAeePXL6gLVudZwKfuom3lSRJmkurJnjfWWJZLXnF5KnAHsBDV3LbJAcCBwLssMMON61KSZKkjk2yZe0C4PYjl7cHLlx8pSQPB14BPLaqfrWS21bVu6pqj6raY9tttx1b4ZIkSb2YZFg7Edglyc5JNgGeDBw1eoUk9wbeyRDUfjSy6ljgkUm2bgMLHtmWSZIkbVAmthu0qq5JchBDyNoYOKyqzkpyKHBSVR0F/B2wOfCRJAA/qKrHVtVlSV7NEPgADq2qyyZVqyRJUq8m2WeNqjoaOHrRsoNHzj98Dbc9DDhsctVJkiT1zyMYSJIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUsdWTbsArZsfHHr3aZewIjsc/PVplyBJ0kyxZU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjq2atoFSGuy15v3mnYJK/Kl535p2iVIkuaMLWuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHVs17QKkDdXnH/LQaZewYg89/vPTLkGSNjgTbVlLsk+Sc5Kcm+SlS6x/SJJTklyTZL9F665Nclo7HTXJOiVJkno1sZa1JBsDbwUeAVwAnJjkqKr6xsjVfgA8HXjJEnfxi6q616Tqky
RJmgWT3A26J3BuVZ0HkORwYF/gurBWVd9r634zwTokSZJm1iR3g94OOH/k8gVt2XLdPMlJSU5I8rjxliZJkjQbJtmyliWW1Qpuv0NVXZjkDsBnkny9qr5zgwdIDgQOBNhhhx1ueqWSJEmdmmTL2gXA7Ucubw9cuNwbV9WF7e95wOeAey9xnXdV1R5Vtce22267btVKkiR1aJJh7URglyQ7J9kEeDKwrFGdSbZOsmk7vw2wFyN93SRJkjYUEwtrVXUNcBBwLHA2cERVnZXk0CSPBUhy3yQXAE8E3pnkrHbzuwInJTkd+CzwukWjSCVJkjYIE50Ut6qOBo5etOzgkfMnMuweXXy7LwN3n2RtkiRJs8DDTUmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktSxFYe1JFsnucckipEkSdINLSusJflcki2S3Ao4HXhPkn+cbGmSJElabsvallV1JfAHwHuq6j7AwydXliRJkmD5YW1VktsCfwh8coL1SJIkacRyw9pfA8cC51bViUnuAHx7cmVJkiQJYNUyr3dRVV03qKCqzrPPmiRJ0uQtt2XtzctcJkmSpDFaY8takgcADwS2TfKikVVbABtPsjBJkiStfTfoJsDm7Xq3HFl+JbDfpIqSJEnSYI1hrao+D3w+yb9V1ffXU02SJElqljvAYNMk7wJ2Gr1NVf3eJIqSJEnSYLlh7SPAO4B/Ba6dXDmSJEkatdywdk1VvX2ilUiSJOlG1jYa9Fbt7CeSPAf4OPCrhfVVddkEa5MkSdrgra1l7WSggLTLfzGyroA7TKIoSZIkDdY2GnTn9VWIJEmSbmxZfdaS/MESi68Avl5VPxpvSZIkSVqw3AEGzwQeAHy2Xf5fwAnAnZIcWlXvm0BtkiRJG7zlhrXfAHetqksAktwGeDtwP+B4wLAmSZI0Acs9kPtOC0Gt+RFwpzYa9H/GX5YkSZJg+S1rX0jySYbJcQGeAByf5LeAyydSmSRJkpYd1v6cIaDtxTCNx3uBj1ZVAXtPqDZJkqQN3rLCWgtlR7aTJEmS1pO1HcHgi1X1oCQ/Y5gE97pVDBlui4lWJ0mStIFb26S4D2p/b7l+ypEkSdKo5Y4GJcmDkjyjnd8miUc3kCRJmrBlhbUkhwB/CbysLdoEeP+kipIkSdJguS1rjwceC1wFUFUXAu4alSRJmrDlhrVftxGhBdDmV5MkSdKELTesHZHkncBWSZ4F/DfwL5MrS5IkSbD2qTteAHwJ+CeGyW+vBO4MHFxVx02+PEmSpA3b2ibF3R54E3AX4Azgywzh7eQJ1yVJkiTWPs/aSwCSbALsATwQ+GPgX5JcXlW7Tr5ESZKkDddyjw26GbAFsGU7XQh8fVJFSZIkabC2PmvvAu4G/Az4KsNu0H+sqp+uh9okSZI2eGsbDboDsClwMfBD4ALg8kkXJUmSpMHa+qztkyQMrWsPBF4M7JbkMuArVXXIeqhRkiRpg7XWPmttMtwzk1wOXNFOvw/sCRjWJEmSJmhtfdaex9CithfwPwzTdnwFOAwHGEiSJE3c2lrWdgKOBF5YVRdNvhxJkiSNWluftRetr0IkSZJ0Y8s9NqgkSZKmwLAmSZLUMcOaJElSxwxrkiRJHT
OsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxyYa1pLsk+ScJOcmeekS6x+S5JQk1yTZb9G6A5J8u50OmGSdkiRJvZpYWEuyMfBW4NHArsD+SXZddLUfAE8HPrjotrcCDgHuB+wJHJJk60nVKkmS1KtJtqztCZxbVedV1a+Bw4F9R69QVd+rqjOA3yy67aOA46rqsqr6KXAcsM8Ea5UkSerSJMPa7YDzRy5f0JZN+raSJElzY5JhLUssq3HeNsmBSU5KctKll166ouIkSZJmwSTD2gXA7Ucubw9cOM7bVtW7qmqPqtpj2223vcmFSpIk9WqSYe1EYJckOyfZBHgycNQyb3ss8MgkW7eBBY9syyRJkjYoEwtrVXUNcBBDyDobOKKqzkpyaJLHAiS5b5ILgCcC70xyVrvtZcCrGQLficChbZkkSdIGZdUk77yqjgaOXrTs4JHzJzLs4lzqtocBh02yPkmSpN55BANJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6ZliTJEnqmGFNkiSpY6umXYCk+fSWF39i2iWs2EH/8JhplyBJN2LLmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdm2hYS7JPknOSnJvkpUus3zTJh9v6rybZqS3fKckvkpzWTu+YZJ2SJEm9WjWpO06yMfBW4BHABcCJSY6qqm+MXO2ZwE+r6o5Jngy8HnhSW/edqrrXpOqTJEmaBRMLa8CewLlVdR5AksOBfYHRsLYv8Kp2/kjgLUkywZokaSxe+9T9pl3Cir3i/UdOuwRJN8Ekd4PeDjh/5PIFbdmS16mqa4ArgFu3dTsnOTXJ55M8eKkHSHJgkpOSnHTppZeOt3pJkqQOTDKsLdVCVsu8zkXADlV1b+BFwAeTbHGjK1a9q6r2qKo9tt1223UuWJIkqTeTDGsXALcfubw9cOHqrpNkFbAlcFlV/aqqfgJQVScD3wHuNMFaJUmSujTJsHYisEuSnZNsAjwZOGrRdY4CDmjn9wM+U1WVZNs2QIEkdwB2Ac6bYK2SJEldmtgAg6q6JslBwLHAxsBhVXVWkkOBk6rqKODdwPuSnAtcxhDoAB4CHJrkGuBa4NlVddmkapUkSerVJEeDUlVHA0cvWnbwyPlfAk9c4nYfBT46ydokSZ
JmgUcwkCRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZLOIQ6TAAAgAElEQVQ6ZliTJEnqmGFNkiSpY4Y1SZKkjhnWJEmSOmZYkyRJ6phhTZIkqWOGNUmSpI4Z1iRJkjpmWJMkSeqYYU2SJKljhjVJkqSOGdYkSZI6tmraBUiS+nP2az8z7RJW5K6v+L1plyBNjC1rkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdWzXtAiRJWt9e9apXTbuEFZm1ejVetqxJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSx1ZNuwBJkjQ+R3xkz2mXsGJ/+MSvTbuErtmyJkmS1DHDmiRJUscMa5IkSR2zz5okSZoZ9zzy2GmXsGKn7/eodbq9LWuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1DHDmiRJUscMa5IkSR0zrEmSJHXMsCZJktQxw5okSVLHDGuSJEkdM6xJkiR1zLAmSZLUMcOaJElSxwxrkiRJHTOsSZIkdcywJkmS1LGJhrUk+yQ5J8m5SV66xPpNk3y4rf9qkp1G1r2sLT8nyaMmWackSVKvJhbWkmwMvBV4NLArsH+SXRdd7ZnAT6vqjsAbgde32+4KPBm4G7AP8LZ2f5IkSRuUSbas7QmcW1XnVdWvgcOBfRddZ1/g39v5I4GHJUlbfnhV/aqqvguc2+5PkiRpg5KqmswdJ/sB+1TVn7TL/xe4X1UdNHKdM9t1LmiXvwPcD3gVcEJVvb8tfzfwqao6ctFjHAgc2C7eGThnIk9madsAP16Pj7e++fxmm89vds3zcwOf36zz+Y3PjlW17XKuuGqCRWSJZYuT4equs5zbUlXvAt618tLWXZKTqmqPaTz2+uDzm20+v9k1z88NfH6zzuc3HZPcDXoBcPuRy9sDF67uOklWAVsCly3ztpIkSXNvkmHtRGCXJDsn2YRhwMBRi65zFHBAO78f8Jka9sseBTy5jRbdGdgF+NoEa5UkSerSxHaDVtU1SQ4CjgU2Bg6rqrOSHAqcVFVHAe8G3pfkXIYWtSe3256V5AjgG8A1wJ9X1bWTqvUmmsru1/XI5zfbfH6za56fG/j8Zp3PbwomNsBAkiRJ684jGEiSJHXMsCZJktQxw5okSVLHDGsrkOSZiy5vnOSQadUjSZq+JDdfYtk206hlfUhys2nXsKExrK3Mw5IcneS2SXYDTgBuOe2ixiXJq9t8dwuXt0jynmnWNG5Jdkzy8HZ+syQz/f4lOTXJKUucTk1yyrTrG6ckv73EsjtPo5ZxS7L7EqffHf1/nGUL/3OLlh2w1HVn1IlJ7r9wIckTgC9PsZ6xSfLpJDuMXL4PczKVVpKN2pGUujcXG4L1par+KMmTgK8DVwP7V9WXplzWOK0CvprkGcDvAG9up7mQ5FkMhye7FfC7DJMtvwN42DTrWkf7TbuA9egLSV5ZVUcAJHkx8Exg1+mWNRZvA3YHzmA4gstu7fytkzy7qv5rmsWNwcEtwLwE2Bz4V+BXXH9s6Fn3R8BhST4HbA
fcGvi9qVY0Pv8IHJfkH4DbMRy7+1nTLWk8quo3SU5PskNV/WDa9ayJU3esQJJdGDYuXwfuyjAP3Iuq6uqpFjZG7RfwJ4CfAg+pqnOnXNLYJDkN2BP4alXduy37elXdfbqVaTmS3JZhDqRfArcBzgZeXFU/n2phY5DkcODVVXVWu7wr8BfAq4GPVdW9plnfukoS4MXAn7ZFB1fVh6ZY0tgleRzwPuBnzN+286HAfzMcM/NeVXXJlEsamySfAe7L0Fp41cLyqnrs1Ipagi1rK/MJ4KCq+u+28XkRw5Ea7jbdssYjyUOANwGHAncH3pLkj6tqXg719auq+vXw1l13iLO5+LWS5L4MraB3BTZlaJ35VVVtMdXCxqiqLkpyDPAy4DfAy+YhqDV3WQhqAFX1jST3rqrzFj6vM25r4H7AdxhatHdMkpqT1oIk72Zorb8HcCfgE0neUlVvnW5l6y7Jy4CnMLQU3gP4XJIXVNWx061sbP562gUsh2FtZfasqisB2kbmH5IsPoTWLPt74IlV9Q2AJH8AfAa4y1SrGp/PJ3k5sFmSRwDPYQjg8+BtwFOBwxlaD5/ODY+vO/OSHAdcxLCLcHuG3U7HV9VLplvZWJyT5O0M7x/Ak4BvJdkU+J/plTU2JwCvq6rDkmwGvB74EvDA6ZY1NmcCf9K+F77b+q/945RrGpftGb77rmboivAphqMPzUVYq6rPT7uG5XA36AokuQVDU/4OVfWstlv0zlX1ySmXNhZJNl58WK8kt66qn0yrpnFKshFDH6dHMrQ8HQv86zz8uk9yclXdZ3S3bpIvV9W8fBmS5HFV9R8jl1cxtK69eopljUULMM8BHsTw2fwiQwD/JXCLWW9BXKpPUJKHVNXx06pp3Np7uENVnTPtWiYhyaZV9atp1zFuLVgv7JXYhOHwmFf1tlfCsLYCST4MnAw8rap2a/+cX5n1/iQLktwG+BvgdlW1T+s384CqeveUSxuLJL8F/HIhkCbZGNh0HvocJjkeeDhwGPADhhaoZ1XVPaZa2Jgl2RHYpXVF2AxYVVU/m3ZdWrPWbeQpwB2q6tA2uvB3qmpeRhU+hmHPxCZVtXOSewGH9tbv6aZIsidDS9qWVbVDknsytCI+d8qljUWSkxiOS/4RYA/gaQzbmJdPtbBFnLpjZX63qt5A2y1RVb9g+BU8L/6NobXptu3yt4AXTK2a8fs0sNnI5c0YOs3Og6cz/D8fBFwL7MKcjRRto3mPBN7ZFm0P/MfqbzE7kuyV5Lgk30py3sJp2nWN0duABwD7t8s/A2a+P9eIVzF0P7gcoKpOA3aeZkFj9M/A7wM/Aaiq04G9p1rRmLXBIBtX1bVV9R7gf025pBuxz9rK/Lr9mi+AJL/LMPx8XmxTVUe0DqVU1TVJrl3bjWbIzUd3J1XVz9uu7ZlXVQtf7L8EXjnNWiboz2mjeQGq6ttLzb02o94NvJCh5X6e/ucW3K+qdk9yKkBV/TTJJtMuaoyuqaorFg0GmZfdVhtV1fcXPbd5+oxe3T6LpyV5A8Neid+ack03YlhbmUOAY4DbJ/kAsBdDi8a8uCrJrbk+jN4fuGK6JY3VVUl2r6pT4LrJHX8x5ZrGor1XhwA7MvJ/XVV3mlpR4ze3o3mBK6rqU9MuYoL+p3U7WNi2bMswondenJnkj4CNW1/m5zEnk+IC57ddodXew+cy7HWZF/+X6/dKvJBhYNYTplrREuyztkItzNyfYffnCVX14ymXNDZJdmfoaLkbw+imbYH9quqMqRY2Jm16i8OBhalIbgs8qapOnl5V45HkbOD/sahlZs7mQ3oDw26mpzF8YTwH+EZVvWKqhY1BktcxdGz+GCOt9Qs/LGZdkqcwjHDdnWGuyv2Av6qqj0y1sDFpLfSv4IaDl15dVb+camFj0Fqv/5mhTywMXUcOmrPvvu4HhxjWlqGFmNWalw0qXNdacWeGDc45VTUP0wZcJ8Mx7Rae3zfn5fkl+WpV3W/adUzSnI/m/e
wSi6uq5mUWfJLcheFoIQE+XVVnT7kkaWYGhxjWlmFkQ3pzhtEipzNscO7BMBv+g6ZV2zi0+dRWq6o+tr5qmbQkDwR24oa7Ct87tYLGJMnftrOLW2bmolVUsynJrda0vqouW1+1TEKST7CGXfG9feGvRJI3subn9qL1WM7EJDmZYcLfz40c2eaM3kbS22dtGapqb7jukDAHVtXX2+XdGI51N+se0/7+NsMklZ9pl/cGPscQAGZekvcxzDJ+GtfvKixg5sMaw/xco39heG4PmUItY5Xk66z5S6OrjepKJHlqVb0/yZJffFU16xOrnszw3gXYgeEwdgG2YphiZtZHTP59+/sHDMdTfn+7vD/wvWkUNEYLBzi/P0PXmCPa5f0YjtwzL5YaHNIdw9rK3GUhqAFU1ZmtyXSmVdUzAJJ8Eti1qi5ql2/LfA2v34Ph+c1dc3JVPXjaNUzQ77e/f97+vq/9fQow63PkLYw6u+VUq5iQqtoZIMk7gKOq6uh2+dFc3wdqZi3Mfp/k1VU1+sPoE23uw5m1ML9m62/4kIUuI0neyjDQbl7MxOAQd4OuQJIPMRzo9f0MvxafCmxeVfuv8YYzIsmZVbXbyOWNgDNGl82yJB8BnrcQRudJkuctsfgK4OSqOnOJdTMnyZeqaq+1LVN/Fo6wsWjZSVW1x7RqGqc2wOf/LEyhk2Rn4Oiquut0K1t3Sc5hmHrl8nZ5K4buP3eebmXjsWhwCAx9YV/T2+AQW9ZW5hnAnwHPb5ePB94+vXLG7nNJjgU+xBBGnwws1fF5Vm0DfCPJ17hhv66Z7Vcy4oHAfYGFQ5/9b+BrwPOTfKCq/mFqlY3PbyV5UFV9Ea7rf9jdfEg3RRvp+hqGqWSOAe4JvKCq3r/GG86OHyf5K274Q3cuDmPXvJBh+7kw3+FOwJ9Or5yx+juGOcgWJhD/PYbP6ry4QxtR3vWoclvWdANtsMHCLrXjq+rj06xnnJI8dKnlNSMH8l2TFrL3Wzj0UpJbMvQxeQJwUlXtOs36xqHNi3cYsGVbdDnwx/MwGjvJaVV1rySPBx7H8OX/2aq655RLG4s20OAQru9DeTzw17M+wGBUkk2Bu7SL35yn42gmuR1D3zUYpqz64TTrGackX2Q4Jui/AR9caEHsjWFtBZLsxXBYkcUTj95hWjVJcN1umLtX1TXt8ibA6VV11ySnLoxymgdJtmDYds3NhM1JzqqquyX5F+CjVXVMktPnJaxtCOZ1pDlAkt9hGCAy+ty669d1UyW5E8Oesycy7JF4T1UdN92qbsjdoCsz14eEaa1qr2cYFZp2qqraYqqFjUmb5f/NwF0ZfkltDFw1J8/vCOArSRaOlflY4IgMB6/vdqLHlWgtF0+gfSEujN6qqkOnWNa4fCLJNxl2gz6nzfDfVZ+ZddG+DF/CjcPMXMwjN88jzZP8DcNu67O5/qgTxdDVYi5U1bfabvqTGCYAvneGDczLe5m6ypa1FZj3iUeTnAs8Zl4nq0xyEkM/vI8wjAx9GrBLVb18qoWNSZL7MUzdEeCLVXXClEsaqyTH0AZNcMOjNMxDfzySbA1cWVXXtk7PW1TVxdOuaxySnA68gxu/dzN/9BC4rmV7LkeatwEG9+ytw/24JLkHQ6va/wGOA95dVack2Q74SlXtONUCG1vWVuazSf6OOT0kDHDJvAa1BVV1bpKNq+pa4D1JZropP8lvVdVVbdfg2e20sG6LqrpyetWN3fZVtc+0i5iEJE8EjmlB7a8YDsv0GmAuwhrDXFbzNBhrsTMZ5lmbu5HmwHcZjp05r94C/AtDK9p1x4quqgvb/2IXDGsrs9CqNjrcvBhGx8yDk5J8GPgPbhhGu2gGHoOrW1+u09rou4uY/dGERwKPBs7ihhPHpl3eYRpFTciXk9x9dK7DOfLKqvpIkgcBj2KYbPXtXL/NmXWfSPIc4OPccNsyLwMM5nmk+c+AU9to0NHnNhdHMFg0P97ide9L8tGqmvqB3d0Nquskec
8Si6uq/ni9FzMBSXYELmHor/ZChlGFb62q70y1MC1Lkm8Ad2T4pf8rru9TObNHMFiwMAikHTbs61X1wXkaGJLku0ssrnkZnDXnI82fudTyhUlz510v/4eGtWVY3aFgFszBIWE2CEmeX1VvWtuyWZLk9sAVC7s7kzwE2JfhUDfvqDk5UD1cF7ZvpKq+v75rGbd29JAfMszqfx+GgQZfczSoNF1JTqmq3adeh2Ft7ZIcsqb1VfXX66uWSUjyZtZ87MWlZsefOUv90/Xyq+mmSnICw/xqFyS5J8NxXd8A3B24uqoOnGqBY5A5Pxg4XDeL+j4MrWrfbod6u3tV/deUS1snbYT5as16F4skP2PpbefMj6RPcipr/l6YeoBZH3oJa/ZZW4blhrEkL6uqv510PRNw0rQLmKQk+wN/BOyc5KiRVVsw+7Oo36KqLmjnnwocVlWvb4cKO32KdY3T6MHAFytg5nelVdXVSb4DPCrJo4AvzHpQax6zhnXFMFhrZlXVso7pmmTrqvrppOsZs/2mXUAnujjCuy1rY9RLAp+UJG+uqudOu46VarvPdgb+FnjpyKqfMRz79JqpFDYGSb5eVXdv508GXlFVx7TLZ8xDf67lSnK3qjpr2nXcFEmeDzyL68PL44F3VdWbp1fV+pPkgKr692nXMSnz/N2Q5ItV9aBp17FSST5dVQ9L8vqq+ss1XO+RPfxwsmVtvLpI4BM0kwfMbn2avp/k4cAvquo3bZLOuwCzPrLw80k+yDCy9dYMu0EXZhyfm/5qy/Q+hikvZtEzGQ6WfRVAktcDX2GYxHlD8HxgbsMa8/3dMKsj6m/bBoY8NsnhLHqPFqbk6iGogWFt3Gym7NvxwIPb5KOfZtj9+yTgKVOtat08j2EX722BB1fVr9vy7YBXTq2q6ZjlL8Rww6OiXMtsP5+VmvfnOs/fDbP63A5m2NOyPbB4kGB3U3IZ1sZr3jc4sy6tb9AzgTdX1RtaJ9qZVVW/Ad6/xPIbTNQ8q7sqVmhWvzQA3gN8NcnH2+XHMRzebkMxy++dZlBVHQkcmeSVVfXqadezNoa18frItAuYsFkPo0nyAIaWtIW5gzaU/4FZ3VWxQaiqf0zyOa4/XNgzqmqmf0is0KxvW9Zmnp/fTD+3qnp1kscCC5Pjfq6qPjnNmpYyz4eQGJskb0jy7CWWv7D1LQGgqv5m/VY2Ge3g30uZ2fnImhcALwM+XlVnJbkD8Nkp17S+bAgtF79e+1X6k2SjJGdW1SlV9c9V9aYNLKgBfGnaBdxUC+/fWq72sPVSzAS0A7mvadnT118149cmon4+8I12en5b1hVHgy5Dmzl9t7bLaXT5RgyjCXebTmXjleSBwL8Cm1fVDm3erj+tqudMuTSto3kYjZbk0Ko6eOTyxsB7q2qW+xwCkOQDwMuq6gfTrmUSktwG+Btgu6p6dJJdgQfMyyz48/z+rWZ+ytPnZcLmJGcA91r4fm/blVN7G0m/oewCWle1OKi1hb9JMtNNwIu8keG4hEcBVNXpbUb8mZbkn6rqBUk+wRItTHNy/L61mYfP6Q4Lcxkm2ZSh28Epa7vRjLgtcFY7tuRVCwvn6LP5bwz98l7RLn8L+DDz0y9v7t6/JH8KPBu4U5LR/7NbMsx9OE+2AhYm195ymoWsjmFtea5OsktVfXt0YZJdGA4LMzeq6vxF+fPa1V13hryv/f37qVYxQUl2AH5UVb9slzcDtqmq89tVnj6t2sboGcAHkrwM2Bv4VFW9cco1rZMkdwRuAyyeePuhDIefmhfbVNUR7b2jqq5JMg/blgUzfRSb1TiCYdT8jeanrKofTaekifhbhgPVf5bhR+1DGLrLdMWwtjwHA59K8hqu/0WxB8Mb+oKpVTV+57ddoZVkE4ZpIc6eck3rrKpObn8/n2Tbdv7S6VY1dh8DHjhy+TfAR4E9YWglnUZR45BkdBfMm4B3MvRx+nyS3RePfJ0x/wS8vKrOGF2Y5CrgEOan5emqJLemtW
wnuT9wxXRLGp95OGD7Yu2ICz8Fntj2IG3LkBlWJdmuqi6caoFjUlUfaoN77ssQ1v6yqi5eWN/LZNv2WVumJLsBfwEs9E87C/i7qpr1SVWvk2Qbhi/DhzN8aP8LeH5VzfQhmdqG5hDgIIbntRFwDcP0HYdOs7ZxSXJaVd1r0bK56FfSfvGuTlVVV/MhrUQbWLBkn9fRo1PMuha438yw/TyT4Yt/v8UhdVYtOkboJsDNgKtm+digC5L8GfBqhkPzLXQHqqradXpVrT+99Pe1ZW2ZqupM4IBp1zFJVfVjZnuC2NV5AcPRF+5bVd8FaCNB357khbO+K635SZL/XVVHAyT5fa7vgzHTqmrvadcwQTdfw7rN1lsVE1ZVp7TZ4u/M8IPpnKqamyNsLD5GaJLH0Vq158CLgbvO4d6I5eqiv68ta8uw6ODfNzLLnUhhOOYna5jaoaqetx7LGbs28e0jWhgdXb4t8F9Vde/pVDY+7fBZH2Q45BTApcBTq+pb06tqvOZxRGGSDwGfqap/WbT8mcAjq+pJ06lsvJL8OfCBqrq8Xd4a2L+q3jbdyiYnyQlVdf9p17Gu2i7Ch1XVPPUxXLZeWtYMa8uQ5FLgfOBDwFe58THEZrq/QpKFFsO9gF0ZRmkBPBE4uapeOJXCxmQtu5pWu24WJdkKYOFLcZ4k+RRtRGFV3TPJKoYh9jO7q7AF0I8zzBE32h92E+Dxo31nZtlqdtOfOg8/lACS/MHIxY0Y3sOHVtUDplTSOkuy8CP9HsAuwCeBXy2sr6p/nkZd61svYc3doMvzO8AjgP0ZjsP4n8CHeuh0OA5V9e8ASZ4O7L2weyLJOxj6rc26NU2WOpMTqS5Isn/rIPu8RcuBudugzt2Iwqq6BHhgkr25vj/sf1bVZ6ZY1iRslCTVWgfaXFabTLmmcXrMyPlrgO8B+06nlLHZtv29qJ1mvv/dTdTFd4RhbRla8+8xwDFtfqf9gc+1STrfPN3qxmo7hjl0Fvo6bd6Wzbp7JrlyieVhzX2GZsFW7e+2a7zW/9/evQfbVdZnHP8+iQxBEhKpjKigkWBEpIGBZrQiUcFLK15LGaTiKFBvrVaHtlSmeMFqcYaBFmF0UBSjRfGCF8xUiY0YYkCUkHARGW8BpUKRm4kagcDTP9Y6nJ3D3jkHOGe/e631fGbO7L3WOSfnSSbJ/u338nvbobU7Cm1fQrtP01gJfLF+A2iq/l3fKhtp+tg+tnSG6Wb7PaUzDMNkzbZHZSo7xdoU1UXa4VSF2kLgI1TtEtrkw4z3m4Gq19P7y8WZHrZnl84wg/asH9fbbtvfx4lOoGrYvEjSWuodhWUjxRSdCLwZeBvjO83PLZpoGknag2q368FUxej3qHbS31w02DSQ1O//ld8CVwKfsD0SI0+PQiOabWfN2hRIWk41RfFN4IJ6Z2grSdodeHZ9eUVb1sy0laRrgQOAH47CuoqZVq9Ta+WOwraqRyqW2z6mdJaZIunbVBt8xhpwHwO8zvaLy6WaHpI+QrUU6PP1raOoGjbPBebYbnSXhLq10/nAtYxws+0Ua1Mg6QHGjxDp/QMTVb+ZRs/lS9rH9g0Tmo8+qOFNR1tN0hnA8cDOQO9U79jfzV2LBJsBkh5LNbr2VNtvqk8QeYbtFYWjxSQkXQy8ogWjMH0N2EDxkHtNJGm17ef3XAtYbXuZpOub2m9twuvdDow32/4kjN7rXqZBp+bqtuxaGuAEqimK0+vriRV8Y5uOdsCJVH2QVgCNbiEzBedR7Zgc22F3M9WURYq10XcjsLZug9R7duYZxRJNr9slHcP46NPRVE1k2+AJkvbomdJ9EuNrZO8Z8D1NcPqE67uouiGcTvUaOFKveynWpqbtw4/nStp9rPlo3crjCKr/YN9fMFdM7grbB0n6TQf6IC2yfZSkowFsb9GEg2xjZP26/phFtYmpbY4Dzgb+g+r14rL6XhucCFwu6QaqEfvFwNsl7Uw1fdhITWu2nWnQKZB0MzDwHW
DT3x1Kugp4ke07JS0DLgDeQbUW6pm2s4h7REm6juog4g8AD+mHZ3u7DZ2bRNJlwGHAWtsHSlpE1UKnLZ3iI0aSpJ2oRp0E/Mj2lsKRpk1Tmm1nZG1qZlMtpmzru/jZtsfadRwFfNz2hcCFkjYUzBWT+3uqxcwLqJoY9zLV7sm2eD9Vu4c9JZ1PtfPujSUDxdTUO8wfMjLQ5HNdod2nv0h6vu3VkiYur3iSpDa9Efw0dbPt+vonVI3hU6w10C1tOfB7gNmSHmN7K9XIxZt7Ppe/IyOsPj1jtaQrbZ9TOs9Msr1S0jrgOVRvnN458QixGFn/1PN8DtUyi62FskynK3uenwK8r1SQGfBiYDUPfRMI7Xoj2Ihm23khnpq2jqiN+TzVC/7twBZgDYCkvWlJ09G2s32OpH2opirm9Nz/XLlU00vSZ4FLgTW2byidJ6bO9roJt9ZKavQxfTB++guApHf1Xjed7ZPrx9eXzjLDGtFsO8Xa1BxWOsBMsv0hSauAJ1IdbD42rD+Lau1ajDhJJwMvAfYBLgZeStWYszXFGtVUxfOAsyTtBWwALrV9ZtlYMRlJvS1kZgEHUfXuapNWLQCfeITdRC06yq4RzbazwSCiBXqa415VH3L+ROAc261q51E3WF1K1bzyrcAW2/uUTRWTkbSRqpgR1fTnRuADtr9XNNg0GpUDv6eLpH/b3ufbdBxVE5ptZ2Qtoh222L5f0lZJ84Bbgb1Kh5pO9ejvzsDlVFP1S23fVjZVTIXtp5XOMBMkbWZ8RO2xPWcQN75hepuKse3p12xb0sg1206xFtEO6yUtAD5Fteh5EyN4vt2jdA3V9Nl+VGtK7pZ0eZvaCLSVpB2ozgVdVt/6LtXI78iNYDwcttvYMw4ASf9o+3RJY73jtmH7hAKxZkIjmm1nGjSi4erGsLvbvqW+3hvYZdSOS5kukuYCx1LtMNzd9o6FI8UkJJ1LdaTP2AL81wP32/7bcqlieyS92vbXJB3f7/Oj1ofskRGU/gsAAAv0SURBVKp30v+ZpPVjJxVJutr2/qWz9crIWkTD2bakFVSjTtj+WeFIM0LS24FDqH6fN1GNIq4pGiqmaumEF7/vSLq6WJqYlO2v1U+/bfuXRcPMrHvrpr9ju0EXMYLHaKVYi2iHH0g6sK2jabWdqE4SWVf3BNyGpMfZvmv4sWIK7pe0yPbPAerdvCPXyyr6ukDSbsAVjLfO+XHhTNPpfTSg2XamQSMabKyZcb0b9JnAz6kOyh5b4Nya3WmTadtuvDaRdBjV2qBf1LcWAsfavqRYqJgySXOAZ1OtOXwTsJPt3bb/Xc1Q92+8lqrH6C+ozlseuWbbGVmLaLYfAAcCry4dZAS0vXl140haCvzK9ipJTwfeArwIWAlkGrQB6iaxh1AVao+nGoVq0/KDsf6NL6baQb9B0sj1b8zIWkSD9S6K7bqMrI0eSVcBL7J9p6RlwAVUjbYPAJ5pe+Saj8a26qOXrgROBVb0W4LQdE3o35iRtYhm203SwC30ts8YZpiICWbbvrN+fhTwcdsXAhdK2lAwV0zdE6jWcS0D3iXpXmCt7VPKxpoeTenfOKt0gIh4VGYDc4F5Az4aT9JUG6pmGnT0zK67w0N1bN93ej6XwYIGqNdvXQ/8mGoX9mKqo+3a4hrgXqr+jUuA/erdoSMl06ARDdaFqT9J62wfJGmV7YHn9EratWcUJ0aApH8FXgbcDjwFOLBuNbM3sNz2wUUDxqQk/Zxq49IaqvOGL7f9x7Kppt+o92/MO5uIZuvCaNIsSe8DFveb8h2b6k2hNnpsf6ieZnoisNLjowOzqNauxYiS9HbbZwOLbbe2zUpT+jemWItotoEjTS3yWqrdro+hJVO7XWL7+33u/aRElnhYjgPObnOhVttu/8ZRkWnQiGgESX9p+5ulc0R0QReWWDRJirWIaARJ86m6jY8dBr4a+IDt35ZLFdFOkrYCf+j3KaqG27sMOVKnpViLiEaQdCFwHdseBr
6/7b8qlyqindLDcbRkzVpENMUi20f0XJ+SXl0R0QXpsxYRTbFF0vPGLiQdTHWeX0RMvy9N5YsknTTTQSLToBHREJL2Bz4DzK9v3QW8wfY15VJFdFs2IgxHpkEjohFsXw3sL2mX+npT7+clvcH28r7fHBEzpQu9HovLNGhENIrtTRMLtdo7hx4mIjI9NwQp1iKiLfIOP2L48u9uCFKsRURb5B1+xPBNaSNCPDrZYBARrZC+UBHTR9JZbOcNkO1/GGKczssGg4hoBEmzJzmncO3QwkS035WlA8S4jKxFRCNI2gh8GTjP9vWl80REDEuKtYhoBEnzgNcCx1Ktt/0UcMGAnaERMQ0k7Qb8C7AvMGfsvu1Di4XqoGwwiIhGsL3Z9idsPxc4kepQ91skLZe0d+F4EW11PvBj4GnAKcCNwA9LBuqiFGsR0QiSZkt6paSvAmcCpwN7Ad8A/rtouIj2+hPbnwTus73a9nHAc0qH6ppsMIiIpvgpcAlwmu3Leu5/WdKyQpki2u6++vEWSYcDvwb2KJink7JmLSIaQdJc278rnSOiSyS9HFgD7AmcBewCnGL7oqLBOibFWkQ0gqQ5wPHAs9h2ofNxxUJFRAxB1qxFRFN8FtgdeCmwmmoqZnPRRBEtV2/gWdBz/ThJnyqZqYtSrEVEU+xt+z3A720vBw4H/rRwpoi2W2L77rEL23cBOSlkyFKsRURTjC10vlvSfsB8YGG5OBGdMEvS48YuJO1KNicOXf7AI6IpPl6/aJwMXATMBd5TNlJE650OXCbpy/X1kcCHCubppGwwiIiRJumEfrfrR9s+Y5h5IrpG0r7AoVT/7lbluLfhy8haRIy6efXjM4ClVKNqAK8ALi2SKKLlJO1ie1M97Xkr8Lmez+1q+85y6bonI2sR0QiSVgJH2N5cX88DvmT7L8omi2gfSStsv1zSRsBUo2oPPtreq2jAjsnIWkQ0xVOAe3uu7yUbDCJmhO2X149PK50lUqxFRHN8FvhBfTaogdcAy8tGimg/SUuo3hg9WDPY/kqxQB2UadCIaAxJBwKH1JeX2l5fMk9E29UNcJcAPwIeqG87J4cMV4q1iIiI6EvS9bb3LZ2j69IUNyIiIga5vG7dEQVlZC0iIiL6krQM+AZV+457GN8NuqRosI5JsRYRERF9SfoZcAJwLeNr1rB9U7FQHZTdoBERETHIL21fNPmXxUzKyFpERET0JemjwAKqqdB7xu6ndcdwZWQtIiIiBtmJqkh7Sc89AynWhigjaxEREREjLK07IiIioi9Je0j6qqTbJP2fpAsl7VE6V9ekWIuIiIhBzgMuAp4EPJlq7dp5RRN1UKZBIyIioi9JG2wfMNm9mFkZWYuIiIhBbpd0jKTZ9ccxwB2lQ3VNRtYiIiKiL0lPAc4G/pxqF+hlwDvTFHe4UqxFREREjLD0WYuIiIi+JO0GvAlYSE/NYPu4Upm6KMVaREREDPJ1YA3wP8D9hbN0VqZBIyIioq/s/BwN2Q0aERERg6yQ9LLSIbouI2sRERHRl6TNwM7AvcB99W3b3qVcqu5JsRYRERExwrLBICIiIgaS9EpgWX35XdsrSubpooysRURERF+SPgwsBc6vbx0NrLP97nKpuifFWkRERPQl6RrgANsP1NezgfW2l5RN1i3ZDRoRERHbs6Dn+fxiKTosa9YiIiJikFOB9ZIuAUS1du2kspG6J9OgERER8RCSBOwBbKVatybgCtu3Fg3WQSnWIiIioi9J62wfVDpH12XNWkRERAzyfUlLS4fouoysRURERF+SrgcWAzcBv6eaCnV2gw5XirWIiIjoS9JT+923fdOws3RZpkEjIiJikA/avqn3A/hg6VBdk2ItIiIiBnlW70XdFDcbDoYsxVpERERsQ9JJkjYDSyRtqj82A7cBXy8cr3OyZi0iIiL6knSq7TTBLSwjaxERETHICkk7A0g6RtIZgzYdxMxJsRYRERGDfAz4g6T9gROpWnh8pmyk7kmxFh
EREYNsdbVe6lXAmbbPBOYVztQ5Ocg9IiIiBtks6STg9cAh9W7Q1A5DlpG1iIiIGOQo4B7g2PoA94OBnctG6p5UxxEREdGX7VslfQf4G0n/BWwE/rNwrM5JsRYRERHbkLQYeC1wNHAH8AWqdl8vLBqso9JnLSIiIrYh6QFgDXC87Z/V935he6+yybopa9YiIiJioiOAW4FLJH1C0mGACmfqrIysRURERF91Q9xXU02HHgosB75qe2XRYB2TYi0iIiImJWlX4EjgKNuHls7TJSnWIiIiIkZY1qxFREREjLAUaxEREREjLMVaRLSKpPslbej5WPgIfo0Fkv5u+tNFRDx8WbMWEa0i6Xe25z7KX2MhsML2fg/z+2bbvv/R/OyIiIkyshYRrSdptqTTJP1Q0jWS3lLfnytplaSrJF0r6VX1t3wYWFSPzJ0m6QWSVvT8emdLemP9/EZJ75X0PeBISYskfUvSOklrJO1Tf92Rkq6TdLWkS4f7JxARTZbjpiKibXaStKF+vtH2a4Djgd/aXippR2CtpJXAr4DX2N4k6fHA9yVdBLwb2M/2AQCSXjDJz/yj7efVX7sKeKvtn0p6NvBRqv5U7wVeavt/JS2Y3t9yRLRZirWIaJstY0VWj5cASyT9dX09H3g6cDPw75KWAQ8ATwae8Ah+5hegGqkDngt8SXqw2fuO9eNa4NOSvgh85RH8jIjoqBRrEdEFAt5h++JtblZTmbsBB9m+T9KNwJw+37+VbZeNTPya39ePs4C7+xSL2H5rPdJ2OLBB0gG273gkv5mI6JasWYuILrgYeJukHQAkLa6P0ZkP3FYXai8Enlp//WZgXs/33wTsK2lHSfOBw/r9ENubgI2Sjqx/jiTtXz9fZPsK2+8Fbgf2nP7fZkS0UUbWIqILzgUWAlepmp/8DdV5h+cD35B0JbABuAHA9h2S1kq6Dvim7X+upy+vAX4KrN/Oz3od8DFJJwM7ABcAVwOnSXo61SjfqvpeRMSk0rojIiIiYoRlGjQiIiJihKVYi4iIiBhhKdYiIiIiRliKtYiIiIgRlmItIiIiYoSlWIuIiIgYYSnWIiIiIkZYirWIiIiIEfb/HxHujuHRPG8AAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rf['column'][:10], y=feat_imp_tuned_rf['weight'][:10],data=feat_imp_tuned_rf)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RF Grid Search " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an initial RandomForest model.\n", + "rf_new = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Train model with Training Data\n", + "rfModel_new = Pipeline(stages=[label_stringIdx,va, 
rf_new])\n", + "\n", + "#paramGrid_rft = ParamGridBuilder().addGrid(rf_new.numTrees, [10, 30, 60]).addGrid(rf_new.maxDepth, [3, 5, 10]).addGrid(rf_new.impurity,[\"entropy\", \"gini\"]).build()\n", + "paramGrid_rft = ParamGridBuilder().addGrid(rf_new.numTrees, [60]).addGrid(rf_new.maxDepth, [10]).addGrid(rf_new.impurity,[\"gini\"]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# creating a cross validator for tuning our model\n", + "cv_rf = CrossValidator(estimator=rfModel_new, estimatorParamMaps=paramGrid_rft, evaluator=evaluator_rfb, numFolds=5).fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# store the predictions from our test set\n", + "pred_rft = cv_rf.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROC AUC sccore: 0.7880870240899686\n" + ] + } + ], + "source": [ + "print(\"ROC AUC sccore:\",evaluator_rfb.evaluate(pred_rft))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7257971517356452\n" + ] + } + ], + "source": [ + "# Printing the accuracy of our binary predictions\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "evaluator_rfb.evaluate(pred_rft)\n", + "binary_prediction=pred_rft.select(\"prediction\").collect()\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "cv_rf.bestModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "60" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_rf.bestModel.stages[-1].getNumTrees" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'gini'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_rf.bestModel.stages[-1].getImpurity()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='RandomForestClassifier_deb569e64636', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='featuresCol', doc='features column name'): 'features',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). 
Supported options: entropy, gini'): 'gini',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='labelCol', doc='label column name'): 'label',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='numTrees', doc='Number of trees to train (>= 1)'): 60,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='predictionCol', doc='prediction column name'): 'prediction',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='rawPredictionCol', doc='raw prediction (a.k.a. 
confidence) column name'): 'rawPrediction',\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='seed', doc='random seed'): 42,\n", + " Param(parent='RandomForestClassifier_deb569e64636', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_rf.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# storing the predictions of our test set\n", + "prediction_rft=pred_rft.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# storing the true labels of our test set\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.91 0.82 131571\n", + " 1 0.66 0.34 0.45 64408\n", + "\n", + " micro avg 0.73 0.73 0.73 195979\n", + " macro avg 0.70 0.63 0.63 195979\n", + "weighted avg 0.71 0.73 0.70 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_rft,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a Dataframe of feature importances from our model\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rfg = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cv_rf.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + 
"text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rfg['column'][:10], y=feat_imp_tuned_rfg['weight'][:10],data=feat_imp_tuned_rfg)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest Grid\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GBT Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Declares the gbt classifier model\n", + "gbt = GBTClassifier(seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a pipeline for our model\n", + "gbt_pipe = Pipeline(stages=[label_stringIdx, va, gbt])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Fits the pipeline on our train data\n", + "gbtModel = gbt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROC AUC Score: 0.787408211157604\n" + ] + } + ], + "source": [ + "print(\"ROC AUC Score:\",evaluator_rfb.evaluate(gbtModel.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7289046275366238\n" + ] + } + ], + "source": [ + "# Calculates the accuracy of our model\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "evaluator_rfb.evaluate(gbtModel.transform(us_test_cat))\n", + "binary_prediction=gbtModel.transform(us_test_cat).select(\"prediction\").collect()\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + 
"print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gbtModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbtModel.stages[-1].getStepSize()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gbtModel.stages[-1].getMaxIter()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Predict on test data\n", + "prediction_gbtn=gbtModel.transform(us_test_cat).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Stores the true labels from our test data\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.76 0.88 0.81 131571\n", + " 1 0.63 0.43 0.51 64408\n", + "\n", + " micro avg 0.73 0.73 0.73 195979\n", + " macro avg 0.69 0.65 0.66 195979\n", + "weighted avg 0.72 0.73 0.71 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_gbtn,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataframe for our feature importances\n", + 
"pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_gtbb = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], gbtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_gtbb['column'][:10], y=feat_imp_tuned_gtbb['weight'][:10],data=feat_imp_tuned_gtbb)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from GBT Base Model\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GBT Binary Tuned Best Model" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a GBT Classifier\n", + "gbt_t_new = GBTClassifier(maxIter=55,seed=42)\n", + "gbt_pipe_t_new = Pipeline(stages=[label_stringIdx, va, gbt_t_new])\n", + "\n", + "# Create a evaluator for our model\n", + "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "\n", + "# Create grid for tuning the model\n", + "#grid_gbt_t_new = ParamGridBuilder().addGrid(gbt_t_new.stepSize, [0.1,0.3,0.01]).addGrid(gbt_t_new.maxDepth, [3, 5, 8]).build()\n", + "grid_gbt_t_new = ParamGridBuilder().addGrid(gbt_t_new.stepSize, [0.3]).addGrid(gbt_t_new.maxDepth, [8]).build()\n", + "cv1_gbt_t_new = CrossValidator(estimator=gbt_pipe_t_new,estimatorParamMaps=grid_gbt_t_new, numFolds=5, evaluator=evaluator,seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# fit the cross validation model\n", + "cvModel_gbt_t_new = cv1_gbt_t_new.fit(us_train_cat)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROC AUC Score: 0.8032377340351009\n" + ] + } + ], + "source": [ + "print(\"ROC AUC Score:\",evaluator_rfb.evaluate(cvModel_gbt_t_new.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7407885538756703\n" + ] + } + ], + "source": [ + "# calculates the accuracy of the binary model\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "evaluator_rfb.evaluate(cvModel_gbt_t_new.transform(us_test_cat))\n", + "binary_prediction=cvModel_gbt_t_new.transform(us_test_cat).select(\"prediction\").collect()\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cvModel_gbt_t_new.bestModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.3" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cvModel_gbt_t_new.bestModel.stages[-1].getStepSize()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='GBTClassifier_34f6d1b395a8', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. 
Caching can speed up training of deeper trees.'): False,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'all',\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='featuresCol', doc='features column name'): 'features',\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='labelCol', doc='label column name'): 'label',\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic'): 'logistic',\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='maxIter', doc='maximum number of iterations (>= 0)'): 55,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. 
If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='predictionCol', doc='prediction column name'): 'prediction',\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='seed', doc='random seed'): 42,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.3,\n", + " Param(parent='GBTClassifier_34f6d1b395a8', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cvModel_gbt_t_new.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Stores the prediction from our test set\n", + "prediction_gbt_t_new=cvModel_gbt_t_new.transform(us_test_cat).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "# Stores the true labels from our test set\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.78 0.86 0.82 131571\n", + " 1 0.63 0.50 0.56 64408\n", + "\n", + " micro avg 0.74 0.74 0.74 195979\n", + " macro avg 0.71 0.68 0.69 195979\n", + "weighted avg 0.73 0.74 0.73 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_gbt_t_new,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataframe of 
feature importances\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_gbt_t_new = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cvModel_gbt_t_new.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_gbt_t_new['column'][:10], y=feat_imp_tuned_gbt_t_new['weight'][:10],data=feat_imp_tuned_gbt_t_new)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from GBT Best tuned\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Trees Binary Classification Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# Create initial Decision Tree Model\n", + "dt = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creates a pipeline\n", + "dt_pipe = Pipeline(stages=[label_stringIdx, va, dt])\n", + "\n", + "# Train model with Training Data\n", + "dtModel = dt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# Makes prediction from our test set\n", + "pred_dt = dtModel.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ROC AUC: 0.6639903304724255\n" + ] + } + ], + "source": [ + "# prints the ROC AUC score\n", + "print(\"ROC AUC:\",evaluator.evaluate(pred_dt))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 203, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 204, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel.stages[-1].getMinInstancesPerNode()" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 205, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel.stages[-1].getMaxBins()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7156021818664244\n" + ] + } + ], + "source": [ + "# calculates the accuracy of our model\n", + "evaluator.evaluate(pred_dt)\n", + "\n", + "binary_prediction=pred_dt.select(\"prediction\").collect()\n", + "\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_dtb=pred_dt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.74 0.89 0.81 131571\n", + " 1 
0.61 0.37 0.46 64408\n", + "\n", + " micro avg 0.72 0.72 0.72 195979\n", + " macro avg 0.68 0.63 0.63 195979\n", + "weighted avg 0.70 0.72 0.69 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_dtb,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtb= pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtb['column'][:10], y=feat_imp_tuned_dtb['weight'][:10],data=feat_imp_tuned_dtb)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Binary Base Model\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Binary Classification Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# declares a decision tree classifier\n", + "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\")\n", + "\n", + "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n", + "\n", + "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "\n", + "#grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n", + "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, 
[10]).addGrid(dt_new.minInstancesPerNode, [500]).addGrid(dt_new.maxBins,[50]).build()\n", + "\n", + "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "dtModel_t = cv1_dt.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "pred_dtt = dtModel_t.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC ROC: 0.5687880524415173\n" + ] + } + ], + "source": [ + "print(\"AUC ROC:\",evaluator.evaluate(pred_dtt))" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.7307313538695472\n" + ] + } + ], + "source": [ + "# calculates the accuracy \n", + "evaluator.evaluate(pred_dtt)\n", + "\n", + "binary_prediction=pred_dtt.select(\"prediction\").collect()\n", + "\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "print(\"Accuracy:\",np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='DecisionTreeClassifier_6cf0199ad377', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 
10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='featuresCol', doc='features column name'): 'features',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='labelCol', doc='label column name'): 'label',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 50,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 500,\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='predictionCol', doc='prediction column name'): 'prediction',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! 
These probabilities should be treated as confidences, not precise probabilities'): 'probability',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name'): 'rawPrediction',\n", + " Param(parent='DecisionTreeClassifier_6cf0199ad377', name='seed', doc='random seed'): -3198175077911245588}" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel_t.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 179, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel_t.bestModel.stages[-1].getMaxDepth()" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "500" + ] + }, + "execution_count": 180, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel_t.bestModel.stages[-1].getMinInstancesPerNode()" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtModel_t.bestModel.stages[-1].getMaxBins()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_dtbt=pred_dtt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.77 0.85 0.81 131571\n", + " 1 0.61 0.49 0.54 64408\n", + "\n", + " micro avg 0.73 0.73 0.73 195979\n", + " macro avg 0.69 0.67 0.68 195979\n", + "weighted avg 0.72 0.73 0.72 195979\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=prediction_dtbt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtbt= pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtbt['column'][:10], y=feat_imp_tuned_dtbt['weight'][:10],data=feat_imp_tuned_dtbt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Binary Grid Model\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": 
"biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RF_DT_GBT_LR_Binary_Bal.ipynb b/RF_DT_GBT_LR_Binary_Bal.ipynb new file mode 100644 index 0000000..bd8fd9a --- /dev/null +++ b/RF_DT_GBT_LR_Binary_Bal.ipynb @@ -0,0 +1,2548 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:62046)\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 958, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1096, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n" + ] + }, + { + "ename": "Py4JNetworkError", + "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:62046)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + 
"\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 958\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1095\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1096\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1097\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mevaluation\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mBinaryClassificationEvaluator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeature\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOneHotEncoder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOneHotEncoderModel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mStringIndexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mVectorAssembler\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[0mspark\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetOrCreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 27\u001b[0m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msparkContext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\sql\\session.py\u001b[0m in 
\u001b[0;36mgetOrCreate\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[0msession\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSparkSession\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_options\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jsparkSession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msessionState\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetConfString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 189\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1282\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1283\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1284\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1285\u001b[0m return_value = get_return_value(\n\u001b[0;32m 1286\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \"\"\"\n\u001b[1;32m-> 1012\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1014\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 958\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 960\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 961\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 962\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 964\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 965\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 966\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 967\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 968\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1106\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1107\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1108\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1109\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1110\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:62046)" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import RandomForestClassifier\n", + "from pyspark.ml.classification import LogisticRegression,RandomForestClassifier\n", + "from pyspark.ml.classification import GBTClassifier\n", + "from pyspark.ml.evaluation import 
BinaryClassificationEvaluator\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", + "from pyspark.ml.classification import LogisticRegression\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer, VectorAssembler\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. 
The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading Train data\n", + "\n", + "us_train_cat = spark.read.csv(get_training_filename('USAccident_train_bal_bin.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+\n", + "|count(Severity)|\n", + "+---------------+\n", + "| 2|\n", + "+---------------+\n", + "\n" + ] + } + ], + "source": [ + "# Number of unique labels in Severity column\n", + "\n", + "us_train_cat.agg(countDistinct(\"Severity\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| count|\n", + "+--------+------+\n", + "| 1|258836|\n", + "| 0|263700|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of data in training dataset\n", + "\n", + "us_train_cat.groupBy('Severity').count().show()" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the test data\n", + "\n", + "us_test_cat = spark.read.csv(get_training_filename('USAccident_validation_new.csv'), header = True, inferSchema = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+------+\n", + "|Severity| count|\n", + "+--------+------+\n", + "| 3| 58617|\n", + "| 4| 5993|\n", + "| 2|131790|\n", + "+--------+------+\n", + "\n" + ] + } + ], + "source": [ + "# Checking the balance of data in testing dataset\n", + "\n", + "us_test_cat.groupBy('Severity').count().show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Vector Assembler to combine all features except Severity into a single 'features' column that is fed to the model\n", + "\n", + "va = VectorAssembler().setInputCols([i for i in us_train_cat.columns if i!='Severity']).setOutputCol('features')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# String Indexer to index the target variable Severity into the 'label' column that the model predicts\n", + "\n", + "label_stringIdx = StringIndexer(inputCol=\"Severity\", outputCol=\"label\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Assign label 1 to Severity categories 3 and 4 (merging them into one class) and 0 to everything else for train data; NOTE: skip this cell if the train file already holds 0/1 labels, since every row would then become 0\n", + "\n", + "us_train_cat=us_train_cat.withColumn(\"Severity\",when(((us_train_cat[\"Severity\"]==4) | (us_train_cat[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Assign label 1 to Severity categories 3 and 4 (merging them into one class) and 0 to everything else for test data\n", + "\n", +
"us_test_cat=us_test_cat.withColumn(\"Severity\",when(((us_test_cat[\"Severity\"]==4) | (us_test_cat[\"Severity\"]==3)),1).otherwise(0))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluator defined for Binary Classification\n", + "\n", + "evaluator_rfb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RF Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Create an initial RandomForest model.\n", + "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Pipeline with stages for fitting the training data\n", + "\n", + "rfModel = Pipeline(stages=[label_stringIdx,va, rf])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the training data using RF pipeline\n", + "\n", + "rf_fit = rfModel.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Predict the test data using fitted train pipeline\n", + "\n", + "pred_rfbb = rf_fit.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.7516960347127739\n" + ] + } + ], + "source": [ + "# AUC Score for the test data\n", + "\n", + "print(\"AUC Score is\", evaluator_rfb.evaluate(pred_rfbb))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_rfbb=(pred_rfbb).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels 
from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.86 0.59 0.70 131790\n", + " 1 0.49 0.80 0.61 64610\n", + "\n", + " accuracy 0.66 196400\n", + " macro avg 0.68 0.70 0.66 196400\n", + "weighted avg 0.74 0.66 0.67 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_rfbb,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SparseVector(119, {2: 0.0001, 3: 0.0001, 4: 0.0, 5: 0.0, 7: 0.0, 9: 0.0006, 13: 0.0003, 14: 0.0001, 15: 0.0, 17: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 23: 0.0228, 24: 0.0088, 25: 0.0142, 26: 0.0025, 27: 0.006, 28: 0.0, 30: 0.0268, 32: 0.0003, 34: 0.0164, 36: 0.0976, 38: 0.0, 40: 0.0007, 42: 0.0017, 44: 0.0025, 48: 0.0, 49: 0.0, 50: 0.0014, 51: 0.0508, 52: 0.0274, 53: 0.0, 54: 0.0, 55: 0.0, 58: 0.0001, 59: 0.0, 60: 0.0001, 61: 0.0, 62: 0.0001, 63: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 0.0, 70: 0.0, 71: 0.0001, 73: 0.0, 75: 0.0001, 77: 0.0, 78: 0.0, 81: 0.1403, 82: 0.0434, 83: 0.0004, 84: 0.0001, 85: 0.0, 87: 0.0, 89: 0.0, 92: 0.0, 93: 0.0, 94: 0.0, 95: 0.0112, 97: 0.0, 98: 0.0005, 100: 0.05, 101: 0.0002, 102: 0.0003, 103: 0.0014, 104: 0.0, 105: 0.0209, 106: 0.0001, 107: 0.0121, 109: 0.1052, 110: 0.0, 111: 0.0209, 113: 0.001, 115: 0.0088, 116: 0.013, 117: 0.0, 118: 0.288})" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# Feature Importance from RF model \n", + "\n", + "rf_fit.stages[-1].featureImportances" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Base Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rfbb = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], rf_fit.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest binary balanced')" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Base Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rfbb['column'][:10], y=feat_imp_tuned_rfbb['weight'][:10],data=feat_imp_tuned_rfbb)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest binary balanced\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6618991853360489" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Accuracy calculation for RF Base Model\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "\n", + "binary_prediction=(pred_rfbb).select(\"prediction\").collect()\n", + "\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RF Grid Search " + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:62046)\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1174, in send_command\n", + " self.socket.sendall(command.encode(\"utf-8\"))\n", + "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", + "\n", + "During handling of the above 
exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1014, in send_command\n", + " response = connection.send_command(command)\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1178, in send_command\n", + " \"Error while sending\", e, proto.ERROR_ON_SEND)\n", + "py4j.protocol.Py4JNetworkError: Error while sending\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 958, in _get_connection\n", + " connection = self.deque.pop()\n", + "IndexError: pop from an empty deque\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1096, in start\n", + " self.socket.connect((self.address, self.port))\n", + "ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it\n" + ] + }, + { + "ename": "Py4JNetworkError", + "evalue": "An error occurred while trying to connect to the Java server (127.0.0.1:62046)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mConnectionResetError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, 
command)\u001b[0m\n\u001b[0;32m 1173\u001b[0m \u001b[1;31m# if it sent a RST packet (SO_LINGER)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1174\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"utf-8\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1175\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionResetError\u001b[0m: [WinError 10054] An existing connection was forcibly closed by the remote host", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1014\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1015\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in 
\u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 1177\u001b[0m raise Py4JNetworkError(\n\u001b[1;32m-> 1178\u001b[1;33m \"Error while sending\", e, proto.ERROR_ON_SEND)\n\u001b[0m\u001b[0;32m 1179\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m: Error while sending", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 957\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 958\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mIndexError\u001b[0m: pop from an empty deque", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1095\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1096\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1097\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmakefile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"rb\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mConnectionRefusedError\u001b[0m: [WinError 10061] No connection could be made because the target machine actively refused it", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Create an initial RandomForest model.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mrf_new\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mRandomForestClassifier\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlabelCol\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"label\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeaturesCol\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"features\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m42\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# Train model with Training 
Data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mrfModel_new\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPipeline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstages\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mlabel_stringIdx\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrf_new\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\__init__.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 109\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Method %s forces keyword arguments.\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 110\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_input_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 111\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 112\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\classification.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, featuresCol, labelCol, predictionCol, probabilityCol, rawPredictionCol, maxDepth, maxBins, minInstancesPerNode, minInfoGain, maxMemoryInMB, 
cacheNodeIds, checkpointInterval, impurity, numTrees, featureSubsetStrategy, seed, subsamplingRate, leafCol, minWeightFractionPerNode)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mRandomForestClassifier\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1425\u001b[0m self._java_obj = self._new_java_obj(\n\u001b[1;32m-> 1426\u001b[1;33m \"org.apache.spark.ml.classification.RandomForestClassifier\", self.uid)\n\u001b[0m\u001b[0;32m 1427\u001b[0m self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,\n\u001b[0;32m 1428\u001b[0m \u001b[0mmaxMemoryInMB\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m256\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcacheNodeIds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheckpointInterval\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_new_java_obj\u001b[1;34m(java_class, *args)\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0mjava_obj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_jvm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mjava_class\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\".\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 67\u001b[1;33m \u001b[0mjava_obj\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjava_obj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 68\u001b[0m \u001b[0mjava_args\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0m_py2java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0marg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mjava_obj\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mjava_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 1676\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mREFLECTION_COMMAND_NAME\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1677\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mREFL_GET_UNKNOWN_SUB_COMMAND_NAME\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mname\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"\\n\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_id\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1678\u001b[1;33m \"\\n\" + proto.END_COMMAND_PART)\n\u001b[0m\u001b[0;32m 1679\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mproto\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSUCCESS_PACKAGE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1680\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mJavaPackage\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_gateway_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjvm_id\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1027\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_should_retry\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mretry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpne\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1028\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Exception while sending command.\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc_info\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1029\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbinary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbinary\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1030\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1031\u001b[0m logging.exception(\n", + 
"\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry, binary)\u001b[0m\n\u001b[0;32m 1010\u001b[0m \u001b[1;32mif\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;31m`\u001b[0m \u001b[1;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1011\u001b[0m \"\"\"\n\u001b[1;32m-> 1012\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1013\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1014\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_get_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 958\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdeque\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 959\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 960\u001b[1;33m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 961\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 962\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m_create_connection\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 964\u001b[0m connection = GatewayConnection(\n\u001b[0;32m 965\u001b[0m self.gateway_parameters, self.gateway_property)\n\u001b[1;32m--> 966\u001b[1;33m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 967\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 968\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36mstart\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1106\u001b[0m \u001b[1;34m\"server ({0}:{1})\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maddress\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1107\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1108\u001b[1;33m \u001b[1;32mraise\u001b[0m 
\u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1109\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1110\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_authenticate_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mPy4JNetworkError\u001b[0m: An error occurred while trying to connect to the Java server (127.0.0.1:62046)" + ] + } + ], + "source": [ + "\n", + "# Create an initial RandomForest model\n", + "\n", + "rf_new = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Pipeline with stages for RF Grid Search Model\n", + "\n", + "rfModel_new = Pipeline(stages=[label_stringIdx,va, rf_new])\n", + "\n", + "# Grid Search for tuning Hyper parameters \n", + "\n", + "paramGrid_rft = ParamGridBuilder().addGrid(rf_new.numTrees, [10,25,60]).addGrid(rf_new.maxDepth, [3,6,10]).addGrid(rf_new.impurity,[\"entropy\", \"gini\"]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [ + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o124422.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 51187.0 failed 1 times, most recent failure: Lost task 7.0 in stage 51187.0 (TID 368623, DESKTOP-TT8TT9T.fios-router.home, executor driver): java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown 
Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n\nDriver 
stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1989)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1977)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1976)\r\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\r\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1976)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:956)\r\n\tat scala.Option.foreach(Option.scala:407)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2206)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2155)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2144)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:758)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2116)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2137)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2156)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2181)\r\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat 
org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1003)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:588)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:226)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:144)\r\n\tat org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)\r\n\tat scala.util.Try$.apply(Try.scala:213)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:122)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:48)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:152)\r\n\tat sun.reflect.GeneratedMethodAccessor2715.invoke(Unknown Source)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)\r\n\tat java.lang.reflect.Method.invoke(Unknown Source)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Unknown Source)\r\nCaused by: 
java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcv_rf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCrossValidator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrfModel_new\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mestimatorParamMaps\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparamGrid_rft\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluator\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mevaluator_rfb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnumFolds\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m42\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mus_train_cat\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m raise ValueError(\"Params must be either a param map or a list/tuple of param maps, \"\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 351\u001b[0m \u001b[0mtasks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_parallelFitTasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepm\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 352\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msubModel\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpool\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimap_unordered\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m 
\u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 353\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mj\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mmetric\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mnFolds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 354\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\multiprocessing\\pool.py\u001b[0m in \u001b[0;36mnext\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 746\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msuccess\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 747\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 748\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 749\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 750\u001b[0m \u001b[0m__next__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m \u001b[1;31m# XXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\multiprocessing\\pool.py\u001b[0m in \u001b[0;36mworker\u001b[1;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[0;32m 119\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtask\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
120\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 121\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 122\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 123\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mwrap_exception\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0m_helper_reraises_exception\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36m\u001b[1;34m(f)\u001b[0m\n\u001b[0;32m 350\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 351\u001b[0m \u001b[0mtasks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_parallelFitTasks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepm\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 352\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msubModel\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mpool\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mimap_unordered\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 353\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mj\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mmetric\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mnFolds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 354\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModelsParam\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\tuning.py\u001b[0m in \u001b[0;36msingleTask\u001b[1;34m()\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0msingleTask\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 52\u001b[1;33m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodelIter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 53\u001b[0m \u001b[0mmetric\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0meva\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalidation\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mepm\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 54\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcollectSubModel\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36m__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"No models remaining.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcounter\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 62\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfitSingleModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfitSingleModel\u001b[1;34m(index)\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 104\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mfitSingleModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 105\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparamMaps\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 106\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_FitMultipleIterator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfitSingleModel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparamMaps\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 127\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 129\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 130\u001b[0m 
\u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# must be an Estimator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 109\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 110\u001b[0m \u001b[0mtransformers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mindexOfLastEstimator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m raise ValueError(\"Params must be either a param map or a list/tuple of param maps, \"\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m \u001b[0mjava_model\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 322\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_create_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjava_model\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 
323\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_copyValues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_fit_java\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 316\u001b[0m \"\"\"\n\u001b[0;32m 317\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transfer_params_to_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 318\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_java_obj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 319\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1284\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1285\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1286\u001b[1;33m answer, self.gateway_client, self.target_id, 
self.name)\n\u001b[0m\u001b[0;32m 1287\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1288\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 98\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 99\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[0mconverted\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconvert_exception\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 
326\u001b[0m raise Py4JJavaError(\n\u001b[0;32m 327\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m 329\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 330\u001b[0m raise Py4JError(\n", + "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o124422.fit.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 51187.0 failed 1 times, most recent failure: Lost task 7.0 in stage 51187.0 (TID 368623, DESKTOP-TT8TT9T.fios-router.home, executor driver): java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat 
org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n\nDriver stacktrace:\r\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1989)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1977)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1976)\r\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\r\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\r\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1976)\r\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:956)\r\n\tat 
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:956)\r\n\tat scala.Option.foreach(Option.scala:407)\r\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:956)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2206)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2155)\r\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2144)\r\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\r\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:758)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2116)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2137)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2156)\r\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2181)\r\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1003)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\r\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\r\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:388)\r\n\tat org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:588)\r\n\tat org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:226)\r\n\tat 
org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:144)\r\n\tat org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)\r\n\tat scala.util.Try$.apply(Try.scala:213)\r\n\tat org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:122)\r\n\tat org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:48)\r\n\tat org.apache.spark.ml.Predictor.fit(Predictor.scala:152)\r\n\tat sun.reflect.GeneratedMethodAccessor2715.invoke(Unknown Source)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)\r\n\tat java.lang.reflect.Method.invoke(Unknown Source)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Unknown Source)\r\nCaused by: java.lang.OutOfMemoryError: Java heap space\r\n\tat java.lang.reflect.Array.newArray(Native Method)\r\n\tat java.lang.reflect.Array.newInstance(Unknown Source)\r\n\tat java.io.ObjectInputStream.readArray(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.defaultReadFields(Unknown Source)\r\n\tat java.io.ObjectInputStream.readSerialData(Unknown Source)\r\n\tat 
java.io.ObjectInputStream.readOrdinaryObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject0(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat java.io.ObjectInputStream.readObject(Unknown Source)\r\n\tat org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)\r\n\tat org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)\r\n\tat org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)\r\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)\r\n\tat org.apache.spark.storage.BlockManager.maybeCacheDiskValuesInMemory(BlockManager.scala:1516)\r\n\tat org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:825)\r\n\tat org.apache.spark.storage.BlockManager.get(BlockManager.scala:1111)\r\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1178)\r\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:360)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:311)\r\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\r\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)\r\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:313)\r\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\r\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\r\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\r\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Exception while sending command.\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1181, 
in send_command\n", + " answer = smart_decode(self.stream.readline()[:-1])\n", + " File \"C:\\Users\\hites\\Anaconda3\\lib\\socket.py\", line 589, in readinto\n", + " return self._sock.recv_into(b)\n", + "ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1014, in send_command\n", + " response = connection.send_command(command)\n", + " File \"C:\\Spark_installed\\spark-3.0.0-preview2-bin-hadoop2.7\\python\\lib\\py4j-0.10.8.1-src.zip\\py4j\\java_gateway.py\", line 1193, in send_command\n", + " \"Error while receiving\", e, proto.ERROR_ON_RECEIVE)\n", + "py4j.protocol.Py4JNetworkError: Error while receiving\n" + ] + } + ], + "source": [ + "# 5-fold Cross Validator Pipeline and fitting the train data using this cv pipeline\n", + "\n", + "cv_rf = CrossValidator(estimator=rfModel_new, estimatorParamMaps=paramGrid_rft, evaluator=evaluator_rfb, numFolds=5,seed=42).fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# Predicting the test data using fitted cv pipeline\n", + "\n", + "pred_rft = cv_rf.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.7819275566312943\n" + ] + } + ], + "source": [ + "# AUC Score of tuned RF model\n", + "\n", + "print(\"AUC Score is\", evaluator_rfb.evaluate(pred_rft))" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='RandomForestClassifier_8713e549bc1f', name='featuresCol', doc='features column name.'): 
'features',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='seed', doc='random seed.'): 42,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='featureSubsetStrategy', doc=\"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. 
If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'\"): 'auto',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. 
Should be >= 1.'): 1,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='numTrees', doc='Number of trees to train (>= 1).'): 60,\n", + " Param(parent='RandomForestClassifier_8713e549bc1f', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0}" + ] + }, + "execution_count": 216, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper Parameters after tuning\n", + "\n", + "cv_rf.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "60" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best model number of trees parameter from Grid Search\n", + "\n", + "cv_rf.bestModel.stages[-1].getNumTrees" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6788441955193483" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Accuracy of the model on test data\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]\n", + "\n", + "binary_prediction=pred_rft.select(\"prediction\").collect()\n", + "\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 
39, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_rft=pred_rft.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing classification_report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.87 0.61 0.72 131790\n", + " 1 0.51 0.81 0.62 64610\n", + "\n", + " accuracy 0.68 196400\n", + " macro avg 0.69 0.71 0.67 196400\n", + "weighted avg 0.75 0.68 0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "# Generating the classification report to display all metrics at once\n", + "\n", + "print(classification_report(y_pred=prediction_rft,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of RF Grid Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_rfg = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cv_rf.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from Random Forest Grid')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type":
"execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of RF Grid Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_rfg['column'][:10], y=feat_imp_tuned_rfg['weight'][:10],data=feat_imp_tuned_rfg)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from Random Forest Grid\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GBT Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the GBT Base model\n", + "\n", + "gbt = GBTClassifier(seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "# Pipeline with stages to be used to fit the train data\n", + "\n", + "gbt_pipe = Pipeline(stages=[label_stringIdx, va, gbt])" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "# Fitting the training data using the pipeline above\n", + "\n", + "gbtModel = gbt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.7842374103722184\n" + ] + } + ], + "source": [ + "# AUC Score from the model on the test data\n", + "\n", + "print(\"AUC Score is\", evaluator_rfb.evaluate(gbtModel.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_gbtn=(gbtModel.transform(us_test_cat)).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + 
"outputs": [], + "source": [ + "# True labels from the test data for the target variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.88 0.60 0.72 131790\n", + " 1 0.51 0.83 0.63 64610\n", + "\n", + " accuracy 0.68 196400\n", + " macro avg 0.69 0.72 0.67 196400\n", + "weighted avg 0.75 0.68 0.69 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_gbtn,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of GBT Base Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_gtbb = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], gbtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from GBT Base Model')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of GBT Base Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_gtbb['column'][:10], y=feat_imp_tuned_gtbb['weight'][:10],data=feat_imp_tuned_gtbb)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from GBT Base Model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GBT Binary Classification Grid Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GBT Binary Tuned Best Model" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Initializing the GBT classifier to be tuned by the grid search\n", + "\n", + "gbt_t_new = GBTClassifier(maxIter=55,seed=42)\n", + "\n", + "# Creating pipeline for GBT grid Model \n", + "\n", + "gbt_pipe_t_new = Pipeline(stages=[label_stringIdx, va, gbt_t_new])\n", + "\n", + "# Binary Classification Evaluator\n", + "\n", + "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "\n", + "# Creating Grid Search for Hyper Parameter Tuning for GBT model\n", + "\n", + "grid_gbt_t_new = ParamGridBuilder().addGrid(gbt_t_new.stepSize, [0.2,0.4,0.01]).addGrid(gbt_t_new.maxDepth, [3, 5, 8]).build()\n", + "\n", + "# 5-fold Cross Validator over the pipeline and parameter grid (fitted on the training data in the next cell)\n", + "\n", + "cv1_gbt_t_new = CrossValidator(estimator=gbt_pipe_t_new,estimatorParamMaps=grid_gbt_t_new, numFolds=5, evaluator=evaluator,seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [], + "source": [ + "# Fitting train data using 5-fold cross validator pipeline\n", + "\n", + "cvModel_gbt_t_new = cv1_gbt_t_new.fit(us_train_cat)" + ] + }, + { + "cell_type":
"code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.8118245222266022\n" + ] + } + ], + "source": [ + "# AUC Score from the fitted pipeline for the test data\n", + "\n", + "print(\"AUC Score is\", evaluator.evaluate(cvModel_gbt_t_new.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7997203339021297" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# AUC Score from the Cross validator Pipeline\n", + "\n", + "np.max(cvModel_gbt_t_new.avgMetrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='GBTClassifier_48357a426a79', name='featuresCol', doc='features column name.'): 'features',\n", + " Param(parent='GBTClassifier_48357a426a79', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='GBTClassifier_48357a426a79', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='GBTClassifier_48357a426a79', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='GBTClassifier_48357a426a79', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='GBTClassifier_48357a426a79', name='seed', doc='random seed.'): 42,\n", + " Param(parent='GBTClassifier_48357a426a79', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. 
Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n", + " Param(parent='GBTClassifier_48357a426a79', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n", + " Param(parent='GBTClassifier_48357a426a79', name='featureSubsetStrategy', doc=\"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'\"): 'all',\n", + " Param(parent='GBTClassifier_48357a426a79', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: variance'): 'variance',\n", + " Param(parent='GBTClassifier_48357a426a79', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n", + " Param(parent='GBTClassifier_48357a426a79', name='lossType', doc='Loss function which GBT tries to minimize (case-insensitive). Supported options: logistic'): 'logistic',\n", + " Param(parent='GBTClassifier_48357a426a79', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 32,\n", + " Param(parent='GBTClassifier_48357a426a79', name='maxDepth', doc='Maximum depth of the tree. 
(>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 8,\n", + " Param(parent='GBTClassifier_48357a426a79', name='maxIter', doc='max number of iterations (>= 0).'): 55,\n", + " Param(parent='GBTClassifier_48357a426a79', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n", + " Param(parent='GBTClassifier_48357a426a79', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='GBTClassifier_48357a426a79', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 1,\n", + " Param(parent='GBTClassifier_48357a426a79', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0,\n", + " Param(parent='GBTClassifier_48357a426a79', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.2,\n", + " Param(parent='GBTClassifier_48357a426a79', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0,\n", + " Param(parent='GBTClassifier_48357a426a79', name='validationTol', doc='Threshold for stopping early when fit with validation is used. If the error rate on the validation input changes by less than the validationTol, then learning will stop early (before `maxIter`). 
This parameter is ignored when fit without validation is used.'): 0.01}" + ] + }, + "execution_count": 217, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Hyper parameters from the best model \n", + "\n", + "cvModel_gbt_t_new.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_gbt_t_new=cvModel_gbt_t_new.transform(us_test_cat).toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.88 0.65 0.75 131790\n", + " 1 0.53 0.82 0.65 64610\n", + "\n", + " accuracy 0.70 196400\n", + " macro avg 0.71 0.73 0.70 196400\n", + "weighted avg 0.77 0.70 0.71 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_gbt_t_new,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of GBT Grid Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_gbt_t_new = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], 
cvModel_gbt_t_new.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from GBT Best tuned')" + ] + }, + "execution_count": 182, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmcAAAK/CAYAAAAs32pTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzdd7ykZX3//9dblqaAiKyiFAGDImJDQBRLrAGjYouCGmJFElGxJXbFqPFnorEhhCio0YjdoEFRI2IDZCmCiPxEVFgBXQtFkKaf7x/3PTIczu45h53Zuc7s6/l4nMeZu87nnvqe6y5XqgpJkiS14RaTLkCSJEk3MJxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJq0lkmyQpJJsNeE6LknywEnWoE6SpyT5RZLfJ7nbpOvReLTy3tf8Gc40cv0H/eDvT0n+MDT89BHf19OTnNjfx5dnmb5bkjOSXJXke0l2XsW6jk5yzYz6H7+a9fmhOCEthcAkJyV5xqTrmMW/A8+uqo2q6pw1fedJbpHk4CQ/6N+jFyf5epInDc1zUpKr+/fjZUmOHwTJJB8aeq9em+S6oeHPzXJ/e/WfSYN5LkzymhFsx15JzptjnqOTvHZ170trB8OZRq7/oN+oqjYCLgAeOzTuYyO+u98A7wDeOXNCkg2B/wGOAG4DfAr4XJIlq1jfPw/XX1WfH3G9C5ZknUnXoJunDx9Nfs4mWRe4I3D2Sqav6n0yKv8B/D3wQmAzYGvgEGDvGfM9t/88uS3wPeAogKp65tBnzTuBDw+9d5+wkvs8f2iZhwEvSrLXyLdMWg1NfmhouiXZMMmh/a/k5Un+tf+i+PMv0CSHJPltkvOT/M3K1lVVX66qTwMXzzL5kcDVVfX+qrqGLsRtDCy4NSXJ1kn+J8mv+5oOHJq2Z5KT+1/1FyX596Evtm/2/88dtMQlOTDJ14aWv1HrWv8L+z1JvpLkSuD+/WP2rv6X/iVJ3ptk/X7+LZJ8OcmlSX6T5OtzbM7jk/wsyYokb0mSfj07JvlG/7ivSPLhJBsP1fm6/jm7PMk5SR7Uj1+nn3Z+//h8LMmmQ8s9J8kF/TpfMcfjvFmS/+7n/WmSfxyq78Ak/9c/Npcm+UmSR8yxrYP1Hti3yLyvf55+nGTXJAek2633yyT7Ds1/dP8YH5/kiv5+txya/pAkp/XrOinJbkPTTkrypiQnA1cB/wnsBnygfw28o5/vsP71f3m6Vt09htbxtv5x/Hh//2cmuffQ9G2HXo+/Hqyzn/b8JOf2z+P/Dtc9NM+tgd/1g+cmObsff0mSl/fDl/fj7pHkW/1jfmaSvYfWc3SSdyf5apIr+9fP7ZK8v5//7CT3WMlzcg/g2cCTq+r4qrq6qq6vqhOq6rmzLVNV1wOfAHaabfpCVdWPgZOH15dk5/618rv+df74oWn7JPlR/5xcmORFSW4LfA7YPje0yN12xra+CHgS8Lp++qcyS6t6hlrXcsNn4av798MvM
rTnIav4TOinv6Z/XS8HWmy11SoYzjQJhwD3BO4B3Bf4S+Afh6ZvC6wHbAEcAHw4yXY3437uDnx/MFBVfwJ+0I+ft3QtV8cC36VradgLeHWSh/SzXAccRPfL/0HAY4HBl8uD+/93XWBL3DOA19GFyVPodj9tRfeY3RW4C/DKft5/As4FNgfuALxxjnU/Frg3sDuwHzC8q/lNdI/74H5eA5DkXsCz+uVuDfw1sLxf5hXAo+hC71Z0j8e/98vdG3gX8NR+2rZ9nStzOLAusB1duP574GlD0x8MLKNrQXkf8IE5tnXYg+iew9sCnwc+A9ytv6/nAYcl2WBo/r8FXg0sBX4MfLjfptsBXwDe1q/rcODYPvAMPAPYn+75ez7dc/jc/jXwsn6eE+ke59vStfB+Kv2PlN4TgCOBTYH/o3scBy1eXwLOAbaha236TD9tX+Bguuf49sDpwEdnPhBVdRk3PA93rarh98RT6R772/aPxxf7x2sp3XP9qRnvx6cCL+/XtwQ4CTih365jgbfPvP/ew4EfV9VZK5l+E334eFp/H6st3e7R+9EFNJJsAnwV+CDd9uwPHJnkL/pFjgT2r6qN6d4L36qq39A9V+cPtdr9Zvh+quo9dM/RoGV+pT84Z7gTELrPnYOAw5Ns1E9b6WdCHyj/AXgIsCM3bYlU66rKP//G9gf8DHjEjHG/AB42NLwP8KP+9l7A1cAGQ9OPAV4xx/0cBHx5xri3AB+aMe4zwCtXso6jgT8Al/Z/y/vxD6H7Ehme9xDgsJWs55XAx/vbGwAFbDU0/UDga0PDN5qnr+OIoelLgGuBLYfGPRQ4p7/9drpdttvP8RgN7ucvh8a9FPjflcy/L3Bif/vudK2TDwWWzJjvp8CeQ8Pb0bUYBXjr8HNAF+z+BDxwlvtbH/jj8HYALx48r/3j9oOhaZv127PpSuq/ZHA//bJnDU3brV/21kPjrgR2HHoOPjTLfS2lC3LfnHFfpwP79rdPAl49Y/pJwDNW8dykf8zu2g+/Dfji0PRdgEuHnvtfALeYZT3HA08fGl6XLizffhWvh+HX5iXA04aGHwn8HMjQuM/Rv4f6x+m9Q9NeAZw+43G+ZCXb/GbgGzPG/ZruvXf1oOb+sbuyH38t8FvgQbOs723AB+Z4D+zVv8YupWsZLODjg9c08HfAV2cs82Hgn/rbv6T7kbLxLOs9b477Php47RyP/5/n6dd52fDz3Nd8b+b+TPhv4I1D0+458778a/vPljOtUUlC1zLz86HRPweGd72sqKqrZ0y/4824u98Dm8wYtwlwxSqWeUtVbdr/DXY33AnYtt9Nc2mSS+lCzRYASXZK8qV+F8LlwOtZdevQfFw4dPuOdF+yZw/d/+eB2w1qBi4Cju93g7x0Aev+82Ob5I797pZf9NvxgcF2VNXZdKHzLcCv+l1ut++fz63pWo4GtZ1O1yp/237df76/6lpsLltJXVv0y10wo77h18YlQ7ev6v9vxPz8cuj2H4Br+nqGxw2va7ju39K9nu7Y/w2/fmer80LmkORV/e7Hy+h2MW7AjV83M7d1UNvWwE+rawme6U50rSuD52IFcD1dC8t8zXztXVD9N3xv5rbOfFxnDq/s+fkNXUvvn1XV5n2t69MF1oHnV9WmdI/Rk4EvJNlx7k2Z1U/79/cmdKF7Xbpdz9A9fg+e8V5/0lCdj++HL+h3fe56M2uYrxUznufB62Cuz4Qbve+46etVjTOcaY3qP+QvofsQHNiGriVgYPMZu5e2oQsfC3U2cK/BQLoDs3dmJQdAr8KFdC17mw79bVw3HHD8n8BpwJ37D/w3ccMXS82yviuBWw4NbzHLPMPLXUz3BXvnofu/dVXdFrrAU1Uvrqo70X1xvDbJnqvYnq2Hbg8/tv/a17Zzvx3PHdoOqurDVfUAYHu6L8k398/noCV0+PHZoKp+3df+5/vrd/0N7/4bdgldq9o2M+r7xeyzj91w3ZvRfSleTPd43WnGvDPrnPm83
2g4ySPpDoJ/At1uy83ogkyY24V0PxZm+/y+EHjmjOdiw6o6dR7rna3Wi7jx8wGje07+D/iLJPecd2FVf6qqr9Nt57yON5xjfb+jazl7bD/qQuArMx6/jarq4H7+E6vqMXS7jL/SLwuzv89vcnczhq+la9Wc67NgNqv8TGDG+46bPodqnOFMk/Bx4A1Jbtsfv/MabnxczLp0B86ul+RhdLtWPjPbitIdjL4BXTP/LfqDbAcH438V2DDdweDrAy+hCx/fXmC93+7v6+DB+pPcM8ku/fSNgcuq6vdJ7k632wuA6k5EuIwu0AycAdwnyd2T3JKupW2lquo6umNd3p1k83S27r/gSfK4JNv1rViX0e22+eMqVvlPSW6dZFu63cGfGNqO3wOXJ9mGrnWQ/j52SncQ/Pp0IeIPQ/dxOPC2JFv3894uyeDL7pPAE5Pcr1/2zXQBbLbtvIZul9lbk9wqyZ3pdmve5JipNWSfGXUfX1W/otvNfp8kT+5fC/vTffnd5FIuQ37JjV8DG9N9Ma+gO77yTXSBdz6+Tdf6+89JbtkfGP6AftrhdOH8rgBJbpOhy1LcDN+ie18d3G/rI+mOL/zUaqwTgKo6k26X4SeTPHTovXv/VS2X5MHADiz8R9Zs69oYeMrQuj5P99w+Ncm6/WfQHknu0r8m9+2PS7uO7jkYvAd+Cdxu6Hiw2dzoNdC3iJ0FPL3/HHssc2z70LKr/Eyge989t697I+b4jFF7DGeahNcDP6T7QDwD+A43Pmj4Z3S/Ci+h+wB6VlWdv5J1PY8uKPw7XYj7A92B4lTVH+iOZzuQ7hiTfYHHV3fG17z1H4SPBh5At3tgBXAYN+yueQndB+HvgUO5IewMb++n+t0Pj6vuAOi3033x/Qj4xjzKOJiuFWMZXQD7MjA4SPlu/TquoDs79N+qalUHTP8v3YkSy+i+ZAfh5/V0B/VfRheShgPxhnRnuw5aw4Y/8N8OfA34epIr6A663wWgqk4HXgZ8mu4Eggv6dazM8/v/Pwe+TrdrddSXX5mvj9Idx/Rrusf47wCq6pfA4+h+VPyGLuA+pqouXcW6/h3YP90ZgG+nO6Hgm8BPgPP7+1gxn6KGXo/34obH9In9tI/Tvf4/2++aPoPufXGz9IcXPIZuV+Jv6C5X8dSq+snNXecMz6VreX4f3a7dC4FX0bUAD+8eHZzp+nu618TLqur4m3mf2w+t62d0oXjw3P4O+Cu648oGraRvpvvBCN3ZpT+ne4/sP1iO7v10DPDz/n2+2Sz3ewSwWz/96H7cQXQnVPyOrhX1iwvYjpV+JlTV5/r7G3zGHLeA9aoBufGhBNJkpbve0Puq6i/mnFkak/7L8wdV9eZJ1yJp7WPLmSRJUkMMZ5IkSQ1xt6YkSVJDbDmTJElqiOFMkiSpIUvmnmXx2HzzzWvbbbeddBmSJElzOvXUU39dVUtnjp+qcLbtttuybNmySZchSZI0pySzdq3lbk1JkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhSyZdwLjd9xUfmXQJC3bqv+4/6RIkSdKE2HImSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDVkrOEsyV5Jzk1yXpJXz
jJ9xyQnJrkmycuHxm+d5Pgk5yQ5O8mLx1mnJElSK5aMa8VJ1gEOBR4JLAdOSXJMVf1waLbfAi8CHj9j8euBl1XVaUk2Bk5N8tUZy0qSJE2dcbac7Q6cV1XnV9W1wNHAPsMzVNWvquoU4LoZ4y+uqtP621cA5wBbjrFWSZKkJowznG0JXDg0vJybEbCSbAvcBzh5JFVJkiQ1bJzhLLOMqwWtINkI+AxwcFVdvpJ5DkiyLMmyFStW3IwyJUmS2jHOcLYc2HpoeCvgovkunGRdumD2sar67Mrmq6ojqmrXqtp16dKlN7tYSZKkFowznJ0C7JBkuyTrAfsCx8xnwSQBPgicU1XvHGONkiRJTRnb2ZpVdX2Sg4DjgHWAI6vq7CQH9tMPT7IFsAzYBPhTkoOBnYB7An8LnJXkjH6Vr66qY8dVryRJUgvGFs4A+jB17Ixxhw/dvoRud+dM32b2Y9YkSZKmmj0ESJIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1ZMmkC9DqueBN95h0CQu2zevPmnQJkiQ1y5YzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhow1nCXZK8m5Sc5L8spZpu+Y5MQk1yR5+UKWlSRJmkZjC2dJ1gEOBfYGdgL2S7LTjNl+C7wI+LebsawkSdLUGWfL2e7AeVV1flVdCxwN7DM8Q1X9qqpOAa5b6LKSJEnTaJzhbEvgwqHh5f24cS8rSZK0aI0znGWWcTXqZZMckGRZkmUrVqyYd3GSJEktGmc4Ww5sPTS8FXDRqJetqiOqateq2nXp0qU3q1BJkqRWjDOcnQLskGS7JOsB+wLHrIFlJUmSFq0l41pxVV2f5CDgOGAd4MiqOjvJgf30w5NsASwDNgH+lORgYKequny2ZcdVqyRJUivGFs4AqupY4NgZ4w4fun0J3S7LeS0rSZI07ewhQJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYsmXQB0qrs+d49J13Cgnznhd+ZdAmSpEXOljNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhh
jNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSFjDWdJ9kpybpLzkrxylulJ8p5++plJdhma9pIkZyf5QZKPJ9lgnLVKkiS1YGzhLMk6wKHA3sBOwH5Jdpox297ADv3fAcBh/bJbAi8Cdq2qnYF1gH3HVaskSVIrxtlytjtwXlWdX1XXAkcD+8yYZx/gI9U5Cdg0yR36aUuADZMsAW4JXDTGWiVJkpowznC2JXDh0PDyftyc81TVL4B/Ay4ALgYuq6qvjLFWSZKkJowznGWWcTWfeZLchq5VbTvgjsCtkjxj1jtJDkiyLMmyFStWrFbBkiRJkzbOcLYc2HpoeCtuumtyZfM8AvhpVa2oquuAzwIPmO1OquqIqtq1qnZdunTpyIqXJEmahHGGs1OAHZJsl2Q9ugP6j5kxzzHA/v1Zm3vQ7b68mG535h5JbpkkwMOBc8ZYqyRJUhOWjGvFVXV9koOA4+jOtjyyqs5OcmA//XDgWODRwHnAVcCz+mknJ/k0cBpwPXA6cMS4apUkSWrF2MIZQFUdSxfAhscdPnS7gBesZNk3AG8YZ32SJEmtsYcASZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhiw4nCW5TZJ7jqMYSZKktd28wlmSbyTZJMlmwPeBo5K8c7ylSZIkrX3m23J266q6HHgicFRV3Rd4xPjKkiRJWjvNN5wtSXIH4CnAF8dYjyRJ0lptvuHsEOA44LyqOiXJ9sCPx1eWJEnS2mnJPOe7uKr+fBJAVZ3vMWeSJEmjN9+Ws/fOc5wkSZJWwypbzpLcH3gAsDTJS4cmbQKsM87CJEmS1kZz7dZcD9ion2/jofGXA08eV1GSJElrq1WGs6o6ATghyYeq6udrqCZJkqS11nxPCFg/yRHAtsPLVNXDxlGUJEnS2mq+4exTwOHAB4A/jq8cSZKktdt8w9n1VXXYWCuRJEnSnGdrbtbf/EKSfwA+B1wzmF5Vvx1jbZIkSWuduVrOTgUKSD/8iqFpBWw/jqIkSZLWVnOdrbndmipEkiRJ8zzmLMkTZxl9GXBWVf1qtCVJkiStveZ7QsBzgPsDx/fDfwmcBNwlyZuq6r/GUJskSdJaZ77h7E/A3arqlwBJbg8cBtwP+CZgOJMkSRqB+XZ8vu0gmPV+BdylP1vzutGXJUmStHaab8vZt5J8ke5itABPAr6Z5FbApWOpTJIkaS0033D2ArpAtifdZTU+Anymqgp46Jhqk6beCQ9+yKRLWJCHfPOESZcgSVNvXuGsD2Gf7v8kSZI0JnP1EPDtqnpgkivoLjr750l0mW2TsVYnSZK0lpnrIrQP7P9vvGbKkSRJWrvN92xNkjwwybP625snsfcASZKkEZtXOEvyBuCfgFf1o9YDPjquoiRJktZW8205ewLwOOBKgKq6CHBXpyRJ0ojNN5xd25+xWQD99c0kSZI0YvMNZ59M8h/ApkmeB3wN+M/xlSVJkrR2mutSGgcD3wHeRXex2cuBuwKvr6qvjr88SZKktctcF6HdCng3sCNwJvBdurB26pjrkiRJWivNdZ2zlwMkWQ/YFXgA8GzgP5NcWlU7jb9ESZKktcd8+9bcENgEuHX/dxFw1riKkiRJWlvNdczZEcDdgSuAk+l2a76zqn63BmqTJEla68x1tuY2wPrAJcAvgOXApeMuSpIkaW011zFneyUJXevZA4CXATsn+S1wYlW9YQ3UKEmStNaY85iz/uKzP0hyKXBZ//cYYHfAcCZJkjRCcx1z9iK6FrM9gevoLqNxInAknhAgSZI0cnO1nG0LfBp4SVVdP
P5yJEmS1m5zHXP20jVViCRJkubft6YkSZLWAMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1JCxhrMkeyU5N8l5SV45y/QkeU8//cwkuwxN2zTJp5P8KMk5Se4/zlolSZJaMLZwlmQd4FBgb2AnYL8kO82YbW9gh/7vAOCwoWnvBr5cVTsC9wLOGVetkiRJrRhny9nuwHlVdX5VXQscDewzY559gI9U5yRg0yR3SLIJ8GDggwBVdW1V2eG6JEmaeuMMZ1sCFw4NL+/HzWee7YEVwFFJTk/ygSS3GmOtkiRJTRhnOMss42qe8ywBdgEOq6r7AFcCNzlmDSDJAUmWJVm2YsWK1alXkiRp4sYZzpYDWw8NbwVcNM95lgPLq+rkfvyn6cLaTVTVEVW1a1XtunTp0pEULkmSNCnjDGenADsk2S7JesC+wDEz5jkG2L8/a3MP4LKquriqLgEuTHLXfr6HAz8cY62SJElNWGXH56ujqq5PchBwHLAOcGRVnZ3kwH764cCxwKOB84CrgGcNreKFwMf6YHf+jGmSJElTaWzhDKCqjqULYMPjDh+6XcALVrLsGcCu46xPkiSpNfYQIEmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQ8bafZOktdf7XvaFSZewYAe947GTLkGSbDmTJElqieFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIasmTSBUjSYvSWZzx50iUs2Gs++ulJlyBpHmw5kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGjLWcJZkryTnJjkvyStnmZ4k7+mnn5lklxnT10lyepIvjrNOSZKkVowtnCVZBzgU2BvYCdgvyU4zZtsb2KH/OwA4bMb0FwPnjKtGSZKk1oyz5Wx34LyqOr+qrgWOBvaZMc8+wEeqcxKwaZI7ACTZCvhr4ANjrFGSJKkp4wxnWwIXDg0v78fNd553Af8I/GlVd5LkgCTLkixbsWLF6lUsSZI0YeMMZ5llXM1nniSPAX5VVafOdSdVdURV7VpVuy5duvTm1ClJktSMcYaz5cDWQ8NbARfNc549gccl+Rnd7tCHJfno+EqVJElqwzjD2SnADkm2S7IesC9wzIx5jgH278/a3AO4rKourjZKDRQAACAASURBVKpXVdVWVbVtv9zXq+oZY6xVkiSpCUvGteKquj7JQcBxwDrAkVV1dpID++mHA8cCjwbOA64CnjWueiRJkhaDsYUzgKo6li6ADY87fOh2AS+YYx3fAL4xhvIkSZKaYw8BkiRJDRlry5kkaXE65y1fn3QJC3a31zxs0iVII2HLmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQ+whQJK01nnjG9846RIWZLHVq9Vjy5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmS
ZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDlky6AEmSNFqf/NTuky5hQZ7yN9+bdAlNMZxJkqRF416fPm7SJSzY95/8Vwua392akiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDDGeSJEkNMZxJkiQ1xHAmSZLUEMOZJElSQwxnkiRJDTGcSZIkNcRwJkmS1BDDmSRJUkMMZ5IkSQ0xnEmSJDXEcCZJktQQw5kkSVJDxhrOkuyV5Nwk5yV55SzTk+Q9/fQzk+zSj986yfFJzklydpIXj7NOSZKkVowtnCVZBzgU2BvYCdgvyU4zZtsb2KH/OwA4rB9/PfCyqrobsAfwglmWlSRJmjrjbDnbHTivqs6vqmuBo4F9ZsyzD/CR6pwEbJrkDlV1cVWdBlBVVwDnAFuOsVZJkqQmjDOcbQlcODS8nJsGrDnnSbItcB/g5JFXKEmS1JhxhrPMMq4WMk+SjYDPAAdX1eWz3klyQJJlSZatWLHiZhcrSZLUgnGGs+XA1kPDWwEXzXeeJOvSBbOPVdVnV3YnVXVEVe1aVbsuXbp0JIVLkiRNyjjD2SnADkm2S7IesC9wzIx5jgH278/a3AO4rKouThLgg8A5VfXOMdYoSZLUlCXjWnFVXZ/kIOA4YB3gyKo6O8mB/fTDgWOBRwPnAVcBz+oX3xP4W+CsJGf0415dVceOq15JkqQWjC2cAfRh6tgZ4w4ful3AC2ZZ7tvMfjyaJEnSVLOHAEmSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJaojhTJIkqSGGM0mSpIYYziRJkhpiOJMkSWqI4UySJKkhhjNJkqSGGM4kSZIaYjiTJElqiOFMkiSpIYYzSZKkhhjOJEmSGmI4kyRJashYw1mSvZKcm+S8JK+cZXqSvKeffmaSXea7rCRJ0jQaWzhLsg5wKLA3sBOwX5KdZsy2N7BD/3cAcNgClpUkSZo642w52x04r6rOr6prgaOBfWbMsw/wkeqcBGya5A7zXFaSJGnqjDOcbQlcODS8vB83n3nms6wkSdLUWTLGdWeWcTXPeeazbLeC5AC6XaIAv09y7rwrXD2bA78ex4rzb383jtUu1Ni2jzfM9vSucWPZvryoiW2DcT1/me7te+E7R73Gm2Vs773XfqyJ5298ny2vHctaF2os23fIIYeMepU311i276mzfu1PxHi+G1Y+6U6zjRxnOFsObD00vBVw0TznWW8eywJQVUcAR6xusQuVZFlV7bqm73dNcfsWN7dv8ZrmbQO3b7Fz+9aMce7WPAXYIcl2SdYD9gWOmTHPMcD+/VmbewCXVdXF81xWkiRp6oyt5ayqrk9yEHAcsA5wZFWdneTAfvrhwLHAo4HzgKuAZ61q2
XHVKkmS1Ipx7takqo6lC2DD4w4ful3AC+a7bGPW+K7UNcztW9zcvsVrmrcN3L7Fzu1bA9LlI0mSJLXA7pskSZIaYjiTJElqiOFMa60kr590DZq/JFsleWh/e/0kt5p0TZqfvks+SfPkMWcLlOROwA5V9bUkGwJLquqKSdc1LkmOqKoD5p5z8UlyQVVtM+k6VkeS05n9As2hO+dmlzVc0lgkeTZwEHDrqrpzkrsA76+qR0y4tJFIcnvgrcAdq2rvvi/h+1fVBydc2kgk+SnwaeCoqvrhpOsZlSRfYCUXSAeoqsetwXLGLslGwF2A86vq0knXs7r6S3Vd15+cSP/jbxfgh1X1pYnWZjibvyTPo+uNYLP+C2IH4PCqeviES1stSTZb2STg+1W11ZqsZ5SSXL6yScCGVTXWM5bHLcmdVzW9qn6ypmoZpyRn0PW5e3JV3acfd2ZV3XOylY1Gki8BRwGvqap7JVkCnF5V95hwaSORZGO661U+i26PzZHA0VW1svfnopDkIf3NJwJbAB/th/cDflZVr55IYSOS5P1V9Q/97QcC/w38BPgL4Pn9VRUWrSTfB/6yqn6X5BXAE+iuEvEQYFlVvWpitRnO5m8lXxBnLfYP0CR/BH7OjXuYGHSjtWVVrTeRwkYgyQXAblX1y1mmXVhVW8+ymBqT5KSq2iPJ6VV1n3432RmL/b03kOSUqtptsH39uDOq6t6Trm3UkjwY+DiwKV1r2j9X1XmTrWr1JPlmVT14rnGLTZLTBq3vSY4HXlZVpyXZHvhkC1fSXx1JflBVO/e3lwEPqqo/9D+OTpvkjz+POVuYa6rq2sFA/wROQ7o9n+7Xw3ZDf9tX1XbATULNIvMRVtJ3Gd2vwKmQZLckJyW5LMnVSa5ZRavhYvSdJP8IbNDvevgE8MUJ1zRKVya5Lf3nyaDHlMmWNDpJ1knyuCSfA94NvAPYHvgCbV/Pcr6W9oEFgCTbAUsnWM84bFJVpwFU1fl0F4hf7C5PsnN/+9fABv3tJUw4Hy3qXToTcEKSVwMbJnkk8A90Hy6L3buA2wAXzDLt7Wu4lpGqqpV2hVxV/zS4neTui7wXivcDzwCOpmvdfSY37p92sftHukMKfgS8mK73kP+YaEWj9VK6LurunOQ7dF/sT55sSSP1Y+B44F+r6rtD4z/dt6Qtdi8BvpHk/H54W+D5kytnZHZMcibdXpRtk9ym3wV4C2DdCdc2CgcCH+t3b/4KWJbkBOCedMeAToy7NRegf0E+B3gU3Yv1OOADtZY8iEkeWVVfnXQd4zDcfL8YJTm1qu47vJs9yXer6gGTrm119bswj6yqv5t0LePQf67sAXwPuCvdZ8u5VXXdRAsboSQPrKpvzxi3Z1V9Z1I1jVqS9YEd+8EfVdU1k6xnFPoT4IZdVFXXJdkceHBVfXYSdY1S//nyKLoTHZYAy4HjJn3Cg+FsAfpT96+uqj/2w+sA61fVVZOtbM1Y7AFmVYaP9VmMknwTeATdgdYXABcDz5uiA+a/Avz1NAWWYUlOrKr7T7qOcZnts2OaPk+S3JKu9fNOVfW8/mSxu1bVNO16X6kkn6mqJ026jnGZxPa5W3Nh/o/uC/D3/fCGwFeARd86MU+Ze5ZFa7H/Snkm3TESBwEvA3ZgunaLnQ98K8n/AFcORlbVeyZX0kh9JcmTgM9OU0t8kvvTfT4uTfLSoUmbMB3HLA0cBZwKDAL2cuBTTNdxkauy/dyzLGprfPsMZwuzQVUNghlV9fv+F9PaYmq+NKZNf4AuwNXA6yZZy5isAL4K3LL/mzYvBW4FXJ/kam64Tt0mky1rta0HbET3XbPx0PjLma4fD3euqqcm2Q+gP+Nvmn/MzjTt3w1rfPsMZwtzZZJdBmesJLkv8IcJ16TRuHbuWdrVn933BrozU//8vq6qu0ysqBGqqmkMnH9WVRvPPdfiU1Un0J1I9aGq+vmk6xmja/uLkg/Otr0zsOiPOdPkGM4W5mDgU0ku6ofvADx1g
vWsaT+bdAE3V5LnDF9tvT9e8LVVdQhAVe0xseJG4yi6MxpPBf444VpGLslXmeXXa1U9agLljNzKzlisqm+u6VpGKcm7qupg4H1JZnv+puUK+m8AvgxsneRjwJ50hxqsLaa9lXCNb58nBCxQknW54YyqH03TAcpJ/hk4pKqu74c3Ad5dVc+abGWrL8l/01308jnAbenCzAlV9fKJFjYiSU6uqvtNuo5xSTK8bRsAT6K77uArJlTSSPXdAA1sQHc5lFOr6mETKmkkkty3qk4dupL+jfQta1Ohv07dHnTfDSdV1a8nXNLIJLldVf1qxri7VtW5/e1HVdVXJlPd6mtx+wxnC5TkAXTXsBnedfSRiRU0Qkn+he6U4mfRdUXyXuC9VfW+iRY2IkmeChwKXAXsN2Wn8f9Lf/OzDO1OqaozJ1PR+CU5oapm/dJf7JJsDby9qvabdC2aW3982dOB7avqTUm2Abaoqu9NuLSRSHIu8Lqq+mQ//DLgOVW102QrG40Wt89wtgBJ/gu4M3AGN+w6qqp60eSqGq0kj6C7sO7v6K5js6i7VRnoT23/MHAWcDfgh8BLp+UyKEm+NcvoWuzdxwz0rbgDtwDuCxw2LcfUzdR/2Z+52LunSnIWq+4YfFou9XIY8CfgYVV1tyS3Ab5SVbtNuLSRSHIH4Ai6E45uD5xD15XT71e54CLR4vZ5zNnC7ArsNE2nug/rj3t5N/Am4B50x4k8u6ouWvWSi8IXgIOq6mv9F99LgVOAu0+2rNGoqgdNuoYxO5sb+nu9Hvgp8LyJVjRCSd7LDSHmFsC9ge9PrqKReUz//wX9///q/z+drgV7WtyvqnZJcjpAfxX9Rdsn8UxVdXGSLwOvoguhr5qWYAZtbp/hbGF+QLe77+JJFzIm/wb8TVX9ECDJE4Gvc8NVrxez3avqcuiak4B3JDlmwjWNTJLZWm8voztu6Qdrup4x2H7m8Z3p+radFsuGbl8PfHwadrsPztDsewPYc2jSK/tuqt40mcpG7rr+JKPB2ZpL6b7kp0J/Qs7FwM7AVsCR6Tp2n5Zjdpvbvmn6cFsTNgd+mOR73Pi4nmk54+j+g94PAKrqs30/Y9Pg+iSvA7YZvoI3XZ9/0+ABwG7ccNHLR9N1B/TiJB+rqndMrLLROBmYeTX5780yblGqqg8Pbve7xKapX1SAWw134dQfu3urCdc0Su8BPgfcPslb6K7httJ+fRehQ6vq8/3tS/vn71WTLGjEmts+jzlbgGk/4yjJ7ek6e92yqvZKshNdYPvgHIs2L8kn6C4zsX9V7dxfk+jEqrr3hEsbiSTHAU+uqiv64Y2BT9Kd1bhssR64m+R2dJesORp4Cjec0r4JXb+209CqS5JvAI+j+8F8Bt1Fd0+oqpeuarnFor8m5JHArftRlwLPHlwzchok2RF4eD/49ao6Z5L1jFrfz+YO/aEhGwJLBp8306C17bPlbAGmJYStwofoLjHxmn74/wc+ASz6cMb0X8F7G258QeRrgG2r6qoki/limH8NPJtuV8P7h8ZfwXT1hHDrqro8yXOBo6rqDUmm5kzbqjoVuFd/Ykeq6rJJ1zQGt6TrkqrouvabGkmeBxwAbEZ3UtxWwOHcEEYXtRa3z3C2AP1V2N9Ld7bfenRvxCunoIuVgc2r6pNJXgVQVdcnmZYLmk77Fbw/CZyYZNA0/zjgk0luBZw7ubJWT1UdBRyV5CmD09yn1JL+jLGncMOPo0UvyTOq6qMz+tVk8Luoqt45kcJGLMnrgb8BPkPXuntUkk9V1ZsnW9nIvIDu2nsnA1TVj/tW7WnR3PYZzhbmfcC+dB3a7grsT9fB9LS4sr+Q4iDA7EF3UPk0mOorePctLccCD6T7cnhxVZ3UT953cpWNRv+j4a/ozq7dYGj8WydX1Ui9CTgO+HZVnZJke6bjeMjBcWVT2T3VkP2A+1TV1QBJ3gacBkxLOLumqq4dhOr+ZJxpOiaque0znC1QVZ2XZJ3+wPmjknx30
jWN0EuBY4A792dSLWVKOieuqq8mOY0bruD94mm4gneSW1XVlf3uonP6v8G0TQZnqC52Sd5P18PDg+l2vT8JOGmVCy0iVfUpuh99g+Hz6bZxUauq/+j/HzLpWsbsZ3Q/Gq7uh9cHfjKxakbvhCSvBjZM8kjgH+guTzQtmts+TwhYgCTfBB4BfAC4hO7U22dW1b0mWtgI9b8YBt1TnTvz8gWLTZJVns232A9ITvKlqto7yYXc+Jde6K4ass2EShupJGdW1T2TfL+q7tWf8PCZmp6+Nd9O18ryB7oW3nsBB1fVRyda2Igk2Q54ITftXWUqznTvDyfYDRj0AftI4NvArwAW+4XKk9yCruu7R9F9thxHd0LOVASIFrfPcLYA/dkcv6Q73uwldGceHVpVi/oXUn89s5Wqqs+uqVpGLcnx/c0N6HZFf5/uzXdP4OSqeuCkatP8JfleVe2e5GRgH+A3wNnT0kNAkjOq6t5JngA8nu7z5fhp+eGX5Pt0JxadxdD1v6blJKskf7eq6cOXSpHmw92aC/P4qno3XdP1IQBJXkx3Vf3F7LH9/9vRXS/r6/3wQ4Fv0PXXuChV1UMBkhwNHFBVZ/XDOwOL/gKK6fpgvGyw+7Lv5WEfut0shy/2ls8hxybZlO5CyYPu06bpC2/d/v+j6S5A+9vpOpmYq6vqPZMuYlwG4SvJunQXMv1FzehIezHKlHe/1fL22XK2AElOq6pdZow7varuM6maRinJF4HnVdXF/fAd6FoGV9mythgMWibmGrfYJDmJ7vpmy5Pciy5Yv52u+62rquqAiRY4Av0uh92q6uR+eENgw6r67WQrG53+APLH0+3W3J3u+LovVtX9JlrYiCR5Gt3JU1/hxhfwXuyHFRwOvLeqzk5ya+BEuh8OmwEvr6qPT7TA1dTvLYKVdL9VVYu6h4eWt89wNg/9tbGeRncm3HAH05sA11fVIyZS2Igl+UFV7Tw0fAu6zpd3XsVii0KSjwNXAh+l+6X0DGCjqtpvooWtpsGxWP3tfwWoqlf0z933a5F3nD2Q5KSq2mPSdYxT3zPA5VX1x/4SKBtX1SWTrmsUkvwL8Ld0B8kPdmtWVT1sclWtviRnV9Xd+9sHA39ZVY9PsgXwpSn64f6dunH3W7OOW6xa3D53a87Pd+kO/t8cGO4G5wpgai4UCXyjv9L8x+kCzL7A8ateZNF4FvD3wIv74W8Ch02unJEZ3vf1MPprZFXVn5JM0y+vrybZp6r+Z9KFjEOSW9L9et+G7mKYd6Q7MeeLq1puEXkCXf+o1066kBEb3p5H0p9xW1WXTNlu6Wnvfqu57bPlbAH6X7N/6L/47kLXIfiXpui4nsHJAQ/qB79ZVZ+bZD1atSTvo9uFcjHdpRfu0l+vZwvgf6vqvhMtcESS/I7uBJxr6Hb9Dc5G3WyihY1Ipr97sU8AL5yG47CG9SccvQP4Bd0P2R37YLYE+EFNT/diU939VovbZzhbgCSn0gWX29BdY2kZ3X7pp0+0MM0pyZ7AG4E7ceNT+befVE2j0O++fBp9/5NVdWE/fhdgi6o6dpL1jUqSdWYb319vcNFLsqyqdh0+hnVw2ZBJ1zYK6foOvSdwCjc+5mxRX0qj/5H+HmAL4F1V9aF+/F8Bj6qql02wvJHLdHe/1dT2Gc4WYHBCQJIX0h2Q/PYpOyHgicD/R3fWZrihdWLRd0+V5Ed0lyc4le6AXQCq6jcTK2oNSvLtxX7ZkCT70u0ae2uSrYDbV9dn46LXX8z64cB3+s+YO9Odtbn7hEsbiSQPmW38tFxKYy5JXlVV/zLpOm6uJOvTtcxvy41/3C7qEwIGWtw+jzlbmCS5P92ZHM/px03TY/h24LFVdc6ccy4+l1XVlyZdxAQt6uND+t2369L1EPBW4Cq6jol3m2RdIzTt3YutFSFsFf4GWLThDPgfuq78TmW6+iQeaG77pilYrAkHA68CPtefOr0903PAPMAvpzSYARzfn834WaboVP4FWOxN5
A/oW5ROB+ivA7bepIsahXRHjv8IeCJT1r3YQJIruOE1uB5d0L5yGlrl52mxnx2wVVXtNekixqi57TOcLUD/6++EoeHzgUXdLccMy/oDdz/PjQPMor0I7ZDB9aJ2HRpXdGc4qn3X9cfXFUCS2zJ0pfnFrKoqyef7kzf+d9L1jENV3ajj8ySPp7ue29pisf84+m6Sewwu4j2Fmts+w9k8JHlXVR2c5AvM8iZb7Ae1DtmEbnfRcH+FxSLuIWBg0FPAWmyx/3I/FPgMsDTJIcBT6HvpmBInJdmtqk6ZdCFrQlV9PskrJ13HGrTY338PBJ75/9q79yDJ6vKM49+HhQC6664XIjFyCcglsFlWLlEBUVBRokZRES0jaCwRLUHBaLQ0IJoiJOhaqIlKIQQoRAS1CohcIomAq8ht1wUUxXApNFoiF0FYwy48+eOcHnpnZ2bHndPzO+fs86nq6j6nu6fft3qm59e/2yvpDqov7oP5yJ2uEDCkdfmlcTY9g12DP1U0ihGz/fbSMTRN0rFT3W97yWzFMkqStgZ+bfv39fHmwDMGqzfp+Pwl22fVq6UHGz4fYvvmkjE1bH/gSEl3Um2WXPyfQ5O0Zv3ejah6sLvem/SHOL90ADN0UOkARqx1+aVxNg2DFWG2r5S0RX37nrJRNUfS55i6vliXh27nrfshvfANqrqoA49T9TT9JYDtH5YIqmFzgFVUv6sbFY6laa3759CwVw/dXk1V+/U1ZUJpznQ/O22fOGtBNUjSYB/Bh4oGMiJtzi+Ns2moJ+weD7yX6hvtRpJWU9VU68NS4utLBzAqtqc19NX1pe7AxsO7r9v+v3p5eC9I+ijVfm7fpPob/Iqkczr+niFpM+BI4DnATcCXba8uG1Xz+tgrXxt8du4D7AKcVx8fQrXyr+tuoGp8TjQsa6DT+0TS4vyyz9k0SDoG+CvgCNt31Oe2oyr/c6ntz5SMb7ZI+pzto0rHMQqaoKh9l0i6Avj0YNNZSa8CPtCXuXaSfgzsYfuR+vhJwA22/7xsZDNTL8BZRVWz9yDgLtvvm/pZ3dHzXvkxdaWAAwfVYiRtAlzel7+/dZG0q+1bSscxKiXyS8/Z9BwGvGx4abvt2yX9DXA5sEE0zqi+HfZV1yfsvpuqN+lf6+N7qIq798VdrPl5tTFwe6FYmrSL6+L0kr4MXFs4nqYN98qfQDUC0UfPoppCcV99PLc+t6E4G+jsl9tpmPX80jibnk0m2nPI9j31N6Tovk53Idv+KbCnpAX18QOFQ2raI8Atki6jeq8OBL4raQmA7SkXfrTYWF1e26t7Viwb22cObkt6//Bxz5wELKt70ABeRFUubkPRr1/ctc16fmmcTc+j63lfdEcnP1wkvdn2uZKOHnceANufLRJY8/6DNfcAu6ZUIA3bTdKD9W0Bm9fHvSmdNqTTX4CmYvsMSZfwxH6KH7b9q5IxzbLevre1Wc8vjbPpGf4AHSZgs9kOpqBONmCmqatL3RfU11sUjWLEbH+5dAyjYHvCgu7RSXOophNsDOwoaUfbVxWOKToqjbNp2NA+QCU92fbDE9x1yqwHM0OS/gW43fYXx50/BtjS9t9Dd5e6A1vV18t6UslhQpJeAXwS2Ibqc2vQs/S0KZ8YRY0r2/Skcb2EvekZlPTPwKHALTxRucLAhtI46/sI0qznl9WaMUbS3sBpwFzbW0vaDXiX7fcUDm29SfoRsND24+PObwSssL2wTGTNkHQTsBi4rsurTddF0s+oqgLcxFDZJtuPFQsqoibpJ8Ai260omt00SZ+wfdzQ8RzgLNtvKRhWY9qYX982coyZ+QzwcuBeGNu4dL+iEc2cxzfM6pOP049h2v+kWiG2SNJ9Q5f7Jd23rid3yM+B5bZX2X5scCkdVETtdqpi7n21taSPANT7J34TuK1sSI1qXX4Z1ow12L573Iqxrv8DfETSDrbX+EOTtAOwslBMTfoQ8AHgYqAvNV4n8iHgIknfoap9B/RqwUN02yPA8
nq/weHfz17s4wa8HTinbsDsD1zSs/09W5dfGmcx7O56aNOS/gg4Gvhx4Zhm6jjgEkn/yBM7du8JfAR4f7GomvMD23tIuqfnPUknUG07sYChYc2IlriwvvSKpOGpEqcAXwKWAldK2t32jWUia0ab88ucsxgj6RlUv6AvpRryuxx4n+17iwY2Q5IWAh8EBvPLbgFOtn1TuaiaIelm4J+ATwDHjL/fdi/+YUi6wfYepeOI2JAM7ds2Eds+YNaCGYE255fGWUSHSXoRVSWA1wHfGne3bR82+1E1r151e6nt/yodS8SApK/ZfmO9MGetf6a2FxUIK3ogjbPodf07SVP2HNnuxTwtSe+y/aXScYyKpPuB+VRzex4lW2lEC9RzlL4N3M9QtYcB23fNelAjIOmZwInAs2wfJGkX4AV92X+wjfmlcRZIOry+uQ+wC3BefXwIVXHptYbLukLSPcDdwLnADxi3QtP2lSXiGgVJO1O9f2MbI9v+SrmImlMvbV9Lz+fZRctJ+hSwN7AzsAL47nyZpQAAClBJREFUHtWcpe/b7s1q6br6wRnAR23vJmljqr0V/6JwaI1oY35pnMWYevz9QNur6uNNgMtt7182svVX/1N/GfBmYBFVCaBzbd9SNLCGSfoYVb3JnYHLqLZE+a7t1xUNrEGS3gRsZ/tESc8Gnmn7hnU9L2LU6gVUe1I11F5QXx6wvUvRwBoi6Trbe0laZvu59bnltheXjq0Jbcwv+5zFsGcB84aO59bnOqveD+tS24cDzwd+BnxH0lGFQ2vaoVRLwH9p+63AbvRoNbakz1Pl99b61CPAFyd/RsSs2hx4CtXQ+3zgf6l66vviYUlPp57+Iun5wG/LhtSo1uXXmw/vaMRJwLKhFSwvAj5eLpxm1JsKvpKq92xb4LNA30odrbT9mKTVkuYBvwK2Kx1Ug/a2vbukZQC276t7KyKKkXQqsCvwEFVj7HvAEtv3Fw2secdSbRWyvaSlVLV831A2pEa1Lr80zmKM7TPqsffn1ac+bPtXJWOaKUlnUm2hcQlwgu2bC4c0KsskLQBOB64HHgQ6vQfROKvqkluDb7ZPJ/udRXlbA5tS7Sb/C6pKFg8UjWgEbN9YrwzfiWre7k8G01/6oI35Zc5ZIGln27eO25BvTJc3GpT0ODAo4j78y96bwsuqSjpsafuX9fFzgKd0+X0bkLSx7dWSDgMOpprXczpVnc0TbH+1aICxwav//nalmm+2N9WXwfuoFgUcXzK2pkh6ElXv0ja231lXWNnJ9sWFQ2tEG/NL4yyQdKrtI4aGM9f4pejyRoPDEzz7rK+btEq6cVDQXdKuPLFB8rd73AsaHVQvUtmHqoH2KuDptheUjaoZks6jqrBymO2Fkjananz2ZUFA6/LLsGYAnCZpy8GqzHprjdcDd9L9OWcbyrePa0uXGxmRsa1P6hW2vVplG90m6Wiqxtg+VPucLQW+T9W72/kKJEO2t32opDcD2F6pcUWYO651+aVxFlCtenspgKT9qMoBHQUsBk6l2xM//1jSsZPdaXvJbAbTtMGwH7Av8E5J/0M1jDsYtp1wqLpDtujz+xedty1wAXDMYFpBTz1a9yYN5nxuz1CB9x5oXX5pnAXAnKENEw8FTrX9deDrkpYXjKsJc6i2BOnTt7xh1wK7A68tHciI9P39iw6zPekXh575OHApsJWkc6h6Ct9WMqCGfZyW5Zc5ZzEonr24nnh9K3CE7asG99leOPVPaK/hOUt91Pc5dX1//yK6ol4h/XyqL0rX2P5N4ZAa1bb80nMWUJU2ulLSb4CVwNUwtuqv6xsN9r3Hpe/Dfn1//yJaT9LZwFXA1bZvLR1P09qYX3rOAhjbEflPqMo1PVyf2xGY2+VJ5pKe1qcad+NJ+iXwBSZpxNg+YXYjalbf37+ILpB0ANW81hdSbW69HLjK9ilFA2tIG/NL4yyiwzLsFxGzoa5TvBdVGbUjqaqS7Fw2qua0Lb8Ma0Z0W4b9ImKkJ
F0BPJlqm5Crgb1s/7psVM1pY34pfB7RbS8pHUBE9N4K4FGq6geLgMFGrX3RuvwyrBkRERHrJGku8Hbg76hKxm1aOKRGtSm/DGtGRETEpCS9l2qy/B7AXVQVEK4uGlSD2phfGmcRERExlc2BJcANdUWSNUh6qu37Zz+sxrQuvwxrRkRExHrr+6rxEvllQUBERETMRN9Xjc96fmmcRURExEz0fQhu1vNL4ywiIiKiRdI4i4iIiLVI+rPpPnSkgYxIm/NL4ywiIiImcgGM7aA/la5uht3a/LKVRkRERExkI0nHAztKOnb8nbaX1Nf3zXpkzWhtfuk5i4iIiIm8Cfg9VUfOvAkuXdfa/LLPWURERExK0kG2Lykdx6i0Mb80ziIiImJSkuYDxwP71aeuBD5h+7flompOG/PLsGZERERM5XTgIeCN9eVB4IyiETWrdfml5ywiIiImJWm57cXrOtdVbcwvPWcRERExlZWS9h0cSNoHWFkwnqa1Lr/0nEVERMSkJO0GnAXMr0/dDxxue0W5qJrTxvzSOIuIiIh1kvQUANsPjjt/uO0zy0TVnDbll8ZZRERErDdJN9revXQco1Iiv8w5i4iIiJnoZG3NP0Bqa0ZERESn9H0IbtbzS+MsIiIiZiI9Zw1L4ywiIiImJWnOOh6ydFYCGZE25pcFARERETEpSXcAFwBn2P5R6Xia1sb80nMWERERU1kE/BQ4TdI1ko4YbDvRE63LLz1nERERMS2S9gPOBRZQ9TZ90vbPykbVnLbkl56ziIiImJSkOZL+WtI3gVOATwPbARcB3yoaXAPamN/GJV40IiIiOuM24L+Bk21/b+j8BXVPU9e1Lr8Ma0ZERMSkJM21/bvScYxKG/NL4ywiIiImJWkz4B3ArsBmg/O2/7ZYUA1qY36ZcxYRERFTORvYEng5cCXwbOChohE1q3X5pecsIiIiJiVpme3nSlphe5GkTYDLbB9QOrYmtDG/9JxFRETEVFbV1w9IWgjMB7YtF07jWpdfVmtGRETEVE6V9FTgY8CFwFzgH8qG1KjW5ZdhzYiIiFiLpGMnOl1f2/aS2YynaW3OLz1nERERMZF59fVOwF5UvUoArwauKhJRs1qbX3rOIiIiYlKSLgdeb/uh+ngecL7tV5SNrBltzC8LAiIiImIqWwOPDh0/Sr8WBLQuvwxrRkRExFTOBq6ta08aOBg4s2xIjWpdfhnWjIiIiClJ2h14YX14le1lJeNpWtvyS+MsIiIiokUy5ywiIiKiRdI4i4iIiGiRNM4iolckPSZp+dBl2/X4GQskvaf56CIi1i1zziKiVyT9zvbcGf6MbYGLbS/8A583x/ZjM3ntiIj0nEVE70maI+lkSddJWiHpXfX5uZKukHSjpJskvaZ+yknA9nXP28mSXizp4qGf93lJb6tv3ynpOEnfBQ6RtL2kSyXdIOlqSTvXjztE0s2SfiipD7urR8SIZJ+ziOibzSUtr2/fYftg4B3Ab23vJWlTYGm9K/jdwMG2H5T0DOAaSRcCHwYW2l4MIOnF63jN39vet37sFcCRtm+T9Dzg34ADgOOAl9v+haQFzaYcEX2SxllE9M3KQaNqyIHAIklvqI/nAzsAPwdOlLQf8Djwp8Az1+M1z4OqJw7YGzhfGtRPZtP6einw75K+BnxjPV4jIjYQaZxFxIZAwFG2L1vjZDU0uQWwh+1Vku4ENpvg+atZcxrI+Mc8XF9vBDwwQeMQ20fWPWmvBJZLWmz73vVJJiL6LXPOImJDcBnwbkmbAEjaUdKTqXrQfl03zPYHtqkf/xAwb+j5dwG7SNpU0nzgJRO9iO0HgTskHVK/jiTtVt/e3vYPbB8H/AbYqvk0I6IP0nMWERuC06gKGd+oarzxHuC1wDnARZKuB5YDtwLYvlfSUkk3A5fY/mA9HLkCuA2YqrTLW4AvSPoYsAnwVeCHwMmSdqDqxbuiPhcRsZZspRERERHRIhnWjIiIiGiRNM4iIiIiWiSNs4iIi
IgWSeMsIiIiokXSOIuIiIhokTTOIiIiIlokjbOIiIiIFknjLCIiIqJF/h+kuYWA4X3SUgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of GBT Grid Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_gbt_t_new['column'][:10], y=feat_imp_tuned_gbt_t_new['weight'][:10],data=feat_imp_tuned_gbt_t_new)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from GBT Best tuned\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Trees Binary Classification Base Model" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Create initial Decision Tree Model\n", + "dt = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Pipeline with stages created for DT Model \n", + "\n", + "dt_pipe = Pipeline(stages=[label_stringIdx, va, dt])\n", + "\n", + "# Train model with Training Data\n", + "\n", + "dtModel = dt_pipe.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "# Binary Class Evaluator Initialize\n", + "\n", + "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform the test data to get prediction from the model for the test data\n", + "\n", + "pred_dt = dtModel.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.7039599188458128\n" + ] + } + ], + "source": [ + "# AUC Score from the evaluator for the test data\n", + "\n", + "print(\"AUC Score 
is\",evaluator.evaluate(pred_dt))" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6643584521384929" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Accuracy Calculation for DT Base Model\n", + "\n", + "binary_prediction=pred_dt.select(\"prediction\").collect()\n", + "\n", + "binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_dtb=pred_dt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.86 0.60 0.70 131790\n", + " 1 0.49 0.81 0.61 64610\n", + "\n", + " accuracy 0.66 196400\n", + " macro avg 0.68 0.70 0.66 196400\n", + "weighted avg 0.74 0.66 0.67 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_dtb,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + 
"# Creating Pandas Dataframe for Features and their Importance of DT Base Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtb= pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Binary Base Model')" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Base Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtb['column'][:10], y=feat_imp_tuned_dtb['weight'][:10],data=feat_imp_tuned_dtb)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Binary Base Model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Binary Classification Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing DT Grid Pipeline\n", + "\n", + "dt_new = DecisionTreeClassifier(labelCol=\"label\", featuresCol=\"features\",seed=42)\n", + "\n", + "# Creating pipeline for DT Grid Model \n", + "\n", + "dt_new_pipe = Pipeline(stages=[label_stringIdx, va, dt_new])\n", + "\n", + "# Binary Evaluator Initializing\n", + "\n", + "evaluator = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "\n", + "# Creating Grid Search for Hyper Parameter Tuning for DT Model\n", + "\n", + "grid_dt = ParamGridBuilder().addGrid(dt_new.maxDepth, [10,15,30]).addGrid(dt_new.minInstancesPerNode, [500,1000,1500]).addGrid(dt_new.maxBins,[20,35,50]).build()\n", + "\n", + "# Cross Validator Pipeline with 5 fold cv to fit the training data\n", + "\n", + "cv1_dt = CrossValidator(estimator=dt_new_pipe,estimatorParamMaps=grid_dt, numFolds=5, evaluator=evaluator,seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "# Fitting the train data using the 5-fold Cross validator pipeline\n", + "\n", + "dtModel_t = cv1_dt.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": 
[], + "source": [ + "# Predicting the test data using the fitted pipeline\n", + "\n", + "pred_dtt = dtModel_t.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score 0.6710458592255817\n" + ] + } + ], + "source": [ + "# AUC Score for the fitted pipeline for test data\n", + "\n", + "print(\"AUC Score\", evaluator.evaluate(pred_dtt))" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='featuresCol', doc='features column name.'): 'features',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='seed', doc='random seed.'): 42,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. 
Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxBins', doc='Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.'): 50,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. 
Should be >= 1.'): 1500,\n", + " Param(parent='DecisionTreeClassifier_0bccbe6ff0a3', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0}" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper parameters for the DT Grid Search Model\n", + "\n", + "dtModel_t.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_dtbt=pred_dtt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.88 0.57 0.70 131790\n", + " 1 0.49 0.85 0.62 64610\n", + "\n", + " accuracy 0.66 196400\n", + " macro avg 0.69 0.71 0.66 196400\n", + "weighted avg 0.76 0.66 0.67 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_dtbt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + 
"outputs": [], + "source": [ + "# Creating Pandas Dataframe for Features and their Importance of DT Grid Model for Binary Classification\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_dtbt= pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], dtModel_t.bestModel.stages[-1].featureImportances)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 10 Features based on Importance from DT Binary Grid Model')" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting top 10 Features from Feature Importance of DT Grid Model for Binary Classification\n", + "\n", + "plt.figure(figsize=(10,10))\n", + "sns.barplot(x=feat_imp_tuned_dtbt['column'][:10], y=feat_imp_tuned_dtbt['weight'][:10],data=feat_imp_tuned_dtbt)\n", + "plt.xticks(rotation=90)\n", + "plt.xlabel(\"Features\")\n", + "plt.ylabel(\"Weights\")\n", + "plt.title(\"Top 10 Features based on Importance from DT Binary Grid Model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic Regression " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic Base Model Binary" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard Scaler to standardize the output of vector assembler before feeding it to Logistic Regression\n", + "\n", + "center = feature.StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='centered_features')" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "# Create initial LogisticRegression model\n", + "lr = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")\n", + "\n", + "# Pipeline for training data \n", + "\n", + "lrModel = Pipeline(stages=[label_stringIdx,va, center, lr])\n", + "\n", + "# Fit the train data using LR model\n", + "\n", + "lr_fit = lrModel.fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "# Transsform test data to predict Severity by using fitted pipeline from training data\n", + "\n", + "pred_lrb = lr_fit.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test 
Area Under ROC for Logistic Base Model 0.7619402091983631\n" + ] + } + ], + "source": [ + "# Evaluator to get AUC Score for test data\n", + "\n", + "evaluator_lrb = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')\n", + "print('Test Area Under ROC for Logistic Base Model ', evaluator_lrb.evaluate(lr_fit.transform(us_test_cat)))" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set areaUnderROC: 0.7627619042131226\n" + ] + } + ], + "source": [ + "# ROC Curve for LR Base Model\n", + "\n", + "trainingSummary = lr_fit.stages[-1].summary\n", + "roc = trainingSummary.roc.toPandas()\n", + "plt.plot(roc['FPR'],roc['TPR'])\n", + "plt.ylabel('False Positive Rate')\n", + "plt.xlabel('True Positive Rate')\n", + "plt.title('ROC Curve')\n", + "plt.grid(True)\n", + "plt.show()\n", + "print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "# Logistic Regression Coefficients to Array \n", + "\n", + "coef_l1=lr_fit.stages[-1].coefficientMatrix.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "# to stack the coefficient array column wise for further analysis\n", + "\n", + "cof_l1=np.hstack(coef_l1)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating pandas dataframe with Logistic Regression weights for each variable along with variable name\n", + "\n", + "pd.set_option('display.max_rows', None)\n", + "feat_imp_tuned_b = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], cof_l1)),\n", + " columns = ['column', 'weight']).sort_values('weight',ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "# Coefficient from LR model for each variable\n", + "\n", + "coef_L1=lr_fit.stages[-1].coefficients.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 6.10404226e-02, 3.84503967e-01, 5.55462923e-01, 2.83682423e-02,\n", + " 
3.11545767e-01, 5.72957889e-01, 2.68684159e-01, 6.40513508e-01,\n", + " 2.39662302e-01, 7.29093517e-01, 3.38870399e-01, 6.35365038e-01,\n", + " 3.42524554e-01, 3.00627117e-01, 2.86496247e-01, 3.33305131e-01,\n", + " 1.74724528e-01, 7.28250150e-01, 4.11683887e-01, 1.01770132e-01,\n", + " 7.77776525e-02, 4.80918989e-01, 1.93834032e-01, 6.10964689e-02,\n", + " 7.07937460e-02, -2.46781145e-02, 3.25562566e-02, 1.93521868e+00,\n", + " 6.98425343e-01, 2.44544359e-01, -2.92797816e-01, 4.20970924e-01,\n", + " 1.18834527e+00, -4.98702447e-01, 1.49876162e+00, -3.76084559e-01,\n", + " 8.73144768e-01, 1.01459587e+00, -3.68386095e-01, 8.15470313e-01,\n", + " 1.02401163e+00, 1.33550551e+00, 9.34355323e-01, -5.02078892e-02,\n", + " 1.90952537e+00, 1.15563651e+00, 6.80595337e-01, 5.58708379e-02,\n", + " 2.25339626e-02, 2.20958954e-02, 7.43138848e-04, 6.98986882e-01,\n", + " 6.92230035e-01, 3.43491620e-01, 1.09905601e-01, -6.59910295e-02,\n", + " 2.73193089e-01, 6.57264786e-02, 1.43917276e-01, 6.74550394e-05,\n", + " 7.00180998e-02, 2.05815870e-01, 3.62644772e-01, 2.44123017e-01,\n", + " 8.74585264e-02, 2.65865377e-02, 1.39148506e-02, -3.64614378e-02,\n", + " 3.60980672e-02, 5.39018526e-02, 6.97262794e-02, 3.48474891e-02,\n", + " 7.32900545e-02, 9.03299822e-03, 3.43603153e-02, 1.55938700e-02,\n", + " 1.39685633e-02, 3.34354498e-02, 4.88498457e-02, 7.42683555e-02,\n", + " 6.41895294e-03, -1.52099373e+00, -2.92797816e-01, 3.21061922e-01,\n", + " 1.12504363e-01, 9.90974565e-02, 8.69937407e-01, -3.11252258e-01,\n", + " -4.64919144e-02, 7.05797649e-02, 2.02713308e-01, -2.74288756e-03,\n", + " 4.03122329e-01, 2.52918849e-01, 3.42967380e-01, -3.63234523e-01,\n", + " -1.65340324e-01, 2.05437823e-01, 2.01133559e-01, 3.13500571e-01,\n", + " 3.03022094e-01, -3.83431591e-03, 1.29392900e-04, 5.04152924e-02,\n", + " 1.42110421e-02, 9.75298836e-03, 3.16315594e-02, -7.36661603e-01,\n", + " 3.93267264e-01, -6.70645537e-01, 3.50216355e-01, 1.91966480e-01,\n", + " 3.17809826e-02, 1.69946149e-01, 
-2.11351631e+00, -4.39883557e-01,\n", + " -1.60607312e+00, -9.26287099e-01, -1.28426974e+00])" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coef_L1" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features are 119\n", + "Eliminated features out of 119 are 0\n" + ] + } + ], + "source": [ + "# Taking absolute values of weights and calculating the number of features eliminated by LR Model after L1 regularization \n", + "\n", + "coef_L1 = np.absolute(coef_L1)\n", + "\n", + "print('Total number of features are',len(coef_L1))\n", + "\n", + "sorted_abs = np.sort(coef_L1)\n", + "\n", + "weights_notzero = sorted_abs[sorted_abs == 0]\n", + "nonzero_weights = len(sorted_abs[sorted_abs == 0])\n", + "\n", + "print('Eliminated features out of ' + str(len(coef_L1)) + ' are', nonzero_weights)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_lrbal=pred_lrb.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.85 0.63 0.72 131790\n", + " 1 0.50 0.77 0.61 64610\n", + "\n", + " accuracy 0.67 196400\n", + " macro avg 
0.67 0.70 0.66 196400\n", + "weighted avg 0.73 0.67 0.68 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_lrbal,y_true=true_labels))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic Regression Binary Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [], + "source": [ + "# Logistic Regression Pipeline initialization\n", + "\n", + "lr_new = LogisticRegression(labelCol=\"label\", featuresCol=\"centered_features\")" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [], + "source": [ + "# Grid Search for tuning the hyper parameters of Logistic Regression Model\n", + "\n", + "paramGrid_lr = ParamGridBuilder().addGrid(lr_new.regParam, [0.01, 0.04,0.07]).addGrid(lr_new.elasticNetParam, [0.1,0.4,0.7]).build()" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating pipeline to be used for fitting the training data\n", + "\n", + "cvModel_lrbal = Pipeline(stages=[label_stringIdx,va,center,lr_new])" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Binary Evaluator for evaluating the model performance\n", + "\n", + "evaluator_lrbt = BinaryClassificationEvaluator(labelCol='label',metricName='areaUnderROC')" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [], + "source": [ + "# Cross validator pipeline initialization for 5-fold cross validation and fitting the train data\n", + "\n", + "cv_lrbal = CrossValidator(estimator=cvModel_lrbal, estimatorParamMaps=paramGrid_lr, evaluator=evaluator_lrbt, numFolds=5,seed=42).fit(us_train_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [], +
"source": [ + "# Testing the test data on fitted 5 fold cv pipeline\n", + "\n", + "pred_lrbalt=cv_lrbal.transform(us_test_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC Score is 0.7612376191461516\n" + ] + } + ], + "source": [ + "# Accuracy calculation from the Multiclass evaluator\n", + "\n", + "print(\"AUC Score is\",evaluator_lrbt.evaluate(pred_lrbalt))" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set areaUnderROC: 0.7622374995379829\n" + ] + } + ], + "source": [ + "# ROC Curve for Logistic Regression Grid Search Model\n", + "\n", + "trainingSummary_t = cv_lrbal.bestModel.stages[-1].summary\n", + "roc = trainingSummary_t.roc.toPandas()\n", + "plt.plot(roc['FPR'],roc['TPR'])\n", + "plt.ylabel('False Positive Rate')\n", + "plt.xlabel('True Positive Rate')\n", + "plt.title('ROC Curve')\n", + "plt.grid(True)\n", + "plt.show()\n", + "print('Training set areaUnderROC: ' + str(trainingSummary_t.areaUnderROC))" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{Param(parent='LogisticRegression_f000cca535b6', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='featuresCol', doc='features column name.'): 'centered_features',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='fitIntercept', doc='whether to fit an intercept term.'): True,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='labelCol', doc='label column name.'): 'label',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='predictionCol', doc='prediction column name.'): 'prediction',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! 
These probabilities should be treated as confidences, not precise probabilities.'): 'probability',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='standardization', doc='whether to standardize the training features before fitting the model.'): True,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.5,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',\n", + " Param(parent='LogisticRegression_f000cca535b6', name='maxIter', doc='max number of iterations (>= 0).'): 100,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='regParam', doc='regularization parameter (>= 0).'): 0.01,\n", + " Param(parent='LogisticRegression_f000cca535b6', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-06}" + ] + }, + "execution_count": 212, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best Model Hyper parameters after Tuning LR Model\n", + "\n", + "cv_lrbal.bestModel.stages[-1].extractParamMap()" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6688034623217922" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Accuracy Calculation for the test data from the model\n", + "\n", + "binary_prediction=pred_lrbalt.select(\"prediction\").collect()\n", + "\n", + 
"binary_true_labels=us_test_cat.select(\"Severity\").collect()\n", + "\n", + "np.sum(list([int(binary_true_labels[i][0]==binary_prediction[i][0]) for i in range(len(true_labels))]))/len(true_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction output from the model to pandas\n", + "\n", + "prediction_lrbalt=pred_lrbalt.toPandas()[\"prediction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [], + "source": [ + "# True Labels from test data for Target Variable\n", + "\n", + "true_labels=us_test_cat.toPandas()[\"Severity\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [], + "source": [ + "# Initializing Classification Report from sklearn\n", + "\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.85 0.61 0.71 131790\n", + " 1 0.50 0.78 0.61 64610\n", + "\n", + " accuracy 0.67 196400\n", + " macro avg 0.67 0.70 0.66 196400\n", + "weighted avg 0.74 0.67 0.68 196400\n", + "\n" + ] + } + ], + "source": [ + "# Classification Report Generation for all metrics display at once\n", + "\n", + "print(classification_report(y_pred=prediction_dtbalt,y_true=true_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [], + "source": [ + "# Coefficient from LR model for each variable\n", + "\n", + "coef_L1_m=cv_lrbal.bestModel.stages[-1].coefficients.toArray()" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.05730593, 0.1631614 , 0.22683363, -0.0922915 , 0.12743824,\n", + " 0.22850115, 0.0874906 , 0.29489116, 0. 
, 0.37672293,\n", + " 0.15445802, 0.28753523, 0.14824354, 0.00970946, 0.10372804,\n", + " 0.08653566, 0. , 0.37548958, 0.13350159, -0.01698079,\n", + " -0.04548731, 0.17180723, 0.01903713, 0.10730425, 0.08693924,\n", + " 0.01656798, 0.06626178, 1.65740542, 0.5776856 , 0. ,\n", + " -0.25852766, 0.34445895, 0.99689203, -0.21432981, 1.33791048,\n", + " 0. , 0.80841273, 0.59158904, -0.23910861, 0.58651763,\n", + " 0.90778268, 0.11721702, 0.81946075, 0. , 1.58545586,\n", + " 0.78832342, 0.50243283, 0.0249303 , 0. , 0. ,\n", + " -0.01444565, 0.63605211, 0.62860006, 0.22391409, 0.02995857,\n", + " -0.11490668, 0.14692048, 0. , 0.02469895, -0.06192017,\n", + " 0. , 0.10814645, 0.23145346, 0.13481408, 0.03663532,\n", + " 0. , 0. , -0.03897284, 0. , 0. ,\n", + " 0.02532982, 0.00652907, 0.01371828, 0. , 0. ,\n", + " 0. , 0. , 0. , 0.01066444, 0.02208044,\n", + " 0. , -1.41648022, -0.25852766, 0.26103542, 0. ,\n", + " 0. , 0. , -0.22986335, 0.0026555 , 0.04903758,\n", + " 0.14350738, 0. , 0.21757303, 0.08485045, 0.0949972 ,\n", + " -0.44105302, 0. , 0.01732823, 0.07618706, 0. ,\n", + " 0.23152556, -0.00195721, 0. , 0.0395699 , 0.00836024,\n", + " 0.01054067, 0.00280582, -0.64456 , 0. , -0.63672179,\n", + " 0.17710428, 0.18114564, 0. , 0.0416721 , -0.79625681,\n", + " -0.37949686, -1.40585049, -0.38560954, -1.20185474])" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coef_L1_m" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas dataframe of weights of variables with variable names to find which variables are eliminated\n", + "\n", + "feat_imp_tuned_lrt = pd.DataFrame(list(zip([i for i in us_train_cat.columns if i!='Severity'], np.absolute(coef_L1_m))),\n", + " columns = ['column', 'weight']).sort_values('weight')" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columnweight
80Wind_Direction_Index_120.0
65Wind_Direction_Index_70.0
85clear0.0
84cloud0.0
57month_of_year_Index_110.0
29TMC_Index_180.0
77Wind_Direction_Index_50.0
76Wind_Direction_Index_150.0
75Wind_Direction_Index_60.0
74Wind_Direction_Index_130.0
\n", + "
" + ], + "text/plain": [ + " column weight\n", + "80 Wind_Direction_Index_12 0.0\n", + "65 Wind_Direction_Index_7 0.0\n", + "85 clear 0.0\n", + "84 cloud 0.0\n", + "57 month_of_year_Index_11 0.0\n", + "29 TMC_Index_18 0.0\n", + "77 Wind_Direction_Index_5 0.0\n", + "76 Wind_Direction_Index_15 0.0\n", + "75 Wind_Direction_Index_6 0.0\n", + "74 Wind_Direction_Index_13 0.0" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample of 10 features eliminated by the Logistic Regression Model after L1 Regularization\n", + "\n", + "feat_imp_tuned_lrt[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of features are 119\n", + "Eliminated features out of 119 are 28\n" + ] + } + ], + "source": [ + "# Taking absolute values of weights and calculating the number of features eliminated by LR Model after L1 regularization \n", + "\n", + "coef_L1_m = np.absolute(coef_L1_m)\n", + "\n", + "print('Total number of features are',len(coef_L1_m))\n", + "\n", + "sorted_abs = np.sort(coef_L1_m)\n", + "\n", + "weights_notzero = sorted_abs[sorted_abs == 0]\n", + "nonzero_weights = len(sorted_abs[sorted_abs == 0])\n", + "\n", + "print('Eliminated features out of ' + str(len(coef_L1_m)) + ' are', len(weights_notzero))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + 
"bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/StringIndexing_OHE_Conversion.ipynb b/StringIndexing_OHE_Conversion.ipynb new file mode 100644 index 0000000..44e237f --- /dev/null +++ b/StringIndexing_OHE_Conversion.ipynb @@ -0,0 +1,504 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#importing all required libraries\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import StringIndexer\n", + "from pyspark.ml.feature import OneHotEncoderEstimator" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = 
os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# reads the cleaned data file\n", + "data=spark.read.csv(get_training_filename(\"Us_clean.csv\"),inferSchema=True,header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Extracts hour of the day from the start time of the accident and stores in a new column named Hour \n", + "data=data.withColumn(\"Hour\", date_format(col(\"Start_Time\"), \"H\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping column Start time,end time, timezone, start latitude and end 
latitude as they won't be helpful in predicting\n", + "# the severity of the accident\n", + "# The columns city, county and state have high cardinality so we have dropped them\n", + "drop_col=[\"Start_Time\",\"End_Time\",\"Start_Lat\",\"Start_Lng\",'City','County','State','Timezone']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#Dropping the columns \n", + "data = data.drop(*(drop_col))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# As the data contains very few rows (around 300 in 1 million rows) of Severity 1 we have converted it to Severity 2 because \n", + "# both the classes indicate accidents with less severity \n", + "data=data.withColumn(\"Severity\",when(data[\"Severity\"]==1,2).otherwise(data[\"Severity\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# TMC column is an important column so we decided not to drop it\n", + "# It has around 25000 missing values, so using the mode to impute does not make sense\n", + "# So, we have made a different category for the missing values\n", + "data=data.fillna({'TMC':'-1'})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#list of all categorical columns\n", + "categorical_columns=['Source','Side','Wind_Direction','month_of_year','day_of_week',\"TMC\",'Sunrise_Sunset','Civil_Twilight',\n", + " 'Nautical_Twilight','Astronomical_Twilight',\"Hour\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#encoding the categorical columns as models do not accept strings\n", + "stages = []\n", + "\n", + "#iterate through all categorical values\n", + "for categoricalCol in categorical_columns:\n", + " #create a string indexer for those categorical values and assign a new name including the word 'Index'\n", + " 
stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + '_Index')\n", + "\n", + " #append the string Indexer to our list of stages\n", + " stages += [stringIndexer]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Running the pipeline which encodes the categorical column\n", + "\n", + "pipeline = Pipeline(stages = stages)\n", + "#fit the pipeline to our dataframe\n", + "pipelineModel = pipeline.fit(data)\n", + "#transform the dataframe\n", + "data= pipelineModel.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Dropping the original categorical column\n", + "data=data.drop(*(categorical_columns))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#List of columns with binary values i.e. True/False\n", + "\n", + "binary_columns=['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop',\n", + "'Traffic_Calming','Traffic_Signal']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Converts the binary values into 0/1\n", + "\n", + "for i in binary_columns:\n", + " data=data.withColumn(i,data[i].cast(\"int\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "#Converts the Weather Condition to lowercase\n", + "data=data.withColumn('Weather_Condition',fn.lower(col(\"Weather_Condition\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Replaces T-storm with Thunderstorm because they are the same weather condition\n", + "data=data.withColumn('Weather_Condition', regexp_replace('Weather_Condition', 'T-Storm', 'Thunderstorm'))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# List of manually picked weather conditions that we thought could help in predicting the severity \n", + "w_conditions=[\"cloud\",\"clear\",\"whirl\",\"wind\",\"light\",\"heavy\",\"thunderstorm\",\"shower\",\"snow\",\"rain\",\"drizzle\",\n", + " \"fair\",\"hail\",\"haze\",\"overcast\",'pellets']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "#Splits the strings in the weather condition on space character and converts it into a list of words\n", + "data=data.withColumn('Weather_Condition', fn.split(\"Weather_Condition\",\" \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Removes the word having length less than 4 and also the word \"with\"\n", + "data_clean1=udf(lambda x: list([i for i in x if ((len(i)>3) and (i!=\"with\"))]),\n", + " returnType=ArrayType(StringType()))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Executes the above function\n", + "data=data.withColumn(\"Weather_Condition\",data_clean1(\"Weather_Condition\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Converts the categorical columns into String type\n", + "for i in categorical_columns:\n", + " data = data.withColumn(i+\"_Index\", data[i+\"_Index\"].cast(StringType()))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Removes the words from the weather condition column that are not present in the w_conditions list\n", + "data_clean2=udf(lambda x: list([i for i in w_conditions if any(i in j for j in x)]),\n", + " returnType=ArrayType(StringType()))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# 
Executes the above function\n", + "data=data.withColumn(\"Weather_Condition\",data_clean2(\"Weather_Condition\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Makes dummy variable for each weather condition in our list\n", + "exprs = [fn.when(fn.array_contains(fn.col('Weather_Condition'), column), 1).otherwise(0).alias(column)\\\n", + " for column in w_conditions]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Makes a temporary dataframe of our weather condition and dummy variables \n", + "temp=data.select(['Weather_Condition']+exprs)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "#create two dataframes which we will join to make our final dataframe \n", + "df1 = data.withColumn(\"id\", monotonically_increasing_id())\n", + "df2 = temp.withColumn(\"id\", monotonically_increasing_id())" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the final dataframe\n", + "data = df2.join(df1, \"id\", \"outer\").drop(\"id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# Now we can drop the weather condition column after making dummies from it\n", + "data=data.drop(\"Weather_Condition\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Splits the dataframe into train and validation\n", + "training_df, validation_df= data.randomSplit([0.8, 0.2],seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# Saves as csv\n", + "training_df.toPandas().to_csv(\"USAccident_train_categorical.csv\",index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# 
Saves as csv\n", + "validation_df.toPandas().to_csv(\"USAccident_validation_categorical.csv\",index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One Hot Encoding" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# list of columns to be one hot encoded\n", + "categorical_columns2=[i+\"_Index\"for i in categorical_columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating dummies of each categorical column\n", + "for category in categorical_columns2:\n", + " categ = data.select(category).distinct().rdd.flatMap(lambda x:x).collect()\n", + " exprs = [fn.when(fn.col(category) == cat,1).otherwise(0)\\\n", + " .alias(category+\"_\"+str(int(float(cat)))) for cat in categ]\n", + " data = data.select(exprs+data.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping all the original categorical columns\n", + "data=data.drop(*(categorical_columns2))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# From the n dummies made for each categorical column, dropping the dummy for index 0\n", + "data=data.drop(*([i+\"_Index_0\" for i in categorical_columns]))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# Splits the dataframe into train and validation\n", + "training_df, validation_df= data.randomSplit([0.8, 0.2],seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Saves as csv\n", + "training_df.toPandas().to_csv(\"USAccident_train_OHE.csv\",index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Saves as csv\n", + 
"validation_df.toPandas().to_csv(\"USAccident_validation_OHE.csv\",index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Undersampling_Oversampling.ipynb b/Undersampling_Oversampling.ipynb new file mode 100644 index 0000000..4ffb180 --- /dev/null +++ b/Undersampling_Oversampling.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate()\n", + "sc = spark.sparkContext\n", + "from pyspark.sql import Row\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *\n", + "import matplotlib.pyplot as plt\n", + "from pyspark.sql import functions as fn\n", + "from pyspark.ml import feature, regression, evaluation, Pipeline\n", + "import seaborn as sns\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.regression import LinearRegression\n", + "from pyspark.ml.stat import Correlation\n", + "from pyspark.ml.feature import StringIndexer\n", + "from pyspark.ml.feature import OneHotEncoderEstimator\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import RandomForestClassifier\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator\n", + "from sklearn.metrics import classification_report\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml.tuning import 
CrossValidator,ParamGridBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Do not delete or change this cell\n", + "\n", + "import os\n", + "\n", + "# Define a function to determine if we are running on data bricks\n", + "# Return true if running in the data bricks environment, false otherwise\n", + "def is_databricks():\n", + " # get the databricks runtime version\n", + " db_env = os.getenv(\"DATABRICKS_RUNTIME_VERSION\")\n", + " \n", + " # if running on data bricks\n", + " if db_env != None:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "# Define a function to read the data file. The full path data file name is constructed\n", + "# by checking runtime environment variables to determine if the runtime environment is \n", + "# databricks, or a student's personal computer. The full path file name is then\n", + "# constructed based on the runtime env.\n", + "# \n", + "# Params\n", + "# data_file_name: The base name of the data file to load\n", + "# \n", + "# Returns the full path file name based on the runtime env\n", + "#\n", + "def get_training_filename(data_file_name): \n", + " # if running on data bricks\n", + " if is_databricks():\n", + " # build the full path file name assuming data brick env\n", + " full_path_name = \"/FileStore/tables/%s\" % data_file_name\n", + " # else the data is assumed to be in the same dir as this notebook\n", + " else:\n", + " # Assume the student is running on their own computer and load the data\n", + " # file from the same dir as this notebook\n", + " full_path_name = data_file_name\n", + " \n", + " # return the full path file name to the caller\n", + " return full_path_name" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Sampling will only be performed on training data\n", + "# below we import the data\n", + 
"training_df=spark.read.csv(get_training_filename(\"USAccident_train_categorical.csv\"),inferSchema=True,header=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Balancing for Multiclass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**For balancing the multiclass data, we will oversample the class 4 as it has the least data and undersample class 2 with the most data.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Oversampling Target 4" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# The class 4 will be oversampled in such a way that the new number of rows of class 4 matches the number of class 3.\n", + "major_df = training_df.filter(col(\"Severity\") == 3)\n", + "minor_df = training_df.filter(col(\"Severity\") == 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# calculating the ratio of number of rows in class 3 by class 4\n", + "oversampling_ratio = int(major_df.count()/minor_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "a=range(oversampling_ratio)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# storing the new oversampled data of class 4\n", + "oversampled_df = minor_df.withColumn(\"dummy\", explode(array([lit(x) for x in a]))).drop('dummy')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Undersampling Target 2" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# The class 2 will be undersampled in such a way that the new number of rows of class 2 matches the number of class 3.\n", + "major_df = training_df.filter(col(\"Severity\") == 2)\n", + "minor_df = training_df.filter(col(\"Severity\") == 3)" + 
] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# calculating the ratio of number of rows in class 2 by class 3\n", + "ratio=int(major_df.count()/minor_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#Performs the undersampling\n", + "undersampled_df = major_df.sample(False, 1/ratio)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "unsampled_class_data=training_df.filter(col(\"Severity\") == 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Combining the data to create our final dataset\n", + "temp_data=unsampled_class_data.unionAll(undersampled_df)\n", + "balanced_data=temp_data.unionAll(oversampled_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# saving in a csv file\n", + "balanced_data.toPandas().to_csv(\"USAccident_balanced_train_categorical.csv\",index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### One Hot encoding for balanced data" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#list of all categorical columns\n", + "categorical_columns=['Source','Side','Wind_Direction','month_of_year','day_of_week',\"TMC\",'Sunrise_Sunset','Civil_Twilight',\n", + " 'Nautical_Twilight','Astronomical_Twilight',\"Hour\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "#list of columns to be one hot encoded\n", + "categorical_columns2=[i+\"_Index\"for i in categorical_columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating dummies of categorical column\n", + "for category in 
categorical_columns2:\n", + " categ = balanced_data.select(category).distinct().rdd.flatMap(lambda x:x).collect()\n", + " exprs = [fn.when(fn.col(category) == cat,1).otherwise(0)\\\n", + " .alias(category+\"_\"+str(int(float(cat)))) for cat in categ]\n", + " balanced_data = balanced_data.select(exprs+balanced_data.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping all the original categorical columns\n", + "balanced_data=balanced_data.drop(*(categorical_columns2))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# From the n dummies made for each categorical column, dropping the dummy for index 0\n", + "balanced_data=balanced_data.drop(*([i+\"_Index_0\" for i in categorical_columns]))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Saves as csv\n", + "balanced_data.toPandas().to_csv(\"USAccident_balanced_train_categorical_OHE.csv\",index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Balancing for Binary Data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# binarizing the target variable\n", + "training_df_new=training_df.withColumn(\"Severity\",fn.when(((training_df[\"Severity\"]==1) | (training_df[\"Severity\"]==2)),0).otherwise(1))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# The class 0 will be undersampled in such a way that the new number of rows of class 0 matches the number of class 1.\n", + "major_df = training_df_new.filter(col(\"Severity\") == 0)\n", + "minor_df = training_df_new.filter(col(\"Severity\") == 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "ratio=int(major_df.count()/minor_df.count())" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# performs the undersampling\n", + "undersampled_df = major_df.sample(False, 1/ratio)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Combining the data to create the balanced dataset for binary output\n", + "balanced_data_binary=training_df_new.filter(col(\"Severity\") == 1).unionAll(undersampled_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "balanced_data_binary.toPandas().to_csv(\"USAccident_balanced_train_binary.csv\",index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### One Hot Encoding for balanced data" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "#list of all categorical columns\n", + "categorical_columns=['Source','Side','Wind_Direction','month_of_year','day_of_week',\"TMC\",'Sunrise_Sunset','Civil_Twilight',\n", + " 'Nautical_Twilight','Astronomical_Twilight',\"Hour\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "#list of columns to be one hot encoded\n", + "categorical_columns2=[i+\"_Index\"for i in categorical_columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating dummies of categorical column\n", + "for category in categorical_columns2:\n", + " categ = balanced_data_binary.select(category).distinct().rdd.flatMap(lambda x:x).collect()\n", + " exprs = [fn.when(fn.col(category) == cat,1).otherwise(0)\\\n", + " .alias(category+\"_\"+str(int(float(cat)))) for cat in categ]\n", + " balanced_data_binary = balanced_data_binary.select(exprs+balanced_data_binary.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + 
"source": [ + "# Dropping all the original categorical columns\n", + "balanced_data_binary=balanced_data_binary.drop(*(categorical_columns2))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# From the n dummies made for each categorical column, dropping the nth dummy\n", + "balanced_data_binary=balanced_data_binary.drop(*([i+\"_Index_0\" for i in categorical_columns]))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Saves as csv\n", + "balanced_data_binary.toPandas().to_csv(\"USAccident_balanced_train_binary_OHE.csv\",index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}