-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path6.Classificazione per regione politica
1 lines (1 loc) · 51 KB
/
6.Classificazione per regione politica
1
{"cells":[{"cell_type":"markdown","metadata":{"id":"d0VekpCZ5iLt"},"source":["# Classificazione per regione"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4296,"status":"ok","timestamp":1641055406015,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"28IQ-n5IN5-a","outputId":"ba595737-c9ee-4780-89c5-e480c4230b14"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: pyspark in /usr/local/lib/python3.7/dist-packages (3.2.0)\n","Requirement already satisfied: py4j==0.10.9.2 in /usr/local/lib/python3.7/dist-packages (from pyspark) (0.10.9.2)\n"]}],"source":["pip install pyspark"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1500,"status":"ok","timestamp":1641055407510,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"s3VB5dqkOQE-","outputId":"7d474a62-9e6f-47f4-8aee-ade4b4771ac3"},"outputs":[{"name":"stdout","output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MjCpzA59OQB5"},"outputs":[],"source":["# Spark context\n","\n","from pyspark import SparkContext\n","sc = SparkContext(appName=\"MY-APP-NAME\", master=\"local[*]\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":360,"status":"ok","timestamp":1641055416627,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"jN1Bnox_OP-4","outputId":"a4673029-73d1-4f2d-cde2-143450df96ac"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.7/dist-packages/pyspark/sql/context.py:79: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n"," FutureWarning\n"]}],"source":["# SQL context\n","\n","from pyspark.sql import SQLContext\n","sqlCtx = SQLContext(sc)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"RKxr_UAROP8R"},"outputs":[],"source":["from pyspark.sql.types import FloatType\n","from pyspark.sql.functions import col\n","from pyspark.sql.functions import when\n","from pyspark.sql.functions import regexp_replace\n","from pyspark.ml.feature import VectorAssembler\n","from pyspark.ml.regression import LinearRegression\n","from pyspark.ml.regression import RandomForestRegressor\n","from pyspark.ml.evaluation import RegressionEvaluator\n","from pyspark.sql.functions import monotonically_increasing_id \n","from pyspark.ml.clustering import KMeans\n","from pyspark.ml.evaluation import ClusteringEvaluator\n","from pyspark.ml.feature import StandardScaler\n","from pyspark.sql.types import IntegerType\n","import pyspark.sql.functions as F\n","from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n","from pyspark.ml.classification import DecisionTreeClassifier\n","from pyspark.ml.evaluation import BinaryClassificationEvaluator\n","from pyspark.ml.classification import RandomForestClassifier\n","from pyspark.ml import Pipeline\n","from pyspark.ml.classification import LogisticRegression\n","from pyspark.mllib.util import MLUtils\n","from pyspark.mllib.evaluation import MulticlassMetrics\n","from pyspark.mllib.evaluation import BinaryClassificationMetrics\n","from tabulate import tabulate\n","from pyspark.sql.functions import split\n","from pyspark.ml.feature import PCA"]},{"cell_type":"markdown","metadata":{"id":"yl3DBVzCQeSK"},"source":["## Data Preparation"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"JSnqQlrgOP5g"},"outputs":[],"source":["# Caricamento nuovo dataset senza missing values\n","\n","filename = 'drive/MyDrive/DDAM/Australia Rain/rain_tomorrow_australia.csv'\n","df = sqlCtx.read.load(filename, format=\"csv\", sep=\",\", inferSchema=\"true\", header=\"true\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1546,"status":"ok","timestamp":1641055434605,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"LE0wZRW9OPyr","outputId":"a4cdf85a-9ac8-4ab7-a473-bebc6428efc7"},"outputs":[{"name":"stdout","output_type":"stream","text":["+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+\n","|_c0| Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|\n","+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+\n","| 0|01/12/2008| Albury| 13.4| 22.9| 0.6| 3.2| 5.6| W| 44.0| W| WNW| 20.0| 24.0| 71.0| 22.0| 1007.7| 1007.1| 8.0| 2.0| 16.9| 21.8| 0| 0.0| 0|\n","| 1|02/12/2008| Albury| 7.4| 25.1| 0.0| 3.2| 5.6| WNW| 44.0| NNW| WSW| 4.0| 22.0| 44.0| 25.0| 1010.6| 1007.8| 4.0| 2.0| 17.2| 24.3| 0| 0.0| 0|\n","+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+\n","only showing top 2 rows\n","\n"]}],"source":["df = df.na.drop()\n","\n","# Trasformo R.Today e R.Tomorrow in numeriche binarie\n","df= df.withColumn('RainToday', when( (df.RainToday == 'No'), '0' ).otherwise('1') )\n","df= df.withColumn('RainTomorrow', when( (df.RainTomorrow == 'No'), '0' ).otherwise('1') )\n","\n","\n","df = df.withColumn('RainToday' , df.RainToday.cast(IntegerType()) )\n","df = df.withColumn('RainTomorrow' , df.RainTomorrow.cast(IntegerType()) )\n","\n","df.show(2)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XbUrQTGROPmv"},"outputs":[],"source":["filename_cities = 'drive/MyDrive/DDAM/Australia Rain/Mario/cities_australia.csv'\n","df_cities = sqlCtx.read.load(filename_cities, format=\"csv\", sep=\";\", inferSchema=\"true\", header=\"true\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":424,"status":"ok","timestamp":1641055435607,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"goNF8kOhOPFc","outputId":"8361c16b-e21a-4c58-b496-f4e205abf970"},"outputs":[{"name":"stdout","output_type":"stream","text":["+---------+---------------+\n","| City| Region|\n","+---------+---------------+\n","|Melbourne| Victoria|\n","| Sydney|New South Wales|\n","+---------+---------------+\n","only showing top 2 rows\n","\n"]}],"source":["df_cities.show(2)"]},{"cell_type":"markdown","metadata":{"id":"rIFVt_-2TuQt"},"source":["Con withColumn dà errore, perciò uso gli rdd per creare la nuova colonna City:"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1128,"status":"ok","timestamp":1641055436733,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"wurTEJTydw4I","outputId":"c7497d1d-50b2-42c2-f009-0bbbe91b1d5f"},"outputs":[{"data":{"text/plain":["['City;Region', 'Melbourne;Victoria', 'Sydney;New South Wales']"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["cities_rdd = sc.textFile(filename_cities)\n","cities_rdd.take(3)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"v62K82AT3zJn"},"outputs":[],"source":["# create a dictionary (k: location, v: region)\n","cities_rdd_dict = cities_rdd.map(lambda line: line.strip().split(\";\")).collectAsMap()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":36},"executionInfo":{"elapsed":246,"status":"ok","timestamp":1641055436972,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"rIJlArFo3zEr","outputId":"b5d23f41-3b7c-44b4-c97f-b20d446ebe62"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'New South Wales'"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["cities_rdd_dict['Sydney']"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"QquQwXIt4FTh"},"outputs":[],"source":["from pyspark.sql.functions import udf\n","from pyspark.sql.types import *\n","\n","func_name = udf(\n"," lambda val: cities_rdd_dict[val], \n"," StringType()\n",")\n","\n","df = df.withColumn('Region', func_name(df.Location))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3122,"status":"ok","timestamp":1641055440093,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"xqnc083y4FRE","outputId":"91d3c259-61ba-4ae2-945a-c61e41c7bb1b"},"outputs":[{"name":"stdout","output_type":"stream","text":["+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+---------------+\n","|_c0| Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow| Region|\n","+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+---------------+\n","| 0|01/12/2008| Albury| 13.4| 22.9| 0.6| 3.2| 5.6| W| 44.0| W| WNW| 20.0| 24.0| 71.0| 22.0| 1007.7| 1007.1| 8.0| 2.0| 16.9| 21.8| 0| 0.0| 0|New South Wales|\n","| 1|02/12/2008| Albury| 7.4| 25.1| 0.0| 3.2| 5.6| WNW| 44.0| NNW| WSW| 4.0| 22.0| 44.0| 25.0| 1010.6| 1007.8| 4.0| 2.0| 17.2| 24.3| 0| 0.0| 0|New South Wales|\n","| 2|03/12/2008| Albury| 12.9| 25.7| 0.0| 3.2| 5.6| WSW| 46.0| W| WSW| 19.0| 26.0| 38.0| 30.0| 1007.6| 1008.7| 4.0| 2.0| 21.0| 23.2| 0| 0.0| 0|New South Wales|\n","| 3|04/12/2008| Albury| 9.2| 28.0| 0.0| 3.2| 5.6| NE| 24.0| SE| E| 11.0| 9.0| 45.0| 16.0| 1017.6| 1012.8| 4.0| 2.0| 18.1| 26.5| 0| 1.0| 0|New South Wales|\n","| 4|05/12/2008| Albury| 17.5| 32.3| 1.0| 3.2| 5.6| W| 41.0| ENE| NW| 7.0| 20.0| 82.0| 33.0| 1010.8| 1006.0| 7.0| 8.0| 17.8| 29.7| 0| 0.2| 0|New South Wales|\n","+---+----------+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+---------------+\n","only showing top 5 rows\n","\n"]}],"source":["df.show(5)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3077,"status":"ok","timestamp":1641055443152,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"fMBqNr3W4FMg","outputId":"60d1f4b0-ea37-4b14-eb77-5c4d846e6233"},"outputs":[{"name":"stdout","output_type":"stream","text":["+----------------+-----+\n","| Location|count|\n","+----------------+-----+\n","| Cairns| 2945|\n","| NorfolkIsland| 2894|\n","| Bendigo| 2986|\n","| Canberra| 3010|\n","| Cobar| 2833|\n","| SydneyAirport| 2945|\n","| Wollongong| 2916|\n","| Williamtown| 2259|\n","| Moree| 2707|\n","| Mildura| 2995|\n","| Portland| 2899|\n","| Brisbane| 3109|\n","| Sydney| 2277|\n","| Sale| 2834|\n","| BadgerysCreek| 2828|\n","| Tuggeranong| 2925|\n","| Ballarat| 2969|\n","| GoldCoast| 2849|\n","|MelbourneAirport| 2983|\n","| Dartmoor| 2765|\n","+----------------+-----+\n","only showing top 20 rows\n","\n"]}],"source":["df.groupBy('Location').count().show()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":36},"executionInfo":{"elapsed":9,"status":"ok","timestamp":1641055443152,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"QLxbotHJ_AbB","outputId":"f93788d8-e621-485d-bd6b-3ed4e0c8e801"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["\"import pandas as pd\\n\\ncitta_conto = df.groupBy('Location').count()\\ncitta_conto.toPandas().to_csv('drive/MyDrive/DDAM/Australia Rain/Mario/citta_conto.csv', header=True)\""]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["'''import pandas as pd\n","\n","citta_conto = df.groupBy('Location').count()\n","citta_conto.toPandas().to_csv('drive/MyDrive/DDAM/Australia Rain/Mario/citta_conto.csv', header=True)'''"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3048,"status":"ok","timestamp":1641055446193,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"CWP62N1t_m1_","outputId":"f3bead8a-d481-4e24-f67f-d4768df64d51"},"outputs":[{"name":"stdout","output_type":"stream","text":["+------------------+-----+\n","| Region|count|\n","+------------------+-----+\n","| NorfolkIsland| 2894|\n","| Victoria|27214|\n","| New South Wales|44084|\n","| Queensland|11906|\n","|Northern Territory| 8289|\n","| South Australia|11793|\n","| Western Australia|16938|\n","| Tasmania| 6128|\n","+------------------+-----+\n","\n"]}],"source":["df.groupBy('Region').count().show()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5963,"status":"ok","timestamp":1641055452147,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"U4llz6y8URYv","outputId":"e5e1b094-6c39-4308-c0b9-dd6c3e803e24"},"outputs":[{"name":"stdout","output_type":"stream","text":["+---+----+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+------+\n","|_c0|Date|Location|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|Region|\n","+---+----+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+------+\n","| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n","+---+----+--------+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+------+\n","\n"]}],"source":["# Non ci sono Null Values\n","\n","df.select([F.count(when(F.isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()"]},{"cell_type":"markdown","metadata":{"id":"4nY5Ii3VVeZg"},"source":["**Divisione in 8 dataframe, uno per regione**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"X-zQxr-eWczV"},"outputs":[],"source":["df = df.select(df.Region, df.RainToday, df.RainTomorrow, df.RISK_MM, df.Rainfall, \n"," df.Sunshine, df.WindGustSpeed, df.Humidity3pm, df.Pressure9am, \n"," df.Cloud3pm, df.Temp3pm)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1780,"status":"ok","timestamp":1641055453907,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"SVfPy7OwduWs","outputId":"b491206c-738f-4233-d2a9-7461e6f4d08b"},"outputs":[{"name":"stdout","output_type":"stream","text":["['RainToday', 'Rainfall', 'Sunshine', 'WindGustSpeed', 'Humidity3pm', 'Pressure9am', 'Cloud3pm', 'Temp3pm']\n","+---------------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","| Region|RainToday|RainTomorrow|RISK_MM|Rainfall|Sunshine|WindGustSpeed|Humidity3pm|Pressure9am|Cloud3pm|Temp3pm| features|\n","+---------------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","|New South Wales| 0| 0| 0.0| 0.6| 5.6| 44.0| 22.0| 1007.7| 2.0| 21.8|[0.0,0.6,5.6,44.0...|\n","|New South Wales| 0| 0| 0.0| 0.0| 5.6| 44.0| 25.0| 1010.6| 2.0| 24.3|[0.0,0.0,5.6,44.0...|\n","+---------------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","only showing top 2 rows\n","\n"]}],"source":["# Creazione vettore features\n","\n","no_cols = ['Region', 'RainTomorrow', 'RISK_MM']\n","cols2 = [col for col in df.columns if col not in no_cols]\n","print(cols2)\n","\n","assemble = VectorAssembler(inputCols=cols2, outputCol='features')\n","df = assemble.transform(df)\n","\n","df.show(2)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ouYrv3D5UeYt"},"outputs":[],"source":["df_norfolk_island = df.filter(df['Region'] == 'NorfolkIsland')\n","df_victoria = df.filter(df['Region'] == 'Victoria')\n","df_new_south_wales = df.filter(df['Region'] == 'New South Wales')\n","df_queensland = df.filter(df['Region'] == 'Queensland')\n","df_northern_territory = df.filter(df['Region'] == 'Northern Territory')\n","df_south_australia = df.filter(df['Region'] == 'South Australia')\n","df_western_australia = df.filter(df['Region'] == 'Western Australia')\n","df_tasmania = df.filter(df['Region'] == 'Tasmania')"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"WfHs7EDrb7VU"},"outputs":[],"source":["dataframes = [df_norfolk_island, df_victoria, df_new_south_wales, df_queensland,\n"," df_northern_territory, df_south_australia, df_western_australia, df_tasmania]"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2989,"status":"ok","timestamp":1641055457219,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"5P7vWpFxXEuW","outputId":"19c55afb-cdc4-4e81-ea3a-ef154b369e7b"},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","| Region|RainToday|RainTomorrow|RISK_MM|Rainfall|Sunshine|WindGustSpeed|Humidity3pm|Pressure9am|Cloud3pm|Temp3pm| features|\n","+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","|Tasmania| 1| 1| 16.2| 3.0| 6.0| 81.0| 42.0| 988.3| 5.0| 12.1|[1.0,3.0,6.0,81.0...|\n","|Tasmania| 1| 1| 3.0| 16.2| 3.2| 83.0| 60.0| 991.7| 6.0| 8.5|[1.0,16.2,3.2,83....|\n","+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+\n","only showing top 2 rows\n","\n"]}],"source":["df_tasmania.show(2)"]},{"cell_type":"markdown","metadata":{"id":"yrgX9CXgbmt9"},"source":["## Standardizzazione e PCA"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"8MDpn9bnbpbv"},"outputs":[],"source":["scale = StandardScaler(inputCol='features', outputCol='standardized')\n","pca = PCA(k=3, inputCol= 'standardized', outputCol=\"pca\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"72weN2-ZdqdK"},"outputs":[],"source":["# Scaling e PCA per ogni regione\n","\n","# norfolk_island\n","data_scale = scale.fit(df_norfolk_island)\n","df_norfolk_island = data_scale.transform(df_norfolk_island)\n","pcamodel = pca.fit(df_norfolk_island)\n","df_norfolk_island = pcamodel.transform(df_norfolk_island)\n","\n","# victoria\n","data_scale = scale.fit(df_victoria)\n","df_victoria = data_scale.transform(df_victoria)\n","pcamodel = pca.fit(df_victoria)\n","df_victoria = pcamodel.transform(df_victoria)\n","\n","# new_south_wales\n","data_scale = scale.fit(df_new_south_wales)\n","df_new_south_wales = data_scale.transform(df_new_south_wales)\n","pcamodel = pca.fit(df_new_south_wales)\n","df_new_south_wales = pcamodel.transform(df_new_south_wales)\n","\n","# queensland\n","data_scale = scale.fit(df_queensland)\n","df_queensland = data_scale.transform(df_queensland)\n","pcamodel = pca.fit(df_queensland)\n","df_queensland = pcamodel.transform(df_queensland)\n","\n","# northern_territory\n","data_scale = scale.fit(df_northern_territory)\n","df_northern_territory = data_scale.transform(df_northern_territory)\n","pcamodel = pca.fit(df_northern_territory)\n","df_northern_territory = pcamodel.transform(df_northern_territory)\n","\n","# south_australia\n","data_scale = scale.fit(df_south_australia)\n","df_south_australia = data_scale.transform(df_south_australia)\n","pcamodel = pca.fit(df_south_australia)\n","df_south_australia = pcamodel.transform(df_south_australia)\n","\n","# western_australia\n","data_scale = scale.fit(df_western_australia)\n","df_western_australia = data_scale.transform(df_western_australia)\n","pcamodel = pca.fit(df_western_australia)\n","df_western_australia = pcamodel.transform(df_western_australia)\n","\n","# tasmania\n","data_scale = scale.fit(df_tasmania)\n","df_tasmania = data_scale.transform(df_tasmania)\n","pcamodel = pca.fit(df_tasmania)\n","df_tasmania = pcamodel.transform(df_tasmania)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":54},"executionInfo":{"elapsed":29,"status":"ok","timestamp":1641055525174,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"xVxhuK6hbpSE","outputId":"4785eda1-1f30-499f-ade0-aa292d28a6af"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'# Per fare scaling e PCA per ogni dataframe\\n\\ndef scaler_and_pca(dataframe):\\n for dataframe in dataframes:\\n\\n data_scale = scale.fit(dataframe)\\n dataframe = data_scale.transform(dataframe)\\n\\n pcamodel = pca.fit(dataframe)\\n dataframe = pcamodel.transform(dataframe)\\n\\n return dataframe'"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["'''# Per fare scaling e PCA per ogni dataframe\n","\n","def scaler_and_pca(dataframe):\n"," for dataframe in dataframes:\n","\n"," data_scale = scale.fit(dataframe)\n"," dataframe = data_scale.transform(dataframe)\n","\n"," pcamodel = pca.fit(dataframe)\n"," dataframe = pcamodel.transform(dataframe)\n","\n"," return dataframe'''"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":73},"executionInfo":{"elapsed":26,"status":"ok","timestamp":1641055525175,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"0ZHLz97Dcuxg","outputId":"b18b81b1-9440-4c67-8548-a648d6c71f17"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'# Applico la funzione\\n\\ndf_norfolk_island = scaler_and_pca(df_norfolk_island) \\ndf_victoria = scaler_and_pca(df_victoria) \\ndf_new_south_wales = scaler_and_pca(df_new_south_wales) \\ndf_queensland = scaler_and_pca(df_queensland) \\ndf_northern_territory = scaler_and_pca(df_northern_territory) \\ndf_south_australia = scaler_and_pca(df_south_australia) \\ndf_western_australia = scaler_and_pca(df_western_australia) \\ndf_tasmania = scaler_and_pca(df_tasmania)'"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["'''# Applico la funzione\n","\n","df_norfolk_island = scaler_and_pca(df_norfolk_island) \n","df_victoria = scaler_and_pca(df_victoria) \n","df_new_south_wales = scaler_and_pca(df_new_south_wales) \n","df_queensland = scaler_and_pca(df_queensland) \n","df_northern_territory = scaler_and_pca(df_northern_territory) \n","df_south_australia = scaler_and_pca(df_south_australia) \n","df_western_australia = scaler_and_pca(df_western_australia) \n","df_tasmania = scaler_and_pca(df_tasmania)''' "]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1958,"status":"ok","timestamp":1641055527109,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"ldIxGTF6foNL","outputId":"4a4290fe-606f-42df-b949-d528e05e5d86"},"outputs":[{"name":"stdout","output_type":"stream","text":["+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+--------------------+--------------------+\n","| Region|RainToday|RainTomorrow|RISK_MM|Rainfall|Sunshine|WindGustSpeed|Humidity3pm|Pressure9am|Cloud3pm|Temp3pm| features| standardized| pca|\n","+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+--------------------+--------------------+\n","|Tasmania| 1| 1| 16.2| 3.0| 6.0| 81.0| 42.0| 988.3| 5.0| 12.1|[1.0,3.0,6.0,81.0...|[2.35891778711926...|[32.5484828440512...|\n","|Tasmania| 1| 1| 3.0| 16.2| 3.2| 83.0| 60.0| 991.7| 6.0| 8.5|[1.0,16.2,3.2,83....|[2.35891778711926...|[30.3057898841538...|\n","+--------+---------+------------+-------+--------+--------+-------------+-----------+-----------+--------+-------+--------------------+--------------------+--------------------+\n","only showing top 2 rows\n","\n"]}],"source":["df_tasmania.show(2)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b_HKpomaixfw"},"outputs":[],"source":["# Creo dizionario con i dataframe (mi servirà per accedere al nome della regione)\n","\n","dataframes = {'Norfolk Island': df_norfolk_island, 'Victoria': df_victoria, \n"," 'New South Wales': df_new_south_wales, 'Queensland': df_queensland, \n"," 'Northern Territory': df_northern_territory, 'South Australia': df_south_australia,\n"," 'Western Australia': df_western_australia, 'Tasmania': df_tasmania}"]},{"cell_type":"markdown","metadata":{"id":"ggFKg8yVguad"},"source":["#### Analisi cluster"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":15282,"status":"ok","timestamp":1641055542385,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"},"user_tz":-60},"id":"3vZU5eaNgzuT","outputId":"b349274e-02ed-46d9-dc41-36404ae00189"},"outputs":[{"name":"stdout","output_type":"stream","text":["df_norfolk_island\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 892|\n","| 0| 2002|\n","+------------+-----+\n","\n","df_victoria\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 6287|\n","| 0|20927|\n","+------------+-----+\n","\n","df_new_south_wales\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 9351|\n","| 0|34733|\n","+------------+-----+\n","\n","df_queensland\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 2878|\n","| 0| 9028|\n","+------------+-----+\n","\n","df_northern_territory\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 1285|\n","| 0| 7004|\n","+------------+-----+\n","\n","df_south_australia\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 2302|\n","| 0| 9491|\n","+------------+-----+\n","\n","df_western_australia\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 3756|\n","| 0|13182|\n","+------------+-----+\n","\n","df_tasmania\n","+------------+-----+\n","|RainTomorrow|count|\n","+------------+-----+\n","| 1| 1434|\n","| 0| 4694|\n","+------------+-----+\n","\n"]}],"source":["print('df_norfolk_island')\n","df_norfolk_island.groupBy('RainTomorrow').count().show()\n","\n","print('df_victoria')\n","df_victoria.groupBy('RainTomorrow').count().show()\n","\n","print('df_new_south_wales')\n","df_new_south_wales.groupBy('RainTomorrow').count().show()\n","\n","print('df_queensland')\n","df_queensland.groupBy('RainTomorrow').count().show()\n","\n","print('df_northern_territory')\n","df_northern_territory.groupBy('RainTomorrow').count().show()\n","\n","print('df_south_australia')\n","df_south_australia.groupBy('RainTomorrow').count().show()\n","\n","print('df_western_australia')\n","df_western_australia.groupBy('RainTomorrow').count().show()\n","\n","print('df_tasmania')\n","df_tasmania.groupBy('RainTomorrow').count().show()\n"]},{"cell_type":"markdown","metadata":{"id":"-Bk4of4YYFB_"},"source":["## Decision Tree"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"r4fOo9YrX8YI"},"outputs":[],"source":["# DT con 'pca' come input column\n","dt = DecisionTreeClassifier(featuresCol = 'pca', labelCol = 'RainTomorrow', seed= 42)\n","\n","# grid search\n","paramGrid = ParamGridBuilder() \\\n"," .addGrid(dt.maxDepth, [ x for x in range( 2, 15, 3 ) ]) \\\n"," .build()\n","\n","# cross-validation\n","crossval = CrossValidator(estimator=dt,\n"," numFolds= 5,\n"," estimatorParamMaps=paramGrid,\n"," evaluator=BinaryClassificationEvaluator(labelCol='RainTomorrow',\n"," rawPredictionCol='prediction',\n"," metricName=\"areaUnderROC\"))\n","\n","# Evaluator per ROC curve\n","evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderROC\")\n","\n","# Evaluator per PR curve\n","evaluator2 = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderPR\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"HuGnXgD6i4GR"},"outputs":[],"source":["def alberoDecisionale(dataframe, name):\n","\n"," splits = dataframe.randomSplit([0.7, 0.3]) \n"," train = splits[0] \n"," test = splits[1]\n","\n"," cvModel = crossval.fit(train)\n"," bestModel = cvModel.bestModel\n"," predictions = bestModel.transform(test)\n"," #dtmodel = crossval.fit(train)\n"," #predictions = dtmodel.transform(test)\n","\n"," print()\n"," #print('----- Cluster {} -----'.format(cluster))\n"," # Area under ROC curve\n"," #print('Area under ROC curve:', evaluator.evaluate(predictions))\n"," \n"," # Area under PR curve\n"," #print('Area under PR curve: ', evaluator2.evaluate(predictions))\n","\n"," TP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 1)).count() +0.1 # evita ZeroDivisionError\n"," FP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 0)).count()\n"," TN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 0)).count()\n"," FN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 1)).count()\n","\n"," '''print('Accuracy:', (TP+TN)/(TP+TN+FP+FN))\n"," print('Precision:', TP/(TP+FP))\n"," print('Recall:', TP/(TP+FN))\n"," print('F-measure:', (2*TP)/(2*TP+FN+FP))'''\n","\n"," print(tabulate([['Area under ROC curve:', evaluator.evaluate(predictions)],\n"," ['Area under PR curve:', evaluator2.evaluate(predictions)],\n"," ['Accuracy:', (TP+TN)/(TP+TN+FP+FN)], \n"," ['Precision:', TP/(TP+FP)], \n"," ['Recall:', TP/(TP+FN)],\n"," ['F-measure:', (2*TP)/(2*TP+FN+FP)]], headers=[name, ''], tablefmt='orgtbl'))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true,"base_uri":"https://localhost:8080/"},"id":"dZxcmQNri494","outputId":"6921d985-b54d-4df9-a597-f2a0bdf1ed77"},"outputs":[{"name":"stdout","output_type":"stream","text":["+-------------+\n","|Decision Tree|\n","+-------------+\n","\n","| Norfolk Island | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.71831 |\n","| Area under PR curve: | 0.570831 |\n","| Accuracy: | 0.778589 |\n","| Precision: | 0.645899 |\n","| Recall: | 0.56934 |\n","| F-measure: | 0.605208 |\n","\n","| Victoria | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.677059 |\n","| Area under PR curve: | 0.557268 |\n","| Accuracy: | 0.819804 |\n","| Precision: | 0.693026 |\n","| Recall: | 0.409363 |\n","| F-measure: | 0.514699 |\n","\n","| New South Wales | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.672966 |\n","| Area under PR curve: | 0.555616 |\n","| Accuracy: | 0.834397 |\n","| Precision: | 0.705262 |\n","| Recall: | 0.390464 |\n","| F-measure: | 0.502643 |\n","\n","| Queensland | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.713241 |\n","| Area under PR curve: | 0.627153 |\n","| Accuracy: | 0.833384 |\n","| Precision: | 0.762205 |\n","| Recall: | 0.47534 |\n","| F-measure: | 0.585524 |\n","\n","| Northern Territory | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.7047 |\n","| Area under PR curve: | 0.541687 |\n","| Accuracy: | 0.880819 |\n","| Precision: | 0.688385 |\n","| Recall: | 0.447514 |\n","| F-measure: | 0.542411 |\n","\n","| South Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.723647 |\n","| Area under PR curve: | 0.506073 |\n","| Accuracy: | 0.841874 |\n","| Precision: | 0.602039 |\n","| Recall: | 0.531273 |\n","| F-measure: | 0.564447 |\n","\n","| Western Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.745119 |\n","| Area under PR curve: | 0.581478 |\n","| Accuracy: | 0.847064 |\n","| Precision: | 0.682643 |\n","| Recall: | 0.563773 |\n","| F-measure: | 0.61754 |\n","\n","| Tasmania | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.638354 |\n","| Area under PR curve: | 0.435575 |\n","| Accuracy: | 0.761415 |\n","| Precision: | 0.517894 |\n","| Recall: | 0.396962 |\n","| F-measure: | 0.449435 |\n"]}],"source":["print('+-------------+')\n","print('|Decision Tree|')\n","print('+-------------+')\n","\n","\n","for k,v in dataframes.items():\n"," alberoDecisionale(v,k)\n"," "]},{"cell_type":"markdown","metadata":{"id":"XeYw-_tiECkJ"},"source":["## Random Forest"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"68jjLcxDC-Nq"},"outputs":[],"source":["rf = RandomForestClassifier(featuresCol = 'pca', labelCol = 'RainTomorrow', seed= 42)\n","\n","# grid search\n","paramGrid = ParamGridBuilder() \\\n"," .addGrid(rf.maxDepth, [ x for x in range( 2, 15, 3 ) ]) \\\n"," .addGrid(rf.numTrees, [ x for x in range( 2, 15, 3 ) ]) \\\n"," .build()\n","\n","# cross-validation\n","crossval = CrossValidator(estimator=rf,\n"," numFolds= 5,\n"," estimatorParamMaps=paramGrid,\n"," evaluator=BinaryClassificationEvaluator(labelCol='RainTomorrow',\n"," rawPredictionCol='prediction',\n"," metricName=\"areaUnderROC\"))\n","\n","# Evaluator per ROC curve\n","evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderROC\")\n","\n","# Evaluator per PR curve\n","evaluator2 = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderPR\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"w1wdPCzlEEnH"},"outputs":[],"source":["def forestaCasuale(dataframe, name):\n"," splits = dataframe.randomSplit([0.7, 0.3]) \n"," train = splits[0] \n"," test = splits[1] \n","\n"," rfmodel= rf.fit(train)\n"," predictions = rfmodel.transform(test)\n","\n"," print()\n","\n"," TP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 1)).count() +0.1 # evita ZeroDivisionError\n"," FP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 0)).count()\n"," TN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 0)).count()\n"," FN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 1)).count()\n","\n"," print(tabulate([['Area under ROC curve:', evaluator.evaluate(predictions)],\n"," ['Area under PR curve:', evaluator2.evaluate(predictions)],\n"," ['Accuracy:', (TP+TN)/(TP+TN+FP+FN)], \n"," ['Precision:', TP/(TP+FP)], \n"," ['Recall:', TP/(TP+FN)], \n"," ['F-measure:', (2*TP)/(2*TP+FN+FP)]], headers=[name, ''], tablefmt='orgtbl'))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"L71WJ0FXEEe1","outputId":"7e32bd96-4566-41d0-bc26-4199b5b78994"},"outputs":[{"name":"stdout","output_type":"stream","text":["+-------------+\n","|Random Forest|\n","+-------------+\n","\n","| Norfolk Island | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.705957 |\n","| Area under PR curve: | 0.605392 |\n","| Accuracy: | 0.785316 |\n","| Precision: | 0.705079 |\n","| Recall: | 0.5041 |\n","| F-measure: | 0.587887 |\n","\n","| Victoria | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.678904 |\n","| Area under PR curve: | 0.579004 |\n","| Accuracy: | 0.8222 |\n","| Precision: | 0.723331 |\n","| Recall: | 0.406183 |\n","| F-measure: | 0.520232 |\n","\n","| New South Wales | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.684559 |\n","| Area under PR curve: | 0.541408 |\n","| Accuracy: | 0.836051 |\n","| Precision: | 0.675663 |\n","| Recall: | 0.423208 |\n","| F-measure: | 0.520436 |\n","\n","| Queensland | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.747802 |\n","| Area under PR curve: | 0.608279 |\n","| Accuracy: | 0.845051 |\n","| Precision: | 0.712373 |\n","| Recall: | 0.56527 |\n","| F-measure: | 0.630353 |\n","\n","| Northern Territory | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.75108 |\n","| Area under PR curve: | 0.520393 |\n","| Accuracy: | 0.877227 |\n","| Precision: | 0.620799 |\n","| Recall: | 0.567043 |\n","| F-measure: | 0.592704 |\n","\n","| South Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.694092 |\n","| Area under PR curve: | 0.573679 |\n","| Accuracy: | 0.858524 |\n","| Precision: | 0.726885 |\n","| Recall: | 0.426555 |\n","| F-measure: | 0.53762 |\n","\n","| Western Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.725382 |\n","| Area under PR curve: | 0.60835 |\n","| Accuracy: | 0.848543 |\n","| Precision: | 0.735759 |\n","| Recall: | 0.502767 |\n","| F-measure: | 0.597348 |\n","\n","| Tasmania | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.631851 |\n","| Area under PR curve: | 0.547199 |\n","| Accuracy: | 0.805431 |\n","| Precision: | 0.713668 |\n","| Recall: | 0.301529 |\n","| F-measure: | 0.423941 |\n"]}],"source":["print('+-------------+')\n","print('|Random Forest|')\n","print('+-------------+')\n","\n","for k,v in dataframes.items():\n"," forestaCasuale(v,k)\n"]},{"cell_type":"markdown","metadata":{"id":"zL-GZT4tEFOB"},"source":["## Logistic Regression"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"LuLuJDhAEHRb"},"outputs":[],"source":["lr = LogisticRegression(featuresCol = 'pca', labelCol = 'RainTomorrow')\n","\n","# grid search\n","paramGrid = ParamGridBuilder() \\\n"," .addGrid(lr.maxIter, [x for x in range(1,30,5)])\\\n"," .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\\\n"," .build()\n","\n","# cross-validation\n","crossval = CrossValidator(estimator=lr,\n"," numFolds= 5,\n"," estimatorParamMaps=paramGrid,\n"," evaluator=BinaryClassificationEvaluator(labelCol='RainTomorrow',\n"," rawPredictionCol='prediction',\n"," metricName=\"areaUnderROC\"))\n","\n","# Evaluator per ROC curve\n","evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderROC\")\n","\n","# Evaluator per PR curve\n","evaluator2 = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n"," labelCol='RainTomorrow',\n"," metricName=\"areaUnderPR\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"512pgz5uEH8H"},"outputs":[],"source":["def regressioneLogistica(dataframe, name):\n"," splits = dataframe.randomSplit([0.7, 0.3]) \n"," train = splits[0] \n"," test = splits[1] \n","\n"," lrmodel= crossval.fit(train)\n"," predictions = lrmodel.transform(test)\n","\n"," print()\n","\n"," TP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 1)).count() +0.1 # evita ZeroDivisionError\n"," FP = predictions.filter((predictions['prediction'] == 1.0) & (predictions['RainTomorrow'] == 0)).count()\n"," TN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 0)).count()\n"," FN = predictions.filter((predictions['prediction'] == 0.0) & (predictions['RainTomorrow'] == 1)).count()\n","\n"," print(tabulate([['Area under ROC curve:', evaluator.evaluate(predictions)],\n"," ['Area under PR curve:', evaluator2.evaluate(predictions)],\n"," ['Accuracy:', (TP+TN)/(TP+TN+FP+FN)], \n"," ['Precision:', TP/(TP+FP)], \n"," ['Recall:', TP/(TP+FN)], \n"," ['F-measure:', (2*TP)/(2*TP+FN+FP)]], headers=[name, ''], tablefmt='orgtbl'))"]},{"cell_type":"code","execution_count":39,"metadata":{"id":"kOcG0VZBEH4Y","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1641057157807,"user_tz":-60,"elapsed":803406,"user":{"displayName":"Mario Bianchi","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiYjqJ5A8gRK7lZX57CyeAJHOrmxNdh0QGBaLkAFg=s64","userId":"04504788663289352355"}},"outputId":"cee67766-e175-4802-86d9-f3c024f8a0c5"},"outputs":[{"output_type":"stream","name":"stdout","text":["+-------------------+\n","|Logistic Regression|\n","+-------------------+\n","\n","| Norfolk Island | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.68341 |\n","| Area under PR curve: | 0.576412 |\n","| Accuracy: | 0.76589 |\n","| Precision: | 0.673636 |\n","| Recall: | 0.468274 |\n","| F-measure: | 0.552488 |\n","\n","| Victoria | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.694539 |\n","| Area under PR curve: | 0.569966 |\n","| Accuracy: | 0.822011 |\n","| Precision: | 0.695471 |\n","| Recall: | 0.450919 |\n","| F-measure: | 0.54711 |\n","\n","| New South Wales | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.667764 |\n","| Area under PR curve: | 0.54038 |\n","| Accuracy: | 0.831048 |\n","| Precision: | 0.68632 |\n","| Recall: | 0.383016 |\n","| F-measure: | 0.491654 |\n","\n","| Queensland | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.713941 |\n","| Area under PR curve: | 0.586593 |\n","| Accuracy: | 0.824804 |\n","| Precision: | 0.701542 |\n","| Recall: | 0.496538 |\n","| F-measure: | 0.581501 |\n","\n","| Northern Territory | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.729638 |\n","| Area under PR curve: | 0.600783 |\n","| Accuracy: | 0.893005 |\n","| Precision: | 0.752034 |\n","| Recall: | 0.490028 |\n","| F-measure: | 0.593397 |\n","\n","| South Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.696838 |\n","| Area under PR curve: | 0.560214 |\n","| Accuracy: | 0.853774 |\n","| Precision: | 0.702395 |\n","| Recall: | 0.439035 |\n","| F-measure: | 0.540333 |\n","\n","| Western Australia | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.714409 |\n","| Area under PR curve: | 0.618601 |\n","| Accuracy: | 0.841617 |\n","| Precision: | 0.755438 |\n","| Recall: | 0.475789 |\n","| F-measure: | 0.583855 |\n","\n","| Tasmania | |\n","|-----------------------+----------|\n","| Area under ROC curve: | 0.64593 |\n","| Area under PR curve: | 0.502389 |\n","| Accuracy: | 0.797636 |\n","| Precision: | 0.628149 |\n","| Recall: | 0.357777 |\n","| F-measure: | 0.455891 |\n"]}],"source":["print('+-------------------+')\n","print('|Logistic Regression|')\n","print('+-------------------+')\n","\n","for k,v in dataframes.items():\n"," regressioneLogistica(v,k)"]}],"metadata":{"colab":{"collapsed_sections":[],"name":"6.Classificazione per regione politica","provenance":[],"authorship_tag":"ABX9TyMmHENreI1R/NbSu7FwcByP"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}