From 0ca707ba93c58acbf02b434b83f99bd921a15ad2 Mon Sep 17 00:00:00 2001
From: Conor Heffron <conor.heffron@gmail.com>
Date: Sat, 23 Mar 2024 00:10:02 +0000
Subject: [PATCH] Refactor spot (#3)

* Create .gitignore

* Update .gitignore

* Minor refactor of main method & README.md updates
---
 .gitignore   |   7 +++
 README.md    | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 main.py      |   7 ++-
 normalise.py |  14 ++----
 4 files changed, 144 insertions(+), 12 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7c2febc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+
+*.iml
+*.xml
+*.pyc
+/python
+/weka_notes
+*.arff
diff --git a/README.md b/README.md
index 5289ae2..f9b0545 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,129 @@
 # normalise-spotify
 
-## Max Min Normalisation of Spotify Data
+## Min-Max Normalisation of Spotify Data
+
+### Sample CLI usage: `python main.py <number_of_decimals_for_rounding>`
+
+```shell
+ python main.py 3
+```
+
+```shell
+ > python main.py 2
+The number of decimals set is 2
+Spotify data dimensions: (21812, 13)
+------------------------------------------------
+Min of danceability is: 0.0
+Max of danceability is: 0.983
+Mean of danceability is: 0.6514165734458096
+Median of danceability is: 0.667
+Standard Deviation of danceability is: 0.1463938018880771
+Variance of danceability is: 0.021431145231245567
+Z score value danceability is: 14454.170000000002
+------------------------------------------------
+Min of energy is: 0.000175
+Max of energy is: 1.0
+Mean of energy is: 0.7201891396937466
+Median of energy is: 0.745
+Standard Deviation of energy is: 0.1734950267410095
+Variance of energy is: 0.030100524303863604
+Z score value energy is: 15698.15
+------------------------------------------------
+Min of key is: 0
+Max of key is: 11
+Mean of key is: 5.359205941683477
+Median of key is: 6.0
+Standard Deviation of key is: 3.604633195031203
+Variance of key is: 12.99338047072086
+Z score value key is: 10631.75
+------------------------------------------------
+Min of loudness is: -46.448
+Max of loudness is: 1.275
+Mean of loudness is: -6.488673665871998
+Median of loudness is: -5.895
+Standard Deviation of loudness is: 2.969534676572623
+Variance of loudness is: 8.818136195367273
+Z score value loudness is: 18263.84
+------------------------------------------------
+Min of mode is: 0
+Max of mode is: 1
+Mean of mode is: 0.578809829451678
+Median of mode is: 1.0
+Standard Deviation of mode is: 0.49376126632609246
+Variance of mode is: 0.24380018812394644
+Z score value mode is: 12625.0
+------------------------------------------------
+Min of speechiness is: 0.0
+Max of speechiness is: 0.877
+Mean of speechiness is: 0.10477729231615623
+Median of speechiness is: 0.0614
+Standard Deviation of speechiness is: 0.0995377389466803
+Variance of speechiness is: 0.009907761474617477
+Z score value speechiness is: 2606.0000000000005
+------------------------------------------------
+Min of acousticness is: 0.0
+Max of acousticness is: 0.994
+Mean of acousticness is: 0.15819448752980012
+Median of acousticness is: 0.067
+Standard Deviation of acousticness is: 0.2074554900373028
+Variance of acousticness is: 0.04303778034661744
+Z score value acousticness is: 3467.55
+------------------------------------------------
+Min of instrumentalness is: 0.0
+Max of instrumentalness is: 0.994
+Mean of instrumentalness is: 0.0964625660187053
+Median of instrumentalness is: 2.2e-05
+Standard Deviation of instrumentalness is: 0.23831234781115662
+Variance of instrumentalness is: 0.056792775119265684
+Z score value instrumentalness is: 2113.51
+------------------------------------------------
+Min of liveness is: 0.0
+Max of liveness is: 0.996
+Mean of liveness is: 0.19425706583532001
+Median of liveness is: 0.13
+Standard Deviation of liveness is: 0.15813235139325513
+Variance of liveness is: 0.025005840557159913
+Z score value liveness is: 4253.33
+------------------------------------------------
+Min of valence is: 0.0
+Max of valence is: 0.991
+Mean of valence is: 0.5054820983862094
+Median of valence is: 0.506
+Standard Deviation of valence is: 0.23474981893651678
+Variance of valence is: 0.05510747749072741
+Z score value valence is: 11125.94
+------------------------------------------------
+Min of tempo is: 0.0
+Max of tempo is: 220.252
+Mean of tempo is: 122.23560191637631
+Median of tempo is: 123.966
+Standard Deviation of tempo is: 26.325958217264375
+Variance of tempo is: 693.0560760571498
+Z score value tempo is: 12101.340000000002
+------------------------------------------------
+Min of duration_ms is: 4000
+Max of duration_ms is: 517810
+Mean of duration_ms is: 223585.04231615624
+Median of duration_ms is: 213507.0
+Standard Deviation of duration_ms is: 59883.42208965201
+Variance of duration_ms is: 3586024241.167423
+Z score value duration_ms is: 9321.51
+
+------------------------------------------------
+z_i_loudness            18263.84
+z_i_energy              15698.15
+z_i_danceability        14454.17
+z_i_mode                12625.00
+z_i_tempo               12101.34
+z_i_playlist_genre      11156.35
+z_i_valence             11125.94
+z_i_key                 10631.75
+z_i_duration_ms          9321.51
+z_i_liveness             4253.33
+z_i_acousticness         3467.55
+z_i_speechiness          2606.00
+z_i_instrumentalness     2113.51
+dtype: float64
+```
+
+
diff --git a/main.py b/main.py
index 411e557..20bc392 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,5 @@
+import sys
+
 import pandas as pd
 
 import evaluation
@@ -28,7 +30,8 @@ def normalise(df, decimals):
 
 if __name__ == '__main__':
     # Main run configurations
-    decimals = 3
+    decimals = sys.argv[1]
+    print("The number of decimals set is %s" % str(decimals))
 
     # Run assignment 2 code
-    assignment_2(decimals)
\ No newline at end of file
+    assignment_2(int(decimals))
\ No newline at end of file
diff --git a/normalise.py b/normalise.py
index ff183db..b99f372 100644
--- a/normalise.py
+++ b/normalise.py
@@ -1,16 +1,15 @@
 import pandas as pd
 
+
 def playlist_genre(df, decimals):
     # Note: Genre is class label (5X) but plan to verify as categorical (nominal) feature
     playlist_genre_dict = {'edm' : 1, 'latin' : 2, 'pop' : 3, 'rap' : 4, 'rock' : 5, None: 0}
 
-    # Create dictionary mapping fetal health classification string to integer
-    dict_len = len(playlist_genre_dict)
-
     # Assign int value for genre to new column 'playlist_genre_int'
     df["playlist_genre_int"] = df["playlist_genre"].apply(lambda x: playlist_genre_dict.get(x))
 
     # absolute difference (after normalisation)
+    dict_len = len(playlist_genre_dict)
     df["z_i_playlist_genre"] = df["playlist_genre_int"].apply(lambda x: round(abs((x - dict_len) / dict_len), decimals))
 
 
@@ -33,11 +32,8 @@ def numerical_features(c, df, decimals):
     df[mean_c] = pd.to_numeric(df[c]).median()
     df[sd_c] = pd.to_numeric(df[c]).std()
     df[var_c] = pd.to_numeric(df[c]).var()
-    # if c == "acousticness" or c == "energy":
-    #     # already normalised to range[0,1]
-    #     df[z_i] = df[c]
-    # else:
-    df[z_i] = round(df[diff_curr_min_c] / df[diff_max_min_c], decimals)  # Rounding to 3 decimals
+
+    df[z_i] = round(df[diff_curr_min_c] / df[diff_max_min_c], decimals)  # Rounding to decimals specified
 
     # For numeric or continuous variables
     # The absolute difference after normalisation to range [0, 1] is preferred
@@ -48,4 +44,4 @@ def numerical_features(c, df, decimals):
     print("Median of " + c + " is: " + str(pd.to_numeric(df[c]).median()))
     print("Standard Deviation of " + c + " is: " + str(pd.to_numeric(df[c]).std()))
     print("Variance of " + c + " is: " + str(pd.to_numeric(df[c]).var()))
-    print("Zi sum value " + c + " is: " + str(pd.to_numeric(df[z_i].sum())))
+    print("Z score value " + c + " is: " + str(pd.to_numeric(df[z_i].sum())))