From 0ca707ba93c58acbf02b434b83f99bd921a15ad2 Mon Sep 17 00:00:00 2001 From: Conor Heffron Date: Sat, 23 Mar 2024 00:10:02 +0000 Subject: [PATCH] Refactor spot (#3) * Create .igitignore * Update .gitignore * Minor refactor of main method & README.md updates --- .gitignore | 7 +++ README.md | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++- main.py | 7 ++- normalise.py | 14 ++---- 4 files changed, 144 insertions(+), 12 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7c2febc --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ + +*.iml +*.xml +*.pyc +/python +/weka_notes +*.arff diff --git a/README.md b/README.md index 5289ae2..f9b0545 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,129 @@ # normalise-spotify -## Max Min Normalisation of Spotify Data +## Min-Max Normalisation of Spotify Data + +### Sample CLI usage: python main.py `number_of_decimals_for_rounding` + +```shell + python main.py 3 +``` + +```shell + > python main.py 2 +The number of decimals set is 2 +Spotify data dimensions: (21812, 13) +------------------------------------------------ +Min of danceability is: 0.0 +Max of danceability is: 0.983 +Mean of danceability is: 0.6514165734458096 +Median of danceability is: 0.667 +Standard Deviation of danceability is: 0.1463938018880771 +Variance of danceability is: 0.021431145231245567 +Z score value danceability is: 14454.170000000002 +------------------------------------------------ +Min of energy is: 0.000175 +Max of energy is: 1.0 +Mean of energy is: 0.7201891396937466 +Median of energy is: 0.745 +Standard Deviation of energy is: 0.1734950267410095 +Variance of energy is: 0.030100524303863604 +Z score value energy is: 15698.15 +------------------------------------------------ +Min of key is: 0 +Max of key is: 11 +Mean of key is: 5.359205941683477 +Median of key is: 6.0 +Standard Deviation of key is: 3.604633195031203 +Variance of key is: 12.99338047072086 +Z score value key is: 10631.75 +------------------------------------------------ +Min of loudness is: -46.448 +Max of loudness is: 1.275 +Mean of loudness is: -6.488673665871998 +Median of loudness is: -5.895 +Standard Deviation of loudness is: 2.969534676572623 +Variance of loudness is: 8.818136195367273 +Z score value loudness is: 18263.84 +------------------------------------------------ +Min of mode is: 0 +Max of mode is: 1 +Mean of mode is: 0.578809829451678 +Median of mode is: 1.0 +Standard Deviation of mode is: 0.49376126632609246 +Variance of mode is: 0.24380018812394644 +Z score value mode is: 12625.0 +------------------------------------------------ +Min of speechiness is: 0.0 +Max of speechiness is: 0.877 +Mean of speechiness is: 0.10477729231615623 +Median of speechiness is: 0.0614 +Standard Deviation of speechiness is: 0.0995377389466803 +Variance of speechiness is: 0.009907761474617477 +Z score value speechiness is: 2606.0000000000005 +------------------------------------------------ +Min of acousticness is: 0.0 +Max of acousticness is: 0.994 +Mean of acousticness is: 0.15819448752980012 +Median of acousticness is: 0.067 +Standard Deviation of acousticness is: 0.2074554900373028 +Variance of acousticness is: 0.04303778034661744 +Z score value acousticness is: 3467.55 +------------------------------------------------ +Min of instrumentalness is: 0.0 +Max of instrumentalness is: 0.994 +Mean of instrumentalness is: 0.0964625660187053 +Median of instrumentalness is: 2.2e-05 +Standard Deviation of instrumentalness is: 0.23831234781115662 +Variance of instrumentalness is: 0.056792775119265684 +Z score value instrumentalness is: 2113.51 +------------------------------------------------ +Min of liveness is: 0.0 +Max of liveness is: 0.996 +Mean of liveness is: 0.19425706583532001 +Median of liveness is: 0.13 +Standard Deviation of liveness is: 0.15813235139325513 +Variance of liveness is: 0.025005840557159913 +Z score value liveness is: 4253.33 +------------------------------------------------ +Min of valence is: 0.0 +Max of valence is: 0.991 +Mean of valence is: 0.5054820983862094 +Median of valence is: 0.506 +Standard Deviation of valence is: 0.23474981893651678 +Variance of valence is: 0.05510747749072741 +Z score value valence is: 11125.94 +------------------------------------------------ +Min of tempo is: 0.0 +Max of tempo is: 220.252 +Mean of tempo is: 122.23560191637631 +Median of tempo is: 123.966 +Standard Deviation of tempo is: 26.325958217264375 +Variance of tempo is: 693.0560760571498 +Z score value tempo is: 12101.340000000002 +------------------------------------------------ +Min of duration_ms is: 4000 +Max of duration_ms is: 517810 +Mean of duration_ms is: 223585.04231615624 +Median of duration_ms is: 213507.0 +Standard Deviation of duration_ms is: 59883.42208965201 +Variance of duration_ms is: 3586024241.167423 +Z score value duration_ms is: 9321.51 + +------------------------------------------------ +z_i_loudness 18263.84 +z_i_energy 15698.15 +z_i_danceability 14454.17 +z_i_mode 12625.00 +z_i_tempo 12101.34 +z_i_playlist_genre 11156.35 +z_i_valence 11125.94 +z_i_key 10631.75 +z_i_duration_ms 9321.51 +z_i_liveness 4253.33 +z_i_acousticness 3467.55 +z_i_speechiness 2606.00 +z_i_instrumentalness 2113.51 +dtype: float64 +``` + + diff --git a/main.py b/main.py index 411e557..20bc392 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ +import sys + import pandas as pd import evaluation @@ -28,7 +30,8 @@ def normalise(df, decimals): if __name__ == '__main__': # Main run configurations - decimals = 3 + decimals = sys.argv[1] + print("The number of decimals set is %s" % str(decimals)) # Run assignment 2 code - assignment_2(decimals) \ No newline at end of file + assignment_2(int(decimals)) \ No newline at end of file diff --git a/normalise.py b/normalise.py index ff183db..b99f372 100644 --- a/normalise.py +++ b/normalise.py @@ -1,16 +1,15 @@ import pandas as pd + def playlist_genre(df, decimals): # Note: Genre is class label (5X) but plan to verify as categorical (nominal) feature playlist_genre_dict = {'edm' : 1, 'latin' : 2, 'pop' : 3, 'rap' : 4, 'rock' : 5, None: 0} - # Create dictionary mapping fetal health classification string to integer - dict_len = len(playlist_genre_dict) - # Assign int value for genre to new column 'playlist_genre_int' df["playlist_genre_int"] = df["playlist_genre"].apply(lambda x: playlist_genre_dict.get(x)) # absolute difference (after normalisation) + dict_len = len(playlist_genre_dict) df["z_i_playlist_genre"] = df["playlist_genre_int"].apply(lambda x: round(abs((x - dict_len) / dict_len), decimals)) @@ -33,11 +32,8 @@ def numerical_features(c, df, decimals): df[mean_c] = pd.to_numeric(df[c]).median() df[sd_c] = pd.to_numeric(df[c]).std() df[var_c] = pd.to_numeric(df[c]).var() - # if c == "acousticness" or c == "energy": - # # already normalised to range[0,1] - # df[z_i] = df[c] - # else: - df[z_i] = round(df[diff_curr_min_c] / df[diff_max_min_c], decimals) # Rounding to 3 decimals + + df[z_i] = round(df[diff_curr_min_c] / df[diff_max_min_c], decimals) # Rounding to decimals specified # For numeric or continuous variables # The absolute difference after normalisation to range [0, 1] is preferred @@ -48,4 +44,4 @@ def numerical_features(c, df, decimals): print("Median of " + c + " is: " + str(pd.to_numeric(df[c]).median())) print("Standard Deviation of " + c + " is: " + str(pd.to_numeric(df[c]).std())) print("Variance of " + c + " is: " + str(pd.to_numeric(df[c]).var())) - print("Zi sum value " + c + " is: " + str(pd.to_numeric(df[z_i].sum()))) + print("Z score value " + c + " is: " + str(pd.to_numeric(df[z_i].sum())))