-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassignment-2.qmd
143 lines (106 loc) · 4.82 KB
/
assignment-2.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
---
title: "Machine Learning - Assignment 2 (Spotify)"
author: "Conor Heffron (23211267)"
format: pdf
editor: visual
---
## Load R libraries
```{r}
library(tidyverse)
library(dendextend)
library(formatR)
library(knitr)
# Global chunk defaults: tidy the echoed code with a target width of 60.
opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(width.cutoff = 60)
)
```
## Load Spotify Data
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Directory holding the data set, relative to where the document is rendered.
path <- "."
# file.path() inserts the separator itself, so the directory carries no
# trailing slash. Only the literal string "NA" is read as a missing value.
spotify_23211267 <- read_csv(file.path(path, "spotify.csv"), na = "NA")
# View(spotify_23211267)
```
## Show Data Dimensions, Structure, Summary
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Number of rows and columns in the Spotify data
dim(spotify_23211267)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Compact view of each column's type and first values
str(spotify_23211267)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Per-column summary statistics
summary(spotify_23211267)
```
## Hierarchical Clustering via Euclidean Distance of Spotify Data
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Complete-linkage hierarchical clustering on pairwise Euclidean distances
# between the rows of the Spotify data.
# NOTE(review): the whole data frame is handed to dist(). If it contains
# non-numeric columns (playlist_genre is used as a facet variable further
# down), confirm they are meant to enter the distance, and whether the
# numeric features should be scaled first. Changing this would alter the
# tree heights that later cuts rely on.
hc_spotify <- hclust(
  dist(spotify_23211267, method = "euclidean"),
  method = "complete"
)
```
## Segment Spotify Data into 5 clusters / groups
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Cut the tree into 5 groups and attach the group id to every track.
grp5 <- cutree(hc_spotify, k = 5)
segmented_spotify <- spotify_23211267 %>%
  mutate(cluster = grp5)

# Audio features against loudness, coloured by cluster, one panel per genre.
ggplot(
  segmented_spotify,
  aes(x = loudness, y = acousticness, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)

ggplot(
  segmented_spotify,
  aes(x = loudness, y = speechiness, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)

ggplot(
  segmented_spotify,
  aes(x = loudness, y = instrumentalness, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)

ggplot(
  segmented_spotify,
  aes(x = loudness, y = danceability, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)

ggplot(
  segmented_spotify,
  aes(x = loudness, y = energy, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)

# Track counts per mode, stacked by cluster, one panel per genre.
ggplot(segmented_spotify, aes(mode, fill = factor(cluster))) +
  geom_bar() +
  facet_wrap(~playlist_genre)

ggplot(
  segmented_spotify,
  aes(x = loudness, y = tempo, color = factor(cluster))
) +
  geom_point() +
  facet_wrap(~playlist_genre)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Write the coloured dendrogram, plus the upper tree and one lower branch of
# a cut at a fixed height, to a PDF file.
# NOTE: the `avg_` prefix is only a name; the linkage is whatever was used
# to build hc_spotify.
cut_height <- 16000

avg_dend_obj <- as.dendrogram(hc_spotify)
avg_col_dend <- color_branches(avg_dend_obj, h = cut_height)

# Cut once, before any graphics device is opened, so that a failure here
# cannot leave a half-written PDF device open.
dend_parts <- cut(avg_col_dend, h = cut_height)

# pdf() does not create missing directories, so make sure the target exists.
plot_dir <- "plots"
dir.create(plot_dir, showWarnings = FALSE, recursive = TRUE)

# Create PDF for plotting
pdf(
  file.path(plot_dir, "plots_avg_col_dendrogram.pdf"),
  width = 40,
  height = 15
)

# Plotting
plot(avg_col_dend)
plot(dend_parts$upper, main = "Upper tree of cut at h=16000")
plot(
  dend_parts$lower[[2]],
  main = "Second branch of lower tree with cut at h=16000"
)

# Close the PDF file's associated graphics device
dev.off()
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Folder with the accuracy results exported from Weka.
# NOTE(review): this is an absolute path on one specific machine, so the
# document will not render elsewhere. Consider keeping weka_acc.csv next to
# the document and using a relative path.
# file.path() supplies the separators, so no component ends in a slash.
path_weka <- file.path(
  "/Users/conorheffron/Library/CloudStorage",
  "GoogleDrive-conor.heffron@ucdconnect.ie/My Drive/UCD",
  "MSc in AI for Medicine and Medical Research/Courses/Trimester 1",
  "COMP47460-Machine Learning",
  "COMP47460-Machine Learning (Blended Del)-202324 Autumn - 1062023 - 839 PM",
  "Assignment-2/weka_notes"
)
weka_acc <- read_csv(file.path(path_weka, "weka_acc.csv"))
```
## Get Data Dimensions & Summary, Print Data Frame
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Number of rows and columns in the Weka accuracy table
dim(weka_acc)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Per-column summary statistics
summary(weka_acc)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Print the table itself
weka_acc
```
## Plot for voting with 3x Combination Rules (KNN K=1, Multilayer Perceptron Neural Network (MP NN), J48 (Decision Tree))
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Rows without bagging (Bag == 0) whose Type does not start with "Vote":
# `%` against Type, one line per Accuracy value.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
weka_acc %>%
  filter(substr(Type, 1, 4) != "Vote", Bag == 0) %>%
  ggplot(aes(x = Type, y = `%`, color = Accuracy, group = Accuracy)) +
  geom_line(linewidth = 1)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Rows without bagging (Bag == 0) whose Type starts with "Vote":
# `%` against Type, one line per Accuracy value.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
weka_acc %>%
  filter(substr(Type, 1, 4) == "Vote", Bag == 0) %>%
  ggplot(aes(x = Type, y = `%`, color = Accuracy, group = Accuracy)) +
  geom_line(linewidth = 1)
```
## Plot Ensembles by Bagging (Bag == 2-\>20 in increments of 2)
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Bagged runs only (Bag != 0): `%` against Bag, one panel per Type and one
# line per Accuracy value.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
weka_acc %>%
  filter(Bag != 0) %>%
  ggplot(aes(y = `%`, x = Bag, color = Accuracy, group = Accuracy)) +
  geom_point() +
  geom_line(linewidth = 0.31) +
  facet_wrap(~Type)
```
## Build Linear Regression Model (predict energy based on tempo, loudness, liveness)
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Additive model: energy explained by tempo, loudness and liveness.
lm_energy1 <- lm(
  formula = energy ~ tempo + loudness + liveness,
  data = spotify_23211267
)
summary(lm_energy1)

# Diagnostic plots for the fitted model
plot(lm_energy1, pch = 16, col = "blue")

# Confidence intervals for the coefficients
confint(lm_energy1)

# Residual standard error relative to the mean of energy
sigma(lm_energy1) / mean(spotify_23211267$energy)
```
```{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
# Same predictors as the additive model, plus all of their interactions.
lm_energy2 <- lm(
  formula = energy ~ tempo * loudness * liveness,
  data = spotify_23211267
)
summary(lm_energy2)

# Diagnostic plots for the fitted model
plot(lm_energy2, pch = 16, col = "blue")

# Confidence intervals for the coefficients
confint(lm_energy2)

# Residual standard error relative to the mean of energy. The original
# analysis found this lower than for lm_energy1, i.e. the more accurate model.
sigma(lm_energy2) / mean(spotify_23211267$energy)
```