-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgofund.Rmd
244 lines (186 loc) · 5.96 KB
/
gofund.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
---
title: "R Notebook"
output:
  html_document:
    df_print: paged
---
# Importing libraries
```{r}
library(tidyverse)
library(ggplot2)
library(stringi)
library(wordcloud)
library(tidytext)
library(stringr)
library(dplyr)
library(textdata)
```
```{r}
# Load the GoFundMe campaign data. The path is relative, so it is resolved
# against the current working directory.
gofund <- read.csv(file = "gofundme_data_OSF.csv")
# Preview the first six rows of the imported table.
head(gofund, n = 6L)
```
```{r}
# Preprocessing
# Encode the categorical columns as factors with an explicit level set.
# Any value that is not listed below becomes NA (standard factor() behaviour).
gofund$currency <- factor(
  gofund$currency,
  levels = c("USD", "GBP", "CAD", "AUD", "EUR", "NOK")
)
gofund$Category <- factor(
  gofund$Category,
  levels = c(
    "Funerals & Memorials",
    "Sports, Teams & Clubs",
    "Medical, Illness & Healing",
    "Missions, Faith & Church",
    "Business & Entrepreneurs",
    "Babies, Kids & Family",
    "Other",
    "Education & Learning",
    "Community & Neighbors",
    "Accidents & Emergencies",
    "Creative Arts, Music & Film",
    "Celebrations & Events",
    "Volunteer & Service",
    "Animals & Pets",
    "Rent, Food & Monthly Bills",
    "Environment",
    "Non-Profits & Charities",
    "Dreams, Hopes & Wishes",
    "Competitions & Pageants",
    "Travel & Adventure"
  )
)

# Success == 1 | Fail == 0
# Compare the amounts as numbers. If either column was read as text, `>` would
# silently fall back to alphabetical comparison (e.g. "9" > "10"). Going
# through as.character() keeps the conversion correct for factor columns too.
# Amounts that cannot be parsed yield NA in `state`.
# NOTE(review): a campaign that raises exactly its goal is classed as failed
# by the strict `>` -- confirm this is intended.
gofund$state <- ifelse(
  as.numeric(as.character(gofund$Raised)) >
    as.numeric(as.character(gofund$Goal)),
  1,
  0
)
```
```{r}
# Data cleaning
# Text-score columns that later chunks sum and therefore need to be numeric.
# Each column is listed once; the earlier version converted four of them twice.
score_cols <- c(
  "negate", "sad", "risk", "money",
  "anger", "posemo", "negemo", "anx"
)
# as.numeric() turns unparseable entries into NA (with a coercion warning).
gofund[score_cols] <- lapply(gofund[score_cols], as.numeric)

# A single na.omit() after the conversion is sufficient: it drops rows that
# were already incomplete as well as rows whose scores failed to parse.
gofund <- na.omit(gofund)
```
```{r}
# Data exploration
# Number of failed (state == 0) vs. successful (state == 1) campaigns.
# count() groups by `state` itself, so a preceding group_by() is unnecessary
# and would only leave the result grouped.
gofund %>%
  count(state)
```
```{r}
# Currency levels defined on the factor (used in this notebook as a stand-in
# for the campaign's country).
levels(gofund[["currency"]])
```
```{r}
# Successful vs. failed campaign counts for each currency
grp_by_country <- summarise(
  group_by(gofund, currency),
  successful = sum(state == 1),
  failed = sum(state == 0)
)
grp_by_country
```
```{r}
# Visualization: campaigns per currency, stacked by outcome.
# `state` is numeric (0/1). A continuous fill does not split the bars into
# groups, so ggplot2 drops it and every bar comes out in a single colour.
# Converting to a factor produces one coloured segment per outcome.
ggplot(gofund, aes(currency)) +
  geom_bar(aes(fill = factor(state))) +
  labs(fill = "state")
```
```{r}
# Successful vs. failed campaign counts for each campaign category
grp_by_category <- summarise(
  group_by(gofund, Category),
  successful = sum(state == 1),
  failed = sum(state == 0)
)
grp_by_category
```
```{r}
# Visualization: campaigns per category (horizontal bars), stacked by outcome.
# `state` is numeric (0/1); it must be mapped as a factor, otherwise ggplot2
# cannot group on it and the fill aesthetic is dropped.
ggplot(gofund, aes(y = Category)) +
  geom_bar(aes(fill = factor(state))) +
  labs(fill = "state")
```
```{r}
# Funds: five-number summary and mean of the amount raised
summary(as.numeric(gofund[["Raised"]]))
```
```{r}
# Total of `date_difference` within each campaign category
grp_by_category_date <- summarise(
  group_by(gofund, Category),
  date_difference = sum(as.numeric(date_difference))
)
grp_by_category_date
```
```{r}
# Visualization: `date_difference` per category. geom_col() stacks the
# per-campaign values, so each bar's length is the category total.
ggplot(
  data = gofund,
  mapping = aes(x = as.numeric(date_difference), y = Category)
) +
  geom_col()
```
```{r}
# Total word count (column `WC`) of the descriptions in each category
count_category <- summarise(
  group_by(gofund, Category),
  word_count = sum(as.numeric(WC))
)
count_category
```
```{r}
# Word frequency
# Normalise the campaign descriptions: re-encode, lowercase, strip punctuation
# and digits, then collapse runs of whitespace into one space. trimws()
# removes the single leading/trailing space that the collapse can leave
# behind, so that splitting on " " does not emit empty tokens.
gofund$Description <- iconv(enc2utf8(gofund$Description), sub = "byte")
gofund$text_clean <- tolower(gofund$Description)
gofund$text_clean <- gsub("[[:punct:]]", "", gofund$text_clean)
gofund$text_clean <- gsub("[[:digit:]]", "", gofund$text_clean)
gofund$text_clean <- trimws(
  gsub("\\s+", " ", stri_enc_toutf8(gofund$text_clean))
)

# Flat vector of every token across all descriptions. NA and "" entries are
# dropped so that they are neither counted as words nor ranked as a "word".
vocab <- unlist(str_split(gofund$text_clean, " "))
vocab <- vocab[!is.na(vocab) & nzchar(vocab)]

# Corpus-level statistics
total_words <- length(vocab)              # number of tokens
unique_words <- length(unique(vocab))     # number of distinct tokens
lex_div <- unique_words / total_words     # lexical diversity

# Keep only tokens that are not in tidytext's stop-word list.
vocab_nsw <- vocab[!(vocab %in% stop_words$word)]
gofund_text_clean <- paste(vocab_nsw, collapse = " ") # one long string

# Frequency table, most frequent first. The tokens are tabulated directly
# instead of being pasted together and split again. Column names are set
# explicitly because as.data.frame() derives them from the table's dimnames.
freq_word <- as.data.frame(sort(table(vocab_nsw), decreasing = TRUE))
names(freq_word) <- c("word", "freq")

# Ten most frequent words (ties included) and their bar chart
word_frequency_top10 <- freq_word %>%
  top_n(10, freq)
ggplot(word_frequency_top10, aes(word, freq, fill = freq)) +
  geom_col()
```
```{r}
# Word cloud built from the 200 most frequent words in `freq_word`
word_frequency_top200 <- top_n(freq_word, 200, freq)
wordcloud(
  words = word_frequency_top200$word,
  freq = word_frequency_top200$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE, # place the most frequent words first
  rot.per = 0.35,       # share of words drawn rotated
  colors = brewer.pal(8, "Dark2")
)
```
```{r}
# Keep only the description text and its category for the token-level analysis
text_df <- gofund[, c("Description", "Category")]
# Display the table grouped by category (the grouped result is not stored).
group_by(text_df, Category)
```
```{r}
# Split every description into one-word-per-row form, keeping its category
tidy_category <- unnest_tokens(text_df, word, Description)
head(tidy_category, n = 6L)
```
```{r}
# Removing stop words
data(stop_words)
# Name the join key explicitly: this documents the intent, silences the
# "Joining with by = ..." message, and guards against accidentally matching
# on any other column the two tables might come to share.
tidy_category <- tidy_category %>%
  anti_join(stop_words, by = "word")
```
```{r}
# Occurrences of each remaining word, most frequent first
count(tidy_category, word, sort = TRUE)
```
```{r}
# Visualization: horizontal bars for words that occur more than 25000 times.
# reorder() sorts the factor levels by count so the bars appear in rank order.
ggplot(
  data = tidy_category %>%
    count(word, sort = TRUE) %>%
    filter(n > 25000) %>%
    mutate(word = reorder(word, n)),
  mapping = aes(x = n, y = word)
) +
  geom_col() +
  labs(y = NULL)
```
```{r}
# Totals of the positive- and negative-emotion scores for failed (0) and
# successful (1) campaigns
grp_by_state <- summarise(
  group_by(gofund, state),
  posemo = sum(posemo),
  negemo = sum(negemo)
)
grp_by_state
```
```{r}
# NRC emotions related to financial distress, strain and pressure.
# Membership must be tested with %in%. `==` against a length-4 vector recycles
# that vector down the rows, so a row is kept only when its sentiment happens
# to equal the element at the same recycled position, which silently discards
# most matching rows. The NRC label is "anticipation"; "anticip" matches
# nothing.
nrc <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("anger", "fear", "sadness", "anticipation"))
nrc
```
```{r}
# Words in the descriptions that carry one of the selected NRC emotions,
# counted per category and per emotion.
# - `by = "word"` states the join key explicitly.
# - A word can carry several of the selected emotions and then matches more
#   than one `nrc` row. Keeping `sentiment` in the count reports each
#   word/emotion pair separately instead of adding the duplicated rows into
#   one inflated per-word total.
tidy_category %>%
  inner_join(nrc, by = "word") %>%
  group_by(Category) %>%
  count(word, sentiment, sort = TRUE)
```
```{r}
# LIWC output variables related to financial distress / financial stress:
# negemo, anx, anger, sad, risk, money.
# Sum each of them within every campaign category.
# Note: this reuses the name `grp_by_category` and therefore replaces the
# success/failure table stored under that name earlier in the notebook.
grp_by_category <- summarise(
  group_by(gofund, Category),
  anxiety = sum(anx),
  anger = sum(anger),
  sad = sum(sad),
  risk = sum(risk),
  money = sum(money),
  negemo = sum(negemo)
)
grp_by_category
```