-
Notifications
You must be signed in to change notification settings - Fork 0
/
Footprints.Rmd
207 lines (180 loc) · 6.27 KB
/
Footprints.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
---
title: "Groupme Data Analysis"
output: html_notebook
---
This is a data project exploring the evolution of our social space in VFA.
Questions:
- Who contributes the most?
- Meme frequency by time of day?
- Themes of the most liked posts?
- Who gets @'d the most?
- Who @'s the most?
- Who gives their likes most freely?
Let's get our data:
```{r}
# Load the raw GroupMe export into a plain data.frame.
# NOTE(review): setwd() in a notebook is fragile (breaks on any other machine);
# prefer an RStudio project or here::here(). Kept so existing relative paths
# in this notebook still resolve.
setwd("~/Documents/Coding/Data")
library(readr)
data <- as.data.frame(read_csv("groupme_cleaned_72918.csv"))
View(data)   # interactive sanity check of the loaded table
class(data)  # confirm coercion from tibble to data.frame
```
```{r}
# Libraries used throughout the analysis.
library(lattice)
library(boot)
library(MASS)
# Install only when missing — an unconditional install.packages() re-downloads
# on every knit and fails in non-interactive sessions.
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(ggplot2)
if (!requireNamespace("anytime", quietly = TRUE)) install.packages("anytime")
library(anytime)
```
Data cleaning and prep:
```{r}
#**************** DATA CLEANING *****************************
# Convert the export's timestamps to POSIXct and rename for clarity.
data$created_at <- anytime(data$created_at)
colnames(data)[colnames(data) == "created_at"] <- "time_sent"
data$attachments <- as.character(data$attachments)
data$favorited_by <- as.character(data$favorited_by)

######## ADD LIKES COLUMN ##########
# favorited_by holds a list-like string of liker ids, e.g. "[id1, id2]";
# "[]" means zero likes. For non-empty lists, the number of likes equals
# the number of commas plus one.
liked <- which(data$favorited_by != "[]")
s <- data$favorited_by[liked]
s2 <- gsub(",", "", s)  # remove commas; length difference = comma count
data$likes <- rep(0, nrow(data))
data$likes[liked] <- nchar(s) - nchar(s2) + 1

####### ADD ATTACHMENT FLAG COLUMNS
data$pic_attch <- grepl("image", data$attachments)
data$mention_attch <- grepl("mentions", data$attachments)

####### FIND NUMBER OF MENTIONS
# Extract the "'user_ids': [ ... ]}" payload from the attachments blob.
# fixed = TRUE (logical, not the string "TRUE") disables regex interpretation
# so the bracket characters are matched literally.
data$mention_ids <- sapply(strsplit(data$attachments, "'user_ids': [", fixed = TRUE), '[', 2)
data$mention_ids <- sapply(strsplit(data$mention_ids, "]}", fixed = TRUE), '[', 1)
# Same comma-counting trick as for likes: ids are comma-separated, so
# mentions = comma count + 1; rows with no mention payload stay at 0.
mentioned <- which(!is.na(data$mention_ids))
s <- data$mention_ids[mentioned]
s2 <- gsub(",", "", s)
data$mention_count <- rep(0, nrow(data))
data$mention_count[mentioned] <- nchar(s) - nchar(s2) + 1
```
Write the data to a csv
```{r}
# Snapshot the cleaned data set to disk ("XXXXXX" looks like a date
# placeholder — TODO confirm the intended naming convention) and
# sanity-check the result interactively.
write.csv(data, file = "GroupmeDataXXXXXX.csv")
View(data)
nrow(data)
```
Get text mining packages:
```{r}
# Text-mining dependencies. Install only the missing ones so the chunk is
# re-runnable without hitting the network (and works non-interactively).
for (pkg in c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RCurl", "XML")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RCurl)
library(XML)
```
Define rquery.wordcloud
```{r}
#++++++++++++++++++++++++++++++++++
# rquery.wordcloud() : Word cloud generator
# - http://www.sthda.com
#+++++++++++++++++++++++++++++++++++
# x : character string (plain text, web url, txt file path)
# type : specify whether x is a plain text, a web page url or a file path
# lang : the language of the text
# excludeWords : a vector of words to exclude from the text
# textStemming : reduces words to their root form
# colorPalette : the name of color palette taken from RColorBrewer package,
# or a color name, or a color code
# min.freq : words with frequency below min.freq will not be plotted
# max.words : Maximum number of words to be plotted. least frequent terms dropped
# value returned by the function : a list(tdm, freqTable)
#' Word cloud generator (adapted from sthda.com).
#'
#' @param x character input: plain text, a web URL, or a file path.
#' @param type which kind of input `x` is; only `type[1]` is honored.
#' @param lang language used for stopword removal.
#' @param excludeWords optional extra words to strip before counting.
#' @param textStemming reduce words to their root form?
#' @param colorPalette an RColorBrewer palette name, or a literal color.
#' @param min.freq drop words occurring fewer than this many times.
#' @param max.words cap on plotted words; least frequent dropped first.
#' @return invisibly, list(tdm = term-document matrix, freqTable = word/freq data.frame).
rquery.wordcloud <- function(x, type = c("text", "url", "file"),
                             lang = "english", excludeWords = NULL,
                             textStemming = FALSE, colorPalette = "Dark2",
                             min.freq = 3, max.words = 200) {
  library("tm")
  library("SnowballC")
  library("wordcloud")
  library("RColorBrewer")

  # Resolve the input source; only the first element of `type` matters.
  if (type[1] == "file") {
    text <- readLines(x)
  } else if (type[1] == "url") {
    text <- html_to_text(x)  # helper defined later in this chunk
  } else if (type[1] == "text") {
    text <- x
  }

  # Standard corpus cleanup pipeline.
  docs <- Corpus(VectorSource(text))
  docs <- tm_map(docs, content_transformer(tolower))  # lower-case everything
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, stopwords(lang))  # language stopwords
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)
  if (!is.null(excludeWords)) {
    docs <- tm_map(docs, removeWords, excludeWords)   # caller-supplied stopwords
  }
  if (textStemming) {
    docs <- tm_map(docs, stemDocument)
  }

  # Word frequency table, most frequent first.
  tdm <- TermDocumentMatrix(docs)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # Accept either an RColorBrewer palette name or a literal color/code.
  if (!colorPalette %in% rownames(brewer.pal.info)) {
    colors <- colorPalette
  } else {
    colors <- brewer.pal(8, colorPalette)
  }

  # Fixed seed so the cloud layout is reproducible across knits.
  set.seed(1234)
  wordcloud(d$word, d$freq, min.freq = min.freq, max.words = max.words,
            random.order = FALSE, rot.per = 0.35,
            use.r.layout = FALSE, colors = colors)
  invisible(list(tdm = tdm, freqTable = d))
}
#++++++++++++++++++++++
# Helper function
#++++++++++++++++++++++
# Download and parse webpage
# Fetch a web page and reduce it to its human-readable text content.
html_to_text <- function(url) {
  library(RCurl)
  library(XML)
  raw_html <- getURL(url)                       # download the page source
  parsed <- htmlParse(raw_html, asText = TRUE)  # build a queryable DOM
  # "//text()" selects every text node; the predicates exclude the contents
  # of script/style/noscript/form elements, which are code rather than prose.
  nodes <- xpathSApply(
    parsed,
    "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]",
    xmlValue
  )
  # Join all extracted text nodes into one character string.
  paste(nodes, collapse = " ")
}
```
Most Common Words Overall:
```{r}
# Junk tokens left behind by contraction splitting and punctuation stripping.
noise_tokens <- c("s", "d", "y", "m", "ve", "'", "", ",", "ll", ",", "`", "re")
rquery.wordcloud(data$text, type = "text",
                 lang = "english", excludeWords = noise_tokens,
                 textStemming = FALSE, colorPalette = "Dark2",
                 max.words = 200)
```
Common Words in Top 200 Most Likes (Top 5%) Messages:
```{r}
library(plyr)
# The 200 most-liked messages — roughly the top 5% of all posts.
top200 <- head(arrange(data, desc(likes)), n = 200)
View(top200$text)  # eyeball the most-liked messages
# Junk tokens left behind by contraction splitting and punctuation stripping.
rquery.wordcloud(top200$text, type = "text",
                 lang = "english",
                 excludeWords = c("s", "d", "y", "m", "ve", "'", "", ",", "ll", ",", "`", "re"),
                 textStemming = FALSE, colorPalette = "Dark2",
                 max.words = 200)
```
Correlation between post activity and total likes accumulated: