# Topic Modelling.R (forked from sureshgorakala/Text-Mining)
library("twitteR")
library("tm")
library("wordcloud")
library("slam")
library("topicmodels")
library("readr")
#Connect to Twitter
# For guidance on creating an app see
# https://www.credera.com/blog/business-intelligence/twitter-analytics-using-r-part-1-extract-tweets/
Consumer_key<- "YOUR_CONSUMER_KEY"
Consumer_secret <- "YOUR_CONSUMER_SECRET"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_TOKEN_SECRET"
setup_twitter_oauth(Consumer_key,Consumer_secret,access_token,access_token_secret)
numTweets <- 900
#get data set and save for later, e.g. to investigate anomalies
tweetData <- searchTwitter("flight", n=numTweets, lang="en")
write.csv(twListToDF(tweetData), file="tweets.csv")
#Load Text
dataset <- read_csv("tweets.csv")
tweets <- dataset$text
#
#Clean Text
tweets = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets, perl = TRUE) # strip RT/via retweet markers; the (?: ) group needs perl = TRUE
tweets = gsub("http[^[:blank:]]+", "", tweets) # remove URLs
tweets = gsub("@\\w+", "", tweets)             # remove @mentions
tweets = gsub("[ \t]{2,}", " ", tweets)        # collapse runs of whitespace to a single space
tweets = gsub("^\\s+|\\s+$", "", tweets)       # trim leading/trailing whitespace
tweets = gsub("\\d+", "", tweets)              # remove numbers
tweets = gsub("[[:punct:]]", " ", tweets)      # replace punctuation with spaces
corpus = Corpus(VectorSource(tweets))
corpus = tm_map(corpus,removePunctuation)
corpus = tm_map(corpus,stripWhitespace)
corpus = tm_map(corpus,content_transformer(tolower)) # wrap base functions so the corpus class is preserved
corpus = tm_map(corpus,removeWords,stopwords("english"))
#remove Twitter client names that leak into the text (e.g. from the tweet source field)
corpus = tm_map(corpus,removeWords, c("iphone","android","web", "tweetdeck", "ifttt"))
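# wordcloud is loaded above but never used; a minimal sketch (an assumption,
# not part of the original pipeline) of how it could visualise the cleaned corpus:
# wordcloud(corpus, max.words = 100, random.order = FALSE)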
dtm = DocumentTermMatrix(corpus) # create a document-term matrix (one row per tweet, one column per term)
# create tf-idf matrix
# see http://tidytextmining.com/tfidf.html for the theory
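# For each term: mean normalised term frequency (entry counts divided by tweet
# length, averaged over the tweets containing the term) times
# log2(number of tweets / document frequency); low values flag terms that
# appear everywhere and so carry little topical signal.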
term_tfidf <- tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0))
summary(term_tfidf)
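# The 0.1 cut-off below is a heuristic; a common alternative (an assumption,
# not in the original script) is to threshold at the median of term_tfidf:
# dtm <- dtm[, term_tfidf >= median(term_tfidf)]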
dtm <- dtm[,term_tfidf >= 0.1] # keep only reasonably discriminative terms
dtm <- dtm[row_sums(dtm) > 0,] # drop tweets left with no terms (LDA requires non-empty documents)
#Perform the topic modelling and save summary
#Further info on LDA here http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/
lda <- LDA(dtm, k = 8, control = list(seed = 1234)) # find 8 topics; fixed seed (added here, any value works) makes the fit reproducible
Terms <- terms(lda, 10) # top 10 terms of every topic
Terms
write.csv(Terms, file = "Terms.csv")
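# Optional extension (a sketch, not in the original script): topics() returns
# the most likely topic for each remaining tweet, which pairs with Terms above.
mainTopics <- topics(lda, 1)
table(mainTopics) # tweet counts per topic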