# 3_1_data_pre_processing.R
# The raw text in the comments column is cleaned and transformed
# into a structured format suitable for further analysis
# dplyr is loaded explicitly in case this script is run on its own
library(dplyr)
# Limit reviews by removing those that are too short or too long (by character count)
all_data_df <- all_data_df %>%
  filter(!(is.na(comments)))
all_data_df$review_length_chars <- nchar(all_data_df$comments)
# Check min, max, mean and quantile statistics
# to decide on sensible length cut-offs
max(all_data_df$review_length_chars)
min(all_data_df$review_length_chars)
mean(all_data_df$review_length_chars)
quantile_review <- quantile(all_data_df$review_length_chars)
# Keep reviews between the lower-quartile length and 1,000 characters
all_data_df <- all_data_df %>%
  arrange(desc(review_length_chars))
all_data_df <- all_data_df %>%
  filter(between(review_length_chars, quantile_review[2], 1000))
# Review the spread using a histogram
hist(all_data_df$review_length_chars, breaks = 800)
# Require a minimum number of reviews per listing:
# identify listings with 5 or fewer reviews
to_remove_listing <- all_data_df %>%
  group_by(listing_id) %>%
  summarise(total = n()) %>%
  filter(total <= 5) %>%
  pull(listing_id)
# Remove property listings with 5 or fewer reviews
all_data_df <- all_data_df %>%
filter(!(listing_id %in% to_remove_listing))
# Remove digits and punctuation from comments
all_data_df$comments <- gsub('[[:digit:]]+', ' ', all_data_df$comments)
all_data_df$comments <- gsub('[[:punct:]]+', ' ', all_data_df$comments)
# Keep only English-language reviews, detected with cld2
library(cld2)
all_data_df <- all_data_df %>%
  mutate(review_language = cld2::detect_language(comments, plain_text = TRUE)) %>%
  filter(review_language == "en")
# Tokenise the comments in chunks (the loop below processes them sequentially)
# Remove stop words using dictionaries and customised stop words
library(tidytext)
library(lexicon)
data(stop_words)   # tidytext stop-word lexicon, used in anti_join() below
data(sw_fry_1000)  # Fry's 1000 most commonly used English words
split_size <- 10000
tokens_list <- split(all_data_df,
                     rep(1:ceiling(nrow(all_data_df) / split_size),
                         each = split_size,
                         length.out = nrow(all_data_df)))
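# Optional sanity check (illustrative, not part of the original pipeline):
# the chunk row counts should sum to nrow(all_data_df), with every chunk
# holding at most split_size rows
sum(sapply(tokens_list, nrow)) == nrow(all_data_df)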
# Create customised stop words
# Selection of stop words decided by contextual analysis
# Candidate terms are lower-cased; a subset is combined into cus_stopwords
tokens_all_comments <- data.frame()
property_name <- strsplit(unique(tolower(all_data_df$property_type)), " ")
room_name <- as.character(unique(tolower(all_data_df$room_type)))
neighbourhood_name <- as.character(unique(tolower(all_data_df$neighbourhood)))
neighbourhood <- as.character(unique(tolower(all_data_df$neighbourhood_cleansed)))
host_name <- as.character(unique(tolower(all_data_df$host_name)))
# this_city is assumed to be defined earlier (the name of the city being analysed)
cus_stopwords <- c("airbnb", neighbourhood_name, neighbourhood, host_name, this_city)
# Tokenisation and stop-word removal, one chunk at a time
for (i in seq_along(tokens_list)) {
  tokens_h <- tokens_list[[i]] %>%
    unnest_tokens(word, comments)
  tokens_h$word <- tolower(tokens_h$word)
  tokens_h <- tokens_h %>%
    count(word, listing_id) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!(word %in% c(sw_fry_1000, cus_stopwords)))
  tokens_all_comments <- bind_rows(tokens_all_comments, tokens_h)
  print(i)  # progress indicator
}
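# The loop above processes the chunks sequentially. A parallel variant is
# sketched below for reference only; it assumes a Unix-like system where
# parallel::mclapply() can fork, and an mc.cores value suited to the machine.
# tokens_all_comments <- parallel::mclapply(tokens_list, function(chunk) {
#   chunk %>%
#     unnest_tokens(word, comments) %>%
#     mutate(word = tolower(word)) %>%
#     count(word, listing_id) %>%
#     anti_join(stop_words, by = "word") %>%
#     filter(!(word %in% c(sw_fry_1000, cus_stopwords)))
# }, mc.cores = 4) %>%
#   bind_rows()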
library(textstem)
# Lemmatise words (reduce inflected forms to their dictionary lemma)
tokens_all_comments$word <- lemmatize_words(tokens_all_comments$word,
                                            dictionary = lexicon::hash_lemmas)
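# Illustrative example (not part of the pipeline): lemmatize_words() maps
# inflected forms onto dictionary lemmas, e.g.
# lemmatize_words(c("stayed", "apartments", "walking"), dictionary = lexicon::hash_lemmas)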
# Calculate token length and remove tokens that are too short or too long
tokens_all_comments$token_length <- nchar(tokens_all_comments$word)
# Inspect the distribution of token lengths
tokens_all_comments %>%
  group_by(token_length) %>%
  summarise(total = n())
# Keep tokens between 3 and 15 characters long
tokens_all_comments <- tokens_all_comments %>%
  filter(between(token_length, 3, 15))
# Inspect the remaining distribution, longest tokens first
tokens_all_comments %>%
  group_by(token_length) %>%
  summarise(total = n()) %>%
  arrange(desc(token_length))
# Remove customised stop words once more
# (lemmatisation can map tokens back onto stop-word forms)
tokens_all_comments <- tokens_all_comments %>%
  filter(!(word %in% cus_stopwords))
# Calculate TF-IDF, treating each word as a term and each listing as a document
tokens_all_comments_tf_idf <- tokens_all_comments %>%
  bind_tf_idf(word, listing_id, n)
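# Optional check (illustrative): inspect the highest-weighted words for one
# listing to sanity-check the TF-IDF values before trimming.
# tokens_all_comments_tf_idf %>%
#   filter(listing_id == first(listing_id)) %>%
#   arrange(desc(tf_idf)) %>%
#   head(10)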
# Choose TF-IDF cut-offs around the mean value
# Keep words with TF-IDF between mean/8 and 2 * mean (left and right trim)
avg_comments_tf_idf <- mean(tokens_all_comments_tf_idf$tf_idf)
tokens_all_comments_tf_idf <- tokens_all_comments_tf_idf %>%
  filter(between(tf_idf, avg_comments_tf_idf / 8, 2 * avg_comments_tf_idf)) %>%
  arrange(desc(tf_idf))
# Review the spread of the trimmed TF-IDF values
hist(tokens_all_comments_tf_idf$tf_idf, breaks = 800)
# Cast tokens to a DTM (document-term matrix)
library(tm)
reviews_dtm <- tokens_all_comments %>%
  cast_dtm(listing_id, word, n)
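# Optional sanity check (illustrative): the DTM should have one row per
# listing and one column per retained word
dim(reviews_dtm)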
# Find the 10 most common words (ranked by the number of listing-word rows)
tokens_all_comments %>%
  group_by(word) %>%
  summarise(total = n()) %>%
  arrange(desc(total)) %>%
  top_n(10)