Data Analysis Projects with R
For the analysis of water quality, I will be using the Kaggle dataset: https://www.kaggle.com/adityakadiwal/water-potability
library(DataCombine)
library(ggplot2)
library(plyr)
# Load the water-potability measurements and preview the first rows.
df <- read.csv("water_potability.csv")
head(df)

# Drop every row that has a missing value, then confirm none remain
# (the sum below should print 0).
df <- DropNA(df)
sum(is.na(df))
# Tabulate how many samples fall in each Potability class (columns Var1, Freq).
fdf <- data.frame(table(df$Potability))
head(fdf)

# Bar chart of the class counts, with each count printed inside its bar.
ggplot(fdf, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  geom_text(aes(label = Freq), vjust = 2) +
  scale_fill_manual(
    "Potability",
    values = c("0" = "#E74C3C", "1" = "#2ECC71")
  ) +
  labs(
    title = "Distribution of Unsafe and Safe Water",
    x = "Potability",
    y = "Count"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
#' Overlaid histogram of one measurement, split by potability class
#'
#' Draws one semi-transparent histogram per `Potability` value on shared axes.
#' Columns are referenced through the `.data` pronoun rather than `df$...`
#' inside `aes()`, which ggplot2 discourages.
#'
#' @param data Data frame holding `column` and a `Potability` column.
#' @param column Name (string) of the numeric column to plot on the x axis.
#' @param x_label Text for the x-axis label; defaults to `column`.
#' @param title_label Text appended to the plot title; defaults to `x_label`.
#' @param bins Number of histogram bins. 30 is the value `geom_histogram()`
#'   falls back to when none is given; stating it silences the
#'   "Pick better value" message.
#' @return A ggplot object (printed automatically when called at top level).
plot_potability_histogram <- function(data, column, x_label = column,
                                      title_label = x_label, bins = 30) {
  stopifnot(
    is.character(column), length(column) == 1L,
    all(c(column, "Potability") %in% names(data))
  )

  ggplot(
    data,
    aes(x = .data[[column]], fill = factor(.data[["Potability"]]))
  ) +
    geom_histogram(alpha = 0.5, position = "identity", bins = bins) +
    labs(
      title = paste0("Factors Affecting Water Quality: ", title_label),
      x = x_label,
      y = "Count"
    ) +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_fill_manual(
      "Potability",
      values = c("0" = "#C0392B", "1" = "#1D8348")
    )
}

# One histogram per measured feature, in the same order as before.
plot_potability_histogram(df, "ph", x_label = "Ph", title_label = "PH")
plot_potability_histogram(df, "Hardness")
plot_potability_histogram(df, "Solids")
plot_potability_histogram(df, "Chloramines")
plot_potability_histogram(df, "Sulfate")
plot_potability_histogram(df, "Conductivity")
plot_potability_histogram(df, "Organic_carbon", x_label = "Organic Carbon")
plot_potability_histogram(df, "Trihalomethanes")
plot_potability_histogram(df, "Turbidity")
We will need a dataset about Squid Game, which we can find on Twitter in the form of tweets. You can scrape the data according to your needs, but I will be using https://www.kaggle.com/deepcontractor/squid-game-netflix-twitter-data/download
library(tm)
library(tidytext)
library(dplyr)
library(wordcloud2)
library(tidyverse)
library(glue)
library(stringr)
# Load the Squid Game tweets and discard the user_location column.
tweets <- read.csv("squid_game.csv")
tweets <- subset(tweets, select = -user_location)
head(tweets)

# Remove rows containing any missing value, then confirm none remain.
tweets <- DropNA(tweets)
sum(is.na(tweets))
# Normalise the tweet text before tokenising.
# URLs and @mentions go first, while their punctuation is still intact.
tweets$text <- gsub("https\\S*", "", tweets$text)
tweets$text <- gsub("@\\S*", "", tweets$text)
# Strip only the literal HTML entity "&amp;". The previous pattern "amp"
# also deleted those letters inside ordinary words ("example" -> "exle").
# This must run before punctuation removal, which would eat "&" and ";".
tweets$text <- gsub("&amp;", " ", tweets$text, fixed = TRUE)
# Turn line breaks into a space so words on adjacent lines are not fused.
tweets$text <- gsub("[\r\n]+", " ", tweets$text)
tweets$text <- gsub("[[:punct:]]", "", tweets$text)
# Split the tweet text into one word per row.
tweets_words <- unnest_tokens(select(tweets, text), word, text)

# Word frequencies, most common first, as a word/freq data frame.
words <- count(tweets_words, word, sort = TRUE)
df <- with(words, data.frame(word = word, freq = n))
head(df)

# Seed the RNG before drawing the word cloud.
set.seed(1234)
wordcloud2(data = df, size = 1.6, color = "random-dark")
# Tally tweet words by Bing sentiment class into a one-row table with
# `negative`, `positive` and their difference `sentiment`.
setn <- tweets_words %>%
  # Explicit join key: avoids the "Joining with `by = ...`" message and
  # an accidental join on any other shared column.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # Fix both levels up front so a class with no matching words is counted
  # as 0 instead of being absent (which made `positive - negative` fail).
  mutate(sentiment = factor(sentiment, levels = c("negative", "positive"))) %>%
  count(sentiment, .drop = FALSE) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(sentiment = positive - negative)
#' Print the overall sentiment verdict for the Squid Game tweets
#'
#' Prints a header line, then one of three verdict strings chosen by
#' comparing the two counts.
#'
#' @param pos Count of positive words.
#' @param neg Count of negative words.
#' @return The verdict string, invisibly (the value of the final `print()`).
sentiments_scores <- function(pos, neg) {
  print("Most people think about the Squid Game")
  verdict <- if (pos > neg) {
    "Positive π"
  } else if (neg > pos) {
    "Negative π "
  } else {
    "Neutral π "
  }
  print(verdict)
}
# Print the overall verdict for the Squid Game tweets.
sentiments_scores(setn$positive, setn$negative)

# Two-row table of positive and negative word counts for plotting.
dfs <- data.frame(
  nature = c("Positive", "Negative"),
  scores = c(setn$positive, setn$negative)
)

# Bar chart of the two counts, each count printed inside its bar.
ggplot(dfs, aes(x = nature, y = scores, fill = nature)) +
  geom_col() +
  labs(title = "People thinking Nature about the Squid Game") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Nature") +
  # Axis label spelling corrected (was "Frequancy").
  ylab("Frequency") +
  geom_text(aes(label = scores), vjust = 2) +
  scale_fill_manual(
    "Nature",
    values = c("Negative" = "#E74C3C", "Positive" = "#2ECC71")
  )
library(tm)
library(tidytext)
library(dplyr)
library(wordcloud2)
library(tidyverse)
library(glue)
library(stringr)
# Load the Twitter data set and preview the first rows.
tweets <- read.csv("twitter.csv")
head(tweets)

# Remove rows containing any missing value, then confirm none remain.
tweets <- DropNA(tweets)
sum(is.na(tweets))
#' Strip URLs, mentions, HTML ampersands, line breaks and punctuation
#'
#' Vectorised over `uncleaned_text`; `NA` elements stay `NA`.
#'
#' @param uncleaned_text Character vector of raw tweet text.
#' @return Character vector of the same length with the noise removed.
clean_text <- function(uncleaned_text) {
  # URLs and @mentions go first, while their punctuation is still intact.
  cleaned <- gsub("https\\S*", "", uncleaned_text)
  cleaned <- gsub("@\\S*", "", cleaned)
  # Strip only the literal HTML entity "&amp;". The previous pattern "amp"
  # also deleted those letters inside ordinary words ("example" -> "exle").
  # This must run before punctuation removal, which would eat "&" and ";".
  cleaned <- gsub("&amp;", " ", cleaned, fixed = TRUE)
  # Turn line breaks into a space so words on adjacent lines are not fused.
  cleaned <- gsub("[\r\n]+", " ", cleaned)
  gsub("[[:punct:]]", "", cleaned)
}
# Clean the raw tweet text in place.
tweets$tweet <- clean_text(tweets$tweet)

# Split the cleaned tweets into one word per row.
tweets_words <- tweets %>%
  select(tweet) %>%
  unnest_tokens(word, tweet)

# Word frequencies, most common first, as a word/freq data frame.
words <- count(tweets_words, word, sort = TRUE)
df <- with(words, data.frame(word = word, freq = n))
head(df)

# Seed the RNG before drawing the word cloud.
set.seed(1234)
wordcloud2(data = df, size = 1.6, color = "random-dark")
# Tally tweet words by Bing sentiment class into a one-row table with
# `negative`, `positive` and their difference `sentiment`.
setn <- tweets_words %>%
  # Explicit join key: avoids the "Joining with `by = ...`" message and
  # an accidental join on any other shared column.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # Fix both levels up front so a class with no matching words is counted
  # as 0 instead of being absent (which made `positive - negative` fail).
  mutate(sentiment = factor(sentiment, levels = c("negative", "positive"))) %>%
  count(sentiment, .drop = FALSE) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(sentiment = positive - negative)
#' Print the overall sentiment verdict for the Twitter data set
#'
#' Prints a header line, then one of three verdict strings chosen by
#' comparing the two counts.
#'
#' @param pos Count of positive words.
#' @param neg Count of negative words.
#' @return The verdict string, invisibly (the value of the final `print()`).
sentiments_scores <- function(pos, neg) {
  print("Most people tweeted on twitter")
  if (pos > neg) {
    return(print("Positive π"))
  }
  if (neg > pos) {
    return(print("Negative π "))
  }
  print("Neutral π ")
}
# Print the overall verdict for the Twitter data set.
sentiments_scores(setn$positive, setn$negative)

# Two-row table of positive and negative word counts for plotting.
dfs <- data.frame(
  nature = c("Positive", "Negative"),
  scores = c(setn$positive, setn$negative)
)

# Bar chart of the two counts, each count printed inside its bar.
ggplot(dfs, aes(x = nature, y = scores, fill = nature)) +
  geom_col() +
  labs(title = "Tweets Nature on Twitter") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Nature") +
  # Axis label spelling corrected (was "Frequancy").
  ylab("Frequency") +
  geom_text(aes(label = scores), vjust = 2) +
  scale_fill_manual(
    "Nature",
    values = c("Negative" = "#E74C3C", "Positive" = "#2ECC71")
  )
Download the dataset from here: https://www.kaggle.com/tunguz/movietweetings/download
library(tidyr)
library(tidyverse)
library(dplyr)
library(ggplot2)
#' Read a "::"-delimited text file into a data frame
#'
#' The previous `read.table(sep = ":")` call split on every single colon and
#' used the default quote/comment characters, so a field containing `:`, `'`,
#' `"` or `#` (all plausible in movie titles) shifted or swallowed columns.
#' Splitting each line on the literal two-character separator avoids that.
#' NOTE(review): the "::" delimiter is inferred from the old call dropping
#' every other (empty) column -- confirm against the data files.
#'
#' @param path Path to the file.
#' @param col_names Column names, one per expected field. Lines with fewer
#'   fields are padded with `NA`.
#' @return A data frame with one row per non-empty line. Column types are
#'   inferred with `type.convert()`, as `read.table()` would.
read_double_colon_file <- function(path, col_names) {
  lines <- readLines(path, warn = FALSE)
  lines <- lines[nzchar(lines)]
  fields <- strsplit(lines, "::", fixed = TRUE)

  columns <- lapply(seq_along(col_names), function(i) {
    # `parts[i]` is NA when a line has fewer than i fields.
    values <- vapply(fields, function(parts) parts[i], character(1))
    type.convert(values, as.is = TRUE)
  })
  names(columns) <- col_names
  as.data.frame(columns, stringsAsFactors = FALSE)
}

# Movie catalogue: one row per movie.
movies_df <- read_double_colon_file("movies.dat", c("ID", "Title", "Genre"))
head(movies_df)

# User ratings: one row per (user, movie) rating.
rating_df <- read_double_colon_file(
  "ratings.dat",
  c("User", "ID", "Ratings", "Timestamp")
)
head(rating_df)

# Attach movie details to each rating via the shared movie ID.
merged_df <- merge(movies_df, rating_df, by = "ID")
head(merged_df)
# Share of each rating value, as a percentage rounded to two decimals.
fdf <- table(merged_df$Ratings)
df_prob <- prop.table(fdf)
perc_df <- df_prob * 100
# round() keeps the table's names, so no separate names<- step is needed.
perc_df2 <- round(perc_df, 2)

# data.frame(perc = <table>) yields the columns perc.Var1 (rating value)
# and perc.Freq (percentage).
final_df <- data.frame(perc = perc_df2)
Rating <- final_df$perc.Var1

# Pie chart: a single stacked bar wrapped into polar coordinates.
# Columns are referenced by bare name; `final_df$...` inside aes() is
# discouraged by ggplot2.
ggplot(final_df, aes(x = 1, y = perc.Freq, fill = perc.Var1)) +
  labs(title = "Most rated movies pie chart", fill = "Rating") +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(
    aes(label = paste0(perc.Freq, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_void()
# Ratings equal to 10; which() drops NA comparisons so a missing rating
# cannot produce an all-NA row.
df10 <- merged_df[which(merged_df$Ratings == 10), ]

# Number of 10-ratings per title, keeping the ten most frequent.
# head() returns fewer rows when fewer than ten titles exist, whereas
# [1:10, ] padded the result with NA rows.
title_df <- data.frame(table(df10$Title))
title_df <- head(title_df[order(title_df$Freq, decreasing = TRUE), ], 10)
head(title_df)

# Bars ordered from most to least frequent. Columns are referenced by bare
# name; `title_df$...` inside aes() is discouraged by ggplot2.
ggplot(title_df, aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_col(width = 0.5, fill = "tomato3") +
  labs(title = "Top 10 Movies having rating 10", x = "Title", y = "Count") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))