-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFinal Project.r
124 lines (91 loc) · 3.91 KB
/
Final Project.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
install.packages("curl")
library("curl")
install.packages("httr")
library("httr")
install.packages("rvest")
library("rvest")
# Request the Wikipedia "COVID-19 testing by country" template page with an
# HTTP GET and print the response for review
covid19_url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"
response <- GET(covid19_url)
response
# Stop with an informative error on a 4xx/5xx reply instead of going on to
# parse an error page
stop_for_status(response)
# Get the root html node from the http response fetched above. Parsing the
# response object avoids downloading the same page a second time and keeps the
# URL defined in a single place.
covid19_root_node <- read_html(response)
covid19_root_node
# Select a node from the parsed page using the CSS selector "table"
covid19_table_node <- covid19_root_node %>% html_node("table")
print(covid19_table_node)
# Turn the table node into a data frame and show its first rows for review
covid19_data_frame <- covid19_table_node %>% html_table()
print(head(covid19_data_frame))
# Show per-column summary statistics of the raw data frame
print(summary(covid19_data_frame))
# Run the scraped table through `preprocess_covid_data_frame` and keep the
# result as a new data frame.
# NOTE(review): `preprocess_covid_data_frame` is not defined in the visible
# part of this script; confirm it is defined or sourced before this line runs.
wiki_covid19_data_frame <- preprocess_covid_data_frame(covid19_data_frame)
print(wiki_covid19_data_frame)
# Show the summary again, this time for the processed data frame
print(summary(wiki_covid19_data_frame))
# Build the export path once, so the file that is written and the file that is
# checked afterwards are guaranteed to be the same one
wd <- getwd()
file_path <- file.path(wd, "covid.csv")
# Export the data frame to a csv file (without the row-name column)
write.csv(wiki_covid19_data_frame, file = file_path, row.names = FALSE)
# Print the file path and confirm that the exported file exists
print(file_path)
file.exists(file_path)
## Alternative input (disabled): download a sample csv file
#covid_csv_file <- download.file("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-RP0101EN-Coursera/v2/dataset/covid.csv", destfile="covid.csv")
#covid_data_frame_csv <- read.csv("covid.csv", header=TRUE, sep=",")
# Load "covid.csv" from the working directory into a data frame
covid_data_frame_csv <- read.csv(file = "covid.csv", header = TRUE, sep = ",")
# Show rows 5 through 10 of the "country" and "confirmed" columns
print(covid_data_frame_csv[5:10, c("country", "confirmed")])
# Worldwide confirmed cases: take the 4th column and add it up
confirmed_cases <- covid_data_frame_csv[[4]]
print(confirmed_cases)
total_confirmed_cases <- sum(confirmed_cases)
print(total_confirmed_cases)
# Worldwide tested cases: take the 3rd column and add it up
tested_cases <- covid_data_frame_csv[[3]]
print(tested_cases)
total_tested_cases <- sum(tested_cases)
print(total_tested_cases)
# Positive ratio: total confirmed divided by total tested
positive_ratio <- total_confirmed_cases / total_tested_cases
print(positive_ratio)
# Get the `country` column by name
country_column <- covid_data_frame_csv[["country"]]
country_column
# Check its class: "factor" when read.csv() converts strings to factors
# (the default before R 4.0), "character" otherwise
class(country_column)
# Convert the country column into character and keep the result.
# as.character() returns a new vector and leaves its argument unchanged, so
# the value has to be assigned back for the sorts below to use it.
country_column <- as.character(country_column)
# Sort the countries AtoZ
sort(country_column)
# Sort the countries ZtoA
Country_ZtoA <- sort(country_column, decreasing = TRUE)
# Print the sorted ZtoA list
print(Country_ZtoA)
# Locate the text matching the regular expression `United.+` in each country
# name, then pull out the matched text
matches <- regexpr(
  pattern = "United.+",
  text = covid_data_frame_csv[["country"]]
)
countires_start_with_United <- regmatches(
  x = covid_data_frame_csv[["country"]],
  m = matches
)
print(countires_start_with_United)
# Print the matched country names explicitly as well
print(countires_start_with_United)
# Compare the ratio of confirmed cases to population between Afghanistan and
# Bhutan. Each row is looked up by country name and the ratio is read from the
# data frame, so the printed verdict follows the data rather than fixed row
# positions and literal numbers.
ratio_columns <- c("country", "confirmed", "confirmed.population.ratio")

# Return the single row of `data` whose `country` equals `country_name`,
# limited to `ratio_columns`; stops if there is not exactly one such row.
select_country_row <- function(data, country_name) {
  # which() ignores NA comparisons, so rows with a missing country are skipped
  country_row <- data[which(data[["country"]] == country_name), ratio_columns]
  if (nrow(country_row) != 1L) {
    stop("Expected exactly one row for country: ", country_name, call. = FALSE)
  }
  country_row
}

# Select a subset (exactly one row) of the data frame for each country
afghanistan_row <- select_country_row(wiki_covid19_data_frame, "Afghanistan")
afghanistan_row
bhutan_row <- select_country_row(wiki_covid19_data_frame, "Bhutan")
bhutan_row

afghanistan_ratio <- afghanistan_row[["confirmed.population.ratio"]]
bhutan_ratio <- bhutan_row[["confirmed.population.ratio"]]
# `if` needs a non-missing scalar condition, so reject NA ratios up front
stopifnot(!is.na(afghanistan_ratio), !is.na(bhutan_ratio))

if (afghanistan_ratio > bhutan_ratio) {
  print( "Afghanistan has larger ratio of confirmed cases to population")
} else {
  print( "Bhutan has larger ratio of confirmed cases to population")
}
# Get a subset of countries by comparing `confirmed.population.ratio` with a
# cut-off value: "lessRisk" keeps the rows below the cut-off, any other value
# of `threshold` keeps the rows above it
threshold <- "lessRisk"
# Single definition of the cut-off shared by both branches
risk_cutoff <- .01
if (threshold == "lessRisk") {
  subset(wiki_covid19_data_frame, confirmed.population.ratio < risk_cutoff)
} else {
  subset(wiki_covid19_data_frame, confirmed.population.ratio > risk_cutoff)
}