Adjusted functions files; fixed errors
Arf9999 committed Dec 7, 2021
1 parent 4258d35 commit 03f38e8
Showing 46 changed files with 1,876 additions and 1,905 deletions.
174 changes: 174 additions & 0 deletions R/GOT3_functions.R
@@ -0,0 +1,174 @@
#############################################################################
#'@title Function to rehydrate tweets into rtweet tibbles from GOT3 data format
#'@param got3_df A dataframe as returned by GetOldTweets3
#'@param token An OAuth token loaded in the environment for twitter's Standard API (REQUIRED if a list of tokens is not supplied)
#'@param token_list A list of OAuth tokens loaded in the environment (REQUIRED if token is not specified)
#'@description Uses status_id to request full {rtweet} tibble of tweet data
#'@keywords twitter, getoldtweets3, rtweet
#'@export
###############################################################################
rehydrate_got3_statuses <-
function(got3_df,
token = NULL,
token_list = NULL) {
require(rtweet, quietly = TRUE)
require(dplyr, quietly = TRUE)
require(purrr, quietly = TRUE)
require(readr, quietly = TRUE)

### Check tokens
if (is.null(token) & is.null(token_list)) {
stop(
"Please designate either a token or a list of tokens that are loaded into the environment"
)

}
if (is.null(token_list)) {
token_list <- list(token)
}
###

df_length <- nrow(got3_df) ##check length of GOT list of statuses
last_capture <- 0L ##initial setting for statuses captured
#check ratelimits for all tokens
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
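## Each statuses/lookup request accepts up to 100 status_ids, so a token with
## `remaining` requests left can rehydrate roughly remaining * 100 statuses
## before it has to wait for its rate-limit window to reset.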


if (as.numeric(max(ratelimits$remaining)) > df_length / 100) {
##if no ratelimit reset or token rotation required.
message(paste0("lookup of ", df_length, " statuses"))

row_of_max_rl <- which.max(ratelimits$remaining)
rehydration <- rtweet::lookup_statuses(unlist(got3_df[, "id"]),
token = token_list[[row_of_max_rl]])


} else{
while (last_capture < nrow(got3_df)) {
# iterate through the rows using token rotation & rate reset pausing
row_of_max_rl <- which.max(ratelimits$remaining)

##first iteration
if (last_capture == 0L) {
last_capture <-
as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100)
message(paste0("rl requests remain : ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
message(paste0("base: ", 1L, " upper: ", last_capture))

rehydration <-
rtweet::lookup_statuses(unlist(got3_df[c(1:last_capture), "id"]),
token = token_list[[row_of_max_rl]])
} else{
#iterate through remainder of status_ids
base_capture <- last_capture + 1
last_capture <-
min(c(
base_capture + as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100),
nrow(got3_df)
))
message(paste0("rl requests remain : ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
message(paste0("base: ", base_capture, " upper: ", last_capture))

rehydration <- dplyr::bind_rows(rehydration,
rtweet::lookup_statuses(unlist(got3_df[c(base_capture:last_capture), "id"]),
token = token_list[[row_of_max_rl]]))

}
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")#check ratelimits



###manage rate-limit resets as gracefully as possible - conservatively pause once the best token has fewer than 100 requests remaining
if (max(ratelimits$remaining) < 100 &
last_capture < nrow(got3_df)) {
message(paste0(
"Pausing for ratelimit reset: ",
min(ratelimits$reset),
" minutes"
))
Sys.sleep(as.numeric(min(ratelimits$reset) * 60))
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
}

}
}

##check for missing tweets and re-lookup
orig <- dplyr::as_tibble(got3_df[, "id"]) %>%
rename(status_id = id)
message(paste0("original: ", nrow(orig)))

missing <-
anti_join(orig, as_tibble(rehydration[, "status_id"]), by = "status_id")

message(paste0("missing: ", nrow(missing)))

if (nrow(missing) > 0) {
##try again to look up missing statuses
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
row_of_max_rl <- which.max(ratelimits$remaining)
df_length <- nrow(missing)
message(paste0("Attempting to populate missing tweets: ", df_length))
rehydration <- bind_rows(rehydration,
rtweet::lookup_statuses(unlist(missing[c(1:df_length), "status_id"]),
token = token_list[[row_of_max_rl]]))

##write log file of missing tweets.
missing <-
anti_join(orig, as_tibble(rehydration[, "status_id"]), by = "status_id") %>%
rename(id = status_id) %>%
left_join(got3_df, by = "id") %>%
mutate(error = "Status not downloaded")

message(
paste0(
nrow(missing),
" tweets not downloaded, see log file for details: ",
"got3_rehydration_missing_log_",
as.numeric(Sys.time()),
".csv"
)
)

readr::write_csv(missing,
paste0(
"got3_rehydration_missing_log_",
as.numeric(Sys.time()),
".csv"
))


}
##Fetch retweets for statuses that have them (the Standard API returns at most 100 retweets per original tweet)
retweets <- filter(rehydration, retweet_count > 0) %>%
select(status_id)

message(paste ("tweets with retweets:", nrow(retweets)))

ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "get_retweets")
row_of_max_rl <- which.max(ratelimits$remaining)

retweet_temp <-
purrr::map_df(retweets$status_id,
rtweet::get_retweets,
n = 100,
token = token_list[[row_of_max_rl]])
message(paste0("temp rows:", nrow(retweet_temp)))

message("Captured ",
nrow(retweet_temp),
" retweets (a maximum of 100 per original tweet)")



rehydration <- bind_rows(rehydration, retweet_temp)

return(rehydration)

}
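
## Usage sketch (illustrative only): `got3_data` is assumed to be a GetOldTweets3
## data frame with an "id" column, and `token_a` / `token_b` are assumed to be
## rtweet OAuth tokens already loaded in the environment (e.g. via rtweet::create_token()).
# rehydrated <- rehydrate_got3_statuses(got3_data,
#                                       token_list = list(token_a, token_b))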

114 changes: 114 additions & 0 deletions R/SNscrape_functions.R
@@ -0,0 +1,114 @@
#############################################################################
#'@title Wrapper to undertake historical searches calling Python snscrape library.
#'@param search_string A search string (in quotes)
#'@param since_date Start date for the search (ISO-format date in quotes)
#'@param until_date Latest date for the search (ISO-format date in quotes; NB: the search works backward from this date)
#'@param n Maximum number of results - the twitter Standard API allows lookup of up to 90k statuses per 15 minutes
#'@param file temporary file name for snscrape URL list, timestamp will be appended.
#'@param token An OAuth token loaded in the environment for twitter's Standard API (if not specified, default token will be used)
#'@param delete_tempfile Clear temp file of statuses default = TRUE
#'@description Calls the Python snscrape script to pull the status URLs returned by a search, then uses rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_search("Trump", since_date = "2016-09-06", until_date = "2016-11-06", n = 1000, file = "test_", token = NULL)
###############################################################################

snscrape_search <- function(search_string, #search terms in quotes
since_date = NULL, #optional iso date in quotes
until_date = NULL, #optional iso date in quotes
n = 100, #max number of statuses to retrieve. Fewer than 90K recommended due to rate-limiting
file = "temp", #temporary name for snscrape text file
token = NULL, #specify token if required
delete_tempfile = TRUE) #delete text file of statuses
{
require(rtweet, quietly = TRUE)
require(readr, quietly = TRUE)
require(dplyr, quietly = TRUE)

if(!is.null(since_date)) {
search_string <- paste0(search_string," since:",since_date)
}

if(!is.null(until_date)){
search_string <-paste0(search_string, " until:", until_date)
}

output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

##call Python scraper
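## With the values from @examples (n = 1000, file = "test_"), the resulting shell
## command looks roughly like:
## snscrape -n 1000 twitter-search "Trump since:2016-09-06 until:2016-11-06" > test_<timestamp>.txt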
system(paste0("snscrape -n ", n," twitter-search ","\"", search_string,"\"",
" > ", output_path))

##import status_ids from text file
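## snscrape emits one tweet URL per line, i.e. https://twitter.com/<screen_name>/status/<status_id>,
## so splitting on "/" puts the screen_name in field 4 and the status_id in field 6.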
scrape_statuses <- read_delim(output_path, "/", escape_double = FALSE,
col_names = FALSE,
col_types = cols(X2 = col_skip(),
X1 = col_skip(), X3 = col_skip(),
X5 = col_skip(), X6 = col_character()),
trim_ws = TRUE) %>%
dplyr::rename(screen_name = X4,
status_id = X6) %>%
dplyr::distinct(status_id, .keep_all = TRUE)

message(paste(nrow(scrape_statuses), "status URLs captured, rehydrating..."))
temp_rehydration <- rtweet::lookup_statuses(scrape_statuses$status_id, token = token)

##cleanup temp files
if (delete_tempfile == TRUE & file.exists(output_path)){
file.remove(output_path)
}

return(temp_rehydration)

}

#############################################################################
#'@title Wrapper to pull the timeline of a user
#'@param screen_name A twitter username/handle (no "@" required) or user_id (in quotes)
#'@param n Maximum number of results - the twitter Standard API allows lookup of up to 90k statuses per 15 minutes
#'@param file temporary file name for snscrape URL list, timestamp will be appended.
#'@param token An OAuth token loaded in the environment for twitter's Standard API (if not specified, default token will be used)
#'@param delete_tempfile Clear temp file of statuses default = TRUE
#'@description Calls the Python snscrape script to pull the status URLs of a user's timeline, then uses rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_get_timeline("Jack", n = 1000, file = "test_")
###############################################################################

snscrape_get_timeline <- function (screen_name,
n = 100,
file = "temp",
token = NULL,
delete_tempfile = TRUE)
{
require(rtweet, quietly = TRUE)
require(readr, quietly = TRUE)
require(dplyr, quietly = TRUE)


output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

##call Python scraper
system(paste0("snscrape -n ", n," twitter-user ", screen_name,
" > ", output_path))

##import status_ids from text file
scrape_timeline <- read_delim(output_path, "/", escape_double = FALSE,
col_names = FALSE,
col_types = cols(X2 = col_skip(),
X1 = col_skip(), X3 = col_skip(),
X5 = col_skip(), X6 = col_character()),
trim_ws = TRUE) %>%
dplyr::rename(screen_name = X4,
status_id = X6) %>%
dplyr::distinct(status_id, .keep_all = TRUE)

message(paste(nrow(scrape_timeline), "status URLs captured, rehydrating timeline..."))
timeline_rehydration <- rtweet::lookup_statuses(scrape_timeline$status_id, token = token)

##cleanup temp files
if (delete_tempfile == TRUE & file.exists(output_path)){
file.remove(output_path)
}

return(timeline_rehydration)
}
86 changes: 86 additions & 0 deletions R/accountactivity.R
@@ -0,0 +1,86 @@
#'@title Account Activity Plot for a Twitter Account
#'@description This function creates a bubble plot of account activity by hour of a single twitter screen_name
#' (inspired by a Python script by twitter user "@Conspirator0")
#'@param account_name A twitter screen_name, in quotes.
#'@param depth The maximum depth of tweets to be visualised. Starts from most recent tweet.
#'Twitter API maximum and default is 3200. Only tweets occurring within the no_of_weeks window will be shown.
#'@param time_zone The timezone of the account.
#' Requires timezone in format of TZ database (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) in quotes.
#' Default is "Africa/Johannesburg"
#'@param no_of_weeks The number of weeks to display. Default is 4. Plot will automatically scale to exclude any period without activity.
#'@param token A twitter OAuth token. Default is NULL,
#' which uses an existing token loaded into the environment, but can be overridden to use a particular token.
#'@keywords twitter, rtweet, visualization, activity, bubble plot.
#'@export
#'@examples account_activity("jack",
#' depth = 3200,
#' time_zone = "America/Los_Angeles",
#' no_of_weeks = 4,token = NULL)


account_activity <- function(account_name,
depth = 3200,
time_zone = "Africa/Johannesburg",
no_of_weeks = 4,
token = NULL) {
require(rtweet, quietly = TRUE)
require(tidyverse, quietly = TRUE)
require(lubridate, quietly = TRUE)
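## The pipeline below pulls up to `depth` tweets, converts created_at to the target
## time zone, counts tweets per source per hour and day, keeps only the last
## `no_of_weeks` of activity, and plots hour-of-day against date with bubble size
## mapped to the hourly tweet count and colour mapped to the tweet source.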

rtweet::get_timeline(
account_name,
n = depth,
retryonratelimit = TRUE,
token = token
)[, 1:6] %>%
mutate(created_at2 = with_tz(created_at, tzone = time_zone)) %>%
mutate(tweet_date = lubridate::date(created_at2)) %>%
mutate(tweet_hour = lubridate::hour(created_at2)) %>%
group_by(screen_name, source, tweet_date, tweet_hour) %>%
mutate(hourly_tweet_count = n()) %>%
ungroup %>%
mutate(tweet_period = (as.duration(interval(
max(created_at2), created_at2
)))) %>%
filter(tweet_period > as.duration(-604800 * no_of_weeks)) %>%
group_by(source, tweet_date, tweet_hour) %>%
slice(1) %>%
ungroup() %>%
mutate(bubble_scale = max(hourly_tweet_count) / 5) %>%
ggplot(
aes(
x = tweet_hour,
y = tweet_date,
size = hourly_tweet_count,
colour = source,
alpha = 0.3
)
) +
geom_point() +
scale_size_continuous(name = "Hourly tweet volume") +
scale_x_discrete(
limits = factor(c(0:23)),
breaks = c(0:23),
labels = c(0:23)
) +
expand_limits(x = c(0:23)) +
theme_minimal() +
labs(
title = paste(
"Account activity: ",
account_name,
" (as at ",
as_datetime(Sys.Date()),
")\n",
" Time zone of tweets: ",
time_zone,
sep = ""
),
x = "Hour of day",
y = "Date"
) +
guides(alpha = "none")
}