Commit: Adjusted functions files; fixed errors.
Showing 46 changed files with 1,876 additions and 1,905 deletions.
@@ -0,0 +1,174 @@
#############################################################################
#'@title Function to rehydrate tweets into rtweet tibbles from the GOT3 data format
#'@param got3_df A dataframe as returned by GetOldTweets3
#'@param token An OAuth token loaded in the environment for Twitter's Standard API (required if a list of tokens is not supplied)
#'@param token_list A list of OAuth tokens loaded in the environment (required if token is not specified)
#'@description Uses status_id to request the full {rtweet} tibble of tweet data
#'@keywords twitter, getoldtweets3, rtweet
#'@export
###############################################################################
rehydrate_got3_statuses <-
  function(got3_df,
           token = NULL,
           token_list = NULL) {
    require(rtweet, quietly = TRUE)
    require(dplyr, quietly = TRUE)
    require(purrr, quietly = TRUE)
    require(readr, quietly = TRUE)

    ### Check tokens
    if (is.null(token) && is.null(token_list)) {
      stop(
        "Please designate either a token or a list of tokens that are loaded into the environment"
      )
    }
    if (is.null(token_list)) {
      token_list <- list(token)
    }
    ###

    df_length <- nrow(got3_df)   ## number of GOT3 statuses to rehydrate
    last_capture <- 0L           ## initial setting for statuses captured

    ## check rate limits for all tokens
    ratelimits <-
      purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")

    if (as.numeric(max(ratelimits$remaining)) > df_length / 100) {
      ## no rate-limit reset or token rotation required
      message(paste0("lookup of ", df_length, " statuses"))

      row_of_max_rl <- which.max(ratelimits$remaining)
      rehydration <- rtweet::lookup_statuses(unlist(got3_df[, "id"]),
                                             token = token_list[[row_of_max_rl]])

    } else {
      while (last_capture < nrow(got3_df)) {
        ## iterate through the rows using token rotation and rate-reset pausing
        row_of_max_rl <- which.max(ratelimits$remaining)

        if (last_capture == 0L) {
          ## first iteration
          last_capture <-
            as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100)
          message(paste0("rl requests remain: ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
          message(paste0("base: ", 1L, " upper: ", last_capture))

          rehydration <-
            rtweet::lookup_statuses(unlist(got3_df[c(1:last_capture), "id"]),
                                    token = token_list[[row_of_max_rl]])
        } else {
          ## iterate through the remainder of the status_ids
          base_capture <- last_capture + 1
          last_capture <-
            min(c(
              base_capture + as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100),
              nrow(got3_df)
            ))
          message(paste0("rl requests remain: ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
          message(paste0("base: ", base_capture, " upper: ", last_capture))

          rehydration <- dplyr::bind_rows(rehydration,
                                          rtweet::lookup_statuses(unlist(got3_df[c(base_capture:last_capture), "id"]),
                                                                  token = token_list[[row_of_max_rl]]))
        }

        ## recheck rate limits
        ratelimits <-
          purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")

        ## manage rate-limit resets as gracefully as possible - conservatively set to 100 queries as a test
        if (max(ratelimits$remaining) < 100 &&
            last_capture < nrow(got3_df)) {
          message(paste0(
            "Pausing for ratelimit reset: ",
            min(ratelimits$reset),
            " minutes"
          ))
          Sys.sleep(as.numeric(min(ratelimits$reset) * 60))
          ratelimits <-
            purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
        }
      }
    }

    ## check for missing tweets and re-lookup
    orig <- dplyr::as_tibble(got3_df[, "id"]) %>%
      dplyr::rename(status_id = id)
    message(paste0("original: ", nrow(orig)))

    missing <-
      dplyr::anti_join(orig, dplyr::as_tibble(rehydration[, "status_id"]), by = "status_id")

    message(paste0("missing: ", nrow(missing)))

    if (nrow(missing) > 0) {
      ## try again to look up missing statuses
      ratelimits <-
        purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
      row_of_max_rl <- which.max(ratelimits$remaining)
      df_length <- nrow(missing)
      message(paste0("Attempting to populate missing tweets: ", df_length))
      rehydration <- dplyr::bind_rows(rehydration,
                                      rtweet::lookup_statuses(unlist(missing[c(1:df_length), "status_id"]),
                                                              token = token_list[[row_of_max_rl]]))

      ## write a log file of tweets that could not be downloaded
      missing <-
        dplyr::anti_join(orig, dplyr::as_tibble(rehydration[, "status_id"]), by = "status_id") %>%
        dplyr::rename(id = status_id) %>%
        dplyr::left_join(got3_df, by = "id") %>%
        dplyr::mutate(error = "Status not downloaded")

      log_path <- paste0("got3_rehydration_missing_log_",
                         as.numeric(Sys.time()),
                         ".csv")

      message(
        paste0(
          nrow(missing),
          " tweets not downloaded, see log file for details: ",
          log_path
        )
      )

      readr::write_csv(missing, log_path)
    }

    ## fetch retweets for statuses that have been retweeted
    retweets <- dplyr::filter(rehydration, retweet_count > 1) %>%
      dplyr::select(status_id)

    message(paste("tweets with retweets:", nrow(retweets)))

    ratelimits <-
      purrr::map_df(token_list, rtweet::rate_limit, query = "get_retweets")
    row_of_max_rl <- which.max(ratelimits$remaining)

    retweet_temp <-
      purrr::map_df(retweets$status_id,
                    rtweet::get_retweets,
                    n = 100,
                    token = token_list[[row_of_max_rl]])
    message(paste0("temp rows: ", nrow(retweet_temp)))

    message("Captured ",
            nrow(retweet_temp),
            " retweets (a maximum of 100 per original tweet)")

    rehydration <- dplyr::bind_rows(rehydration, retweet_temp)

    return(rehydration)
  }
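A minimal usage sketch for the function above, assuming a GetOldTweets3 CSV export with an id column and two rtweet tokens already created with rtweet::create_token(); the file name and token objects are placeholders, not part of the package:

## usage sketch: placeholder file name and pre-existing tokens
library(readr)

got3_df <- readr::read_csv("got3_export.csv",
                           col_types = readr::cols(id = readr::col_character()))

## token rotation kicks in automatically when the request volume exceeds
## what a single token's rate limit allows
tokens <- list(token_a, token_b)

tweets <- rehydrate_got3_statuses(got3_df, token_list = tokens)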
@@ -0,0 +1,114 @@
#############################################################################
#'@title Wrapper to undertake historical searches by calling the Python snscrape library
#'@param search_string A search string (in quotes)
#'@param since_date Start date for the search (ISO-format date in quotes)
#'@param until_date Latest date for the search (ISO-format date in quotes; NB: the search works backward)
#'@param n Maximum number of results - the Twitter API has a 90k rate limit per 15 minutes
#'@param file Temporary file name for the snscrape URL list; a timestamp will be appended
#'@param token An OAuth token loaded in the environment for Twitter's Standard API (if not specified, the default token will be used)
#'@param delete_tempfile Clear the temp file of statuses, default = TRUE
#'@description Calls a Python script to pull the status URLs of a search, then rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_search("Trump", since_date = "2016-09-06", until_date = "2016-11-06", n = 1000, file = "test_", token = NULL)
###############################################################################

snscrape_search <- function(search_string,          # search terms in quotes
                            since_date = NULL,      # optional ISO date in quotes
                            until_date = NULL,      # optional ISO date in quotes
                            n = 100,                # max statuses to retrieve; fewer than 90k recommended due to rate limiting
                            file = "temp",          # temporary name for the snscrape text file
                            token = NULL,           # specify a token if required
                            delete_tempfile = TRUE) # delete the text file of statuses
{
  require(rtweet, quietly = TRUE)
  require(readr, quietly = TRUE)
  require(dplyr, quietly = TRUE)

  if (!is.null(since_date)) {
    search_string <- paste0(search_string, " since:", since_date)
  }

  if (!is.null(until_date)) {
    search_string <- paste0(search_string, " until:", until_date)
  }

  output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

  ## call the Python scraper
  system(paste0("snscrape -n ", n, " twitter-search ", "\"", search_string, "\"",
                " > ", output_path))

  ## import status_ids from the text file
  scrape_statuses <- read_delim(output_path, "/", escape_double = FALSE,
                                col_names = FALSE,
                                col_types = cols(X2 = col_skip(),
                                                 X1 = col_skip(), X3 = col_skip(),
                                                 X5 = col_skip(), X6 = col_character()),
                                trim_ws = TRUE) %>%
    dplyr::rename(screen_name = X4,
                  status_id = X6) %>%
    dplyr::distinct(status_id, .keep_all = TRUE)

  message(paste(nrow(scrape_statuses), "status URLs captured, rehydrating..."))
  temp_rehydration <- rtweet::lookup_statuses(scrape_statuses$status_id, token = token)

  ## clean up temp files
  if (delete_tempfile == TRUE && file.exists(output_path)) {
    file.remove(output_path)
  }

  return(temp_rehydration)
}
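For context on the column selection in the read_delim() call above: snscrape's twitter-search mode emits one status URL per line in the form https://twitter.com/<screen_name>/status/<status_id>, so splitting on "/" leaves the screen name in field 4 and the status ID in field 6. A quick illustration (the URL is a sample):

url <- "https://twitter.com/jack/status/20"
strsplit(url, "/")[[1]]
#> [1] "https:"      ""            "twitter.com" "jack"        "status"      "20"
## field 4 = screen_name, field 6 = status_id, matching the rename() above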
#############################################################################
#'@title Wrapper to pull the timeline of a user
#'@param screen_name A Twitter username/handle (no "@" required) or user_id (in quotes)
#'@param n Maximum number of results - the Twitter API has a 90k rate limit per 15 minutes
#'@param file Temporary file name for the snscrape URL list; a timestamp will be appended
#'@param token An OAuth token loaded in the environment for Twitter's Standard API (if not specified, the default token will be used)
#'@param delete_tempfile Clear the temp file of statuses, default = TRUE
#'@description Calls a Python script to pull the status URLs of a user search, then rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_get_timeline("Jack", n = 1000, file = "test_")
###############################################################################

snscrape_get_timeline <- function(screen_name,
                                  n = 100,
                                  file = "temp",
                                  token = NULL,
                                  delete_tempfile = TRUE)
{
  require(rtweet, quietly = TRUE)
  require(readr, quietly = TRUE)
  require(dplyr, quietly = TRUE)

  output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

  ## call the Python scraper
  system(paste0("snscrape -n ", n, " twitter-user ", screen_name,
                " > ", output_path))

  ## import status_ids from the text file
  scrape_timeline <- read_delim(output_path, "/", escape_double = FALSE,
                                col_names = FALSE,
                                col_types = cols(X2 = col_skip(),
                                                 X1 = col_skip(), X3 = col_skip(),
                                                 X5 = col_skip(), X6 = col_character()),
                                trim_ws = TRUE) %>%
    dplyr::rename(screen_name = X4,
                  status_id = X6) %>%
    dplyr::distinct(status_id, .keep_all = TRUE)

  message(paste(nrow(scrape_timeline), "status URLs captured, rehydrating timeline..."))
  timeline_rehydration <- rtweet::lookup_statuses(scrape_timeline$status_id, token = token)

  ## clean up temp files
  if (delete_tempfile == TRUE && file.exists(output_path)) {
    file.remove(output_path)
  }

  return(timeline_rehydration)
}
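A small usage sketch for the timeline wrapper, assuming a default rtweet token is already loaded in the environment; the handle and count are illustrative only:

## pull the most recent ~500 statuses of a public account and tally tweet sources
jack_timeline <- snscrape_get_timeline("jack", n = 500, file = "jack_")
dplyr::count(jack_timeline, source, sort = TRUE)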
@@ -0,0 +1,86 @@
#'@title Account Activity Plot for a Twitter Account
#'@description This function creates a bubble plot of account activity by hour for a single Twitter screen_name
#' (inspired by a Python script by Twitter user "@Conspirator0")
#'@param account_name A Twitter screen_name, in quotes.
#'@param depth The maximum depth of tweets to be visualised, starting from the most recent tweet.
#' The Twitter API maximum, and the default, is 3200. Only tweets occurring within the no_of_weeks period will be shown.
#'@param time_zone The timezone of the account.
#' Requires a timezone in the format of the TZ database (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) in quotes.
#' Default is "Africa/Johannesburg"
#'@param no_of_weeks The number of weeks to display. Default is 4. The plot will automatically scale to exclude any period without activity.
#'@param token A Twitter OAuth token. Default is NULL,
#' which will utilise an existing token loaded into the environment, but can be overridden to use a particular token.
#'@keywords twitter, rtweet, visualization, activity, bubble plot
#'@export
#'@examples account_activity("jack",
#'                           depth = 3200,
#'                           time_zone = "America/Los_Angeles",
#'                           no_of_weeks = 4, token = NULL)

account_activity <- function(account_name,
                             depth = 3200,
                             time_zone = "Africa/Johannesburg",
                             no_of_weeks = 4,
                             token = NULL) {
  require(rtweet, quietly = TRUE)
  require(tidyverse, quietly = TRUE)
  require(lubridate, quietly = TRUE)

  rtweet::get_timeline(
    account_name,
    n = depth,
    retryonratelimit = TRUE,
    token = token
  )[, 1:6] %>%
    mutate(created_at2 = with_tz(created_at, tzone = time_zone)) %>%
    mutate(tweet_date = lubridate::date(created_at2)) %>%
    mutate(tweet_hour = lubridate::hour(created_at2)) %>%
    group_by(screen_name) %>%
    group_by(source, add = TRUE) %>%
    group_by(tweet_date, add = TRUE) %>%
    group_by(tweet_hour, add = TRUE) %>%
    mutate(hourly_tweet_count = n()) %>%
    ungroup() %>%
    mutate(tweet_period = as.duration(interval(max(created_at2), created_at2))) %>%
    filter(tweet_period > as.duration(-604800 * no_of_weeks)) %>%
    group_by(source, tweet_date, tweet_hour) %>%
    slice(1) %>%
    ungroup() %>%
    mutate(bubble_scale = max(hourly_tweet_count) / 5) %>%
    ggplot(
      aes(
        x = tweet_hour,
        y = tweet_date,
        size = hourly_tweet_count,
        colour = source,
        alpha = 0.3
      )
    ) +
    geom_point() +
    scale_size_continuous(name = "Hourly tweet volume") +
    scale_x_discrete(
      limits = factor(c(0:23)),
      breaks = c(0:23),
      labels = c(0:23)
    ) +
    expand_limits(x = c(0:23)) +
    theme_minimal() +
    labs(
      title = paste(
        "Account activity: ",
        account_name,
        " (as at ",
        as_datetime(Sys.Date()),
        ")\n",
        " Time zone of tweets: ",
        time_zone,
        sep = ""
      ),
      x = "Hour of day",
      y = "Date"
    ) +
    guides(alpha = "none")
}
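Because the final pipe returns the ggplot object, the plot can be captured and written to disk; a short sketch with an illustrative handle and output file name:

library(ggplot2)

p <- account_activity("jack",
                      depth = 3200,
                      time_zone = "America/Los_Angeles",
                      no_of_weeks = 4)

## save the bubble plot at a readable size
ggplot2::ggsave("jack_activity.png", plot = p, width = 10, height = 6, dpi = 300)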