Adjusted functions files; fixed errors
Arf9999 committed Dec 7, 2021
1 parent 4258d35 commit 03f38e8
Showing 46 changed files with 1,876 additions and 1,905 deletions.
174 changes: 174 additions & 0 deletions R/GOT3_functions.R
@@ -0,0 +1,174 @@
#############################################################################
#'@title Function to rehydrate tweets into rtweet tibbles from GOT3 data format
#'@param got3_df A dataframe as returned by GetOldTweets3
#'@param token An OAuth token loaded in the environment for twitter's Standard API (REQUIRED if a list of tokens is not supplied)
#'@param token_list A list of OAuth tokens loaded in the environment (REQUIRED if token is not specified)
#'@description Uses status_id to request full {rtweet} tibble of tweet data
#'@keywords twitter, getoldtweets3, rtweet
#'@export
###############################################################################
rehydrate_got3_statuses <-
function(got3_df,
token = NULL,
token_list = NULL) {
require(rtweet, quietly = TRUE)
require(dplyr, quietly = TRUE)
require(purrr, quietly = TRUE)
require(readr, quietly = TRUE)

### Check tokens
if (is.null(token) & is.null(token_list)) {
stop(
"Please designate either a token or a list of tokens that are loaded into the environment"
)

}
if (is.null(token_list)) {
token_list <- list(token)
}
###

df_length <- nrow(got3_df) ##check length of GOT list of statuses
last_capture <- 0L ##initial setting for statuses captured
#check ratelimits for all tokens
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
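## Each statuses/lookup request accepts up to 100 status_ids, so a token with
## `remaining` requests left can rehydrate roughly remaining * 100 statuses
## before it has to wait for its rate-limit window to reset.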


if (as.numeric(max(ratelimits$remaining)) > df_length / 100) {
##if no ratelimit reset or token rotation required.
message(paste0("lookup of ", df_length, " statuses"))

row_of_max_rl <- which.max(ratelimits$remaining)
rehydration <- rtweet::lookup_statuses(unlist(got3_df[, "id"]),
token = token_list[[row_of_max_rl]])


} else{
while (last_capture < nrow(got3_df)) {
# iterate through the rows using token rotation & rate reset pausing
row_of_max_rl <- which.max(ratelimits$remaining)

##first iteration
if (last_capture == 0L) {
last_capture <-
as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100)
message(paste0("rl requests remain : ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
message(paste0("base: ", 1L, " upper: ", last_capture))

rehydration <-
rtweet::lookup_statuses(unlist(got3_df[c(1:last_capture), "id"]),
token = token_list[[row_of_max_rl]])
} else{
#iterate through remainder of status_ids
base_capture <- last_capture + 1
last_capture <-
min(c(
base_capture + as.numeric(ratelimits[row_of_max_rl, "remaining"] * 100),
nrow(got3_df)
))
message(paste0("rl requests remain : ", as.numeric(ratelimits[row_of_max_rl, "remaining"])))
message(paste0("base: ", base_capture, " upper: ", last_capture))

rehydration <- dplyr::bind_rows(rehydration,
rtweet::lookup_statuses(unlist(got3_df[c(base_capture:last_capture), "id"]),
token = token_list[[row_of_max_rl]]))

}
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")#check ratelimits



###manage rate-limit resets as gracefully as possible - conservatively pause once the best token has fewer than 100 requests remaining
if (max(ratelimits$remaining) < 100 &
last_capture < nrow(got3_df)) {
message(paste0(
"Pausing for ratelimit reset: ",
min(ratelimits$reset),
" minutes"
))
Sys.sleep(as.numeric(min(ratelimits$reset) * 60))
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
}

}
}

##check for missing tweets and re-lookup
orig <- dplyr::as_tibble(got3_df[, "id"]) %>%
rename(status_id = id)
message(paste0("original: ", nrow(orig)))

missing <-
anti_join(orig, as_tibble(rehydration[, "status_id"]), by = "status_id")

message(paste0("missing: ", nrow(missing)))

if (nrow(missing) > 0) {
##try again to look up missing statuses
ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
row_of_max_rl <- which.max(ratelimits$remaining)
df_length <- nrow(missing)
message(paste0("Attempting to populate missing tweets: ", df_length))
rehydration <- bind_rows(rehydration,
rtweet::lookup_statuses(unlist(missing[c(1:df_length), "status_id"]),
token = token_list[[row_of_max_rl]]))

##write log file of missing tweets.
missing <-
anti_join(orig, as_tibble(rehydration[, "status_id"]), by = "status_id") %>%
rename(id = status_id) %>%
left_join(got3_df, by = "id") %>%
mutate(error = "Status not downloaded")

message(
paste0(
nrow(missing),
" tweets not downloaded, see log file for details: ",
"got3_rehydration_missing_log_",
as.numeric(Sys.time()),
".csv"
)
)

readr::write_csv(missing,
paste0(
"got3_rehydration_missing_log_",
as.numeric(Sys.time()),
".csv"
))


}
##Fetch retweets for statuses that have them (the Standard API returns at most 100 retweets per original tweet)
retweets <- filter(rehydration, retweet_count > 0) %>%
select(status_id)

message(paste ("tweets with retweets:", nrow(retweets)))

ratelimits <-
purrr::map_df(token_list, rtweet::rate_limit, query = "get_retweets")
row_of_max_rl <- which.max(ratelimits$remaining)

retweet_temp <-
purrr::map_df(retweets$status_id,
rtweet::get_retweets,
n = 100,
token = token_list[[row_of_max_rl]])
message(paste0("temp rows:", nrow(retweet_temp)))

message("Captured ",
nrow(retweet_temp),
" retweets (a maximum of 100 per original tweet)")



rehydration <- bind_rows(rehydration, retweet_temp)

return(rehydration)

}
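
## Usage sketch (illustrative only): `got3_data` is assumed to be a GetOldTweets3
## data frame with an "id" column, and `token_a` / `token_b` are assumed to be
## rtweet OAuth tokens already loaded in the environment (e.g. via rtweet::create_token()).
# rehydrated <- rehydrate_got3_statuses(got3_data,
#                                       token_list = list(token_a, token_b))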

114 changes: 114 additions & 0 deletions R/SNscrape_functions.R
@@ -0,0 +1,114 @@
#############################################################################
#'@title Wrapper to undertake historical searches calling Python snscrape library.
#'@param search_string A search string (in quotes)
#'@param since_date Start date for the search (ISO-format date in quotes)
#'@param until_date Latest date for the search (ISO-format date in quotes; NB: the search works backward from this date)
#'@param n Maximum number of results - the twitter Standard API allows lookup of up to 90k statuses per 15 minutes
#'@param file temporary file name for snscrape URL list, timestamp will be appended.
#'@param token An OAuth token loaded in the environment for twitter's Standard API (if not specified, default token will be used)
#'@param delete_tempfile Clear temp file of statuses default = TRUE
#'@description Calls the Python snscrape script to pull the status URLs returned by a search, then uses rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_search("Trump", since_date = "2016-09-06", until_date = "2016-11-06", n = 1000, file = "test_", token = NULL)
###############################################################################

snscrape_search <- function(search_string, #search terms in quotes
since_date = NULL, #optional iso date in quotes
until_date = NULL, #optional iso date in quotes
n = 100, #max number of statuses to retrieve. Fewer than 90K recommended due to rate-limiting
file = "temp", #temporary name for snscrape text file
token = NULL, #specify token if required
delete_tempfile = TRUE) #delete text file of statuses
{
require(rtweet, quietly = TRUE)
require(readr, quietly = TRUE)
require(dplyr, quietly = TRUE)

if(!is.null(since_date)) {
search_string <- paste0(search_string," since:",since_date)
}

if(!is.null(until_date)){
search_string <-paste0(search_string, " until:", until_date)
}

output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

##call Python scraper
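## With the values from @examples (n = 1000, file = "test_"), the resulting shell
## command looks roughly like:
## snscrape -n 1000 twitter-search "Trump since:2016-09-06 until:2016-11-06" > test_<timestamp>.txt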
system(paste0("snscrape -n ", n," twitter-search ","\"", search_string,"\"",
" > ", output_path))

##import status_ids from text file
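## snscrape emits one tweet URL per line, i.e. https://twitter.com/<screen_name>/status/<status_id>,
## so splitting on "/" puts the screen_name in field 4 and the status_id in field 6.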
scrape_statuses <- read_delim(output_path, "/", escape_double = FALSE,
col_names = FALSE,
col_types = cols(X2 = col_skip(),
X1 = col_skip(), X3 = col_skip(),
X5 = col_skip(), X6 = col_character()),
trim_ws = TRUE) %>%
dplyr::rename(screen_name = X4,
status_id = X6) %>%
dplyr::distinct(status_id, .keep_all = TRUE)

message(paste(nrow(scrape_statuses), "status URLs captured, rehydrating..."))
temp_rehydration <- rtweet::lookup_statuses(scrape_statuses$status_id, token = token)

##cleanup temp files
if (delete_tempfile == TRUE & file.exists(output_path)){
file.remove(output_path)
}

return(temp_rehydration)

}

#############################################################################
#'@title Wrapper to pull the timeline of a user
#'@param screen_name A twitter username/handle (no "@" required) or user_id (in quotes)
#'@param n Maximum number of results - the twitter Standard API allows lookup of up to 90k statuses per 15 minutes
#'@param file temporary file name for snscrape URL list, timestamp will be appended.
#'@param token An OAuth token loaded in the environment for twitter's Standard API (if not specified, default token will be used)
#'@param delete_tempfile Clear temp file of statuses default = TRUE
#'@description Calls the Python snscrape script to pull the status URLs of a user's timeline, then uses rtweet to rehydrate those statuses.
#' See https://github.com/JustAnotherArchivist/snscrape
#'@keywords twitter, snscrape, rtweet
#'@export
#'@examples
#'test <- snscrape_get_timeline("Jack", n = 1000, file = "test_")
###############################################################################

snscrape_get_timeline <- function (screen_name,
n = 100,
file = "temp",
token = NULL,
delete_tempfile = TRUE)
{
require(rtweet, quietly = TRUE)
require(readr, quietly = TRUE)
require(dplyr, quietly = TRUE)


output_path <- paste0(file, as.numeric(Sys.time()), ".txt")

##call Python scraper
system(paste0("snscrape -n ", n," twitter-user ", screen_name,
" > ", output_path))

##import status_ids from text file
scrape_timeline <- read_delim(output_path, "/", escape_double = FALSE,
col_names = FALSE,
col_types = cols(X2 = col_skip(),
X1 = col_skip(), X3 = col_skip(),
X5 = col_skip(), X6 = col_character()),
trim_ws = TRUE) %>%
dplyr::rename(screen_name = X4,
status_id = X6) %>%
dplyr::distinct(status_id, .keep_all = TRUE)

message(paste(nrow(scrape_timeline), "status URLs captured, rehydrating timeline..."))
timeline_rehydration <- rtweet::lookup_statuses(scrape_timeline$status_id, token = token)

##cleanup temp files
if (delete_tempfile == TRUE & file.exists(output_path)){
file.remove(output_path)
}

return(timeline_rehydration)
}
86 changes: 86 additions & 0 deletions R/accountactivity.R
@@ -0,0 +1,86 @@
#'@title Account Activity Plot for a Twitter Account
#'@description This function creates a bubble plot of account activity by hour of a single twitter screen_name
#' (inspired by a Python script by twitter user "@Conspirator0")
#'@param account_name A twitter screen_name, in quotes.
#'@param depth The maximum depth of tweets to be visualised. Starts from most recent tweet.
#'Twitter API maximum and default is 3200. Only tweets occurring within the no_of_weeks window will be shown.
#'@param time_zone The timezone of the account.
#' Requires timezone in format of TZ database (https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) in quotes.
#' Default is "Africa/Johannesburg"
#'@param no_of_weeks The number of weeks to display. Default is 4. Plot will automatically scale to exclude any period without activity.
#'@param token A twitter OAuth token. Default is NULL,
#' which uses an existing token loaded into the environment, but can be overridden to use a particular token.
#'@keywords twitter, rtweet, visualization, activity, bubble plot.
#'@export
#'@examples account_activity("jack",
#' depth = 3200,
#' time_zone = "America/Los_Angeles",
#' no_of_weeks = 4,token = NULL)


account_activity <- function(account_name,
depth = 3200,
time_zone = "Africa/Johannesburg",
no_of_weeks = 4,
token = NULL) {
require(rtweet, quietly = TRUE)
require(tidyverse, quietly = TRUE)
require(lubridate, quietly = TRUE)
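## The pipeline below pulls up to `depth` tweets, converts created_at to the target
## time zone, counts tweets per source per hour and day, keeps only the last
## `no_of_weeks` of activity, and plots hour-of-day against date with bubble size
## mapped to the hourly tweet count and colour mapped to the tweet source.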

rtweet::get_timeline(
account_name,
n = depth,
retryonratelimit = TRUE,
token = token
)[, 1:6] %>%
mutate(created_at2 = with_tz(created_at, tzone = time_zone)) %>%
mutate(tweet_date = lubridate::date(created_at2)) %>%
mutate(tweet_hour = lubridate::hour(created_at2)) %>%
group_by(screen_name, source, tweet_date, tweet_hour) %>%
mutate(hourly_tweet_count = n()) %>%
ungroup %>%
mutate(tweet_period = (as.duration(interval(
max(created_at2), created_at2
)))) %>%
filter(tweet_period > as.duration(-604800 * no_of_weeks)) %>%
group_by(source, tweet_date, tweet_hour) %>%
slice(1) %>%
ungroup() %>%
mutate(bubble_scale = max(hourly_tweet_count) / 5) %>%
ggplot(
aes(
x = tweet_hour,
y = tweet_date,
size = hourly_tweet_count,
colour = source,
alpha = 0.3
)
) +
geom_point() +
scale_size_continuous(name = "Hourly tweet volume") +
scale_x_discrete(
limits = factor(c(0:23)),
breaks = c(0:23),
labels = c(0:23)
) +
expand_limits(x = c(0:23)) +
theme_minimal() +
labs(
title = paste(
"Account activity: ",
account_name,
" (as at ",
as_datetime(Sys.Date()),
")\n",
" Time zone of tweets: ",
time_zone,
sep = ""
),
x = "Hour of day",
y = "Date"
) +
guides(alpha = "none")
}