Skip to content

Commit

Permalink
Add first data pulling function
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexAxthelm committed Dec 12, 2023
1 parent ad93160 commit cbe4a73
Show file tree
Hide file tree
Showing 8 changed files with 285 additions and 12 deletions.
9 changes: 9 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,12 @@ License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Imports:
DBI,
dbplyr,
dplyr,
logger,
RPostgres,
withr
Suggests:
rstudioapi
20 changes: 8 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,19 @@ RUN groupadd -r runner-workflow-factset \
&& chown -R runner-workflow-factset /home/runner-workflow-factset
WORKDIR /home/runner-workflow-factset

# # install system dependencies
# RUN apt-get update \
# && apt-get install -y --no-install-recommends \
# git=1:2.34.* \
# libcurl4-openssl-dev=7.81.* \
# libicu-dev=70.* \
# libssl-dev=3.0.* \
# openssh-client=1:8.* \
# wget=1.21.* \
# && chmod -R a+rwX /root \
# && rm -rf /var/lib/apt/lists/*
# install system dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
libicu-dev=70.* \
libpq-dev=14.* \
&& chmod -R a+rwX /root \
&& rm -rf /var/lib/apt/lists/*

# set frozen CRAN repo
ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30"
RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" \
# install packages for dependency resolution and installation
&& Rscript -e "install.packages('pak')"
&& Rscript -e "install.packages(c('pak', 'jsonlite'))"

# copy in everything from this repo
COPY . /workflow.factset
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# Generated by roxygen2: do not edit by hand

export(get_factset_entity_info)
importFrom(dplyr,"%>%")
98 changes: 98 additions & 0 deletions R/connect_factset_db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Connection function

#' Open a connection to the FactSet PostgreSQL database
#'
#' Resolves the credentials, connects with `DBI::dbConnect()` using the
#' `RPostgres::Postgres()` driver, and registers the connection with
#' `reg_conn_finalizer()` against the caller's environment.
#'
#' The password is resolved in this order: the `password` argument (by default
#' the `R_DATABASE_PASSWORD` environment variable), then the system keyring if
#' the `keyring` package is installed (prompting to store one if no entry
#' exists for `username`), then an RStudio password prompt in interactive
#' sessions where `rstudioapi` is installed.
#'
#' @param dbname,host,port,options Passed to `DBI::dbConnect()`.
#' @param username Database user. Defaults to the `R_DATABASE_USER` environment
#'   variable.
#' @param password Database password. Defaults to the `R_DATABASE_PASSWORD`
#'   environment variable.
#' @param keyring_service_name Service name used for keyring lookups.
#'
#' @return The connection object returned by `reg_conn_finalizer()`. An error
#'   is raised if no username or no password can be determined.
#'
#' @noRd
connect_factset_db <-
  function(
    dbname = "delta",
    host = "data-eval-db.postgres.database.azure.com",
    port = 5432L,
    options = "-c search_path=fds",
    username = Sys.getenv("R_DATABASE_USER"),
    password = Sys.getenv("R_DATABASE_PASSWORD"),
    keyring_service_name = "2dii_factset_database") {

    # Capture the caller's frame eagerly. Passing `parent.frame()` as a lazily
    # evaluated argument makes the result depend on when the promise is forced.
    caller_env <- parent.frame()

    if (!nzchar(username)) {
      msg <- paste(
        "No database username could be found.",
        "Please set the username as an environment variable"
      )
      logger::log_error(msg)
      # log_error() only writes a log record; abort explicitly so we never
      # attempt a connection with empty credentials.
      stop(msg, call. = FALSE)
    }

    if (!nzchar(password)) {
      # if password not defined in .env, look in systems keyring
      if (requireNamespace("keyring", quietly = TRUE)) {
        known_users <- keyring::key_list(service = keyring_service_name)$username
        if (!username %in% known_users) {
          keyring::key_set(
            service = keyring_service_name,
            username = username,
            prompt = "Enter password for the FactSet database (it will be stored in your system's keyring): "
          )
        }
        password <- keyring::key_get(
          service = keyring_service_name,
          username = username
        )
      } else if (interactive() && requireNamespace("rstudioapi", quietly = TRUE)) {
        password <- rstudioapi::askForPassword(
          prompt = "Please enter the FactSet database password:"
        )
      } else {
        msg <- paste(
          "No database password could be found.",
          "Please set the password as an environment variable"
        )
        logger::log_error(msg)
        stop(msg, call. = FALSE)
      }
    }

    logger::log_trace(
      "Connecting to database {dbname} on {host}:{port} as {username}"
    )
    conn <-
      DBI::dbConnect(
        drv = RPostgres::Postgres(),
        dbname = dbname,
        host = host,
        port = port,
        user = username,
        password = password,
        options = options
      )

    reg_conn_finalizer(conn, DBI::dbDisconnect, caller_env)
  }

# connection finalizer to ensure connection is closed --------------------------
# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/

#' Register automatic clean-up for a database connection
#'
#' If `envir` is the global environment, a finalizer is registered (via
#' `reg.finalizer()` with `onexit = TRUE`) on a helper environment attached to
#' the connection as the `"env_finalizer"` attribute. Otherwise the clean-up is
#' deferred with `withr::defer()` on `envir`. In both cases the connection is
#' only closed (and a warning logged) if `DBI::dbIsValid()` still reports it as
#' valid at that point.
#'
#' @param conn A DBI connection.
#' @param close_fun Function called with the connection to close it.
#' @param envir Environment whose exit should trigger the clean-up.
#'
#' @return `conn`; when `envir` is the global environment it carries the
#'   additional `"env_finalizer"` attribute.
#'
#' @noRd
reg_conn_finalizer <- function(conn, close_fun, envir) {
  if (identical(.GlobalEnv, envir)) {
    # The helper environment keeps its own reference to the connection so the
    # finalizer can still reach it when it runs.
    env_finalizer <- new.env(parent = emptyenv())
    env_finalizer$conn <- conn
    attr(conn, "env_finalizer") <- env_finalizer

    reg.finalizer(
      env_finalizer,
      function(e) {
        if (DBI::dbIsValid(e$conn)) {
          logger::log_warn("Warning: A database connection was closed automatically because the connection object was removed or the R session was closed.")
          try(close_fun(e$conn))
        }
      },
      onexit = TRUE
    )
  } else {
    withr::defer(
      {
        if (DBI::dbIsValid(conn)) {
          # Query the connection info once; both fields come from the same call.
          conn_info <- DBI::dbGetInfo(conn)
          dbname <- conn_info$dbname
          host <- conn_info$host

          # Single-line message: a literal line break inside the string would
          # end up verbatim in the log record.
          logger::log_warn(
            "The database connection to {dbname} on {host} was closed automatically because the calling environment was closed."
          )
          try(close_fun(conn))
        }
      },
      envir = envir,
      priority = "last"
    )
  }

  logger::log_trace("Database connection registered for finalization")
  conn
}
113 changes: 113 additions & 0 deletions R/get_factset_entity_info.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#' Get the entity info data from the FactSet database
#'
#' Connects to the FactSet database, builds lazy queries for entity name,
#' country of domicile, sector, industry and credit risk parent, left-joins
#' them by `factset_entity_id`, downloads the result and disconnects. This
#' prepares the `factset_entity_info` tibble.
#'
#' @param ... Arguments to be passed to the `connect_factset_db()` function (for
#'   specifying database connection parameters)
#'
#' @return A tibble properly prepared to be saved as the
#'   `factset_entity_info.rds` output file, with columns `factset_entity_id`,
#'   `entity_proper_name`, `iso_country`, `sector_code`,
#'   `factset_sector_desc`, `industry_code`, `factset_industry_desc`,
#'   `credit_parent_id` and `ent_entity_affiliates_last_update`.
#'
#' @export

get_factset_entity_info <-
  function(...) {
    # build connection to database ---------------------------------------------

    factset_db <- connect_factset_db(...)

    logger::log_debug("Extracting entity info from database.")

    # company_name -------------------------------------------------------------

    logger::log_trace("Accessing entity proper names.")
    factset_entity_id__entity_proper_name <-
      dplyr::tbl(factset_db, "sym_v1_sym_entity") %>%
      dplyr::select("factset_entity_id", "entity_proper_name")


    # country_of_domicile ------------------------------------------------------

    logger::log_trace("Accessing entity country of domicile.")
    factset_entity_id__iso_country <-
      dplyr::tbl(factset_db, "sym_v1_sym_entity") %>%
      dplyr::select("factset_entity_id", "iso_country")


    # sector -------------------------------------------------------------------

    logger::log_trace("Accessing entity sector.")
    factset_entity_id__sector_code <-
      dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>%
      dplyr::select("factset_entity_id", "sector_code")

    # Columns are selected by name (string), like everywhere else in this
    # function; `.data$col` inside select() is deprecated in tidyselect.
    factset_sector_code__factset_sector_desc <-
      dplyr::tbl(factset_db, "ref_v2_factset_sector_map") %>%
      dplyr::select("factset_sector_code", "factset_sector_desc")

    factset_entity_id__factset_sector_desc <-
      factset_entity_id__sector_code %>%
      dplyr::left_join(
        factset_sector_code__factset_sector_desc,
        by = c("sector_code" = "factset_sector_code")
      ) %>%
      dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc")


    # sub-sector/industry ------------------------------------------------------

    logger::log_trace("Accessing entity industry/sector/subsector.")
    factset_entity_id__industry_code <-
      dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>%
      dplyr::select("factset_entity_id", "industry_code")

    factset_industry_code_factset_industry_desc <-
      dplyr::tbl(factset_db, "ref_v2_factset_industry_map") %>%
      dplyr::select("factset_industry_code", "factset_industry_desc")

    factset_entity_id__factset_industry_desc <-
      factset_entity_id__industry_code %>%
      dplyr::left_join(
        factset_industry_code_factset_industry_desc,
        by = c("industry_code" = "factset_industry_code")
      ) %>%
      dplyr::select(
        "factset_entity_id",
        "industry_code",
        "factset_industry_desc"
      )


    # credit risk parent -------------------------------------------------------

    logger::log_trace("Accessing entity credit risk parent.")
    ent_v1_ent_entity_affiliates <-
      dplyr::tbl(factset_db, "ent_v1_ent_entity_affiliates")
    ref_v2_affiliate_type_map <-
      dplyr::tbl(factset_db, "ref_v2_affiliate_type_map")

    # Aggregate to the latest `begin_time` so exactly one value is pulled, even
    # if several history rows share that timestamp or none match. The value is
    # injected as a scalar into the query below.
    ent_entity_affiliates_last_update <-
      dplyr::tbl(factset_db, "fds_fds_file_history") %>%
      dplyr::filter(.data$table_name == "ent_entity_affiliates") %>%
      dplyr::summarise(begin_time = max(.data$begin_time, na.rm = TRUE)) %>%
      dplyr::pull("begin_time")

    # The affiliates table is keyed by the parent; swap the two id columns so
    # the result can be joined on the affiliated entity's id.
    factset_entity_id__credit_parent_id <-
      ent_v1_ent_entity_affiliates %>%
      dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>%
      dplyr::filter(.data$aff_type_desc == "Credit Risk Parent") %>%
      dplyr::select(
        factset_entity_id = "factset_affiliated_entity_id",
        credit_parent_id = "factset_entity_id"
      ) %>%
      dplyr::mutate(
        ent_entity_affiliates_last_update =
          .env$ent_entity_affiliates_last_update
      )


    # merge and collect --------------------------------------------------------

    logger::log_trace("Merging entity info.")
    entity_info <-
      factset_entity_id__entity_proper_name %>%
      dplyr::left_join(
        factset_entity_id__iso_country,
        by = "factset_entity_id"
      ) %>%
      dplyr::left_join(
        factset_entity_id__factset_sector_desc,
        by = "factset_entity_id"
      ) %>%
      dplyr::left_join(
        factset_entity_id__factset_industry_desc,
        by = "factset_entity_id"
      ) %>%
      dplyr::left_join(
        factset_entity_id__credit_parent_id,
        by = "factset_entity_id"
      )

    logger::log_trace("Downloading merged entity info from database.")
    entity_info <- dplyr::collect(entity_info)
    logger::log_trace("Download complete.")

    logger::log_trace("Disconnecting from database.")
    DBI::dbDisconnect(factset_db)


    # return prepared data -----------------------------------------------------
    entity_info
  }
7 changes: 7 additions & 0 deletions R/workflow.factset-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Package-level documentation stub. The roxygen block below is attached to the
# "_PACKAGE" string, and the block between the usethis markers declares the
# `%>%` import from dplyr; `NULL` gives that second block an object to sit on.
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
#' @importFrom dplyr %>%
## usethis namespace: end
NULL
21 changes: 21 additions & 0 deletions man/get_factset_entity_info.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions man/workflow.factset-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cbe4a73

Please sign in to comment.