From 0b24fba1ebc1c3c60da6c3232121ec42e2d6c628 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 13:53:36 +0100 Subject: [PATCH 01/33] add inital Dockerfile --- .dockerignore | 2 ++ Dockerfile | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2833d34 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git/ +Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9b7bbeb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,65 @@ +# using rocker r-vers as a base with R 4.3.1 +# https://hub.docker.com/r/rocker/r-ver +# https://rocker-project.org/images/versioned/r-ver.html +# +# sets CRAN repo to use Posit Package Manager to freeze R package versions to +# those available on 2023-10-30 +# https://packagemanager.posit.co/client/#/repos/2/overview +# https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30 + +# set proper base image +ARG R_VERS="4.3.1" +FROM rocker/r-ver:$R_VERS AS base + +# set Docker image labels +LABEL org.opencontainers.image.source=https://github.com/RMI-PACTA/workflow.factset +LABEL org.opencontainers.image.description="Extract FactSet Data for use in PACTA" +LABEL org.opencontainers.image.licenses=MIT +LABEL org.opencontainers.image.title="" +LABEL org.opencontainers.image.revision="" +LABEL org.opencontainers.image.version="" +LABEL org.opencontainers.image.vendor="" +LABEL org.opencontainers.image.base.name="" +LABEL org.opencontainers.image.ref.name="" +LABEL org.opencontainers.image.authors="" + +# set apt-get to noninteractive mode +ARG DEBIAN_FRONTEND="noninteractive" +ARG DEBCONF_NOWARNINGS="yes" + +RUN groupadd -r runner-workflow-factset \ + && useradd -r -g runner-workflow-factset runner-workflow-factset \ + && mkdir -p /home/runner-workflow-factset \ + && chown -R runner-workflow-factset /home/runner-workflow-factset 
+WORKDIR /home/runner-workflow-factset + +# # install system dependencies +# RUN apt-get update \ +# && apt-get install -y --no-install-recommends \ +# git=1:2.34.* \ +# libcurl4-openssl-dev=7.81.* \ +# libicu-dev=70.* \ +# libssl-dev=3.0.* \ +# openssh-client=1:8.* \ +# wget=1.21.* \ +# && chmod -R a+rwX /root \ +# && rm -rf /var/lib/apt/lists/* + +# set frozen CRAN repo +ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" +RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" \ + # install packages for dependency resolution and installation + && Rscript -e "install.packages('pak')" + +# copy in everything from this repo +COPY . /workflow.factset + +# install R package dependencies +RUN Rscript -e "\ + pak::pkg_install('local::/workflow.factset'); \ + " + +USER runner-workflow-factset + +# set default run behavior +CMD ["input_dir/default_config.json"] From ad93160ef564980af241368134e2113c52b66677 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 13:55:16 +0100 Subject: [PATCH 02/33] Add GitHub actions --- .Rbuildignore | 2 + .dockerignore | 1 + .../workflows/build-Docker-image-nightly.yml | 12 ++++ .../build-Docker-image-on-push-to-main.yml | 12 ++++ .../build-Docker-image-on-push-to-pr.yml | 37 ++++++++++ .../workflows/build-and-push-Docker-image.yml | 67 +++++++++++++++++++ .github/workflows/check-R-sysdeps.yml | 32 +++++++++ .github/workflows/run-hadolint.yml | 11 +++ 8 files changed, 174 insertions(+) create mode 100644 .github/workflows/build-Docker-image-nightly.yml create mode 100644 .github/workflows/build-Docker-image-on-push-to-main.yml create mode 100644 .github/workflows/build-Docker-image-on-push-to-pr.yml create mode 100644 .github/workflows/build-and-push-Docker-image.yml create mode 100644 .github/workflows/check-R-sysdeps.yml create mode 100644 .github/workflows/run-hadolint.yml diff --git a/.Rbuildignore b/.Rbuildignore index 5163d0b..8cc3750 100644 
--- a/.Rbuildignore +++ b/.Rbuildignore @@ -1 +1,3 @@ ^LICENSE\.md$ +.git/ +.github/ diff --git a/.dockerignore b/.dockerignore index 2833d34..c49182c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,3 @@ .git/ +.github/ Dockerfile diff --git a/.github/workflows/build-Docker-image-nightly.yml b/.github/workflows/build-Docker-image-nightly.yml new file mode 100644 index 0000000..7ffa64f --- /dev/null +++ b/.github/workflows/build-Docker-image-nightly.yml @@ -0,0 +1,12 @@ +on: + schedule: + - cron: '0 0 * * 1,2,3,4,5' + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: nightly diff --git a/.github/workflows/build-Docker-image-on-push-to-main.yml b/.github/workflows/build-Docker-image-on-push-to-main.yml new file mode 100644 index 0000000..b75fca6 --- /dev/null +++ b/.github/workflows/build-Docker-image-on-push-to-main.yml @@ -0,0 +1,12 @@ +on: + push: + branches: [main] + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: main diff --git a/.github/workflows/build-Docker-image-on-push-to-pr.yml b/.github/workflows/build-Docker-image-on-push-to-pr.yml new file mode 100644 index 0000000..16934bb --- /dev/null +++ b/.github/workflows/build-Docker-image-on-push-to-pr.yml @@ -0,0 +1,37 @@ +on: + pull_request: + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: pr${{ github.event.pull_request.number }} + + add_comment: + needs: build_docker_image + runs-on: ubuntu-latest + steps: + - name: Find Comment + # https://github.com/peter-evans/find-comment + uses: peter-evans/find-comment@v2 + id: fc + with: + issue-number: ${{ 
github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: Docker image from this PR + + - name: Create or update comment + # https://github.com/peter-evans/create-or-update-comment + uses: peter-evans/create-or-update-comment@v3 + with: + comment-id: ${{ steps.fc.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + Docker image from this PR (${{ github.event.pull_request.head.sha }}) created + ``` + docker pull ${{ needs.build_docker_image.outputs.full-image-name }} + ``` + edit-mode: replace diff --git a/.github/workflows/build-and-push-Docker-image.yml b/.github/workflows/build-and-push-Docker-image.yml new file mode 100644 index 0000000..b6d8e1e --- /dev/null +++ b/.github/workflows/build-and-push-Docker-image.yml @@ -0,0 +1,67 @@ +--- +name: Build and push docker image + +on: + workflow_call: + inputs: + image-name: + required: true + type: string + image-tag: + required: true + type: string + outputs: + full-image-name: + description: "Full pushed image name including host/registry, name, and tag" + value: ${{ jobs.docker.outputs.full-image-name }} + +jobs: + docker: + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + timeout-minutes: 25 + outputs: + full-image-name: ${{ steps.image-name.outputs.full-image-name }} + + steps: + + - name: Define image name + id: image-name + run: | + full_image_name="ghcr.io/${{ github.repository_owner }}/${{ inputs.image-name }}:${{ inputs.image-tag }}" + full_image_name=$(echo $full_image_name | tr '[A-Z]' '[a-z]') + echo "full-image-name=$full_image_name" >> "$GITHUB_OUTPUT" + echo "$full_image_name" > full-image-name + + - uses: actions/upload-artifact@v3 + with: + name: full-image-name + path: . 
+ + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v5 + with: + push: true + tags: ${{ steps.image-name.outputs.full-image-name }} + cache-from: type=gha + cache-to: type=gha,mode=min + no-cache-filters: install-pacta + + check-system-dependencies: + name: "Check System Dependencies" + needs: docker + uses: ./.github/workflows/check-R-sysdeps.yml + with: + image: ${{ needs.docker.outputs.full-image-name }} \ No newline at end of file diff --git a/.github/workflows/check-R-sysdeps.yml b/.github/workflows/check-R-sysdeps.yml new file mode 100644 index 0000000..3a1c08b --- /dev/null +++ b/.github/workflows/check-R-sysdeps.yml @@ -0,0 +1,32 @@ +--- +name: Check R system dependencies + +on: + workflow_call: + inputs: + image: + required: true + type: string + +jobs: + + check-system-dependencies: + runs-on: ubuntu-latest + steps: + - name: 'Pull image' + run: | + echo ${{ inputs.image }} + docker pull ${{ inputs.image }} + - name: 'Run pak::sysreqs_check_installed()' + run: | + + docker run \ + --rm \ + --entrypoint "/bin/sh" \ + ${{ inputs.image }} \ + -c "Rscript -e ' + x <- pak::sysreqs_check_installed() + print(x) + is_installed <- as.data.frame(x)[[\"installed\"]] + stopifnot(all(is_installed)) + '" diff --git a/.github/workflows/run-hadolint.yml b/.github/workflows/run-hadolint.yml new file mode 100644 index 0000000..0f07812 --- /dev/null +++ b/.github/workflows/run-hadolint.yml @@ -0,0 +1,11 @@ +--- +on: [push, pull_request] + +jobs: + hadolint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: hadolint/hadolint-action@v3.1.0 + with: + dockerfile: Dockerfile From cbe4a73adb739ec9583924e36d1722640e3f5868 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 
16:49:56 +0100 Subject: [PATCH 03/33] Add first data pulling function --- DESCRIPTION | 9 +++ Dockerfile | 20 +++--- NAMESPACE | 2 + R/connect_factset_db.R | 98 +++++++++++++++++++++++++++ R/get_factset_entity_info.R | 113 ++++++++++++++++++++++++++++++++ R/workflow.factset-package.R | 7 ++ man/get_factset_entity_info.Rd | 21 ++++++ man/workflow.factset-package.Rd | 27 ++++++++ 8 files changed, 285 insertions(+), 12 deletions(-) create mode 100644 R/connect_factset_db.R create mode 100644 R/get_factset_entity_info.R create mode 100644 R/workflow.factset-package.R create mode 100644 man/get_factset_entity_info.Rd create mode 100644 man/workflow.factset-package.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 03ec3c5..852f0ee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,3 +30,12 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 +Imports: + DBI, + dbplyr, + dplyr, + logger, + RPostgres, + withr +Suggests: + rstudioapi diff --git a/Dockerfile b/Dockerfile index 9b7bbeb..ac67d99 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,23 +33,19 @@ RUN groupadd -r runner-workflow-factset \ && chown -R runner-workflow-factset /home/runner-workflow-factset WORKDIR /home/runner-workflow-factset -# # install system dependencies -# RUN apt-get update \ -# && apt-get install -y --no-install-recommends \ -# git=1:2.34.* \ -# libcurl4-openssl-dev=7.81.* \ -# libicu-dev=70.* \ -# libssl-dev=3.0.* \ -# openssh-client=1:8.* \ -# wget=1.21.* \ -# && chmod -R a+rwX /root \ -# && rm -rf /var/lib/apt/lists/* +# install system dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libicu-dev=70.* \ + libpq-dev=14.* \ + && chmod -R a+rwX /root \ + && rm -rf /var/lib/apt/lists/* # set frozen CRAN repo ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" \ # install packages for 
dependency resolution and installation - && Rscript -e "install.packages('pak')" + && Rscript -e "install.packages(c('pak', 'jsonlite'))" # copy in everything from this repo COPY . /workflow.factset diff --git a/NAMESPACE b/NAMESPACE index 6ae9268..f4f7aae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,4 @@ # Generated by roxygen2: do not edit by hand +export(get_factset_entity_info) +importFrom(dplyr,"%>%") diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R new file mode 100644 index 0000000..00e0c37 --- /dev/null +++ b/R/connect_factset_db.R @@ -0,0 +1,98 @@ +# Connection function + +connect_factset_db <- + function( + dbname = "delta", + host = "data-eval-db.postgres.database.azure.com", + port = 5432L, + options = "-c search_path=fds", + username = Sys.getenv("R_DATABASE_USER"), + password = Sys.getenv("R_DATABASE_PASSWORD"), + keyring_service_name = "2dii_factset_database") { + + if (username == "") { + logger::log_error("No database username could be found. Please set the username as an environment variable") + } + + if (password == "") { + # if password not defined in .env, look in systems keyring + if (requireNamespace("keyring", quietly = TRUE)) { + if (!username %in% keyring::key_list(service = keyring_service_name)$username) { + keyring::key_set( + service = keyring_service_name, + username = username, + prompt = "Enter password for the FactSet database (it will be stored in your system's keyring): " + ) + } + password <- keyring::key_get( + service = keyring_service_name, + username = username + ) + } else if (interactive() && requireNamespace("rstudioapi", quietly = TRUE)) { + password <- rstudioapi::askForPassword( + prompt = "Please enter the FactSet database password:" + ) + } else { + logger::log_error( + "No database password could be found. 
Please set the password + as an environment variable" + ) + } + } + + logger::log_trace( + "Connecting to database {dbname} on {host}:{port} as {username}" + ) + conn <- + DBI::dbConnect( + drv = RPostgres::Postgres(), + dbname = dbname, + host = host, + port = port, + user = username, + password = password, + options = options + ) + + reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) + } + +# connection finalizer to ensure connection is closed -------------------------- +# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ + +reg_conn_finalizer <- function(conn, close_fun, envir) { + is_parent_global <- identical(.GlobalEnv, envir) + + if (isTRUE(is_parent_global)) { + env_finalizer <- new.env(parent = emptyenv()) + env_finalizer$conn <- conn + attr(conn, "env_finalizer") <- env_finalizer + + reg.finalizer(env_finalizer, function(e) { + if (DBI::dbIsValid(e$conn)) { + logger::log_warn("Warning: A database connection was closed automatically because the connection object was removed or the R session was closed.") + try(close_fun(e$conn)) + } + }, onexit = TRUE) + } else { + withr::defer( + { + if (DBI::dbIsValid(conn)) { + dbname <- DBI::dbGetInfo(conn)$dbname + host <- DBI::dbGetInfo(conn)$host + + logger::log_warn( + "The database connection to {dbname} on {host} was + closed automatically because the calling environment was closed." + ) + try(close_fun(conn)) + } + }, + envir = envir, + priority = "last" + ) + } + + logger::log_trace("Database connection registered for finalization") + return(conn) +} diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R new file mode 100644 index 0000000..6455bd3 --- /dev/null +++ b/R/get_factset_entity_info.R @@ -0,0 +1,113 @@ +#' Get the entity info data from the FactSet database and prepare the +#' `factset_entity_info` tibble +#' +#' @param ... 
Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_entity_info.rds` output file +#' +#' @export + +get_factset_entity_info <- + function(...) { + # build connection to database --------------------------------------------- + + factset_db <- connect_factset_db(...) + + logger::log_debug("Extracting entity info from database.") + + # company_name ------------------------------------------------------------- + + logger::log_trace("Accessing entity proper names.") + factset_entity_id__entity_proper_name <- + dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::select("factset_entity_id", "entity_proper_name") + + + # country_of_domicile ------------------------------------------------------ + + logger::log_trace("Accessing entity country of domicile.") + factset_entity_id__iso_country <- + dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::select("factset_entity_id", "iso_country") + + + # sector ------------------------------------------------------------------- + + logger::log_trace("Accessing entity sector.") + factset_entity_id__sector_code <- + dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::select("factset_entity_id", "sector_code") + + factset_sector_code__factset_sector_desc <- + dplyr::tbl(factset_db, "ref_v2_factset_sector_map") %>% + dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) + + factset_entity_id__factset_sector_desc <- + factset_entity_id__sector_code %>% + dplyr::left_join(factset_sector_code__factset_sector_desc, by = c("sector_code" = "factset_sector_code")) %>% + dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") + + + # sub-sector/industry ------------------------------------------------------ + + logger::log_trace("Accessing entity industry/sector/subsector.") + factset_entity_id__industry_code <- + dplyr::tbl(factset_db, 
"sym_v1_sym_entity_sector") %>% + dplyr::select("factset_entity_id", "industry_code") + + factset_industry_code_factset_industry_desc <- + dplyr::tbl(factset_db, "ref_v2_factset_industry_map") %>% + dplyr::select("factset_industry_code", "factset_industry_desc") + + factset_entity_id__factset_industry_desc <- + factset_entity_id__industry_code %>% + dplyr::left_join(factset_industry_code_factset_industry_desc, by = c("industry_code" = "factset_industry_code")) %>% + dplyr::select("factset_entity_id", "industry_code", "factset_industry_desc") + + + # credit risk parent ------------------------------------------------------- + + logger::log_trace("Accessing entity credit risk parent.") + ent_v1_ent_entity_affiliates <- dplyr::tbl(factset_db, "ent_v1_ent_entity_affiliates") + ref_v2_affiliate_type_map <- dplyr::tbl(factset_db, "ref_v2_affiliate_type_map") + + ent_entity_affiliates_last_update <- + dplyr::tbl(factset_db, "fds_fds_file_history") %>% + dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% + dplyr::filter(.data$begin_time == max(.data$begin_time, na.rm = TRUE)) %>% + dplyr::pull("begin_time") + + factset_entity_id__credit_parent_id <- + ent_v1_ent_entity_affiliates %>% + dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% + dplyr::filter(.data$aff_type_desc == "Credit Risk Parent") %>% + dplyr::select( + factset_entity_id = "factset_affiliated_entity_id", + credit_parent_id = "factset_entity_id" + ) %>% + dplyr::mutate(ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update) + + + # merge and collect -------------------------------------------------------- + + logger::log_trace("Merging entity info.") + entity_info <- + factset_entity_id__entity_proper_name %>% + dplyr::left_join(factset_entity_id__iso_country, by = "factset_entity_id") %>% + dplyr::left_join(factset_entity_id__factset_sector_desc, by = "factset_entity_id") %>% + dplyr::left_join(factset_entity_id__factset_industry_desc, by = 
"factset_entity_id") %>% + dplyr::left_join(factset_entity_id__credit_parent_id, by = "factset_entity_id") + + logger::log_trace("Downloading merged entity info from database.") + entity_info <- dplyr::collect(entity_info) + logger::log_trace("Download complete.") + + logger::log_trace("Disconnecting from database.") + DBI::dbDisconnect(factset_db) + + + # return prepared data ----------------------------------------------------- + return(entity_info) + } diff --git a/R/workflow.factset-package.R b/R/workflow.factset-package.R new file mode 100644 index 0000000..2f30195 --- /dev/null +++ b/R/workflow.factset-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom dplyr %>% +## usethis namespace: end +NULL diff --git a/man/get_factset_entity_info.Rd b/man/get_factset_entity_info.Rd new file mode 100644 index 0000000..b163ad8 --- /dev/null +++ b/man/get_factset_entity_info.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_entity_info.R +\name{get_factset_entity_info} +\alias{get_factset_entity_info} +\title{Get the entity info data from the FactSet database and prepare the +\code{factset_entity_info} tibble} +\usage{ +get_factset_entity_info(...) 
+} +\arguments{ +\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for +specifying database connection parameters)} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_entity_info.rds} output file +} +\description{ +Get the entity info data from the FactSet database and prepare the +\code{factset_entity_info} tibble +} diff --git a/man/workflow.factset-package.Rd b/man/workflow.factset-package.Rd new file mode 100644 index 0000000..ba4560a --- /dev/null +++ b/man/workflow.factset-package.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow.factset-package.R +\docType{package} +\name{workflow.factset-package} +\alias{workflow.factset} +\alias{workflow.factset-package} +\title{workflow.factset: Extract Financial Data for use in PACTA} +\description{ +Extract data from a FactSet Postgres database for use as part of PACTA Data Preparation +} +\author{ +\strong{Maintainer}: CJ Yetman \email{cj@cjyetman.com} (\href{https://orcid.org/0000-0001-5099-9500}{ORCID}) [contractor] + +Authors: +\itemize{ + \item Jackson Hoffart \email{jackson.hoffart@gmail.com} (\href{https://orcid.org/0000-0002-8600-5042}{ORCID}) [contractor] + \item Jacob Kastl \email{jacob.kastl@gmail.com} [contractor] + \item Alex Axthelm \email{aaxthelm@rmi.org} (\href{https://orcid.org/0000-0001-8579-8565}{ORCID}) [contractor] +} + +Other contributors: +\itemize{ + \item RMI \email{PACTA4investors@rmi.org} [copyright holder, funder] +} + +} +\keyword{internal} From 2134fb60afc37b58259b823ffb335dbcc06448d9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 16:55:58 +0100 Subject: [PATCH 04/33] Add R package linter --- .Rbuildignore | 1 + .github/.gitignore | 1 + .github/workflows/lint-package.yaml | 32 +++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 .github/.gitignore create mode 100644 .github/workflows/lint-package.yaml diff --git a/.Rbuildignore 
b/.Rbuildignore index 8cc3750..d32b58f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ ^LICENSE\.md$ .git/ .github/ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/lint-package.yaml b/.github/workflows/lint-package.yaml new file mode 100644 index 0000000..f4c4ef2 --- /dev/null +++ b/.github/workflows/lint-package.yaml @@ -0,0 +1,32 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: lint + +jobs: + lint: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::lintr, local::. + needs: lint + + - name: Lint + run: lintr::lint_package() + shell: Rscript {0} + env: + LINTR_ERROR_ON_LINT: true From deca16e5423f8f023e7c7e54420f8d5b5c3e663b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 17:31:24 +0100 Subject: [PATCH 05/33] Formatting changes for `lintr` --- R/connect_factset_db.R | 41 +++++++++++++++++++++------- R/get_factset_entity_info.R | 54 +++++++++++++++++++++++++++++-------- 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index 00e0c37..fbd66e2 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -11,31 +11,44 @@ connect_factset_db <- keyring_service_name = "2dii_factset_database") { if (username == "") { - logger::log_error("No database username could be found. Please set the username as an environment variable") + logger::log_error( + "No database username could be found. 
", + "Please set the username as an environment variable" + ) } if (password == "") { # if password not defined in .env, look in systems keyring if (requireNamespace("keyring", quietly = TRUE)) { - if (!username %in% keyring::key_list(service = keyring_service_name)$username) { + if ( + !username %in% keyring::key_list( + service = keyring_service_name + )$username + ) { + keyring_prompt <- paste( + "Enter password for the FactSet database", + "(it will be stored in your system's keyring):" + ) keyring::key_set( service = keyring_service_name, username = username, - prompt = "Enter password for the FactSet database (it will be stored in your system's keyring): " + prompt = keyring_prompt ) } password <- keyring::key_get( service = keyring_service_name, username = username ) - } else if (interactive() && requireNamespace("rstudioapi", quietly = TRUE)) { + } else if ( + interactive() && requireNamespace("rstudioapi", quietly = TRUE) + ) { password <- rstudioapi::askForPassword( prompt = "Please enter the FactSet database password:" ) } else { logger::log_error( - "No database password could be found. Please set the password - as an environment variable" + "No database password could be found. 
", + "Please set the password as an environment variable" ) } } @@ -58,7 +71,7 @@ connect_factset_db <- } # connection finalizer to ensure connection is closed -------------------------- -# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ +# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ #nolint reg_conn_finalizer <- function(conn, close_fun, envir) { is_parent_global <- identical(.GlobalEnv, envir) @@ -70,7 +83,11 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { reg.finalizer(env_finalizer, function(e) { if (DBI::dbIsValid(e$conn)) { - logger::log_warn("Warning: A database connection was closed automatically because the connection object was removed or the R session was closed.") + logger::log_warn( + "Warning: A database connection was closed automatically ", + "because the connection object was removed ", + "or the R session was closed." + ) try(close_fun(e$conn)) } }, onexit = TRUE) @@ -82,8 +99,12 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { host <- DBI::dbGetInfo(conn)$host logger::log_warn( - "The database connection to {dbname} on {host} was - closed automatically because the calling environment was closed." + "The database connection to ", + dbname, + " on ", + host, + " was closed automatically ", + "because the calling environment was closed." 
) try(close_fun(conn)) } diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 6455bd3..8c8a2b3 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -46,7 +46,10 @@ get_factset_entity_info <- factset_entity_id__factset_sector_desc <- factset_entity_id__sector_code %>% - dplyr::left_join(factset_sector_code__factset_sector_desc, by = c("sector_code" = "factset_sector_code")) %>% + dplyr::left_join( + factset_sector_code__factset_sector_desc, + by = c("sector_code" = "factset_sector_code") + ) %>% dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") @@ -63,20 +66,35 @@ get_factset_entity_info <- factset_entity_id__factset_industry_desc <- factset_entity_id__industry_code %>% - dplyr::left_join(factset_industry_code_factset_industry_desc, by = c("industry_code" = "factset_industry_code")) %>% - dplyr::select("factset_entity_id", "industry_code", "factset_industry_desc") + dplyr::left_join( + factset_industry_code_factset_industry_desc, + by = c("industry_code" = "factset_industry_code") + ) %>% + dplyr::select( + "factset_entity_id", + "industry_code", + "factset_industry_desc" + ) # credit risk parent ------------------------------------------------------- logger::log_trace("Accessing entity credit risk parent.") - ent_v1_ent_entity_affiliates <- dplyr::tbl(factset_db, "ent_v1_ent_entity_affiliates") - ref_v2_affiliate_type_map <- dplyr::tbl(factset_db, "ref_v2_affiliate_type_map") + ent_v1_ent_entity_affiliates <- dplyr::tbl( + factset_db, + "ent_v1_ent_entity_affiliates" + ) + ref_v2_affiliate_type_map <- dplyr::tbl( + factset_db, + "ref_v2_affiliate_type_map" + ) ent_entity_affiliates_last_update <- dplyr::tbl(factset_db, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% - dplyr::filter(.data$begin_time == max(.data$begin_time, na.rm = TRUE)) %>% + dplyr::filter( + .data$begin_time == max(.data$begin_time, na.rm = TRUE) + ) %>% dplyr::pull("begin_time") 
factset_entity_id__credit_parent_id <- @@ -87,7 +105,9 @@ get_factset_entity_info <- factset_entity_id = "factset_affiliated_entity_id", credit_parent_id = "factset_entity_id" ) %>% - dplyr::mutate(ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update) + dplyr::mutate( + ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update + ) # merge and collect -------------------------------------------------------- @@ -95,10 +115,22 @@ get_factset_entity_info <- logger::log_trace("Merging entity info.") entity_info <- factset_entity_id__entity_proper_name %>% - dplyr::left_join(factset_entity_id__iso_country, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__factset_sector_desc, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__factset_industry_desc, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__credit_parent_id, by = "factset_entity_id") + dplyr::left_join( + factset_entity_id__iso_country, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__factset_sector_desc, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__factset_industry_desc, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__credit_parent_id, + by = "factset_entity_id" + ) logger::log_trace("Downloading merged entity info from database.") entity_info <- dplyr::collect(entity_info) From c4f788af15f3155e33f02ce6885e01f63c5d9b4a Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 17:56:53 +0100 Subject: [PATCH 06/33] Add overall exporting function --- Dockerfile | 2 +- NAMESPACE | 1 + R/export_pacta_files.R | 24 ++++++++++++++++++++++++ man/export_pacta_files.Rd | 21 +++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 R/export_pacta_files.R create mode 100644 man/export_pacta_files.Rd diff --git a/Dockerfile b/Dockerfile index ac67d99..ab60f34 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,4 
+58,4 @@ RUN Rscript -e "\ USER runner-workflow-factset # set default run behavior -CMD ["input_dir/default_config.json"] +CMD ["Rscript", "-e", "workflow.factset::export_pacta_files()"] diff --git a/NAMESPACE b/NAMESPACE index f4f7aae..19bc627 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,5 @@ # Generated by roxygen2: do not edit by hand +export(export_pacta_files) export(get_factset_entity_info) importFrom(dplyr,"%>%") diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R new file mode 100644 index 0000000..733e0eb --- /dev/null +++ b/R/export_pacta_files.R @@ -0,0 +1,24 @@ +#' Export files for use in PACTA data preparation +#' +#' @param Destination directory for the output files +#' +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return NULL +#' +#' @export + +export_pacta_files <- function( + destination = file.path("."), + data_timestamp = Sys.time(), + ... +) { + + factset_entity_info_path <- file.path(destination, "factset_entity_info.rds") + logger::log_info("Fetching entity info data... ") + entity_info <- get_factset_entity_info(...) + saveRDS(object = entity_info, file = factset_entity_info_path) + + return(invisible(NULL)) +} diff --git a/man/export_pacta_files.Rd b/man/export_pacta_files.Rd new file mode 100644 index 0000000..7977cfa --- /dev/null +++ b/man/export_pacta_files.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export_pacta_files.R +\name{export_pacta_files} +\alias{export_pacta_files} +\title{Export files for use in PACTA data preparation} +\usage{ +export_pacta_files( + destination = file.path("."), + data_timestamp = Sys.time(), + ... 
+) } \arguments{ \item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for specifying database connection parameters)} \item{Destination}{directory for the output files} } \description{ Export files for use in PACTA data preparation } From 0bc81eae941ba9d6e394b255c29ea857b51639d3 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 22:40:39 +0100 Subject: [PATCH 07/33] install package deps separate from package this allows us to leverage the build cache --- Dockerfile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ab60f34..1d8a5d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,14 @@ RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_ # install packages for dependency resolution and installation && Rscript -e "install.packages(c('pak', 'jsonlite'))" +# Install R dependencies +COPY DESCRIPTION /workflow.factset/DESCRIPTION + +# install R package dependencies +RUN Rscript -e "\ + deps <- pak::local_install_deps(root = '/workflow.factset'); \ + " + # copy in everything from this repo COPY . 
/workflow.factset @@ -58,4 +66,4 @@ RUN Rscript -e "\ USER runner-workflow-factset # set default run behavior -CMD ["Rscript", "-e", "workflow.factset::export_pacta_files()"] +CMD ["Rscript", "-e", "logger::log_threshold(Sys.getenv('LOG_LEVEL', 'INFO'));workflow.factset::export_pacta_files()"] From fce4809e907856e403387c0d12af210fc785fb38 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 22:43:36 +0100 Subject: [PATCH 08/33] Pull arguments for exporting function from envvars --- .gitignore | 1 + R/connect_factset_db.R | 12 ++++---- R/export_pacta_files.R | 63 ++++++++++++++++++++++++++++++++++++++---- README.md | 7 +++++ example.env | 7 +++++ 5 files changed, 79 insertions(+), 11 deletions(-) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 example.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c49bd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index fbd66e2..8b44a3f 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -2,13 +2,13 @@ connect_factset_db <- function( - dbname = "delta", - host = "data-eval-db.postgres.database.azure.com", - port = 5432L, + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), options = "-c search_path=fds", - username = Sys.getenv("R_DATABASE_USER"), - password = Sys.getenv("R_DATABASE_PASSWORD"), - keyring_service_name = "2dii_factset_database") { + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD"), + keyring_service_name = "factset_database") { if (username == "") { logger::log_error( diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 733e0eb..30b3745 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -10,15 +10,68 @@ #' @export export_pacta_files <- function( - destination = file.path("."), - data_timestamp = Sys.time(), + destination = 
file.path(Sys.getenv("EXPORT_DESTINATION")), + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), ... ) { - factset_entity_info_path <- file.path(destination, "factset_entity_info.rds") - logger::log_info("Fetching entity info data... ") + # Prepare output directories + + if (!dir.exists(destination)) { + logger::log_error( + "The destination directory {destination} does not exist." + ) + stop("Destination directory does not exist.") + } + + if (Sys.getenv("DEPLOY_START_TIME") == "") { + logger::log_warn( + "The environment variable DEPLOY_START_TIME is not set. ", + "Using current system time as start time." + ) + } + + start_time <- Sys.getenv( + "DEPLOY_START_TIME", + format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), + ) + + if (inherits(data_timestamp, "character")) { + data_timestamp <- lubridate::ymd_hms( + data_timestamp, + quiet = TRUE, + tz = "UTC", + truncated = 3 + ) + } + + if (inherits(data_timestamp, "POSIXct")) { + data_timestamp <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + } + + export_dir <- file.path( + destination, + paste0(data_timestamp, "_pulled", start_time) + ) + + if (!dir.exists(export_dir)) { + dir.create(export_dir, recursive = TRUE) + } + + # Start Extracting Data + + factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") + logger::log_info("Fetching entity info data.") entity_info <- get_factset_entity_info(...) 
+ logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) - return(invisible(NULL)) + log_info("Done with data export.") + return( + invisible( + list( + factset_entity_info_path = factset_entity_info_path + ) + ) + ) } diff --git a/README.md b/README.md new file mode 100644 index 0000000..e1038c5 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# workflow.pacta + +## Running container + +```sh +docker run -i -t --rm --env-file=.env -v ./foo:/mnt/factset-data IMAGE_NAME +``` diff --git a/example.env b/example.env new file mode 100644 index 0000000..e615517 --- /dev/null +++ b/example.env @@ -0,0 +1,7 @@ +DEPLOY_START_TIME=20000101T000001 +EXPORT_DESTINATION=/mnt/factset-data +LOG_LEVEL=TRACE +PGDATABASE=FDS +PGHOST=postgres.example.com +PGPASSWORD=SuperSecrtPassw0rd +PGUSER=postgres From 6dc886200271d17742a91a6f3d057b7a80d9f09d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 16:39:22 +0100 Subject: [PATCH 09/33] improve error logging --- R/export_pacta_files.R | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 30b3745..17780c6 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -31,7 +31,7 @@ export_pacta_files <- function( ) } - start_time <- Sys.getenv( + start_time_chr <- Sys.getenv( "DEPLOY_START_TIME", format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), ) @@ -46,12 +46,19 @@ export_pacta_files <- function( } if (inherits(data_timestamp, "POSIXct")) { - data_timestamp <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + data_timestamp_chr <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + } else { + logger::log_error( + "The data_timestamp argument must be a POSIXct object ", + "or a character string coercible to POSIXct format", + " (using lubridate::ymd_hms(truncated = 3))." 
+ ) + stop("Invalid data_timestamp argument.") } export_dir <- file.path( destination, - paste0(data_timestamp, "_pulled", start_time) + paste0(data_timestamp_chr, "_pulled", start_time_chr) ) if (!dir.exists(export_dir)) { @@ -66,7 +73,7 @@ export_pacta_files <- function( logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) - log_info("Done with data export.") + logger::log_info("Done with data export.") return( invisible( list( From 8891ad4f1a07de34bf09c193a90e2cf8028e77bd Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 17:17:24 +0100 Subject: [PATCH 10/33] Add Azure deploy Template --- .gitignore | 1 + README.md | 10 +++ azure-deploy.json | 160 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 azure-deploy.json diff --git a/.gitignore b/.gitignore index 4c49bd7..e88cb47 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .env +azure-deploy.parameters.json diff --git a/README.md b/README.md index e1038c5..1a81e56 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,13 @@ ```sh docker run -i -t --rm --env-file=.env -v ./foo:/mnt/factset-data IMAGE_NAME ``` + +```sh +# change this value as needed. +RESOURCEGROUP="myResourceGroup" + +# run from repo root + +az deployment group create --resource-group "$RESOURCEGROUP" --template-file azure-deploy.json --parameters @azure-deploy.parameters.json + +``` diff --git a/azure-deploy.json b/azure-deploy.json new file mode 100644 index 0000000..443cab3 --- /dev/null +++ b/azure-deploy.json @@ -0,0 +1,160 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "0.0.0.5", + + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Location for all resources." 
+ } + }, + "identity": { + "type": "string", + "metadata": { + "description": "The ID of the user assigned identity to use for the container group." + } + }, + "containerGroupName": { + "type": "string", + "metadata": { + "description": "The name of the container group." + } + }, + "restartPolicy": { + "type": "string", + "defaultValue": "OnFailure", + "allowedValues": [ + "Always", + "Never", + "OnFailure" + ], + "metadata": { + "description": "The behavior of Azure runtime if container has stopped." + } + }, + "rawdata-storageaccountkey": { + "type": "securestring", + "metadata": { + "description": "The storage account key for the rawdata storage account." + } + }, + "database-password": { + "type": "securestring", + "metadata": { + "description": "password to connect to database" + } + }, + "starttime": { + "type": "string", + "defaultValue": "[utcNow()]", + "metadata": { + "description": "The time to start the container group." + } + } + }, + + "variables": { + "PGDATABASE": "FDS", + "PGHOST": "[concat('factset-01-postgres', '.postgres.database.azure.com')]", + "PGUSER": "postgres", + "containerregistry": "ghcr.io/rmi-pacta", + "machineCpuCores": 1, + "machineMemoryInGB": 4, + "mountPathExport": "/mnt/factset-extracted" + }, + + "functions": [], + + "resources": [ + { + "type": "Microsoft.ContainerInstance/containerGroups", + "apiVersion": "2021-09-01", + "name": "[parameters('containerGroupName')]", + "location": "[parameters('location')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[parameters('identity')]": {} + } + }, + "properties": { + "containers": [ + { + "name": "loader-runner", + "properties": { + "image": "[concat(variables('containerregistry'),'/workflow.factset:pr1')]", + "ports": [], + "resources": { + "requests": { + "cpu": "[variables('machineCpuCores')]", + "memoryInGB": "[variables('machineMemoryInGB')]" + } + }, + "environmentVariables": [ + { + "name": "PGUSER", + "value": "[variables('PGUSER')]" + }, + { + 
"name": "PGPASSWORD", + "secureValue": "[parameters('database-password')]" + }, + { + "name": "PGHOST", + "value": "[variables('PGHOST')]" + }, + { + "name": "PGDATABASE", + "value": "[variables('PGDATABASE')]" + }, + { + "name": "DEPLOY_START_TIME", + "value": "[parameters('starttime')]" + }, + { + "name": "MACHINE_CORES", + "value": "[variables('machineCpuCores')]" + }, + { + "name": "LOG_LEVEL", + "value": "TRACE" + }, + { + "name": "EXPORT_DESTINATION", + "value": "[variables('mountPathExport')]" + }, + { + "name": "DATA_TIMESTAMP", + "value": "20230123" + } + + ], + "volumeMounts": [ + { + "name": "factset-extracted", + "mountPath": "[variables('mountPathExport')]" + } + ] + } + } + ], + "restartPolicy": "[parameters('restartPolicy')]", + "osType": "Linux", + "volumes": [ + { + "name": "factset-extracted", + "azureFile": { + "shareName": "factset-extracted", + "readOnly": false, + "storageAccountName": "pactarawdata", + "storageAccountKey": "[parameters('rawdata-storageaccountkey')]" + } + } + ] + } + } + ], + "outputs": {} +} From 841cf6561365567aa415f2b30e3994f6b5f7898d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 17:18:55 +0100 Subject: [PATCH 11/33] linting --- R/export_pacta_files.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 17780c6..8d1dec5 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -34,7 +34,7 @@ export_pacta_files <- function( start_time_chr <- Sys.getenv( "DEPLOY_START_TIME", format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), - ) + ) if (inherits(data_timestamp, "character")) { data_timestamp <- lubridate::ymd_hms( @@ -46,7 +46,11 @@ export_pacta_files <- function( } if (inherits(data_timestamp, "POSIXct")) { - data_timestamp_chr <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + data_timestamp_chr <- format( + data_timestamp, + format = "%Y%m%dT%H%M%S", + tz = "UTC" + ) } else { 
logger::log_error( "The data_timestamp argument must be a POSIXct object ", @@ -59,7 +63,7 @@ export_pacta_files <- function( export_dir <- file.path( destination, paste0(data_timestamp_chr, "_pulled", start_time_chr) - ) + ) if (!dir.exists(export_dir)) { dir.create(export_dir, recursive = TRUE) From b52af91b8181e4441fed42262d34324da8811790 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 09:19:37 +0100 Subject: [PATCH 12/33] Copy get_factset_financial_data copy function from pacta.data.preparation --- R/get_factset_financial_data.R | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 R/get_factset_financial_data.R diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R new file mode 100644 index 0000000..dd809d5 --- /dev/null +++ b/R/get_factset_financial_data.R @@ -0,0 +1,80 @@ +#' Get the factset financial data from the FactSet database and prepare the +#' `factset_financial_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_financial_data.rds` output file +#' +#' @export + +get_factset_financial_data <- + function(data_timestamp, ...) { + # build connection to database --------------------------------------------- + + factset_db <- connect_factset_db(...) 
+ + + # fsym_id__factset_entity_id ----------------------------------------------- + + fsym_id__factset_entity_id <- + tbl(factset_db, "own_v5_own_sec_entity") %>% + select("fsym_id", "factset_entity_id") + + + # isin --------------------------------------------------------------------- + + fsym_id__isin <- tbl(factset_db, "sym_v1_sym_isin") + + + # adj_price ---------------------------------------------------------------- + + fsym_id__adj_price <- + tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::filter(.data$price_date == .env$data_timestamp) %>% + select("fsym_id", "adj_price") + + + # adj_shares_outstanding --------------------------------------------------- + + fsym_id__adj_shares_outstanding <- + tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::filter(.data$price_date == .env$data_timestamp) %>% + select("fsym_id", "adj_shares_outstanding") + + + # issue_type --------------------------------------------------------------- + + fsym_id__issue_type <- + tbl(factset_db, "own_v5_own_sec_coverage") %>% + select("fsym_id", "issue_type") + + + # one_adr_eq --------------------------------------------------------------- + + fsym_id__one_adr_eq <- + tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + select("fsym_id" = "adr_fsym_id", "one_adr_eq") + + + # merge and collect -------------------------------------------------------- + + fin_data <- + fsym_id__isin %>% + left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + left_join(fsym_id__adj_price, by = "fsym_id") %>% + left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% + left_join(fsym_id__issue_type, by = "fsym_id") %>% + left_join(fsym_id__one_adr_eq, by = "fsym_id") %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return prepared data ----------------------------------------------------- + + fin_data + } From bf22e52ca9c16fc6a3b95d24dca68afb868dba98 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:05:17 +0100 Subject: [PATCH 13/33] Add financial 
data to export --- R/export_pacta_files.R | 12 ++++++++ R/get_factset_financial_data.R | 53 +++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 8d1dec5..1bf2589 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -71,6 +71,18 @@ export_pacta_files <- function( # Start Extracting Data + factset_financial_data_path <- file.path( + export_dir, + "factset_financial_data.rds" + ) + logger::log_info("Fetching financial data.") + financial_data <- get_factset_financial_data( + data_timestamp = data_timestamp, + ... + ) + logger::log_info("Exporting financial data to {factset_financial_data_path}") + saveRDS(object = financial_data, file = factset_financial_data_path) + factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") logger::log_info("Fetching entity info data.") entity_info <- get_factset_entity_info(...) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index dd809d5..30acd20 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -17,64 +17,77 @@ get_factset_financial_data <- factset_db <- connect_factset_db(...) 
+ logger::log_debug("Extracting financial info from database.") + logger::log_info("using data timestamp: ", data_timestamp) + # fsym_id__factset_entity_id ----------------------------------------------- + logger::log_trace("Accessing entity id.") fsym_id__factset_entity_id <- - tbl(factset_db, "own_v5_own_sec_entity") %>% - select("fsym_id", "factset_entity_id") + dplyr::tbl(factset_db, "own_v5_own_sec_entity") %>% + dplyr::select("fsym_id", "factset_entity_id") # isin --------------------------------------------------------------------- - fsym_id__isin <- tbl(factset_db, "sym_v1_sym_isin") + logger::log_trace("Accessing ISINs.") + fsym_id__isin <- dplyr::tbl(factset_db, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- + browser() + logger::log_trace("Accessing share prices.") fsym_id__adj_price <- - tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% - select("fsym_id", "adj_price") + dplyr::select("fsym_id", "adj_price") # adj_shares_outstanding --------------------------------------------------- + logger::log_trace("Accessing shares outstanding.") fsym_id__adj_shares_outstanding <- - tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% - select("fsym_id", "adj_shares_outstanding") + dplyr::select("fsym_id", "adj_shares_outstanding") # issue_type --------------------------------------------------------------- + logger::log_trace("Accessing issue type.") fsym_id__issue_type <- - tbl(factset_db, "own_v5_own_sec_coverage") %>% - select("fsym_id", "issue_type") + dplyr::tbl(factset_db, "own_v5_own_sec_coverage") %>% + dplyr::select("fsym_id", "issue_type") # one_adr_eq --------------------------------------------------------------- + logger::log_trace("Accessing ADR equivilents.") fsym_id__one_adr_eq <- - 
tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% - select("fsym_id" = "adr_fsym_id", "one_adr_eq") + dplyr::tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") # merge and collect -------------------------------------------------------- + logger::log_trace("Merging financial info.") fin_data <- fsym_id__isin %>% - left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - left_join(fsym_id__adj_price, by = "fsym_id") %>% - left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% - left_join(fsym_id__issue_type, by = "fsym_id") %>% - left_join(fsym_id__one_adr_eq, by = "fsym_id") %>% - dplyr::collect() + dplyr::left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + dplyr::left_join(fsym_id__adj_price, by = "fsym_id") %>% + dplyr::left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% + dplyr::left_join(fsym_id__issue_type, by = "fsym_id") %>% + dplyr::left_join(fsym_id__one_adr_eq, by = "fsym_id") - DBI::dbDisconnect(factset_db) + logger::log_trace("Downloading merged financial info from database.") + fin_data <- dplyr::collect(fin_data) + logger::log_trace("Download complete.") + logger::log_trace("Disconnecting from database.") + DBI::dbDisconnect(factset_db) # return prepared data ----------------------------------------------------- - - fin_data + return(fin_data) } From 5842ca752d178dd15dca6f391fa466de12587fee Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:05:35 +0100 Subject: [PATCH 14/33] Increase memory request --- azure-deploy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-deploy.json b/azure-deploy.json index 443cab3..6a1a685 100644 --- a/azure-deploy.json +++ b/azure-deploy.json @@ -61,7 +61,7 @@ "PGUSER": "postgres", "containerregistry": "ghcr.io/rmi-pacta", "machineCpuCores": 1, - "machineMemoryInGB": 4, + "machineMemoryInGB": 16, "mountPathExport": "/mnt/factset-extracted" }, From 574bc0e040f4cb0f63e3be7e16b38eea4eca1022 
Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:28:49 +0100 Subject: [PATCH 15/33] Externalize DB connection from extraction function --- R/export_pacta_files.R | 18 +++++++++--------- R/get_factset_entity_info.R | 29 +++++++++++------------------ R/get_factset_financial_data.R | 22 ++++++++-------------- 3 files changed, 28 insertions(+), 41 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 1bf2589..feeed0d 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -2,17 +2,17 @@ #' #' @param Destination directory for the output files #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param destination path to directory where exported files will be saved +#' @param data_timestamp filter data as-of this timestamp #' -#' @return NULL +#' @return vector of paths to exported files #' #' @export export_pacta_files <- function( + conn = connect_factset_db(), destination = file.path(Sys.getenv("EXPORT_DESTINATION")), - data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), - ... + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()) ) { # Prepare output directories @@ -77,22 +77,22 @@ export_pacta_files <- function( ) logger::log_info("Fetching financial data.") financial_data <- get_factset_financial_data( - data_timestamp = data_timestamp, - ... + conn = conn, + data_timestamp = data_timestamp ) logger::log_info("Exporting financial data to {factset_financial_data_path}") saveRDS(object = financial_data, file = factset_financial_data_path) factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") logger::log_info("Fetching entity info data.") - entity_info <- get_factset_entity_info(...) 
+ entity_info <- get_factset_entity_info(conn = conn) logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) logger::log_info("Done with data export.") return( invisible( - list( + c( factset_entity_info_path = factset_entity_info_path ) ) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 8c8a2b3..f3939bf 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -1,8 +1,7 @@ #' Get the entity info data from the FactSet database and prepare the #' `factset_entity_info` tibble #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param conn database connection #' #' @return A tibble properly prepared to be saved as the #' `factset_entity_info.rds` output file @@ -10,18 +9,16 @@ #' @export get_factset_entity_info <- - function(...) { + function(conn) { # build connection to database --------------------------------------------- - factset_db <- connect_factset_db(...) 
- logger::log_debug("Extracting entity info from database.") # company_name ------------------------------------------------------------- logger::log_trace("Accessing entity proper names.") factset_entity_id__entity_proper_name <- - dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "entity_proper_name") @@ -29,7 +26,7 @@ get_factset_entity_info <- logger::log_trace("Accessing entity country of domicile.") factset_entity_id__iso_country <- - dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "iso_country") @@ -37,11 +34,11 @@ get_factset_entity_info <- logger::log_trace("Accessing entity sector.") factset_entity_id__sector_code <- - dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") factset_sector_code__factset_sector_desc <- - dplyr::tbl(factset_db, "ref_v2_factset_sector_map") %>% + dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) factset_entity_id__factset_sector_desc <- @@ -57,11 +54,11 @@ get_factset_entity_info <- logger::log_trace("Accessing entity industry/sector/subsector.") factset_entity_id__industry_code <- - dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") factset_industry_code_factset_industry_desc <- - dplyr::tbl(factset_db, "ref_v2_factset_industry_map") %>% + dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") factset_entity_id__factset_industry_desc <- @@ -81,16 +78,16 @@ get_factset_entity_info <- logger::log_trace("Accessing entity credit risk parent.") ent_v1_ent_entity_affiliates <- dplyr::tbl( - factset_db, + conn, "ent_v1_ent_entity_affiliates" ) 
ref_v2_affiliate_type_map <- dplyr::tbl( - factset_db, + conn, "ref_v2_affiliate_type_map" ) ent_entity_affiliates_last_update <- - dplyr::tbl(factset_db, "fds_fds_file_history") %>% + dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% dplyr::filter( .data$begin_time == max(.data$begin_time, na.rm = TRUE) @@ -136,10 +133,6 @@ get_factset_entity_info <- entity_info <- dplyr::collect(entity_info) logger::log_trace("Download complete.") - logger::log_trace("Disconnecting from database.") - DBI::dbDisconnect(factset_db) - - # return prepared data ----------------------------------------------------- return(entity_info) } diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 30acd20..bfbe3c4 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -1,10 +1,9 @@ #' Get the factset financial data from the FactSet database and prepare the #' `factset_financial_data` tibble #' +#' @param conn database connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the #' `factset_financial_data.rds` output file @@ -12,11 +11,9 @@ #' @export get_factset_financial_data <- - function(data_timestamp, ...) { + function(conn, data_timestamp, ...) { # build connection to database --------------------------------------------- - factset_db <- connect_factset_db(...) 
- logger::log_debug("Extracting financial info from database.") logger::log_info("using data timestamp: ", data_timestamp) @@ -25,14 +22,14 @@ get_factset_financial_data <- logger::log_trace("Accessing entity id.") fsym_id__factset_entity_id <- - dplyr::tbl(factset_db, "own_v5_own_sec_entity") %>% + dplyr::tbl(conn, "own_v5_own_sec_entity") %>% dplyr::select("fsym_id", "factset_entity_id") # isin --------------------------------------------------------------------- logger::log_trace("Accessing ISINs.") - fsym_id__isin <- dplyr::tbl(factset_db, "sym_v1_sym_isin") + fsym_id__isin <- dplyr::tbl(conn, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- @@ -40,7 +37,7 @@ get_factset_financial_data <- browser() logger::log_trace("Accessing share prices.") fsym_id__adj_price <- - dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_price") @@ -49,7 +46,7 @@ get_factset_financial_data <- logger::log_trace("Accessing shares outstanding.") fsym_id__adj_shares_outstanding <- - dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_shares_outstanding") @@ -58,7 +55,7 @@ get_factset_financial_data <- logger::log_trace("Accessing issue type.") fsym_id__issue_type <- - dplyr::tbl(factset_db, "own_v5_own_sec_coverage") %>% + dplyr::tbl(conn, "own_v5_own_sec_coverage") %>% dplyr::select("fsym_id", "issue_type") @@ -66,7 +63,7 @@ get_factset_financial_data <- logger::log_trace("Accessing ADR equivilents.") fsym_id__one_adr_eq <- - dplyr::tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + dplyr::tbl(conn, "own_v5_own_sec_adr_ord_ratio") %>% dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") @@ -85,9 +82,6 @@ get_factset_financial_data <- fin_data <- 
dplyr::collect(fin_data) logger::log_trace("Download complete.") - logger::log_trace("Disconnecting from database.") - DBI::dbDisconnect(factset_db) - # return prepared data ----------------------------------------------------- return(fin_data) } From 3623846dbe994b24b6c6ca26693e9c12f53ac1d9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 12:19:51 +0100 Subject: [PATCH 16/33] Update namespace to include .data and .env These are reexported form rlang to avoid lintr errors --- DESCRIPTION | 1 + NAMESPACE | 3 +++ R/workflow.factset-package.R | 2 ++ 3 files changed, 6 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 852f0ee..4a5c654 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Imports: dbplyr, dplyr, logger, + rlang, RPostgres, withr Suggests: diff --git a/NAMESPACE b/NAMESPACE index 19bc627..a26e306 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,4 +2,7 @@ export(export_pacta_files) export(get_factset_entity_info) +export(get_factset_financial_data) importFrom(dplyr,"%>%") +importFrom(rlang,.data) +importFrom(rlang,.env) diff --git a/R/workflow.factset-package.R b/R/workflow.factset-package.R index 2f30195..4293484 100644 --- a/R/workflow.factset-package.R +++ b/R/workflow.factset-package.R @@ -3,5 +3,7 @@ ## usethis namespace: start #' @importFrom dplyr %>% +#' @importFrom rlang .data +#' @importFrom rlang .env ## usethis namespace: end NULL From 4ded8ce62b282d447e340dda8aa86601dba94e7d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 13:06:30 +0100 Subject: [PATCH 17/33] Simplify variable names --- R/get_factset_financial_data.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index bfbe3c4..a076f28 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -18,10 +18,10 @@ get_factset_financial_data <- logger::log_info("using data timestamp: ", data_timestamp) - # 
fsym_id__factset_entity_id ----------------------------------------------- + # factset_entity_id ----------------------------------------------- logger::log_trace("Accessing entity id.") - fsym_id__factset_entity_id <- + factset_entity_id <- dplyr::tbl(conn, "own_v5_own_sec_entity") %>% dplyr::select("fsym_id", "factset_entity_id") @@ -29,14 +29,14 @@ get_factset_financial_data <- # isin --------------------------------------------------------------------- logger::log_trace("Accessing ISINs.") - fsym_id__isin <- dplyr::tbl(conn, "sym_v1_sym_isin") + isin <- dplyr::tbl(conn, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- browser() logger::log_trace("Accessing share prices.") - fsym_id__adj_price <- + adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_price") @@ -45,7 +45,7 @@ get_factset_financial_data <- # adj_shares_outstanding --------------------------------------------------- logger::log_trace("Accessing shares outstanding.") - fsym_id__adj_shares_outstanding <- + adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_shares_outstanding") @@ -54,7 +54,7 @@ get_factset_financial_data <- # issue_type --------------------------------------------------------------- logger::log_trace("Accessing issue type.") - fsym_id__issue_type <- + issue_type <- dplyr::tbl(conn, "own_v5_own_sec_coverage") %>% dplyr::select("fsym_id", "issue_type") @@ -62,7 +62,7 @@ get_factset_financial_data <- # one_adr_eq --------------------------------------------------------------- logger::log_trace("Accessing ADR equivilents.") - fsym_id__one_adr_eq <- + one_adr_eq <- dplyr::tbl(conn, "own_v5_own_sec_adr_ord_ratio") %>% dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") @@ -71,12 +71,12 @@ get_factset_financial_data <- 
logger::log_trace("Merging financial info.") fin_data <- - fsym_id__isin %>% - dplyr::left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - dplyr::left_join(fsym_id__adj_price, by = "fsym_id") %>% - dplyr::left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% - dplyr::left_join(fsym_id__issue_type, by = "fsym_id") %>% - dplyr::left_join(fsym_id__one_adr_eq, by = "fsym_id") + isin %>% + dplyr::left_join(factset_entity_id, by = "fsym_id") %>% + dplyr::left_join(adj_price, by = "fsym_id") %>% + dplyr::left_join(adj_shares_outstanding, by = "fsym_id") %>% + dplyr::left_join(issue_type, by = "fsym_id") %>% + dplyr::left_join(one_adr_eq, by = "fsym_id") logger::log_trace("Downloading merged financial info from database.") fin_data <- dplyr::collect(fin_data) From ee5a6cd0195334289a81d7ee2a7c5d0a3f4856b1 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 13:18:12 +0100 Subject: [PATCH 18/33] simplify variable names --- R/get_factset_entity_info.R | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index f3939bf..5333dd9 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -17,7 +17,7 @@ get_factset_entity_info <- # company_name ------------------------------------------------------------- logger::log_trace("Accessing entity proper names.") - factset_entity_id__entity_proper_name <- + entity_proper_name <- dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "entity_proper_name") @@ -25,7 +25,7 @@ get_factset_entity_info <- # country_of_domicile ------------------------------------------------------ logger::log_trace("Accessing entity country of domicile.") - factset_entity_id__iso_country <- + iso_country <- dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "iso_country") @@ -33,18 +33,18 @@ get_factset_entity_info <- # sector 
------------------------------------------------------------------- logger::log_trace("Accessing entity sector.") - factset_entity_id__sector_code <- + sector_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") - factset_sector_code__factset_sector_desc <- + sector_code__sector_desc <- dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) - factset_entity_id__factset_sector_desc <- - factset_entity_id__sector_code %>% + factset_sector_desc <- + sector_code %>% dplyr::left_join( - factset_sector_code__factset_sector_desc, + sector_code__sector_desc, by = c("sector_code" = "factset_sector_code") ) %>% dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") @@ -53,18 +53,18 @@ get_factset_entity_info <- # sub-sector/industry ------------------------------------------------------ logger::log_trace("Accessing entity industry/sector/subsector.") - factset_entity_id__industry_code <- + industry_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") - factset_industry_code_factset_industry_desc <- + industry_code__industry_desc <- dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") - factset_entity_id__factset_industry_desc <- - factset_entity_id__industry_code %>% + factset_industry_desc <- + industry_code %>% dplyr::left_join( - factset_industry_code_factset_industry_desc, + industry_code__industry_desc, by = c("industry_code" = "factset_industry_code") ) %>% dplyr::select( @@ -94,7 +94,7 @@ get_factset_entity_info <- ) %>% dplyr::pull("begin_time") - factset_entity_id__credit_parent_id <- + credit_parent_id <- ent_v1_ent_entity_affiliates %>% dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% dplyr::filter(.data$aff_type_desc == "Credit Risk Parent") %>% @@ -111,21 +111,21 @@ get_factset_entity_info <- 
logger::log_trace("Merging entity info.") entity_info <- - factset_entity_id__entity_proper_name %>% + entity_proper_name %>% dplyr::left_join( - factset_entity_id__iso_country, + iso_country, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__factset_sector_desc, + factset_sector_desc, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__factset_industry_desc, + factset_industry_desc, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__credit_parent_id, + credit_parent_id, by = "factset_entity_id" ) From 9b436d6d34a65c9e95a848a5482b617fd5ed1670 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 15:49:19 +0100 Subject: [PATCH 19/33] Terminate connection if created in function --- R/export_pacta_files.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index feeed0d..a05b46e 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -12,7 +12,11 @@ export_pacta_files <- function( conn = connect_factset_db(), destination = file.path(Sys.getenv("EXPORT_DESTINATION")), - data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()) + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), + terminate_connection = ( + # Terminate connection if it was created by this function. 
+ deparse(substitute(conn)) == formals(export_pacta_files)[["conn"]] + ) ) { # Prepare output directories @@ -90,6 +94,13 @@ export_pacta_files <- function( saveRDS(object = entity_info, file = factset_entity_info_path) logger::log_info("Done with data export.") + + # Terminate connection if needed + if (terminate_connection) { + logger::log_info("Terminating database connection.") + DBI::dbDisconnect(conn) + } + return( invisible( c( From 0a68abd59f9b3bb0441db73fcbc5b82e3e298e6d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:08:33 +0100 Subject: [PATCH 20/33] improve lgging in autofinalizing DB connection --- R/connect_factset_db.R | 154 +++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 83 deletions(-) diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index 8b44a3f..af938cd 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -1,79 +1,66 @@ -# Connection function +#' Export files for use in PACTA data preparation +#' +#' @param dbname name of the database to connect to +#' @param host hostname of the server to connect to +#' @param port port number of the server to connect to +#' @param options additional options to pass to the database connection. +#' Typically used to define schema search path. +#' @param username username to use for the database connection +#' @param password password to use for the database connection +#' +#' @return a database connection object +#' +#' @export -connect_factset_db <- - function( - dbname = Sys.getenv("PGDATABASE"), - host = Sys.getenv("PGHOST"), - port = Sys.getenv("PGPORT", 5432L), - options = "-c search_path=fds", - username = Sys.getenv("PGUSER"), - password = Sys.getenv("PGPASSWORD"), - keyring_service_name = "factset_database") { - if (username == "") { - logger::log_error( - "No database username could be found. 
", - "Please set the username as an environment variable" - ) - } +connect_factset_db <- function( + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), + options = "-c search_path=fds", + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD") +) { - if (password == "") { - # if password not defined in .env, look in systems keyring - if (requireNamespace("keyring", quietly = TRUE)) { - if ( - !username %in% keyring::key_list( - service = keyring_service_name - )$username - ) { - keyring_prompt <- paste( - "Enter password for the FactSet database", - "(it will be stored in your system's keyring):" - ) - keyring::key_set( - service = keyring_service_name, - username = username, - prompt = keyring_prompt - ) - } - password <- keyring::key_get( - service = keyring_service_name, - username = username - ) - } else if ( - interactive() && requireNamespace("rstudioapi", quietly = TRUE) - ) { - password <- rstudioapi::askForPassword( - prompt = "Please enter the FactSet database password:" - ) - } else { - logger::log_error( - "No database password could be found. ", - "Please set the password as an environment variable" - ) - } - } - - logger::log_trace( - "Connecting to database {dbname} on {host}:{port} as {username}" + if (username == "") { + logger::log_error( + "No database username could be found. ", + "Please set the username as an environment variable" ) - conn <- - DBI::dbConnect( - drv = RPostgres::Postgres(), - dbname = dbname, - host = host, - port = port, - user = username, - password = password, - options = options - ) + } - reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) + if (password == "") { + logger::log_error( + "No database password could be found. 
", + "Please set the password as an environment variable" + ) } + logger::log_trace( + "Connecting to database {dbname} on {host}:{port} as {username}" + ) + conn <- + DBI::dbConnect( + drv = RPostgres::Postgres(), + dbname = dbname, + host = host, + port = port, + user = username, + password = password, + options = options + ) + + reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) +} + # connection finalizer to ensure connection is closed -------------------------- # adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ #nolint -reg_conn_finalizer <- function(conn, close_fun, envir) { +reg_conn_finalizer <- function( + conn, + close_fun, + envir +) { is_parent_global <- identical(.GlobalEnv, envir) if (isTRUE(is_parent_global)) { @@ -83,29 +70,17 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { reg.finalizer(env_finalizer, function(e) { if (DBI::dbIsValid(e$conn)) { - logger::log_warn( - "Warning: A database connection was closed automatically ", - "because the connection object was removed ", - "or the R session was closed." - ) + warn_db_autoclose(e$conn) try(close_fun(e$conn)) } - }, onexit = TRUE) + }, + onexit = TRUE + ) } else { withr::defer( { if (DBI::dbIsValid(conn)) { - dbname <- DBI::dbGetInfo(conn)$dbname - host <- DBI::dbGetInfo(conn)$host - - logger::log_warn( - "The database connection to ", - dbname, - " on ", - host, - " was closed automatically ", - "because the calling environment was closed." 
- ) + warn_db_autoclose(conn) try(close_fun(conn)) } }, @@ -117,3 +92,16 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { logger::log_trace("Database connection registered for finalization") return(conn) } + +warn_db_autoclose <- function(conn) { + dbname <- DBI::dbGetInfo(conn)$dbname + host <- DBI::dbGetInfo(conn)$host + logger::log_warn( + "The database connection to ", + dbname, + " on ", + host, + " was closed automatically ", + "because the calling environment was closed." + ) +} From 705c303c5d3ba363730d5002bee606e7e4b855fe Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:23:53 +0100 Subject: [PATCH 21/33] Improve log messages --- R/get_factset_financial_data.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index a076f28..4be7c3c 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -35,7 +35,10 @@ get_factset_financial_data <- # adj_price ---------------------------------------------------------------- browser() - logger::log_trace("Accessing share prices.") + logger::log_trace( + "Accessing share prices. ", + "Filtering to date: {data_timestamp}" + ) adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% @@ -44,7 +47,10 @@ get_factset_financial_data <- # adj_shares_outstanding --------------------------------------------------- - logger::log_trace("Accessing shares outstanding.") + logger::log_trace( + "Accessing shares outstanding. 
", + "Filtering to date: {data_timestamp}" + ) adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% From f684b9477a43e5afb6028e29bd5f412edf6cafb5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:44:34 +0100 Subject: [PATCH 22/33] Improve logging --- R/get_factset_entity_info.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 5333dd9..61b90ea 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -37,10 +37,12 @@ get_factset_entity_info <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") + logger::log_trace("Accessing sector descriptions.") sector_code__sector_desc <- dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) + logger::log_trace("Merging sector codes and sector descriptions.") factset_sector_desc <- sector_code %>% dplyr::left_join( @@ -52,15 +54,17 @@ get_factset_entity_info <- # sub-sector/industry ------------------------------------------------------ - logger::log_trace("Accessing entity industry/sector/subsector.") + logger::log_trace("Accessing entity industry codes.") industry_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") + logger::log_trace("Accessing industry descriptions") industry_code__industry_desc <- dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") + logger::log_trace("Merging industry codes and industry descriptions.") factset_industry_desc <- industry_code %>% dplyr::left_join( @@ -76,16 +80,19 @@ get_factset_entity_info <- # credit risk parent ------------------------------------------------------- - logger::log_trace("Accessing entity credit risk parent.") + 
logger::log_trace("Accessing entity affiliates.") ent_v1_ent_entity_affiliates <- dplyr::tbl( conn, "ent_v1_ent_entity_affiliates" ) + + logger::log_trace("Accessing affiliate type map.") ref_v2_affiliate_type_map <- dplyr::tbl( conn, "ref_v2_affiliate_type_map" ) + logger::log_trace("Determining last update time for entity affiliates.") ent_entity_affiliates_last_update <- dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% @@ -94,6 +101,7 @@ get_factset_entity_info <- ) %>% dplyr::pull("begin_time") + logger::log_trace("Determining credit risk parent via entity affiliates.") credit_parent_id <- ent_v1_ent_entity_affiliates %>% dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% From 3f051261429ae770d3b693589479f7f6d42cd1d7 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:51:30 +0100 Subject: [PATCH 23/33] Simplify variable names --- R/get_factset_entity_info.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 61b90ea..9074752 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -93,12 +93,13 @@ get_factset_entity_info <- ) logger::log_trace("Determining last update time for entity affiliates.") - ent_entity_affiliates_last_update <- + affiliates_last_update <- dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% dplyr::filter( .data$begin_time == max(.data$begin_time, na.rm = TRUE) ) %>% + # pull also handles `collect`ing the data dplyr::pull("begin_time") logger::log_trace("Determining credit risk parent via entity affiliates.") @@ -111,7 +112,7 @@ get_factset_entity_info <- credit_parent_id = "factset_entity_id" ) %>% dplyr::mutate( - ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update + ent_entity_affiliates_last_update = affiliates_last_update ) From 
cdbdf920cdeab51dc33f60ca05b279b2a409efb9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 17:38:34 +0100 Subject: [PATCH 24/33] Copy factset functions from pacta.data.preparation https://github.com/RMI-PACTA/pacta.data.preparation/tree/main/data (private) --- R/get_factset_entity_financing_data.R | 84 ++++++++++++++++++++++++ R/get_factset_fund_data.R | 84 ++++++++++++++++++++++++ R/get_factset_isin_to_fund_table.R | 47 ++++++++++++++ R/get_factset_iss_emissions_data.R | 93 +++++++++++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 R/get_factset_entity_financing_data.R create mode 100644 R/get_factset_fund_data.R create mode 100644 R/get_factset_isin_to_fund_table.R create mode 100644 R/get_factset_iss_emissions_data.R diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R new file mode 100644 index 0000000..e8efeac --- /dev/null +++ b/R/get_factset_entity_financing_data.R @@ -0,0 +1,84 @@ +#' Get the entity financing data from the FactSet database and prepare the +#' `factset_entity_financing_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_entity_financing_data.rds` output file +#' +#' @export + +get_factset_entity_financing_data <- function(data_timestamp, ...) { + # connect to the FactSet database -------------------------------------------- + + factset_db <- connect_factset_db(...) 
+ + year <- lubridate::year(data_timestamp) + + + # get fsym_id to fundamentals fsym_company_id -------------------------------- + + ff_fsym_id__fsym_company_id <- tbl(factset_db, "ff_v3_ff_sec_map") + + own_fsym_id__fsym_company_id <- tbl(factset_db, "own_v5_own_sec_map") + + fsym_id__fsym_company_id <- dplyr::union_all( + ff_fsym_id__fsym_company_id, + own_fsym_id__fsym_company_id + ) + + + # get fsym_id to factset_entity_id ------------------------------------------- + + ff_sec_entity <- tbl(factset_db, "ff_v3_ff_sec_entity") + + own_sec_entity <- tbl(factset_db, "own_v5_own_sec_entity") + + sec_entity <- dplyr::union_all( + ff_sec_entity, + own_sec_entity + ) + + + # get market value data ------------------------------------------------------ + + fsym_id__ff_mkt_val <- tbl(factset_db, "ff_v3_ff_basic_der_af") %>% + select("fsym_id", "date", "currency", "ff_mkt_val") + + + # get debt outstanding data -------------------------------------------------- + + fsym_id__ff_debt <- tbl(factset_db, "ff_v3_ff_basic_af") %>% + select("fsym_id", "date", "currency", "ff_debt") + + + # merge and collect the data, then disconnect -------------------------------- + + entity_financing_data <- fsym_id__ff_mkt_val %>% + dplyr::full_join(fsym_id__ff_debt, by = c("fsym_id", "date", "currency")) %>% + left_join(fsym_id__fsym_company_id, by = "fsym_id") %>% + inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% + filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% + group_by(.data$fsym_id, .data$currency) %>% + filter(.data$date <= .env$data_timestamp) %>% + filter(lubridate::year(.data$date) == .env$year) %>% + filter(.data$date == max(.data$date)) %>% + ungroup() %>% + collect() %>% + mutate( + # convert units from millions to units + ff_mkt_val = .data$ff_mkt_val * 1e6, + ff_debt = .data$ff_debt * 1e6 + ) %>% + distinct() + + DBI::dbDisconnect(factset_db) + + + # return the entity financing data ------------------------------------------- + + 
entity_financing_data +} diff --git a/R/get_factset_fund_data.R b/R/get_factset_fund_data.R new file mode 100644 index 0000000..6ad15d7 --- /dev/null +++ b/R/get_factset_fund_data.R @@ -0,0 +1,84 @@ +#' Get the fund data from the FactSet database and prepare the +#' `factset_fund_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the `factset_fund_data.rds` +#' output file +#' +#' @export + +get_factset_fund_data <- + function(data_timestamp, ...) { + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) + + + # get the fund holdings and the holdings' reported market value ------------ + + factset_fund_id__holding_fsym_id <- + tbl(factset_db, "own_v5_own_fund_detail") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "fsym_id", + holding_reported_mv = "reported_mv" + ) + + + # -------------------------------------------------------------------------- + + factset_fund_id__generic_id <- + tbl(factset_db, "own_v5_own_fund_generic") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "generic_id", + holding_reported_mv = "reported_mv" + ) + + factset_fund_id__holding_fsym_id <- + dplyr::union_all( + factset_fund_id__holding_fsym_id, + factset_fund_id__generic_id + ) + + + # get the fund total reported market value --------------------------------- + + factset_fund_id__total_reported_mv <- + tbl(factset_db, "own_v5_own_ent_fund_filing_hist") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select("factset_fund_id", "total_reported_mv") + + + # symbology containing 
the ISIN to fsym_id link + fsym_id__isin <- + tbl(factset_db, "sym_v1_sym_isin") + + + # merge and collect the data, then disconnect ------------------------------ + + fund_data <- + factset_fund_id__total_reported_mv %>% + filter(.data$total_reported_mv != 0 | !is.na(.data$total_reported_mv)) %>% + left_join(factset_fund_id__holding_fsym_id, by = "factset_fund_id") %>% + left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% + select( + factset_fund_id = "factset_fund_id", + fund_reported_mv = "total_reported_mv", + holding_isin = "isin", + holding_reported_mv = "holding_reported_mv" + ) %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the fund data ----------------------------------------------------- + + fund_data + } diff --git a/R/get_factset_isin_to_fund_table.R b/R/get_factset_isin_to_fund_table.R new file mode 100644 index 0000000..e9ef6a3 --- /dev/null +++ b/R/get_factset_isin_to_fund_table.R @@ -0,0 +1,47 @@ +#' Get the isin_to_fund_table data from the FactSet database and prepare the +#' `factset_isin_to_fund_table` tibble +#' +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_isin_to_fund_table.rds` output file +#' +#' @export + +get_factset_isin_to_fund_table <- + function(...) { + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) 
+ + + # get the ISIN to fsym_id table -------------------------------------------- + + isin__fsym_id <- + tbl(factset_db, "sym_v1_sym_isin") %>% + select("isin", "fsym_id") + + + # get the fsym_id to fund_id table ----------------------------------------- + + fsym_id__factset_fund_id <- + tbl(factset_db, "own_v5_own_ent_fund_identifiers") %>% + dplyr::filter(.data$identifier_type == "FSYM_ID") %>% + select(fsym_id = "fund_identifier", "factset_fund_id") + + + # merge and collect the data, then disconnect ------------------------------ + + isin__factset_fund_id <- + fsym_id__factset_fund_id %>% + inner_join(isin__fsym_id, by = "fsym_id") %>% + select("isin", "fsym_id", "factset_fund_id") %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the ISIN to fund_id table ----------------------------------------- + + isin__factset_fund_id + } diff --git a/R/get_factset_iss_emissions_data.R b/R/get_factset_iss_emissions_data.R new file mode 100644 index 0000000..f5102b8 --- /dev/null +++ b/R/get_factset_iss_emissions_data.R @@ -0,0 +1,93 @@ +#' Get the ISS emissions data from the FactSet database and prepare the +#' `factset_iss_emissions` tibble +#' +#' @param year A single numeric specifying the year of data to be returned +#' @param min_estimated_trust A single numeric specifying the minimum allowed +#' "estimated trust" value +#' @param min_reported_trust A single numeric specifying the minimum allowed +#' "reported trust" value +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_iss_emissions.rds` output file +#' +#' @export + +get_factset_iss_emissions_data <- + function(year, min_estimated_trust = 0.0, min_reported_trust = 0.0, ...) 
{ + # convert `year` to date --------------------------------------------------- + year_month_date <- as.Date(paste0(year, "-01-01"), "%Y-%m-%d") + + + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) + + + # get the relevant fsym_id to factset_entity_id table ---------------------- + + fsym_id__factset_entity_id <- + tbl(factset_db, "icc_v2_icc_sec_entity_hist") %>% + # end_date identifies the date the identifier was last associated with fsym_id + # i.e. if there is no end_date (end_date == NA) then the association is still valid + filter(.data$end_date >= .env$year_month_date | is.na(.data$end_date)) %>% + filter(!is.na(.data$fsym_id)) %>% + filter(!is.na(.data$factset_entity_id)) %>% + select("fsym_id", "factset_entity_id") %>% + distinct() + + + # get the relevant icc_security_id to factset_entity_id table -------------- + + icc_security_id__factset_entity_id <- + tbl(factset_db, "icc_v2_icc_factset_id_map") %>% + filter(.data$provider_id_type == "icc_security_id") %>% + filter(.data$factset_id_type == "fsym_security_id") %>% + filter(!is.na(.data$factset_id)) %>% + # do not use a fsym_id that was started in the current year to avoid data + # based on a partial year + filter(.data$id_start_date < .env$year_month_date) %>% + # end_date identifies the date the identifier was last associated with fsym_id + # i.e. 
if there is no end_date (end_date == NA) then the association is still valid + filter(.data$id_end_date >= .env$year_month_date | is.na(.data$id_end_date)) %>% + select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% + inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + select("icc_security_id", "factset_entity_id") %>% + distinct() + + + # get the factset_entity_id to icc_total_emissions data -------------------- + + factset_entity_id__icc_total_emissions <- + tbl(factset_db, "icc_v2_icc_carbon_climate_core") %>% + filter(.data$icc_emissions_fiscal_year == .env$year) %>% + group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% + # icc_archive_date marks the date a data point was submitted, and some times there are updates of + # previous data submissions, so we need to filter only for the most recent submission + filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% + ungroup() %>% + group_by(.data$icc_company_id, .data$icc_emissions_fiscal_year) %>% + filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% + ungroup() %>% + filter( + .data$icc_emissions_estimated_trust > min_estimated_trust | + .data$icc_emissions_reported_trust > min_reported_trust + ) %>% + select("icc_security_id", "icc_total_emissions", "icc_scope_3_emissions") %>% + inner_join(icc_security_id__factset_entity_id, by = "icc_security_id") %>% + select("factset_entity_id", "icc_total_emissions", "icc_scope_3_emissions") + + + # collect the data, then disconnect ---------------------------------------- + + factset_entity_id__icc_total_emissions <- + factset_entity_id__icc_total_emissions %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the factset_entity_id to icc_total_emissions data ----------------- + + factset_entity_id__icc_total_emissions + } From 202dd36db6a67932e5468a166dd64ad7441a7895 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 21:27:45 +0100 Subject: [PATCH 
25/33] Update Entity Financing Data function --- R/get_factset_entity_financing_data.R | 86 +++++++++++++++------------ 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R index e8efeac..e9c8358 100644 --- a/R/get_factset_entity_financing_data.R +++ b/R/get_factset_entity_financing_data.R @@ -1,42 +1,46 @@ #' Get the entity financing data from the FactSet database and prepare the #' `factset_entity_financing_data` tibble #' +#' @param conn databse connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the #' `factset_entity_financing_data.rds` output file #' #' @export -get_factset_entity_financing_data <- function(data_timestamp, ...) { - # connect to the FactSet database -------------------------------------------- - - factset_db <- connect_factset_db(...) 
- - year <- lubridate::year(data_timestamp) - - +get_factset_entity_financing_data <- function( + conn, + data_timestamp +) { # get fsym_id to fundamentals fsym_company_id -------------------------------- - ff_fsym_id__fsym_company_id <- tbl(factset_db, "ff_v3_ff_sec_map") + logger::log_debug("Extracting entity financing info from database.") + logger::log_debug("using data timestamp: ", data_timestamp) + + logger::log_trace("Accessing security map - FactSet Fundamentals.") + ff_fsym_company_id <- dplyr::tbl(conn, "ff_v3_ff_sec_map") - own_fsym_id__fsym_company_id <- tbl(factset_db, "own_v5_own_sec_map") + logger::log_trace("Accessing security map - FactSet Ownership.") + own_fsym_company_id <- dplyr::tbl(conn, "own_v5_own_sec_map") - fsym_id__fsym_company_id <- dplyr::union_all( - ff_fsym_id__fsym_company_id, - own_fsym_id__fsym_company_id + logger::log_trace("UNIONing security maps.") + fsym_company_id <- dplyr::union_all( + ff_fsym_company_id, + own_fsym_company_id ) # get fsym_id to factset_entity_id ------------------------------------------- - ff_sec_entity <- tbl(factset_db, "ff_v3_ff_sec_entity") + logger::log_trace("Accessing security to entity map - FactSet Fundamentals.") + ff_sec_entity <- dplyr::tbl(conn, "ff_v3_ff_sec_entity") - own_sec_entity <- tbl(factset_db, "own_v5_own_sec_entity") + logger::log_trace("Accessing security to entity map - FactSet Ownership.") + own_sec_entity <- dplyr::tbl(conn, "own_v5_own_sec_entity") + logger::log_trace("UNIONing security to entity maps.") sec_entity <- dplyr::union_all( ff_sec_entity, own_sec_entity @@ -45,38 +49,46 @@ get_factset_entity_financing_data <- function(data_timestamp, ...) 
{ # get market value data ------------------------------------------------------ - fsym_id__ff_mkt_val <- tbl(factset_db, "ff_v3_ff_basic_der_af") %>% - select("fsym_id", "date", "currency", "ff_mkt_val") + logger::log_trace("Accessing market value data.") + ff_mkt_val <- dplyr::tbl(conn, "ff_v3_ff_basic_der_af") %>% + dplyr::select("fsym_id", "date", "currency", "ff_mkt_val") # get debt outstanding data -------------------------------------------------- - fsym_id__ff_debt <- tbl(factset_db, "ff_v3_ff_basic_af") %>% - select("fsym_id", "date", "currency", "ff_debt") + logger::log_trace("Accessing balance sheet data.") + ff_debt <- dplyr::tbl(conn, "ff_v3_ff_basic_af") %>% + dplyr::select("fsym_id", "date", "currency", "ff_debt") # merge and collect the data, then disconnect -------------------------------- - entity_financing_data <- fsym_id__ff_mkt_val %>% - dplyr::full_join(fsym_id__ff_debt, by = c("fsym_id", "date", "currency")) %>% - left_join(fsym_id__fsym_company_id, by = "fsym_id") %>% - inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% - filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% - group_by(.data$fsym_id, .data$currency) %>% - filter(.data$date <= .env$data_timestamp) %>% - filter(lubridate::year(.data$date) == .env$year) %>% - filter(.data$date == max(.data$date)) %>% - ungroup() %>% - collect() %>% - mutate( + logger::log_trace("Merging entity financing data.") + entity_financing_data <- ff_mkt_val %>% + dplyr::full_join( + ff_debt, + by = c("fsym_id", "date", "currency") + ) %>% + dplyr::left_join(fsym_company_id, by = "fsym_id") %>% + dplyr::inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% + dplyr::filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% + dplyr::group_by(.data$fsym_id, .data$currency) %>% + dplyr::filter(.data$date <= .env$data_timestamp) %>% + dplyr::filter( + lubridate::year(.data$date) == lubridate::year(data_timestamp) + ) %>% + dplyr::filter(.data$date == max(.data$date)) %>% 
+ dplyr::ungroup() + + logger::log_trace("Downloading entity financing data.") + entity_financing_data <- entity_financing_data %>% + dplyr::collect() %>% + dplyr::mutate( # convert units from millions to units ff_mkt_val = .data$ff_mkt_val * 1e6, ff_debt = .data$ff_debt * 1e6 ) %>% - distinct() - - DBI::dbDisconnect(factset_db) - + dplyr::distinct() # return the entity financing data ------------------------------------------- From dc570c3a3a695a0b8296e6f3e7256ebe9fab5bc9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 15 Dec 2023 15:37:36 +0100 Subject: [PATCH 26/33] Convert get_factset_isin_to_fund_table to package --- R/get_factset_isin_to_fund_table.R | 53 +++++++++++++----------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/R/get_factset_isin_to_fund_table.R b/R/get_factset_isin_to_fund_table.R index e9ef6a3..d22f8c0 100644 --- a/R/get_factset_isin_to_fund_table.R +++ b/R/get_factset_isin_to_fund_table.R @@ -1,47 +1,40 @@ #' Get the isin_to_fund_table data from the FactSet database and prepare the #' `factset_isin_to_fund_table` tibble #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param conn database connection #' #' @return A tibble properly prepared to be saved as the #' `factset_isin_to_fund_table.rds` output file #' #' @export -get_factset_isin_to_fund_table <- - function(...) { - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) 
+get_factset_isin_to_fund_table <- function(conn) {
+  # get the ISIN to fsym_id table --------------------------------------------
+  logger::log_info("Getting ISIN to fsym_id mapping")
+  isin <-
+    dplyr::tbl(conn, "sym_v1_sym_isin") %>%
+    dplyr::select("isin", "fsym_id")
 
-    # get the ISIN to fsym_id table --------------------------------------------
-    isin__fsym_id <-
-      tbl(factset_db, "sym_v1_sym_isin") %>%
-      select("isin", "fsym_id")
+  # get the fsym_id to fund_id table -----------------------------------------
+  logger::log_info("Getting fsym_id to fund id mapping")
+  fund_id <-
+    dplyr::tbl(conn, "own_v5_own_ent_fund_identifiers") %>%
+    dplyr::filter(.data$identifier_type == "FSYM_ID") %>%
+    dplyr::select(fsym_id = "fund_identifier", "factset_fund_id")
 
-    # get the fsym_id to fund_id table -----------------------------------------
-    fsym_id__factset_fund_id <-
-      tbl(factset_db, "own_v5_own_ent_fund_identifiers") %>%
-      dplyr::filter(.data$identifier_type == "FSYM_ID") %>%
-      select(fsym_id = "fund_identifier", "factset_fund_id")
+  # merge and collect the data ------------------------------
+  logger::log_info("Merging ISIN to fsym_id and fsym_id to fund_id")
+  isin__factset_fund_id <-
+    fund_id %>%
+    dplyr::inner_join(isin, by = "fsym_id") %>%
+    dplyr::select("isin", "fsym_id", "factset_fund_id") %>%
+    dplyr::collect()
 
-    # merge and collect the data, then disconnect ------------------------------
-
-    isin__factset_fund_id <-
-      fsym_id__factset_fund_id %>%
-      inner_join(isin__fsym_id, by = "fsym_id") %>%
-      select("isin", "fsym_id", "factset_fund_id") %>%
-      dplyr::collect()
-
-    DBI::dbDisconnect(factset_db)
-
-
-    # return the ISIN to fund_id table -----------------------------------------
-
-    isin__factset_fund_id
-  }
+  # return the ISIN to fund_id table -----------------------------------------
+  return(isin__factset_fund_id)
+}

From d7d5ef2b58d4e12696c5841a4721c915c42c971b Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Mon, 18 Dec 2023 16:34:54 +0100
Subject: [PATCH 27/33]
Update fund data function --- R/get_factset_fund_data.R | 160 +++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/R/get_factset_fund_data.R b/R/get_factset_fund_data.R index 6ad15d7..b0ef9a8 100644 --- a/R/get_factset_fund_data.R +++ b/R/get_factset_fund_data.R @@ -1,84 +1,98 @@ #' Get the fund data from the FactSet database and prepare the #' `factset_fund_data` tibble #' +#' @param conn databse connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the `factset_fund_data.rds` #' output file #' #' @export -get_factset_fund_data <- - function(data_timestamp, ...) { - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) - - - # get the fund holdings and the holdings' reported market value ------------ - - factset_fund_id__holding_fsym_id <- - tbl(factset_db, "own_v5_own_fund_detail") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select( - factset_fund_id = "factset_fund_id", - holding_fsym_id = "fsym_id", - holding_reported_mv = "reported_mv" - ) - - - # -------------------------------------------------------------------------- - - factset_fund_id__generic_id <- - tbl(factset_db, "own_v5_own_fund_generic") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select( - factset_fund_id = "factset_fund_id", - holding_fsym_id = "generic_id", - holding_reported_mv = "reported_mv" - ) - - factset_fund_id__holding_fsym_id <- - dplyr::union_all( - factset_fund_id__holding_fsym_id, - factset_fund_id__generic_id - ) - - - # get the fund total reported market value --------------------------------- - - factset_fund_id__total_reported_mv <- - tbl(factset_db, 
"own_v5_own_ent_fund_filing_hist") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select("factset_fund_id", "total_reported_mv") - - - # symbology containing the ISIN to fsym_id link - fsym_id__isin <- - tbl(factset_db, "sym_v1_sym_isin") - - - # merge and collect the data, then disconnect ------------------------------ - - fund_data <- - factset_fund_id__total_reported_mv %>% - filter(.data$total_reported_mv != 0 | !is.na(.data$total_reported_mv)) %>% - left_join(factset_fund_id__holding_fsym_id, by = "factset_fund_id") %>% - left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% - select( - factset_fund_id = "factset_fund_id", - fund_reported_mv = "total_reported_mv", - holding_isin = "isin", - holding_reported_mv = "holding_reported_mv" - ) %>% - dplyr::collect() - - DBI::dbDisconnect(factset_db) - - - # return the fund data ----------------------------------------------------- - - fund_data - } +get_factset_fund_data <- function(conn, data_timestamp) { + # get the fund holdings and the holdings' reported market value ------------ + + logger::log_debug("Extracting financial info from database.") + logger::log_info("using data timestamp: ", data_timestamp) + + logger::log_trace( + "Accessing historical fund holdings - security level. ", + "Filtering to date: {data_timestamp}" + ) + fund_security <- + dplyr::tbl(conn, "own_v5_own_fund_detail") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "fsym_id", + holding_reported_mv = "reported_mv" + ) + + logger::log_trace( + "Accessing historical fund holdings - non-securities. 
", + "Filtering to date: {data_timestamp}" + ) + fund_nonsecurity <- + dplyr::tbl(conn, "own_v5_own_fund_generic") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "generic_id", + holding_reported_mv = "reported_mv" + ) + + logger::log_trace( + "Combining historical fund holdings - security and non-security." + ) + fund_holding <- + dplyr::union_all( + fund_security, + fund_nonsecurity + ) + + + # get the fund total reported market value --------------------------------- + + logger::log_trace( + "Accessing historical fund filings.", + "Filtering to date: {data_timestamp}" + ) + fund_mv <- + dplyr::tbl(conn, "own_v5_own_ent_fund_filing_hist") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select("factset_fund_id", "total_reported_mv") + + + logger::log_trace( + "Accessing current ISIN mappings.", + ) + # symbology containing the ISIN to fsym_id link + fsym_id__isin <- + dplyr::tbl(conn, "sym_v1_sym_isin") + + + # merge and collect the data, then disconnect ------------------------------ + + logger::log_trace("Merging the data.") + fund_data <- + fund_mv %>% + dplyr::filter( + .data$total_reported_mv != 0 | !is.na(.data$total_reported_mv) + ) %>% + dplyr::left_join(fund_holding, by = "factset_fund_id") %>% + dplyr::left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + fund_reported_mv = "total_reported_mv", + holding_isin = "isin", + holding_reported_mv = "holding_reported_mv" + ) + + logger::log_trace("Downloading fund data.") + fund_data <- dplyr::collect(fund_data) + + # return the fund data ----------------------------------------------------- + + return(fund_data) +} From e8d85cbefa4b782f1ec159fed4f125b824bd526c Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 18:14:51 +0100 Subject: [PATCH 28/33] update ISS emissions function --- 
R/get_factset_iss_emissions_data.R | 171 ++++++++++++++++------------- 1 file changed, 93 insertions(+), 78 deletions(-) diff --git a/R/get_factset_iss_emissions_data.R b/R/get_factset_iss_emissions_data.R index f5102b8..38af01c 100644 --- a/R/get_factset_iss_emissions_data.R +++ b/R/get_factset_iss_emissions_data.R @@ -1,12 +1,13 @@ #' Get the ISS emissions data from the FactSet database and prepare the #' `factset_iss_emissions` tibble #' +#' @param conn databse connection #' @param year A single numeric specifying the year of data to be returned #' @param min_estimated_trust A single numeric specifying the minimum allowed #' "estimated trust" value #' @param min_reported_trust A single numeric specifying the minimum allowed #' "reported trust" value -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' @param ... Arguments to be passed to the `connect_conn()` function (for #' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the @@ -14,80 +15,94 @@ #' #' @export -get_factset_iss_emissions_data <- - function(year, min_estimated_trust = 0.0, min_reported_trust = 0.0, ...) { - # convert `year` to date --------------------------------------------------- - year_month_date <- as.Date(paste0(year, "-01-01"), "%Y-%m-%d") - - - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) - - - # get the relevant fsym_id to factset_entity_id table ---------------------- - - fsym_id__factset_entity_id <- - tbl(factset_db, "icc_v2_icc_sec_entity_hist") %>% - # end_date identifies the date the identifier was last associated with fsym_id - # i.e. 
if there is no end_date (end_date == NA) then the association is still valid - filter(.data$end_date >= .env$year_month_date | is.na(.data$end_date)) %>% - filter(!is.na(.data$fsym_id)) %>% - filter(!is.na(.data$factset_entity_id)) %>% - select("fsym_id", "factset_entity_id") %>% - distinct() - - - # get the relevant icc_security_id to factset_entity_id table -------------- - - icc_security_id__factset_entity_id <- - tbl(factset_db, "icc_v2_icc_factset_id_map") %>% - filter(.data$provider_id_type == "icc_security_id") %>% - filter(.data$factset_id_type == "fsym_security_id") %>% - filter(!is.na(.data$factset_id)) %>% - # do not use a fsym_id that was started in the current year to avoid data - # based on a partial year - filter(.data$id_start_date < .env$year_month_date) %>% - # end_date identifies the date the identifier was last associated with fsym_id - # i.e. if there is no end_date (end_date == NA) then the association is still valid - filter(.data$id_end_date >= .env$year_month_date | is.na(.data$id_end_date)) %>% - select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% - inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - select("icc_security_id", "factset_entity_id") %>% - distinct() - - - # get the factset_entity_id to icc_total_emissions data -------------------- - - factset_entity_id__icc_total_emissions <- - tbl(factset_db, "icc_v2_icc_carbon_climate_core") %>% - filter(.data$icc_emissions_fiscal_year == .env$year) %>% - group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% - # icc_archive_date marks the date a data point was submitted, and some times there are updates of - # previous data submissions, so we need to filter only for the most recent submission - filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% - ungroup() %>% - group_by(.data$icc_company_id, .data$icc_emissions_fiscal_year) %>% - filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% - ungroup() 
%>% - filter( - .data$icc_emissions_estimated_trust > min_estimated_trust | - .data$icc_emissions_reported_trust > min_reported_trust - ) %>% - select("icc_security_id", "icc_total_emissions", "icc_scope_3_emissions") %>% - inner_join(icc_security_id__factset_entity_id, by = "icc_security_id") %>% - select("factset_entity_id", "icc_total_emissions", "icc_scope_3_emissions") - - - # collect the data, then disconnect ---------------------------------------- - - factset_entity_id__icc_total_emissions <- - factset_entity_id__icc_total_emissions %>% - dplyr::collect() - - DBI::dbDisconnect(factset_db) - - - # return the factset_entity_id to icc_total_emissions data ----------------- - - factset_entity_id__icc_total_emissions - } +get_factset_iss_emissions_data <- function( + conn, + reporting_year, + min_estimated_trust = 0.0, + min_reported_trust = 0.0 +) { + # convert `year` to date --------------------------------------------------- + sql_filter_date <- as.Date(paste0(reporting_year, "-01-01"), "%Y-%m-%d") + + # get the relevant fsym_id to factset_entity_id table ---------------------- + fsym_id__factset_entity_id <- + dplyr::tbl(conn, "icc_v2_icc_sec_entity_hist") %>% + # end_date identifies the date the identifier was last associated with + # fsym_id i.e. 
if there is no end_date (end_date == NA) then the + # association is still valid + dplyr::filter( + .data$end_date >= sql_filter_date | is.na(.data$end_date) + ) %>% + dplyr::filter(!is.na(.data$fsym_id)) %>% + dplyr::filter(!is.na(.data$factset_entity_id)) %>% + dplyr::select("fsym_id", "factset_entity_id") %>% + dplyr::distinct() + + + # get the relevant icc_security_id to factset_entity_id table -------------- + + icc_security_id <- + dplyr::tbl(conn, "icc_v2_icc_factset_id_map") %>% + dplyr::filter(.data$provider_id_type == "icc_security_id") %>% + dplyr::filter(.data$factset_id_type == "fsym_security_id") %>% + dplyr::filter(!is.na(.data$factset_id)) %>% + # do not use a fsym_id that was started in the current year to avoid data + # based on a partial year + dplyr::filter(.data$id_start_date < sql_filter_date) %>% + # end_date identifies the date the identifier was last associated with + # fsym_id i.e. if there is no end_date (end_date == NA) then the + # association is still valid + dplyr::filter( + .data$id_end_date >= sql_filter_date | is.na(.data$id_end_date) + ) %>% + dplyr::select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% + dplyr::inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + dplyr::select("icc_security_id", "factset_entity_id") %>% + dplyr::distinct() + + + # get the factset_entity_id to icc_total_emissions data -------------------- + + icc_total_emissions <- + dplyr::tbl(conn, "icc_v2_icc_carbon_climate_core") %>% + dplyr::filter(.data$icc_emissions_fiscal_year == .env$reporting_year) %>% + dplyr::group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% + # icc_archive_date marks the date a data point was submitted, and some + # times there are updates of previous data submissions, so we need to + # dplyr::filter only for the most recent submission + dplyr::filter( + .data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::group_by(.data$icc_company_id, 
.data$icc_emissions_fiscal_year) %>% + dplyr::filter( + .data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::filter( + .data$icc_emissions_estimated_trust > min_estimated_trust | + .data$icc_emissions_reported_trust > min_reported_trust + ) %>% + dplyr::select( + "icc_security_id", + "icc_total_emissions", + "icc_scope_3_emissions" + ) %>% + dplyr::inner_join(icc_security_id, by = "icc_security_id") %>% + dplyr::select( + "factset_entity_id", + "icc_total_emissions", + "icc_scope_3_emissions" + ) + + # collect the data, then disconnect ---------------------------------------- + + logger::log_trace("Downloading emissions data.") + icc_total_emissions <- + icc_total_emissions %>% + dplyr::collect() + + # return the factset_entity_id to icc_total_emissions data ----------------- + + return(icc_total_emissions) +} From 760e2e00b14423999a333498980588c380b5eb2b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:06:27 +0100 Subject: [PATCH 29/33] Add new exports to export function --- R/export_pacta_files.R | 61 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index a05b46e..30deb07 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -75,7 +75,7 @@ export_pacta_files <- function( # Start Extracting Data - factset_financial_data_path <- file.path( + financial_data_path <- file.path( export_dir, "factset_financial_data.rds" ) @@ -85,13 +85,59 @@ export_pacta_files <- function( data_timestamp = data_timestamp ) logger::log_info("Exporting financial data to {factset_financial_data_path}") - saveRDS(object = financial_data, file = factset_financial_data_path) + saveRDS(object = financial_data, file = financial_data_path) - factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") + entity_info_path <- file.path(export_dir, "factset_entity_info.rds") 
   logger::log_info("Fetching entity info data.")
   entity_info <- get_factset_entity_info(conn = conn)
   logger::log_info("Exporting entity info data to {entity_info_path}")
-  saveRDS(object = entity_info, file = factset_entity_info_path)
+  saveRDS(object = entity_info, file = entity_info_path)
+
+  entity_financing_data_path <- file.path(
+    export_dir,
+    "factset_entity_financing_data.rds"
+  )
+  logger::log_info("Fetching entity financing data.")
+  entity_financing_data <- get_factset_entity_financing_data(
+    conn = conn,
+    data_timestamp = data_timestamp
+  )
+  logger::log_info(
+    "Exporting entity financing data to {entity_financing_data_path}"
+  )
+  saveRDS(
+    object = entity_financing_data,
+    file = entity_financing_data_path
+  )
+
+  fund_data_path <- file.path(export_dir, "factset_fund_data.rds")
+  logger::log_info("Fetching fund data.")
+  fund_data <- get_factset_fund_data(conn = conn, data_timestamp = data_timestamp)
+  logger::log_info("Exporting fund data to {fund_data_path}")
+  saveRDS(object = fund_data, file = fund_data_path)
+
+  isin_to_fund_table_path <- file.path(
+    export_dir,
+    "factset_isin_to_fund_table.rds"
+  )
+  logger::log_info("Fetching ISIN to fund table.")
+  isin_to_fund_table <- get_factset_isin_to_fund_table(conn = conn)
+  logger::log_info(
+    "Exporting ISIN to fund table to {isin_to_fund_table_path}"
+  )
+  saveRDS(object = isin_to_fund_table, file = isin_to_fund_table_path)
+
+  iss_emissions_path <- file.path(
+    export_dir,
+    "factset_iss_emissions.rds"
+  )
+  logger::log_info("Fetching ISS emissions data.")
+  iss_emissions <- get_factset_iss_emissions_data(conn = conn, reporting_year = lubridate::year(data_timestamp))
+  logger::log_info(
+    "Exporting ISS emissions data to {iss_emissions_path}"
+  )
+  saveRDS(object = iss_emissions, file = iss_emissions_path)
+
   logger::log_info("Done with data export.")
@@ -104,7 +150,12 @@ export_pacta_files <- function(
   return(
     invisible(
       c(
-      factset_entity_info_path = factset_entity_info_path
+      financial_data_path = financial_data_path,
+      
entity_info_path = entity_info_path, + entity_financing_data_path = entity_financing_data_path, + fund_data_path = fund_data_path, + isin_to_fund_table_path = isin_to_fund_table_path, + iss_emissions_path = iss_emissions_path ) ) ) From e4c84a652a8ad966fd13957413295a44c2905bc4 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:07:27 +0100 Subject: [PATCH 30/33] update documentation --- NAMESPACE | 5 ++++ man/connect_factset_db.Rd | 35 +++++++++++++++++++++++ man/export_pacta_files.Rd | 16 +++++++---- man/get_factset_entity_financing_data.Rd | 23 +++++++++++++++ man/get_factset_entity_info.Rd | 5 ++-- man/get_factset_financial_data.Rd | 23 +++++++++++++++ man/get_factset_fund_data.Rd | 23 +++++++++++++++ man/get_factset_isin_to_fund_table.Rd | 20 +++++++++++++ man/get_factset_iss_emissions_data.Rd | 36 ++++++++++++++++++++++++ 9 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 man/connect_factset_db.Rd create mode 100644 man/get_factset_entity_financing_data.Rd create mode 100644 man/get_factset_financial_data.Rd create mode 100644 man/get_factset_fund_data.Rd create mode 100644 man/get_factset_isin_to_fund_table.Rd create mode 100644 man/get_factset_iss_emissions_data.Rd diff --git a/NAMESPACE b/NAMESPACE index a26e306..2b64b62 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,13 @@ # Generated by roxygen2: do not edit by hand +export(connect_factset_db) export(export_pacta_files) +export(get_factset_entity_financing_data) export(get_factset_entity_info) export(get_factset_financial_data) +export(get_factset_fund_data) +export(get_factset_isin_to_fund_table) +export(get_factset_iss_emissions_data) importFrom(dplyr,"%>%") importFrom(rlang,.data) importFrom(rlang,.env) diff --git a/man/connect_factset_db.Rd b/man/connect_factset_db.Rd new file mode 100644 index 0000000..ee82e71 --- /dev/null +++ b/man/connect_factset_db.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/connect_factset_db.R +\name{connect_factset_db} +\alias{connect_factset_db} +\title{Export files for use in PACTA data preparation} +\usage{ +connect_factset_db( + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), + options = "-c search_path=fds", + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD") +) +} +\arguments{ +\item{dbname}{name of the database to connect to} + +\item{host}{hostname of the server to connect to} + +\item{port}{port number of the server to connect to} + +\item{options}{additional options to pass to the database connection. +Typically used to define schema search path.} + +\item{username}{username to use for the database connection} + +\item{password}{password to use for the database connection} +} +\value{ +a database connection object +} +\description{ +Export files for use in PACTA data preparation +} diff --git a/man/export_pacta_files.Rd b/man/export_pacta_files.Rd index 7977cfa..a11143e 100644 --- a/man/export_pacta_files.Rd +++ b/man/export_pacta_files.Rd @@ -5,17 +5,23 @@ \title{Export files for use in PACTA data preparation} \usage{ export_pacta_files( - destination = file.path("."), - data_timestamp = Sys.time(), - ... 
+ conn = connect_factset_db(), + destination = file.path(Sys.getenv("EXPORT_DESTINATION")), + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), + terminate_connection = (deparse(substitute(conn)) == + formals(export_pacta_files)[["conn"]]) ) } \arguments{ -\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for -specifying database connection parameters)} +\item{destination}{path to directory where exported files will be saved} + +\item{data_timestamp}{filter data as-of this timestamp} \item{Destination}{directory for the output files} } +\value{ +vector of paths to exported files +} \description{ Export files for use in PACTA data preparation } diff --git a/man/get_factset_entity_financing_data.Rd b/man/get_factset_entity_financing_data.Rd new file mode 100644 index 0000000..98b3598 --- /dev/null +++ b/man/get_factset_entity_financing_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_entity_financing_data.R +\name{get_factset_entity_financing_data} +\alias{get_factset_entity_financing_data} +\title{Get the entity financing data from the FactSet database and prepare the +\code{factset_entity_financing_data} tibble} +\usage{ +get_factset_entity_financing_data(conn, data_timestamp) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_entity_financing_data.rds} output file +} +\description{ +Get the entity financing data from the FactSet database and prepare the +\code{factset_entity_financing_data} tibble +} diff --git a/man/get_factset_entity_info.Rd b/man/get_factset_entity_info.Rd index b163ad8..7f5ca6a 100644 --- a/man/get_factset_entity_info.Rd +++ b/man/get_factset_entity_info.Rd @@ -5,11 +5,10 @@ \title{Get the entity info data from the FactSet database and prepare the 
\code{factset_entity_info} tibble} \usage{ -get_factset_entity_info(...) +get_factset_entity_info(conn) } \arguments{ -\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for -specifying database connection parameters)} +\item{conn}{database connection} } \value{ A tibble properly prepared to be saved as the diff --git a/man/get_factset_financial_data.Rd b/man/get_factset_financial_data.Rd new file mode 100644 index 0000000..9b13887 --- /dev/null +++ b/man/get_factset_financial_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_financial_data.R +\name{get_factset_financial_data} +\alias{get_factset_financial_data} +\title{Get the factset financial data from the FactSet database and prepare the +\code{factset_financial_data} tibble} +\usage{ +get_factset_financial_data(conn, data_timestamp, ...) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_financial_data.rds} output file +} +\description{ +Get the factset financial data from the FactSet database and prepare the +\code{factset_financial_data} tibble +} diff --git a/man/get_factset_fund_data.Rd b/man/get_factset_fund_data.Rd new file mode 100644 index 0000000..7c5c447 --- /dev/null +++ b/man/get_factset_fund_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_fund_data.R +\name{get_factset_fund_data} +\alias{get_factset_fund_data} +\title{Get the fund data from the FactSet database and prepare the +\code{factset_fund_data} tibble} +\usage{ +get_factset_fund_data(conn, data_timestamp) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared 
to be saved as the \code{factset_fund_data.rds} +output file +} +\description{ +Get the fund data from the FactSet database and prepare the +\code{factset_fund_data} tibble +} diff --git a/man/get_factset_isin_to_fund_table.Rd b/man/get_factset_isin_to_fund_table.Rd new file mode 100644 index 0000000..155a20a --- /dev/null +++ b/man/get_factset_isin_to_fund_table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_isin_to_fund_table.R +\name{get_factset_isin_to_fund_table} +\alias{get_factset_isin_to_fund_table} +\title{Get the isin_to_fund_table data from the FactSet database and prepare the +\code{factset_isin_to_fund_table} tibble} +\usage{ +get_factset_isin_to_fund_table(conn) +} +\arguments{ +\item{conn}{database connection} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_isin_to_fund_table.rds} output file +} +\description{ +Get the isin_to_fund_table data from the FactSet database and prepare the +\code{factset_isin_to_fund_table} tibble +} diff --git a/man/get_factset_iss_emissions_data.Rd b/man/get_factset_iss_emissions_data.Rd new file mode 100644 index 0000000..40cc055 --- /dev/null +++ b/man/get_factset_iss_emissions_data.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_iss_emissions_data.R +\name{get_factset_iss_emissions_data} +\alias{get_factset_iss_emissions_data} +\title{Get the ISS emissions data from the FactSet database and prepare the +\code{factset_iss_emissions} tibble} +\usage{ +get_factset_iss_emissions_data( + conn, + reporting_year, + min_estimated_trust = 0, + min_reported_trust = 0 +) +} +\arguments{ +\item{conn}{databse connection} + +\item{min_estimated_trust}{A single numeric specifying the minimum allowed +"estimated trust" value} + +\item{min_reported_trust}{A single numeric specifying the minimum allowed +"reported trust" value} + +\item{year}{A single numeric specifying the year of 
data to be returned} + +\item{...}{Arguments to be passed to the \code{connect_conn()} function (for +specifying database connection parameters)} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_iss_emissions.rds} output file +} +\description{ +Get the ISS emissions data from the FactSet database and prepare the +\code{factset_iss_emissions} tibble +} From 78378230795f5258f04c6f25fd9d14e74c763871 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:39:23 +0100 Subject: [PATCH 31/33] Remove `browser()` --- R/get_factset_financial_data.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 4be7c3c..3d5577c 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -34,7 +34,6 @@ get_factset_financial_data <- # adj_price ---------------------------------------------------------------- - browser() logger::log_trace( "Accessing share prices. ", "Filtering to date: {data_timestamp}" From d3e0e6c13739679bbe996f4a3e4de659fa7feb67 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 20:38:52 +0100 Subject: [PATCH 32/33] rename function --- R/export_pacta_files.R | 2 +- R/get_factset_entity_financing_data.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 30deb07..8d4d3f4 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -98,7 +98,7 @@ export_pacta_files <- function( "factset_entity_financing_data.rds" ) logger::log_info("Fetching entity financing data.") - entity_financing_data <- get_factset_entity_financing_data( + entity_financing_data <- get_factset_financing_data( conn = conn, data_timestamp = data_timestamp ) diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R index e9c8358..17900ce 100644 --- a/R/get_factset_entity_financing_data.R +++ b/R/get_factset_entity_financing_data.R @@ -10,7 +10,7 @@ 
#' #' @export -get_factset_entity_financing_data <- function( +get_factset_financing_data <- function( conn, data_timestamp ) { From f41ac35e8fadbaef1272e8d063fb993f62837d00 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 20:58:53 +0100 Subject: [PATCH 33/33] Filter to most recent date, if posted in past month --- R/get_factset_financial_data.R | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 3d5577c..12de10a 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -11,11 +11,16 @@ #' @export get_factset_financial_data <- - function(conn, data_timestamp, ...) { + function( + conn, + data_timestamp, + data_timestamp_lookback = data_timestamp - lubridate::dmonths(1) + ) { # build connection to database --------------------------------------------- logger::log_debug("Extracting financial info from database.") logger::log_info("using data timestamp: ", data_timestamp) + logger::log_info("Looking back in data to", data_timestamp_lookback) # factset_entity_id ----------------------------------------------- @@ -38,12 +43,21 @@ get_factset_financial_data <- "Accessing share prices. ", "Filtering to date: {data_timestamp}" ) + # TODO: Optimize this query adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% - dplyr::filter(.data$price_date == .env$data_timestamp) %>% + dplyr::filter(.data$price_date <= .env$data_timestamp) %>% + dplyr::group_by(.data$fsym_id, .data$adj_price) %>% + dplyr::filter(.data$price_date == max(.data$price_date)) %>% + # TODO: CRITICAL: decision: do we want most recent price, or only for + # those that have posted in past month? 
+ dplyr::filter( + .data$price_date >= .env$data_timestamp_lookback + ) %>% dplyr::select("fsym_id", "adj_price") + # adj_shares_outstanding --------------------------------------------------- logger::log_trace( @@ -52,7 +66,14 @@ get_factset_financial_data <- ) adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% - dplyr::filter(.data$price_date == .env$data_timestamp) %>% + dplyr::filter(.data$price_date <= .env$data_timestamp) %>% + dplyr::group_by(.data$fsym_id, .data$adj_price) %>% + dplyr::filter(.data$price_date == max(.data$price_date)) %>% + # TODO: CRITICAL: decision: do we want most recent price, or only for + # those that have posted in past month? + dplyr::filter( + .data$price_date >= .env$data_timestamp_lookback + ) %>% dplyr::select("fsym_id", "adj_shares_outstanding")