From 0b24fba1ebc1c3c60da6c3232121ec42e2d6c628 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 13:53:36 +0100 Subject: [PATCH 01/33] add inital Dockerfile --- .dockerignore | 2 ++ Dockerfile | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2833d34 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git/ +Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9b7bbeb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,65 @@ +# using rocker r-vers as a base with R 4.3.1 +# https://hub.docker.com/r/rocker/r-ver +# https://rocker-project.org/images/versioned/r-ver.html +# +# sets CRAN repo to use Posit Package Manager to freeze R package versions to +# those available on 2023-10-30 +# https://packagemanager.posit.co/client/#/repos/2/overview +# https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30 + +# set proper base image +ARG R_VERS="4.3.1" +FROM rocker/r-ver:$R_VERS AS base + +# set Docker image labels +LABEL org.opencontainers.image.source=https://github.com/RMI-PACTA/workflow.factset +LABEL org.opencontainers.image.description="Extract FactSet Data for use in PACTA" +LABEL org.opencontainers.image.licenses=MIT +LABEL org.opencontainers.image.title="" +LABEL org.opencontainers.image.revision="" +LABEL org.opencontainers.image.version="" +LABEL org.opencontainers.image.vendor="" +LABEL org.opencontainers.image.base.name="" +LABEL org.opencontainers.image.ref.name="" +LABEL org.opencontainers.image.authors="" + +# set apt-get to noninteractive mode +ARG DEBIAN_FRONTEND="noninteractive" +ARG DEBCONF_NOWARNINGS="yes" + +RUN groupadd -r runner-workflow-factset \ + && useradd -r -g runner-workflow-factset runner-workflow-factset \ + && mkdir -p /home/runner-workflow-factset \ + && chown -R runner-workflow-factset /home/runner-workflow-factset 
+WORKDIR /home/runner-workflow-factset + +# # install system dependencies +# RUN apt-get update \ +# && apt-get install -y --no-install-recommends \ +# git=1:2.34.* \ +# libcurl4-openssl-dev=7.81.* \ +# libicu-dev=70.* \ +# libssl-dev=3.0.* \ +# openssh-client=1:8.* \ +# wget=1.21.* \ +# && chmod -R a+rwX /root \ +# && rm -rf /var/lib/apt/lists/* + +# set frozen CRAN repo +ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" +RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" \ + # install packages for dependency resolution and installation + && Rscript -e "install.packages('pak')" + +# copy in everything from this repo +COPY . /workflow.factset + +# install R package dependencies +RUN Rscript -e "\ + pak::pkg_install('local::/workflow.factset'); \ + " + +USER runner-workflow-factset + +# set default run behavior +CMD ["input_dir/default_config.json"] From ad93160ef564980af241368134e2113c52b66677 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 13:55:16 +0100 Subject: [PATCH 02/33] Add GitHub actions --- .Rbuildignore | 2 + .dockerignore | 1 + .../workflows/build-Docker-image-nightly.yml | 12 ++++ .../build-Docker-image-on-push-to-main.yml | 12 ++++ .../build-Docker-image-on-push-to-pr.yml | 37 ++++++++++ .../workflows/build-and-push-Docker-image.yml | 67 +++++++++++++++++++ .github/workflows/check-R-sysdeps.yml | 32 +++++++++ .github/workflows/run-hadolint.yml | 11 +++ 8 files changed, 174 insertions(+) create mode 100644 .github/workflows/build-Docker-image-nightly.yml create mode 100644 .github/workflows/build-Docker-image-on-push-to-main.yml create mode 100644 .github/workflows/build-Docker-image-on-push-to-pr.yml create mode 100644 .github/workflows/build-and-push-Docker-image.yml create mode 100644 .github/workflows/check-R-sysdeps.yml create mode 100644 .github/workflows/run-hadolint.yml diff --git a/.Rbuildignore b/.Rbuildignore index 5163d0b..8cc3750 100644 
--- a/.Rbuildignore +++ b/.Rbuildignore @@ -1 +1,3 @@ ^LICENSE\.md$ +.git/ +.github/ diff --git a/.dockerignore b/.dockerignore index 2833d34..c49182c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,3 @@ .git/ +.github/ Dockerfile diff --git a/.github/workflows/build-Docker-image-nightly.yml b/.github/workflows/build-Docker-image-nightly.yml new file mode 100644 index 0000000..7ffa64f --- /dev/null +++ b/.github/workflows/build-Docker-image-nightly.yml @@ -0,0 +1,12 @@ +on: + schedule: + - cron: '0 0 * * 1,2,3,4,5' + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: nightly diff --git a/.github/workflows/build-Docker-image-on-push-to-main.yml b/.github/workflows/build-Docker-image-on-push-to-main.yml new file mode 100644 index 0000000..b75fca6 --- /dev/null +++ b/.github/workflows/build-Docker-image-on-push-to-main.yml @@ -0,0 +1,12 @@ +on: + push: + branches: [main] + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: main diff --git a/.github/workflows/build-Docker-image-on-push-to-pr.yml b/.github/workflows/build-Docker-image-on-push-to-pr.yml new file mode 100644 index 0000000..16934bb --- /dev/null +++ b/.github/workflows/build-Docker-image-on-push-to-pr.yml @@ -0,0 +1,37 @@ +on: + pull_request: + +jobs: + build_docker_image: + name: "Call build and push action" + uses: ./.github/workflows/build-and-push-Docker-image.yml + secrets: inherit + with: + image-name: workflow.factset + image-tag: pr${{ github.event.pull_request.number }} + + add_comment: + needs: build_docker_image + runs-on: ubuntu-latest + steps: + - name: Find Comment + # https://github.com/peter-evans/find-comment + uses: peter-evans/find-comment@v2 + id: fc + with: + issue-number: ${{ 
github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: Docker image from this PR + + - name: Create or update comment + # https://github.com/peter-evans/create-or-update-comment + uses: peter-evans/create-or-update-comment@v3 + with: + comment-id: ${{ steps.fc.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + Docker image from this PR (${{ github.event.pull_request.head.sha }}) created + ``` + docker pull ${{ needs.build_docker_image.outputs.full-image-name }} + ``` + edit-mode: replace diff --git a/.github/workflows/build-and-push-Docker-image.yml b/.github/workflows/build-and-push-Docker-image.yml new file mode 100644 index 0000000..b6d8e1e --- /dev/null +++ b/.github/workflows/build-and-push-Docker-image.yml @@ -0,0 +1,67 @@ +--- +name: Build and push docker image + +on: + workflow_call: + inputs: + image-name: + required: true + type: string + image-tag: + required: true + type: string + outputs: + full-image-name: + description: "Full pushed image name including host/registry, name, and tag" + value: ${{ jobs.docker.outputs.full-image-name }} + +jobs: + docker: + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + timeout-minutes: 25 + outputs: + full-image-name: ${{ steps.image-name.outputs.full-image-name }} + + steps: + + - name: Define image name + id: image-name + run: | + full_image_name="ghcr.io/${{ github.repository_owner }}/${{ inputs.image-name }}:${{ inputs.image-tag }}" + full_image_name=$(echo $full_image_name | tr '[A-Z]' '[a-z]') + echo "full-image-name=$full_image_name" >> "$GITHUB_OUTPUT" + echo "$full_image_name" > full-image-name + + - uses: actions/upload-artifact@v3 + with: + name: full-image-name + path: . 
+ + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v5 + with: + push: true + tags: ${{ steps.image-name.outputs.full-image-name }} + cache-from: type=gha + cache-to: type=gha,mode=min + no-cache-filters: install-pacta + + check-system-dependencies: + name: "Check System Dependencies" + needs: docker + uses: ./.github/workflows/check-R-sysdeps.yml + with: + image: ${{ needs.docker.outputs.full-image-name }} \ No newline at end of file diff --git a/.github/workflows/check-R-sysdeps.yml b/.github/workflows/check-R-sysdeps.yml new file mode 100644 index 0000000..3a1c08b --- /dev/null +++ b/.github/workflows/check-R-sysdeps.yml @@ -0,0 +1,32 @@ +--- +name: Check R system dependencies + +on: + workflow_call: + inputs: + image: + required: true + type: string + +jobs: + + check-system-dependencies: + runs-on: ubuntu-latest + steps: + - name: 'Pull image' + run: | + echo ${{ inputs.image }} + docker pull ${{ inputs.image }} + - name: 'Run pak::sysreqs_check_installed()' + run: | + + docker run \ + --rm \ + --entrypoint "/bin/sh" \ + ${{ inputs.image }} \ + -c "Rscript -e ' + x <- pak::sysreqs_check_installed() + print(x) + is_installed <- as.data.frame(x)[[\"installed\"]] + stopifnot(all(is_installed)) + '" diff --git a/.github/workflows/run-hadolint.yml b/.github/workflows/run-hadolint.yml new file mode 100644 index 0000000..0f07812 --- /dev/null +++ b/.github/workflows/run-hadolint.yml @@ -0,0 +1,11 @@ +--- +on: [push, pull_request] + +jobs: + hadolint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: hadolint/hadolint-action@v3.1.0 + with: + dockerfile: Dockerfile From cbe4a73adb739ec9583924e36d1722640e3f5868 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 
16:49:56 +0100 Subject: [PATCH 03/33] Add first data pulling function --- DESCRIPTION | 9 +++ Dockerfile | 20 +++--- NAMESPACE | 2 + R/connect_factset_db.R | 98 +++++++++++++++++++++++++++ R/get_factset_entity_info.R | 113 ++++++++++++++++++++++++++++++++ R/workflow.factset-package.R | 7 ++ man/get_factset_entity_info.Rd | 21 ++++++ man/workflow.factset-package.Rd | 27 ++++++++ 8 files changed, 285 insertions(+), 12 deletions(-) create mode 100644 R/connect_factset_db.R create mode 100644 R/get_factset_entity_info.R create mode 100644 R/workflow.factset-package.R create mode 100644 man/get_factset_entity_info.Rd create mode 100644 man/workflow.factset-package.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 03ec3c5..852f0ee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,3 +30,12 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 +Imports: + DBI, + dbplyr, + dplyr, + logger, + RPostgres, + withr +Suggests: + rstudioapi diff --git a/Dockerfile b/Dockerfile index 9b7bbeb..ac67d99 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,23 +33,19 @@ RUN groupadd -r runner-workflow-factset \ && chown -R runner-workflow-factset /home/runner-workflow-factset WORKDIR /home/runner-workflow-factset -# # install system dependencies -# RUN apt-get update \ -# && apt-get install -y --no-install-recommends \ -# git=1:2.34.* \ -# libcurl4-openssl-dev=7.81.* \ -# libicu-dev=70.* \ -# libssl-dev=3.0.* \ -# openssh-client=1:8.* \ -# wget=1.21.* \ -# && chmod -R a+rwX /root \ -# && rm -rf /var/lib/apt/lists/* +# install system dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libicu-dev=70.* \ + libpq-dev=14.* \ + && chmod -R a+rwX /root \ + && rm -rf /var/lib/apt/lists/* # set frozen CRAN repo ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" \ # install packages for 
dependency resolution and installation - && Rscript -e "install.packages('pak')" + && Rscript -e "install.packages(c('pak', 'jsonlite'))" # copy in everything from this repo COPY . /workflow.factset diff --git a/NAMESPACE b/NAMESPACE index 6ae9268..f4f7aae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,4 @@ # Generated by roxygen2: do not edit by hand +export(get_factset_entity_info) +importFrom(dplyr,"%>%") diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R new file mode 100644 index 0000000..00e0c37 --- /dev/null +++ b/R/connect_factset_db.R @@ -0,0 +1,98 @@ +# Connection function + +connect_factset_db <- + function( + dbname = "delta", + host = "data-eval-db.postgres.database.azure.com", + port = 5432L, + options = "-c search_path=fds", + username = Sys.getenv("R_DATABASE_USER"), + password = Sys.getenv("R_DATABASE_PASSWORD"), + keyring_service_name = "2dii_factset_database") { + + if (username == "") { + logger::log_error("No database username could be found. Please set the username as an environment variable") + } + + if (password == "") { + # if password not defined in .env, look in systems keyring + if (requireNamespace("keyring", quietly = TRUE)) { + if (!username %in% keyring::key_list(service = keyring_service_name)$username) { + keyring::key_set( + service = keyring_service_name, + username = username, + prompt = "Enter password for the FactSet database (it will be stored in your system's keyring): " + ) + } + password <- keyring::key_get( + service = keyring_service_name, + username = username + ) + } else if (interactive() && requireNamespace("rstudioapi", quietly = TRUE)) { + password <- rstudioapi::askForPassword( + prompt = "Please enter the FactSet database password:" + ) + } else { + logger::log_error( + "No database password could be found. 
Please set the password + as an environment variable" + ) + } + } + + logger::log_trace( + "Connecting to database {dbname} on {host}:{port} as {username}" + ) + conn <- + DBI::dbConnect( + drv = RPostgres::Postgres(), + dbname = dbname, + host = host, + port = port, + user = username, + password = password, + options = options + ) + + reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) + } + +# connection finalizer to ensure connection is closed -------------------------- +# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ + +reg_conn_finalizer <- function(conn, close_fun, envir) { + is_parent_global <- identical(.GlobalEnv, envir) + + if (isTRUE(is_parent_global)) { + env_finalizer <- new.env(parent = emptyenv()) + env_finalizer$conn <- conn + attr(conn, "env_finalizer") <- env_finalizer + + reg.finalizer(env_finalizer, function(e) { + if (DBI::dbIsValid(e$conn)) { + logger::log_warn("Warning: A database connection was closed automatically because the connection object was removed or the R session was closed.") + try(close_fun(e$conn)) + } + }, onexit = TRUE) + } else { + withr::defer( + { + if (DBI::dbIsValid(conn)) { + dbname <- DBI::dbGetInfo(conn)$dbname + host <- DBI::dbGetInfo(conn)$host + + logger::log_warn( + "The database connection to {dbname} on {host} was + closed automatically because the calling environment was closed." + ) + try(close_fun(conn)) + } + }, + envir = envir, + priority = "last" + ) + } + + logger::log_trace("Database connection registered for finalization") + return(conn) +} diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R new file mode 100644 index 0000000..6455bd3 --- /dev/null +++ b/R/get_factset_entity_info.R @@ -0,0 +1,113 @@ +#' Get the entity info data from the FactSet database and prepare the +#' `factset_entity_info` tibble +#' +#' @param ... 
Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_entity_info.rds` output file +#' +#' @export + +get_factset_entity_info <- + function(...) { + # build connection to database --------------------------------------------- + + factset_db <- connect_factset_db(...) + + logger::log_debug("Extracting entity info from database.") + + # company_name ------------------------------------------------------------- + + logger::log_trace("Accessing entity proper names.") + factset_entity_id__entity_proper_name <- + dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::select("factset_entity_id", "entity_proper_name") + + + # country_of_domicile ------------------------------------------------------ + + logger::log_trace("Accessing entity country of domicile.") + factset_entity_id__iso_country <- + dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::select("factset_entity_id", "iso_country") + + + # sector ------------------------------------------------------------------- + + logger::log_trace("Accessing entity sector.") + factset_entity_id__sector_code <- + dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::select("factset_entity_id", "sector_code") + + factset_sector_code__factset_sector_desc <- + dplyr::tbl(factset_db, "ref_v2_factset_sector_map") %>% + dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) + + factset_entity_id__factset_sector_desc <- + factset_entity_id__sector_code %>% + dplyr::left_join(factset_sector_code__factset_sector_desc, by = c("sector_code" = "factset_sector_code")) %>% + dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") + + + # sub-sector/industry ------------------------------------------------------ + + logger::log_trace("Accessing entity industry/sector/subsector.") + factset_entity_id__industry_code <- + dplyr::tbl(factset_db, 
"sym_v1_sym_entity_sector") %>% + dplyr::select("factset_entity_id", "industry_code") + + factset_industry_code_factset_industry_desc <- + dplyr::tbl(factset_db, "ref_v2_factset_industry_map") %>% + dplyr::select("factset_industry_code", "factset_industry_desc") + + factset_entity_id__factset_industry_desc <- + factset_entity_id__industry_code %>% + dplyr::left_join(factset_industry_code_factset_industry_desc, by = c("industry_code" = "factset_industry_code")) %>% + dplyr::select("factset_entity_id", "industry_code", "factset_industry_desc") + + + # credit risk parent ------------------------------------------------------- + + logger::log_trace("Accessing entity credit risk parent.") + ent_v1_ent_entity_affiliates <- dplyr::tbl(factset_db, "ent_v1_ent_entity_affiliates") + ref_v2_affiliate_type_map <- dplyr::tbl(factset_db, "ref_v2_affiliate_type_map") + + ent_entity_affiliates_last_update <- + dplyr::tbl(factset_db, "fds_fds_file_history") %>% + dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% + dplyr::filter(.data$begin_time == max(.data$begin_time, na.rm = TRUE)) %>% + dplyr::pull("begin_time") + + factset_entity_id__credit_parent_id <- + ent_v1_ent_entity_affiliates %>% + dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% + dplyr::filter(.data$aff_type_desc == "Credit Risk Parent") %>% + dplyr::select( + factset_entity_id = "factset_affiliated_entity_id", + credit_parent_id = "factset_entity_id" + ) %>% + dplyr::mutate(ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update) + + + # merge and collect -------------------------------------------------------- + + logger::log_trace("Merging entity info.") + entity_info <- + factset_entity_id__entity_proper_name %>% + dplyr::left_join(factset_entity_id__iso_country, by = "factset_entity_id") %>% + dplyr::left_join(factset_entity_id__factset_sector_desc, by = "factset_entity_id") %>% + dplyr::left_join(factset_entity_id__factset_industry_desc, by = 
"factset_entity_id") %>% + dplyr::left_join(factset_entity_id__credit_parent_id, by = "factset_entity_id") + + logger::log_trace("Downloading merged entity info from database.") + entity_info <- dplyr::collect(entity_info) + logger::log_trace("Download complete.") + + logger::log_trace("Disconnecting from database.") + DBI::dbDisconnect(factset_db) + + + # return prepared data ----------------------------------------------------- + return(entity_info) + } diff --git a/R/workflow.factset-package.R b/R/workflow.factset-package.R new file mode 100644 index 0000000..2f30195 --- /dev/null +++ b/R/workflow.factset-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom dplyr %>% +## usethis namespace: end +NULL diff --git a/man/get_factset_entity_info.Rd b/man/get_factset_entity_info.Rd new file mode 100644 index 0000000..b163ad8 --- /dev/null +++ b/man/get_factset_entity_info.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_entity_info.R +\name{get_factset_entity_info} +\alias{get_factset_entity_info} +\title{Get the entity info data from the FactSet database and prepare the +\code{factset_entity_info} tibble} +\usage{ +get_factset_entity_info(...) 
+} +\arguments{ +\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for +specifying database connection parameters)} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_entity_info.rds} output file +} +\description{ +Get the entity info data from the FactSet database and prepare the +\code{factset_entity_info} tibble +} diff --git a/man/workflow.factset-package.Rd b/man/workflow.factset-package.Rd new file mode 100644 index 0000000..ba4560a --- /dev/null +++ b/man/workflow.factset-package.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/workflow.factset-package.R +\docType{package} +\name{workflow.factset-package} +\alias{workflow.factset} +\alias{workflow.factset-package} +\title{workflow.factset: Extract Financial Data for use in PACTA} +\description{ +Extract data from a FactSet Postgres database for use as part of PACTA Data Preparation +} +\author{ +\strong{Maintainer}: CJ Yetman \email{cj@cjyetman.com} (\href{https://orcid.org/0000-0001-5099-9500}{ORCID}) [contractor] + +Authors: +\itemize{ + \item Jackson Hoffart \email{jackson.hoffart@gmail.com} (\href{https://orcid.org/0000-0002-8600-5042}{ORCID}) [contractor] + \item Jacob Kastl \email{jacob.kastl@gmail.com} [contractor] + \item Alex Axthelm \email{aaxthelm@rmi.org} (\href{https://orcid.org/0000-0001-8579-8565}{ORCID}) [contractor] +} + +Other contributors: +\itemize{ + \item RMI \email{PACTA4investors@rmi.org} [copyright holder, funder] +} + +} +\keyword{internal} From 2134fb60afc37b58259b823ffb335dbcc06448d9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 16:55:58 +0100 Subject: [PATCH 04/33] Add R package linter --- .Rbuildignore | 1 + .github/.gitignore | 1 + .github/workflows/lint-package.yaml | 32 +++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 .github/.gitignore create mode 100644 .github/workflows/lint-package.yaml diff --git a/.Rbuildignore 
b/.Rbuildignore index 8cc3750..d32b58f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ ^LICENSE\.md$ .git/ .github/ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/lint-package.yaml b/.github/workflows/lint-package.yaml new file mode 100644 index 0000000..f4c4ef2 --- /dev/null +++ b/.github/workflows/lint-package.yaml @@ -0,0 +1,32 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: lint + +jobs: + lint: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::lintr, local::. + needs: lint + + - name: Lint + run: lintr::lint_package() + shell: Rscript {0} + env: + LINTR_ERROR_ON_LINT: true From deca16e5423f8f023e7c7e54420f8d5b5c3e663b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 17:31:24 +0100 Subject: [PATCH 05/33] Formatting changes for `lintr` --- R/connect_factset_db.R | 41 +++++++++++++++++++++------- R/get_factset_entity_info.R | 54 +++++++++++++++++++++++++++++-------- 2 files changed, 74 insertions(+), 21 deletions(-) diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index 00e0c37..fbd66e2 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -11,31 +11,44 @@ connect_factset_db <- keyring_service_name = "2dii_factset_database") { if (username == "") { - logger::log_error("No database username could be found. Please set the username as an environment variable") + logger::log_error( + "No database username could be found. 
", + "Please set the username as an environment variable" + ) } if (password == "") { # if password not defined in .env, look in systems keyring if (requireNamespace("keyring", quietly = TRUE)) { - if (!username %in% keyring::key_list(service = keyring_service_name)$username) { + if ( + !username %in% keyring::key_list( + service = keyring_service_name + )$username + ) { + keyring_prompt <- paste( + "Enter password for the FactSet database", + "(it will be stored in your system's keyring):" + ) keyring::key_set( service = keyring_service_name, username = username, - prompt = "Enter password for the FactSet database (it will be stored in your system's keyring): " + prompt = keyring_prompt ) } password <- keyring::key_get( service = keyring_service_name, username = username ) - } else if (interactive() && requireNamespace("rstudioapi", quietly = TRUE)) { + } else if ( + interactive() && requireNamespace("rstudioapi", quietly = TRUE) + ) { password <- rstudioapi::askForPassword( prompt = "Please enter the FactSet database password:" ) } else { logger::log_error( - "No database password could be found. Please set the password - as an environment variable" + "No database password could be found. 
", + "Please set the password as an environment variable" ) } } @@ -58,7 +71,7 @@ connect_factset_db <- } # connection finalizer to ensure connection is closed -------------------------- -# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ +# adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ #nolint reg_conn_finalizer <- function(conn, close_fun, envir) { is_parent_global <- identical(.GlobalEnv, envir) @@ -70,7 +83,11 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { reg.finalizer(env_finalizer, function(e) { if (DBI::dbIsValid(e$conn)) { - logger::log_warn("Warning: A database connection was closed automatically because the connection object was removed or the R session was closed.") + logger::log_warn( + "Warning: A database connection was closed automatically ", + "because the connection object was removed ", + "or the R session was closed." + ) try(close_fun(e$conn)) } }, onexit = TRUE) @@ -82,8 +99,12 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { host <- DBI::dbGetInfo(conn)$host logger::log_warn( - "The database connection to {dbname} on {host} was - closed automatically because the calling environment was closed." + "The database connection to ", + dbname, + " on ", + host, + " was closed automatically ", + "because the calling environment was closed." 
) try(close_fun(conn)) } diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 6455bd3..8c8a2b3 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -46,7 +46,10 @@ get_factset_entity_info <- factset_entity_id__factset_sector_desc <- factset_entity_id__sector_code %>% - dplyr::left_join(factset_sector_code__factset_sector_desc, by = c("sector_code" = "factset_sector_code")) %>% + dplyr::left_join( + factset_sector_code__factset_sector_desc, + by = c("sector_code" = "factset_sector_code") + ) %>% dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") @@ -63,20 +66,35 @@ get_factset_entity_info <- factset_entity_id__factset_industry_desc <- factset_entity_id__industry_code %>% - dplyr::left_join(factset_industry_code_factset_industry_desc, by = c("industry_code" = "factset_industry_code")) %>% - dplyr::select("factset_entity_id", "industry_code", "factset_industry_desc") + dplyr::left_join( + factset_industry_code_factset_industry_desc, + by = c("industry_code" = "factset_industry_code") + ) %>% + dplyr::select( + "factset_entity_id", + "industry_code", + "factset_industry_desc" + ) # credit risk parent ------------------------------------------------------- logger::log_trace("Accessing entity credit risk parent.") - ent_v1_ent_entity_affiliates <- dplyr::tbl(factset_db, "ent_v1_ent_entity_affiliates") - ref_v2_affiliate_type_map <- dplyr::tbl(factset_db, "ref_v2_affiliate_type_map") + ent_v1_ent_entity_affiliates <- dplyr::tbl( + factset_db, + "ent_v1_ent_entity_affiliates" + ) + ref_v2_affiliate_type_map <- dplyr::tbl( + factset_db, + "ref_v2_affiliate_type_map" + ) ent_entity_affiliates_last_update <- dplyr::tbl(factset_db, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% - dplyr::filter(.data$begin_time == max(.data$begin_time, na.rm = TRUE)) %>% + dplyr::filter( + .data$begin_time == max(.data$begin_time, na.rm = TRUE) + ) %>% dplyr::pull("begin_time") 
factset_entity_id__credit_parent_id <- @@ -87,7 +105,9 @@ get_factset_entity_info <- factset_entity_id = "factset_affiliated_entity_id", credit_parent_id = "factset_entity_id" ) %>% - dplyr::mutate(ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update) + dplyr::mutate( + ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update + ) # merge and collect -------------------------------------------------------- @@ -95,10 +115,22 @@ get_factset_entity_info <- logger::log_trace("Merging entity info.") entity_info <- factset_entity_id__entity_proper_name %>% - dplyr::left_join(factset_entity_id__iso_country, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__factset_sector_desc, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__factset_industry_desc, by = "factset_entity_id") %>% - dplyr::left_join(factset_entity_id__credit_parent_id, by = "factset_entity_id") + dplyr::left_join( + factset_entity_id__iso_country, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__factset_sector_desc, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__factset_industry_desc, + by = "factset_entity_id" + ) %>% + dplyr::left_join( + factset_entity_id__credit_parent_id, + by = "factset_entity_id" + ) logger::log_trace("Downloading merged entity info from database.") entity_info <- dplyr::collect(entity_info) From c4f788af15f3155e33f02ce6885e01f63c5d9b4a Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 17:56:53 +0100 Subject: [PATCH 06/33] Add overall exporting function --- Dockerfile | 2 +- NAMESPACE | 1 + R/export_pacta_files.R | 24 ++++++++++++++++++++++++ man/export_pacta_files.Rd | 21 +++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 R/export_pacta_files.R create mode 100644 man/export_pacta_files.Rd diff --git a/Dockerfile b/Dockerfile index ac67d99..ab60f34 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,4 
+58,4 @@ RUN Rscript -e "\ USER runner-workflow-factset # set default run behavior -CMD ["input_dir/default_config.json"] +CMD ["Rscript", "-e", "workflow.factset::export_pacta_files()"] diff --git a/NAMESPACE b/NAMESPACE index f4f7aae..19bc627 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,5 @@ # Generated by roxygen2: do not edit by hand +export(export_pacta_files) export(get_factset_entity_info) importFrom(dplyr,"%>%") diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R new file mode 100644 index 0000000..733e0eb --- /dev/null +++ b/R/export_pacta_files.R @@ -0,0 +1,24 @@ +#' Export files for use in PACTA data preparation +#' +#' @param Destination directory for the output files +#' +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return NULL +#' +#' @export + +export_pacta_files <- function( + destination = file.path("."), + data_timestamp = Sys.time(), + ... +) { + + factset_entity_info_path <- file.path(destination, "factset_entity_info.rds") + logger::log_info("Fetching entity info data... ") + entity_info <- get_factset_entity_info(...) + saveRDS(object = entity_info, file = factset_entity_info_path) + + return(invisible(NULL)) +} diff --git a/man/export_pacta_files.Rd b/man/export_pacta_files.Rd new file mode 100644 index 0000000..7977cfa --- /dev/null +++ b/man/export_pacta_files.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export_pacta_files.R +\name{export_pacta_files} +\alias{export_pacta_files} +\title{Export files for use in PACTA data preparation} +\usage{ +export_pacta_files( + destination = file.path("."), + data_timestamp = Sys.time(), + ... 
+) } \arguments{ \item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for specifying database connection parameters)} \item{Destination}{directory for the output files} } \description{ Export files for use in PACTA data preparation } From 0bc81eae941ba9d6e394b255c29ea857b51639d3 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 22:40:39 +0100 Subject: [PATCH 07/33] install package deps separate from package this allows us to leverage the build cache --- Dockerfile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ab60f34..1d8a5d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,14 @@ RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_ # install packages for dependency resolution and installation && Rscript -e "install.packages(c('pak', 'jsonlite'))" +# Install R dependencies +COPY DESCRIPTION /workflow.factset/DESCRIPTION + +# install R package dependencies +RUN Rscript -e "\ + deps <- pak::local_install_deps(root = '/workflow.factset'); \ + " + # copy in everything from this repo COPY . 
/workflow.factset @@ -58,4 +66,4 @@ RUN Rscript -e "\ USER runner-workflow-factset # set default run behavior -CMD ["Rscript", "-e", "workflow.factset::export_pacta_files()"] +CMD ["Rscript", "-e", "logger::log_threshold(Sys.getenv('LOG_LEVEL', 'INFO'));workflow.factset::export_pacta_files()"] From fce4809e907856e403387c0d12af210fc785fb38 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 12 Dec 2023 22:43:36 +0100 Subject: [PATCH 08/33] Pull arguments for exporting function from envvars --- .gitignore | 1 + R/connect_factset_db.R | 12 ++++---- R/export_pacta_files.R | 63 ++++++++++++++++++++++++++++++++++++++---- README.md | 7 +++++ example.env | 7 +++++ 5 files changed, 79 insertions(+), 11 deletions(-) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 example.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c49bd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index fbd66e2..8b44a3f 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -2,13 +2,13 @@ connect_factset_db <- function( - dbname = "delta", - host = "data-eval-db.postgres.database.azure.com", - port = 5432L, + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), options = "-c search_path=fds", - username = Sys.getenv("R_DATABASE_USER"), - password = Sys.getenv("R_DATABASE_PASSWORD"), - keyring_service_name = "2dii_factset_database") { + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD"), + keyring_service_name = "factset_database") { if (username == "") { logger::log_error( diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 733e0eb..30b3745 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -10,15 +10,68 @@ #' @export export_pacta_files <- function( - destination = file.path("."), - data_timestamp = Sys.time(), + destination = 
file.path(Sys.getenv("EXPORT_DESTINATION")), + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), ... ) { - factset_entity_info_path <- file.path(destination, "factset_entity_info.rds") - logger::log_info("Fetching entity info data... ") + # Prepare output directories + + if (!dir.exists(destination)) { + logger::log_error( + "The destination directory {destination} does not exist." + ) + stop("Destination directory does not exist.") + } + + if (Sys.getenv("DEPLOY_START_TIME") == "") { + logger::log_warn( + "The environment variable DEPLOY_START_TIME is not set. ", + "Using current system time as start time." + ) + } + + start_time <- Sys.getenv( + "DEPLOY_START_TIME", + format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), + ) + + if (inherits(data_timestamp, "character")) { + data_timestamp <- lubridate::ymd_hms( + data_timestamp, + quiet = TRUE, + tz = "UTC", + truncated = 3 + ) + } + + if (inherits(data_timestamp, "POSIXct")) { + data_timestamp <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + } + + export_dir <- file.path( + destination, + paste0(data_timestamp, "_pulled", start_time) + ) + + if (!dir.exists(export_dir)) { + dir.create(export_dir, recursive = TRUE) + } + + # Start Extracting Data + + factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") + logger::log_info("Fetching entity info data.") entity_info <- get_factset_entity_info(...) 
+ logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) - return(invisible(NULL)) + log_info("Done with data export.") + return( + invisible( + list( + factset_entity_info_path = factset_entity_info_path + ) + ) + ) } diff --git a/README.md b/README.md new file mode 100644 index 0000000..e1038c5 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# workflow.pacta + +## Running container + +```sh +docker run -i -t --rm --env-file=.env -v ./foo:/mnt/factset-data IMAGE_NAME +``` diff --git a/example.env b/example.env new file mode 100644 index 0000000..e615517 --- /dev/null +++ b/example.env @@ -0,0 +1,7 @@ +DEPLOY_START_TIME=20000101T000001 +EXPORT_DESTINATION=/mnt/factset-data +LOG_LEVEL=TRACE +PGDATABASE=FDS +PGHOST=postgres.example.com +PGPASSWORD=SuperSecrtPassw0rd +PGUSER=postgres From 6dc886200271d17742a91a6f3d057b7a80d9f09d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 16:39:22 +0100 Subject: [PATCH 09/33] improve error logging --- R/export_pacta_files.R | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 30b3745..17780c6 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -31,7 +31,7 @@ export_pacta_files <- function( ) } - start_time <- Sys.getenv( + start_time_chr <- Sys.getenv( "DEPLOY_START_TIME", format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), ) @@ -46,12 +46,19 @@ export_pacta_files <- function( } if (inherits(data_timestamp, "POSIXct")) { - data_timestamp <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + data_timestamp_chr <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + } else { + logger::log_error( + "The data_timestamp argument must be a POSIXct object ", + "or a character string coercible to POSIXct format", + " (using lubridate::ymd_hms(truncated = 3))." 
+ ) + stop("Invalid data_timestamp argument.") } export_dir <- file.path( destination, - paste0(data_timestamp, "_pulled", start_time) + paste0(data_timestamp_chr, "_pulled", start_time_chr) ) if (!dir.exists(export_dir)) { @@ -66,7 +73,7 @@ export_pacta_files <- function( logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) - log_info("Done with data export.") + logger::log_info("Done with data export.") return( invisible( list( From 8891ad4f1a07de34bf09c193a90e2cf8028e77bd Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 17:17:24 +0100 Subject: [PATCH 10/33] Add Azure deploy Template --- .gitignore | 1 + README.md | 10 +++ azure-deploy.json | 160 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 azure-deploy.json diff --git a/.gitignore b/.gitignore index 4c49bd7..e88cb47 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .env +azure-deploy.parameters.json diff --git a/README.md b/README.md index e1038c5..1a81e56 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,13 @@ ```sh docker run -i -t --rm --env-file=.env -v ./foo:/mnt/factset-data IMAGE_NAME ``` + +```sh +# change this value as needed. +RESOURCEGROUP="myResourceGroup" + +# run from repo root + +az deployment group create --resource-group "$RESOURCEGROUP" --template-file azure-deploy.json --parameters @azure-deploy.parameters.json + +``` diff --git a/azure-deploy.json b/azure-deploy.json new file mode 100644 index 0000000..443cab3 --- /dev/null +++ b/azure-deploy.json @@ -0,0 +1,160 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "0.0.0.5", + + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Location for all resources." 
+ } + }, + "identity": { + "type": "string", + "metadata": { + "description": "The ID of the user assigned identity to use for the container group." + } + }, + "containerGroupName": { + "type": "string", + "metadata": { + "description": "The name of the container group." + } + }, + "restartPolicy": { + "type": "string", + "defaultValue": "OnFailure", + "allowedValues": [ + "Always", + "Never", + "OnFailure" + ], + "metadata": { + "description": "The behavior of Azure runtime if container has stopped." + } + }, + "rawdata-storageaccountkey": { + "type": "securestring", + "metadata": { + "description": "The storage account key for the rawdata storage account." + } + }, + "database-password": { + "type": "securestring", + "metadata": { + "description": "password to connect to database" + } + }, + "starttime": { + "type": "string", + "defaultValue": "[utcNow()]", + "metadata": { + "description": "The time to start the container group." + } + } + }, + + "variables": { + "PGDATABASE": "FDS", + "PGHOST": "[concat('factset-01-postgres', '.postgres.database.azure.com')]", + "PGUSER": "postgres", + "containerregistry": "ghcr.io/rmi-pacta", + "machineCpuCores": 1, + "machineMemoryInGB": 4, + "mountPathExport": "/mnt/factset-extracted" + }, + + "functions": [], + + "resources": [ + { + "type": "Microsoft.ContainerInstance/containerGroups", + "apiVersion": "2021-09-01", + "name": "[parameters('containerGroupName')]", + "location": "[parameters('location')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[parameters('identity')]": {} + } + }, + "properties": { + "containers": [ + { + "name": "loader-runner", + "properties": { + "image": "[concat(variables('containerregistry'),'/workflow.factset:pr1')]", + "ports": [], + "resources": { + "requests": { + "cpu": "[variables('machineCpuCores')]", + "memoryInGB": "[variables('machineMemoryInGB')]" + } + }, + "environmentVariables": [ + { + "name": "PGUSER", + "value": "[variables('PGUSER')]" + }, + { + 
"name": "PGPASSWORD", + "secureValue": "[parameters('database-password')]" + }, + { + "name": "PGHOST", + "value": "[variables('PGHOST')]" + }, + { + "name": "PGDATABASE", + "value": "[variables('PGDATABASE')]" + }, + { + "name": "DEPLOY_START_TIME", + "value": "[parameters('starttime')]" + }, + { + "name": "MACHINE_CORES", + "value": "[variables('machineCpuCores')]" + }, + { + "name": "LOG_LEVEL", + "value": "TRACE" + }, + { + "name": "EXPORT_DESTINATION", + "value": "[variables('mountPathExport')]" + }, + { + "name": "DATA_TIMESTAMP", + "value": "20230123" + } + + ], + "volumeMounts": [ + { + "name": "factset-extracted", + "mountPath": "[variables('mountPathExport')]" + } + ] + } + } + ], + "restartPolicy": "[parameters('restartPolicy')]", + "osType": "Linux", + "volumes": [ + { + "name": "factset-extracted", + "azureFile": { + "shareName": "factset-extracted", + "readOnly": false, + "storageAccountName": "pactarawdata", + "storageAccountKey": "[parameters('rawdata-storageaccountkey')]" + } + } + ] + } + } + ], + "outputs": {} +} From 841cf6561365567aa415f2b30e3994f6b5f7898d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 13 Dec 2023 17:18:55 +0100 Subject: [PATCH 11/33] linting --- R/export_pacta_files.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 17780c6..8d1dec5 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -34,7 +34,7 @@ export_pacta_files <- function( start_time_chr <- Sys.getenv( "DEPLOY_START_TIME", format(Sys.time(), format = "%Y%m%dT%H%M%S", tz = "UTC"), - ) + ) if (inherits(data_timestamp, "character")) { data_timestamp <- lubridate::ymd_hms( @@ -46,7 +46,11 @@ export_pacta_files <- function( } if (inherits(data_timestamp, "POSIXct")) { - data_timestamp_chr <- format(data_timestamp, format = "%Y%m%dT%H%M%S", tz = "UTC") + data_timestamp_chr <- format( + data_timestamp, + format = "%Y%m%dT%H%M%S", + tz = "UTC" + ) } else { 
logger::log_error( "The data_timestamp argument must be a POSIXct object ", @@ -59,7 +63,7 @@ export_pacta_files <- function( export_dir <- file.path( destination, paste0(data_timestamp_chr, "_pulled", start_time_chr) - ) + ) if (!dir.exists(export_dir)) { dir.create(export_dir, recursive = TRUE) From b52af91b8181e4441fed42262d34324da8811790 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 09:19:37 +0100 Subject: [PATCH 12/33] Copy get_factset_financial_data copy function from pacta.data.preparation --- R/get_factset_financial_data.R | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 R/get_factset_financial_data.R diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R new file mode 100644 index 0000000..dd809d5 --- /dev/null +++ b/R/get_factset_financial_data.R @@ -0,0 +1,80 @@ +#' Get the factset financial data from the FactSet database and prepare the +#' `factset_financial_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_financial_data.rds` output file +#' +#' @export + +get_factset_financial_data <- + function(data_timestamp, ...) { + # build connection to database --------------------------------------------- + + factset_db <- connect_factset_db(...) 
+ + + # fsym_id__factset_entity_id ----------------------------------------------- + + fsym_id__factset_entity_id <- + tbl(factset_db, "own_v5_own_sec_entity") %>% + select("fsym_id", "factset_entity_id") + + + # isin --------------------------------------------------------------------- + + fsym_id__isin <- tbl(factset_db, "sym_v1_sym_isin") + + + # adj_price ---------------------------------------------------------------- + + fsym_id__adj_price <- + tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::filter(.data$price_date == .env$data_timestamp) %>% + select("fsym_id", "adj_price") + + + # adj_shares_outstanding --------------------------------------------------- + + fsym_id__adj_shares_outstanding <- + tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::filter(.data$price_date == .env$data_timestamp) %>% + select("fsym_id", "adj_shares_outstanding") + + + # issue_type --------------------------------------------------------------- + + fsym_id__issue_type <- + tbl(factset_db, "own_v5_own_sec_coverage") %>% + select("fsym_id", "issue_type") + + + # one_adr_eq --------------------------------------------------------------- + + fsym_id__one_adr_eq <- + tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + select("fsym_id" = "adr_fsym_id", "one_adr_eq") + + + # merge and collect -------------------------------------------------------- + + fin_data <- + fsym_id__isin %>% + left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + left_join(fsym_id__adj_price, by = "fsym_id") %>% + left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% + left_join(fsym_id__issue_type, by = "fsym_id") %>% + left_join(fsym_id__one_adr_eq, by = "fsym_id") %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return prepared data ----------------------------------------------------- + + fin_data + } From bf22e52ca9c16fc6a3b95d24dca68afb868dba98 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:05:17 +0100 Subject: [PATCH 13/33] Add financial 
data to export --- R/export_pacta_files.R | 12 ++++++++ R/get_factset_financial_data.R | 53 +++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 8d1dec5..1bf2589 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -71,6 +71,18 @@ export_pacta_files <- function( # Start Extracting Data + factset_financial_data_path <- file.path( + export_dir, + "factset_financial_data.rds" + ) + logger::log_info("Fetching financial data.") + financial_data <- get_factset_financial_data( + data_timestamp = data_timestamp, + ... + ) + logger::log_info("Exporting financial data to {factset_financial_data_path}") + saveRDS(object = financial_data, file = factset_financial_data_path) + factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") logger::log_info("Fetching entity info data.") entity_info <- get_factset_entity_info(...) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index dd809d5..30acd20 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -17,64 +17,77 @@ get_factset_financial_data <- factset_db <- connect_factset_db(...) 
+ logger::log_debug("Extracting financial info from database.") + logger::log_info("using data timestamp: ", data_timestamp) + # fsym_id__factset_entity_id ----------------------------------------------- + logger::log_trace("Accessing entity id.") fsym_id__factset_entity_id <- - tbl(factset_db, "own_v5_own_sec_entity") %>% - select("fsym_id", "factset_entity_id") + dplyr::tbl(factset_db, "own_v5_own_sec_entity") %>% + dplyr::select("fsym_id", "factset_entity_id") # isin --------------------------------------------------------------------- - fsym_id__isin <- tbl(factset_db, "sym_v1_sym_isin") + logger::log_trace("Accessing ISINs.") + fsym_id__isin <- dplyr::tbl(factset_db, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- + browser() + logger::log_trace("Accessing share prices.") fsym_id__adj_price <- - tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% - select("fsym_id", "adj_price") + dplyr::select("fsym_id", "adj_price") # adj_shares_outstanding --------------------------------------------------- + logger::log_trace("Accessing shares outstanding.") fsym_id__adj_shares_outstanding <- - tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% - select("fsym_id", "adj_shares_outstanding") + dplyr::select("fsym_id", "adj_shares_outstanding") # issue_type --------------------------------------------------------------- + logger::log_trace("Accessing issue type.") fsym_id__issue_type <- - tbl(factset_db, "own_v5_own_sec_coverage") %>% - select("fsym_id", "issue_type") + dplyr::tbl(factset_db, "own_v5_own_sec_coverage") %>% + dplyr::select("fsym_id", "issue_type") # one_adr_eq --------------------------------------------------------------- + logger::log_trace("Accessing ADR equivilents.") fsym_id__one_adr_eq <- - 
tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% - select("fsym_id" = "adr_fsym_id", "one_adr_eq") + dplyr::tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") # merge and collect -------------------------------------------------------- + logger::log_trace("Merging financial info.") fin_data <- fsym_id__isin %>% - left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - left_join(fsym_id__adj_price, by = "fsym_id") %>% - left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% - left_join(fsym_id__issue_type, by = "fsym_id") %>% - left_join(fsym_id__one_adr_eq, by = "fsym_id") %>% - dplyr::collect() + dplyr::left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + dplyr::left_join(fsym_id__adj_price, by = "fsym_id") %>% + dplyr::left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% + dplyr::left_join(fsym_id__issue_type, by = "fsym_id") %>% + dplyr::left_join(fsym_id__one_adr_eq, by = "fsym_id") - DBI::dbDisconnect(factset_db) + logger::log_trace("Downloading merged financial info from database.") + fin_data <- dplyr::collect(fin_data) + logger::log_trace("Download complete.") + logger::log_trace("Disconnecting from database.") + DBI::dbDisconnect(factset_db) # return prepared data ----------------------------------------------------- - - fin_data + return(fin_data) } From 5842ca752d178dd15dca6f391fa466de12587fee Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:05:35 +0100 Subject: [PATCH 14/33] Increase memory request --- azure-deploy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-deploy.json b/azure-deploy.json index 443cab3..6a1a685 100644 --- a/azure-deploy.json +++ b/azure-deploy.json @@ -61,7 +61,7 @@ "PGUSER": "postgres", "containerregistry": "ghcr.io/rmi-pacta", "machineCpuCores": 1, - "machineMemoryInGB": 4, + "machineMemoryInGB": 16, "mountPathExport": "/mnt/factset-extracted" }, From 574bc0e040f4cb0f63e3be7e16b38eea4eca1022 
Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 10:28:49 +0100 Subject: [PATCH 15/33] Externalize DB connection from extraction function --- R/export_pacta_files.R | 18 +++++++++--------- R/get_factset_entity_info.R | 29 +++++++++++------------------ R/get_factset_financial_data.R | 22 ++++++++-------------- 3 files changed, 28 insertions(+), 41 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 1bf2589..feeed0d 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -2,17 +2,17 @@ #' #' @param Destination directory for the output files #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param destination path to directory where exported files will be saved +#' @param data_timestamp filter data as-of this timestamp #' -#' @return NULL +#' @return vector of paths to exported files #' #' @export export_pacta_files <- function( + conn = connect_factset_db(), destination = file.path(Sys.getenv("EXPORT_DESTINATION")), - data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), - ... + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()) ) { # Prepare output directories @@ -77,22 +77,22 @@ export_pacta_files <- function( ) logger::log_info("Fetching financial data.") financial_data <- get_factset_financial_data( - data_timestamp = data_timestamp, - ... + conn = conn, + data_timestamp = data_timestamp ) logger::log_info("Exporting financial data to {factset_financial_data_path}") saveRDS(object = financial_data, file = factset_financial_data_path) factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") logger::log_info("Fetching entity info data.") - entity_info <- get_factset_entity_info(...) 
+ entity_info <- get_factset_entity_info(conn = conn) logger::log_info("Exporting entity info data to {factset_entity_info_path}") saveRDS(object = entity_info, file = factset_entity_info_path) logger::log_info("Done with data export.") return( invisible( - list( + c( factset_entity_info_path = factset_entity_info_path ) ) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 8c8a2b3..f3939bf 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -1,8 +1,7 @@ #' Get the entity info data from the FactSet database and prepare the #' `factset_entity_info` tibble #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param conn database connection #' #' @return A tibble properly prepared to be saved as the #' `factset_entity_info.rds` output file @@ -10,18 +9,16 @@ #' @export get_factset_entity_info <- - function(...) { + function(conn) { # build connection to database --------------------------------------------- - factset_db <- connect_factset_db(...) 
- logger::log_debug("Extracting entity info from database.") # company_name ------------------------------------------------------------- logger::log_trace("Accessing entity proper names.") factset_entity_id__entity_proper_name <- - dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "entity_proper_name") @@ -29,7 +26,7 @@ get_factset_entity_info <- logger::log_trace("Accessing entity country of domicile.") factset_entity_id__iso_country <- - dplyr::tbl(factset_db, "sym_v1_sym_entity") %>% + dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "iso_country") @@ -37,11 +34,11 @@ get_factset_entity_info <- logger::log_trace("Accessing entity sector.") factset_entity_id__sector_code <- - dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") factset_sector_code__factset_sector_desc <- - dplyr::tbl(factset_db, "ref_v2_factset_sector_map") %>% + dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) factset_entity_id__factset_sector_desc <- @@ -57,11 +54,11 @@ get_factset_entity_info <- logger::log_trace("Accessing entity industry/sector/subsector.") factset_entity_id__industry_code <- - dplyr::tbl(factset_db, "sym_v1_sym_entity_sector") %>% + dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") factset_industry_code_factset_industry_desc <- - dplyr::tbl(factset_db, "ref_v2_factset_industry_map") %>% + dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") factset_entity_id__factset_industry_desc <- @@ -81,16 +78,16 @@ get_factset_entity_info <- logger::log_trace("Accessing entity credit risk parent.") ent_v1_ent_entity_affiliates <- dplyr::tbl( - factset_db, + conn, "ent_v1_ent_entity_affiliates" ) 
ref_v2_affiliate_type_map <- dplyr::tbl( - factset_db, + conn, "ref_v2_affiliate_type_map" ) ent_entity_affiliates_last_update <- - dplyr::tbl(factset_db, "fds_fds_file_history") %>% + dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% dplyr::filter( .data$begin_time == max(.data$begin_time, na.rm = TRUE) @@ -136,10 +133,6 @@ get_factset_entity_info <- entity_info <- dplyr::collect(entity_info) logger::log_trace("Download complete.") - logger::log_trace("Disconnecting from database.") - DBI::dbDisconnect(factset_db) - - # return prepared data ----------------------------------------------------- return(entity_info) } diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 30acd20..bfbe3c4 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -1,10 +1,9 @@ #' Get the factset financial data from the FactSet database and prepare the #' `factset_financial_data` tibble #' +#' @param conn database connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the #' `factset_financial_data.rds` output file @@ -12,11 +11,9 @@ #' @export get_factset_financial_data <- - function(data_timestamp, ...) { + function(conn, data_timestamp, ...) { # build connection to database --------------------------------------------- - factset_db <- connect_factset_db(...) 
- logger::log_debug("Extracting financial info from database.") logger::log_info("using data timestamp: ", data_timestamp) @@ -25,14 +22,14 @@ get_factset_financial_data <- logger::log_trace("Accessing entity id.") fsym_id__factset_entity_id <- - dplyr::tbl(factset_db, "own_v5_own_sec_entity") %>% + dplyr::tbl(conn, "own_v5_own_sec_entity") %>% dplyr::select("fsym_id", "factset_entity_id") # isin --------------------------------------------------------------------- logger::log_trace("Accessing ISINs.") - fsym_id__isin <- dplyr::tbl(factset_db, "sym_v1_sym_isin") + fsym_id__isin <- dplyr::tbl(conn, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- @@ -40,7 +37,7 @@ get_factset_financial_data <- browser() logger::log_trace("Accessing share prices.") fsym_id__adj_price <- - dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_price") @@ -49,7 +46,7 @@ get_factset_financial_data <- logger::log_trace("Accessing shares outstanding.") fsym_id__adj_shares_outstanding <- - dplyr::tbl(factset_db, "own_v5_own_sec_prices") %>% + dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_shares_outstanding") @@ -58,7 +55,7 @@ get_factset_financial_data <- logger::log_trace("Accessing issue type.") fsym_id__issue_type <- - dplyr::tbl(factset_db, "own_v5_own_sec_coverage") %>% + dplyr::tbl(conn, "own_v5_own_sec_coverage") %>% dplyr::select("fsym_id", "issue_type") @@ -66,7 +63,7 @@ get_factset_financial_data <- logger::log_trace("Accessing ADR equivilents.") fsym_id__one_adr_eq <- - dplyr::tbl(factset_db, "own_v5_own_sec_adr_ord_ratio") %>% + dplyr::tbl(conn, "own_v5_own_sec_adr_ord_ratio") %>% dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") @@ -85,9 +82,6 @@ get_factset_financial_data <- fin_data <- 
dplyr::collect(fin_data) logger::log_trace("Download complete.") - logger::log_trace("Disconnecting from database.") - DBI::dbDisconnect(factset_db) - # return prepared data ----------------------------------------------------- return(fin_data) } From 3623846dbe994b24b6c6ca26693e9c12f53ac1d9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 12:19:51 +0100 Subject: [PATCH 16/33] Update namespace to include .data and .env These are reexported form rlang to avoid lintr errors --- DESCRIPTION | 1 + NAMESPACE | 3 +++ R/workflow.factset-package.R | 2 ++ 3 files changed, 6 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 852f0ee..4a5c654 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Imports: dbplyr, dplyr, logger, + rlang, RPostgres, withr Suggests: diff --git a/NAMESPACE b/NAMESPACE index 19bc627..a26e306 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,4 +2,7 @@ export(export_pacta_files) export(get_factset_entity_info) +export(get_factset_financial_data) importFrom(dplyr,"%>%") +importFrom(rlang,.data) +importFrom(rlang,.env) diff --git a/R/workflow.factset-package.R b/R/workflow.factset-package.R index 2f30195..4293484 100644 --- a/R/workflow.factset-package.R +++ b/R/workflow.factset-package.R @@ -3,5 +3,7 @@ ## usethis namespace: start #' @importFrom dplyr %>% +#' @importFrom rlang .data +#' @importFrom rlang .env ## usethis namespace: end NULL From 4ded8ce62b282d447e340dda8aa86601dba94e7d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 13:06:30 +0100 Subject: [PATCH 17/33] Simplify variable names --- R/get_factset_financial_data.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index bfbe3c4..a076f28 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -18,10 +18,10 @@ get_factset_financial_data <- logger::log_info("using data timestamp: ", data_timestamp) - # 
fsym_id__factset_entity_id ----------------------------------------------- + # factset_entity_id ----------------------------------------------- logger::log_trace("Accessing entity id.") - fsym_id__factset_entity_id <- + factset_entity_id <- dplyr::tbl(conn, "own_v5_own_sec_entity") %>% dplyr::select("fsym_id", "factset_entity_id") @@ -29,14 +29,14 @@ get_factset_financial_data <- # isin --------------------------------------------------------------------- logger::log_trace("Accessing ISINs.") - fsym_id__isin <- dplyr::tbl(conn, "sym_v1_sym_isin") + isin <- dplyr::tbl(conn, "sym_v1_sym_isin") # adj_price ---------------------------------------------------------------- browser() logger::log_trace("Accessing share prices.") - fsym_id__adj_price <- + adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_price") @@ -45,7 +45,7 @@ get_factset_financial_data <- # adj_shares_outstanding --------------------------------------------------- logger::log_trace("Accessing shares outstanding.") - fsym_id__adj_shares_outstanding <- + adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% dplyr::select("fsym_id", "adj_shares_outstanding") @@ -54,7 +54,7 @@ get_factset_financial_data <- # issue_type --------------------------------------------------------------- logger::log_trace("Accessing issue type.") - fsym_id__issue_type <- + issue_type <- dplyr::tbl(conn, "own_v5_own_sec_coverage") %>% dplyr::select("fsym_id", "issue_type") @@ -62,7 +62,7 @@ get_factset_financial_data <- # one_adr_eq --------------------------------------------------------------- logger::log_trace("Accessing ADR equivilents.") - fsym_id__one_adr_eq <- + one_adr_eq <- dplyr::tbl(conn, "own_v5_own_sec_adr_ord_ratio") %>% dplyr::select("fsym_id" = "adr_fsym_id", "one_adr_eq") @@ -71,12 +71,12 @@ get_factset_financial_data <- 
logger::log_trace("Merging financial info.") fin_data <- - fsym_id__isin %>% - dplyr::left_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - dplyr::left_join(fsym_id__adj_price, by = "fsym_id") %>% - dplyr::left_join(fsym_id__adj_shares_outstanding, by = "fsym_id") %>% - dplyr::left_join(fsym_id__issue_type, by = "fsym_id") %>% - dplyr::left_join(fsym_id__one_adr_eq, by = "fsym_id") + isin %>% + dplyr::left_join(factset_entity_id, by = "fsym_id") %>% + dplyr::left_join(adj_price, by = "fsym_id") %>% + dplyr::left_join(adj_shares_outstanding, by = "fsym_id") %>% + dplyr::left_join(issue_type, by = "fsym_id") %>% + dplyr::left_join(one_adr_eq, by = "fsym_id") logger::log_trace("Downloading merged financial info from database.") fin_data <- dplyr::collect(fin_data) From ee5a6cd0195334289a81d7ee2a7c5d0a3f4856b1 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 13:18:12 +0100 Subject: [PATCH 18/33] simplify variable names --- R/get_factset_entity_info.R | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index f3939bf..5333dd9 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -17,7 +17,7 @@ get_factset_entity_info <- # company_name ------------------------------------------------------------- logger::log_trace("Accessing entity proper names.") - factset_entity_id__entity_proper_name <- + entity_proper_name <- dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "entity_proper_name") @@ -25,7 +25,7 @@ get_factset_entity_info <- # country_of_domicile ------------------------------------------------------ logger::log_trace("Accessing entity country of domicile.") - factset_entity_id__iso_country <- + iso_country <- dplyr::tbl(conn, "sym_v1_sym_entity") %>% dplyr::select("factset_entity_id", "iso_country") @@ -33,18 +33,18 @@ get_factset_entity_info <- # sector 
------------------------------------------------------------------- logger::log_trace("Accessing entity sector.") - factset_entity_id__sector_code <- + sector_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") - factset_sector_code__factset_sector_desc <- + sector_code__sector_desc <- dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) - factset_entity_id__factset_sector_desc <- - factset_entity_id__sector_code %>% + factset_sector_desc <- + sector_code %>% dplyr::left_join( - factset_sector_code__factset_sector_desc, + sector_code__sector_desc, by = c("sector_code" = "factset_sector_code") ) %>% dplyr::select("factset_entity_id", "sector_code", "factset_sector_desc") @@ -53,18 +53,18 @@ get_factset_entity_info <- # sub-sector/industry ------------------------------------------------------ logger::log_trace("Accessing entity industry/sector/subsector.") - factset_entity_id__industry_code <- + industry_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") - factset_industry_code_factset_industry_desc <- + industry_code__industry_desc <- dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") - factset_entity_id__factset_industry_desc <- - factset_entity_id__industry_code %>% + factset_industry_desc <- + industry_code %>% dplyr::left_join( - factset_industry_code_factset_industry_desc, + industry_code__industry_desc, by = c("industry_code" = "factset_industry_code") ) %>% dplyr::select( @@ -94,7 +94,7 @@ get_factset_entity_info <- ) %>% dplyr::pull("begin_time") - factset_entity_id__credit_parent_id <- + credit_parent_id <- ent_v1_ent_entity_affiliates %>% dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% dplyr::filter(.data$aff_type_desc == "Credit Risk Parent") %>% @@ -111,21 +111,21 @@ get_factset_entity_info <- 
logger::log_trace("Merging entity info.") entity_info <- - factset_entity_id__entity_proper_name %>% + entity_proper_name %>% dplyr::left_join( - factset_entity_id__iso_country, + iso_country, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__factset_sector_desc, + factset_sector_desc, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__factset_industry_desc, + factset_industry_desc, by = "factset_entity_id" ) %>% dplyr::left_join( - factset_entity_id__credit_parent_id, + credit_parent_id, by = "factset_entity_id" ) From 9b436d6d34a65c9e95a848a5482b617fd5ed1670 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 15:49:19 +0100 Subject: [PATCH 19/33] Terminate connection if created in function --- R/export_pacta_files.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index feeed0d..a05b46e 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -12,7 +12,11 @@ export_pacta_files <- function( conn = connect_factset_db(), destination = file.path(Sys.getenv("EXPORT_DESTINATION")), - data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()) + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), + terminate_connection = ( + # Terminate connection if it was created by this function. 
+ deparse(substitute(conn)) == formals(export_pacta_files)[["conn"]] + ) ) { # Prepare output directories @@ -90,6 +94,13 @@ export_pacta_files <- function( saveRDS(object = entity_info, file = factset_entity_info_path) logger::log_info("Done with data export.") + + # Terminate connection if needed + if (terminate_connection) { + logger::log_info("Terminating database connection.") + DBI::dbDisconnect(conn) + } + return( invisible( c( From 0a68abd59f9b3bb0441db73fcbc5b82e3e298e6d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:08:33 +0100 Subject: [PATCH 20/33] improve lgging in autofinalizing DB connection --- R/connect_factset_db.R | 154 +++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 83 deletions(-) diff --git a/R/connect_factset_db.R b/R/connect_factset_db.R index 8b44a3f..af938cd 100644 --- a/R/connect_factset_db.R +++ b/R/connect_factset_db.R @@ -1,79 +1,66 @@ -# Connection function +#' Export files for use in PACTA data preparation +#' +#' @param dbname name of the database to connect to +#' @param host hostname of the server to connect to +#' @param port port number of the server to connect to +#' @param options additional options to pass to the database connection. +#' Typically used to define schema search path. +#' @param username username to use for the database connection +#' @param password password to use for the database connection +#' +#' @return a database connection object +#' +#' @export -connect_factset_db <- - function( - dbname = Sys.getenv("PGDATABASE"), - host = Sys.getenv("PGHOST"), - port = Sys.getenv("PGPORT", 5432L), - options = "-c search_path=fds", - username = Sys.getenv("PGUSER"), - password = Sys.getenv("PGPASSWORD"), - keyring_service_name = "factset_database") { - if (username == "") { - logger::log_error( - "No database username could be found. 
", - "Please set the username as an environment variable" - ) - } +connect_factset_db <- function( + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), + options = "-c search_path=fds", + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD") +) { - if (password == "") { - # if password not defined in .env, look in systems keyring - if (requireNamespace("keyring", quietly = TRUE)) { - if ( - !username %in% keyring::key_list( - service = keyring_service_name - )$username - ) { - keyring_prompt <- paste( - "Enter password for the FactSet database", - "(it will be stored in your system's keyring):" - ) - keyring::key_set( - service = keyring_service_name, - username = username, - prompt = keyring_prompt - ) - } - password <- keyring::key_get( - service = keyring_service_name, - username = username - ) - } else if ( - interactive() && requireNamespace("rstudioapi", quietly = TRUE) - ) { - password <- rstudioapi::askForPassword( - prompt = "Please enter the FactSet database password:" - ) - } else { - logger::log_error( - "No database password could be found. ", - "Please set the password as an environment variable" - ) - } - } - - logger::log_trace( - "Connecting to database {dbname} on {host}:{port} as {username}" + if (username == "") { + logger::log_error( + "No database username could be found. ", + "Please set the username as an environment variable" ) - conn <- - DBI::dbConnect( - drv = RPostgres::Postgres(), - dbname = dbname, - host = host, - port = port, - user = username, - password = password, - options = options - ) + } - reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) + if (password == "") { + logger::log_error( + "No database password could be found. 
", + "Please set the password as an environment variable" + ) } + logger::log_trace( + "Connecting to database {dbname} on {host}:{port} as {username}" + ) + conn <- + DBI::dbConnect( + drv = RPostgres::Postgres(), + dbname = dbname, + host = host, + port = port, + user = username, + password = password, + options = options + ) + + reg_conn_finalizer(conn, DBI::dbDisconnect, parent.frame()) +} + # connection finalizer to ensure connection is closed -------------------------- # adapted from: https://shrektan.com/post/2019/07/26/create-a-database-connection-that-can-be-disconnected-automatically/ #nolint -reg_conn_finalizer <- function(conn, close_fun, envir) { +reg_conn_finalizer <- function( + conn, + close_fun, + envir +) { is_parent_global <- identical(.GlobalEnv, envir) if (isTRUE(is_parent_global)) { @@ -83,29 +70,17 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { reg.finalizer(env_finalizer, function(e) { if (DBI::dbIsValid(e$conn)) { - logger::log_warn( - "Warning: A database connection was closed automatically ", - "because the connection object was removed ", - "or the R session was closed." - ) + warn_db_autoclose(e$conn) try(close_fun(e$conn)) } - }, onexit = TRUE) + }, + onexit = TRUE + ) } else { withr::defer( { if (DBI::dbIsValid(conn)) { - dbname <- DBI::dbGetInfo(conn)$dbname - host <- DBI::dbGetInfo(conn)$host - - logger::log_warn( - "The database connection to ", - dbname, - " on ", - host, - " was closed automatically ", - "because the calling environment was closed." 
- ) + warn_db_autoclose(conn) try(close_fun(conn)) } }, @@ -117,3 +92,16 @@ reg_conn_finalizer <- function(conn, close_fun, envir) { logger::log_trace("Database connection registered for finalization") return(conn) } + +warn_db_autoclose <- function(conn) { + dbname <- DBI::dbGetInfo(conn)$dbname + host <- DBI::dbGetInfo(conn)$host + logger::log_warn( + "The database connection to ", + dbname, + " on ", + host, + " was closed automatically ", + "because the calling environment was closed." + ) +} From 705c303c5d3ba363730d5002bee606e7e4b855fe Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:23:53 +0100 Subject: [PATCH 21/33] Improve log messages --- R/get_factset_financial_data.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index a076f28..4be7c3c 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -35,7 +35,10 @@ get_factset_financial_data <- # adj_price ---------------------------------------------------------------- browser() - logger::log_trace("Accessing share prices.") + logger::log_trace( + "Accessing share prices. ", + "Filtering to date: {data_timestamp}" + ) adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% @@ -44,7 +47,10 @@ get_factset_financial_data <- # adj_shares_outstanding --------------------------------------------------- - logger::log_trace("Accessing shares outstanding.") + logger::log_trace( + "Accessing shares outstanding. 
", + "Filtering to date: {data_timestamp}" + ) adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% dplyr::filter(.data$price_date == .env$data_timestamp) %>% From f684b9477a43e5afb6028e29bd5f412edf6cafb5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:44:34 +0100 Subject: [PATCH 22/33] Improve logging --- R/get_factset_entity_info.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 5333dd9..61b90ea 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -37,10 +37,12 @@ get_factset_entity_info <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "sector_code") + logger::log_trace("Accessing sector descriptions.") sector_code__sector_desc <- dplyr::tbl(conn, "ref_v2_factset_sector_map") %>% dplyr::select(.data$factset_sector_code, .data$factset_sector_desc) + logger::log_trace("Merging sector codes and sector descriptions.") factset_sector_desc <- sector_code %>% dplyr::left_join( @@ -52,15 +54,17 @@ get_factset_entity_info <- # sub-sector/industry ------------------------------------------------------ - logger::log_trace("Accessing entity industry/sector/subsector.") + logger::log_trace("Accessing entity industry codes.") industry_code <- dplyr::tbl(conn, "sym_v1_sym_entity_sector") %>% dplyr::select("factset_entity_id", "industry_code") + logger::log_trace("Accessing industry descriptions") industry_code__industry_desc <- dplyr::tbl(conn, "ref_v2_factset_industry_map") %>% dplyr::select("factset_industry_code", "factset_industry_desc") + logger::log_trace("Merging industry codes and industry descriptions.") factset_industry_desc <- industry_code %>% dplyr::left_join( @@ -76,16 +80,19 @@ get_factset_entity_info <- # credit risk parent ------------------------------------------------------- - logger::log_trace("Accessing entity credit risk parent.") + 
logger::log_trace("Accessing entity affiliates.") ent_v1_ent_entity_affiliates <- dplyr::tbl( conn, "ent_v1_ent_entity_affiliates" ) + + logger::log_trace("Accessing affiliate type map.") ref_v2_affiliate_type_map <- dplyr::tbl( conn, "ref_v2_affiliate_type_map" ) + logger::log_trace("Determining last update time for entity affiliates.") ent_entity_affiliates_last_update <- dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% @@ -94,6 +101,7 @@ get_factset_entity_info <- ) %>% dplyr::pull("begin_time") + logger::log_trace("Determining credit risk parent via entity affiliates.") credit_parent_id <- ent_v1_ent_entity_affiliates %>% dplyr::left_join(ref_v2_affiliate_type_map, by = "aff_type_code") %>% From 3f051261429ae770d3b693589479f7f6d42cd1d7 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 16:51:30 +0100 Subject: [PATCH 23/33] Simplify variable names --- R/get_factset_entity_info.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/get_factset_entity_info.R b/R/get_factset_entity_info.R index 61b90ea..9074752 100644 --- a/R/get_factset_entity_info.R +++ b/R/get_factset_entity_info.R @@ -93,12 +93,13 @@ get_factset_entity_info <- ) logger::log_trace("Determining last update time for entity affiliates.") - ent_entity_affiliates_last_update <- + affiliates_last_update <- dplyr::tbl(conn, "fds_fds_file_history") %>% dplyr::filter(.data$table_name == "ent_entity_affiliates") %>% dplyr::filter( .data$begin_time == max(.data$begin_time, na.rm = TRUE) ) %>% + # pull also handles `collect`ing the data dplyr::pull("begin_time") logger::log_trace("Determining credit risk parent via entity affiliates.") @@ -111,7 +112,7 @@ get_factset_entity_info <- credit_parent_id = "factset_entity_id" ) %>% dplyr::mutate( - ent_entity_affiliates_last_update = .env$ent_entity_affiliates_last_update + ent_entity_affiliates_last_update = affiliates_last_update ) From 
cdbdf920cdeab51dc33f60ca05b279b2a409efb9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 17:38:34 +0100 Subject: [PATCH 24/33] Copy factset functions from pacta.data.preparation https://github.com/RMI-PACTA/pacta.data.preparation/tree/main/data (private) --- R/get_factset_entity_financing_data.R | 84 ++++++++++++++++++++++++ R/get_factset_fund_data.R | 84 ++++++++++++++++++++++++ R/get_factset_isin_to_fund_table.R | 47 ++++++++++++++ R/get_factset_iss_emissions_data.R | 93 +++++++++++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 R/get_factset_entity_financing_data.R create mode 100644 R/get_factset_fund_data.R create mode 100644 R/get_factset_isin_to_fund_table.R create mode 100644 R/get_factset_iss_emissions_data.R diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R new file mode 100644 index 0000000..e8efeac --- /dev/null +++ b/R/get_factset_entity_financing_data.R @@ -0,0 +1,84 @@ +#' Get the entity financing data from the FactSet database and prepare the +#' `factset_entity_financing_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_entity_financing_data.rds` output file +#' +#' @export + +get_factset_entity_financing_data <- function(data_timestamp, ...) { + # connect to the FactSet database -------------------------------------------- + + factset_db <- connect_factset_db(...) 
+ + year <- lubridate::year(data_timestamp) + + + # get fsym_id to fundamentals fsym_company_id -------------------------------- + + ff_fsym_id__fsym_company_id <- tbl(factset_db, "ff_v3_ff_sec_map") + + own_fsym_id__fsym_company_id <- tbl(factset_db, "own_v5_own_sec_map") + + fsym_id__fsym_company_id <- dplyr::union_all( + ff_fsym_id__fsym_company_id, + own_fsym_id__fsym_company_id + ) + + + # get fsym_id to factset_entity_id ------------------------------------------- + + ff_sec_entity <- tbl(factset_db, "ff_v3_ff_sec_entity") + + own_sec_entity <- tbl(factset_db, "own_v5_own_sec_entity") + + sec_entity <- dplyr::union_all( + ff_sec_entity, + own_sec_entity + ) + + + # get market value data ------------------------------------------------------ + + fsym_id__ff_mkt_val <- tbl(factset_db, "ff_v3_ff_basic_der_af") %>% + select("fsym_id", "date", "currency", "ff_mkt_val") + + + # get debt outstanding data -------------------------------------------------- + + fsym_id__ff_debt <- tbl(factset_db, "ff_v3_ff_basic_af") %>% + select("fsym_id", "date", "currency", "ff_debt") + + + # merge and collect the data, then disconnect -------------------------------- + + entity_financing_data <- fsym_id__ff_mkt_val %>% + dplyr::full_join(fsym_id__ff_debt, by = c("fsym_id", "date", "currency")) %>% + left_join(fsym_id__fsym_company_id, by = "fsym_id") %>% + inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% + filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% + group_by(.data$fsym_id, .data$currency) %>% + filter(.data$date <= .env$data_timestamp) %>% + filter(lubridate::year(.data$date) == .env$year) %>% + filter(.data$date == max(.data$date)) %>% + ungroup() %>% + collect() %>% + mutate( + # convert units from millions to units + ff_mkt_val = .data$ff_mkt_val * 1e6, + ff_debt = .data$ff_debt * 1e6 + ) %>% + distinct() + + DBI::dbDisconnect(factset_db) + + + # return the entity financing data ------------------------------------------- + + 
entity_financing_data +} diff --git a/R/get_factset_fund_data.R b/R/get_factset_fund_data.R new file mode 100644 index 0000000..6ad15d7 --- /dev/null +++ b/R/get_factset_fund_data.R @@ -0,0 +1,84 @@ +#' Get the fund data from the FactSet database and prepare the +#' `factset_fund_data` tibble +#' +#' @param data_timestamp A single string specifying the desired date for the +#' data in the form "2021-12-31" +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the `factset_fund_data.rds` +#' output file +#' +#' @export + +get_factset_fund_data <- + function(data_timestamp, ...) { + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) + + + # get the fund holdings and the holdings' reported market value ------------ + + factset_fund_id__holding_fsym_id <- + tbl(factset_db, "own_v5_own_fund_detail") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "fsym_id", + holding_reported_mv = "reported_mv" + ) + + + # -------------------------------------------------------------------------- + + factset_fund_id__generic_id <- + tbl(factset_db, "own_v5_own_fund_generic") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "generic_id", + holding_reported_mv = "reported_mv" + ) + + factset_fund_id__holding_fsym_id <- + dplyr::union_all( + factset_fund_id__holding_fsym_id, + factset_fund_id__generic_id + ) + + + # get the fund total reported market value --------------------------------- + + factset_fund_id__total_reported_mv <- + tbl(factset_db, "own_v5_own_ent_fund_filing_hist") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + select("factset_fund_id", "total_reported_mv") + + + # symbology containing 
the ISIN to fsym_id link + fsym_id__isin <- + tbl(factset_db, "sym_v1_sym_isin") + + + # merge and collect the data, then disconnect ------------------------------ + + fund_data <- + factset_fund_id__total_reported_mv %>% + filter(.data$total_reported_mv != 0 | !is.na(.data$total_reported_mv)) %>% + left_join(factset_fund_id__holding_fsym_id, by = "factset_fund_id") %>% + left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% + select( + factset_fund_id = "factset_fund_id", + fund_reported_mv = "total_reported_mv", + holding_isin = "isin", + holding_reported_mv = "holding_reported_mv" + ) %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the fund data ----------------------------------------------------- + + fund_data + } diff --git a/R/get_factset_isin_to_fund_table.R b/R/get_factset_isin_to_fund_table.R new file mode 100644 index 0000000..e9ef6a3 --- /dev/null +++ b/R/get_factset_isin_to_fund_table.R @@ -0,0 +1,47 @@ +#' Get the isin_to_fund_table data from the FactSet database and prepare the +#' `factset_isin_to_fund_table` tibble +#' +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_isin_to_fund_table.rds` output file +#' +#' @export + +get_factset_isin_to_fund_table <- + function(...) { + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) 
+ + + # get the ISIN to fsym_id table -------------------------------------------- + + isin__fsym_id <- + tbl(factset_db, "sym_v1_sym_isin") %>% + select("isin", "fsym_id") + + + # get the fsym_id to fund_id table ----------------------------------------- + + fsym_id__factset_fund_id <- + tbl(factset_db, "own_v5_own_ent_fund_identifiers") %>% + dplyr::filter(.data$identifier_type == "FSYM_ID") %>% + select(fsym_id = "fund_identifier", "factset_fund_id") + + + # merge and collect the data, then disconnect ------------------------------ + + isin__factset_fund_id <- + fsym_id__factset_fund_id %>% + inner_join(isin__fsym_id, by = "fsym_id") %>% + select("isin", "fsym_id", "factset_fund_id") %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the ISIN to fund_id table ----------------------------------------- + + isin__factset_fund_id + } diff --git a/R/get_factset_iss_emissions_data.R b/R/get_factset_iss_emissions_data.R new file mode 100644 index 0000000..f5102b8 --- /dev/null +++ b/R/get_factset_iss_emissions_data.R @@ -0,0 +1,93 @@ +#' Get the ISS emissions data from the FactSet database and prepare the +#' `factset_iss_emissions` tibble +#' +#' @param year A single numeric specifying the year of data to be returned +#' @param min_estimated_trust A single numeric specifying the minimum allowed +#' "estimated trust" value +#' @param min_reported_trust A single numeric specifying the minimum allowed +#' "reported trust" value +#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' specifying database connection parameters) +#' +#' @return A tibble properly prepared to be saved as the +#' `factset_iss_emissions.rds` output file +#' +#' @export + +get_factset_iss_emissions_data <- + function(year, min_estimated_trust = 0.0, min_reported_trust = 0.0, ...) 
{ + # convert `year` to date --------------------------------------------------- + year_month_date <- as.Date(paste0(year, "-01-01"), "%Y-%m-%d") + + + # connect to the FactSet database ------------------------------------------ + factset_db <- connect_factset_db(...) + + + # get the relevant fsym_id to factset_entity_id table ---------------------- + + fsym_id__factset_entity_id <- + tbl(factset_db, "icc_v2_icc_sec_entity_hist") %>% + # end_date identifies the date the identifier was last associated with fsym_id + # i.e. if there is no end_date (end_date == NA) then the association is still valid + filter(.data$end_date >= .env$year_month_date | is.na(.data$end_date)) %>% + filter(!is.na(.data$fsym_id)) %>% + filter(!is.na(.data$factset_entity_id)) %>% + select("fsym_id", "factset_entity_id") %>% + distinct() + + + # get the relevant icc_security_id to factset_entity_id table -------------- + + icc_security_id__factset_entity_id <- + tbl(factset_db, "icc_v2_icc_factset_id_map") %>% + filter(.data$provider_id_type == "icc_security_id") %>% + filter(.data$factset_id_type == "fsym_security_id") %>% + filter(!is.na(.data$factset_id)) %>% + # do not use a fsym_id that was started in the current year to avoid data + # based on a partial year + filter(.data$id_start_date < .env$year_month_date) %>% + # end_date identifies the date the identifier was last associated with fsym_id + # i.e. 
if there is no end_date (end_date == NA) then the association is still valid + filter(.data$id_end_date >= .env$year_month_date | is.na(.data$id_end_date)) %>% + select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% + inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + select("icc_security_id", "factset_entity_id") %>% + distinct() + + + # get the factset_entity_id to icc_total_emissions data -------------------- + + factset_entity_id__icc_total_emissions <- + tbl(factset_db, "icc_v2_icc_carbon_climate_core") %>% + filter(.data$icc_emissions_fiscal_year == .env$year) %>% + group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% + # icc_archive_date marks the date a data point was submitted, and some times there are updates of + # previous data submissions, so we need to filter only for the most recent submission + filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% + ungroup() %>% + group_by(.data$icc_company_id, .data$icc_emissions_fiscal_year) %>% + filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% + ungroup() %>% + filter( + .data$icc_emissions_estimated_trust > min_estimated_trust | + .data$icc_emissions_reported_trust > min_reported_trust + ) %>% + select("icc_security_id", "icc_total_emissions", "icc_scope_3_emissions") %>% + inner_join(icc_security_id__factset_entity_id, by = "icc_security_id") %>% + select("factset_entity_id", "icc_total_emissions", "icc_scope_3_emissions") + + + # collect the data, then disconnect ---------------------------------------- + + factset_entity_id__icc_total_emissions <- + factset_entity_id__icc_total_emissions %>% + dplyr::collect() + + DBI::dbDisconnect(factset_db) + + + # return the factset_entity_id to icc_total_emissions data ----------------- + + factset_entity_id__icc_total_emissions + } From 202dd36db6a67932e5468a166dd64ad7441a7895 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 14 Dec 2023 21:27:45 +0100 Subject: [PATCH 
25/33] Update Entity Financing Data function --- R/get_factset_entity_financing_data.R | 86 +++++++++++++++------------ 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R index e8efeac..e9c8358 100644 --- a/R/get_factset_entity_financing_data.R +++ b/R/get_factset_entity_financing_data.R @@ -1,42 +1,46 @@ #' Get the entity financing data from the FactSet database and prepare the #' `factset_entity_financing_data` tibble #' +#' @param conn databse connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the #' `factset_entity_financing_data.rds` output file #' #' @export -get_factset_entity_financing_data <- function(data_timestamp, ...) { - # connect to the FactSet database -------------------------------------------- - - factset_db <- connect_factset_db(...) 
- - year <- lubridate::year(data_timestamp) - - +get_factset_entity_financing_data <- function( + conn, + data_timestamp +) { # get fsym_id to fundamentals fsym_company_id -------------------------------- - ff_fsym_id__fsym_company_id <- tbl(factset_db, "ff_v3_ff_sec_map") + logger::log_debug("Extracting entity financing info from database.") + logger::log_debug("using data timestamp: ", data_timestamp) + + logger::log_trace("Accessing security map - FactSet Fundamentals.") + ff_fsym_company_id <- dplyr::tbl(conn, "ff_v3_ff_sec_map") - own_fsym_id__fsym_company_id <- tbl(factset_db, "own_v5_own_sec_map") + logger::log_trace("Accessing security map - FactSet Ownership.") + own_fsym_company_id <- dplyr::tbl(conn, "own_v5_own_sec_map") - fsym_id__fsym_company_id <- dplyr::union_all( - ff_fsym_id__fsym_company_id, - own_fsym_id__fsym_company_id + logger::log_trace("UNIONing security maps.") + fsym_company_id <- dplyr::union_all( + ff_fsym_company_id, + own_fsym_company_id ) # get fsym_id to factset_entity_id ------------------------------------------- - ff_sec_entity <- tbl(factset_db, "ff_v3_ff_sec_entity") + logger::log_trace("Accessing security to entity map - FactSet Fundamentals.") + ff_sec_entity <- dplyr::tbl(conn, "ff_v3_ff_sec_entity") - own_sec_entity <- tbl(factset_db, "own_v5_own_sec_entity") + logger::log_trace("Accessing security to entity map - FactSet Ownership.") + own_sec_entity <- dplyr::tbl(conn, "own_v5_own_sec_entity") + logger::log_trace("UNIONing security to entity maps.") sec_entity <- dplyr::union_all( ff_sec_entity, own_sec_entity @@ -45,38 +49,46 @@ get_factset_entity_financing_data <- function(data_timestamp, ...) 
{ # get market value data ------------------------------------------------------ - fsym_id__ff_mkt_val <- tbl(factset_db, "ff_v3_ff_basic_der_af") %>% - select("fsym_id", "date", "currency", "ff_mkt_val") + logger::log_trace("Accessing market value data.") + ff_mkt_val <- dplyr::tbl(conn, "ff_v3_ff_basic_der_af") %>% + dplyr::select("fsym_id", "date", "currency", "ff_mkt_val") # get debt outstanding data -------------------------------------------------- - fsym_id__ff_debt <- tbl(factset_db, "ff_v3_ff_basic_af") %>% - select("fsym_id", "date", "currency", "ff_debt") + logger::log_trace("Accessing balance sheet data.") + ff_debt <- dplyr::tbl(conn, "ff_v3_ff_basic_af") %>% + dplyr::select("fsym_id", "date", "currency", "ff_debt") # merge and collect the data, then disconnect -------------------------------- - entity_financing_data <- fsym_id__ff_mkt_val %>% - dplyr::full_join(fsym_id__ff_debt, by = c("fsym_id", "date", "currency")) %>% - left_join(fsym_id__fsym_company_id, by = "fsym_id") %>% - inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% - filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% - group_by(.data$fsym_id, .data$currency) %>% - filter(.data$date <= .env$data_timestamp) %>% - filter(lubridate::year(.data$date) == .env$year) %>% - filter(.data$date == max(.data$date)) %>% - ungroup() %>% - collect() %>% - mutate( + logger::log_trace("Merging entity financing data.") + entity_financing_data <- ff_mkt_val %>% + dplyr::full_join( + ff_debt, + by = c("fsym_id", "date", "currency") + ) %>% + dplyr::left_join(fsym_company_id, by = "fsym_id") %>% + dplyr::inner_join(sec_entity, by = c("fsym_company_id" = "fsym_id")) %>% + dplyr::filter(!(is.na(.data$ff_mkt_val) & is.na(.data$ff_debt))) %>% + dplyr::group_by(.data$fsym_id, .data$currency) %>% + dplyr::filter(.data$date <= .env$data_timestamp) %>% + dplyr::filter( + lubridate::year(.data$date) == lubridate::year(data_timestamp) + ) %>% + dplyr::filter(.data$date == max(.data$date)) %>% 
+ dplyr::ungroup() + + logger::log_trace("Downloading entity financing data.") + entity_financing_data <- entity_financing_data %>% + dplyr::collect() %>% + dplyr::mutate( # convert units from millions to units ff_mkt_val = .data$ff_mkt_val * 1e6, ff_debt = .data$ff_debt * 1e6 ) %>% - distinct() - - DBI::dbDisconnect(factset_db) - + dplyr::distinct() # return the entity financing data ------------------------------------------- From dc570c3a3a695a0b8296e6f3e7256ebe9fab5bc9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 15 Dec 2023 15:37:36 +0100 Subject: [PATCH 26/33] Convert get_factset_isin_to_fund_table to package --- R/get_factset_isin_to_fund_table.R | 53 +++++++++++++----------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/R/get_factset_isin_to_fund_table.R b/R/get_factset_isin_to_fund_table.R index e9ef6a3..d22f8c0 100644 --- a/R/get_factset_isin_to_fund_table.R +++ b/R/get_factset_isin_to_fund_table.R @@ -1,47 +1,40 @@ #' Get the isin_to_fund_table data from the FactSet database and prepare the #' `factset_isin_to_fund_table` tibble #' -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) +#' @param conn database connection #' #' @return A tibble properly prepared to be saved as the #' `factset_isin_to_fund_table.rds` output file #' #' @export -get_factset_isin_to_fund_table <- - function(...) { - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) 
+get_factset_isin_to_fund_table <- function(conn) {
+  # get the ISIN to fsym_id table --------------------------------------------
+  logger::log_info("Getting ISIN to fsym_id mapping")
+  isin <-
+    dplyr::tbl(conn, "sym_v1_sym_isin") %>%
+    dplyr::select("isin", "fsym_id")
 
-    # get the ISIN to fsym_id table --------------------------------------------
-    isin__fsym_id <-
-      tbl(factset_db, "sym_v1_sym_isin") %>%
-      select("isin", "fsym_id")
+  # get the fsym_id to fund_id table -----------------------------------------
+  logger::log_info("Getting fsym_id to fund id mapping")
+  fund_id <-
+    dplyr::tbl(conn, "own_v5_own_ent_fund_identifiers") %>%
+    dplyr::filter(.data$identifier_type == "FSYM_ID") %>%
+    dplyr::select(fsym_id = "fund_identifier", "factset_fund_id")
 
-    # get the fsym_id to fund_id table -----------------------------------------
-    fsym_id__factset_fund_id <-
-      tbl(factset_db, "own_v5_own_ent_fund_identifiers") %>%
-      dplyr::filter(.data$identifier_type == "FSYM_ID") %>%
-      select(fsym_id = "fund_identifier", "factset_fund_id")
+  # merge and collect the data ------------------------------
+  logger::log_info("Merging ISIN to fsym_id and fsym_id to fund_id")
+  isin__factset_fund_id <-
+    fund_id %>%
+    dplyr::inner_join(isin, by = "fsym_id") %>%
+    dplyr::select("isin", "fsym_id", "factset_fund_id") %>%
+    dplyr::collect()
 
-    # merge and collect the data, then disconnect ------------------------------
-
-    isin__factset_fund_id <-
-      fsym_id__factset_fund_id %>%
-      inner_join(isin__fsym_id, by = "fsym_id") %>%
-      select("isin", "fsym_id", "factset_fund_id") %>%
-      dplyr::collect()
-
-    DBI::dbDisconnect(factset_db)
-
-
-    # return the ISIN to fund_id table -----------------------------------------
-
-    isin__factset_fund_id
-  }
+  # return the ISIN to fund_id table -----------------------------------------
+  return(isin__factset_fund_id)
+}

From d7d5ef2b58d4e12696c5841a4721c915c42c971b Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Mon, 18 Dec 2023 16:34:54 +0100
Subject: [PATCH 27/33]
Update fund data function --- R/get_factset_fund_data.R | 160 +++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/R/get_factset_fund_data.R b/R/get_factset_fund_data.R index 6ad15d7..b0ef9a8 100644 --- a/R/get_factset_fund_data.R +++ b/R/get_factset_fund_data.R @@ -1,84 +1,98 @@ #' Get the fund data from the FactSet database and prepare the #' `factset_fund_data` tibble #' +#' @param conn databse connection #' @param data_timestamp A single string specifying the desired date for the #' data in the form "2021-12-31" -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for -#' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the `factset_fund_data.rds` #' output file #' #' @export -get_factset_fund_data <- - function(data_timestamp, ...) { - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) - - - # get the fund holdings and the holdings' reported market value ------------ - - factset_fund_id__holding_fsym_id <- - tbl(factset_db, "own_v5_own_fund_detail") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select( - factset_fund_id = "factset_fund_id", - holding_fsym_id = "fsym_id", - holding_reported_mv = "reported_mv" - ) - - - # -------------------------------------------------------------------------- - - factset_fund_id__generic_id <- - tbl(factset_db, "own_v5_own_fund_generic") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select( - factset_fund_id = "factset_fund_id", - holding_fsym_id = "generic_id", - holding_reported_mv = "reported_mv" - ) - - factset_fund_id__holding_fsym_id <- - dplyr::union_all( - factset_fund_id__holding_fsym_id, - factset_fund_id__generic_id - ) - - - # get the fund total reported market value --------------------------------- - - factset_fund_id__total_reported_mv <- - tbl(factset_db, 
"own_v5_own_ent_fund_filing_hist") %>% - dplyr::filter(.data$report_date == .env$data_timestamp) %>% - select("factset_fund_id", "total_reported_mv") - - - # symbology containing the ISIN to fsym_id link - fsym_id__isin <- - tbl(factset_db, "sym_v1_sym_isin") - - - # merge and collect the data, then disconnect ------------------------------ - - fund_data <- - factset_fund_id__total_reported_mv %>% - filter(.data$total_reported_mv != 0 | !is.na(.data$total_reported_mv)) %>% - left_join(factset_fund_id__holding_fsym_id, by = "factset_fund_id") %>% - left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% - select( - factset_fund_id = "factset_fund_id", - fund_reported_mv = "total_reported_mv", - holding_isin = "isin", - holding_reported_mv = "holding_reported_mv" - ) %>% - dplyr::collect() - - DBI::dbDisconnect(factset_db) - - - # return the fund data ----------------------------------------------------- - - fund_data - } +get_factset_fund_data <- function(conn, data_timestamp) { + # get the fund holdings and the holdings' reported market value ------------ + + logger::log_debug("Extracting financial info from database.") + logger::log_info("using data timestamp: ", data_timestamp) + + logger::log_trace( + "Accessing historical fund holdings - security level. ", + "Filtering to date: {data_timestamp}" + ) + fund_security <- + dplyr::tbl(conn, "own_v5_own_fund_detail") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "fsym_id", + holding_reported_mv = "reported_mv" + ) + + logger::log_trace( + "Accessing historical fund holdings - non-securities. 
", + "Filtering to date: {data_timestamp}" + ) + fund_nonsecurity <- + dplyr::tbl(conn, "own_v5_own_fund_generic") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + holding_fsym_id = "generic_id", + holding_reported_mv = "reported_mv" + ) + + logger::log_trace( + "Combining historical fund holdings - security and non-security." + ) + fund_holding <- + dplyr::union_all( + fund_security, + fund_nonsecurity + ) + + + # get the fund total reported market value --------------------------------- + + logger::log_trace( + "Accessing historical fund filings.", + "Filtering to date: {data_timestamp}" + ) + fund_mv <- + dplyr::tbl(conn, "own_v5_own_ent_fund_filing_hist") %>% + dplyr::filter(.data$report_date == .env$data_timestamp) %>% + dplyr::select("factset_fund_id", "total_reported_mv") + + + logger::log_trace( + "Accessing current ISIN mappings.", + ) + # symbology containing the ISIN to fsym_id link + fsym_id__isin <- + dplyr::tbl(conn, "sym_v1_sym_isin") + + + # merge and collect the data, then disconnect ------------------------------ + + logger::log_trace("Merging the data.") + fund_data <- + fund_mv %>% + dplyr::filter( + .data$total_reported_mv != 0 | !is.na(.data$total_reported_mv) + ) %>% + dplyr::left_join(fund_holding, by = "factset_fund_id") %>% + dplyr::left_join(fsym_id__isin, by = c(`holding_fsym_id` = "fsym_id")) %>% + dplyr::select( + factset_fund_id = "factset_fund_id", + fund_reported_mv = "total_reported_mv", + holding_isin = "isin", + holding_reported_mv = "holding_reported_mv" + ) + + logger::log_trace("Downloading fund data.") + fund_data <- dplyr::collect(fund_data) + + # return the fund data ----------------------------------------------------- + + return(fund_data) +} From e8d85cbefa4b782f1ec159fed4f125b824bd526c Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 18:14:51 +0100 Subject: [PATCH 28/33] update ISS emissions function --- 
R/get_factset_iss_emissions_data.R | 171 ++++++++++++++++------------- 1 file changed, 93 insertions(+), 78 deletions(-) diff --git a/R/get_factset_iss_emissions_data.R b/R/get_factset_iss_emissions_data.R index f5102b8..38af01c 100644 --- a/R/get_factset_iss_emissions_data.R +++ b/R/get_factset_iss_emissions_data.R @@ -1,12 +1,13 @@ #' Get the ISS emissions data from the FactSet database and prepare the #' `factset_iss_emissions` tibble #' +#' @param conn databse connection #' @param year A single numeric specifying the year of data to be returned #' @param min_estimated_trust A single numeric specifying the minimum allowed #' "estimated trust" value #' @param min_reported_trust A single numeric specifying the minimum allowed #' "reported trust" value -#' @param ... Arguments to be passed to the `connect_factset_db()` function (for +#' @param ... Arguments to be passed to the `connect_conn()` function (for #' specifying database connection parameters) #' #' @return A tibble properly prepared to be saved as the @@ -14,80 +15,94 @@ #' #' @export -get_factset_iss_emissions_data <- - function(year, min_estimated_trust = 0.0, min_reported_trust = 0.0, ...) { - # convert `year` to date --------------------------------------------------- - year_month_date <- as.Date(paste0(year, "-01-01"), "%Y-%m-%d") - - - # connect to the FactSet database ------------------------------------------ - factset_db <- connect_factset_db(...) - - - # get the relevant fsym_id to factset_entity_id table ---------------------- - - fsym_id__factset_entity_id <- - tbl(factset_db, "icc_v2_icc_sec_entity_hist") %>% - # end_date identifies the date the identifier was last associated with fsym_id - # i.e. 
if there is no end_date (end_date == NA) then the association is still valid - filter(.data$end_date >= .env$year_month_date | is.na(.data$end_date)) %>% - filter(!is.na(.data$fsym_id)) %>% - filter(!is.na(.data$factset_entity_id)) %>% - select("fsym_id", "factset_entity_id") %>% - distinct() - - - # get the relevant icc_security_id to factset_entity_id table -------------- - - icc_security_id__factset_entity_id <- - tbl(factset_db, "icc_v2_icc_factset_id_map") %>% - filter(.data$provider_id_type == "icc_security_id") %>% - filter(.data$factset_id_type == "fsym_security_id") %>% - filter(!is.na(.data$factset_id)) %>% - # do not use a fsym_id that was started in the current year to avoid data - # based on a partial year - filter(.data$id_start_date < .env$year_month_date) %>% - # end_date identifies the date the identifier was last associated with fsym_id - # i.e. if there is no end_date (end_date == NA) then the association is still valid - filter(.data$id_end_date >= .env$year_month_date | is.na(.data$id_end_date)) %>% - select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% - inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% - select("icc_security_id", "factset_entity_id") %>% - distinct() - - - # get the factset_entity_id to icc_total_emissions data -------------------- - - factset_entity_id__icc_total_emissions <- - tbl(factset_db, "icc_v2_icc_carbon_climate_core") %>% - filter(.data$icc_emissions_fiscal_year == .env$year) %>% - group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% - # icc_archive_date marks the date a data point was submitted, and some times there are updates of - # previous data submissions, so we need to filter only for the most recent submission - filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% - ungroup() %>% - group_by(.data$icc_company_id, .data$icc_emissions_fiscal_year) %>% - filter(.data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE)) %>% - ungroup() 
%>% - filter( - .data$icc_emissions_estimated_trust > min_estimated_trust | - .data$icc_emissions_reported_trust > min_reported_trust - ) %>% - select("icc_security_id", "icc_total_emissions", "icc_scope_3_emissions") %>% - inner_join(icc_security_id__factset_entity_id, by = "icc_security_id") %>% - select("factset_entity_id", "icc_total_emissions", "icc_scope_3_emissions") - - - # collect the data, then disconnect ---------------------------------------- - - factset_entity_id__icc_total_emissions <- - factset_entity_id__icc_total_emissions %>% - dplyr::collect() - - DBI::dbDisconnect(factset_db) - - - # return the factset_entity_id to icc_total_emissions data ----------------- - - factset_entity_id__icc_total_emissions - } +get_factset_iss_emissions_data <- function( + conn, + reporting_year, + min_estimated_trust = 0.0, + min_reported_trust = 0.0 +) { + # convert `year` to date --------------------------------------------------- + sql_filter_date <- as.Date(paste0(reporting_year, "-01-01"), "%Y-%m-%d") + + # get the relevant fsym_id to factset_entity_id table ---------------------- + fsym_id__factset_entity_id <- + dplyr::tbl(conn, "icc_v2_icc_sec_entity_hist") %>% + # end_date identifies the date the identifier was last associated with + # fsym_id i.e. 
if there is no end_date (end_date == NA) then the + # association is still valid + dplyr::filter( + .data$end_date >= sql_filter_date | is.na(.data$end_date) + ) %>% + dplyr::filter(!is.na(.data$fsym_id)) %>% + dplyr::filter(!is.na(.data$factset_entity_id)) %>% + dplyr::select("fsym_id", "factset_entity_id") %>% + dplyr::distinct() + + + # get the relevant icc_security_id to factset_entity_id table -------------- + + icc_security_id <- + dplyr::tbl(conn, "icc_v2_icc_factset_id_map") %>% + dplyr::filter(.data$provider_id_type == "icc_security_id") %>% + dplyr::filter(.data$factset_id_type == "fsym_security_id") %>% + dplyr::filter(!is.na(.data$factset_id)) %>% + # do not use a fsym_id that was started in the current year to avoid data + # based on a partial year + dplyr::filter(.data$id_start_date < sql_filter_date) %>% + # end_date identifies the date the identifier was last associated with + # fsym_id i.e. if there is no end_date (end_date == NA) then the + # association is still valid + dplyr::filter( + .data$id_end_date >= sql_filter_date | is.na(.data$id_end_date) + ) %>% + dplyr::select(icc_security_id = "provider_id", fsym_id = "factset_id") %>% + dplyr::inner_join(fsym_id__factset_entity_id, by = "fsym_id") %>% + dplyr::select("icc_security_id", "factset_entity_id") %>% + dplyr::distinct() + + + # get the factset_entity_id to icc_total_emissions data -------------------- + + icc_total_emissions <- + dplyr::tbl(conn, "icc_v2_icc_carbon_climate_core") %>% + dplyr::filter(.data$icc_emissions_fiscal_year == .env$reporting_year) %>% + dplyr::group_by(.data$icc_security_id, .data$icc_emissions_fiscal_year) %>% + # icc_archive_date marks the date a data point was submitted, and some + # times there are updates of previous data submissions, so we need to + # dplyr::filter only for the most recent submission + dplyr::filter( + .data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::group_by(.data$icc_company_id, 
.data$icc_emissions_fiscal_year) %>% + dplyr::filter( + .data$icc_archive_date == max(.data$icc_archive_date, na.rm = TRUE) + ) %>% + dplyr::ungroup() %>% + dplyr::filter( + .data$icc_emissions_estimated_trust > min_estimated_trust | + .data$icc_emissions_reported_trust > min_reported_trust + ) %>% + dplyr::select( + "icc_security_id", + "icc_total_emissions", + "icc_scope_3_emissions" + ) %>% + dplyr::inner_join(icc_security_id, by = "icc_security_id") %>% + dplyr::select( + "factset_entity_id", + "icc_total_emissions", + "icc_scope_3_emissions" + ) + + # collect the data, then disconnect ---------------------------------------- + + logger::log_trace("Downloading emissions data.") + icc_total_emissions <- + icc_total_emissions %>% + dplyr::collect() + + # return the factset_entity_id to icc_total_emissions data ----------------- + + return(icc_total_emissions) +} From 760e2e00b14423999a333498980588c380b5eb2b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:06:27 +0100 Subject: [PATCH 29/33] Add new exports to export function --- R/export_pacta_files.R | 61 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index a05b46e..30deb07 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -75,7 +75,7 @@ export_pacta_files <- function( # Start Extracting Data - factset_financial_data_path <- file.path( + financial_data_path <- file.path( export_dir, "factset_financial_data.rds" ) @@ -85,13 +85,59 @@ export_pacta_files <- function( data_timestamp = data_timestamp ) logger::log_info("Exporting financial data to {factset_financial_data_path}") - saveRDS(object = financial_data, file = factset_financial_data_path) + saveRDS(object = financial_data, file = financial_data_path) - factset_entity_info_path <- file.path(export_dir, "factset_entity_info.rds") + entity_info_path <- file.path(export_dir, "factset_entity_info.rds") 
   logger::log_info("Fetching entity info data.")
   entity_info <- get_factset_entity_info(conn = conn)
   logger::log_info("Exporting entity info data to {entity_info_path}")
-  saveRDS(object = entity_info, file = factset_entity_info_path)
+  saveRDS(object = entity_info, file = entity_info_path)
+
+  entity_financing_data_path <- file.path(
+    export_dir,
+    "factset_entity_financing_data.rds"
+  )
+  logger::log_info("Fetching entity financing data.")
+  entity_financing_data <- get_factset_entity_financing_data(
+    conn = conn,
+    data_timestamp = data_timestamp
+  )
+  logger::log_info(
+    "Exporting entity financing data to {entity_financing_data_path}"
+  )
+  saveRDS(
+    object = entity_financing_data,
+    file = entity_financing_data_path
+  )
+
+  fund_data_path <- file.path(export_dir, "factset_fund_data.rds")
+  logger::log_info("Fetching fund data.")
+  fund_data <- get_factset_fund_data(conn = conn, data_timestamp = data_timestamp)
+  logger::log_info("Exporting fund data to {fund_data_path}")
+  saveRDS(object = fund_data, file = fund_data_path)
+
+  isin_to_fund_table_path <- file.path(
+    export_dir,
+    "factset_isin_to_fund_table.rds"
+  )
+  logger::log_info("Fetching ISIN to fund table.")
+  isin_to_fund_table <- get_factset_isin_to_fund_table(conn = conn)
+  logger::log_info(
+    "Exporting ISIN to fund table to {isin_to_fund_table_path}"
+  )
+  saveRDS(object = isin_to_fund_table, file = isin_to_fund_table_path)
+
+  iss_emissions_path <- file.path(
+    export_dir,
+    "factset_iss_emissions.rds"
+  )
+  logger::log_info("Fetching ISS emissions data.")
+  iss_emissions <- get_factset_iss_emissions_data(conn = conn, reporting_year = lubridate::year(data_timestamp))
+  logger::log_info(
+    "Exporting ISS emissions data to {iss_emissions_path}"
+  )
+  saveRDS(object = iss_emissions, file = iss_emissions_path)
+
   logger::log_info("Done with data export.")
@@ -104,7 +150,12 @@ export_pacta_files <- function(
   return(
     invisible(
       c(
-      factset_entity_info_path = factset_entity_info_path
+      financial_data_path = financial_data_path,
+      
entity_info_path = entity_info_path, + entity_financing_data_path = entity_financing_data_path, + fund_data_path = fund_data_path, + isin_to_fund_table_path = isin_to_fund_table_path, + iss_emissions_path = iss_emissions_path ) ) ) From e4c84a652a8ad966fd13957413295a44c2905bc4 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:07:27 +0100 Subject: [PATCH 30/33] update documentation --- NAMESPACE | 5 ++++ man/connect_factset_db.Rd | 35 +++++++++++++++++++++++ man/export_pacta_files.Rd | 16 +++++++---- man/get_factset_entity_financing_data.Rd | 23 +++++++++++++++ man/get_factset_entity_info.Rd | 5 ++-- man/get_factset_financial_data.Rd | 23 +++++++++++++++ man/get_factset_fund_data.Rd | 23 +++++++++++++++ man/get_factset_isin_to_fund_table.Rd | 20 +++++++++++++ man/get_factset_iss_emissions_data.Rd | 36 ++++++++++++++++++++++++ 9 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 man/connect_factset_db.Rd create mode 100644 man/get_factset_entity_financing_data.Rd create mode 100644 man/get_factset_financial_data.Rd create mode 100644 man/get_factset_fund_data.Rd create mode 100644 man/get_factset_isin_to_fund_table.Rd create mode 100644 man/get_factset_iss_emissions_data.Rd diff --git a/NAMESPACE b/NAMESPACE index a26e306..2b64b62 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,13 @@ # Generated by roxygen2: do not edit by hand +export(connect_factset_db) export(export_pacta_files) +export(get_factset_entity_financing_data) export(get_factset_entity_info) export(get_factset_financial_data) +export(get_factset_fund_data) +export(get_factset_isin_to_fund_table) +export(get_factset_iss_emissions_data) importFrom(dplyr,"%>%") importFrom(rlang,.data) importFrom(rlang,.env) diff --git a/man/connect_factset_db.Rd b/man/connect_factset_db.Rd new file mode 100644 index 0000000..ee82e71 --- /dev/null +++ b/man/connect_factset_db.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/connect_factset_db.R +\name{connect_factset_db} +\alias{connect_factset_db} +\title{Export files for use in PACTA data preparation} +\usage{ +connect_factset_db( + dbname = Sys.getenv("PGDATABASE"), + host = Sys.getenv("PGHOST"), + port = Sys.getenv("PGPORT", 5432L), + options = "-c search_path=fds", + username = Sys.getenv("PGUSER"), + password = Sys.getenv("PGPASSWORD") +) +} +\arguments{ +\item{dbname}{name of the database to connect to} + +\item{host}{hostname of the server to connect to} + +\item{port}{port number of the server to connect to} + +\item{options}{additional options to pass to the database connection. +Typically used to define schema search path.} + +\item{username}{username to use for the database connection} + +\item{password}{password to use for the database connection} +} +\value{ +a database connection object +} +\description{ +Export files for use in PACTA data preparation +} diff --git a/man/export_pacta_files.Rd b/man/export_pacta_files.Rd index 7977cfa..a11143e 100644 --- a/man/export_pacta_files.Rd +++ b/man/export_pacta_files.Rd @@ -5,17 +5,23 @@ \title{Export files for use in PACTA data preparation} \usage{ export_pacta_files( - destination = file.path("."), - data_timestamp = Sys.time(), - ... 
+ conn = connect_factset_db(), + destination = file.path(Sys.getenv("EXPORT_DESTINATION")), + data_timestamp = Sys.getenv("DATA_TIMESTAMP", Sys.time()), + terminate_connection = (deparse(substitute(conn)) == + formals(export_pacta_files)[["conn"]]) ) } \arguments{ -\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for -specifying database connection parameters)} +\item{destination}{path to directory where exported files will be saved} + +\item{data_timestamp}{filter data as-of this timestamp} \item{Destination}{directory for the output files} } +\value{ +vector of paths to exported files +} \description{ Export files for use in PACTA data preparation } diff --git a/man/get_factset_entity_financing_data.Rd b/man/get_factset_entity_financing_data.Rd new file mode 100644 index 0000000..98b3598 --- /dev/null +++ b/man/get_factset_entity_financing_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_entity_financing_data.R +\name{get_factset_entity_financing_data} +\alias{get_factset_entity_financing_data} +\title{Get the entity financing data from the FactSet database and prepare the +\code{factset_entity_financing_data} tibble} +\usage{ +get_factset_entity_financing_data(conn, data_timestamp) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_entity_financing_data.rds} output file +} +\description{ +Get the entity financing data from the FactSet database and prepare the +\code{factset_entity_financing_data} tibble +} diff --git a/man/get_factset_entity_info.Rd b/man/get_factset_entity_info.Rd index b163ad8..7f5ca6a 100644 --- a/man/get_factset_entity_info.Rd +++ b/man/get_factset_entity_info.Rd @@ -5,11 +5,10 @@ \title{Get the entity info data from the FactSet database and prepare the 
\code{factset_entity_info} tibble} \usage{ -get_factset_entity_info(...) +get_factset_entity_info(conn) } \arguments{ -\item{...}{Arguments to be passed to the \code{connect_factset_db()} function (for -specifying database connection parameters)} +\item{conn}{database connection} } \value{ A tibble properly prepared to be saved as the diff --git a/man/get_factset_financial_data.Rd b/man/get_factset_financial_data.Rd new file mode 100644 index 0000000..9b13887 --- /dev/null +++ b/man/get_factset_financial_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_financial_data.R +\name{get_factset_financial_data} +\alias{get_factset_financial_data} +\title{Get the factset financial data from the FactSet database and prepare the +\code{factset_financial_data} tibble} +\usage{ +get_factset_financial_data(conn, data_timestamp, ...) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_financial_data.rds} output file +} +\description{ +Get the factset financial data from the FactSet database and prepare the +\code{factset_financial_data} tibble +} diff --git a/man/get_factset_fund_data.Rd b/man/get_factset_fund_data.Rd new file mode 100644 index 0000000..7c5c447 --- /dev/null +++ b/man/get_factset_fund_data.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_fund_data.R +\name{get_factset_fund_data} +\alias{get_factset_fund_data} +\title{Get the fund data from the FactSet database and prepare the +\code{factset_fund_data} tibble} +\usage{ +get_factset_fund_data(conn, data_timestamp) +} +\arguments{ +\item{conn}{databse connection} + +\item{data_timestamp}{A single string specifying the desired date for the +data in the form "2021-12-31"} +} +\value{ +A tibble properly prepared 
to be saved as the \code{factset_fund_data.rds} +output file +} +\description{ +Get the fund data from the FactSet database and prepare the +\code{factset_fund_data} tibble +} diff --git a/man/get_factset_isin_to_fund_table.Rd b/man/get_factset_isin_to_fund_table.Rd new file mode 100644 index 0000000..155a20a --- /dev/null +++ b/man/get_factset_isin_to_fund_table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_isin_to_fund_table.R +\name{get_factset_isin_to_fund_table} +\alias{get_factset_isin_to_fund_table} +\title{Get the isin_to_fund_table data from the FactSet database and prepare the +\code{factset_isin_to_fund_table} tibble} +\usage{ +get_factset_isin_to_fund_table(conn) +} +\arguments{ +\item{conn}{database connection} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_isin_to_fund_table.rds} output file +} +\description{ +Get the isin_to_fund_table data from the FactSet database and prepare the +\code{factset_isin_to_fund_table} tibble +} diff --git a/man/get_factset_iss_emissions_data.Rd b/man/get_factset_iss_emissions_data.Rd new file mode 100644 index 0000000..40cc055 --- /dev/null +++ b/man/get_factset_iss_emissions_data.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_factset_iss_emissions_data.R +\name{get_factset_iss_emissions_data} +\alias{get_factset_iss_emissions_data} +\title{Get the ISS emissions data from the FactSet database and prepare the +\code{factset_iss_emissions} tibble} +\usage{ +get_factset_iss_emissions_data( + conn, + reporting_year, + min_estimated_trust = 0, + min_reported_trust = 0 +) +} +\arguments{ +\item{conn}{databse connection} + +\item{min_estimated_trust}{A single numeric specifying the minimum allowed +"estimated trust" value} + +\item{min_reported_trust}{A single numeric specifying the minimum allowed +"reported trust" value} + +\item{year}{A single numeric specifying the year of 
data to be returned} + +\item{...}{Arguments to be passed to the \code{connect_conn()} function (for +specifying database connection parameters)} +} +\value{ +A tibble properly prepared to be saved as the +\code{factset_iss_emissions.rds} output file +} +\description{ +Get the ISS emissions data from the FactSet database and prepare the +\code{factset_iss_emissions} tibble +} From 78378230795f5258f04c6f25fd9d14e74c763871 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 19:39:23 +0100 Subject: [PATCH 31/33] Remove `browser()` --- R/get_factset_financial_data.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 4be7c3c..3d5577c 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -34,7 +34,6 @@ get_factset_financial_data <- # adj_price ---------------------------------------------------------------- - browser() logger::log_trace( "Accessing share prices. ", "Filtering to date: {data_timestamp}" From d3e0e6c13739679bbe996f4a3e4de659fa7feb67 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 20:38:52 +0100 Subject: [PATCH 32/33] rename function --- R/export_pacta_files.R | 2 +- R/get_factset_entity_financing_data.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/export_pacta_files.R b/R/export_pacta_files.R index 30deb07..8d4d3f4 100644 --- a/R/export_pacta_files.R +++ b/R/export_pacta_files.R @@ -98,7 +98,7 @@ export_pacta_files <- function( "factset_entity_financing_data.rds" ) logger::log_info("Fetching entity financing data.") - entity_financing_data <- get_factset_entity_financing_data( + entity_financing_data <- get_factset_financing_data( conn = conn, data_timestamp = data_timestamp ) diff --git a/R/get_factset_entity_financing_data.R b/R/get_factset_entity_financing_data.R index e9c8358..17900ce 100644 --- a/R/get_factset_entity_financing_data.R +++ b/R/get_factset_entity_financing_data.R @@ -10,7 +10,7 @@ 
#' #' @export -get_factset_entity_financing_data <- function( +get_factset_financing_data <- function( conn, data_timestamp ) { From f41ac35e8fadbaef1272e8d063fb993f62837d00 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 18 Dec 2023 20:58:53 +0100 Subject: [PATCH 33/33] Filter to most recent date, if posted in past month --- R/get_factset_financial_data.R | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/R/get_factset_financial_data.R b/R/get_factset_financial_data.R index 3d5577c..12de10a 100644 --- a/R/get_factset_financial_data.R +++ b/R/get_factset_financial_data.R @@ -11,11 +11,16 @@ #' @export get_factset_financial_data <- - function(conn, data_timestamp, ...) { + function( + conn, + data_timestamp, + data_timestamp_lookback = data_timestamp - lubridate::dmonths(1) + ) { # build connection to database --------------------------------------------- logger::log_debug("Extracting financial info from database.") logger::log_info("using data timestamp: ", data_timestamp) + logger::log_info("Looking back in data to", data_timestamp_lookback) # factset_entity_id ----------------------------------------------- @@ -38,12 +43,21 @@ get_factset_financial_data <- "Accessing share prices. ", "Filtering to date: {data_timestamp}" ) + # TODO: Optimize this query adj_price <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% - dplyr::filter(.data$price_date == .env$data_timestamp) %>% + dplyr::filter(.data$price_date <= .env$data_timestamp) %>% + dplyr::group_by(.data$fsym_id, .data$adj_price) %>% + dplyr::filter(.data$price_date == max(.data$price_date)) %>% + # TODO: CRITICAL: decision: do we want most recent price, or only for + # those that have posted in past month? 
+ dplyr::filter( + .data$price_date >= .env$data_timestamp_lookback + ) %>% dplyr::select("fsym_id", "adj_price") + # adj_shares_outstanding --------------------------------------------------- logger::log_trace( @@ -52,7 +66,14 @@ get_factset_financial_data <- ) adj_shares_outstanding <- dplyr::tbl(conn, "own_v5_own_sec_prices") %>% - dplyr::filter(.data$price_date == .env$data_timestamp) %>% + dplyr::filter(.data$price_date <= .env$data_timestamp) %>% + dplyr::group_by(.data$fsym_id, .data$adj_price) %>% + dplyr::filter(.data$price_date == max(.data$price_date)) %>% + # TODO: CRITICAL: decision: do we want most recent price, or only for + # those that have posted in past month? + dplyr::filter( + .data$price_date >= .env$data_timestamp_lookback + ) %>% dplyr::select("fsym_id", "adj_shares_outstanding")