From 362596416e99e5e9b95b0542a4cb858a55a4c633 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 10:53:37 +0100 Subject: [PATCH 01/33] add default 2023Q4 config --- config.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/config.yml b/config.yml index d677f38..bf9a8dd 100644 --- a/config.yml +++ b/config.yml @@ -112,6 +112,26 @@ default: scenario_raw_data_to_include: ["geco_2022", "ipr_2021", "isf_2021", "weo_2022"] global_aggregate_scenario_sources_list: ["WEO2022"] + +2023Q4: + masterdata_ownership_filename: "" + masterdata_debt_filename: "" + ar_company_id__factset_entity_id_filename: "" + factset_financial_data_filename: "" + factset_entity_info_filename: "" + factset_entity_financing_data_filename: "" + factset_fund_data_filename: "" + factset_isin_to_fund_table_filename: "" + factset_iss_emissions_data_filename: "" + factset_issue_code_bridge_filename: "" + imf_quarter_timestamp: "2023-Q4" + pacta_financial_timestamp: "2023Q4" + market_share_target_reference_year: 2023 + scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] + scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] + global_aggregate_scenario_sources_list: ["WEO2023"] + + desktop: inherits: 2022Q4 data_prep_outputs_path: "./outputs" From de2bd6000f417d349a05da041fe2692b765b5105 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 11:17:44 +0100 Subject: [PATCH 02/33] add `factset_industry_map_bridge_filename` --- config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yml b/config.yml index d6b32e6..e1d7121 100644 --- a/config.yml +++ b/config.yml @@ -124,6 +124,7 @@ default: factset_isin_to_fund_table_filename: "" factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" + factset_industry_map_bridge_filename: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From 58e365c5e3569e9d9190542fd082f1158683bde4 Mon Sep 17 00:00:00 2001 
From: CJ Yetman Date: Thu, 15 Feb 2024 20:33:22 +0100 Subject: [PATCH 03/33] add `factset_manual_pacta_sector_override` --- config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config.yml b/config.yml index da9318a..5b5a353 100644 --- a/config.yml +++ b/config.yml @@ -13,6 +13,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" update_currencies: TRUE export_sqlite_files: TRUE imf_quarter_timestamp: "2021-Q4" @@ -43,6 +44,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2021-Q4" pacta_financial_timestamp: "2021Q4" market_share_target_reference_year: 2021 @@ -83,6 +85,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2022-Q2" pacta_financial_timestamp: "2022Q2" market_share_target_reference_year: 2022 @@ -110,6 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 @@ -130,6 +134,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" + factset_manual_pacta_sector_override: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From 
5a2ec24a858a40ca9b4f2049e56dde34413e7fd9 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 20:35:50 +0100 Subject: [PATCH 04/33] add `_filename` suffix --- config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yml b/config.yml index 5b5a353..fefa772 100644 --- a/config.yml +++ b/config.yml @@ -13,7 +13,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" update_currencies: TRUE export_sqlite_files: TRUE imf_quarter_timestamp: "2021-Q4" @@ -44,7 +44,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2021-Q4" pacta_financial_timestamp: "2021Q4" market_share_target_reference_year: 2021 @@ -85,7 +85,7 @@ default: factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2022-Q2" pacta_financial_timestamp: "2022Q2" market_share_target_reference_year: 2022 @@ -113,7 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 @@ -134,7 +134,7 @@ default: 
factset_iss_emissions_data_filename: "" factset_issue_code_bridge_filename: "" factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override: "" + factset_manual_pacta_sector_override_filename: "" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 From a96f286fce743c0f651cf106d8cc3f97cd8496e3 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 21:19:55 +0100 Subject: [PATCH 05/33] add AI dataset filenames --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index fefa772..31eb699 100644 --- a/config.yml +++ b/config.yml @@ -123,9 +123,9 @@ default: 2023Q4: - masterdata_ownership_filename: "" - masterdata_debt_filename: "" - ar_company_id__factset_entity_id_filename: "" + masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" + masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" + ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" factset_financial_data_filename: "" factset_entity_info_filename: "" factset_entity_financing_data_filename: "" From 6e34fde39c55ac41b9d2fe88dceeebdc88884fd6 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Thu, 15 Feb 2024 21:38:29 +0100 Subject: [PATCH 06/33] change default "desktop" config to use 2023Q4 --- config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yml b/config.yml index 31eb699..ab4cdfa 100644 --- a/config.yml +++ b/config.yml @@ -144,7 +144,7 @@ default: desktop: - inherits: 2022Q4 + inherits: 2023Q4 data_prep_outputs_path: "./outputs" asset_impact_data_path: "./ai_inputs" factset_data_path: "./factset_inputs" From 965c2da51e25a3305e131e5ccfa9000dd01508ba Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Fri, 16 Feb 2024 07:50:21 +0100 Subject: [PATCH 07/33] Update from main --- 
config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yml b/config.yml index ab4cdfa..9fdfb1e 100644 --- a/config.yml +++ b/config.yml @@ -113,7 +113,7 @@ default: factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "" + factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From 00c26bc10f53f1f6790e9e1649564aa7ef15f7ab Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Sat, 17 Feb 2024 19:27:39 +0100 Subject: [PATCH 08/33] add more parameters to review --- config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config.yml b/config.yml index 9fdfb1e..b82d30e 100644 --- a/config.yml +++ b/config.yml @@ -141,6 +141,13 @@ default: scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] global_aggregate_scenario_sources_list: ["WEO2023"] + sector_list: [] + other_sector_list: [] + zero_emission_factor_techs: [] + green_techs: [] + tech_exclude: [] + scenario_geographies_list: [] + global_aggregate_sector_list: [] desktop: From 58d78fa148578394067bcd17f4208495eafdc053 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 19:39:06 +0100 Subject: [PATCH 09/33] docs(deploy): Define prerequisites Define the prerequisite steps prior to running data prep --- README.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac5d44a..561b699 100644 --- a/README.md +++ 
b/README.md @@ -1,6 +1,6 @@ # workflow.data.preparation -`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16gb depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml). +`workflow.data.preparation` orchestrates the PACTA data preparation process, combining production, financial, scenario, and currency data into a format suitable for use in a PACTA for investors analysis. Assuming that the computing resource being used has sufficient memory (which can be >16 GB depending on the inputs), storage space, and access to the necessary inputs, this is intended to work on a desktop or laptop using RStudio or run using the included [Dockerfile](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/Dockerfile) and [docker-compose.yml](https://github.com/RMI-PACTA/workflow.data.preparation/blob/main/docker-compose.yml). ## Running in RStudio @@ -12,7 +12,7 @@ Running workflow.data.preparation has a number of R package dependencies that ar To make things easier, the recommended way to specify the desired config set when running locally in RStudio is by setting the active config set to `desktop` and modifying/adding only a few of the properties in the `desktop` config set. By doing so, you benefit from inheriting many of the appropriate configuration values without having to explicitly specify each one. -You will need to set the `inherits` parameter, e.g. 
`inherits: 2022Q4`, to select which of the config sets specified in the config.yml file that is desired. +You will need to set the `inherits` parameter, e.g. `inherits: 2022Q4`, to select which of the config sets specified in the config.yml file that is desired. You will need to set `data_prep_outputs_path` to an *existing* directory where you want the outputs to be saved, e.g. `data_prep_outputs_path: "./outputs"` to point to an existing directory named `outputs` in the working directory of the R session you will be running data.prep in. This directory must exist before running data.prep (and ideally be empty). The script will throw an error early on if it does not exist. @@ -57,6 +57,26 @@ Run `docker-compose up` from the root directory, and docker will build the image Use `docker-compose build --no-cache` to force a rebuild of the Docker image. +## Running Data Preparation interactively on Azure VM + +*Instructions specific to the RMI-PACTA team's Azure instance are in Italics.* + +0. **Prerequisites:** + - Set up Storage Accounts containing the [required files](#required-input-files). + While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accident manipulation of source data. + The recommended structure (used by RMI) is: + - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)* + - File Share `workflow-data-preparation-outputs`: Outputs from this workflow. + - Storage Account: `pactarawdata` (read-only) + - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset) + - File Share `AssetImpact` Raw data files from [Asset Impact](https://asset-impact.gresb.com/) + - (Optional, but recommended) Create a User Assigned Managed Identity. 
+ Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. + * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* + - Grant Appropriate permissions to the Identity: + - `pactadatadev`: "Storage File Data SMB Share Contributor" + - `pactarawdata`: "Storage File Data SMB Share Reader" + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). From 032459e05f91b1327793f15e3a3e9dab8c5d702f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 20:55:37 +0100 Subject: [PATCH 10/33] docs(deploy): Instructions up through connecting Everything works up through creating and connecting to VM --- README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 561b699..641e340 100644 --- a/README.md +++ b/README.md @@ -62,21 +62,86 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. *Instructions specific to the RMI-PACTA team's Azure instance are in Italics.* 0. **Prerequisites:** + *These steps have been completed on the RMI Azure instance.* + - Ensure a Virtual Network with a Gateway has been set up, permitting SSH (Port 22) access. + Details of setting this up are out of scope for these instructions. + Talk to your network coordinator for help. - Set up Storage Accounts containing the [required files](#required-input-files). While all the files can exist on a single file share, in a single storage account, the workflow can access different storage accounts, to allow for read-only access to raw data, to prevent accident manipulation of source data. 
- The recommended structure (used by RMI) is: - - Storage Account: `pactadatadev`: (read/write) *RMI QAs datasets prior to moving them to PROD with[ `workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)* + The recommended structure (*used by RMI*) is: + - Storage Account: `pactadatadev`: (read/write). + Naming note: *RMI QAs datasets prior to moving them to PROD with [`workflow.pacta.data.qa`](https://github.com/RMI-PACTA/workflow.pacta.data.qa)*. - File Share `workflow-data-preparation-outputs`: Outputs from this workflow. - Storage Account: `pactarawdata` (read-only) - File Share `factset-extracted`: Outputs from [`workflow.factset`](https://github.com/RMI-PACTA/workflow.factset) - File Share `AssetImpact` Raw data files from [Asset Impact](https://asset-impact.gresb.com/) - (Optional, but recommended) Create a User Assigned Managed Identity. - Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. - * **RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* + Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* - Grant Appropriate permissions to the Identity: - `pactadatadev`: "Storage File Data SMB Share Contributor" - `pactarawdata`: "Storage File Data SMB Share Reader" +1. Start a VM. + While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency: + + ```sh + # The options here work with the RMI-PACTA team's Azure setup. + # Change values for your own instance as needed. + + # Get Network details. 
+ VNET_RESOURCE_GROUP="RMI-PROD-EU-VNET-RG" + VNET_NAME="RMI-PROD-EU-VNET" + SUBNET_NAME="RMI-SP-PACTA-DEV-VNET" + SUBNET_ID=$(az network vnet subnet show --resource-group $VNET_RESOURCE_GROUP --name $SUBNET_NAME --vnet-name $VNET_NAME --query id -o tsv) + + # Use the identity previously setup (see Prerequisites) + MACHINEIDENTITY="/subscriptions/feef729b-4584-44af-a0f9-4827075512f9/resourceGroups/RMI-SP-PACTA-PROD/providers/Microsoft.ManagedIdentity/userAssignedIdentities/workflow-data-preparation" + # This size has 2 vCPU, and 32GiB memory, recommended settings. + MACHINE_SIZE="Standard_E4-2as_v4" + # Using epoch to give machine a (probably) unique name + MACHINE_NAME="dataprep-runner-$(date +%s)" + # NOTE: Change this to your own RG as needed. + VM_RESOURCE_GROUP="RMI-SP-PACTA-DEV" + + # **NOTE: Check these options prior to running** + # Non-RMI users may choose to omit the --public-ip-address line for public SSH Access. + + az vm create \ + --admin-username azureuser \ + --assign-identity "$MACHINEIDENTITY" \ + --generate-ssh-keys \ + --image Ubuntu2204 \ + --name "$MACHINE_NAME" \ + --nic-delete-option delete \ + --os-disk-delete-option delete \ + --public-ip-address "" \ + --resource-group "$VM_RESOURCE_GROUP" \ + --size "$MACHINE_SIZE" \ + --subnet "$SUBNETID" + + ``` + + If this command successfully runs, it will output a JSON block describing the resource (VM) created. + +2. **Connect to the Network.** (Optional) + ***RMI:** Connecting to the VPN will enable SSH access.* + Connect to the Virtual Network specified above, as the comand above does not create a Public IP Address. + Details for this are out of scope for these instructions. + Contact your network coordinator for help. + +2. Connect to the newly created VM via SSH. + + ```sh + This connects to the VM created above via SSH. 
+ + az ssh vm \ + --local-user azureuser \ + --name "$MACHINE_NAME" \ + --prefer-private-ip \ + --resource-group "$VM_RESOURCE_GROUP" + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). From 080ad61552d766a5b2d7b886a11c5b24d7a40a4f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 17 Feb 2024 21:19:05 +0100 Subject: [PATCH 11/33] feat(deploy): Add mount_afs script Add a helper script to mount Azure File Shares --- scripts/mount_afs.sh | 107 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 scripts/mount_afs.sh diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh new file mode 100755 index 0000000..350388d --- /dev/null +++ b/scripts/mount_afs.sh @@ -0,0 +1,107 @@ +#! /bin/sh + +# mount an Azure File Share at a given location. +# Requires az cli to be installed and logged in. + +usage() { + echo "Usage: mount_afs.sh [-h] [-v] -r -a -f -m " + echo " -h: help (this message)" + echo " -v: verbose" + echo " -r: resource group (Required)" + echo " -a: storage account name (Required)" + echo " -f: file share name (Required)" + echo " -m: mount point (Required)" + echo " -?: help" + exit 1 +} + +while getopts "h?vr:a:f:m:" opt; do + case "$opt" in + h|\?) 
+ usage + ;; + v) VERBOSE=1 + ;; + r) RESOURCEGROUP=$OPTARG + ;; + a) STORAGEACCOUNTNAME=$OPTARG + ;; + f) FILESHARENAME=$OPTARG + ;; + m) MOUNTPOINT=$OPTARG + ;; + *) + usage + ;; + esac +done + +missing_opts=0 +if [ -z "$RESOURCEGROUP" ]; then + echo "ERROR: Resource group is required" + missing_opts=1 +fi + +if [ -z "$STORAGEACCOUNTNAME" ]; then + echo "ERROR: Storage account name is required" + missing_opts=1 +fi + +if [ -z "$FILESHARENAME" ]; then + echo "ERROR: File share name is required" + missing_opts=1 +fi + +if [ -z "$MOUNTPOINT" ]; then + echo "ERROR: Mount point is required" + missing_opts=1 +fi + +if [ $missing_opts -eq 1 ]; then + usage +fi + +if [ -n "$VERBOSE" ]; then + echo "RESOURCEGROUP: $RESOURCEGROUP" + echo "STORAGEACCOUNTNAME: $STORAGEACCOUNTNAME" + echo "FILESHARENAME: $FILESHARENAME" + echo "MOUNTPOINT: $MOUNTPOINT" +fi + +# This command assumes you have logged in with az login + +if [ -n "$VERBOSE" ]; then + echo "Getting https endpoint for storage account $STORAGEACCOUNTNAME" +fi + +httpEndpoint=$(az storage account show \ + --resource-group "$RESOURCEGROUP" \ + --name "$STORAGEACCOUNTNAME" \ + --query "primaryEndpoints.file" --output tsv | tr -d '"') +smbPath=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint})$FILESHARENAME +fileHost=$(echo "$httpEndpoint" | cut -c7-${#httpEndpoint}| tr -d "/") +nc -zvw3 "$fileHost" 445 + +if [ -n "$VERBOSE" ]; then + echo "httpEndpoint: $httpEndpoint" + echo "smbPath: $smbPath" + echo "fileHost: $fileHost" +fi + +if [ -n "$VERBOSE" ]; then + echo "Getting storage account key" +fi +storageAccountKey=$(az storage account keys list \ + --resource-group "$RESOURCEGROUP" \ + --account-name "$STORAGEACCOUNTNAME" \ + --query "[0].value" --output tsv | tr -d '"') + +if [ -n "$VERBOSE" ]; then + echo "Creating mount path: $MOUNTPOINT" +fi +sudo mkdir -p "$MOUNTPOINT" + +if [ -n "$VERBOSE" ]; then + echo "Mounting $smbPath to $MOUNTPOINT" +fi +sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o 
username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1 From 222375e6087a9608880258f662513351d01016e6 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 00:11:13 +0100 Subject: [PATCH 12/33] fix(deploy): Update script to default to read-only Mounting an Azure File Share to a linux OS via SMB defaults to read/write access. The change to the mount script default to read-only with file permissions (`0555`) Update Docs accordingly. --- README.md | 36 +++++++++++++++++++++++++++++++----- scripts/mount_afs.sh | 14 ++++++++++++-- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 641e340..8859125 100644 --- a/README.md +++ b/README.md @@ -78,10 +78,12 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. - (Optional, but recommended) Create a User Assigned Managed Identity. Alternately, after creating the VM with a system-managed identity, you can assign all appropriate permissions. ***RMI:** The `workflow-data-preparation` Identity exists with all the appropriate permissions.* - Grant Appropriate permissions to the Identity: - - `pactadatadev`: "Storage File Data SMB Share Contributor" - - `pactarawdata`: "Storage File Data SMB Share Reader" + - `pactadatadev`: "Reader and Data Access". + - `pactarawdata`: "Reader and Data Access" + Note that this gives read/write access to the Storage Account via the Storage Account Key. + To grant read-only access to the VM, use the `mount_afs` script without the `-w` flag, as shown below. -1. Start a VM. +1. **Start a VM** While the machine can be deployed via the Portal (WebUI), for simplicity, the following code block is provided which ensures consistency: ```sh @@ -123,13 +125,13 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. If this command successfully runs, it will output a JSON block describing the resource (VM) created. -2. 
**Connect to the Network.** (Optional) +2. **Connect to the Network.** (Optional) ***RMI:** Connecting to the VPN will enable SSH access.* Connect to the Virtual Network specified above, as the comand above does not create a Public IP Address. Details for this are out of scope for these instructions. Contact your network coordinator for help. -2. Connect to the newly created VM via SSH. +3. **Connect to the newly created VM via SSH.** ```sh This connects to the VM created above via SSH. @@ -142,6 +144,30 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` +4. **Connect the VM to required resources** + Clone this repo, install the `az` cli utility, and mount the appropriate Azure File Shares. + + ```sh + # Clone this repo through https to avoid need for an SSH key + git clone https://github.com/RMI-PACTA/workflow.data.preparation.git ~/workflow.data.preparation + + # Install az cli + sudo apt update + # See https://aka.ms/installcli for alternate instructions + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + + # Login to azure with assigned identity + az login --identity + + # Use script from this repo to connect to file shares + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "factset-extracted" -m "/mnt/factset-extracted" + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-PROD" -a "pactarawdata" -f "asset-impact" -m "/mnt/asset-impact" + + # Note the outputs directory has the -w flag, meaning write permissions are enabled. + ~/workflow.data.preparation/scripts/mount_afs.sh -r "RMI-SP-PACTA-DEV" -a "pactadatadev" -f "workflow-data-preparation-outputs" -m "/mnt/workflow-data-preparation-outputs" -w + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). 
diff --git a/scripts/mount_afs.sh b/scripts/mount_afs.sh index 350388d..d0f58c9 100755 --- a/scripts/mount_afs.sh +++ b/scripts/mount_afs.sh @@ -7,6 +7,7 @@ usage() { echo "Usage: mount_afs.sh [-h] [-v] -r -a -f -m " echo " -h: help (this message)" echo " -v: verbose" + echo " -w: Allow write access to the file share (default is read-only)" echo " -r: resource group (Required)" echo " -a: storage account name (Required)" echo " -f: file share name (Required)" @@ -15,13 +16,15 @@ usage() { exit 1 } -while getopts "h?vr:a:f:m:" opt; do +while getopts "h?vwr:a:f:m:" opt; do case "$opt" in h|\?) usage ;; v) VERBOSE=1 ;; + w) ALLOW_WRITE=1 + ;; r) RESOURCEGROUP=$OPTARG ;; a) STORAGEACCOUNTNAME=$OPTARG @@ -104,4 +107,11 @@ sudo mkdir -p "$MOUNTPOINT" if [ -n "$VERBOSE" ]; then echo "Mounting $smbPath to $MOUNTPOINT" fi -sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,file_mode=0777,nobrl,dir_mode=0777,vers=3.1.1 + +if [ -n "$ALLOW_WRITE" ]; then + permissions="file_mode=0777,dir_mode=0777" +else + permissions="file_mode=0555,dir_mode=0555" +fi + +sudo mount -t cifs "$smbPath" "$MOUNTPOINT" -o username="$STORAGEACCOUNTNAME",password="$storageAccountKey",serverino,nosharesock,actimeo=30,nobrl,"$permissions",vers=3.1.1 From c2e798c93c4b3e69cad869f90f8b6a7ff17eb9f2 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 02:09:15 +0100 Subject: [PATCH 13/33] feat(deploy): Use new split inputs in docker-compose --- config.yml | 2 ++ docker-compose.yml | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/config.yml b/config.yml index 0680490..b342d56 100644 --- a/config.yml +++ b/config.yml @@ -102,6 +102,8 @@ default: 2022Q4: + asset_impact_data_path: "/mnt/factset-extracted" + factset_data_path: "/mnt/asset-impact" masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" masterdata_debt_filename: 
"2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" diff --git a/docker-compose.yml b/docker-compose.yml index e8baf92..f0b3f7d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,8 +6,11 @@ services: context: . volumes: - type: bind - source: ${HOST_INPUTS_PATH} - target: /inputs + source: ${HOST_FACTSET_EXTRACTED_PATH} + target: /mnt/factset-extracted + - type: bind + source: ${HOST_ASSET_IMPACT_PATH} + target: /asset-impact - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From 4f42a33aae6e3b8afcc019f6db45bdb34d707670 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:17:25 +0100 Subject: [PATCH 14/33] feat(deploy): Change AI File paths Reflect actual Azure FIles structure --- config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yml b/config.yml index b342d56..9af590a 100644 --- a/config.yml +++ b/config.yml @@ -102,11 +102,11 @@ default: 2022Q4: - asset_impact_data_path: "/mnt/factset-extracted" - factset_data_path: "/mnt/asset-impact" - masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted" + masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" 
factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" From 13d2d3c39df951a63a6716451e2487832b67e9cb Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:22:40 +0100 Subject: [PATCH 15/33] feat(deploy): Update Factset file paths for 2022Q4 --- config.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/config.yml b/config.yml index 9af590a..00d3e01 100644 --- a/config.yml +++ b/config.yml @@ -103,19 +103,19 @@ default: 2022Q4: asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: "/mnt/factset-extracted" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: 
"timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" - factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From dfc4825c4e8a66c11f265ca1d3b4aba43293e6f5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:23:37 +0100 Subject: [PATCH 16/33] ci(deploy): Add verbose logging for remote environment --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index f0b3f7d..a25de22 100644 --- 
a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,8 @@ services: data_prep: build: context: . + environment: + - LOG_LEVEL=TRACE volumes: - type: bind source: ${HOST_FACTSET_EXTRACTED_PATH} From 12499023a5f408622af6a9d0d473b8b4c56c82ac Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:25:14 +0100 Subject: [PATCH 17/33] docs(deploy): Update README instructions --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8859125..f666945 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ Running the workflow requires a file `.env` to exist in the root directory, that ```sh # .env -HOST_INPUTS_PATH=/PATH/TO/inputs +HOST_FACTSET_EXTRACTED_PATH=/PATH/TO/factset-extracted +HOST_ASSET_IMPACT_PATH=/PATH/TO/asset-impact HOST_OUTPUTS_PATH=/PATH/TO/YYYYQQ_pacta_analysis_inputs_YYYY-MM-DD/YYYYQQ GITHUB_PAT=ghp_XXXXxxXxXXXxXxxX R_CONFIG_ACTIVE=YYYYQQ @@ -119,7 +120,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. --public-ip-address "" \ --resource-group "$VM_RESOURCE_GROUP" \ --size "$MACHINE_SIZE" \ - --subnet "$SUBNETID" + --subnet "$SUBNET_ID" ``` @@ -134,7 +135,8 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. 3. **Connect to the newly created VM via SSH.** ```sh - This connects to the VM created above via SSH. + # This connects to the VM created above via SSH. + # See above block for envvars referenced here. az ssh vm \ --local-user azureuser \ @@ -168,6 +170,49 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` +5. **Install Docker** + + ```sh + # install docker + sudo apt -y install \ + docker-compose \ + docker.io + + # Allow azureuser to run docker without sudo + sudo usermod -aG docker azureuser + ``` + + At this point, you need to log out of the shell to reevaluate group memberships (add the `docker` group to `azureuser`). 
+ You can log back in with the `az ssh` command from step 3. + When you are back into the shell, you can run `docker run --rm hello-world` to confirm that docker is working correctly, and you are able to run as a non-root user. + +6. **Prepare `.env` file** + The `ubuntu2204` image used for the VM includes both `vim` and `nano`. + Create a `.env` file in the `workflow.data.preparation` directory, according to the instructions in the [running locally](running-locally-with-docker-compose) section of this file. + +7. **Build Docker image** + The cloned git repo in the home directory, and mounted directories should sill be in place after logging in again. + Additionally, `azureuser` should be part of the `docker` group. + you can confirm this with + + ```sh + groups + ls ~ + ls /mnt + ``` + + With that in place, you are ready to build the `workflow.data.preparation` docker image. + + ```sh + # navigate to the workflow.data.preparation repo + cd ~/workflow.data.preparation + + docker-compose build + + docker-compose up + + ``` + ## Required Input Files All required files must exist at `$HOST_INPUTS_PATH`, in a single directory (no subdirectories). 
From e730ab70870090fff7ea7e5e3981307ef65af0c8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:41:45 +0100 Subject: [PATCH 18/33] fix(deploy): fix path in docker volume mount --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index a25de22..b0fc14f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ services: target: /mnt/factset-extracted - type: bind source: ${HOST_ASSET_IMPACT_PATH} - target: /asset-impact + target: /mnt/asset-impact - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From 90b32936c1bcf07eea6cef091c270d7cf611e1b8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:46:15 +0100 Subject: [PATCH 19/33] feat(deploy): make docker-compose mounts read-only --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b0fc14f..4e5deb5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,9 +10,11 @@ services: - type: bind source: ${HOST_FACTSET_EXTRACTED_PATH} target: /mnt/factset-extracted + read_only: true - type: bind source: ${HOST_ASSET_IMPACT_PATH} target: /mnt/asset-impact + read_only: true - type: bind source: ${HOST_OUTPUTS_PATH} target: /outputs From fa02e0139e0796ff33f5246727abf45598484a38 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 04:47:29 +0100 Subject: [PATCH 20/33] docs(deploy): update Readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f666945..834e447 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. 7. **Build Docker image** The cloned git repo in the home directory, and mounted directories should sill be in place after logging in again. Additionally, `azureuser` should be part of the `docker` group. 
- you can confirm this with + you can confirm this with: ```sh groups @@ -202,11 +202,14 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ``` With that in place, you are ready to build the `workflow.data.preparation` docker image. + **To ensure that a dropped network connection does not kill the process, you should run this in `tmux`.** ```sh # navigate to the workflow.data.preparation repo cd ~/workflow.data.preparation + tmux + docker-compose build docker-compose up From 3020e58357433cc9ff83c16629b39986c1821b8a Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 11:35:41 +0100 Subject: [PATCH 21/33] Add current working config for 2022q4 --- config.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/config.yml b/config.yml index 0680490..00d3e01 100644 --- a/config.yml +++ b/config.yml @@ -102,18 +102,20 @@ default: 2022Q4: - masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" - 
factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" + masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" 
imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From 665a67698c08700cca67637e2d087be641f9e403 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 18 Feb 2024 11:37:34 +0100 Subject: [PATCH 22/33] return config to `main` don't touch config in this PR. --- config.yml | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/config.yml b/config.yml index 00d3e01..0680490 100644 --- a/config.yml +++ b/config.yml @@ -102,20 +102,18 @@ default: 2022Q4: - asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" - masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" - factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" - factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" - factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" - factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_fund_data.rds" - factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_isin_to_fund_table.rds" - factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_iss_emissions.rds" - factset_issue_code_bridge_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_issue_code_bridge.rds" - factset_industry_map_bridge_filename: 
"timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_industry_map_bridge.rds" - factset_manual_pacta_sector_override_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_manual_sector_override.rds" + masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20221231T000000Z_pulled-20240207T161053Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "test-from-fds-test-20240207-03-postgres_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20230123T000000Z_pulled-20000101T000001_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20230123T000000Z_pulled-20000101T000002_factset_manual_sector_override.rds" imf_quarter_timestamp: "2022-Q4" pacta_financial_timestamp: "2022Q4" market_share_target_reference_year: 2022 From 1097f531458927607f2631fc52419b654e1b205c Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Fri, 23 Feb 2024 09:28:20 +0100 Subject: [PATCH 23/33] Update config.yml --- config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yml b/config.yml 
index 1b125f5..2c42300 100644 --- a/config.yml +++ b/config.yml @@ -134,6 +134,7 @@ docker: 2023Q4: + inherits: docker masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" From a1ce304d02eab52016b64a517988046f3037c39d Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 23 Feb 2024 15:44:56 +0100 Subject: [PATCH 24/33] build(docker): #143 specify `2022Q4_docker` config Add a config option for running 2022Q4 data prep with the docker container Closes: #143 --- config.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/config.yml b/config.yml index eafad59..6a67cd6 100644 --- a/config.yml +++ b/config.yml @@ -40,13 +40,13 @@ desktop: factset_data_path: "./factset_inputs" docker: + inherits: 2022Q4 data_prep_outputs_path: "/mnt/outputs" - asset_impact_data_path: "/inputs" - factset_data_path: "/inputs" + asset_impact_data_path: "/mnt/inputs" + factset_data_path: "/mnt/inputs" 2021Q4: - inherits: docker masterdata_ownership_filename: "2023-06-05_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2021q4.csv" masterdata_debt_filename: "2023-06-05_AI_RMI Bespoke_Company Data Products_masterdata_debt_2021q4.csv" ar_company_id__factset_entity_id_filename: "2022-08-17_rmi_ar_fs_id_bridge_2021q4.csv" @@ -76,7 +76,6 @@ docker: 2022Q2: - inherits: docker masterdata_ownership_filename: "2022-08-30_rmi_masterdata_ownership_2022q2.csv" masterdata_debt_filename: "2022-10-03_rmi_masterdata_debt_2022q2.csv" ar_company_id__factset_entity_id_filename: "2022-08-17_rmi_ar_fs_id_bridge_2021q4.csv" @@ -106,9 +105,6 @@ docker: 2022Q4: - inherits: docker - asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: 
"/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" @@ -135,3 +131,9 @@ docker: scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["WEO2022"] global_aggregate_sector_list: ["Power"] + +2022Q4_docker: + inherits: 2022Q4 + data_prep_outputs_path: "/mnt/outputs" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" From d6ddce81516576d20853219e6166c75503d58323 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 23 Feb 2024 15:46:52 +0100 Subject: [PATCH 25/33] build(docker): #143 remove default docker config the default docker config should not be used, since it does not specify any file paths --- config.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/config.yml b/config.yml index 6a67cd6..748f89a 100644 --- a/config.yml +++ b/config.yml @@ -39,12 +39,6 @@ desktop: asset_impact_data_path: "./ai_inputs" factset_data_path: "./factset_inputs" -docker: - inherits: 2022Q4 - data_prep_outputs_path: "/mnt/outputs" - asset_impact_data_path: "/mnt/inputs" - factset_data_path: "/mnt/inputs" - 2021Q4: masterdata_ownership_filename: "2023-06-05_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2021q4.csv" From b2ca041e203fb23221d0d7dd5aa48cdd67f155c9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 11:42:17 +0100 Subject: [PATCH 26/33] build(deploy): #143 Restore `docker` inheritance Restore inheritance for previous (pre-2022Q4) configs. 
--- config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config.yml b/config.yml index 748f89a..660845d 100644 --- a/config.yml +++ b/config.yml @@ -39,8 +39,14 @@ desktop: asset_impact_data_path: "./ai_inputs" factset_data_path: "./factset_inputs" +docker: + data_prep_outputs_path: "/mnt/outputs" + asset_impact_data_path: "/inputs" + factset_data_path: "/inputs" + 2021Q4: + inherits: docker masterdata_ownership_filename: "2023-06-05_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2021q4.csv" masterdata_debt_filename: "2023-06-05_AI_RMI Bespoke_Company Data Products_masterdata_debt_2021q4.csv" ar_company_id__factset_entity_id_filename: "2022-08-17_rmi_ar_fs_id_bridge_2021q4.csv" @@ -70,6 +76,7 @@ desktop: 2022Q2: + inherits: docker masterdata_ownership_filename: "2022-08-30_rmi_masterdata_ownership_2022q2.csv" masterdata_debt_filename: "2022-10-03_rmi_masterdata_debt_2022q2.csv" ar_company_id__factset_entity_id_filename: "2022-08-17_rmi_ar_fs_id_bridge_2021q4.csv" From 2b8077f17957e4e3321b817e3e2035a88e20bbb7 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 12:10:41 +0100 Subject: [PATCH 27/33] build(deploy): #143 Sut AFS-specific paths in azure config --- config.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/config.yml b/config.yml index 660845d..7eca762 100644 --- a/config.yml +++ b/config.yml @@ -106,9 +106,9 @@ docker: 2022Q4: - masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" - masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" - ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" + masterdata_ownership_filename: "2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18_AI_RMI Bespoke_Company Data 
Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" factset_financial_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_financial_data.rds" factset_entity_info_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_info.rds" factset_entity_financing_data_filename: "timestamp-20221231T000000Z_pulled-20240217T134528Z_factset_entity_financing_data.rds" @@ -133,8 +133,12 @@ docker: global_aggregate_scenario_sources_list: ["WEO2022"] global_aggregate_sector_list: ["Power"] -2022Q4_docker: +2022Q4_azure: inherits: 2022Q4 data_prep_outputs_path: "/mnt/outputs" asset_impact_data_path: "/mnt/asset-impact" factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20221231T000000Z_pulled-20240217T134528Z" + # Asset impact files are stored in separate subdirectories on Azure File Share. + masterdata_ownership_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2022Q4.csv" + masterdata_debt_filename: "2023-06-18/2023-06-18_AI_RMI Bespoke_Company Data Products_masterdata_debt_2022Q4.csv" + ar_company_id__factset_entity_id_filename: "2023-02-15/2023-02-15_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2022Q4.csv" From b725057b830857481f8eb5810c4e8b5dad9a274f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 12:17:13 +0100 Subject: [PATCH 28/33] build(docker): #143 update docker-compose to match config.yml --- docker-compose.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2c1e990..400d371 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,10 +4,17 @@ services: data_prep: build: context: . 
+ environment: + - LOG_LEVEL=TRACE volumes: - type: bind - source: ${HOST_INPUTS_PATH} - target: /inputs + source: ${HOST_FACTSET_EXTRACTED_PATH} + target: /mnt/factset-extracted + read_only: true + - type: bind + source: ${HOST_ASSET_IMPACT_PATH} + target: /mnt/asset-impact + read_only: true - type: bind source: ${HOST_OUTPUTS_PATH} target: /mnt/outputs From 4b47c5fe0df47d25cb3d196e72d781493d5768b9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 12:49:02 +0100 Subject: [PATCH 29/33] feat(deploy): 144 Add 2023Q4_azure config Add config matching format in #145 --- config.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/config.yml b/config.yml index bcf86dc..4df4b57 100644 --- a/config.yml +++ b/config.yml @@ -136,7 +136,6 @@ docker: 2023Q4: - inherits: docker masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" @@ -163,3 +162,9 @@ docker: scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["WEO2023"] global_aggregate_sector_list: ["Power"] + +2023Q4_azure: + inherits: 2023Q4 + data_prep_outputs_path: "/mnt/outputs" + asset_impact_data_path: "/mnt/asset-impact" + factset_data_path: "/mnt/factset-extracted" From cea47759e35173d01fac615e51eaff1fd324286b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 13:19:43 +0100 Subject: [PATCH 30/33] feat(deploy): 144 Add filepaths add filepaths to latest known working input data files Closes: 144 --- config.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/config.yml b/config.yml index 4df4b57..0d4c81f 100644 --- a/config.yml +++ b/config.yml @@ -139,15 +139,15 @@ docker: 
masterdata_ownership_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_ownership_2023Q4.csv" masterdata_debt_filename: "2024-02-14_AI_RMI Bespoke_Company Data Products_masterdata_debt_2023Q4.csv" ar_company_id__factset_entity_id_filename: "2024-02-14_AI_RMI_Bespoke_Company_Data_Products_Company_ID_List_2023Q4.csv" - factset_financial_data_filename: "" - factset_entity_info_filename: "" - factset_entity_financing_data_filename: "" - factset_fund_data_filename: "" - factset_isin_to_fund_table_filename: "" - factset_iss_emissions_data_filename: "" - factset_issue_code_bridge_filename: "" - factset_industry_map_bridge_filename: "" - factset_manual_pacta_sector_override_filename: "" + factset_financial_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_financial_data.rds" + factset_entity_info_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_entity_info.rds" + factset_entity_financing_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_entity_financing_data.rds" + factset_fund_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_fund_data.rds" + factset_isin_to_fund_table_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_isin_to_fund_table.rds" + factset_iss_emissions_data_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_iss_emissions.rds" + factset_issue_code_bridge_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_issue_code_bridge.rds" + factset_industry_map_bridge_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_industry_map_bridge.rds" + factset_manual_pacta_sector_override_filename: "timestamp-20231231T000000Z_pulled-20240217T135833Z_factset_manual_sector_override.rds" imf_quarter_timestamp: "2023-Q4" pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 @@ -166,5 +166,5 @@ docker: 2023Q4_azure: inherits: 2023Q4 data_prep_outputs_path: "/mnt/outputs" - 
asset_impact_data_path: "/mnt/asset-impact" - factset_data_path: "/mnt/factset-extracted" + asset_impact_data_path: "/mnt/asset-impact/2024-02-15_AI_RMI_2023Q4" + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z" From 6685f4ce9852fd67d94b4efaffe759b32bdc9259 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 16:04:11 +0100 Subject: [PATCH 31/33] fix(deploy): Reverty scenarios to 2022 Scenarios Reverting scenarios to 2022 editions, since 2023 not avialable yet. --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index d98ce93..fdfd54a 100644 --- a/config.yml +++ b/config.yml @@ -161,15 +161,15 @@ docker: pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 time_horizon: 5 - scenario_sources_list: ["GECO2023", "ISF2023", "WEO2023"] + scenario_sources_list: ["GECO2022", "ISF2021", "WEO2022"] sector_list: ["Automotive", "Power", "Oil&Gas", "Coal"] other_sector_list: ["Steel", "Aviation", "Cement"] zero_emission_factor_techs: ["Electric", "HydroCap", "NuclearCap", "RenewablesCap"] green_techs: ["FuelCell", "Electric", "Hybrid", "RenewablesCap", "HydroCap", "NuclearCap", "FuelCell_HDV", "Electric_HDV", "Hybrid_HDV"] - scenario_raw_data_to_include: ["geco_2023", "isf_2023", "weo_2023"] + scenario_raw_data_to_include: ["geco_2022", "isf_2021", "weo_2022"] tech_exclude: ["OtherCap", "OtherFF", "Coking Plant", "Sintering Plant", "Direct Or Smelting Reduction Plant", "Pelletizing Plant", "Grinding Plant", "Passenger / Freight"] scenario_geographies_list: ["Global", "NonOECD", "OECD"] - global_aggregate_scenario_sources_list: ["WEO2023"] + global_aggregate_scenario_sources_list: ["WEO2022"] global_aggregate_sector_list: ["Power"] 2023Q4_azure: From 4ee09daeefb780511b78b00ed560dbf692649e6f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 24 Feb 2024 16:17:22 +0100 Subject: [PATCH 32/33] Add branch-specific 
instructions --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 834e447..c66653d 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,10 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. # Clone this repo through https to avoid need for an SSH key git clone https://github.com/RMI-PACTA/workflow.data.preparation.git ~/workflow.data.preparation + cd ~/workflow.data.preparation + git checkout develop-vm-20240224 + cd ~ + # Install az cli sudo apt update # See https://aka.ms/installcli for alternate instructions From 69149ca72b7f920f9feb216c312ca23b0fd8c0da Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 26 Feb 2024 19:19:44 +0100 Subject: [PATCH 33/33] Bump ISF to 2023 :tada: --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index f9f52eb..786dd1c 100644 --- a/config.yml +++ b/config.yml @@ -161,12 +161,12 @@ docker: pacta_financial_timestamp: "2023Q4" market_share_target_reference_year: 2023 time_horizon: 5 - scenario_sources_list: ["GECO2022", "ISF2021", "WEO2022"] + scenario_sources_list: ["GECO2022", "ISF2023", "WEO2022"] sector_list: ["Automotive", "Power", "Oil&Gas", "Coal"] other_sector_list: ["Steel", "Aviation", "Cement"] zero_emission_factor_techs: ["Electric", "HydroCap", "NuclearCap", "RenewablesCap"] green_techs: ["FuelCell", "Electric", "Hybrid", "RenewablesCap", "HydroCap", "NuclearCap", "FuelCell_HDV", "Electric_HDV", "Hybrid_HDV"] - scenario_raw_data_to_include: ["geco_2022", "isf_2021", "weo_2022"] + scenario_raw_data_to_include: ["geco_2022", "isf_2023", "weo_2022"] tech_exclude: ["OtherCap", "OtherFF", "Coking Plant", "Sintering Plant", "Direct Or Smelting Reduction Plant", "Pelletizing Plant", "Grinding Plant", "Passenger / Freight"] scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["WEO2022"] @@ -176,4 +176,4 @@ docker: inherits: 2023Q4 
data_prep_outputs_path: "/mnt/outputs" asset_impact_data_path: "/mnt/asset-impact/2024-02-15_AI_RMI_2023Q4" - factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z" \ No newline at end of file + factset_data_path: "/mnt/factset-extracted/factset-pacta_timestamp-20231231T000000Z_pulled-20240217T135833Z"