Merge pull request #1 from RMI-PACTA/add-matching-script

add matching script
RMI-PACTA · Apr 2, 2024 · d4e8402 · d4e8402
2 parents 33ed02d + 753b1cc
commit d4e8402
Show file tree

Hide file tree

Showing 4 changed files with 406 additions and 5 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,11 +21,8 @@ Imports:
     pacta.multi.loanbook.plot,
     readr,
     rlang,
-    tidyr
-Remotes:
-    RMI-PACTA/pacta.data.preparation,
-    RMI-PACTA/pacta.data.scraping,
-    RMI-PACTA/pacta.scenario.preparation
+    tidyr,
+    vroom
 Depends:
     R (>= 4.1.0)
 License: MIT + file LICENSE
diff --git a/example.config.yml b/example.config.yml
@@ -0,0 +1,47 @@
+default:
+  directories:
+    dir_scenario: "path/to/scenario_folder"
+    dir_abcd: "path/to/abcd_folder"
+    dir_raw: "path/to/raw_folder"
+    dir_matched: "path/to/matched_folder"
+    dir_output: "path/to/output_folder"
+  file_names:
+    filename_raw: "raw_loanbook_123.csv"
+    filename_scenario_tms: "scenarios_2022_tms.csv"
+    filename_scenario_sda: "scenarios_2022_sda.csv"
+    filename_abcd: "abcd.csv"
+  project_parameters:
+    scenario_source: "weo_2022"
+    scenario_select: "nze_2050"
+    region_select: "global"
+    # normally the start year should correspond to the publication year of the
+    # scenario in use
+    start_year_select: 2022
+    time_frame_select: 5
+    # regions must be available for the selected scenario
+    benchmark_regions_select: "global,european union"
+    remove_inactive_companies: TRUE
+  sector_split:
+    apply_sector_split: TRUE
+    sector_split_type: "equal_weights"
+    # sector_split_type: "worst_case"
+    dir_split_company_id: "path/to/split_folder"
+    filename_split_company_id: "split_company_ids.csv"
+    dir_advanced_company_indicators: "path/to/advanced_company_indicators_folder"
+    filename_advanced_company_indicators: "advanced_company_indicators.xlsx"
+  matching:
+    prep_input_level: "direct_loantaker"
+    params_match_name:
+      by_sector: TRUE
+      min_score: 0.9
+      method: "jw"
+      p: 0.1
+      overwrite: NULL
+      join_id: NULL
+    own_sector_classification:
+      use_own_sector_classification: FALSE
+      dir_own_sector_classification: "path/to/own_sector_classification_folder"
+      filename_own_sector_classification: "own_sector_classification.csv"
+
+
+
diff --git a/expected_columns.R b/expected_columns.R
@@ -0,0 +1,157 @@
+# expected columns of the region isos file: cols_only() spec and column names
+col_types_region_isos <- readr::cols_only(
+  region = "c",
+  isos = "c",
+  source = "c"
+)
+col_select_region_isos <- names(col_types_region_isos[["cols"]])
+
+# expected columns of the tms scenario file: cols_only() spec and column names
+col_types_scenario_tms <- readr::cols_only(
+  scenario_source = "c",
+  region = "c",
+  scenario = "c",
+  sector = "c",
+  technology = "c",
+  year = "i",
+  smsp = "n",
+  tmsr = "n"
+)
+col_select_scenario_tms <- names(col_types_scenario_tms[["cols"]])
+
+# expected columns of the sda scenario file: cols_only() spec and column names
+col_types_scenario_sda <- readr::cols_only(
+  scenario_source = "c",
+  region = "c",
+  scenario = "c",
+  sector = "c",
+  year = "i",
+  emission_factor = "n",
+  emission_factor_unit = "c"
+)
+col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]])
+
+# expected columns of the abcd file: cols_only() spec and column names
+col_types_abcd <- readr::cols_only(
+  company_id = "i",
+  name_company = "c",
+  lei = "c",
+  is_ultimate_owner = "l",
+  sector = "c",
+  technology = "c",
+  plant_location = "c",
+  year = "i",
+  production = "n",
+  production_unit = "c",
+  emission_factor = "n",
+  emission_factor_unit = "c",
+  ald_timestamp = "c"
+)
+col_select_abcd <- names(col_types_abcd[["cols"]])
+
+# expected columns of the matched_prioritized_all_groups file: cols_only() spec and names
+col_types_matched_prio_all_groups <- readr::cols_only(
+  group_id = "c",
+  id_loan = "c",
+  id_direct_loantaker = "c",
+  name_direct_loantaker = "c",
+  id_intermediate_parent_1 = "c",
+  name_intermediate_parent_1 = "c",
+  id_ultimate_parent = "c",
+  name_ultimate_parent = "c",
+  loan_size_outstanding = "n",
+  loan_size_outstanding_currency = "c",
+  loan_size_credit_limit = "n",
+  loan_size_credit_limit_currency = "c",
+  sector_classification_system = "c",
+  sector_classification_input_type = "c",
+  sector_classification_direct_loantaker = "c",
+  fi_type = "c",
+  flag_project_finance_loan = "c",
+  name_project = "c",
+  lei_direct_loantaker = "c",
+  isin_direct_loantaker = "c",
+  id_2dii = "c",
+  level = "c",
+  sector = "c",
+  sector_abcd = "c",
+  name = "c",
+  name_abcd = "c",
+  score = "n",
+  source = "c",
+  borderline = "l"
+)
+col_select_matched_prio_all_groups <- names(col_types_matched_prio_all_groups[["cols"]])
+
+# expected columns of the matched_all_groups file: cols_only() spec and column names
+col_types_matched_all_groups <- readr::cols_only(
+  group_id = "c",
+  id_loan = "c",
+  id_direct_loantaker = "c",
+  name_direct_loantaker = "c",
+  id_intermediate_parent_1 = "c",
+  name_intermediate_parent_1 = "c",
+  id_ultimate_parent = "c",
+  name_ultimate_parent = "c",
+  loan_size_outstanding = "n",
+  loan_size_outstanding_currency = "c",
+  loan_size_credit_limit = "n",
+  loan_size_credit_limit_currency = "c",
+  sector_classification_system = "c",
+  sector_classification_input_type = "c",
+  sector_classification_direct_loantaker = "c",
+  fi_type = "c",
+  flag_project_finance_loan = "c",
+  name_project = "c",
+  lei_direct_loantaker = "c",
+  isin_direct_loantaker = "c",
+  id_2dii = "c",
+  level = "c",
+  sector = "c",
+  sector_abcd = "c",
+  name = "c",
+  name_abcd = "c",
+  score = "n",
+  source = "c",
+  borderline = "l"
+)
+col_select_matched_all_groups <- names(col_types_matched_all_groups[["cols"]])
+
+# expected column types of the raw loan book file; cols() (not cols_only()), no name vector
+col_types_raw <- readr::cols(
+  id_loan = "c",
+  id_direct_loantaker = "c",
+  name_direct_loantaker = "c",
+  id_intermediate_parent_1 = "c",
+  name_intermediate_parent_1 = "c",
+  id_ultimate_parent = "c",
+  name_ultimate_parent = "c",
+  loan_size_outstanding = "n",
+  loan_size_outstanding_currency = "c",
+  loan_size_credit_limit = "n",
+  loan_size_credit_limit_currency = "c",
+  sector_classification_system = "c",
+  sector_classification_input_type = "c",
+  sector_classification_direct_loantaker = "c",
+  fi_type = "c",
+  flag_project_finance_loan = "c",
+  name_project = "c",
+  lei_direct_loantaker = "c",
+  isin_direct_loantaker = "c"
+)
+
+# expected columns of the companies_sector_split file: cols_only() spec and column names
+col_types_companies_sector_split <- readr::cols_only(
+  company_id = "i",
+  sector = "c",
+  sector_split = "n"
+)
+col_select_companies_sector_split <- names(col_types_companies_sector_split[["cols"]])
+
+# expected columns of the companies_sector_split_worst_case file: cols_only() spec and names
+col_types_companies_sector_split_worst_case <- readr::cols_only(
+  name_company = "c",
+  sector = "c",
+  sector_split = "n"
+)
+col_select_companies_sector_split_worst_case <- names(col_types_companies_sector_split_worst_case[["cols"]])