diff --git a/DESCRIPTION b/DESCRIPTION index 94176fea..3111cb95 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,11 +21,8 @@ Imports: pacta.multi.loanbook.plot, readr, rlang, - tidyr -Remotes: - RMI-PACTA/pacta.data.preparation, - RMI-PACTA/pacta.data.scraping, - RMI-PACTA/pacta.scenario.preparation + tidyr, + vroom Depends: R (>= 4.1.0) License: MIT + file LICENSE diff --git a/example.config.yml b/example.config.yml new file mode 100644 index 00000000..c2936ef3 --- /dev/null +++ b/example.config.yml @@ -0,0 +1,47 @@ +default: + directories: + dir_scenario: "path/to/scenario_folder" + dir_abcd: "path/to/abcd_folder" + dir_raw: "path/to/raw_folder" + dir_matched: "path/to/matched_folder" + dir_output: "path/to/output_folder" + file_names: + filename_raw: "raw_loanbook_123.csv" + filename_scenario_tms: "scenarios_2022_tms.csv" + filename_scenario_sda: "scenarios_2022_sda.csv" + filename_abcd: "abcd.csv" + project_parameters: + scenario_source: "weo_2022" + scenario_select: "nze_2050" + region_select: "global" + # normally the start year should correspond with year of the publication of + # the scenario in use + start_year_select: 2022 + time_frame_select: 5 + # regions must be available for the selected scenario + benchmark_regions_select: "global,european union" + remove_inactive_companies: TRUE + sector_split: + apply_sector_split: TRUE + sector_split_type: "equal_weights" + # sector_split_type: "worst_case" + dir_split_company_id: "path/to/split_folder" + filename_split_company_id: "split_company_ids.csv" + dir_advanced_company_indicators: "path/to/advanced_company_indicators_folder" + filename_advanced_company_indicators: "advanced_company_indicators.xlsx" + matching: + prep_input_level: "direct_loantaker" + params_match_name: + by_sector: TRUE + min_score: 0.9 + method: "jw" + p: 0.1 + overwrite: NULL + join_id: NULL + own_sector_classification: + use_own_sector_classification: FALSE + dir_own_sector_classification: "path/to/own_sector_classification_folder" + filename_own_sector_classification: "own_sector_classification.csv" + + + diff --git a/expected_columns.R b/expected_columns.R new file mode 100644 index 00000000..daa66362 --- /dev/null +++ b/expected_columns.R @@ -0,0 +1,157 @@ +# expected columns region isos file +col_types_region_isos <- readr::cols_only( + region = "c", + isos = "c", + source = "c" +) +col_select_region_isos <- names(col_types_region_isos[["cols"]]) + +# expected columns tms scenario file +col_types_scenario_tms <- readr::cols_only( + scenario_source = "c", + region = "c", + scenario = "c", + sector = "c", + technology = "c", + year = "i", + smsp = "n", + tmsr = "n" +) +col_select_scenario_tms <- names(col_types_scenario_tms[["cols"]]) + +# expected columns sda scenario file +col_types_scenario_sda <- readr::cols_only( + scenario_source = "c", + region = "c", + scenario = "c", + sector = "c", + year = "i", + emission_factor = "n", + emission_factor_unit = "c" +) +col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]]) + +# expected columns abcd file +col_types_abcd <- readr::cols_only( + company_id = "i", + name_company = "c", + lei = "c", + is_ultimate_owner = "l", + sector = "c", + technology = "c", + plant_location = "c", + year = "i", + production = "n", + production_unit = "c", + emission_factor = "n", + emission_factor_unit = "c", + ald_timestamp = "c" +) +col_select_abcd <- names(col_types_abcd[["cols"]]) + +# expected columns matched_prioritized_all_groups file +col_types_matched_prio_all_groups <- readr::cols_only( + group_id = "c", + id_loan = "c", + id_direct_loantaker = "c", + name_direct_loantaker = "c", + id_intermediate_parent_1 = "c", + name_intermediate_parent_1 = "c", + id_ultimate_parent = "c", + name_ultimate_parent = "c", + loan_size_outstanding = "n", + loan_size_outstanding_currency = "c", + loan_size_credit_limit = "n", + loan_size_credit_limit_currency = "c", + sector_classification_system = "c", + sector_classification_input_type = "c", + sector_classification_direct_loantaker = "c", + fi_type = "c", + flag_project_finance_loan = "c", + name_project = "c", + lei_direct_loantaker = "c", + isin_direct_loantaker = "c", + id_2dii = "c", + level = "c", + sector = "c", + sector_abcd = "c", + name = "c", + name_abcd = "c", + score = "n", + source = "c", + borderline = "l" +) +col_select_matched_prio_all_groups <- names(col_types_matched_prio_all_groups[["cols"]]) + +# expected columns matched_all_groups file +col_types_matched_all_groups <- readr::cols_only( + group_id = "c", + id_loan = "c", + id_direct_loantaker = "c", + name_direct_loantaker = "c", + id_intermediate_parent_1 = "c", + name_intermediate_parent_1 = "c", + id_ultimate_parent = "c", + name_ultimate_parent = "c", + loan_size_outstanding = "n", + loan_size_outstanding_currency = "c", + loan_size_credit_limit = "n", + loan_size_credit_limit_currency = "c", + sector_classification_system = "c", + sector_classification_input_type = "c", + sector_classification_direct_loantaker = "c", + fi_type = "c", + flag_project_finance_loan = "c", + name_project = "c", + lei_direct_loantaker = "c", + isin_direct_loantaker = "c", + id_2dii = "c", + level = "c", + sector = "c", + sector_abcd = "c", + name = "c", + name_abcd = "c", + score = "n", + source = "c", + borderline = "l" +) +col_select_matched_all_groups <- names(col_types_matched_all_groups[["cols"]]) + +# expected columns raw loan book file +col_types_raw <- readr::cols( + id_loan = "c", + id_direct_loantaker = "c", + name_direct_loantaker = "c", + id_intermediate_parent_1 = "c", + name_intermediate_parent_1 = "c", + id_ultimate_parent = "c", + name_ultimate_parent = "c", + loan_size_outstanding = "n", + loan_size_outstanding_currency = "c", + loan_size_credit_limit = "n", + loan_size_credit_limit_currency = "c", + sector_classification_system = "c", + sector_classification_input_type = "c", + sector_classification_direct_loantaker = "c", + fi_type = "c", + flag_project_finance_loan = "c", + name_project = "c", + lei_direct_loantaker = "c", + isin_direct_loantaker = "c" +) + +# expected columns companies_sector_split file +col_types_companies_sector_split <- readr::cols_only( + company_id = "i", + sector = "c", + sector_split = "n" +) +col_select_companies_sector_split <- names(col_types_companies_sector_split[["cols"]]) + +# expected columns companies_sector_split_worst_case file +col_types_companies_sector_split_worst_case <- readr::cols_only( + name_company = "c", + sector = "c", + sector_split = "n" +) +col_select_companies_sector_split_worst_case <- names(col_types_companies_sector_split_worst_case[["cols"]]) diff --git a/run_matching.R b/run_matching.R new file mode 100644 index 00000000..eb7d62b3 --- /dev/null +++ b/run_matching.R @@ -0,0 +1,200 @@ +# set up project and load packages---- +library(dplyr, warn.conflicts = FALSE) +library(r2dii.data) +library(r2dii.match) +library(readr) +library(withr) + +# source helpers---- +source("expected_columns.R") + +# load config---- +# TODO: all params to function signature +config_dir <- config::get("directories") +config_files <- config::get("file_names") + +dir_raw <- config_dir$dir_raw +path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd) +dir_matched <- config_dir$dir_matched + +config_matching <- config::get("matching") + +matching_by_sector <- config_matching$params_match_name$by_sector +matching_min_score <- config_matching$params_match_name$min_score +matching_method <- config_matching$params_match_name$method +matching_p <- config_matching$params_match_name$p +matching_overwrite <- config_matching$params_match_name$overwrite +matching_join_id <- config_matching$params_match_name$join_id + +matching_use_own_sector_classification <- config_matching$own_sector_classification$use_own_sector_classification +if (matching_use_own_sector_classification) { + dir_own_sector_classification <- config_matching$own_sector_classification$dir_own_sector_classification + filename_own_sector_classification <- config_matching$own_sector_classification$filename_own_sector_classification + path_own_sector_classification <- file.path(dir_own_sector_classification, filename_own_sector_classification) +} + +# validate config values---- +if (!length(dir_raw) == 1) { + stop("Argument dir_raw must be of length 1. Please check your input.") +} +if (!inherits(dir_raw, "character")) { + stop("Argument dir_raw must be of class character. Please check your input.") +} +if (!length(path_abcd) == 1) { + stop("Argument path_abcd must be of length 1. Please check your input.") +} +if (!inherits(path_abcd, "character")) { + stop("Argument path_abcd must be of class character. Please check your input.") +} +if (!length(dir_matched) == 1) { + stop("Argument dir_matched must be of length 1. Please check your input.") +} +if (!inherits(dir_matched, "character")) { + stop("Argument dir_matched must be of class character. Please check your input.") +} +if (!length(matching_by_sector) == 1) { + stop("Argument matching_by_sector must be of length 1. Please check your input.") +} +if (!inherits(matching_by_sector, "logical")) { + stop("Argument matching_by_sector must be of class logical. Please check your input.") +} +if (!length(matching_min_score) == 1) { + stop("Argument matching_min_score must be of length 1. Please check your input.") +} +if (!inherits(matching_min_score, "numeric")) { + stop("Argument matching_min_score must be of class numeric. Please check your input.") +} +if (!length(matching_method) == 1) { + stop("Argument matching_method must be of length 1. Please check your input.") +} +if (!inherits(matching_method, "character")) { + stop("Argument matching_method must be of class character Please check your input.") +} +if (!length(matching_p) == 1) { + stop("Argument matching_p must be of length 1. Please check your input.") +} +if (!inherits(matching_p, "numeric")) { + stop("Argument matching_p must be of class numeric. Please check your input.") +} +# TODO: check for data.frame +# if (!length(matching_overwrite) == 1) { +# stop("Argument matching_overwrite must be of length 1. Please check your input.") +# } +# if (!inherits(matching_overwrite, "numeric")) { +# stop("Argument matching_overwrite must be of class numeric. Please check your input.") +# } +# TODO: check for join_object +# if (!length(matching_join_id) == 1) { +# stop("Argument matching_join_id must be of length 1. Please check your input.") +# } +# if (!inherits(matching_join_id, "numeric")) { +# stop("Argument matching_join_id must be of class numeric. Please check your input.") +# } +if (!length(matching_use_own_sector_classification) == 1) { + stop("Argument matching_use_own_sector_classification must be of length 1. Please check your input.") +} +if (!inherits(matching_use_own_sector_classification, "logical")) { + stop("Argument matching_use_own_sector_classification must be of class logical. Please check your input.") +} +# path to own sector classification onlz required if boolean TRUE +if (matching_use_own_sector_classification) { + if (!length(path_own_sector_classification) == 1) { + stop("When matching_use_own_sector_classification == TRUE, argument path_own_sector_classification must be of length 1. Please check your input.") + } + if (!inherits(path_own_sector_classification, "character")) { + stop("When matching_use_own_sector_classification == TRUE, argument path_own_sector_classification must be of class character. Please check your input.") + } +} + +# load data---- + +## load abcd---- +if (!file.exists(path_abcd)) { + stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!")) +} + +abcd <- readr::read_csv( + file.path(path_abcd), + col_types = col_types_abcd, + col_select = dplyr::all_of(col_select_abcd) +) + +## optionally load own classification system---- +if (matching_use_own_sector_classification) { + if (!file.exists(path_own_sector_classification)) { + stop(glue::glue("No sector classification file found at path {path_own_sector_classification}. Please check your project setup!")) + } + + sector_classification_system <- readr::read_csv( + file = path_own_sector_classification, + col_types = readr::cols_only( + code_system = "c", + code = "c", + sector = "c", + borderline = "l" + ) + ) %>% + dplyr::select(names(r2dii.data::sector_classifications)) +} + +## load raw loan books---- +list_raw <- list.files(dir_raw)[grepl(".csv", list.files(dir_raw))] + +if (length(list_raw) == 0) { + stop(glue::glue("No raw loan book csvs found in {dir_raw}. Please check your project setup!")) +} + +raw_lbk <- vroom::vroom( + file = file.path(dir_raw, list_raw), + col_types = col_types_raw, + id = "group_id" +) %>% + dplyr::mutate( + group_id = gsub(glue::glue("{dir_raw}/"), "", .data$group_id), + group_id = gsub(".csv", "", .data$group_id) + ) %>% + dplyr::group_split(.data$group_id) + +# match and save loan books---- +for (i in 1:length(raw_lbk)) { + group_name <- unique(raw_lbk[[i]]$group_id) + + ## match data---- + if (matching_use_own_sector_classification) { + withr::with_options( + new = list(r2dii.match.sector_classifications = sector_classification_system), + code = { + getOption("r2dii.match.sector_classifications") + matched_lbk_i <- r2dii.match::match_name( + loanbook = raw_lbk[[i]], + abcd = abcd, + by_sector = matching_by_sector, + min_score = matching_min_score, + method = matching_method, + p = matching_p, + overwrite = matching_overwrite, + join_id = matching_join_id + # TODO: allow surfacing the other match_name args + ) + } + ) + } else { + matched_lbk_i <- r2dii.match::match_name( + loanbook = raw_lbk[[i]], + abcd = abcd, + by_sector = matching_by_sector, + min_score = matching_min_score, + method = matching_method, + p = matching_p, + overwrite = matching_overwrite, + join_id = matching_join_id + ) + } + + ## write matched data to file---- + matched_lbk_i %>% + readr::write_csv( + file = file.path(dir_matched, glue::glue("matched_lbk_{group_name}.csv")), + na = "" + ) +}