Skip to content

Commit

Permalink
Merge pull request #13 from JGCRI/simplify-query-param
Browse files Browse the repository at this point in the history
Simplify query param
  • Loading branch information
pralitp authored Apr 12, 2022
2 parents 684014e + 1620fd7 commit c480012
Show file tree
Hide file tree
Showing 13 changed files with 217 additions and 102 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ Encoding: UTF-8
LazyData: true
LinkingTo: Rcpp
Imports: Rcpp, dplyr, yaml, stringr
RoxygenNote: 6.1.1
RoxygenNote: 7.1.1
16 changes: 8 additions & 8 deletions R/gcamwrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ run_to_period <- function(gcam, period = NULL) {
#' @param query_params (list[string] -> array(string)) User options to translate placeholder
#' @return GCAM instance
#' @export
set_data <- function(gcam, data, query, query_params = NULL) {
if(!is.null(query_params)) {
query <- apply_query_params(query, query_params, FALSE)
}
set_data <- function(gcam, data, query, query_params = list()) {
# replace any potential place holders in the query with the query params
query <- apply_query_params(query, query_params, FALSE)

gcam$set_data(data, query)
}

Expand All @@ -62,11 +62,11 @@ set_data <- function(gcam, data, query, query_params = NULL) {
#' @export
#' @importFrom dplyr group_by_at vars summarize_at ungroup as_tibble
#' @importFrom magrittr %>%
get_data <- function(gcam, query, query_params = NULL) {
get_data <- function(gcam, query, query_params = list()) {
units <- attr(query, 'units')
if(!is.null(query_params)) {
query <- apply_query_params(query, query_params, TRUE)
}
# replace any potential place holders in the query with the query params
query <- apply_query_params(query, query_params, TRUE)

data <- gcam$get_data(query)
# The data coming out of gcam is unaggregated so we will need to do that now
# first figure out what the "value" column is, group by everything else, and summarize
Expand Down
82 changes: 59 additions & 23 deletions R/query_library.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,28 +98,44 @@ find_placeholders <- function(query_str) {
#' Parse user options for integer operators and generate GCAM Fusion syntax
#'
#' The currently supported operators are:
#' `+`: Indicates to read/write to a DataFrame. Note if `is_get_data` this is always
#' implied to be set.
#' `*`: which always matches and so no additional operands are necessary (note if
#' param_operands is empty this operator is assumed)
#' `+`, `-`: Indicates to read/write to a DataFrame if `+` or not to read/write if `-`
#' `*`: which always matches and so no additional operands are necessary
#' The standard comparison operators: `=`, `<`, `<=`, `>`, `>=`. Note if `is_get_data` is true
#' or the `+` is not set an additional argument must be supplied which is the integer (or a
#' or the `-` is set an additional argument must be supplied which is the integer (or a
#' string that can be converted to integer) to be used as the RHS operand in the comparison.
#' Finally if param_operands is NULL or empty then if `is_get_data` then c('+', '*') is assumed
#' otherwise c('+', '=').
#' @param param_operands An array containing operators and potentially an operand to be used with that
#' operator.
#' @param is_get_data A boolean if true follows get data semantics and set data if false.
#' @return A GCAM Fusion filter string representing the parameters given.
parse_int_query_param <- function(param_operands, is_get_data) {
wrapper_to_fusion_lookup = list('*'= 'MatchesAny', '<'= 'IntLessThan', '<='= 'IntLessThanEq', '>'= 'IntGreaterThan', '>='= 'IntGreaterThanEq', '='= 'IntEquals')
if(is_get_data || '+' %in% param_operands) {
# the default behavior is to set the '+' operator
is_read = TRUE
if('-' %in% param_operands) {
ret = '['
is_read = FALSE
# remove the - (if set) for easier error checking later
param_operands = param_operands[param_operands != '-']
} else {
ret = '[+'
is_read = TRUE
# remove the + (if set) for easier error checking later
param_operands = param_operands[param_operands != '+']
} else {
is_read = FALSE
ret = '['
}
if(is.na(param_operands) || length(param_operands) == 0 || param_operands[1] == '*') {

# use default behavior if no param_operands were given
if(is.na(param_operands) || length(param_operands) == 0) {
# for get data the default is to match any
# for set data the default is to match =
param_operands = ifelse(is_get_data, c('*'), c('='))
}

if(param_operands[1] == '*') {
if(!is_get_data && is_read) {
stop(paste0('Using * without explictly not reading from columns with - is not valid in set_data: ', param_operands))
}
ret = paste0(ret, 'YearFilter,', wrapper_to_fusion_lookup['*'])
} else if(!is_get_data && is_read) {
if(length(param_operands) < 1) {
Expand All @@ -145,28 +161,44 @@ parse_int_query_param <- function(param_operands, is_get_data) {
#' Parse user options for string operators and generate GCAM Fusion syntax
#'
#' The currently supported operators are:
#' `+`: Indicates to read/write to a DataFrame. Note if `is_get_data` this is always
#' implied to be set.
#' `*`: which always matches and so no additional operands are necessary (note if
#' param_operands is empty this operator is assumed)
#' `+`, `-`: Indicates to read/write to a DataFrame if `+` or not to read/write if `-`
#' `*`: which always matches and so no additional operands are necessary
#' The operators: `=`, `=~` (regular expression matching). Note if `is_get_data` is true
#' or the `+` is not set an additional argument must be supplied which is the string to
#' or the `-` is set an additional argument must be supplied which is the string to
#' be used as the RHS operand in the comparison.
#' Finally if param_operands is NULL or empty then if `is_get_data` then c('+', '*') is assumed
#' otherwise c('+', '=').
#' @param param_operands An array containing operators and potentially an operand to be used with that
#' operator.
#' @param is_get_data A boolean if true follows get data semantics and set data if false.
#' @return A GCAM Fusion filter string representing the parameters given.
parse_str_query_param <- function(param_operands, is_get_data) {
wrapper_to_fusion_lookup = list('*'= 'MatchesAny', '='= 'StringEquals','=~'= 'StringRegexMatches')
if(is_get_data || '+' %in% param_operands) {
# the default behavior is to set the '+' operator
is_read = TRUE
if('-' %in% param_operands) {
ret = '[NamedFilter,'
is_read = FALSE
# remove the - (if set) for easier error checking later
param_operands = param_operands[param_operands != '-']
} else {
ret = '[+NamedFilter,'
is_read = TRUE
# remove the + (if set) for easier error checking later
param_operands = param_operands[param_operands != '+']
} else {
is_read = FALSE
ret = '[NamedFilter,'
}
if(is.na(param_operands) || length(param_operands) == 0 || param_operands[1] == '*') {

# use default behavior if no param_operands were given
if(is.na(param_operands) || length(param_operands) == 0) {
# for get data the default is to match any
# for set data the default is to match =
param_operands = ifelse(is_get_data, c('*'), c('='))
}

if(param_operands[1] == '*') {
if(!is_get_data && is_read) {
stop(paste0('Using * without explictly not reading from columns with - is not valid in set_data: ', param_operands))
}
ret = paste0(ret, wrapper_to_fusion_lookup['*'])
} else if(!is_get_data && is_read) {
if(length(param_operands) < 1) {
Expand All @@ -190,8 +222,8 @@ parse_str_query_param <- function(param_operands, is_get_data) {
#' will be used to process the value of query_params[[arg_tag]] and if "year" then
#' \link{parse_int_query_param} is used.
#' Note semantics are slightly different if is_get_data is true as described in parse_.*_query_params.
#' For any arg_tag which has no entry in query_params it will be replaced with nothing
#' which tells GCAM Fusion to match any but do not read/write to the DataFrame for that container.
#' For any arg_tag which has no entry in query_params it will be given the results of passing `NULL`
#' to parse_int/str_query_param.
#' @param query The raw query which needs to have its placeholders translated.
#' @param query_params The user options provided as a list of arg_tags to an array of
#' operators and potentially operands which will get translated to GCAM Fusion syntax.
Expand All @@ -214,9 +246,13 @@ apply_query_params <- function(query, query_params, is_get_data) {
}
}

# double check if we have any placeholders for which the user did not explicitly
# provide a parameter
for(param in names(placeholders)) {
if(!param %in% names(query_params)) {
parsed_params[param] = ''
# if no param was provided get the default value by passing NULL to the
# appropriate parse_XXX_query_param
parsed_params[param] = ifelse(placeholders[[param]] == 'year', parse_int_query_param(NULL, is_get_data), parse_str_query_param(NULL, is_get_data))
}
}

Expand Down
40 changes: 22 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,15 @@ query_params <- list(
"region" = # The key is the place holder "tag"
c("=", "USA"), # The value is an array with the first value an operator and the second the
# RHS operand. Note for get_data the "+" is implied but could be added explicitly
# If a user wanted to match region = "USA" but not record the result into the
# output DataFrame a user can use the "-" operand instead.
# The available operators include:
# For strings (@name): "*" (any) "=" or "=~" (regular expression matches)
# For ints (@year): "*" (any), "=", "<", "<=", ">", ">="
"year" = c("=", 2020))
# The placeholders will then get transformed into:
# "world/region[+NamedFilter,MatchesAny]//ghg[+NamedFilter,StringEquals,CO2]/emissions[+YearFilter,IntEquals,2020]")
# We can then pass the query and query_params and retrieve the results in a DataFrame.
# Note: if no query_params are given we assume the user is providing GCAM Fusion query
# and no placeholders need to be translated.
co2_core <- get_data(g, co2_query, query_params)
# Returns A tibble: 32 x 4
# region ghg year emissions
Expand All @@ -177,8 +177,9 @@ co2_core <- get_data(g, co2_query, query_params)
# and since we didn't explicitly filter we will get a value for all years AND the accompanied
# year column
labor_prod_query <- get_query("socioeconomic", "labor_productivity")
# note the shorthand here, when we have just the key and no value it assumes you want +MatchesAny
labor_prod <- get_data(g, labor_prod_query, list("region"))
# Note: if a user did not provide any params for a place holder the default param will be assumed,
# which for get_data is c("+", "*")
labor_prod <- get_data(g, labor_prod_query)
# Returns A tibble: 704 x 3
# region year laborproductivity
# <chr> <int> <dbl>
Expand All @@ -202,9 +203,10 @@ labor_prod %>%
# set the updated labor productivity values back into the model
# Note: the syntax for setting data is similar to get_data only now the
# + indicates to read the value to match from the current row of the table
# and for that reason, when we are doing set_data you must be explicit on
# where to include the `+` as opposed to get_data which will implicitly add
# it to your query_params
# If a user wanted to match region = "USA" instead of reading the value to match
# from a DataFrame a user can explicitly use the "-" operand instead.
# Note: if a user did not provide any params for a place holder the default param will be assumed,
# which for set_data is c("+", "=")
set_data(g, change_prod, labor_prod_query, list("region" = c("+", "="), "year" = c("+", "=")))
# double check that the values got set
double_check <- get_data(g, labor_prod_query, list("region", "year" = c("=", 2020)))
Expand Down Expand Up @@ -311,16 +313,16 @@ query_params = dict(
"region": # The key is the place holder "tag"
["=", "USA"], # The value is an array with the first value an operator and the second the
# RHS operand. Note for get_data the "+" is implied but could be added explicitly
# If a user wanted to match region = "USA" but not record the result into the
# output DataFrame a user can use the "-" operand instead.
# The available operators include:
# For strings (@name): "*" (any) "=" or "=~" (regular expression matches)
# For ints (@year): "*" (any), "=", "<", "<=", ">", ">="
"year": ["=", 2020])
# The placeholders will then get transformed into:
# "world/region[+NamedFilter,MatchesAny]//ghg[+NamedFilter,StringEquals,CO2]/emissions[+YearFilter,IntEquals,2020]")
# We can then pass the query and query_params and retrieve the results in a DataFrame.
# Note: if no query_params are given we assume the user is providing GCAM Fusion query
# and no placeholders need to be translated.
co2_core = g.get_data(co2_query, query_params)
co2_core = g.get_data(co2_query, **query_params)
# Returns a Pandas.DataFrame: co2_core.head()
# region ghg year emissions
# 0 Africa_Eastern CO2 2020 27.749129
Expand All @@ -334,8 +336,9 @@ co2_core = g.get_data(co2_query, query_params)
# and since we didn't explicitly filter we will get a value for all years AND the accompanied
# year column
labor_prod_query <- get_query("socioeconomic", "labor_productivity")
# note the shorthand here, when we have just the key and no value it assumes you want +MatchesAny
labor_prod = g.get_data(labor_prod_query, {"region": None})
# Note: if a user did not provide any params for a place holder the default param will be assumed,
# which for get_data is ["+", "*"]
labor_prod = g.get_data(labor_prod_query)
# Returns a Pandas.DataFrame: labor_prod.head()
# region year laborproductivity
# 0 Africa_Eastern 1975 0.00154
Expand All @@ -350,19 +353,20 @@ labor_prod_change.loc[:,'laborproductivity'] = labor_prod_change['laborproductiv
# set the updated labor productivity values back into the model
# Note: the syntax for setting data is similar to get_data only now the
# + indicates to read the value to match from the current row of the table
# and for that reason, when we are doing set_data you must be explicit on
# where to include the `+` as opposed to get_data which will implicitly add
# it to your query_params
g.set_data(labor_prod_change, labor_prod_query, {"region": ["+", "="], "year": ["+", "="]})
# If a user wanted to match region = "USA" instead of reading the value to match
# from a DataFrame a user can explicitly use the "-" operand instead.
# Note: if a user did not provide any params for a place holder the default param will be assumed,
# which for set_data is ["+", "="]
g.set_data(labor_prod_change, labor_prod_query, region=["+", "="], year=["+", "="])
# double check that the values got set
double_check = g.get_data(labor_prod_query, {"region": None, "year": ["=", 2020]})
double_check = g.get_data(labor_prod_query, "region", "year"=["=", 2020])

# we have only set the parameters at this point, to see how it affects
# results we must re-run period 5
g.run_to_period(5)

# Get the CO2 emissions again and see how they have changed
co2_change = g.get_data(co2_query, query_params)
co2_change = g.get_data(co2_query, **query_params)
co2_diff = co2_core.merge(co2_change, on=["region", "ghg", "year"])
co2_diff["diff"] = co2_diff["emissions_x"] - co2_diff["emissions_y"]

Expand Down
51 changes: 38 additions & 13 deletions gcamwrapper/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import gcam_module
from pandas import DataFrame, Series
from gcamwrapper.query_library import apply_query_params
import warnings


class Gcam(gcam_module.gcam):
Expand All @@ -26,22 +27,30 @@ def run_to_period(self, period=None):
period = self.get_current_period() + 1
super(Gcam, self).run_to_period(period)

def get_data(self, query, query_params=None):
def get_data(self, query, *args, **kwargs):
"""Queries for arbitrary data from a running instance of GCAM.
:param query: GCAM fusion query
:type query: str
:param query_params: User options to translate placeholder expressions in query should
it have any
:type query_params: dict(string: array(string))
:param *args: User options to translate placeholder expressions which will
be added to the kwargs as dict(arg: None)
:type *args: str
:param **kwargs: User options to translate placeholder expressions which will
get combined with *args and passed on to apply_query_params
:type **kwargs: key = array(str)
:returns: DataFrame with the query results.
"""

units = query.units if hasattr(query, "units") else None
if query_params is not None:
query = apply_query_params(query, query_params, True)
# fold args into kwargs by using the value as the key and the implicit value is None
for arg in args:
kwargs[arg] = None

# replace any potential place holders in the query with the query params
query = apply_query_params(query, kwargs, True)

data_dict = super(Gcam, self).get_data(query)
data_df = DataFrame(data_dict)
# The data coming out of gcam is unaggregated so we will need to do that now
Expand All @@ -51,24 +60,40 @@ def get_data(self, query, query_params=None):
value_col = cols[-2] if cols[-1] == "year" else cols[-1]
data_df = data_df.groupby(cols.drop(value_col).to_list(), as_index=False).sum()
if units is not None:
data_df.meta = {'units': units}
# Attempting to attach meta data to the data frame will generate a warning:
# Pandas doesn't allow columns to be created via a new attribute name
# We are of course not trying to generate a new column so the warning
# is not relevant, thus an explicit ignore is needed
with warnings.catch_warnings():
warnings.simplefilter("ignore")
data_df.meta = {'units': units}
return data_df

def set_data(self, data_df, query, query_params=None):
def set_data(self, data_df, query, *args, **kwargs):
"""Changes arbitrary data in a running instance of GCAM.
:param data_df: DataFrame of data to set
:type data_df: DataFrame
:param query: GCAM fusion query
:param query_params: User options to translate placeholder expressions in query should
it have any
:type query_params: dict(string: array(string))
:type query: str
:param *args: User options to translate placeholder expressions which will
be added to the kwargs as dict(arg: None)
:type *args: str
:param **kwargs: User options to translate placeholder expressions which will
get combined with *args and passed on to apply_query_params
:type **kwargs: key = array(str)
"""

if query_params is not None:
query = apply_query_params(query, query_params, False)
# fold args into kwargs by using the value as the key and the implicit value is None
for arg in args:
kwargs[arg] = None

# replace any potential place holders in the query with the query params
query = apply_query_params(query, kwargs, False)

# we need to transform the data from a DataFrame to a dict where the column
# name key maps to the column as a numpy array
data_dict = dict()
for key, value in data_df.items():
data_dict[key] = value.to_numpy()
Expand Down
Loading

0 comments on commit c480012

Please sign in to comment.