Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v1.5.0, 08/12/25 -- Improved read and write speed
add gdx.GdxFile.read_single_symbol(self, filename, target_symbol_name)
add disable_gc kwarg to gdx.GdxSymbol.load,
add disable_gc kwarg to read_gdx.to_dataframes and read_gdx.to_dataframe
v1.4.0, 07/21/23 -- add get_data_types function that maps symbol name to gdx.GamsDataType;
add load_set_text kwarg to gdx.GdxSymbol.load, to_dataframe, and to_dataframes
v1.3.0, 05/09/23 -- performance improvements (faster read and write)
Expand Down
2 changes: 1 addition & 1 deletion gdxpds/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__title__ = "gdxpds"
__description__ = "gdx-pandas is a python package to translate between gdx (GAMS data) and pandas"
__url__ = "https://github.com/NREL/gdx-pandas"
__version__ = "1.4.0"
__version__ = "1.5.0"
__author__ = "Elaine T. Hale"
__author_email__ = "elaine.hale@nrel.gov"
__license__ = "BSD-3"
Expand Down
184 changes: 148 additions & 36 deletions gdxpds/gdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,74 @@ def read(self,filename):
symbol.load()
return

def read_single_symbol(self,filename,target_symbol_name):
"""
Optimized read method that only loads metadata for a specific symbol.

Parameters
----------
filename : pathlib.Path or str
Path to the GDX file to read
target_symbol_name : str
Name of the specific symbol to read

Returns
-------
GdxSymbol
The requested symbol (not yet loaded with data)

Raises
------
Error
If not self.empty or if symbol not found
"""
if not self.empty:
raise Error("GdxFile.read_single_symbol can only be used if the GdxFile is .empty")
Comment on lines +291 to +292
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean that you can't call this function multiple times to read in the metadata for multiple symbols? So you can either read in the metadata for all symbols or for just one symbol?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That section should probably be improved. You can call that function multiple times, because it always closes the file and starts fresh. That check was a safeguard while developing and the code should not reach that section, but there might be some use case I can't think of, so I didn't remove that Error.


# open the file
rc = gdxcc.gdxOpenRead(self.H, str(filename))
if not rc[0]:
raise GdxError(self.H, f"Could not open {filename!r}")
self._filename = filename

# read in meta-data for the file
ret, self._version, self._producer = gdxcc.gdxFileVersion(self.H)
if ret != 1:
raise GdxError(self.H, "Could not get file version")
ret, symbol_count, element_count = gdxcc.gdxSystemInfo(self.H)
logger.debug(f"Opening '{filename}' with {symbol_count} symbols and "
f"{element_count} elements, searching for '{target_symbol_name}' with optimized read.")

# read universal set
ret, name, dims, data_type = gdxcc.gdxSymbolInfo(self.H, 0)
if ret != 1:
raise GdxError(self.H, "Could not get symbol info for the universal set")
self.universal_set = GdxSymbol(name, data_type, dims=dims, file=self, index=0)

# search for target symbol without creating objects for others
target_symbol = None
for i in range(symbol_count):
index = i + 1
ret, name, dims, data_type = gdxcc.gdxSymbolInfo(self.H, index)
if ret != 1:
raise GdxError(self.H, f"Could not get symbol info for symbol {index}")

if name == target_symbol_name:
# found our target - create the symbol object with full metadata
try:
target_symbol = GdxSymbol(name, data_type, dims=dims, file=self, index=index)
self.append(target_symbol)
Comment on lines +325 to +326
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is skipping this for symbols whose data you don't want to load costly enough in terms of time to warrant this separate function?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In large GDX files, it is a very significant improvement in runtime. We regularly encounter 100-1000 MB GDX files with hundreds of symbols. This method is sometimes up to 15x faster when reading a single symbol from a large file. There are of course other overheads, like opening the file, etc., and thus the old method is significantly better for reading all symbols. The increase in code maintenance and documentation is the biggest drawback. Small files are always extremely fast.

logger.debug(f"Found and loaded metadata for symbol '{target_symbol_name}' at index {index}")
break
except Exception as e:
logger.error(f"Unable to initialize target GdxSymbol {name!r}, because {e}.")
raise

if target_symbol is None:
raise Error(f"No symbol named '{target_symbol_name}' in '{filename}'")

return target_symbol

def write(self,filename):
"""
Writes this :py:class:`GdxFile` to filename
Expand Down Expand Up @@ -991,16 +1059,19 @@ def __str__(self):
s += ", loaded" if self.loaded else ", not loaded"
return s

def load(self, load_set_text=False):
def load(self,load_set_text=False,disable_gc=True):
"""
Loads this :py:class:`GdxSymbol` from its :py:attr:`file`, thereby popluating
Loads this :py:class:`GdxSymbol` from its :py:attr:`file`, thereby populating
:py:attr:`dataframe`.

Parameters
----------
load_set_text : bool
If True (default is False) and this symbol is a :class:`GamsDataType.Set <GamsDataType>`,
loads the GDX Text field into the :py:attr:`dataframe` rather than a `c_bool`.
disable_gc: bool
If True (default is True), disables Python's garbage collector when reading data to
speed up the process.
"""
if self.loaded:
logger.info("Nothing to do. Symbol already loaded.")
Expand All @@ -1010,29 +1081,51 @@ def load(self, load_set_text=False):
if not self.index:
raise Error("Cannot load {} because there is no symbol index".format(repr(self)))

if self.data_type == GamsDataType.Parameter and HAVE_GDX2PY:
self.dataframe = gdx2py.par2list(self.file.filename,self.name)
self._loaded = True
return
# GDX2PY does not have property .par2list and the lines below should be rewritten. Commenting out for now.
#if self.data_type == GamsDataType.Parameter and HAVE_GDX2PY:
# self.dataframe = gdx2py.par2list(self.file.filename,self.name)
# self._loaded = True
# return

_ret, records = gdxcc.gdxDataReadStrStart(self.file.H,self.index)
# preprocessing
_, records = gdxcc.gdxDataReadStrStart(self.file.H,self.index)

# Local bindings to speed up the loops
fH = self.file.H
gdxDataReadStr = gdxcc.gdxDataReadStr
gdxGetElemText = gdxcc.gdxGetElemText
value_indices = [col_ind for _, col_ind in self.value_cols]

def reader():
handle = self.file.H
for i in range(records):
yield gdxcc.gdxDataReadStr(handle)

vc = self.value_cols # do this for speed in the next line
if load_set_text and (self.data_type == GamsDataType.Set):
data = [elements + [gdxcc.gdxGetElemText(self.file.H,int(values[col_ind]))[1]
for _col_name, col_ind in vc]
for _ret, elements, values, _afdim in reader()]
self._fixup_set_vals = False
else:
data = [elements + [values[col_ind] for col_name, col_ind in vc] for ret, elements, values, afdim in reader()]
self.dataframe = data
for _ in range(records):
yield gdxDataReadStr(fH)

# Disable GC
import gc
gc_was_enabled = False
if disable_gc:
gc_was_enabled = gc.isenabled()
gc.disable()

try:
# Read data row by row
if load_set_text and (self.data_type == GamsDataType.Set):
self.dataframe = [elements + [gdxGetElemText(fH, int(values[i]))[1]
for i in value_indices]
for _, elements, values, _ in reader()]
self._fixup_set_vals = False
else:
self.dataframe = [elements + [values[i] for i in value_indices]
for _, elements, values, _ in reader()]

finally:
# restore GC if changed
if disable_gc and gc_was_enabled and not gc.isenabled():
gc.enable()
Comment on lines +1122 to +1124
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm glad this is here. It makes me more comfortable that disabling is default behavior.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: "def load(self,load_set_text=False,disable_gc=True):" The garbage collector is Python's internal guard against memory leaks etc. and this disables it by default. "disable_gc=False" would be the Python default behavior. The speedup with large GDX files is quite large and I didn't notice any issues while testing.


if not self.data_type in (GamsDataType.Set, GamsDataType.Alias):
self.dataframe = special.convert_gdx_to_np_svs(self.dataframe, self.num_dims)

self._loaded = True
return

Expand Down Expand Up @@ -1078,31 +1171,50 @@ def write(self,index=None):
self.data_type.value,
userinfo):
raise GdxError(self.file.H,"Could not start writing data for symbol {}".format(repr(self.name)))

# set domain information
if self.num_dims > 0:
if self.index:
if not gdxcc.gdxSymbolSetDomainX(self.file.H,self.index,self.dims):
raise GdxError(self.file.H,"Could not set domain information for {}. Domains are {}".format(repr(self.name),repr(self.dims)))
else:
logger.info("Not writing domain information because symbol index is unknown.")

if self.data_type not in (GamsDataType.Set, GamsDataType.Alias):
# Only reset index if actually needed for the conversion
if self.dataframe.index.duplicated().any() or not self.dataframe.index.is_monotonic_increasing:
self.dataframe = self.dataframe.reset_index(drop=True)
to_write = convert_np_to_gdx_svs(self.dataframe, self.num_dims)
else:
to_write = self.dataframe.copy()

# Local bindings to speed up the loops
values = gdxcc.doubleArray(gdxcc.GMS_VAL_MAX)
# make sure index is clean -- needed for merging in convert_np_to_gdx_svs
self.dataframe = self.dataframe.reset_index(drop=True)
# convert special numeric values if appropriate
to_write = self.dataframe.copy() if (self.data_type in (GamsDataType.Set, GamsDataType.Alias)) else special.convert_np_to_gdx_svs(self.dataframe, self.num_dims)
# write each row
for row in to_write.itertuples(index=False, name=None):
dims = [str(x) for x in row[:self.num_dims]]
vals = row[self.num_dims:]
for _col_name, col_ind in self.value_cols:
values[col_ind] = float(0.0)
try:
if isinstance(vals[col_ind],Number):
values[col_ind] = float(vals[col_ind])
except:
raise Error("Unable to set element {} from {}.".format(col_ind,vals))
gdxcc.gdxDataWriteStr(self.file.H,dims,values)
gdxDataWriteStr = gdxcc.gdxDataWriteStr
fh = self.file.H
value_indices = [col_ind for _, col_ind in self.value_cols]
snd = self.num_dims

# Convert dimensions to string
try:
to_write.iloc[:, :snd] = to_write.iloc[:, :snd].astype(str)
except Exception as e:
raise Error(f"Unable to convert values in to_write df to string: {e}")

# write each row
for row in to_write.itertuples(index=False, name=None):
dims = list(row[:snd])
for i in value_indices:
try:
v = row[snd + i]
values[i] = float(v) if isinstance(v, Number) else 0.0
except:
raise Error("Unable to set element {} from {}.".format(i,vals))
gdxDataWriteStr(fh,dims,values)

# close
gdxcc.gdxDataWriteDone(self.file.H)

return


Expand Down
35 changes: 24 additions & 11 deletions gdxpds/read_gdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
class Translator(object):
def __init__(self,gdx_file,gams_dir=None,lazy_load=False):
self.__gdx = GdxFile(gams_dir=gams_dir,lazy_load=lazy_load)
self.__gdx.read(gdx_file)
if gdx_file is not None:
self.__gdx.read(gdx_file)
self.__dataframes = None

def __exit__(self, *args):
Expand Down Expand Up @@ -56,7 +57,7 @@ def dataframes(self):
@property
def symbols(self):
return [symbol.name for symbol in self.gdx]

@property
def data_types(self):
return {symbol.name: symbol.data_type for symbol in self.gdx}
Expand All @@ -74,12 +75,12 @@ def _get_dataframes(self, load_set_text=False):
self.__dataframes = OrderedDict()
for symbol in self.__gdx:
if not symbol.loaded:
symbol.load(load_set_text=load_set_text)
symbol.load(load_set_text=load_set_text,disable_gc=disable_gc)
self.__dataframes[symbol.name] = symbol.dataframe.copy()
return self.__dataframes


def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):

def to_dataframes(gdx_file,gams_dir=None,load_set_text=False,disable_gc=True):
"""
Primary interface for converting a GAMS GDX file to pandas DataFrames.

Expand All @@ -92,6 +93,9 @@ def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):
load_set_text : bool
If True (default is False), then for every symbol that is a Set, loads
the GDX Text field into the dataframe rather than a `c_bool`.
disable_gc: bool
If True (default is True), disables Python's garbage collector when reading data to
speed up the process.

Returns
-------
Expand All @@ -100,7 +104,7 @@ def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):
file, keyed with the symbol name.
"""
if load_set_text:
return Translator(gdx_file,gams_dir=gams_dir,lazy_load=True)._get_dataframes(load_set_text=load_set_text)
return Translator(gdx_file,gams_dir=gams_dir,lazy_load=True)._get_dataframes(load_set_text=load_set_text,disable_gc=disable_gc)
return Translator(gdx_file,gams_dir=gams_dir).dataframes


Expand Down Expand Up @@ -143,7 +147,7 @@ def get_data_types(gdx_file,gams_dir=None):



def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_text=False):
def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_text=False,disable_gc=True):
"""
Interface for getting the data for a single symbol

Expand All @@ -161,6 +165,9 @@ def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_
load_set_text : bool
If True (default is False) and symbol_name is a Set, loads the GDX Text
field into the dataframe rather than a `c_bool`.
disable_gc: bool
If True (default is True), disables Python's garbage collector when reading data to
speed up the process.

Returns
-------
Expand All @@ -169,8 +176,14 @@ def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_
where the key is symbol_name and the value is the corresponding
pd.DataFrame. Otherwise (if not old_interface), returns just the
pd.DataFrame.

"""
df = Translator(gdx_file,gams_dir=gams_dir,lazy_load=True).dataframe(
symbol_name,
load_set_text=load_set_text)
return {symbol_name: df} if old_interface else df
gdx = GdxFile(gams_dir=gams_dir, lazy_load=True)
try:
symbol = gdx.read_single_symbol(gdx_file,symbol_name)
symbol.load(load_set_text=load_set_text,disable_gc=disable_gc)
df = symbol.dataframe.copy()
return {symbol_name: df} if old_interface else df

finally:
gdx.cleanup()
6 changes: 2 additions & 4 deletions gdxpds/special.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,8 @@ def convert_gdx_to_np_svs(df, num_dims):

# create clean copy of df
tmp = df.copy()

# apply the map to the value columns and merge with the dimensional information
tmp = (tmp.iloc[:, :num_dims]).merge(tmp.iloc[:, num_dims:].replace(GDX_TO_NP_SVS),
left_index=True, right_index=True)
# replace values in the relevant columns
tpm.iloc[:, num_dims:] = tmp.iloc[:, num_dims:].replace(GDX_TO_NP_SVS)
return tmp


Expand Down