From 94d414427443e42a44f0528adb69905ffbad71af Mon Sep 17 00:00:00 2001
From: tomiL
Date: Fri, 22 Aug 2025 10:17:36 +0300
Subject: [PATCH] Improving read and write speed, adding
 gdx.GdxFile.read_single_symbol, and adding disable_gc kwarg to
 gdx.GdxSymbol.load, read_gdx.to_dataframes, and read_gdx.to_dataframe

---
 CHANGES.txt        |   4 +
 gdxpds/_version.py |   2 +-
 gdxpds/gdx.py      | 184 ++++++++++++++++++++++++++++++++++++---------
 gdxpds/read_gdx.py |  35 ++++++---
 gdxpds/special.py  |   6 +-
 5 files changed, 179 insertions(+), 52 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0428723..cbc5f40 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+v1.5.0, 08/12/25 -- Improved read and write speed
+                    add gdx.GdxFile.read_single_symbol(self, filename, target_symbol_name)
+                    add disable_gc kwarg to gdx.GdxSymbol.load,
+                    add disable_gc kwarg to read_gdx.to_dataframes and read_gdx.to_dataframe
 v1.4.0, 07/21/23 -- add get_data_types function that maps symbol name to gdx.GamsDataType;
                     add load_set_text kwarg to gdx.GdxSymbol.load, to_dataframe, and to_dataframes
 v1.3.0, 05/09/23 -- performance improvements (faster read and write)
diff --git a/gdxpds/_version.py b/gdxpds/_version.py
index 293fc1a..c0b3bd7 100644
--- a/gdxpds/_version.py
+++ b/gdxpds/_version.py
@@ -3,7 +3,7 @@
 __title__ = "gdxpds"
 __description__ = "gdx-pandas is a python package to translate between gdx (GAMS data) and pandas"
 __url__ = "https://github.com/NREL/gdx-pandas"
-__version__ = "1.4.0"
+__version__ = "1.5.0"
 __author__ = "Elaine T. Hale"
 __author_email__ = "elaine.hale@nrel.gov"
 __license__ = "BSD-3"
diff --git a/gdxpds/gdx.py b/gdxpds/gdx.py
index 78fea7d..f730735 100644
--- a/gdxpds/gdx.py
+++ b/gdxpds/gdx.py
@@ -267,6 +267,74 @@ def read(self,filename):
                 symbol.load()
         return
 
+    def read_single_symbol(self,filename,target_symbol_name):
+        """
+        Optimized read method that only loads metadata for a specific symbol.
+
+        Parameters
+        ----------
+        filename : pathlib.Path or str
+            Path to the GDX file to read
+        target_symbol_name : str
+            Name of the specific symbol to read
+
+        Returns
+        -------
+        GdxSymbol
+            The requested symbol (not yet loaded with data)
+
+        Raises
+        ------
+        Error
+            If not self.empty or if symbol not found
+        """
+        if not self.empty:
+            raise Error("GdxFile.read_single_symbol can only be used if the GdxFile is .empty")
+
+        # open the file
+        rc = gdxcc.gdxOpenRead(self.H, str(filename))
+        if not rc[0]:
+            raise GdxError(self.H, f"Could not open {filename!r}")
+        self._filename = filename
+
+        # read in meta-data for the file
+        ret, self._version, self._producer = gdxcc.gdxFileVersion(self.H)
+        if ret != 1:
+            raise GdxError(self.H, "Could not get file version")
+        ret, symbol_count, element_count = gdxcc.gdxSystemInfo(self.H)
+        logger.debug(f"Opening '{filename}' with {symbol_count} symbols and "
+                     f"{element_count} elements, searching for '{target_symbol_name}' with optimized read.")
+
+        # read universal set
+        ret, name, dims, data_type = gdxcc.gdxSymbolInfo(self.H, 0)
+        if ret != 1:
+            raise GdxError(self.H, "Could not get symbol info for the universal set")
+        self.universal_set = GdxSymbol(name, data_type, dims=dims, file=self, index=0)
+
+        # search for target symbol without creating objects for others
+        target_symbol = None
+        for i in range(symbol_count):
+            index = i + 1
+            ret, name, dims, data_type = gdxcc.gdxSymbolInfo(self.H, index)
+            if ret != 1:
+                raise GdxError(self.H, f"Could not get symbol info for symbol {index}")
+
+            if name == target_symbol_name:
+                # found our target - create the symbol object with full metadata
+                try:
+                    target_symbol = GdxSymbol(name, data_type, dims=dims, file=self, index=index)
+                    self.append(target_symbol)
+                    logger.debug(f"Found and loaded metadata for symbol '{target_symbol_name}' at index {index}")
+                    break
+                except Exception as e:
+                    logger.error(f"Unable to initialize target GdxSymbol {name!r}, because {e}.")
+                    raise
+
+        if target_symbol is None:
+            raise Error(f"No symbol named '{target_symbol_name}' in '{filename}'")
+
+        return target_symbol
+
     def write(self,filename):
         """
         Writes this :py:class:`GdxFile` to filename
@@ -991,9 +1059,9 @@ def __str__(self):
         s += ", loaded" if self.loaded else ", not loaded"
         return s
 
-    def load(self, load_set_text=False):
+    def load(self,load_set_text=False,disable_gc=True):
         """
-        Loads this :py:class:`GdxSymbol` from its :py:attr:`file`, thereby popluating
+        Loads this :py:class:`GdxSymbol` from its :py:attr:`file`, thereby populating
         :py:attr:`dataframe`.
 
         Parameters
@@ -1001,6 +1069,9 @@ def load(self, load_set_text=False):
         load_set_text : bool
             If True (default is False) and this symbol is a :class:`GamsDataType.Set `,
             loads the GDX Text field into the :py:attr:`dataframe` rather than a `c_bool`.
+        disable_gc: bool
+            If True (default is True), disables Python's garbage collector when reading data to
+            speed up the process.
         """
         if self.loaded:
             logger.info("Nothing to do. Symbol already loaded.")
@@ -1010,29 +1081,51 @@ def load(self, load_set_text=False):
         if not self.index:
             raise Error("Cannot load {} because there is no symbol index".format(repr(self)))
 
-        if self.data_type == GamsDataType.Parameter and HAVE_GDX2PY:
-            self.dataframe = gdx2py.par2list(self.file.filename,self.name)
-            self._loaded = True
-            return
+        # GDX2PY does not have property .par2list and the lines below should be rewritten. Commenting out for now.
+        #if self.data_type == GamsDataType.Parameter and HAVE_GDX2PY:
+        #    self.dataframe = gdx2py.par2list(self.file.filename,self.name)
+        #    self._loaded = True
+        #    return
 
-        _ret, records = gdxcc.gdxDataReadStrStart(self.file.H,self.index)
+        # preprocessing
+        _, records = gdxcc.gdxDataReadStrStart(self.file.H,self.index)
+
+        # Local bindings to speed up the loops
+        fH = self.file.H
+        gdxDataReadStr = gdxcc.gdxDataReadStr
+        gdxGetElemText = gdxcc.gdxGetElemText
+        value_indices = [col_ind for _, col_ind in self.value_cols]
 
         def reader():
-            handle = self.file.H
-            for i in range(records):
-                yield gdxcc.gdxDataReadStr(handle)
-
-        vc = self.value_cols # do this for speed in the next line
-        if load_set_text and (self.data_type == GamsDataType.Set):
-            data = [elements + [gdxcc.gdxGetElemText(self.file.H,int(values[col_ind]))[1]
-                                for _col_name, col_ind in vc]
-                    for _ret, elements, values, _afdim in reader()]
-            self._fixup_set_vals = False
-        else:
-            data = [elements + [values[col_ind] for col_name, col_ind in vc] for ret, elements, values, afdim in reader()]
-        self.dataframe = data
+            for _ in range(records):
+                yield gdxDataReadStr(fH)
+
+        # Disable GC
+        import gc
+        gc_was_enabled = False
+        if disable_gc:
+            gc_was_enabled = gc.isenabled()
+            gc.disable()
+
+        try:
+            # Read data row by row
+            if load_set_text and (self.data_type == GamsDataType.Set):
+                self.dataframe = [elements + [gdxGetElemText(fH, int(values[i]))[1]
+                                              for i in value_indices]
+                                  for _, elements, values, _ in reader()]
+                self._fixup_set_vals = False
+            else:
+                self.dataframe = [elements + [values[i] for i in value_indices]
+                                  for _, elements, values, _ in reader()]
+
+        finally:
+            # restore GC if changed
+            if disable_gc and gc_was_enabled and not gc.isenabled():
+                gc.enable()
+
         if not self.data_type in (GamsDataType.Set, GamsDataType.Alias):
             self.dataframe = special.convert_gdx_to_np_svs(self.dataframe, self.num_dims)
+
         self._loaded = True
         return
 
@@ -1078,6 +1171,7 @@ def write(self,index=None):
                                           self.data_type.value,
                                           userinfo):
             raise GdxError(self.file.H,"Could not start writing data for symbol {}".format(repr(self.name)))
+
         # set domain information
         if self.num_dims > 0:
             if self.index:
@@ -1085,24 +1179,42 @@ def write(self,index=None):
                     raise GdxError(self.file.H,"Could not set domain information for {}. Domains are {}".format(repr(self.name),repr(self.dims)))
             else:
                 logger.info("Not writing domain information because symbol index is unknown.")
+
+        if self.data_type not in (GamsDataType.Set, GamsDataType.Alias):
+            # Only reset index if actually needed for the conversion
+            if self.dataframe.index.duplicated().any() or not self.dataframe.index.is_monotonic_increasing:
+                self.dataframe = self.dataframe.reset_index(drop=True)
+            to_write = special.convert_np_to_gdx_svs(self.dataframe, self.num_dims)
+        else:
+            to_write = self.dataframe.copy()
+
+        # Local bindings to speed up the loops
         values = gdxcc.doubleArray(gdxcc.GMS_VAL_MAX)
-        # make sure index is clean -- needed for merging in convert_np_to_gdx_svs
-        self.dataframe = self.dataframe.reset_index(drop=True)
-        # convert special numeric values if appropriate
-        to_write = self.dataframe.copy() if (self.data_type in (GamsDataType.Set, GamsDataType.Alias)) else special.convert_np_to_gdx_svs(self.dataframe, self.num_dims)
-        # write each row
-        for row in to_write.itertuples(index=False, name=None):
-            dims = [str(x) for x in row[:self.num_dims]]
-            vals = row[self.num_dims:]
-            for _col_name, col_ind in self.value_cols:
-                values[col_ind] = float(0.0)
-                try:
-                    if isinstance(vals[col_ind],Number):
-                        values[col_ind] = float(vals[col_ind])
-                except:
-                    raise Error("Unable to set element {} from {}.".format(col_ind,vals))
-            gdxcc.gdxDataWriteStr(self.file.H,dims,values)
+        gdxDataWriteStr = gdxcc.gdxDataWriteStr
+        fh = self.file.H
+        value_indices = [col_ind for _, col_ind in self.value_cols]
+        snd = self.num_dims
+
+        # Convert dimensions to string
+        try:
+            to_write.iloc[:, :snd] = to_write.iloc[:, :snd].astype(str)
+        except Exception as e:
+            raise Error(f"Unable to convert values in to_write df to string: {e}")
+
+        # write each row
+        for row in to_write.itertuples(index=False, name=None):
+            dims = list(row[:snd])
+            for i in value_indices:
+                try:
+                    v = row[snd + i]
+                    values[i] = float(v) if isinstance(v, Number) else 0.0
+                except Exception:
+                    raise Error("Unable to set element {} from {}.".format(i,row))
+            gdxDataWriteStr(fh,dims,values)
+
+        # close
         gdxcc.gdxDataWriteDone(self.file.H)
+
         return
 
 
diff --git a/gdxpds/read_gdx.py b/gdxpds/read_gdx.py
index e6e67e8..450da20 100644
--- a/gdxpds/read_gdx.py
+++ b/gdxpds/read_gdx.py
@@ -11,7 +11,8 @@ class Translator(object):
 
     def __init__(self,gdx_file,gams_dir=None,lazy_load=False):
         self.__gdx = GdxFile(gams_dir=gams_dir,lazy_load=lazy_load)
-        self.__gdx.read(gdx_file)
+        if gdx_file is not None:
+            self.__gdx.read(gdx_file)
         self.__dataframes = None
 
     def __exit__(self, *args):
@@ -56,7 +57,7 @@ def dataframes(self):
     @property
     def symbols(self):
         return [symbol.name for symbol in self.gdx]
-    
+
     @property
     def data_types(self):
         return {symbol.name: symbol.data_type for symbol in self.gdx}
@@ -74,12 +75,12 @@ def _get_dataframes(self, load_set_text=False):
             self.__dataframes = OrderedDict()
             for symbol in self.__gdx:
                 if not symbol.loaded:
-                    symbol.load(load_set_text=load_set_text)
+                    symbol.load(load_set_text=load_set_text,disable_gc=disable_gc)
                 self.__dataframes[symbol.name] = symbol.dataframe.copy()
         return self.__dataframes
 
-    
-def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):
+
+def to_dataframes(gdx_file,gams_dir=None,load_set_text=False,disable_gc=True):
     """
     Primary interface for converting a GAMS GDX file to pandas DataFrames.
 
@@ -92,6 +93,9 @@ def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):
     load_set_text : bool
         If True (default is False), then for every symbol that is a Set, loads
         the GDX Text field into the dataframe rather than a `c_bool`.
+    disable_gc: bool
+        If True (default is True), disables Python's garbage collector when reading data to
+        speed up the process.
 
     Returns
     -------
@@ -100,7 +104,7 @@ def to_dataframes(gdx_file,gams_dir=None,load_set_text=False):
         file, keyed with the symbol name.
     """
     if load_set_text:
-        return Translator(gdx_file,gams_dir=gams_dir,lazy_load=True)._get_dataframes(load_set_text=load_set_text)
+        return Translator(gdx_file,gams_dir=gams_dir,lazy_load=True)._get_dataframes(load_set_text=load_set_text,disable_gc=disable_gc)
     return Translator(gdx_file,gams_dir=gams_dir).dataframes
 
 
@@ -143,7 +147,7 @@ def get_data_types(gdx_file,gams_dir=None):
 
 
 
-def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_text=False):
+def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_text=False,disable_gc=True):
     """
     Interface for getting the data for a single symbol
 
@@ -161,6 +165,9 @@ def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_
     load_set_text : bool
         If True (default is False) and symbol_name is a Set, loads the GDX Text field into
         the dataframe rather than a `c_bool`.
+    disable_gc: bool
+        If True (default is True), disables Python's garbage collector when reading data to
+        speed up the process.
 
     Returns
     -------
@@ -169,8 +176,14 @@ def to_dataframe(gdx_file,symbol_name,gams_dir=None,old_interface=True,load_set_
         where the key is symbol_name and the value is the corresponding
         pd.DataFrame. Otherwise (if not old_interface), returns just the
         pd.DataFrame.
+
     """
-    df = Translator(gdx_file,gams_dir=gams_dir,lazy_load=True).dataframe(
-        symbol_name,
-        load_set_text=load_set_text)
-    return {symbol_name: df} if old_interface else df
+    gdx = GdxFile(gams_dir=gams_dir, lazy_load=True)
+    try:
+        symbol = gdx.read_single_symbol(gdx_file,symbol_name)
+        symbol.load(load_set_text=load_set_text,disable_gc=disable_gc)
+        df = symbol.dataframe.copy()
+        return {symbol_name: df} if old_interface else df
+
+    finally:
+        gdx.cleanup()
diff --git a/gdxpds/special.py b/gdxpds/special.py
index 85d26a6..3eed36f 100644
--- a/gdxpds/special.py
+++ b/gdxpds/special.py
@@ -35,10 +35,8 @@ def convert_gdx_to_np_svs(df, num_dims):
 
     # create clean copy of df
     tmp = df.copy()
-
-    # apply the map to the value columns and merge with the dimensional information
-    tmp = (tmp.iloc[:, :num_dims]).merge(tmp.iloc[:, num_dims:].replace(GDX_TO_NP_SVS),
-                                         left_index=True, right_index=True)
+    # replace values in the relevant columns
+    tmp.iloc[:, num_dims:] = tmp.iloc[:, num_dims:].replace(GDX_TO_NP_SVS)
     return tmp
 
 