From f138ba21617511c13bb3163f63c78f8ab7f7508b Mon Sep 17 00:00:00 2001 From: linhnguyen-cinnamon Date: Fri, 7 Jun 2024 13:11:05 +0900 Subject: [PATCH] remove columns and rows that contain only NaN --- .../kotaemon/kotaemon/loaders/excel_loader.py | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/excel_loader.py b/libs/kotaemon/kotaemon/loaders/excel_loader.py index d903aea84..69d5927bb 100644 --- a/libs/kotaemon/kotaemon/loaders/excel_loader.py +++ b/libs/kotaemon/kotaemon/loaders/excel_loader.py @@ -75,27 +75,18 @@ def load_data( ) dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config) - sheet_names = dfs.keys() - df_sheets = [] - - for key in sheet_names: - sheet = [] - if include_sheetname: - sheet.append([key]) - sheet.extend(dfs[key].values.astype(str).tolist()) - df_sheets.append(sheet) - - text_list = list( - itertools.chain.from_iterable(df_sheets) - ) # flatten list of lists + for key in dfs.keys(): + # remove redundant row and column + dfs[key] = dfs[key].dropna(axis=0, how='all') + dfs[key] = dfs[key].dropna(axis=1, how='all') + dfs[key].fillna('', inplace=True) output = [ Document( - text=self._row_joiner.join( - self._col_joiner.join(sublist) for sublist in text_list - ), - metadata=extra_info or {}, + text="{}\n{}".format(key, dfs[key].to_string()), + metadata=extra_info.update({'name': key}), ) + for key in dfs.keys() ] return output