From 4dfa8c14a8d5e25c7c75305009851bf7d94c01ac Mon Sep 17 00:00:00 2001 From: linhnguyen-cinnamon Date: Fri, 7 Jun 2024 13:47:49 +0900 Subject: [PATCH] back to multiple joiner options --- .../kotaemon/kotaemon/loaders/excel_loader.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/excel_loader.py b/libs/kotaemon/kotaemon/loaders/excel_loader.py index 69d5927bb..4b98b3aea 100644 --- a/libs/kotaemon/kotaemon/loaders/excel_loader.py +++ b/libs/kotaemon/kotaemon/loaders/excel_loader.py @@ -75,18 +75,30 @@ def load_data( ) dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config) - for key in dfs.keys(): - # remove redundant row and column + sheet_names = dfs.keys() + df_sheets = [] + + for key in sheet_names: + sheet = [] + if include_sheetname: + sheet.append([key]) + dfs[key] = dfs[key].dropna(axis=0, how='all') dfs[key] = dfs[key].dropna(axis=0, how='all') - dfs[key] = dfs[key].dropna(axis=1, how='all') - dfs[key].fillna('', inplace=True) + dfs[key].fillna('', inplace=True) + sheet.extend(dfs[key].values.astype(str).tolist()) + df_sheets.append(sheet) + + text_list = list( + itertools.chain.from_iterable(df_sheets) + ) # flatten list of lists output = [ Document( - text="{}\n{}".format(key, dfs[key].to_string()), - metadata=extra_info.update({'name': key}), + text=self._row_joiner.join( + self._col_joiner.join(sublist) for sublist in text_list + ), + metadata=extra_info or {}, ) - for key in dfs.keys() ] return output