diff --git a/libs/kotaemon/kotaemon/loaders/excel_loader.py b/libs/kotaemon/kotaemon/loaders/excel_loader.py index 69d5927bb..4b98b3aea 100644 --- a/libs/kotaemon/kotaemon/loaders/excel_loader.py +++ b/libs/kotaemon/kotaemon/loaders/excel_loader.py @@ -75,18 +75,30 @@ def load_data( ) dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config) - for key in dfs.keys(): - # remove redundant row and column + sheet_names = dfs.keys() + df_sheets = [] + + for key in sheet_names: + sheet = [] + if include_sheetname: + sheet.append([key]) + dfs[key] = dfs[key].dropna(axis=0, how='all') dfs[key] = dfs[key].dropna(axis=0, how='all') - dfs[key] = dfs[key].dropna(axis=1, how='all') - dfs[key].fillna('', inplace=True) + dfs[key].fillna('', inplace=True) + sheet.extend(dfs[key].values.astype(str).tolist()) + df_sheets.append(sheet) + + text_list = list( + itertools.chain.from_iterable(df_sheets) + ) # flatten list of lists output = [ Document( - text="{}\n{}".format(key, dfs[key].to_string()), - metadata=extra_info.update({'name': key}), + text=self._row_joiner.join( + self._col_joiner.join(sublist) for sublist in text_list + ), + metadata=extra_info or {}, ) - for key in dfs.keys() ] return output