Skip to content

Commit

Permalink
Remove columns and rows that contain only NaN values
Browse files Browse the repository at this point in the history
  • Loading branch information
linhnguyen-cinnamon authored and linhnguyen-cinnamon committed Jun 7, 2024
1 parent 56dfc8f commit f138ba2
Showing 1 changed file with 8 additions and 17 deletions.
25 changes: 8 additions & 17 deletions libs/kotaemon/kotaemon/loaders/excel_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,27 +75,18 @@ def load_data(
)

dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)
sheet_names = dfs.keys()
df_sheets = []

for key in sheet_names:
sheet = []
if include_sheetname:
sheet.append([key])
sheet.extend(dfs[key].values.astype(str).tolist())
df_sheets.append(sheet)

text_list = list(
itertools.chain.from_iterable(df_sheets)
) # flatten list of lists
for key in dfs.keys():
# drop rows and columns that are entirely NaN
dfs[key] = dfs[key].dropna(axis=0, how='all')
dfs[key] = dfs[key].dropna(axis=1, how='all')
dfs[key].fillna('', inplace=True)

output = [
Document(
text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list
),
metadata=extra_info or {},
text="{}\n{}".format(key, dfs[key].to_string()),
metadata=extra_info.update({'name': key}),
)
for key in dfs.keys()
]

return output

0 comments on commit f138ba2

Please sign in to comment.