Skip to content

Commit

Permalink
bug fix and make describe data threaded
Browse files Browse the repository at this point in the history
  • Loading branch information
ahuang11 committed Sep 24, 2024
1 parent c3dd2e7 commit eaceeac
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 63 deletions.
2 changes: 1 addition & 1 deletion lumen/ai/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def controls(self):
table = memory.get("current_table")
self._previous_source = source
self._previous_table = table
columns = source.get_schema(table).keys()
columns = list(source.get_schema(table).keys())
index_col = pn.widgets.AutocompleteInput.from_param(
self.param.index_col, options=columns, name="Join on",
placeholder="Start typing column name", search_strategy="includes",
Expand Down
127 changes: 65 additions & 62 deletions lumen/ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def get_data_sync():
return await asyncio.to_thread(get_data_sync)


def describe_data(df: pd.DataFrame) -> str:
async def describe_data(df: pd.DataFrame) -> str:
def format_float(num):
if pd.isna(num):
return num
Expand All @@ -168,67 +168,70 @@ def format_float(num):
else:
return f"{num:.1e}" # Exponential notation with two decimals

size = df.size
shape = df.shape
if size < 250:
return df

is_summarized = False
if shape[0] > 5000:
is_summarized = True
df = df.sample(5000)

df = df.sort_index()

for col in df.columns:
if isinstance(df[col].iloc[0], pd.Timestamp):
df[col] = pd.to_datetime(df[col])

describe_df = df.describe(percentiles=[])
columns_to_drop = ["min", "max"] # present if any numeric
columns_to_drop = [col for col in columns_to_drop if col in describe_df.columns]
df_describe_dict = describe_df.drop(columns=columns_to_drop).to_dict()

for col in df.select_dtypes(include=["object"]).columns:
if col not in df_describe_dict:
df_describe_dict[col] = {}
df_describe_dict[col]["nunique"] = df[col].nunique()
try:
df_describe_dict[col]["lengths"] = {
"max": df[col].str.len().max(),
"min": df[col].str.len().min(),
"mean": float(df[col].str.len().mean()),
}
except AttributeError:
pass

for col in df.columns:
if col not in df_describe_dict:
df_describe_dict[col] = {}
df_describe_dict[col]["nulls"] = int(df[col].isnull().sum())

# select datetime64 columns
for col in df.select_dtypes(include=["datetime64"]).columns:
for key in df_describe_dict[col]:
df_describe_dict[col][key] = str(df_describe_dict[col][key])
df[col] = df[col].astype(str) # shorten output

# select all numeric columns and round
for col in df.select_dtypes(include=["int64", "float64"]).columns:
for key in df_describe_dict[col]:
df_describe_dict[col][key] = format_float(df_describe_dict[col][key])

for col in df.select_dtypes(include=["float64"]).columns:
df[col] = df[col].apply(format_float)

data = {
"summary": {
"total_table_cells": size,
"total_shape": shape,
"is_summarized": is_summarized,
},
"stats": df_describe_dict,
}
def describe_data_sync(df):
size = df.size
shape = df.shape
if size < 250:
return df

is_summarized = False
if shape[0] > 5000:
is_summarized = True
df = df.sample(5000)

df = df.sort_index()

for col in df.columns:
if isinstance(df[col].iloc[0], pd.Timestamp):
df[col] = pd.to_datetime(df[col])

describe_df = df.describe(percentiles=[])
columns_to_drop = ["min", "max"] # present if any numeric
columns_to_drop = [col for col in columns_to_drop if col in describe_df.columns]
df_describe_dict = describe_df.drop(columns=columns_to_drop).to_dict()

for col in df.select_dtypes(include=["object"]).columns:
if col not in df_describe_dict:
df_describe_dict[col] = {}
df_describe_dict[col]["nunique"] = df[col].nunique()
try:
df_describe_dict[col]["lengths"] = {
"max": df[col].str.len().max(),
"min": df[col].str.len().min(),
"mean": float(df[col].str.len().mean()),
}
except AttributeError:
pass

for col in df.columns:
if col not in df_describe_dict:
df_describe_dict[col] = {}
df_describe_dict[col]["nulls"] = int(df[col].isnull().sum())

# select datetime64 columns
for col in df.select_dtypes(include=["datetime64"]).columns:
for key in df_describe_dict[col]:
df_describe_dict[col][key] = str(df_describe_dict[col][key])
df[col] = df[col].astype(str) # shorten output

# select all numeric columns and round
for col in df.select_dtypes(include=["int64", "float64"]).columns:
for key in df_describe_dict[col]:
df_describe_dict[col][key] = format_float(df_describe_dict[col][key])

for col in df.select_dtypes(include=["float64"]).columns:
df[col] = df[col].apply(format_float)

return {
"summary": {
"total_table_cells": size,
"total_shape": shape,
"is_summarized": is_summarized,
},
"stats": df_describe_dict,
}

data = asyncio.to_thread(describe_data_sync, df)
return data


Expand Down

0 comments on commit eaceeac

Please sign in to comment.