2 changes: 1 addition & 1 deletion mambular/__version__.py
@@ -16,4 +16,4 @@
 #
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "1.2.0"
+__version__ = "1.2.1"
128 changes: 33 additions & 95 deletions mambular/data_utils/datamodule.py
@@ -130,9 +130,7 @@ def preprocess_data(
             embeddings_val = [embeddings_val]

         split_data += embeddings_train
-        split_result = train_test_split(
-            *split_data, test_size=val_size, random_state=random_state
-        )
+        split_result = train_test_split(*split_data, test_size=val_size, random_state=random_state)

         self.X_train, self.X_val, self.y_train, self.y_val = split_result[:4]
         self.embeddings_train = split_result[4::2]
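Note on the `[4::2]` slicing above: sklearn's train_test_split returns a train/val pair for every array passed in, so after the first four entries (the X and y splits) the even offsets are the train halves of the embeddings and the odd offsets the validation halves. A minimal sketch of that contract (array names are illustrative, not from this PR):

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(20).reshape(10, 2)
    y = np.arange(10)
    emb = [np.random.rand(10, 4), np.random.rand(10, 8)]  # two embedding arrays

    result = train_test_split(X, y, *emb, test_size=0.2, random_state=0)
    X_tr, X_va, y_tr, y_va = result[:4]
    emb_train = result[4::2]  # [emb0_train, emb1_train]
    emb_val = result[5::2]    # [emb0_val, emb1_val]
    assert all(e.shape[0] == 8 for e in emb_train)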
@@ -161,37 +159,31 @@ def preprocess_data(
             self.embeddings_val = None

         # Fit the preprocessor on the combined training and validation data
-        combined_X = pd.concat([self.X_train, self.X_val], axis=0).reset_index(
-            drop=True
-        )
+        combined_X = pd.concat([self.X_train, self.X_val], axis=0).reset_index(drop=True)
         combined_y = np.concatenate((self.y_train, self.y_val), axis=0)

         if self.embeddings_train is not None and self.embeddings_val is not None:
             combined_embeddings = [
                 np.concatenate((emb_train, emb_val), axis=0)
-                for emb_train, emb_val in zip(
-                    self.embeddings_train, self.embeddings_val
-                )
+                for emb_train, emb_val in zip(self.embeddings_train, self.embeddings_val, strict=False)
             ]
         else:
             combined_embeddings = None

         self.preprocessor.fit(combined_X, combined_y, combined_embeddings)

         # Update feature info based on the actual processed data
-        (self.num_feature_info, self.cat_feature_info, self.embedding_feature_info) = (
-            self.preprocessor.get_feature_info()
-        )
+        (
+            self.num_feature_info,
+            self.cat_feature_info,
+            self.embedding_feature_info,
+        ) = self.preprocessor.get_feature_info()

     def setup(self, stage: str):
         """Transform the data and create DataLoaders."""
         if stage == "fit":
-            train_preprocessed_data = self.preprocessor.transform(
-                self.X_train, self.embeddings_train
-            )
-            val_preprocessed_data = self.preprocessor.transform(
-                self.X_val, self.embeddings_val
-            )
+            train_preprocessed_data = self.preprocessor.transform(self.X_train, self.embeddings_train)
+            val_preprocessed_data = self.preprocessor.transform(self.X_val, self.embeddings_val)

             # Initialize lists for tensors
             train_cat_tensors = []
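Aside on the new strict=False in the zip call above: the strict keyword landed in Python 3.10; strict=True raises on length-mismatched iterables, while strict=False keeps the old truncating behavior and makes that choice explicit for linters (e.g. ruff's B905 check). A two-line illustration:

    a = [1, 2, 3]
    b = ["x", "y"]
    print(list(zip(a, b, strict=False)))  # [(1, 'x'), (2, 'y')] - silently truncates
    # list(zip(a, b, strict=True)) would raise ValueError on the length mismatch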
@@ -205,75 +197,40 @@ def setup(self, stage: str):
             for key in self.cat_feature_info:  # type: ignore
                 dtype = (
                     torch.float32
-                    if any(
-                        x in self.cat_feature_info[key]["preprocessing"]
-                        for x in ["onehot", "pretrained"]
-                    )
+                    if any(x in self.cat_feature_info[key]["preprocessing"] for x in ["onehot", "pretrained"])  # type: ignore
                     else torch.long
                 )

-                cat_key = "cat_" + str(
-                    key
-                )  # Assuming categorical keys are prefixed with 'cat_'
+                cat_key = "cat_" + str(key)  # Assuming categorical keys are prefixed with 'cat_'
                 if cat_key in train_preprocessed_data:
-                    train_cat_tensors.append(
-                        torch.tensor(train_preprocessed_data[cat_key], dtype=dtype)
-                    )
+                    train_cat_tensors.append(torch.tensor(train_preprocessed_data[cat_key], dtype=dtype))
                 if cat_key in val_preprocessed_data:
-                    val_cat_tensors.append(
-                        torch.tensor(val_preprocessed_data[cat_key], dtype=dtype)
-                    )
+                    val_cat_tensors.append(torch.tensor(val_preprocessed_data[cat_key], dtype=dtype))

                 binned_key = "num_" + str(key)  # for binned features
                 if binned_key in train_preprocessed_data:
-                    train_cat_tensors.append(
-                        torch.tensor(train_preprocessed_data[binned_key], dtype=dtype)
-                    )
+                    train_cat_tensors.append(torch.tensor(train_preprocessed_data[binned_key], dtype=dtype))

                 if binned_key in val_preprocessed_data:
-                    val_cat_tensors.append(
-                        torch.tensor(val_preprocessed_data[binned_key], dtype=dtype)
-                    )
+                    val_cat_tensors.append(torch.tensor(val_preprocessed_data[binned_key], dtype=dtype))

             # Populate tensors for numerical features, if present in processed data
             for key in self.num_feature_info:  # type: ignore
-                num_key = "num_" + str(
-                    key
-                )  # Assuming numerical keys are prefixed with 'num_'
+                num_key = "num_" + str(key)  # Assuming numerical keys are prefixed with 'num_'
                 if num_key in train_preprocessed_data:
-                    train_num_tensors.append(
-                        torch.tensor(
-                            train_preprocessed_data[num_key], dtype=torch.float32
-                        )
-                    )
+                    train_num_tensors.append(torch.tensor(train_preprocessed_data[num_key], dtype=torch.float32))
                 if num_key in val_preprocessed_data:
-                    val_num_tensors.append(
-                        torch.tensor(
-                            val_preprocessed_data[num_key], dtype=torch.float32
-                        )
-                    )
+                    val_num_tensors.append(torch.tensor(val_preprocessed_data[num_key], dtype=torch.float32))

             if self.embedding_feature_info is not None:
                 for key in self.embedding_feature_info:
                     if key in train_preprocessed_data:
-                        train_emb_tensors.append(
-                            torch.tensor(
-                                train_preprocessed_data[key], dtype=torch.float32
-                            )
-                        )
+                        train_emb_tensors.append(torch.tensor(train_preprocessed_data[key], dtype=torch.float32))
                     if key in val_preprocessed_data:
-                        val_emb_tensors.append(
-                            torch.tensor(
-                                val_preprocessed_data[key], dtype=torch.float32
-                            )
-                        )
-
-            train_labels = torch.tensor(
-                self.y_train, dtype=self.labels_dtype
-            ).unsqueeze(dim=1)
-            val_labels = torch.tensor(self.y_val, dtype=self.labels_dtype).unsqueeze(
-                dim=1
-            )
+                        val_emb_tensors.append(torch.tensor(val_preprocessed_data[key], dtype=torch.float32))
+
+            train_labels = torch.tensor(self.y_train, dtype=self.labels_dtype).unsqueeze(dim=1)
+            val_labels = torch.tensor(self.y_val, dtype=self.labels_dtype).unsqueeze(dim=1)

             self.train_dataset = MambularDataset(
                 train_cat_tensors,
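The dtype branch in this hunk encodes a PyTorch requirement: integer category codes that feed an nn.Embedding lookup must be torch.long, while one-hot or pretrained-embedding outputs are already dense float vectors. A minimal sketch of the distinction (shapes and sizes are illustrative):

    import torch

    codes = torch.tensor([0, 2, 1], dtype=torch.long)  # ordinal-encoded column
    emb = torch.nn.Embedding(num_embeddings=3, embedding_dim=4)
    vecs = emb(codes)  # works: Embedding indices must be an integer tensor

    onehot = torch.tensor([[1.0, 0.0], [0.0, 1.0]])  # float32 by default
    # one-hot / pretrained features skip the lookup and are consumed as floats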
@@ -300,42 +257,27 @@ def preprocess_new_data(self, X, embeddings):
         for key in self.cat_feature_info:  # type: ignore
             dtype = (
                 torch.float32
-                if any(
-                    x in self.cat_feature_info[key]["preprocessing"]
-                    for x in ["onehot", "pretrained"]
-                )
+                if any(x in self.cat_feature_info[key]["preprocessing"] for x in ["onehot", "pretrained"])  # type: ignore
                 else torch.long
             )
-            cat_key = "cat_" + str(
-                key
-            )  # Assuming categorical keys are prefixed with 'cat_'
+            cat_key = "cat_" + str(key)  # Assuming categorical keys are prefixed with 'cat_'
             if cat_key in preprocessed_data:
-                cat_tensors.append(
-                    torch.tensor(preprocessed_data[cat_key], dtype=dtype)
-                )
+                cat_tensors.append(torch.tensor(preprocessed_data[cat_key], dtype=dtype))

             binned_key = "num_" + str(key)  # for binned features
             if binned_key in preprocessed_data:
-                cat_tensors.append(
-                    torch.tensor(preprocessed_data[binned_key], dtype=dtype)
-                )
+                cat_tensors.append(torch.tensor(preprocessed_data[binned_key], dtype=dtype))

         # Populate tensors for numerical features, if present in processed data
         for key in self.num_feature_info:  # type: ignore
-            num_key = "num_" + str(
-                key
-            )  # Assuming numerical keys are prefixed with 'num_'
+            num_key = "num_" + str(key)  # Assuming numerical keys are prefixed with 'num_'
             if num_key in preprocessed_data:
-                num_tensors.append(
-                    torch.tensor(preprocessed_data[num_key], dtype=torch.float32)
-                )
+                num_tensors.append(torch.tensor(preprocessed_data[num_key], dtype=torch.float32))

         if self.embedding_feature_info is not None:
             for key in self.embedding_feature_info:
                 if key in preprocessed_data:
-                    emb_tensors.append(
-                        torch.tensor(preprocessed_data[key], dtype=torch.float32)
-                    )
+                    emb_tensors.append(torch.tensor(preprocessed_data[key], dtype=torch.float32))

         return MambularDataset(
             cat_tensors,
@@ -374,9 +316,7 @@ def val_dataloader(self):
             DataLoader: DataLoader instance for the validation dataset.
         """
         if hasattr(self, "val_dataset"):
-            return DataLoader(
-                self.val_dataset, batch_size=self.batch_size, **self.dataloader_kwargs
-            )
+            return DataLoader(self.val_dataset, batch_size=self.batch_size, **self.dataloader_kwargs)
         else:
             raise ValueError("No validation dataset provided!")

@@ -387,9 +327,7 @@ def test_dataloader(self):
            DataLoader: DataLoader instance for the test dataset.
        """
        if hasattr(self, "test_dataset"):
-            return DataLoader(
-                self.test_dataset, batch_size=self.batch_size, **self.dataloader_kwargs
-            )
+            return DataLoader(self.test_dataset, batch_size=self.batch_size, **self.dataloader_kwargs)
        else:
            raise ValueError("No test dataset provided!")
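Both loaders forward self.dataloader_kwargs unchanged, so worker counts, pinned memory, and similar options are configured once on the datamodule rather than per loader. A small usage sketch of that passthrough (values are illustrative, not defaults from this repo):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    ds = TensorDataset(torch.arange(10).float())
    dataloader_kwargs = {"num_workers": 0, "pin_memory": False}  # set once
    loader = DataLoader(ds, batch_size=4, **dataloader_kwargs)
    print(sum(1 for _ in loader))  # 3 batches: 4 + 4 + 2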
4 changes: 4 additions & 0 deletions mambular/preprocessing/preprocessor.py
@@ -582,6 +582,10 @@ def transform(self, X, embeddings=None):
             raise NotFittedError(
                 "The preprocessor must be fitted before transforming new data. Use .fit or .fit_transform"
             )
+        if isinstance(X, np.ndarray):
+            X = pd.DataFrame(X)
+        else:
+            X = X.copy()
         transformed_X = self.column_transformer.transform(X)  # type: ignore

         # Now let's convert this into a dictionary of arrays, one per column
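The added guard lets transform accept either a DataFrame or a raw ndarray: an ndarray is wrapped in a DataFrame (default integer column labels 0..n-1), and a DataFrame is defensively copied so the caller's frame is never mutated. A self-contained sketch of just that branch (the helper name is hypothetical, not part of the library):

    import numpy as np
    import pandas as pd

    def _as_frame(X):
        # mirrors the four added lines in Preprocessor.transform
        if isinstance(X, np.ndarray):
            return pd.DataFrame(X)  # columns become 0..n-1
        return X.copy()             # protect the caller from in-place edits

    X_np = np.array([[1.0, 2.0], [3.0, 4.0]])
    X_df = pd.DataFrame(X_np, columns=["a", "b"])
    assert list(_as_frame(X_np).columns) == [0, 1]
    assert _as_frame(X_df) is not X_df  # original frame left untouched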