Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
774 changes: 585 additions & 189 deletions docs/tutorials/data_processing.ipynb

Large diffs are not rendered by default.

72 changes: 38 additions & 34 deletions docs/tutorials/data_visualization.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/safeds/data/image/containers/_image_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def __contains__(self, item: object) -> bool:
Returns
-------
has_item:
Weather the given item is in this image list
Whether the given item is in this image list
"""
return isinstance(item, Image) and self.has_image(item)

Expand Down Expand Up @@ -524,7 +524,7 @@ def has_image(self, image: Image) -> bool:
Returns
-------
has_image:
Weather the given image is in this image list
Whether the given image is in this image list
"""

# ------------------------------------------------------------------------------------------------------------------
Expand Down
126 changes: 99 additions & 27 deletions src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ImageDataset(Dataset[ImageList, Out_co]):
batch_size:
the batch size used for training
shuffle:
weather the data should be shuffled after each epoch of training
whether the data should be shuffled after each epoch of training
"""

def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
Expand Down Expand Up @@ -108,13 +108,13 @@ def __iter__(self) -> ImageDataset:
return im_ds

def __next__(self) -> tuple[Tensor, Tensor]:
if self._next_batch_index * self._batch_size >= len(self._input):
if self._next_batch_index * self._batch_size >= len(self._shuffle_tensor_indices):
raise StopIteration
self._next_batch_index += 1
return self._get_batch(self._next_batch_index - 1)

def __len__(self) -> int:
return self._input.image_count
return len(self._shuffle_tensor_indices)

def __eq__(self, other: object) -> bool:
"""
Expand All @@ -138,6 +138,7 @@ def __eq__(self, other: object) -> bool:
and isinstance(other._output, type(self._output))
and (self._input == other._input)
and (self._output == other._output)
and (self._shuffle_tensor_indices.tolist() == other._shuffle_tensor_indices.tolist())
)

def __hash__(self) -> int:
Expand All @@ -149,7 +150,13 @@ def __hash__(self) -> int:
hash:
the hash value
"""
return _structural_hash(self._input, self._output, self._shuffle_after_epoch, self._batch_size)
return _structural_hash(
self._input,
self._output,
self._shuffle_after_epoch,
self._batch_size,
self._shuffle_tensor_indices.tolist(),
)

def __sizeof__(self) -> int:
"""
Expand Down Expand Up @@ -205,7 +212,7 @@ def get_input(self) -> ImageList:
input:
the input data of this dataset
"""
return self._sort_image_list_with_shuffle_tensor_indices(self._input)
return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._input)

def get_output(self) -> Out_co:
"""
Expand All @@ -222,19 +229,25 @@ def get_output(self) -> Out_co:
elif isinstance(output, _ColumnAsTensor):
return output._to_column(self._shuffle_tensor_indices) # type: ignore[return-value]
else:
return self._sort_image_list_with_shuffle_tensor_indices(self._output) # type: ignore[return-value]
return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(self._output) # type: ignore[return-value]

def _sort_image_list_with_shuffle_tensor_indices(self, image_list: _SingleSizeImageList) -> _SingleSizeImageList:
def _sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(
self,
image_list: _SingleSizeImageList,
) -> _SingleSizeImageList:
shuffled_image_list = _SingleSizeImageList()
shuffled_image_list._tensor = image_list._tensor
shuffled_image_list._indices_to_tensor_positions = {
index: self._shuffle_tensor_indices[tensor_position].item()
for index, tensor_position in image_list._indices_to_tensor_positions.items()
tensor_pos = [
image_list._indices_to_tensor_positions[shuffled_index]
for shuffled_index in sorted(self._shuffle_tensor_indices.tolist())
]
temp_pos = {
shuffled_index: new_index for new_index, shuffled_index in enumerate(self._shuffle_tensor_indices.tolist())
}
shuffled_image_list._tensor = image_list._tensor[tensor_pos]
shuffled_image_list._tensor_positions_to_indices = [
index
for index, _ in sorted(shuffled_image_list._indices_to_tensor_positions.items(), key=lambda item: item[1])
new_index for _, new_index in sorted(temp_pos.items(), key=lambda item: item[0])
]
shuffled_image_list._indices_to_tensor_positions = shuffled_image_list._calc_new_indices_to_tensor_positions()
return shuffled_image_list

def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[Tensor, Tensor]:
Expand All @@ -247,18 +260,18 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[

_check_bounds("batch_size", batch_size, lower_bound=_ClosedBound(1))

if batch_number < 0 or batch_size * batch_number >= len(self._input):
if batch_number < 0 or batch_size * batch_number >= len(self._shuffle_tensor_indices):
raise IndexOutOfBoundsError(batch_size * batch_number)
max_index = (
batch_size * (batch_number + 1) if batch_size * (batch_number + 1) < len(self._input) else len(self._input)
batch_size * (batch_number + 1)
if batch_size * (batch_number + 1) < len(self._shuffle_tensor_indices)
else len(self._shuffle_tensor_indices)
)
input_tensor = (
self._input._tensor[
self._shuffle_tensor_indices[
[
self._input._indices_to_tensor_positions[index]
for index in range(batch_size * batch_number, max_index)
]
[
self._input._indices_to_tensor_positions[index]
for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
]
].to(torch.float32)
/ 255
Expand All @@ -267,11 +280,9 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
if isinstance(self._output, _SingleSizeImageList):
output_tensor = (
self._output._tensor[
self._shuffle_tensor_indices[
[
self._output._indices_to_tensor_positions[index]
for index in range(batch_size * batch_number, max_index)
]
[
self._input._indices_to_tensor_positions[index]
for index in self._shuffle_tensor_indices[batch_size * batch_number : max_index].tolist()
]
].to(torch.float32)
/ 255
Expand All @@ -284,7 +295,7 @@ def shuffle(self) -> ImageDataset[Out_co]:
"""
Return a new `ImageDataset` with shuffled data.

The original dataset list is not modified.
The original dataset is not modified.

Returns
-------
Expand All @@ -296,10 +307,71 @@ def shuffle(self) -> ImageDataset[Out_co]:
_init_default_device()

im_dataset: ImageDataset[Out_co] = copy.copy(self)
im_dataset._shuffle_tensor_indices = torch.randperm(len(self))
im_dataset._shuffle_tensor_indices = self._shuffle_tensor_indices[
torch.randperm(len(self._shuffle_tensor_indices))
]
im_dataset._next_batch_index = 0
return im_dataset

def split(
    self,
    percentage_in_first: float,
    *,
    shuffle: bool = True,
) -> tuple[ImageDataset[Out_co], ImageDataset[Out_co]]:
    """
    Create two image datasets by splitting the data of the current dataset.

    The first dataset contains a percentage of the data specified by `percentage_in_first`, and the second dataset
    contains the remaining data. Only the data currently held by this dataset is distributed, so splitting a dataset
    that is itself the result of a previous split never brings back data that was split away.

    The original dataset is not modified.
    By default, the data is shuffled before splitting. You can disable this by setting `shuffle` to False; the data
    is then split in the current order of this dataset.

    Parameters
    ----------
    percentage_in_first:
        The percentage of data to include in the first dataset. Must be between 0 and 1.
    shuffle:
        Whether to shuffle the data before splitting.

    Returns
    -------
    first_dataset:
        The first dataset.
    second_dataset:
        The second dataset.

    Raises
    ------
    OutOfBoundsError
        If `percentage_in_first` is not between 0 and 1.
    """
    import torch

    _check_bounds(
        "percentage_in_first",
        percentage_in_first,
        lower_bound=_ClosedBound(0),
        upper_bound=_ClosedBound(1),
    )

    first_dataset: ImageDataset[Out_co] = copy.copy(self)
    second_dataset: ImageDataset[Out_co] = copy.copy(self)

    index_count = len(self._shuffle_tensor_indices)

    # Always select from the indices this dataset actually holds. Building a fresh 0..n-1 range here would point at
    # the wrong images (or at images that were already split away) once the dataset has been shuffled or split.
    if shuffle:
        ordered_indices = self._shuffle_tensor_indices[torch.randperm(index_count)]
    else:
        ordered_indices = self._shuffle_tensor_indices

    size_of_first = round(percentage_in_first * index_count)
    first_dataset._shuffle_tensor_indices, second_dataset._shuffle_tensor_indices = ordered_indices.split(
        [
            size_of_first,
            index_count - size_of_first,
        ],
    )

    # The copies inherit the iteration state of the original; start both from their first batch, like `shuffle`.
    first_dataset._next_batch_index = 0
    second_dataset._next_batch_index = 0
    return first_dataset, second_dataset


class _TableAsTensor:
def __init__(self, table: Table) -> None:
Expand Down
92 changes: 85 additions & 7 deletions src/safeds/data/tabular/plotting/_table_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def correlation_heatmap(self) -> Image:
# TODO: implement using matplotlib and polars
# https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

only_numerical = self._table.remove_non_numeric_columns()._data_frame.fill_null(0)

Expand All @@ -115,15 +115,18 @@ def correlation_heatmap(self) -> Image:
" automatically expanding."
),
)
fig = plt.figure()
sns.heatmap(
data=only_numerical.corr().to_numpy(),

fig, ax = plt.subplots()
heatmap = plt.imshow(
only_numerical.corr().to_numpy(),
vmin=-1,
vmax=1,
xticklabels=only_numerical.columns,
yticklabels=only_numerical.columns,
cmap="vlag",
cmap="coolwarm",
)
ax.set_xticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
ax.set_yticks(np.arange(len(only_numerical.columns)), labels=only_numerical.columns)
fig.colorbar(heatmap)

plt.tight_layout()

return _figure_to_image(fig)
Expand Down Expand Up @@ -353,6 +356,81 @@ def scatter_plot(self, x_name: str, y_names: list[str]) -> Image:

return _figure_to_image(fig)

def moving_average_plot(self, x_name: str, y_name: str, window_size: int) -> Image:
    """
    Create a moving average plot for the y column and plot it by the x column in the table.

    Rows that share the same x value are first combined into the mean of their y values. The moving average is then
    computed over these values in ascending order of x. Positions for which the window is not yet filled are not
    plotted.

    Parameters
    ----------
    x_name:
        The name of the column to be plotted on the x-axis.
    y_name:
        The name of the column to be plotted on the y-axis.
    window_size:
        The number of consecutive values that are averaged for each plotted point.

    Returns
    -------
    plot:
        The plot as an image.

    Raises
    ------
    ColumnNotFoundError
        If a column does not exist.
    TypeError
        If a column is not numeric.
    ValueError
        If one of the columns contains missing values.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Table
    >>> table = Table(
    ...     {
    ...         "a": [1, 2, 3, 4, 5],
    ...         "b": [2, 3, 4, 5, 6],
    ...     }
    ... )
    >>> image = table.plot.moving_average_plot("a", "b", window_size = 2)
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import polars as pl

    _plot_validation(self._table, x_name, [y_name])
    for name in [x_name, y_name]:
        if self._table.get_column(name).missing_value_count() >= 1:
            raise ValueError(
                f"there are missing values in column '{name}', use transformation to fill missing values "
                f"or drop the missing values. For a moving average no missing values are allowed.",
            )

    # Calculate the moving average
    mean_col = pl.col(y_name).mean().alias(y_name)
    # Sort after aggregating: a group_by does not keep the order of its input by default, and the rolling mean below
    # depends on the rows being ordered by x.
    data = self._table._lazy_frame.group_by(x_name).agg(mean_col).sort(x_name).collect()
    moving_average = data.select([pl.col(y_name).rolling_mean(window_size).alias("moving_average")])

    # set up the arrays for plotting
    y_data_with_nan = moving_average["moving_average"].to_numpy()
    # The leading positions without a full window have no value; drop them together with their x values
    nan_mask = ~np.isnan(y_data_with_nan)
    y_data = y_data_with_nan[nan_mask]
    x_data = data[x_name].to_numpy()[nan_mask]

    fig, ax = plt.subplots()
    ax.plot(x_data, y_data, label="moving average")
    ax.set(
        xlabel=x_name,
        ylabel=y_name,
    )
    ax.legend()
    if self._table.get_column(x_name).is_temporal:
        ax.set_xticks(x_data)  # Set x-ticks to the x data points
    # Pin the tick positions before assigning labels to them
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment="right",
    )  # rotate the labels of the x Axis to prevent the chance of overlapping of the labels
    fig.tight_layout()

    return _figure_to_image(fig)


def _plot_validation(table: Table, x_name: str, y_names: list[str]) -> None:
y_names.append(x_name)
Expand Down
8 changes: 1 addition & 7 deletions src/safeds/data/tabular/transformation/_range_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,7 @@ class RangeScaler(InvertibleTableTransformer):
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(
self,
min_: float = 0.0,
max_: float = 1.0,
*,
column_names: str | list[str] | None = None,
) -> None:
def __init__(self, *, column_names: str | list[str] | None = None, min_: float = 0.0, max_: float = 1.0) -> None:
super().__init__(column_names)

if min_ >= max_:
Expand Down
2 changes: 2 additions & 0 deletions src/safeds/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
ModelNotFittedError,
PlainTableError,
PredictionError,
TargetDataMismatchError,
)


Expand Down Expand Up @@ -69,6 +70,7 @@ class OutOfBoundsError(SafeDsError):
# ML exceptions
"DatasetMissesDataError",
"DatasetMissesFeaturesError",
"TargetDataMismatchError",
"FeatureDataMismatchError",
"InvalidFitDataError",
"InputSizeError",
Expand Down
Loading