Skip to content

Commit

Permalink
perf: remove unneeded copy operations in table
Browse files Browse the repository at this point in the history
  • Loading branch information
lars-reimann committed Nov 16, 2023
1 parent aea1720 commit 830860b
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 59 deletions.
66 changes: 23 additions & 43 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:
self._data = self._data.reset_index(drop=True)
self._schema: Schema = Schema._from_pandas_dataframe(self._data)

def __eq__(self, other: Any) -> bool:
def __eq__(self, other: object) -> bool:
"""
Compare two table instances.
Expand Down Expand Up @@ -472,12 +472,12 @@ def __repr__(self) -> str:
>>> repr(table)
' a b\n0 1 2\n1 3 4'
"""
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__repr__()

def __str__(self) -> str:
tmp = self._data.copy(deep=True)
tmp = self._data.reset_index(drop=True)
tmp.columns = self.column_names
return tmp.__str__()

Expand Down Expand Up @@ -834,13 +834,13 @@ def summarize_statistics(self) -> Table:
# Transformations
# ------------------------------------------------------------------------------------------------------------------

# This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
# additional constraints that might have to hold in the subclass.
# Override accordingly in subclasses.
def _as_table(self: Table) -> Table:
"""
Transform the table to an instance of the Table class.
This method is meant as a way to "cast" instances of subclasses of `Table` to a proper `Table`, dropping any
additional constraints that might have to hold in the subclass. Override accordingly in subclasses.
Returns
-------
table: Table
Expand Down Expand Up @@ -882,7 +882,7 @@ def add_column(self, column: Column) -> Table:
if column.number_of_rows != self.number_of_rows and self.number_of_columns != 0:
raise ColumnSizeError(str(self.number_of_rows), str(column._data.size))

result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
result[column.name] = column._data
return Table._from_pandas_dataframe(result)
Expand Down Expand Up @@ -923,7 +923,7 @@ def add_columns(self, columns: list[Column] | Table) -> Table:
"""
if isinstance(columns, Table):
columns = columns.to_columns()
result = self._data.copy()
result = self._data.reset_index(drop=True)
result.columns = self._schema.column_names
for column in columns:
if column.name in result.columns:
Expand Down Expand Up @@ -972,7 +972,7 @@ def add_row(self, row: Row) -> Table:
1 3 4
"""
int_columns = []
result = self._copy()

if self.number_of_columns == 0:
return Table.from_rows([row])
if len(set(self.column_names) - set(row.column_names)) > 0:
Expand All @@ -983,12 +983,12 @@ def add_row(self, row: Row) -> Table:
),
)

if result.number_of_rows == 0:
if self.number_of_rows == 0:
int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names))

new_df = pd.concat([result._data, row._data]).infer_objects()
new_df.columns = result.column_names
schema = Schema.merge_multiple_schemas([result.schema, row.schema])
new_df = pd.concat([self._data, row._data]).infer_objects()
new_df.columns = self.column_names
schema = Schema.merge_multiple_schemas([self.schema, row.schema])
result = Table._from_pandas_dataframe(new_df, schema)

for column in int_columns:
Expand Down Expand Up @@ -1036,7 +1036,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
rows = rows.to_rows()

if len(rows) == 0:
return self._copy()
return self

different_column_names = set()
for row in rows:
Expand All @@ -1049,8 +1049,7 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
),
)

result = self._copy()

result = self
for row in rows:
result = result.add_row(row)

Expand Down Expand Up @@ -1156,8 +1155,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
if len(invalid_columns) != 0:
raise UnknownColumnNameError(invalid_columns, similar_columns)

clone = self._copy()
return clone.remove_columns(list(set(self.column_names) - set(column_names)))
return self.remove_columns(list(set(self.column_names) - set(column_names)))

def remove_columns(self, column_names: list[str]) -> Table:
"""
Expand Down Expand Up @@ -1311,8 +1309,7 @@ def remove_rows_with_missing_values(self) -> Table:
a b
0 1.0 2.0
"""
result = self._data.copy(deep=True)
result = result.dropna(axis="index")
result = self._data.dropna(axis="index")
return Table._from_pandas_dataframe(result)

def remove_rows_with_outliers(self) -> Table:
Expand Down Expand Up @@ -1352,13 +1349,11 @@ def remove_rows_with_outliers(self) -> Table:
9 0.0 0.00 0.0 -1000000
10 0.0 0.00 0.0 -1000000
"""
copy = self._data.copy(deep=True)

table_without_nonnumericals = self.remove_columns_with_non_numerical_values()
z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

return Table._from_pandas_dataframe(copy[filter_], self._schema)
return Table._from_pandas_dataframe(self._data[filter_], self._schema)

def rename_column(self, old_name: str, new_name: str) -> Table:
"""
Expand Down Expand Up @@ -1401,7 +1396,7 @@ def rename_column(self, old_name: str, new_name: str) -> Table:
if new_name in self._schema.column_names:
raise DuplicateColumnNameError(new_name)

new_df = self._data.copy()
new_df = self._data.reset_index(drop=True)
new_df.columns = self._schema.column_names
return Table._from_pandas_dataframe(new_df.rename(columns={old_name: new_name}))

Expand Down Expand Up @@ -2130,7 +2125,7 @@ def to_csv_file(self, path: str | Path) -> None:
if path.suffix != ".csv":
raise WrongFileExtensionError(path, ".csv")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_csv = self._data.copy()
data_to_csv = self._data.reset_index(drop=True)
data_to_csv.columns = self._schema.column_names
data_to_csv.to_csv(path, index=False)

Expand Down Expand Up @@ -2168,7 +2163,7 @@ def to_excel_file(self, path: str | Path) -> None:
tmp_table_file.save(path)

path.parent.mkdir(parents=True, exist_ok=True)
data_to_excel = self._data.copy()
data_to_excel = self._data.reset_index(drop=True)
data_to_excel.columns = self._schema.column_names
data_to_excel.to_excel(path)

Expand Down Expand Up @@ -2199,7 +2194,7 @@ def to_json_file(self, path: str | Path) -> None:
if path.suffix != ".json":
raise WrongFileExtensionError(path, ".json")
path.parent.mkdir(parents=True, exist_ok=True)
data_to_json = self._data.copy()
data_to_json = self._data.reset_index(drop=True)
data_to_json.columns = self._schema.column_names
data_to_json.to_json(path)

Expand Down Expand Up @@ -2335,21 +2330,6 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): #
if not allow_copy:
raise NotImplementedError("For the moment we need to copy the data, so `allow_copy` must be True.")

data_copy = self._data.copy()
data_copy = self._data.reset_index(drop=True)
data_copy.columns = self.column_names
return data_copy.__dataframe__(nan_as_null, allow_copy)

# ------------------------------------------------------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------------------------------------------------------

def _copy(self) -> Table:
"""
Return a copy of this table.
Returns
-------
table : Table
The copy of this table.
"""
return copy.deepcopy(self)
6 changes: 3 additions & 3 deletions tests/safeds/data/tabular/containers/_table/test_add_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
Schema({"col1": Anything(), "col2": Integer(is_nullable=True)}),
),
(
Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
Table({"col1": [], "col2": []}),
Row({"col1": 5, "col2": 6}),
Table({"col1": [1, 2, 1, 5], "col2": [1, 2, 4, 6]}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
),
(
Table({"col1": [], "col2": []}),
Table(),
Row({"col1": 5, "col2": 6}),
Table({"col1": [5], "col2": [6]}),
Schema({"col1": Integer(), "col2": Integer()}),
Expand Down
13 changes: 0 additions & 13 deletions tests/safeds/data/tabular/containers/_table/test_copy.py

This file was deleted.

0 comments on commit 830860b

Please sign in to comment.