From 205da70352b01896d5bd446b600e447f622cc7a2 Mon Sep 17 00:00:00 2001
From: baraline <antoine.guillaume45@gmail.com>
Date: Sat, 2 Aug 2025 14:37:44 +0200
Subject: [PATCH] Moving functions to utils

---
 aeon/base/_base_collection.py | 203 +---------------
 aeon/base/_base_series.py     | 166 +------------
 aeon/utils/preprocessing.py   | 432 ++++++++++++++++++++++++++++++++++
 3 files changed, 454 insertions(+), 347 deletions(-)
 create mode 100644 aeon/utils/preprocessing.py

diff --git a/aeon/base/_base_collection.py b/aeon/base/_base_collection.py
index 60996e7d8a..ff1b856772 100644
--- a/aeon/base/_base_collection.py
+++ b/aeon/base/_base_collection.py
@@ -17,26 +17,16 @@ class name: BaseCollectionEstimator
     fitted model/strategy   - by convention, any attributes ending in "_"
     fitted state flag       - is_fitted (property)
     fitted state inspection - check_is_fitted()
+
 """
 
 from abc import abstractmethod
 
-import numpy as np
-
 from aeon.base._base import BaseAeonEstimator
-from aeon.utils.conversion import (
-    convert_collection,
-    resolve_equal_length_inner_type,
-    resolve_unequal_length_inner_type,
-)
+from aeon.utils.preprocessing import preprocess_collection
 from aeon.utils.validation.collection import (
-    get_n_cases,
     get_n_channels,
     get_n_timepoints,
-    get_type,
-    has_missing,
-    is_equal_length,
-    is_univariate,
 )
 
 
@@ -63,6 +53,7 @@ class BaseCollectionEstimator(BaseAeonEstimator):
     @abstractmethod
     def __init__(self):
         self.metadata_ = {}  # metadata/properties of data seen in fit
+        self._n_jobs = 1
 
         super().__init__()
 
@@ -110,156 +101,17 @@ def _preprocess_collection(self, X, store_metadata=True):
         >>> X2.shape
         (10, 1, 20)
         """
-        if isinstance(X, list) and isinstance(X[0], np.ndarray):
-            X = self._reshape_np_list(X)
-
-        meta = self._check_X(X)
-        if len(self.metadata_) == 0 and store_metadata:
+        result = preprocess_collection(
+            X,
+            self.get_tags(),
+            return_metadata=store_metadata,
+        )
+        if store_metadata:
+            X, meta = result
             self.metadata_ = meta
-
-        return self._convert_X(X)
-
-    def _check_X(self, X):
-        """
-        Check classifier input X is valid.
-
-        Check if the input data is a compatible type, and that this estimator is
-        able to handle the data characteristics.
-        This is done by matching the capabilities of the estimator against the metadata
-        for X i.e., univariate/multivariate, equal length/unequal length and no missing
-        values/missing values.
-
-        Parameters
-        ----------
-        X : collection
-           See aeon.utils.COLLECTIONS_DATA_TYPES for details on aeon supported
-           data structures.
-
-        Returns
-        -------
-        metadata : dict
-            Metadata about X, with flags:
-            metadata["multivariate"] : whether X has more than one channel or not
-            metadata["missing_values"] : whether X has missing values or not
-            metadata["unequal_length"] : whether X contains unequal length series.
-            metadata["n_cases"] : number of cases in X
-            metadata["n_channels"] : number of channels in X
-            metadata["n_timepoints"] : number of timepoints in X if equal length, else
-                None
-
-        Raises
-        ------
-        ValueError
-            If X is an invalid type or has characteristics that the estimator cannot
-            handle.
-
-        See Also
-        --------
-        _convert_X :
-            Function that converts X after it has been checked.
-
-        Examples
-        --------
-        >>> from aeon.testing.mock_estimators import MockClassifierFullTags
-        >>> from aeon.testing.data_generation import make_example_3d_numpy
-        >>> clf = MockClassifierFullTags()
-        >>> X, _ = make_example_3d_numpy(n_channels=3) # X is equal length, multivariate
-        >>> meta = clf._check_X(X) # Classifier can handle this
-        """
-        # check if X is a valid type
-        get_type(X)
-
-        metadata = self._get_X_metadata(X)
-        # Check classifier capabilities for X
-        allow_multivariate = self.get_tag("capability:multivariate")
-        allow_missing = self.get_tag("capability:missing_values")
-        allow_unequal = self.get_tag("capability:unequal_length")
-
-        # Check capabilities vs input
-        problems = []
-        if metadata["missing_values"] and not allow_missing:
-            problems += ["missing values"]
-        if metadata["multivariate"] and not allow_multivariate:
-            problems += ["multivariate series"]
-        if metadata["unequal_length"] and not allow_unequal:
-            problems += ["unequal length series"]
-
-        if problems:
-            # construct error message
-            problems_and = " and ".join(problems)
-            msg = (
-                f"Data seen by instance of {type(self).__name__} has {problems_and}, "
-                f"but {type(self).__name__} cannot handle these characteristics. "
-            )
-            raise ValueError(msg)
-
-        return metadata
-
-    def _convert_X(self, X):
-        """
-        Convert X to type defined by tag X_inner_type.
-
-        If the input data is already an allowed type, it is returned unchanged.
-
-        If multiple types are allowed by self, then the best one for the type of input
-        data is selected. So, for example, if X_inner_tag is ["np-list", "numpy3D"]
-        and an df-list is passed, it will be converted to numpy3D if the series
-        are equal length, and np-list if the series are unequal length.
-
-        Parameters
-        ----------
-        X : collection
-           See aeon.utils.COLLECTIONS_DATA_TYPES for details on aeon supported
-           data structures.
-
-        Returns
-        -------
-        X : collection
-            Converted X. A data structure of type self.get_tag("X_inner_type").
-
-        See Also
-        --------
-        _check_X :
-            Function that checks X is valid and finds metadata.
-
-        Examples
-        --------
-        >>> from aeon.testing.mock_estimators import MockClassifier
-        >>> from aeon.testing.data_generation import make_example_3d_numpy_list
-        >>> from aeon.utils.validation import get_type
-        >>> clf = MockClassifier()
-        >>> X, _ = make_example_3d_numpy_list(max_n_timepoints=8)
-        >>> get_type(X)
-        'np-list'
-        >>> clf.get_tag("X_inner_type")
-        'numpy3D'
-        >>> X2 = clf._convert_X(X)
-        >>> get_type(X2)
-        'numpy3D'
-        """
-        inner_type = self.get_tag("X_inner_type")
-        if not isinstance(inner_type, list):
-            inner_type = [inner_type]
-        input_type = get_type(X)
-
-        # Check if we need to convert X, return if not
-        if input_type in inner_type:
-            return X
-
-        if len(self.metadata_) == 0:
-            metadata = self._get_X_metadata(X)
         else:
-            metadata = self.metadata_
-
-        # Convert X to X_inner_type if possible
-        # If self can handle more than one internal type, resolve correct conversion
-        # If unequal, choose data structure that can hold unequal
-        if metadata["unequal_length"]:
-            inner_type = resolve_unequal_length_inner_type(inner_type)
-        else:
-            inner_type = resolve_equal_length_inner_type(inner_type)
-
-        return convert_collection(X, inner_type)
+            X = result
+        return X
 
     def _check_shape(self, X):
         """
@@ -297,34 +149,3 @@ def _check_shape(self, X):
                         "number of channels in train set was ",
                         f"{self.metadata_['n_channels']} but in predict it is {nc}.",
                     )
-
-    @staticmethod
-    def _get_X_metadata(X):
-        # Get and store X meta data.
-        metadata = {}
-        metadata["multivariate"] = not is_univariate(X)
-        metadata["missing_values"] = has_missing(X)
-        metadata["unequal_length"] = not is_equal_length(X)
-        metadata["n_cases"] = get_n_cases(X)
-        metadata["n_channels"] = get_n_channels(X)
-        metadata["n_timepoints"] = (
-            None if metadata["unequal_length"] else get_n_timepoints(X)
-        )
-        return metadata
-
-    @staticmethod
-    def _reshape_np_list(X):
-        """Reshape 1D numpy to be 2D."""
-        reshape = False
-        for x in X:
-            if x.ndim == 1:
-                reshape = True
-                break
-        if reshape:
-            X2 = []
-            for x in X:
-                if x.ndim == 1:
-                    x = x.reshape(1, -1)
-                X2.append(x)
-            return X2
-        return X
diff --git a/aeon/base/_base_series.py b/aeon/base/_base_series.py
index c4fbb5aa30..204dc6c2f0 100644
--- a/aeon/base/_base_series.py
+++ b/aeon/base/_base_series.py
@@ -33,11 +33,8 @@
 
 from abc import abstractmethod
 
-import numpy as np
-import pandas as pd
-
 from aeon.base._base import BaseAeonEstimator
-from aeon.utils.data_types import VALID_SERIES_INNER_TYPES
+from aeon.utils.preprocessing import preprocess_series
 
 
 class BaseSeriesEstimator(BaseAeonEstimator):
@@ -112,159 +109,16 @@ def _preprocess_series(self, X, axis, store_metadata):
         X: one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
             Input time series with data structure of type self.get_tag("X_inner_type").
         """
-        meta = self._check_X(X, axis)
+        result = preprocess_series(
+            X,
+            axis=axis,
+            tags=self.get_tags(),
+            estimator_axis=self.axis,
+            return_metadata=store_metadata,
+        )
         if store_metadata:
+            X, meta = result
             self.metadata_ = meta
-        return self._convert_X(X, axis)
-
-    def _check_X(self, X, axis: int = 0):
-        """Check input X is valid.
-
-        Check if the input data is a compatible type, and that this estimator is
-        able to handle the data characteristics. This is done by matching the
-        capabilities of the estimator against the metadata for X for
-        univariate/multivariate and no missing values/missing values.
-
-        Parameters
-        ----------
-        X: one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
-            A valid aeon time series data structure. See
-            aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types.
-        axis: int
-            The time point axis of the input series if it is 2D. If ``axis==0``, it is
-            assumed each column is a time series and each row is a time point. i.e. the
-            shape of the data is ``(n_timepoints,n_channels)``. ``axis==1`` indicates
-            the time series are in rows, i.e. the shape of the data is
-            ``(n_channels,n_timepoints)``.
-
-        Returns
-        -------
-        metadata: dict
-            Metadata about X, with flags:
-            metadata["multivariate"]: whether X has more than one channel or not
-            metadata["n_channels"]: number of channels in X
-            metadata["missing_values"]: whether X has missing values or not
-        """
-        if axis > 1 or axis < 0:
-            raise ValueError(f"Input axis should be 0 or 1, saw {axis}")
-
-        # Checks: check valid dtype
-        if isinstance(X, np.ndarray):
-            if not (
-                issubclass(X.dtype.type, np.integer)
-                or issubclass(X.dtype.type, np.floating)
-            ):
-                raise ValueError("dtype for np.ndarray must be float or int")
-        elif isinstance(X, pd.Series):
-            if not pd.api.types.is_numeric_dtype(X):
-                raise ValueError("pd.Series dtype must be numeric")
-        elif isinstance(X, pd.DataFrame):
-            if not all(pd.api.types.is_numeric_dtype(X[col]) for col in X.columns):
-                raise ValueError("pd.DataFrame dtype must be numeric")
         else:
-            raise ValueError(
-                f"Input type of X should be one of {VALID_SERIES_INNER_TYPES}, "
-                f"saw {type(X)}"
-            )
-
-        metadata = {}
-
-        # check if multivariate
-        channel_idx = 0 if axis == 1 else 1
-        if X.ndim > 2:
-            raise ValueError(
-                "X must have at most 2 dimensions for multivariate data, optionally 1 "
-                f"for univarate data. Found {X.ndim} dimensions"
-            )
-        elif X.ndim > 1 and X.shape[channel_idx] > 1:
-            metadata["multivariate"] = True
-        else:
-            metadata["multivariate"] = False
-
-        metadata["n_channels"] = X.shape[channel_idx] if X.ndim > 1 else 1
-
-        # check if has missing values
-        if isinstance(X, np.ndarray):
-            metadata["missing_values"] = np.isnan(X).any()
-        elif isinstance(X, pd.Series):
-            metadata["missing_values"] = X.isna().any()
-        else:
-            metadata["missing_values"] = X.isna().any().any()
-
-        allow_multivariate = self.get_tag("capability:multivariate")
-        allow_univariate = self.get_tag("capability:univariate")
-        allow_missing = self.get_tag("capability:missing_values")
-        if metadata["missing_values"] and not allow_missing:
-            raise ValueError(
-                f"Missing values not supported by {self.__class__.__name__}"
-            )
-        if metadata["multivariate"] and not allow_multivariate:
-            raise ValueError(
-                f"Multivariate data not supported by {self.__class__.__name__}"
-            )
-        if not metadata["multivariate"] and not allow_univariate:
-            raise ValueError(
-                f"Univariate data not supported by {self.__class__.__name__}"
-            )
-
-        return metadata
-
-    def _convert_X(self, X, axis):
-        """Convert input X to internal estimator datatype.
-
-        Converts input X to the internal data type of the estimator using
-        self.get_tag("X_inner_type"). 1D numpy arrays are converted to 2D,
-        and the data will be transposed if the input axis does not match that of the
-        estimator.
-
-        Attempting to convert to a pd.Series for multivariate data or estimators will
-        raise an error.
-
-        Parameters
-        ----------
-        X: one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
-            A valid aeon time series data structure. See
-            aeon.base._base_series.VALID_SERIES_INPUT_TYPES for aeon supported types.
-        axis: int
-            The time point axis of the input series if it is 2D. If ``axis==0``, it is
-            assumed each column is a time series and each row is a time point. i.e. the
-            shape of the data is ``(n_timepoints, n_channels)``. ``axis==1`` indicates
-            the time series are in rows, i.e. the shape of the data is
-            ``(n_channels, n_timepoints)``.
-
-        Returns
-        -------
-        X: one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
-            Input time series with data structure of type self.get_tag("X_inner_type").
-        """
-        if axis > 1 or axis < 0:
-            raise ValueError(f"Input axis should be 0 or 1, saw {axis}")
-
-        inner_type = self.get_tag("X_inner_type")
-        if not isinstance(inner_type, list):
-            inner_type = [inner_type]
-        inner_names = [i.split(".")[-1] for i in inner_type]
-
-        input = type(X).__name__
-        if input not in inner_names:
-            if inner_names[0] == "ndarray":
-                X = X.to_numpy()
-            elif inner_names[0] == "DataFrame":
-                # converting a 1d array will create a 2d array in axis 0 format
-                transpose = False
-                if X.ndim == 1 and axis == 1:
-                    transpose = True
-                X = pd.DataFrame(X)
-                if transpose:
-                    X = X.T
-            else:
-                raise ValueError(
-                    f"Unsupported inner type {inner_names[0]} derived from {inner_type}"
-                )
-
-        if X.ndim > 1 and self.axis != axis:
-            X = X.T
-        elif X.ndim == 1 and isinstance(X, np.ndarray):
-            X = X[np.newaxis, :] if self.axis == 1 else X[:, np.newaxis]
-
+            X = result
         return X
diff --git a/aeon/utils/preprocessing.py b/aeon/utils/preprocessing.py
new file mode 100644
index 0000000000..ee4200677c
--- /dev/null
+++ b/aeon/utils/preprocessing.py
@@ -0,0 +1,432 @@
+"""
+Standalone preprocessing functions for time series data.
+
+This module contains preprocessing functions that can be used independently
+of specific estimator classes. These functions handle validation, metadata
+extraction, and format conversion for both single series and collections.
+"""
+
+__maintainer__ = ["TonyBagnall", "MatthewMiddlehurst"]
+__all__ = ["preprocess_series", "preprocess_collection"]
+
+import numpy as np
+import pandas as pd
+
+from aeon.utils.conversion import (
+    convert_collection,
+    resolve_equal_length_inner_type,
+    resolve_unequal_length_inner_type,
+)
+from aeon.utils.data_types import VALID_SERIES_INNER_TYPES
+from aeon.utils.validation.collection import (
+    get_n_cases,
+    get_n_channels,
+    get_n_timepoints,
+    get_type,
+    has_missing,
+    is_equal_length,
+    is_univariate,
+)
+
+
+def preprocess_series(
+    X,
+    axis: int,
+    tags: dict,
+    estimator_axis: int,
+    return_metadata: bool = True,
+):
+    """Validate a single series and convert it to the estimator's inner format.
+
+    X is checked against the capability tags (a ValueError is raised if a check
+    fails) and then converted to a type given by ``tags["X_inner_type"]``.
+
+    Parameters
+    ----------
+    X : np.ndarray, pd.Series or pd.DataFrame
+        The series to preprocess. It must be numeric and have at most two
+        dimensions.
+    axis : int
+        The time point axis of ``X`` if it is 2D. If ``axis==0``, each row is a
+        time point and each column a channel, i.e. the shape of the data is
+        ``(n_timepoints, n_channels)``. If ``axis==1``, each row is a channel
+        and each column a time point, i.e. the shape of the data is
+        ``(n_channels, n_timepoints)``.
+    tags : dict
+        Estimator tags. The keys read here are "X_inner_type",
+        "capability:univariate" (defaults to True),
+        "capability:multivariate" (defaults to False) and
+        "capability:missing_values" (defaults to False).
+    estimator_axis : int
+        The time point axis the estimator expects. 2D output has shape
+        ``(n_timepoints, n_channels)`` if ``estimator_axis==0`` and shape
+        ``(n_channels, n_timepoints)`` if ``estimator_axis==1``.
+    return_metadata : bool, default=True
+        If True, return ``(X, metadata)``; if False, return only ``X``.
+
+    Returns
+    -------
+    X : np.ndarray, pd.Series or pd.DataFrame
+        The converted series.
+    metadata : dict
+        Only returned if ``return_metadata`` is True, with keys:
+        - metadata["multivariate"]: whether X has more than one channel or not
+        - metadata["n_channels"]: number of channels in X
+        - metadata["missing_values"]: whether X has missing values or not
+    """
+    target_type = tags.get("X_inner_type")
+    # Validation runs first so unsupported input never reaches the conversion.
+    series_meta = _check_series(X, axis, tags)
+    X_inner = _convert_series(X, axis, target_type, estimator_axis)
+    # The flag only selects what is returned; the checks above always run.
+    if not return_metadata:
+        return X_inner
+    return X_inner, series_meta
+
+
+def preprocess_collection(X, tags, return_metadata=True):
+    """Validate a collection and convert it to the estimator's inner format.
+
+    1. Reshapes 1D arrays to ``(1, n)`` if X is a list of numpy arrays
+    2. Checks the characteristics of X against the capability tags
+    3. Converts X to a type listed in ``tags["X_inner_type"]`` if necessary
+
+    Parameters
+    ----------
+    X : collection
+        See aeon.utils.COLLECTIONS_DATA_TYPES for details on aeon supported
+        data structures.
+    tags : dict
+        Estimator tags. The keys read here are:
+        - "X_inner_type": str or list of str
+        - "capability:multivariate": bool (optional, defaults to False)
+        - "capability:unequal_length": bool (optional, defaults to False)
+        - "capability:missing_values": bool (optional, defaults to False)
+        The whole dict is quoted in the error message if a check fails.
+    return_metadata : bool, default=True
+        If True, return ``(X, metadata)``; if False, return only ``X``.
+
+    Returns
+    -------
+    X : collection
+        Processed X. A data structure of a type listed in "X_inner_type".
+    metadata : dict (if return_metadata=True)
+        Metadata about X, with flags:
+        - metadata["multivariate"]: whether X has more than one channel or not
+        - metadata["missing_values"]: whether X has missing values or not
+        - metadata["unequal_length"]: whether X contains unequal length series
+        - metadata["n_cases"]: number of cases in X
+        - metadata["n_channels"]: number of channels in X
+        - metadata["n_timepoints"]: number of timepoints in X if equal length, else None
+
+    Raises
+    ------
+    ValueError
+        If X has characteristics that the estimator cannot handle. Input of an
+        invalid type is rejected by ``get_type``.
+    """
+    target_type = tags.get("X_inner_type")
+    # A list of arrays may hold 1D arrays, which are reshaped to 2D up front.
+    if isinstance(X, list) and isinstance(X[0], np.ndarray):
+        X = _reshape_np_list(X)
+
+    collection_meta = _check_collection(X, tags)
+    X_inner = _convert_collection_type(X, target_type, collection_meta)
+
+    if not return_metadata:
+        return X_inner
+    return X_inner, collection_meta
+
+
+def _check_series(X, axis, tags):
+    """Check input X is valid for series estimators.
+
+    Validates axis, type, dtype and dimensionality, then the capability tags.
+
+    Parameters
+    ----------
+    X : np.ndarray, pd.Series or pd.DataFrame
+        The series to check. Must be numeric with at most 2 dimensions.
+    axis : int
+        The time point axis of the input series if it is 2D. Must be 0 or 1.
+    tags : dict
+        Estimator capabilities: "capability:univariate" (default True),
+        "capability:multivariate" and "capability:missing_values" (default False).
+
+    Returns
+    -------
+    metadata : dict
+        Metadata about X with keys "multivariate", "n_channels", "missing_values".
+
+    Raises
+    ------
+    ValueError
+        If X or axis is invalid, or the estimator cannot handle the data.
+    """
+    if axis > 1 or axis < 0:
+        raise ValueError(f"Input axis should be 0 or 1, saw {axis}")
+
+    # Checks: check valid dtype
+    if isinstance(X, np.ndarray):
+        if not (
+            issubclass(X.dtype.type, np.integer)
+            or issubclass(X.dtype.type, np.floating)
+        ):
+            raise ValueError("dtype for np.ndarray must be float or int")
+    elif isinstance(X, pd.Series):
+        if not pd.api.types.is_numeric_dtype(X):
+            raise ValueError("pd.Series dtype must be numeric")
+    elif isinstance(X, pd.DataFrame):
+        if not all(pd.api.types.is_numeric_dtype(X[col]) for col in X.columns):
+            raise ValueError("pd.DataFrame dtype must be numeric")
+    else:
+        raise ValueError(
+            f"Input type of X should be one of {VALID_SERIES_INNER_TYPES}, "
+            f"saw {type(X)}"
+        )
+
+    # Validate dimensionality
+    if X.ndim > 2:
+        raise ValueError(
+            "X must have at most 2 dimensions for multivariate data, optionally 1 "
+            f"for univariate data. Found {X.ndim} dimensions"
+        )
+
+    metadata = _get_series_metadata(X, axis)
+
+    # Check capabilities
+    allow_multivariate = tags.get("capability:multivariate", False)
+    allow_univariate = tags.get("capability:univariate", True)
+    allow_missing = tags.get("capability:missing_values", False)
+
+    if metadata["missing_values"] and not allow_missing:
+        raise ValueError("Missing values not supported by estimator")
+    if metadata["multivariate"] and not allow_multivariate:
+        raise ValueError("Multivariate data not supported by estimator")
+    if not metadata["multivariate"] and not allow_univariate:
+        raise ValueError("Univariate data not supported by estimator")
+
+    return metadata
+
+
+def _convert_series(X, axis, inner_type, estimator_axis):
+    """Convert input X to internal estimator datatype.
+
+    1D numpy arrays become 2D and 2D data is transposed if the two axes differ.
+
+    Parameters
+    ----------
+    X : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
+        A valid aeon time series data structure.
+    axis : int
+        The time point axis of the input series if it is 2D. Must be 0 or 1.
+    inner_type : str or list of str
+        The desired internal data type(s). Conversion targets the first entry.
+    estimator_axis : int
+        The target axis that the estimator expects.
+
+    Returns
+    -------
+    X : one of aeon.base._base_series.VALID_SERIES_INPUT_TYPES
+        Input time series with data structure of type inner_type.
+    """
+    if axis > 1 or axis < 0:
+        raise ValueError(f"Input axis should be 0 or 1, saw {axis}")
+
+    if not isinstance(inner_type, list):
+        inner_type = [inner_type]
+    inner_names = [i.split(".")[-1] for i in inner_type]  # text after last "."
+
+    input_type = type(X).__name__
+    if input_type not in inner_names:
+        if inner_names[0] == "ndarray":
+            X = X.to_numpy()
+        elif inner_names[0] == "DataFrame":
+            # converting a 1d array will create a 2d array in axis 0 format
+            transpose = False
+            if X.ndim == 1 and axis == 1:
+                transpose = True
+            X = pd.DataFrame(X)
+            if transpose:
+                X = X.T
+        else:
+            raise ValueError(
+                f"Unsupported inner type {inner_names[0]} derived from {inner_type}"
+            )
+
+    if X.ndim > 1 and estimator_axis != axis:
+        X = X.T
+    elif X.ndim == 1 and isinstance(X, np.ndarray):
+        X = X[np.newaxis, :] if estimator_axis == 1 else X[:, np.newaxis]
+
+    return X
+
+
+def _check_collection(X, tags):
+    """Check collection input X is valid.
+
+    Check if the input data is a compatible type, and that the estimator is
+    able to handle the data characteristics.
+
+    Parameters
+    ----------
+    X : collection
+        See aeon.utils.COLLECTIONS_DATA_TYPES for details on aeon supported
+        data structures.
+    tags : dict
+        Estimator tags; the "capability:*" keys read here default to False.
+
+    Returns
+    -------
+    metadata : dict
+        Metadata about X, as built by ``_get_collection_metadata``.
+
+    Raises
+    ------
+    ValueError
+        If X has missing values, is multivariate or has unequal length series
+        and the matching capability tag is not set.
+    """
+    # check if X is a valid type (the returned type name is not needed here)
+    get_type(X)
+
+    metadata = _get_collection_metadata(X)
+
+    # Check estimator capabilities for X
+    allow_multivariate = tags.get("capability:multivariate", False)
+    allow_missing = tags.get("capability:missing_values", False)
+    allow_unequal = tags.get("capability:unequal_length", False)
+
+    # Check capabilities vs input
+    problems = []
+    if metadata["missing_values"] and not allow_missing:
+        problems.append("missing values")
+    if metadata["multivariate"] and not allow_multivariate:
+        problems.append("multivariate series")
+    if metadata["unequal_length"] and not allow_unequal:
+        problems.append("unequal length series")
+
+    if problems:
+        # construct error message (adjacent literals are joined as written)
+        problems_and = " and ".join(problems)
+        msg = (
+            f"Data has {problems_and}, but the estimator cannot handle "
+            f"these characteristics due to having tags : {tags}. "
+        )
+        raise ValueError(msg)
+
+    return metadata
+
+
+def _convert_collection_type(X, inner_type, metadata):
+    """Convert collection X to one of the allowed inner types.
+
+    X is returned unchanged if its type is already in ``inner_type``.
+
+    Parameters
+    ----------
+    X : collection
+        See aeon.utils.COLLECTIONS_DATA_TYPES for details on aeon supported
+        data structures.
+    inner_type : str or list of str
+        The internal data type(s) the estimator accepts.
+    metadata : dict
+        Metadata about X. Only the "unequal_length" flag is read.
+
+    Returns
+    -------
+    X : collection
+        X itself, or X converted to the resolved inner type.
+    """
+    allowed_types = inner_type if isinstance(inner_type, list) else [inner_type]
+
+    # Nothing to do when X already has one of the allowed types.
+    if get_type(X) in allowed_types:
+        return X
+
+    # Several inner types may be allowed, so pick the one to convert to. Unequal
+    # length collections need a data structure that can hold unequal length
+    # series, so they use a separate resolver.
+    resolver = (
+        resolve_unequal_length_inner_type
+        if metadata["unequal_length"]
+        else resolve_equal_length_inner_type
+    )
+    target_type = resolver(allowed_types)
+
+    return convert_collection(X, target_type)
+
+
+def _get_collection_metadata(X):
+    """Build the metadata dict for X ("n_timepoints" is None if unequal length)."""
+    metadata = {
+        "multivariate": not is_univariate(X),
+        "missing_values": has_missing(X),
+        "unequal_length": not is_equal_length(X),
+        "n_cases": get_n_cases(X),
+        "n_channels": get_n_channels(X),
+    }
+    unequal = metadata["unequal_length"]
+    metadata["n_timepoints"] = None if unequal else get_n_timepoints(X)
+    return metadata
+
+
+def _get_series_metadata(X, axis):
+    """Extract the metadata of a single series.
+
+    Parameters
+    ----------
+    X : np.ndarray, pd.Series or pd.DataFrame
+        The series to describe, with one or two dimensions.
+    axis : int
+        The time point axis of ``X`` if it is 2D. With ``axis==1`` the data is
+        read as ``(n_channels, n_timepoints)`` and the channel count is taken
+        from ``X.shape[0]``. Any other value is treated like ``axis==0``, i.e.
+        the data is read as ``(n_timepoints, n_channels)`` and the channel
+        count is taken from ``X.shape[1]``.
+
+    Returns
+    -------
+    metadata : dict
+        Metadata about X, with flags:
+        - metadata["multivariate"]: whether X has more than one channel or not
+        - metadata["n_channels"]: number of channels in X, 1 if X is 1D
+        - metadata["missing_values"]: whether X has missing values or not
+    """
+    is_2d = X.ndim > 1
+    # axis is the time axis, so channels lie along the other axis of 2D data
+    channel_axis = 1 if axis != 1 else 0
+    # 1D input always counts as a single channel
+    n_channels = X.shape[channel_axis] if is_2d else 1
+
+    # numpy and pandas need different NaN checks; a DataFrame takes a second
+    # any() because the first one only reduces each column
+    if isinstance(X, np.ndarray):
+        missing = np.isnan(X).any()
+    elif isinstance(X, pd.Series):
+        missing = X.isna().any()
+    else:
+        missing = X.isna().any().any()
+
+    return {
+        "multivariate": is_2d and n_channels > 1,
+        "n_channels": n_channels,
+        "missing_values": missing,
+    }
+
+
+def _reshape_np_list(X):
+    """Reshape every 1D array in a list of numpy arrays to shape (1, n).
+
+    Parameters
+    ----------
+    X : list of np.ndarray
+        Arrays of any dimensionality; only the 1D ones are reshaped.
+
+    Returns
+    -------
+    list of np.ndarray
+        ``X`` itself if it holds no 1D array, otherwise a new list.
+    """
+    has_1d = any(x.ndim == 1 for x in X)
+    return [x.reshape(1, -1) if x.ndim == 1 else x for x in X] if has_1d else X