Source code for tscf_eval.data_loader.tsc_data

"""Time-series classification data container.

This module provides the :class:`TSCData` dataclass, an immutable container
for time-series classification datasets. It supports both univariate and
multivariate time series with utilities for conversion, persistence, and
basic dataset operations.

Classes
-------
TSCData
    Immutable container holding feature arrays ``X``, labels ``y``, and
    metadata (name, split). Provides factory methods for construction from
    arrays or DataFrames, properties for dataset introspection, and
    serialization via NumPy's ``.npz`` format.

Type Aliases
------------
Split
    Literal type for dataset splits: ``'train'`` or ``'test'``.

Examples
--------
>>> import numpy as np
>>> from tscf_eval.data_loader import TSCData
>>>
>>> # Create from arrays
>>> X = np.random.randn(100, 50)  # 100 instances, 50 time points
>>> y = np.random.randint(0, 2, 100)
>>> data = TSCData.from_arrays("my_dataset", "train", X, y)
>>>
>>> # Inspect properties
>>> print(data.n_instances, data.series_length, data.n_classes)
100 50 2
>>>
>>> # Save and load
>>> data.save("my_dataset.npz")
>>> loaded = TSCData.load("my_dataset.npz")

See Also
--------
tscf_eval.data_loader.DataLoader : Abstract loader interface.
tscf_eval.data_loader.UCRLoader : UCR archive dataset loader.
"""

from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, cast

import numpy as np
import pandas as pd

# Type alias for the ``split`` field of :class:`TSCData`; the only permitted
# values are the literal strings ``'train'`` and ``'test'``.
Split = Literal["train", "test"]



[docs]
@dataclass(frozen=True)
class TSCData:
    """Immutable container for time-series classification data.

    A small, well-typed container for time-series classification datasets.

    ``frozen=True`` prevents rebinding the fields after construction. It does
    not protect the contents of the arrays themselves, which remain ordinary
    NumPy arrays. Constructing the class directly performs no shape checks;
    :meth:`from_arrays` validates dimensions and instance counts.

    Attributes
    ----------
    name : str
        Dataset name.
    split : {'train', 'test'}
        Which split this instance belongs to.
    X : numpy.ndarray
        Feature array. The rest of the class treats a 2D array as
        ``(n, L)`` univariate data and a 3D array as ``(n, D, L)``
        multivariate data.
    y : numpy.ndarray
        Label array, one entry per instance.
    """

    name: str
    split: Split
    X: np.ndarray
    y: np.ndarray


[docs]
    @staticmethod
    def from_arrays(
        name: str,
        split: Split,
        X: np.ndarray,
        y: Iterable,
        *,
        squeeze_univariate: bool = True,
    ) -> "TSCData":
        """Create a ``TSCData`` instance from numpy arrays / array-likes.

        Parameters
        ----------
        name : str
            Dataset name.
        split : {'train', 'test'}
            Which split this instance belongs to.
        X : array-like
            Time-series data. Accepts either 2D ``(n, L)`` for univariate
            data or 3D ``(n, D, L)`` for multivariate data.
        y : array-like
            1D labels of length ``n``.
        squeeze_univariate : bool, optional
            If ``True`` and ``X`` is shape ``(n,1,L)``, squeeze the
            channel dimension to produce shape ``(n,L)``.

        Returns
        -------
        TSCData
            Constructed immutable container.

        Raises
        ------
        ValueError
            If ``X`` or ``y`` do not have expected dimensions or if the
            number of instances disagree.
        """
        X = np.asarray(X)
        y = np.asarray(list(y))

        if squeeze_univariate and X.ndim == 3 and X.shape[1] == 1:
            X = X[:, 0, :]

        if X.ndim not in (2, 3):
            raise ValueError("X must be 2D (n,L) or 3D (n,D,L).")
        if y.ndim != 1:
            raise ValueError("y must be 1D.")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"n mismatch: X has {X.shape[0]} rows but y has {y.shape[0]}.")

        return TSCData(name=name, split=split, X=X, y=y)



[docs]
    @staticmethod
    def from_dataframe(
        name: str,
        split: Split,
        df: pd.DataFrame,
        *,
        label_col: str,
        feature_cols: Sequence[str] | None = None,
    ) -> "TSCData":
        """Create a ``TSCData`` instance from a wide-format ``DataFrame``.

        The dataframe format expected is one row per instance, numeric
        columns representing time points (or channels flattened), and
        a column containing the label.

        Parameters
        ----------
        name : str
            Dataset name used in the resulting ``TSCData``.
        split : {'train', 'test'}
            Split label to set on the resulting object.
        df : pandas.DataFrame
            Source table.
        label_col : str
            Column name in ``df`` containing labels.
        feature_cols : sequence of str, optional
            Columns to use as features in the desired order. If ``None``,
            numeric columns except label/split columns are used.
        (label maps are not used; labels are returned in original form)

        Returns
        -------
        TSCData
            Constructed dataset object.

        Raises
        ------
        ValueError
            If ``label_col`` is missing or no numeric feature columns are
            found when ``feature_cols`` is not provided.
        """
        if label_col not in df.columns:
            raise ValueError(f"label_col='{label_col}' not in dataframe.")

        if feature_cols is None:
            drop_cols = {label_col}
            drop_cols |= {c for c in ["split", "Split", "SPLIT"] if c in df.columns}
            feature_cols = (
                df.drop(columns=list(drop_cols)).select_dtypes(include=[np.number]).columns.tolist()
            )
            if not feature_cols:
                raise ValueError("No numeric feature columns found; pass feature_cols=[...]")
        else:
            for c in feature_cols:
                if c not in df.columns:
                    raise ValueError(f"feature column '{c}' missing.")

        X = df[feature_cols].to_numpy(dtype=float)  # (n, L)
        y = df[label_col].to_numpy()
        return TSCData.from_arrays(name, split, X, y, squeeze_univariate=True)


    @property
    def n_instances(self) -> int:
        """Number of instances in the dataset.

        Returns
        -------
        int
            Number of rows / time-series instances (n).
        """
        return int(self.X.shape[0])

    @property
    def series_length(self) -> int:
        """Length of each time series in time points.

        For univariate data this is the second axis length of ``X`` when
        ``X`` has shape ``(n, L)``. For multivariate data (``X`` shape
        ``(n, D, L)``) this returns ``L``.

        Returns
        -------
        int
            Series length (L).
        """
        return int(self.X.shape[-1])

    @property
    def n_dims(self) -> int:
        """Number of dimensions (channels) per time series.

        Returns
        -------
        int
            ``1`` for univariate series (``X`` is 2D) or ``D`` for
            multivariate series (``X`` is 3D with shape ``(n, D, L)``).
        """
        return 1 if self.X.ndim == 2 else int(self.X.shape[1])

    @property
    def n_classes(self) -> int:
        """Number of unique class labels present in ``y``.

        Returns
        -------
        int
            The number of distinct labels (classes) in the label array.
        """
        return int(np.unique(self.y).size)

    @property
    def is_univariate(self) -> bool:
        """Whether the dataset is univariate.

        Returns
        -------
        bool
            True if each instance has a single channel (``D == 1``),
            False otherwise.
        """
        return self.n_dims == 1


[docs]
    def describe(self) -> dict:
        """Return a small dictionary summarizing dataset properties.

        The dictionary contains basic metadata useful for logging or
        quick inspection: dataset name and split, shapes (instances,
        series length, dimensions), number of classes, class counts
        and the optional label mapping if present.

        Returns
        -------
        dict
            Summary dictionary with keys: 'name', 'split', 'n_instances',
            'series_length', 'n_dims', 'n_classes', 'class_counts'.
        """
        classes, counts = np.unique(self.y, return_counts=True)
        return {
            "name": self.name,
            "split": self.split,
            "n_instances": self.n_instances,
            "series_length": self.series_length,
            "n_dims": self.n_dims,
            "n_classes": self.n_classes,
            "class_counts": {str(c): int(n) for c, n in zip(classes, counts, strict=True)},
        }



[docs]
    def to_dataframe(self, *, label_name: str = "label", prefix: str = "t_") -> pd.DataFrame:
        """Return a wide-format ``DataFrame`` representing the dataset.

        Parameters
        ----------
        label_name : str, optional
            Column name to use for labels in the returned dataframe.
        prefix : str, optional
            Prefix for generated numeric/time columns.

        Returns
        -------
        pandas.DataFrame
            Wide-format dataframe with numeric columns for each time
            point (and channel) and a final column with labels.
        """
        if self.is_univariate:
            cols = [f"{prefix}{i}" for i in range(self.series_length)]
            dfX = pd.DataFrame(self.X, columns=cols)
        else:
            n, d, L = self.X.shape
            cols = [f"{prefix}{t}_d{dim}" for dim in range(d) for t in range(L)]
            dfX = pd.DataFrame(self.X.reshape(n, d * L), columns=cols)
        df = dfX.copy()
        df[label_name] = self.y
        return df



[docs]
    def map_labels(self, mapping: dict[int | str, int | str]) -> "TSCData":
        """Return a copy of this dataset with labels remapped.

        Parameters
        ----------
        mapping : dict
            Mapping from original labels to new labels. If a label is not
            present in ``mapping``, it is left unchanged.

        Returns
        -------
        TSCData
            New instance with remapped ``y``.
        """
        new_y = np.array([mapping.get(item, item) for item in self.y])
        return TSCData(self.name, self.split, self.X, new_y)



[docs]
    def select_classes(self, keep: Iterable[int | str]) -> "TSCData":
        """Return a view of the dataset keeping only specified classes.

        Parameters
        ----------
        keep : iterable
            Labels to keep. Items not present in the dataset are ignored.

        Returns
        -------
        TSCData
            New instance containing only instances whose label is in
            ``keep``.
        """
        keep_set = set(keep)
        mask = np.array([item in keep_set for item in self.y], dtype=bool)
        return TSCData(self.name, self.split, self.X[mask], self.y[mask])



[docs]
    def save(self, path: str | Path) -> None:
        """Save the dataset to a compressed NumPy ``.npz`` file.

        The file contains arrays for ``X``, ``y``, ``name`` and ``split``. Use
        :meth:`TSCData.load` to restore.

        Parameters
        ----------
        path : str or pathlib.Path
            Destination file path. The function will use ``numpy.savez_compressed``.
        """
        path = Path(path)
        np.savez_compressed(
            path,
            X=self.X,
            y=self.y,
            name=self.name,
            split=self.split,
        )



[docs]
    @staticmethod
    def load(path: str | Path) -> "TSCData":
        """Load a ``TSCData`` instance previously written with :meth:`save`.

        Parameters
        ----------
        path : str or pathlib.Path
            Path to ``.npz`` file produced by :meth:`save`.

        Returns
        -------
        TSCData
            Restored dataset.
        """
        z = np.load(Path(path), allow_pickle=True)
        name = str(z["name"])
        split = str(z["split"])
        split = cast("Split", split)

        X = np.asarray(z["X"])
        y = np.asarray(z["y"])

        return TSCData(name=name, split=split, X=X, y=y)