# Source code for tscf_eval.data_loader.tsc_data

"""Time-series classification data container.

This module provides the :class:`TSCData` dataclass, an immutable container
for time-series classification datasets. It supports both univariate and
multivariate time series with utilities for conversion, persistence, and
basic dataset operations.

Classes
-------
TSCData
    Immutable container holding feature arrays ``X``, labels ``y``, and
    metadata (name, split). Provides factory methods for construction from
    arrays or DataFrames, properties for dataset introspection, and
    serialization via NumPy's ``.npz`` format.

Type Aliases
------------
Split
    Literal type for dataset splits: ``'train'`` or ``'test'``.

Examples
--------
>>> import numpy as np
>>> from tscf_eval.data_loader import TSCData
>>>
>>> # Create from arrays
>>> X = np.random.randn(100, 50)  # 100 instances, 50 time points
>>> y = np.random.randint(0, 2, 100)
>>> data = TSCData.from_arrays("my_dataset", "train", X, y)
>>>
>>> # Inspect properties
>>> print(data.n_instances, data.series_length, data.n_classes)
100 50 2
>>>
>>> # Save and load
>>> data.save("my_dataset.npz")
>>> loaded = TSCData.load("my_dataset.npz")

See Also
--------
tscf_eval.data_loader.DataLoader : Abstract loader interface.
tscf_eval.data_loader.UCRLoader : UCR archive dataset loader.
"""

from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, cast

import numpy as np
import pandas as pd

Split = Literal["train", "test"]


[docs] @dataclass(frozen=True) class TSCData: """Immutable container for time-series classification data. A small, well-typed container for time-series classification datasets. """ name: str split: Split X: np.ndarray y: np.ndarray
[docs] @staticmethod def from_arrays( name: str, split: Split, X: np.ndarray, y: Iterable, *, squeeze_univariate: bool = True, ) -> "TSCData": """Create a ``TSCData`` instance from numpy arrays / array-likes. Parameters ---------- name : str Dataset name. split : {'train', 'test'} Which split this instance belongs to. X : array-like Time-series data. Accepts either 2D ``(n, L)`` for univariate data or 3D ``(n, D, L)`` for multivariate data. y : array-like 1D labels of length ``n``. squeeze_univariate : bool, optional If ``True`` and ``X`` is shape ``(n,1,L)``, squeeze the channel dimension to produce shape ``(n,L)``. Returns ------- TSCData Constructed immutable container. Raises ------ ValueError If ``X`` or ``y`` do not have expected dimensions or if the number of instances disagree. """ X = np.asarray(X) y = np.asarray(list(y)) if squeeze_univariate and X.ndim == 3 and X.shape[1] == 1: X = X[:, 0, :] if X.ndim not in (2, 3): raise ValueError("X must be 2D (n,L) or 3D (n,D,L).") if y.ndim != 1: raise ValueError("y must be 1D.") if X.shape[0] != y.shape[0]: raise ValueError(f"n mismatch: X has {X.shape[0]} rows but y has {y.shape[0]}.") return TSCData(name=name, split=split, X=X, y=y)
[docs] @staticmethod def from_dataframe( name: str, split: Split, df: pd.DataFrame, *, label_col: str, feature_cols: Sequence[str] | None = None, ) -> "TSCData": """Create a ``TSCData`` instance from a wide-format ``DataFrame``. The dataframe format expected is one row per instance, numeric columns representing time points (or channels flattened), and a column containing the label. Parameters ---------- name : str Dataset name used in the resulting ``TSCData``. split : {'train', 'test'} Split label to set on the resulting object. df : pandas.DataFrame Source table. label_col : str Column name in ``df`` containing labels. feature_cols : sequence of str, optional Columns to use as features in the desired order. If ``None``, numeric columns except label/split columns are used. (label maps are not used; labels are returned in original form) Returns ------- TSCData Constructed dataset object. Raises ------ ValueError If ``label_col`` is missing or no numeric feature columns are found when ``feature_cols`` is not provided. """ if label_col not in df.columns: raise ValueError(f"label_col='{label_col}' not in dataframe.") if feature_cols is None: drop_cols = {label_col} drop_cols |= {c for c in ["split", "Split", "SPLIT"] if c in df.columns} feature_cols = ( df.drop(columns=list(drop_cols)).select_dtypes(include=[np.number]).columns.tolist() ) if not feature_cols: raise ValueError("No numeric feature columns found; pass feature_cols=[...]") else: for c in feature_cols: if c not in df.columns: raise ValueError(f"feature column '{c}' missing.") X = df[feature_cols].to_numpy(dtype=float) # (n, L) y = df[label_col].to_numpy() return TSCData.from_arrays(name, split, X, y, squeeze_univariate=True)
@property def n_instances(self) -> int: """Number of instances in the dataset. Returns ------- int Number of rows / time-series instances (n). """ return int(self.X.shape[0]) @property def series_length(self) -> int: """Length of each time series in time points. For univariate data this is the second axis length of ``X`` when ``X`` has shape ``(n, L)``. For multivariate data (``X`` shape ``(n, D, L)``) this returns ``L``. Returns ------- int Series length (L). """ return int(self.X.shape[-1]) @property def n_dims(self) -> int: """Number of dimensions (channels) per time series. Returns ------- int ``1`` for univariate series (``X`` is 2D) or ``D`` for multivariate series (``X`` is 3D with shape ``(n, D, L)``). """ return 1 if self.X.ndim == 2 else int(self.X.shape[1]) @property def n_classes(self) -> int: """Number of unique class labels present in ``y``. Returns ------- int The number of distinct labels (classes) in the label array. """ return int(np.unique(self.y).size) @property def is_univariate(self) -> bool: """Whether the dataset is univariate. Returns ------- bool True if each instance has a single channel (``D == 1``), False otherwise. """ return self.n_dims == 1
[docs] def describe(self) -> dict: """Return a small dictionary summarizing dataset properties. The dictionary contains basic metadata useful for logging or quick inspection: dataset name and split, shapes (instances, series length, dimensions), number of classes, class counts and the optional label mapping if present. Returns ------- dict Summary dictionary with keys: 'name', 'split', 'n_instances', 'series_length', 'n_dims', 'n_classes', 'class_counts'. """ classes, counts = np.unique(self.y, return_counts=True) return { "name": self.name, "split": self.split, "n_instances": self.n_instances, "series_length": self.series_length, "n_dims": self.n_dims, "n_classes": self.n_classes, "class_counts": {str(c): int(n) for c, n in zip(classes, counts, strict=True)}, }
[docs] def to_dataframe(self, *, label_name: str = "label", prefix: str = "t_") -> pd.DataFrame: """Return a wide-format ``DataFrame`` representing the dataset. Parameters ---------- label_name : str, optional Column name to use for labels in the returned dataframe. prefix : str, optional Prefix for generated numeric/time columns. Returns ------- pandas.DataFrame Wide-format dataframe with numeric columns for each time point (and channel) and a final column with labels. """ if self.is_univariate: cols = [f"{prefix}{i}" for i in range(self.series_length)] dfX = pd.DataFrame(self.X, columns=cols) else: n, d, L = self.X.shape cols = [f"{prefix}{t}_d{dim}" for dim in range(d) for t in range(L)] dfX = pd.DataFrame(self.X.reshape(n, d * L), columns=cols) df = dfX.copy() df[label_name] = self.y return df
[docs] def map_labels(self, mapping: dict[int | str, int | str]) -> "TSCData": """Return a copy of this dataset with labels remapped. Parameters ---------- mapping : dict Mapping from original labels to new labels. If a label is not present in ``mapping``, it is left unchanged. Returns ------- TSCData New instance with remapped ``y``. """ new_y = np.array([mapping.get(item, item) for item in self.y]) return TSCData(self.name, self.split, self.X, new_y)
[docs] def select_classes(self, keep: Iterable[int | str]) -> "TSCData": """Return a view of the dataset keeping only specified classes. Parameters ---------- keep : iterable Labels to keep. Items not present in the dataset are ignored. Returns ------- TSCData New instance containing only instances whose label is in ``keep``. """ keep_set = set(keep) mask = np.array([item in keep_set for item in self.y], dtype=bool) return TSCData(self.name, self.split, self.X[mask], self.y[mask])
[docs] def save(self, path: str | Path) -> None: """Save the dataset to a compressed NumPy ``.npz`` file. The file contains arrays for ``X``, ``y``, ``name`` and ``split``. Use :meth:`TSCData.load` to restore. Parameters ---------- path : str or pathlib.Path Destination file path. The function will use ``numpy.savez_compressed``. """ path = Path(path) np.savez_compressed( path, X=self.X, y=self.y, name=self.name, split=self.split, )
[docs] @staticmethod def load(path: str | Path) -> "TSCData": """Load a ``TSCData`` instance previously written with :meth:`save`. Parameters ---------- path : str or pathlib.Path Path to ``.npz`` file produced by :meth:`save`. Returns ------- TSCData Restored dataset. """ z = np.load(Path(path), allow_pickle=True) name = str(z["name"]) split = str(z["split"]) split = cast("Split", split) X = np.asarray(z["X"]) y = np.asarray(z["y"]) return TSCData(name=name, split=split, X=X, y=y)