Source code for tscf_eval.data_loader.files

"""File-based dataset loader for CSV, TXT, and Excel files.

This module provides the :class:`FileLoader` class for loading time-series
classification datasets from local files. It supports wide-format tabular
data where each row represents a time-series instance.

Supported file formats:
- CSV (``.csv``, ``.txt``)
- Excel (``.xlsx``, ``.xls``)

The loader can operate in two modes:
1. **Two-file mode**: Separate files for train and test splits.
2. **Single-file mode**: One file with a column indicating split membership.

Classes
-------
FileLoader
    Loader for file-based datasets. Reads wide-format tables and returns
    data in the :class:`TSCData` container format.

Examples
--------
>>> from tscf_eval.data_loader import FileLoader
>>>
>>> # Two-file mode
>>> loader = FileLoader(
...     train_path="data/train.csv",
...     test_path="data/test.csv",
...     label_col="target"
... )
>>> train_data = loader.load("train")
>>> test_data = loader.load("test")
>>>
>>> # Single-file mode with split column
>>> loader = FileLoader(
...     data_path="data/full_dataset.csv",
...     split_col="split",
...     label_col="target"
... )
>>> train_data = loader.load("train")

See Also
--------
tscf_eval.data_loader.UCRLoader : For loading UCR archive datasets.
tscf_eval.data_loader.TSCData : Data container returned by loaders.
"""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd

from .base import DataLoader
from .tsc_data import Split, TSCData

if TYPE_CHECKING:
    from collections.abc import Sequence



[docs]
class FileLoader(DataLoader):
    """Load a wide-format CSV/XLSX file (or pair of files) as ``TSCData``.

    Supports two modes:
      - Provide ``train_path`` and ``test_path`` (two-file mode).
      - Provide ``data_path`` and ``split_col`` indicating which rows
        belong to train/test (single-file mode).

    The table should be wide-format: one row per instance, numeric
    columns representing time points (or flattened channels), and a
    separate label column.
    """


[docs]
    def __init__(
        self,
        *,
        train_path: str | Path | None = None,
        test_path: str | Path | None = None,
        data_path: str | Path | None = None,
        split_col: str | None = None,
        train_value: str = "train",
        test_value: str = "test",
        label_col: str | None = None,
        feature_cols: Sequence[str] | None = None,
        sheet_name: str | int | None = None,
        name: str = "local_wide",
    ):
        """Initialize a file-based loader.

        Parameters
        ----------
        train_path, test_path : str or pathlib.Path, optional
            Paths to separate train/test files (two-file mode). Mutually
            exclusive with ``data_path`` mode.
        data_path : str or pathlib.Path, optional
            Path to a single file containing both splits; requires
            ``split_col`` to be provided.
        split_col : str, optional
            Column name in ``data_path`` indicating split membership.
        train_value, test_value : str
            Values in ``split_col`` that indicate train/test rows.
        label_col : str
            Column name containing labels (required).
        feature_cols : sequence of str, optional
            Optional explicit list of feature columns to use.
        sheet_name : str or int, optional
            When reading Excel files, the sheet to use.
        name : str
            Dataset name to assign to produced ``TSCData`` objects.
        """
        two = (train_path is not None) and (test_path is not None) and (data_path is None)
        one = (
            (data_path is not None)
            and (train_path is None)
            and (test_path is None)
            and (split_col is not None)
        )
        if not two ^ one:
            raise ValueError("Provide either (train_path & test_path) XOR (data_path & split_col).")
        if label_col is None:
            raise ValueError("label_col is required.")

        self.train_path = Path(train_path).expanduser() if train_path else None
        self.test_path = Path(test_path).expanduser() if test_path else None
        self.data_path = Path(data_path).expanduser() if data_path else None
        self.split_col = split_col
        self.train_value = str(train_value).lower()
        self.test_value = str(test_value).lower()
        self.label_col = label_col
        self.feature_cols = list(feature_cols) if feature_cols is not None else None
        self.sheet_name = sheet_name
        self.name = name



[docs]
    def load(self, split: Split, **kwargs) -> TSCData:
        """Load the requested split and return a :class:`TSCData`.

        Parameters
        ----------
        split : {'train', 'test'}
            Which split to load.
        **kwargs
            Additional options (not currently used).

        Returns
        -------
        TSCData
            Dataset container with feature arrays ``X`` and labels ``y``.
            ``X`` has shape ``(N, T)`` where ``N`` is the number of instances
            and ``T`` is the number of time points (feature columns).

        Raises
        ------
        ValueError
            If ``split_col`` is specified but not found in the data file.
        """
        if self.data_path is not None:
            df = self._read_table(self.data_path)
            if self.split_col not in df.columns:
                raise ValueError(f"split_col='{self.split_col}' not found in {self.data_path}")
            want = self.train_value if split == "train" else self.test_value
            df_split = df[df[self.split_col].astype(str).str.lower() == want]
            return TSCData.from_dataframe(
                name=self.name,
                split=split,
                df=df_split,
                label_col=self.label_col,
                feature_cols=self.feature_cols,
            )

        assert self.train_path is not None and self.test_path is not None
        path = self.train_path if split == "train" else self.test_path
        df = self._read_table(path)
        return TSCData.from_dataframe(
            name=self.name,
            split=split,
            df=df,
            label_col=self.label_col,
            feature_cols=self.feature_cols,
        )



[docs]
    def describe(self) -> dict:
        """Return a concise description for the dataset(s) represented by this loader.

        The return value includes per-split metadata (via
        :meth:`TSCData.describe`) and an overall summary (combined
        number of classes across splits).

        Returns
        -------
        dict
            Dictionary with keys:

            - ``'name'``: Dataset name.
            - ``'splits'``: Dict mapping 'train'/'test' to their descriptions.
            - ``'overall'``: Dict with ``'n_classes'`` (total unique classes).
        """
        tr = self.load("train")
        te = self.load("test")
        n_classes = int(np.unique(np.concatenate([tr.y, te.y])).size)
        return {
            "name": self.name,
            "splits": {"train": tr.describe(), "test": te.describe()},
            "overall": {"n_classes": n_classes},
        }


    def _read_table(self, path: Path) -> pd.DataFrame:
        """Read a table from CSV or Excel into a DataFrame.

        Parameters
        ----------
        path : pathlib.Path
            Path to the file. Supported suffixes: .csv, .txt, .xlsx, .xls.

        Returns
        -------
        pandas.DataFrame
            Loaded table.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If the file extension is unsupported.
        """
        if not path.exists():
            raise FileNotFoundError(str(path))
        suffix = path.suffix.lower()
        if suffix in (".csv", ".txt"):
            return pd.read_csv(path)
        if suffix in (".xlsx", ".xls"):
            res = pd.read_excel(path, sheet_name=self.sheet_name)
            if isinstance(res, dict):
                return next(iter(res.values()))
            return res
        raise ValueError(f"Unsupported file type: {suffix}")