Source code for vital_sqi.calibration.threshold_estimator

"""
Derive accept/reject thresholds from empirical SQI distributions.

The core algorithm
------------------
1.  ``accept_df``  — SQI values from clean signals (noise_floor ≈ 0)
2.  ``reject_df``  — SQI values from heavily degraded signals

For each SQI column:

    lower = percentile(accept_df[col], lower_pct)   # e.g. p5
    upper = percentile(accept_df[col], upper_pct)   # e.g. p95

The accept region is the open interval ``(lower, upper)``.

Edge cases
----------
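- All-NaN column         → SQI is skipped (not calibratable); the same applies
                           to any column with fewer than 5 finite samples
- Constant column        → warns and widens the band to the accept median
                           ± max(10 % of |median|, epsilon)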
- Accept and reject      → the reject distribution is stored for reference
  distributions overlap    but does not change the thresholds (clean-signal
                           distribution is authoritative)
- Very small range       → epsilon guard prevents a zero-width rule
"""

import numpy as np
import pandas as pd
import warnings
from dataclasses import dataclass, field
from typing import Optional


# Numerical floor for the constant-column guard in ``estimate_thresholds``:
# a percentile band narrower than this is treated as degenerate, and the
# replacement band is never given a half-width smaller than this value.
_EPSILON = 1e-6   # minimum half-width for a non-degenerate band



[docs]
@dataclass
class SQIThreshold:
    """
    Calibrated threshold record for a single SQI column.

    Every field except ``sqi_name`` has a default, so a freshly constructed
    instance represents an *uncalibrated* SQI (NaN bounds, zero counts,
    ``calibrated=False``).  :func:`estimate_thresholds` fills the fields in
    and flips ``calibrated`` to True once bounds have been assigned.

    Attributes
    ----------
    sqi_name : str
        Column name as it appears in the SQI DataFrame.
    lower : float
        Lower bound of the accept region (exclusive, ``>`` operator).
        NaN until calibrated.
    upper : float
        Upper bound of the accept region (exclusive, ``<`` operator).
        NaN until calibrated.
    accept_median : float
        Median of the accept (clean) distribution.
    accept_std : float
        Standard deviation of the accept distribution (computed with
        ``np.std``, i.e. NumPy's default ``ddof=0``).
    reject_median : float
        Median of the reject distribution; NaN (not None) when no reject
        data is available for this column.
    n_accept : int
        Number of valid accept samples used (NaN and ±inf entries are
        excluded before counting).
    n_reject : int
        Number of valid reject samples used (NaN and ±inf entries are
        excluded before counting).
    calibrated : bool
        True once ``lower``/``upper`` have been assigned.  Note that
        near-constant columns *are* calibrated (with a widened band and an
        explanatory ``note``); columns with too few valid samples are not
        returned by :func:`estimate_thresholds` at all.
    note : str
        Human-readable note about any special handling applied (band
        widening, accept/reject overlap).  Empty when nothing special
        happened.
    """
    sqi_name: str
    lower: float = np.nan
    upper: float = np.nan
    accept_median: float = np.nan
    accept_std: float = np.nan
    reject_median: float = np.nan
    n_accept: int = 0
    n_reject: int = 0
    calibrated: bool = False
    note: str = ""




[docs]
def estimate_thresholds(
    accept_df: pd.DataFrame,
    reject_df: Optional[pd.DataFrame] = None,
    lower_pct: float = 5.0,
    upper_pct: float = 95.0,
) -> dict:
    """
    Derive accept/reject bounds for every SQI column.

    For each column of ``accept_df`` the accept band is the
    ``(lower_pct, upper_pct)`` percentile interval of the finite values.  A
    band narrower than ``_EPSILON`` is replaced by
    ``median ± max(10 % of |median|, _EPSILON)`` and a warning is issued.

    Parameters
    ----------
    accept_df : pd.DataFrame
        SQI values from clean segments.  One row per segment, one column per
        SQI.  NaN and ±inf entries are ignored; columns with fewer than five
        remaining samples are skipped.
    reject_df : pd.DataFrame, optional
        SQI values from degraded segments.  Used only for diagnostics (the
        reject distribution does not change the threshold values).
    lower_pct : float
        Lower percentile of the accept distribution that defines the accept
        lower bound (default ``5``).
    upper_pct : float
        Upper percentile defining the accept upper bound (default ``95``).

    Returns
    -------
    dict
        Mapping of ``sqi_name → SQIThreshold``.  Only calibratable SQIs are
        included.

    Raises
    ------
    ValueError
        If ``lower_pct`` is greater than ``upper_pct``.  (Percentiles outside
        ``[0, 100]`` are rejected by ``np.percentile`` with the same
        exception type.)

    Warns
    -----
    UserWarning
        When a column has near-zero spread and its band had to be widened.
    """
    # An inverted pair would make ``upper - lower`` negative and be silently
    # misreported as a "constant column"; fail loudly instead.
    if lower_pct > upper_pct:
        raise ValueError(
            f"lower_pct ({lower_pct}) must not exceed upper_pct ({upper_pct})"
        )

    # Label used in diagnostic notes; reflects the percentiles actually used
    # (renders as "p5-p95" for the defaults).
    pct_label = f"p{lower_pct:g}-p{upper_pct:g}"

    thresholds = {}

    for col in accept_df.columns:
        accept_vals = _finite_values(accept_df[col])

        # Too few samples for a meaningful percentile estimate: the column is
        # left out of the result, so skip before doing any further work.
        if len(accept_vals) < 5:
            continue

        t = SQIThreshold(sqi_name=col, n_accept=len(accept_vals))

        reject_vals = np.array([])
        if reject_df is not None and col in reject_df.columns:
            reject_vals = _finite_values(reject_df[col])
            t.n_reject = len(reject_vals)
            if len(reject_vals) > 0:
                t.reject_median = float(np.median(reject_vals))

        t.accept_median = float(np.median(accept_vals))
        t.accept_std = float(np.std(accept_vals))

        lower = float(np.percentile(accept_vals, lower_pct))
        upper = float(np.percentile(accept_vals, upper_pct))

        # Guard: constant or near-constant column
        if upper - lower < _EPSILON:
            half = max(abs(t.accept_median) * 0.1, _EPSILON)
            lower = t.accept_median - half
            upper = t.accept_median + half
            t.note = "Constant/near-constant: band widened by epsilon"
            warnings.warn(
                f"SQI '{col}' has near-zero variance in accept distribution. "
                f"Widened band to ({lower:.4g}, {upper:.4g}).",
                stacklevel=2,  # attribute the warning to the caller
            )

        # Guard: reject distribution overlaps — note it but keep clean bounds.
        # The overlap note is only written when no earlier note exists, so the
        # band-widening note above takes precedence.
        if len(reject_vals) > 0:
            reject_lower = float(np.percentile(reject_vals, lower_pct))
            reject_upper = float(np.percentile(reject_vals, upper_pct))
            overlap = not (upper < reject_lower or lower > reject_upper)
            if overlap and not t.note:
                t.note = (
                    f"Accept/reject distributions overlap. "
                    f"Accept {pct_label}: ({lower:.3g}, {upper:.3g}), "
                    f"Reject {pct_label}: ({reject_lower:.3g}, {reject_upper:.3g}). "
                    f"Consider using a more discriminative SQI or tighter noise conditions."
                )

        t.lower = lower
        t.upper = upper
        t.calibrated = True
        thresholds[col] = t

    return thresholds


def _finite_values(series: pd.Series) -> np.ndarray:
    """Return the values of *series* with NaN and ±inf entries removed."""
    return series.replace([np.inf, -np.inf], np.nan).dropna().values




[docs]
def thresholds_to_dataframe(thresholds: dict) -> pd.DataFrame:
    """
    Convert a thresholds dict to a summary DataFrame for inspection.

    Parameters
    ----------
    thresholds : dict
        Output of :func:`estimate_thresholds`.  May be empty.

    Returns
    -------
    pd.DataFrame
        One row per SQI, indexed by ``sqi_name``, with columns: lower, upper,
        accept_median, accept_std, reject_median, n_accept, n_reject,
        calibrated, note.  An empty input yields an empty frame that still
        carries this index name and these columns.
    """
    field_names = (
        "sqi_name",
        "lower",
        "upper",
        "accept_median",
        "accept_std",
        "reject_median",
        "n_accept",
        "n_reject",
        "calibrated",
        "note",
    )
    rows = [
        {name: getattr(t, name) for name in field_names}
        for t in thresholds.values()
    ]
    # Passing ``columns`` explicitly keeps the schema stable when ``rows`` is
    # empty; without it ``set_index`` raises KeyError on a column-less frame.
    return pd.DataFrame(rows, columns=list(field_names)).set_index("sqi_name")