"""
Derive accept/reject thresholds from empirical SQI distributions.
The core algorithm
------------------
1. ``accept_df`` — SQI values from clean signals (noise_floor ≈ 0)
2. ``reject_df`` — SQI values from heavily degraded signals
For each SQI column:
lower = percentile(accept_df[col], lower_pct) # e.g. p5
upper = percentile(accept_df[col], upper_pct) # e.g. p95
The accept region is the open interval ``(lower, upper)``.
Edge cases
----------
- All-NaN column → SQI is skipped (not calibratable); the same applies to
  any column with fewer than 5 finite accept samples
- Constant column → warns and widens the band to
  median ± max(10% of |median|, epsilon)
- Accept and reject distributions overlap → the reject distribution is
  stored for reference but does not change the thresholds (the clean-signal
  distribution is authoritative)
- Very small range → epsilon guard prevents a zero-width rule
"""
import numpy as np
import pandas as pd
import warnings
from dataclasses import dataclass, field
from typing import Optional
# Degeneracy guard used by estimate_thresholds: a percentile band whose width
# (upper - lower) is below this value is treated as constant/near-constant,
# and the replacement band is never given a half-width smaller than this.
_EPSILON = 1e-6 # minimum half-width for a non-degenerate band
[docs]
@dataclass
class SQIThreshold:
    """
    Calibration record for one SQI column.

    Only ``sqi_name`` is required. Every other field has a default, so a
    bare ``SQIThreshold(sqi_name=...)`` describes an SQI that has not been
    calibrated: NaN bounds and statistics, zero sample counts, an empty
    note and ``calibrated=False``.

    Attributes
    ----------
    sqi_name : str
        Name of the column in the SQI DataFrame this record belongs to.
    lower : float
        Exclusive lower edge of the accept region (a value passes the
        lower test when ``value > lower``). NaN until set.
    upper : float
        Exclusive upper edge of the accept region (a value passes the
        upper test when ``value < upper``). NaN until set.
    accept_median : float
        Median of the accept (clean-signal) distribution.
    accept_std : float
        Standard deviation of the accept distribution.
    reject_median : float
        Median of the reject (degraded-signal) distribution; NaN when no
        reject data is available.
    n_accept : int
        Count of valid (non-NaN) accept samples that were used.
    n_reject : int
        Count of valid (non-NaN) reject samples that were used.
    calibrated : bool
        True once usable bounds have been assigned; False if the SQI has
        not been, or could not be, calibrated.
    note : str
        Free-text, human-readable remark about any special handling.
    """

    # Identity of the SQI column.
    sqi_name: str

    # Accept-region bounds (both exclusive).
    lower: float = np.nan
    upper: float = np.nan

    # Summary statistics of the two empirical distributions.
    accept_median: float = np.nan
    accept_std: float = np.nan
    reject_median: float = np.nan

    # Number of valid samples behind the statistics above.
    n_accept: int = 0
    n_reject: int = 0

    # Calibration outcome and an optional explanatory remark.
    calibrated: bool = False
    note: str = ""
[docs]
def estimate_thresholds(
    accept_df: pd.DataFrame,
    reject_df: Optional[pd.DataFrame] = None,
    lower_pct: float = 5.0,
    upper_pct: float = 95.0,
) -> dict:
    """
    Derive accept/reject bounds for every SQI column.

    For each column of ``accept_df`` the accept band is the
    ``(lower_pct, upper_pct)`` percentile interval of its finite values.
    A band narrower than ``_EPSILON`` is replaced by
    ``median ± max(10% of |median|, _EPSILON)`` and a warning is issued.

    Parameters
    ----------
    accept_df : pd.DataFrame
        SQI values from clean segments. One row per segment, one column per
        SQI. Columns with fewer than 5 finite numeric values (including
        all-NaN and non-numeric columns) are skipped.
    reject_df : pd.DataFrame, optional
        SQI values from degraded segments. Used only for diagnostics (the
        reject distribution does not change the threshold values).
    lower_pct : float
        Lower percentile of the accept distribution that defines the accept
        lower bound (default ``5``).
    upper_pct : float
        Upper percentile defining the accept upper bound (default ``95``).

    Returns
    -------
    dict
        Mapping of ``sqi_name → SQIThreshold``. Only calibratable SQIs are
        included.

    Raises
    ------
    ValueError
        If the percentiles do not satisfy
        ``0 <= lower_pct <= upper_pct <= 100``.
    """
    # Inverted percentiles would yield lower > upper, which the width guard
    # below would misreport as a "constant" column; reject them up front.
    if not (0.0 <= lower_pct <= upper_pct <= 100.0):
        raise ValueError(
            "Percentiles must satisfy 0 <= lower_pct <= upper_pct <= 100; "
            f"got lower_pct={lower_pct!r}, upper_pct={upper_pct!r}"
        )

    min_accept_samples = 5
    thresholds = {}
    for col in accept_df.columns:
        accept_vals = _finite_values(accept_df[col])
        # Too few clean samples: not calibratable, so the column is left out
        # of the result before any further work is done for it.
        if len(accept_vals) < min_accept_samples:
            continue

        t = SQIThreshold(sqi_name=col, n_accept=len(accept_vals))

        if reject_df is not None and col in reject_df.columns:
            reject_vals = _finite_values(reject_df[col])
        else:
            reject_vals = np.array([])
        t.n_reject = len(reject_vals)
        if len(reject_vals) > 0:
            t.reject_median = float(np.median(reject_vals))

        t.accept_median = float(np.median(accept_vals))
        t.accept_std = float(np.std(accept_vals))  # population std (ddof=0)
        lower = float(np.percentile(accept_vals, lower_pct))
        upper = float(np.percentile(accept_vals, upper_pct))

        # Guard: constant or near-constant column
        if upper - lower < _EPSILON:
            half = max(abs(t.accept_median) * 0.1, _EPSILON)
            lower = t.accept_median - half
            upper = t.accept_median + half
            t.note = "Constant/near-constant: band widened by epsilon"
            warnings.warn(
                f"SQI '{col}' has near-zero variance in accept distribution. "
                f"Widened band to ({lower:.4g}, {upper:.4g}).",
                stacklevel=2,
            )

        # Guard: reject distribution overlaps — note it but keep clean bounds.
        # An existing note (constant-column widening) takes precedence.
        if len(reject_vals) > 0:
            reject_lower = float(np.percentile(reject_vals, lower_pct))
            reject_upper = float(np.percentile(reject_vals, upper_pct))
            overlap = not (upper < reject_lower or lower > reject_upper)
            if overlap and not t.note:
                # Label the ranges with the percentiles actually used
                # ("p5-p95" for the defaults).
                pct_label = f"p{lower_pct:g}-p{upper_pct:g}"
                t.note = (
                    f"Accept/reject distributions overlap. "
                    f"Accept {pct_label}: ({lower:.3g}, {upper:.3g}), "
                    f"Reject {pct_label}: ({reject_lower:.3g}, {reject_upper:.3g}). "
                    f"Consider using a more discriminative SQI or tighter noise conditions."
                )

        t.lower = lower
        t.upper = upper
        t.calibrated = True
        thresholds[col] = t
    return thresholds


def _finite_values(series: pd.Series) -> np.ndarray:
    """Return the finite numeric entries of *series* as a float array."""
    # Entries that cannot be parsed as numbers become NaN and are dropped
    # together with genuine NaN and ±inf values.
    numeric = pd.to_numeric(series, errors="coerce")
    finite = numeric.replace([np.inf, -np.inf], np.nan).dropna()
    return finite.to_numpy(dtype=float)
[docs]
def thresholds_to_dataframe(thresholds: dict) -> pd.DataFrame:
    """
    Convert a thresholds dict to a summary DataFrame for inspection.

    Parameters
    ----------
    thresholds : dict
        Output of :func:`estimate_thresholds`. May be empty.

    Returns
    -------
    pd.DataFrame
        One row per SQI, indexed by ``sqi_name``, with columns: lower, upper,
        accept_median, accept_std, reject_median, n_accept, n_reject,
        calibrated, note. An empty input yields an empty frame with the
        same columns and index name.
    """
    columns = (
        "sqi_name",
        "lower",
        "upper",
        "accept_median",
        "accept_std",
        "reject_median",
        "n_accept",
        "n_reject",
        "calibrated",
        "note",
    )
    rows = [
        {name: getattr(t, name) for name in columns}
        for t in thresholds.values()
    ]
    # Naming the columns explicitly keeps the schema when there are no rows;
    # without it an empty frame has no "sqi_name" column and set_index fails.
    return pd.DataFrame(rows, columns=list(columns)).set_index("sqi_name")