Automatic Threshold Tuning¶
This document describes the automatic threshold adjustment system based on data characteristics.
Overview¶
The threshold tuning system implemented in src/truthound/profiler/auto_threshold.py automatically determines optimal validation thresholds by analyzing data distributions.
TuningStrategy¶
class TuningStrategy(str, Enum):
"""Threshold tuning strategies"""
CONSERVATIVE = "conservative" # Conservative (strict thresholds, fewer false positives)
BALANCED = "balanced" # Balanced (precision and recall balance)
PERMISSIVE = "permissive" # Permissive (loose thresholds, fewer false negatives)
ADAPTIVE = "adaptive" # Adaptive (learns from data)
STATISTICAL = "statistical" # Statistical (confidence interval based)
DOMAIN_AWARE = "domain_aware" # Domain-aware (applies domain knowledge)
ColumnThresholds¶
Per-column threshold configuration.
@dataclass
class ColumnThresholds:
"""Per-column thresholds"""
column_name: str
null_threshold: float = 0.0 # Maximum allowed null ratio
uniqueness_threshold: float | None = None # Minimum unique ratio
min_value: float | None = None # Minimum value
max_value: float | None = None # Maximum value
min_length: int | None = None # Minimum length
max_length: int | None = None # Maximum length
pattern_match_threshold: float = 0.8 # Minimum pattern match ratio
allowed_values: set[Any] | None = None # Allowed value set
outlier_threshold: float = 0.01 # Outlier ratio
confidence: float = 0.5 # Threshold confidence
reasoning: list[str] = field(default_factory=list) # Tuning reasoning
TableThresholds¶
Table-level threshold collection.
@dataclass
class TableThresholds:
"""Table thresholds"""
table_name: str
columns: dict[str, ColumnThresholds] = field(default_factory=dict)
duplicate_threshold: float = 0.0 # Allowed duplicate row ratio
row_count_min: int | None = None # Minimum row count
row_count_max: int | None = None # Maximum row count
global_null_threshold: float = 0.1 # Global null ratio
strategy_used: TuningStrategy = TuningStrategy.BALANCED
tuned_at: datetime = field(default_factory=datetime.now)
metadata: dict[str, Any] = field(default_factory=dict)
def get_column(self, name: str) -> ColumnThresholds | None:
"""Retrieve per-column thresholds"""
return self.columns.get(name)
ThresholdTuner¶
Automatic threshold tuning class. Strategy can be specified as a string or TuningStrategy enum.
from truthound.profiler.auto_threshold import ThresholdTuner, TuningStrategy
# Specify strategy as string (recommended)
tuner = ThresholdTuner(strategy="balanced")
# Or use Enum
tuner = ThresholdTuner(strategy=TuningStrategy.BALANCED)
# Tune thresholds from profile
thresholds = tuner.tune(profile)
# Check per-column thresholds
for col_name, col_thresholds in thresholds.columns.items():
print(f"{col_name}:")
print(f" null_threshold: {col_thresholds.null_threshold:.2%}")
print(f" min_value: {col_thresholds.min_value}")
print(f" max_value: {col_thresholds.max_value}")
print(f" confidence: {col_thresholds.confidence:.2%}")
Strategy-Specific Behavior¶
CONSERVATIVE - Conservative¶
tuner = ThresholdTuner(strategy=TuningStrategy.CONSERVATIVE)
thresholds = tuner.tune(profile)
# Characteristics:
# - Strict ranges (mean ± 2σ)
# - Low null tolerance
# - High pattern matching requirements
BALANCED - Balanced¶
tuner = ThresholdTuner(strategy=TuningStrategy.BALANCED)
thresholds = tuner.tune(profile)
# Characteristics:
# - Moderate ranges (mean ± 3σ)
# - Reasonable null tolerance
# - Moderate pattern matching requirements
PERMISSIVE - Permissive¶
tuner = ThresholdTuner(strategy=TuningStrategy.PERMISSIVE)
thresholds = tuner.tune(profile)
# Characteristics:
# - Wide ranges (mean ± 4σ)
# - High null tolerance
# - Low pattern matching requirements
ADAPTIVE - Adaptive¶
Automatically selects strategy by analyzing data distribution.
tuner = ThresholdTuner(strategy=TuningStrategy.ADAPTIVE)
thresholds = tuner.tune(profile)
# Internal logic:
# - Analyze data variability
# - Analyze outlier ratio
# - Analyze distribution shape
# → Automatic optimal strategy selection
STATISTICAL - Statistical¶
Threshold setting based on IQR, percentiles, and Wilson confidence intervals.
# StatisticalStrategy sets parameters in constructor
from truthound.profiler.auto_threshold import StatisticalStrategy, ThresholdTuner
# Create statistical strategy directly
stat_strategy = StatisticalStrategy(
percentile_low=0.01, # 1st percentile
percentile_high=0.99, # 99th percentile
iqr_multiplier=1.5, # IQR multiplier
)
tuner = ThresholdTuner(strategy=stat_strategy)
thresholds = tuner.tune(profile)
# Sets range/null thresholds based on IQR and Wilson CI
DOMAIN_AWARE - Domain-Aware¶
Automatically applies domain knowledge per DataType. Uses built-in DOMAIN_DEFAULTS.
from truthound.profiler.auto_threshold import ThresholdTuner
# DomainAwareStrategy uses per-DataType defaults
# - EMAIL: null_threshold=0.1, pattern_threshold=0.95, min_length=5, max_length=254
# - PHONE: null_threshold=0.2, pattern_threshold=0.9, min_length=7, max_length=20
# - UUID: null_threshold=0.0, uniqueness_threshold=1.0, min_length=36, max_length=36
# - PERCENTAGE: min_value=0.0, max_value=100.0
# Also supports DataType.CURRENCY, BOOLEAN, KOREAN_PHONE, KOREAN_RRN, etc.
tuner = ThresholdTuner(strategy="domain_aware")
thresholds = tuner.tune(profile)
IQR-Based Analysis¶
Outlier detection using Interquartile Range (IQR).
def _compute_iqr_bounds(self, profile: ColumnProfile) -> tuple[float, float]:
"""Calculate IQR-based bounds"""
q1 = profile.quantiles.get(0.25, profile.min_value)
q3 = profile.quantiles.get(0.75, profile.max_value)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
return lower, upper
Percentile-Based Analysis¶
def _compute_percentile_bounds(
self,
profile: ColumnProfile,
lower_pct: float = 0.01,
upper_pct: float = 0.99,
) -> tuple[float, float]:
"""Calculate percentile-based bounds"""
lower = profile.quantiles.get(lower_pct, profile.min_value)
upper = profile.quantiles.get(upper_pct, profile.max_value)
return lower, upper
Quick Tuning¶
from truthound.profiler.auto_threshold import tune_thresholds
from truthound.profiler.base import Strictness
# Convenience function - specify strategy and strictness
thresholds = tune_thresholds(
profile,
strategy="adaptive",
strictness=Strictness.MEDIUM, # LOOSE, MEDIUM, STRICT
)
# Table-level thresholds
print(f"Duplicate threshold: {thresholds.duplicate_threshold:.2%}")
print(f"Global null threshold: {thresholds.global_null_threshold:.2%}")
# Per-column thresholds
for col_name, col_thresh in thresholds.columns.items():
print(f"{col_name}: null <= {col_thresh.null_threshold:.1%}")
A/B Testing¶
from truthound.profiler.auto_threshold import ThresholdTester, ThresholdTuner
import polars as pl
tester = ThresholdTester()
# Compare two threshold configurations
tuner_a = ThresholdTuner(strategy="conservative")
tuner_b = ThresholdTuner(strategy="permissive")
threshold_a = tuner_a.tune(profile)
threshold_b = tuner_b.tune(profile)
# Test with DataFrame
df = pl.read_csv("test_data.csv")
result = tester.compare(
data=df,
threshold_a=threshold_a,
threshold_b=threshold_b,
)
print(f"Recommendation: {result.recommendation}")
print(f"Violations A: {result.violations_a}")
print(f"Violations B: {result.violations_b}")
Threshold Export¶
import json
# Save as JSON
with open("thresholds.json", "w") as f:
json.dump(thresholds.to_dict(), f, indent=2)
# Save as YAML
import yaml
with open("thresholds.yaml", "w") as f:
yaml.dump(thresholds.to_dict(), f)
CLI Usage¶
# Automatic threshold tuning
th tune-thresholds profile.json -o thresholds.yaml
# Specify strategy
th tune-thresholds profile.json -o thresholds.yaml --strategy statistical
# Specify confidence level
th tune-thresholds profile.json -o thresholds.yaml --strategy statistical --confidence 0.99
# Generate rules with tuned thresholds
th generate-suite profile.json -o rules.yaml --thresholds thresholds.yaml
Integration Example¶
from truthound.profiler import TableProfiler, generate_suite
from truthound.profiler.auto_threshold import ThresholdTuner, TuningStrategy
# Profiling
profiler = TableProfiler()
profile = profiler.profile_file("data.csv")
# Threshold tuning
tuner = ThresholdTuner(strategy=TuningStrategy.ADAPTIVE)
thresholds = tuner.tune(profile)
# Generate rules with tuned thresholds
suite = generate_suite(
profile,
thresholds=thresholds,
)
# Save
save_suite(suite, "rules.yaml", format="yaml")
Next Steps¶
- Quality Scoring - Evaluate tuned threshold quality
- Rule Generation - Threshold-based rule generation