Profiler Configuration

The Truthound profiler analyzes data to generate validation rules automatically.

Quick Start

from truthound.profiler import profile, generate_suite

# Profile data
profile_result = profile(df)

# Generate validation suite
suite = generate_suite(df, strictness="medium")

SuiteGeneratorConfig

Main configuration for suite generation.

from truthound.profiler.suite_config import (
    SuiteGeneratorConfig,
    CategoryConfig,
    ConfidenceConfig,
    OutputConfig,
    GeneratorConfig,
)

config = SuiteGeneratorConfig(
    name="my_suite",
    strictness="medium",                # loose, medium, strict
    categories=CategoryConfig(),
    confidence=ConfidenceConfig(),
    output=OutputConfig(),
    generators=GeneratorConfig(),
    custom_options={},
)

CategoryConfig

Control which validator categories to include.

from truthound.profiler.suite_config import CategoryConfig

config = CategoryConfig(
    include=[],                         # Categories to include (empty = all)
    exclude=[],                         # Categories to exclude
    priority_order=[],                  # Preferred ordering
)

# Check if category should be included
if config.should_include("completeness"):
    # Include completeness validators
    pass

Available Categories:

- schema - Schema validation
- completeness - Null/missing values
- uniqueness - Unique constraints
- distribution - Value distributions
- format - String formats
- pattern - Regex patterns
- range - Value ranges
- consistency - Cross-column consistency

ConfidenceConfig

Configure confidence level filtering.

from truthound.profiler.suite_config import ConfidenceConfig

config = ConfidenceConfig(
    min_level="low",                    # low, medium, high
    include_rationale=True,             # Include rationale in output
    show_in_output=True,                # Show confidence level
)

| Level | Description |
|-------|-------------|
| `low` | All detected rules |
| `medium` | Rules with medium or high confidence |
| `high` | High-confidence rules only |

OutputConfig

Configure output format and style.

from truthound.profiler.suite_config import OutputConfig

config = OutputConfig(
    format="yaml",                      # yaml, json, python, toml, checkpoint
    include_metadata=True,
    include_summary=True,
    include_description=True,
    group_by_category=False,
    sort_rules=True,
    indent=2,

    # Python-specific options
    code_style="functional",            # functional, class_based, declarative
    include_docstrings=True,
    include_type_hints=True,
    max_line_length=88,
)

Output Formats:

| Format | Description |
|--------|-------------|
| `yaml` | YAML configuration file |
| `json` | JSON configuration file |
| `python` | Python code |
| `toml` | TOML configuration file |
| `checkpoint` | Checkpoint configuration |

Python Code Styles:

| Style | Description |
|-------|-------------|
| `functional` | Function-based validators |
| `class_based` | Class-based validators |
| `declarative` | Declarative definitions |

GeneratorConfig

Configure rule generators.

from truthound.profiler.suite_config import GeneratorConfig

config = GeneratorConfig(
    mode="full",                        # full, fast, custom
    enabled_generators=[],              # Generators to enable
    disabled_generators=[],             # Generators to disable
    generator_options={},               # Generator-specific options
)

# Check if generator should be used
if config.should_use_generator("schema"):
    # Use schema generator
    pass

# Get generator options
opts = config.get_generator_options("completeness")

Generator Modes:

| Mode | Description |
|------|-------------|
| `full` | All generators |
| `fast` | Quick analysis only |
| `custom` | Manual selection |

Configuration Presets

from truthound.profiler.suite_config import (
    SuiteGeneratorConfig,
    ConfigPreset,
)

# Use preset
config = SuiteGeneratorConfig.from_preset(ConfigPreset.DEFAULT)
config = SuiteGeneratorConfig.from_preset(ConfigPreset.STRICT)
config = SuiteGeneratorConfig.from_preset(ConfigPreset.PRODUCTION)

Available Presets:

| Preset | Strictness | Key Settings |
|--------|------------|--------------|
| `DEFAULT` | medium | Standard, low confidence |
| `STRICT` | strict | Group by category, medium confidence |
| `LOOSE` | loose | Permissive rules |
| `MINIMAL` | loose | Schema only, high confidence, fast |
| `COMPREHENSIVE` | strict | All categories, low confidence |
| `SCHEMA_ONLY` | medium | Schema + completeness only |
| `FORMAT_ONLY` | medium | Format + pattern only |
| `CI_CD` | medium | Checkpoint output format |
| `DEVELOPMENT` | loose | Python functional style |
| `PRODUCTION` | strict | Grouped, high confidence |

Loading Configuration

from truthound.profiler.suite_config import (
    load_config,
    save_config,
    SuiteGeneratorConfig,
)

# Load from file
config = load_config("suite_config.yaml")
config = load_config("suite_config.json")

# Load from environment
config = SuiteGeneratorConfig.from_env(prefix="TRUTHOUND_SUITE")

# Load from dictionary
config = SuiteGeneratorConfig.from_dict({
    "strictness": "strict",
    "categories": {"include": ["schema", "completeness"]},
})

# Clone with overrides
new_config = config.with_overrides(strictness="loose")

# Save to file
save_config(config, "suite_config.yaml")

Environment Variables:

export TRUTHOUND_SUITE_STRICTNESS=strict
export TRUTHOUND_SUITE_MIN_CONFIDENCE=medium
export TRUTHOUND_SUITE_FORMAT=yaml
export TRUTHOUND_SUITE_INCLUDE_CATEGORIES=schema,completeness
export TRUTHOUND_SUITE_EXCLUDE_CATEGORIES=pattern

Scheduling Configuration

Profile Triggers

from truthound.profiler.scheduling.triggers import (
    CronTrigger,
    IntervalTrigger,
    DataChangeTrigger,
    EventTrigger,
    CompositeTrigger,
    AlwaysTrigger,
    ManualTrigger,
)

# Cron-based scheduling
trigger = CronTrigger("0 2 * * *")              # Daily at 2 AM
trigger = CronTrigger("0 */6 * * *")            # Every 6 hours
trigger = CronTrigger("0 0 * * 0", timezone="UTC")  # Weekly

# Interval-based
trigger = IntervalTrigger(hours=6)
trigger = IntervalTrigger(minutes=30)
trigger = IntervalTrigger(days=1, hours=2)

# Data change trigger
trigger = DataChangeTrigger(
    change_threshold=0.05,              # 5% change threshold
    change_type="row_count",            # row_count, schema, hash
    min_interval_seconds=60,            # Minimum seconds between runs
)

# Event-based
trigger = EventTrigger(event_name="profile_requested")

# Composite (combine multiple)
trigger = CompositeTrigger(
    triggers=[
        CronTrigger("0 2 * * *"),
        DataChangeTrigger(change_threshold=0.1),
    ],
    mode="any",                         # any = OR, all = AND
)

# Always/manual
trigger = AlwaysTrigger()               # Always run
trigger = ManualTrigger()               # Only manual

Profile Storage

from truthound.profiler.scheduling.storage import (
    InMemoryProfileStorage,
    FileProfileStorage,
)

# In-memory (development/testing)
storage = InMemoryProfileStorage(max_profiles=100)

# File-based (production)
storage = FileProfileStorage(
    base_path="./profiles",
    max_profiles=100,
    compress=False,                     # Set to True to enable gzip compression
)

# Storage operations
storage.save(profile, metadata={"source": "scheduled"})
last_profile = storage.get_last_profile()
last_run = storage.get_last_run_time()
profiles = storage.list_profiles(limit=10)

SchedulerConfig

from truthound.profiler.scheduling.scheduler import SchedulerConfig

config = SchedulerConfig(
    enable_incremental=True,            # Incremental profiling
    compute_data_hash=True,             # Track data changes
    save_history=True,                  # Save profile history
    on_profile_complete=None,           # Callback on completion
    on_profile_skip=None,               # Callback on skip
    max_history_age_days=30,            # History retention
    context_providers=[],               # Context provider functions
)

IncrementalProfileScheduler

from truthound.profiler.scheduling.scheduler import (
    IncrementalProfileScheduler,
    create_scheduler,
)

# Direct construction
scheduler = IncrementalProfileScheduler(
    trigger=CronTrigger("0 2 * * *"),
    storage=FileProfileStorage("./profiles"),
    config=SchedulerConfig(
        enable_incremental=True,
        compute_data_hash=True,
    ),
)

# Factory function
scheduler = create_scheduler(
    trigger_type="interval",            # interval, cron, manual, always
    storage_type="file",                # memory, file
    hours=1,
    storage_path="./profiles",
    enable_incremental=True,
)

# Run if trigger condition met
profile = scheduler.run_if_needed(data)

# Force run
profile = scheduler.run(data, incremental=True)

# Get next scheduled run
next_run = scheduler.get_next_run_time()

# Get run history
history = scheduler.get_run_history(limit=10)

# Get metrics
metrics = scheduler.get_metrics()
print(f"Total runs: {metrics.total_runs}")
print(f"Incremental: {metrics.incremental_runs}")
print(f"Skipped: {metrics.skipped_runs}")

Schema Evolution Configuration

SchemaEvolutionDetector

from truthound.profiler.evolution.detector import SchemaEvolutionDetector
from truthound.profiler.evolution.changes import (
    ChangeType,
    ChangeSeverity,
    CompatibilityLevel,
)

detector = SchemaEvolutionDetector(
    storage=storage,                    # Profile storage for baseline
    detect_renames=True,                # Detect column renames
    rename_similarity_threshold=0.8,    # 80% similarity for rename
)

# Detect changes
changes = detector.detect_changes(
    current_schema=profile.schema,
    baseline_schema=None,               # None = use stored baseline
)

# Get change summary
summary = detector.get_change_summary(changes)
print(f"Breaking changes: {summary.breaking_count}")
print(f"Total changes: {summary.total_count}")

Change Types:

| Type | Description | Severity |
|------|-------------|----------|
| `COLUMN_ADDED` | New column added | INFO |
| `COLUMN_REMOVED` | Column removed | CRITICAL |
| `COLUMN_RENAMED` | Column renamed | CRITICAL |
| `TYPE_CHANGED` | Type changed | Variable |
| `NULLABLE_CHANGED` | Nullability changed | Variable |
| `CONSTRAINT_ADDED` | New constraint | WARNING |
| `CONSTRAINT_REMOVED` | Constraint removed | WARNING |
| `DEFAULT_CHANGED` | Default value changed | INFO |
| `ORDER_CHANGED` | Column order changed | INFO |

Change Severity:

| Severity | Description |
|----------|-------------|
| `INFO` | Non-breaking change |
| `WARNING` | Potentially breaking |
| `CRITICAL` | Breaking change |

Compatibility Levels:

| Level | Description |
|-------|-------------|
| `FULL` | Forward and backward compatible |
| `FORWARD` | New schema can read old data |
| `BACKWARD` | Old schema can read new data |
| `NONE` | Not compatible |

Type Compatibility

Compatible type changes (non-breaking):

- Int8 → Int16, Int32, Int64
- Int16 → Int32, Int64
- Int32 → Int64
- UInt8 → UInt16, UInt32, UInt64
- Float32 → Float64

Suite Execution Configuration

SuiteExecutor

from truthound.profiler.integration.executor import (
    SuiteExecutor,
    ExecutionContext,
)

executor = SuiteExecutor(
    parallel=False,                     # Parallel execution
    fail_fast=False,                    # Stop on first failure
    max_workers=None,                   # Max parallel workers
    timeout_seconds=None,               # Execution timeout
    registry=None,                      # Validator registry
    listeners=[],                       # Execution listeners
    progress_reporter=None,             # Progress reporter
)

# Execution context
context = ExecutionContext(
    parallel=False,
    fail_fast=False,
    max_workers=4,
    timeout_seconds=300.0,
    dry_run=False,                      # Dry run mode
)

# Execute suite
result = executor.execute(suite, data, context)

# Async execution (must be awaited from within an async function)
result = await executor.execute_async(suite, data, context)

Complete Example

from truthound.profiler import profile, generate_suite
from truthound.profiler.suite_config import (
    SuiteGeneratorConfig,
    CategoryConfig,
    ConfidenceConfig,
    OutputConfig,
)
from truthound.profiler.scheduling import (
    IncrementalProfileScheduler,
    CronTrigger,
    FileProfileStorage,
    SchedulerConfig,
)
from truthound.profiler.evolution import SchemaEvolutionDetector

# 1. Configure suite generation
suite_config = SuiteGeneratorConfig(
    name="production_validation",
    strictness="strict",
    categories=CategoryConfig(
        include=["schema", "completeness", "distribution"],
        exclude=["pattern"],
    ),
    confidence=ConfidenceConfig(
        min_level="medium",
        include_rationale=True,
    ),
    output=OutputConfig(
        format="yaml",
        group_by_category=True,
        include_metadata=True,
    ),
)

# 2. Configure scheduling
storage = FileProfileStorage("./profiles", max_profiles=100)

scheduler = IncrementalProfileScheduler(
    trigger=CronTrigger("0 2 * * *"),   # Daily at 2 AM
    storage=storage,
    config=SchedulerConfig(
        enable_incremental=True,
        compute_data_hash=True,
        save_history=True,
        max_history_age_days=30,
    ),
)

# 3. Configure schema evolution detection
detector = SchemaEvolutionDetector(
    storage=storage,
    detect_renames=True,
    rename_similarity_threshold=0.8,
)

# 4. Run profiling
profile_result = scheduler.run_if_needed(data)

if profile_result:
    # Check for schema changes
    changes = detector.detect_changes(profile_result.schema)

    if changes:
        summary = detector.get_change_summary(changes)
        print(f"Schema changes detected: {summary.total_count}")

        for change in changes:
            if change.breaking:
                print(f"  BREAKING: {change.description}")

    # Generate validation suite
    suite = generate_suite(data, config=suite_config)
    print(f"Generated {len(suite.validators)} validators")

Environment Variables

# Suite generation
export TRUTHOUND_SUITE_STRICTNESS=medium
export TRUTHOUND_SUITE_MIN_CONFIDENCE=low
export TRUTHOUND_SUITE_FORMAT=yaml
export TRUTHOUND_SUITE_INCLUDE_CATEGORIES=schema,completeness

# Scheduling
export TRUTHOUND_PROFILE_INTERVAL_HOURS=6
export TRUTHOUND_PROFILE_STORAGE_PATH=./profiles
export TRUTHOUND_PROFILE_MAX_HISTORY_DAYS=30

# Schema evolution
export TRUTHOUND_DETECT_RENAMES=true
export TRUTHOUND_RENAME_THRESHOLD=0.8