Profiler Configuration¶
The Truthound profiler analyzes data to automatically generate validation rules.
Quick Start¶
from truthound.profiler import profile, generate_suite
# Profile data
profile_result = profile(df)
# Generate validation suite
suite = generate_suite(df, strictness="medium")
SuiteGeneratorConfig¶
Main configuration for suite generation.
from truthound.profiler.suite_config import SuiteGeneratorConfig
config = SuiteGeneratorConfig(
name="my_suite",
strictness="medium", # loose, medium, strict
categories=CategoryConfig(),
confidence=ConfidenceConfig(),
output=OutputConfig(),
generators=GeneratorConfig(),
custom_options={},
)
CategoryConfig¶
Control which validator categories to include.
from truthound.profiler.suite_config import CategoryConfig
config = CategoryConfig(
include=[], # Categories to include (empty = all)
exclude=[], # Categories to exclude
priority_order=[], # Preferred ordering
)
# Check if category should be included
if config.should_include("completeness"):
# Include completeness validators
pass
Available Categories:
- `schema` - Schema validation
- `completeness` - Null/missing values
- `uniqueness` - Unique constraints
- `distribution` - Value distributions
- `format` - String formats
- `pattern` - Regex patterns
- `range` - Value ranges
- `consistency` - Cross-column consistency
ConfidenceConfig¶
Configure confidence level filtering.
from truthound.profiler.suite_config import ConfidenceConfig
config = ConfidenceConfig(
min_level="low", # low, medium, high
include_rationale=True, # Include rationale in output
show_in_output=True, # Show confidence level
)
| Level | Description |
|---|---|
| `low` | All detected rules |
| `medium` | Moderate confidence |
| `high` | High confidence only |
OutputConfig¶
Configure output format and style.
from truthound.profiler.suite_config import OutputConfig
config = OutputConfig(
format="yaml", # yaml, json, python, toml, checkpoint
include_metadata=True,
include_summary=True,
include_description=True,
group_by_category=False,
sort_rules=True,
indent=2,
# Python-specific options
code_style="functional", # functional, class_based, declarative
include_docstrings=True,
include_type_hints=True,
max_line_length=88,
)
Output Formats:
| Format | Description |
|---|---|
| `yaml` | YAML configuration file |
| `json` | JSON configuration file |
| `python` | Python code |
| `toml` | TOML configuration file |
| `checkpoint` | Checkpoint configuration |
Python Code Styles:
| Style | Description |
|---|---|
| `functional` | Function-based validators |
| `class_based` | Class-based validators |
| `declarative` | Declarative definitions |
GeneratorConfig¶
Configure rule generators.
from truthound.profiler.suite_config import GeneratorConfig
config = GeneratorConfig(
mode="full", # full, fast, custom
enabled_generators=[], # Generators to enable
disabled_generators=[], # Generators to disable
generator_options={}, # Generator-specific options
)
# Check if generator should be used
if config.should_use_generator("schema"):
# Use schema generator
pass
# Get generator options
opts = config.get_generator_options("completeness")
Generator Modes:
| Mode | Description |
|---|---|
| `full` | All generators |
| `fast` | Quick analysis only |
| `custom` | Manual selection |
Configuration Presets¶
from truthound.profiler.suite_config import (
SuiteGeneratorConfig,
ConfigPreset,
)
# Use preset
config = SuiteGeneratorConfig.from_preset(ConfigPreset.DEFAULT)
config = SuiteGeneratorConfig.from_preset(ConfigPreset.STRICT)
config = SuiteGeneratorConfig.from_preset(ConfigPreset.PRODUCTION)
Available Presets:
| Preset | Strictness | Key Settings |
|---|---|---|
| `DEFAULT` | medium | Standard, low confidence |
| `STRICT` | strict | Group by category, medium confidence |
| `LOOSE` | loose | Permissive rules |
| `MINIMAL` | loose | Schema only, high confidence, fast |
| `COMPREHENSIVE` | strict | All categories, low confidence |
| `SCHEMA_ONLY` | medium | Schema + completeness only |
| `FORMAT_ONLY` | medium | Format + pattern only |
| `CI_CD` | medium | Checkpoint output format |
| `DEVELOPMENT` | loose | Python functional style |
| `PRODUCTION` | strict | Grouped, high confidence |
Loading Configuration¶
from truthound.profiler.suite_config import (
load_config,
save_config,
SuiteGeneratorConfig,
)
# Load from file
config = load_config("suite_config.yaml")
config = load_config("suite_config.json")
# Load from environment
config = SuiteGeneratorConfig.from_env(prefix="TRUTHOUND_SUITE")
# Load from dictionary
config = SuiteGeneratorConfig.from_dict({
"strictness": "strict",
"categories": {"include": ["schema", "completeness"]},
})
# Clone with overrides
new_config = config.with_overrides(strictness="loose")
# Save to file
save_config(config, "suite_config.yaml")
Environment Variables:
export TRUTHOUND_SUITE_STRICTNESS=strict
export TRUTHOUND_SUITE_MIN_CONFIDENCE=medium
export TRUTHOUND_SUITE_FORMAT=yaml
export TRUTHOUND_SUITE_INCLUDE_CATEGORIES=schema,completeness
export TRUTHOUND_SUITE_EXCLUDE_CATEGORIES=pattern
Scheduling Configuration¶
Profile Triggers¶
from truthound.profiler.scheduling.triggers import (
CronTrigger,
IntervalTrigger,
DataChangeTrigger,
EventTrigger,
CompositeTrigger,
AlwaysTrigger,
ManualTrigger,
)
# Cron-based scheduling
trigger = CronTrigger("0 2 * * *") # Daily at 2 AM
trigger = CronTrigger("0 */6 * * *") # Every 6 hours
trigger = CronTrigger("0 0 * * 0", timezone="UTC") # Weekly
# Interval-based
trigger = IntervalTrigger(hours=6)
trigger = IntervalTrigger(minutes=30)
trigger = IntervalTrigger(days=1, hours=2)
# Data change trigger
trigger = DataChangeTrigger(
change_threshold=0.05, # 5% change threshold
change_type="row_count", # row_count, schema, hash
min_interval_seconds=60, # Minimum between runs
)
# Event-based
trigger = EventTrigger(event_name="profile_requested")
# Composite (combine multiple)
trigger = CompositeTrigger(
triggers=[
CronTrigger("0 2 * * *"),
DataChangeTrigger(change_threshold=0.1),
],
mode="any", # any = OR, all = AND
)
# Always/manual
trigger = AlwaysTrigger() # Always run
trigger = ManualTrigger() # Only manual
Profile Storage¶
from truthound.profiler.scheduling.storage import (
InMemoryProfileStorage,
FileProfileStorage,
)
# In-memory (development/testing)
storage = InMemoryProfileStorage(max_profiles=100)
# File-based (production)
storage = FileProfileStorage(
base_path="./profiles",
max_profiles=100,
compress=False, # Enable gzip
)
# Storage operations
storage.save(profile, metadata={"source": "scheduled"})
last_profile = storage.get_last_profile()
last_run = storage.get_last_run_time()
profiles = storage.list_profiles(limit=10)
SchedulerConfig¶
from truthound.profiler.scheduling.scheduler import SchedulerConfig
config = SchedulerConfig(
enable_incremental=True, # Incremental profiling
compute_data_hash=True, # Track data changes
save_history=True, # Save profile history
on_profile_complete=None, # Callback on completion
on_profile_skip=None, # Callback on skip
max_history_age_days=30, # History retention
context_providers=[], # Context provider functions
)
IncrementalProfileScheduler¶
from truthound.profiler.scheduling.scheduler import (
IncrementalProfileScheduler,
create_scheduler,
)
# Direct construction
scheduler = IncrementalProfileScheduler(
trigger=CronTrigger("0 2 * * *"),
storage=FileProfileStorage("./profiles"),
config=SchedulerConfig(
enable_incremental=True,
compute_data_hash=True,
),
)
# Factory function
scheduler = create_scheduler(
trigger_type="interval", # interval, cron, manual, always
storage_type="file", # memory, file
hours=1,
storage_path="./profiles",
enable_incremental=True,
)
# Run if trigger condition met
profile = scheduler.run_if_needed(data)
# Force run
profile = scheduler.run(data, incremental=True)
# Get next scheduled run
next_run = scheduler.get_next_run_time()
# Get run history
history = scheduler.get_run_history(limit=10)
# Get metrics
metrics = scheduler.get_metrics()
print(f"Total runs: {metrics.total_runs}")
print(f"Incremental: {metrics.incremental_runs}")
print(f"Skipped: {metrics.skipped_runs}")
Schema Evolution Configuration¶
SchemaEvolutionDetector¶
from truthound.profiler.evolution.detector import SchemaEvolutionDetector
from truthound.profiler.evolution.changes import (
ChangeType,
ChangeSeverity,
CompatibilityLevel,
)
detector = SchemaEvolutionDetector(
storage=storage, # Profile storage for baseline
detect_renames=True, # Detect column renames
rename_similarity_threshold=0.8, # 80% similarity for rename
)
# Detect changes
changes = detector.detect_changes(
current_schema=profile.schema,
baseline_schema=None, # None = use stored baseline
)
# Get change summary
summary = detector.get_change_summary(changes)
print(f"Breaking changes: {summary.breaking_count}")
print(f"Total changes: {summary.total_count}")
Change Types:
| Type | Description | Severity |
|---|---|---|
| `COLUMN_ADDED` | New column added | INFO |
| `COLUMN_REMOVED` | Column removed | CRITICAL |
| `COLUMN_RENAMED` | Column renamed | CRITICAL |
| `TYPE_CHANGED` | Type changed | Variable |
| `NULLABLE_CHANGED` | Nullability changed | Variable |
| `CONSTRAINT_ADDED` | New constraint | WARNING |
| `CONSTRAINT_REMOVED` | Constraint removed | WARNING |
| `DEFAULT_CHANGED` | Default value changed | INFO |
| `ORDER_CHANGED` | Column order changed | INFO |
Change Severity:
| Severity | Description |
|---|---|
| `INFO` | Non-breaking change |
| `WARNING` | Potentially breaking |
| `CRITICAL` | Breaking change |
Compatibility Levels:
| Level | Description |
|---|---|
| `FULL` | Forward and backward compatible |
| `FORWARD` | New schema can read old data |
| `BACKWARD` | Old schema can read new data |
| `NONE` | Not compatible |
Type Compatibility¶
Compatible type changes (non-breaking):
Int8 → Int16, Int32, Int64
Int16 → Int32, Int64
Int32 → Int64
UInt8 → UInt16, UInt32, UInt64
Float32 → Float64
Suite Execution Configuration¶
SuiteExecutor¶
from truthound.profiler.integration.executor import (
SuiteExecutor,
ExecutionContext,
)
executor = SuiteExecutor(
parallel=False, # Parallel execution
fail_fast=False, # Stop on first failure
max_workers=None, # Max parallel workers
timeout_seconds=None, # Execution timeout
registry=None, # Validator registry
listeners=[], # Execution listeners
progress_reporter=None, # Progress reporter
)
# Execution context
context = ExecutionContext(
parallel=False,
fail_fast=False,
max_workers=4,
timeout_seconds=300.0,
dry_run=False, # Dry run mode
)
# Execute suite
result = executor.execute(suite, data, context)
# Async execution
result = await executor.execute_async(suite, data, context)
Complete Example¶
from truthound.profiler import profile, generate_suite
from truthound.profiler.suite_config import (
SuiteGeneratorConfig,
CategoryConfig,
ConfidenceConfig,
OutputConfig,
)
from truthound.profiler.scheduling import (
IncrementalProfileScheduler,
CronTrigger,
FileProfileStorage,
SchedulerConfig,
)
from truthound.profiler.evolution import SchemaEvolutionDetector
# 1. Configure suite generation
suite_config = SuiteGeneratorConfig(
name="production_validation",
strictness="strict",
categories=CategoryConfig(
include=["schema", "completeness", "distribution"],
exclude=["pattern"],
),
confidence=ConfidenceConfig(
min_level="medium",
include_rationale=True,
),
output=OutputConfig(
format="yaml",
group_by_category=True,
include_metadata=True,
),
)
# 2. Configure scheduling
storage = FileProfileStorage("./profiles", max_profiles=100)
scheduler = IncrementalProfileScheduler(
trigger=CronTrigger("0 2 * * *"), # Daily at 2 AM
storage=storage,
config=SchedulerConfig(
enable_incremental=True,
compute_data_hash=True,
save_history=True,
max_history_age_days=30,
),
)
# 3. Configure schema evolution detection
detector = SchemaEvolutionDetector(
storage=storage,
detect_renames=True,
rename_similarity_threshold=0.8,
)
# 4. Run profiling
profile_result = scheduler.run_if_needed(data)
if profile_result:
# Check for schema changes
changes = detector.detect_changes(profile_result.schema)
if changes:
summary = detector.get_change_summary(changes)
print(f"Schema changes detected: {summary.total_count}")
for change in changes:
if change.breaking:
print(f" BREAKING: {change.description}")
# Generate validation suite
suite = generate_suite(data, config=suite_config)
print(f"Generated {len(suite.validators)} validators")
Environment Variables¶
# Suite generation
export TRUTHOUND_SUITE_STRICTNESS=medium
export TRUTHOUND_SUITE_MIN_CONFIDENCE=low
export TRUTHOUND_SUITE_FORMAT=yaml
export TRUTHOUND_SUITE_INCLUDE_CATEGORIES=schema,completeness
# Scheduling
export TRUTHOUND_PROFILE_INTERVAL_HOURS=6
export TRUTHOUND_PROFILE_STORAGE_PATH=./profiles
export TRUTHOUND_PROFILE_MAX_HISTORY_DAYS=30
# Schema evolution
export TRUTHOUND_DETECT_RENAMES=true
export TRUTHOUND_RENAME_THRESHOLD=0.8