Caching & Incremental Profiling

This document describes the profile result caching and incremental profiling system.

Overview

The caching system, implemented in src/truthound/profiler/caching.py, stores profile results keyed by file fingerprints so that unchanged files do not have to be re-profiled.

CacheKey Protocol

from typing import Protocol

class CacheKey(Protocol):
    """Cache key protocol"""

    def to_string(self) -> str:
        """Convert cache key to string"""
        ...

    def __hash__(self) -> int:
        ...

    def __eq__(self, other: object) -> bool:
        ...
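
Any object that satisfies this protocol can act as a key. A minimal sketch of a custom key, e.g. one for profiles derived from a SQL query rather than a file (QueryCacheKey and its field are illustrative, not part of the library):

import hashlib
from dataclasses import dataclass

@dataclass(frozen=True)
class QueryCacheKey:
    """Hypothetical key for query-derived profiles (not a truthound class)."""

    query: str

    def to_string(self) -> str:
        # Hash the query text so the key is a stable, filesystem-safe string.
        return hashlib.sha256(self.query.encode()).hexdigest()

# A frozen dataclass supplies __hash__ and __eq__, so the protocol is satisfied.
key = QueryCacheKey("SELECT * FROM users")
print(key.to_string())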

FileHashCacheKey

A file cache key based on the SHA-256 hash of the file contents.

from truthound.profiler.caching import FileHashCacheKey

# Create cache key from file
cache_key = FileHashCacheKey.from_file("data.csv")

print(cache_key.file_path)      # data.csv
print(cache_key.file_hash)      # SHA-256 hash
print(cache_key.file_size)      # File size
print(cache_key.modified_time)  # Modification time
print(cache_key.to_string())    # Cache key string

Hash Computation

import hashlib
from pathlib import Path

# Internal implementation: the file is streamed in 8 KiB chunks and hashed with SHA-256.
def _compute_hash(self, path: Path) -> str:
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
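
A single changed byte yields a completely different digest, which is what makes the content hash a reliable change detector for cache invalidation:

import hashlib

print(hashlib.sha256(b"a,b\n1,2\n").hexdigest())
print(hashlib.sha256(b"a,b\n1,3\n").hexdigest())  # one byte differs, entirely different hash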

Cache Backends

MemoryCacheBackend

Memory-based LRU cache.

from truthound.profiler.caching import MemoryCacheBackend

cache = MemoryCacheBackend(max_size=100)

# Store
cache.set(cache_key, profile)

# Retrieve
profile = cache.get(cache_key)

# Check existence
if cache.exists(cache_key):
    print("Cache hit!")

# Delete
cache.delete(cache_key)

# Clear all
cache.clear()
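
"LRU" here means that once max_size entries are held, storing another one evicts whichever entry was touched longest ago. The policy in generic form (a sketch of the idea, not this backend's implementation):

from collections import OrderedDict

class TinyLRU:
    def __init__(self, max_size: int):
        self.max_size = max_size
        self._data: OrderedDict = OrderedDict()

    def set(self, key, value):
        self._data[key] = value
        self._data.move_to_end(key)        # mark as most recently used
        if len(self._data) > self.max_size:
            self._data.popitem(last=False)  # evict the least recently used entry

    def get(self, key):
        if key in self._data:
            self._data.move_to_end(key)
            return self._data[key]
        return None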

FileCacheBackend

Disk-based JSON cache.

from truthound.profiler.caching import FileCacheBackend

cache = FileCacheBackend(
    cache_dir=".truthound/cache",
    max_age_days=30,  # Expire after 30 days
)

# Store (auto JSON serialization)
cache.set(cache_key, profile)

# Retrieve (auto JSON deserialization)
profile = cache.get(cache_key)

RedisCacheBackend

Redis-based distributed cache.

from truthound.profiler.caching import RedisCacheBackend

cache = RedisCacheBackend(
    host="localhost",
    port=6379,
    db=0,
    prefix="truthound:profile:",
    ttl_seconds=86400,  # 24-hour TTL
)

cache.set(cache_key, profile)
profile = cache.get(cache_key)

ProfileCache

Unified cache interface.

from truthound.profiler.caching import ProfileCache

# Use file system cache
cache = ProfileCache(cache_dir=".truthound/cache")

# Compute fingerprint
fingerprint = cache.compute_fingerprint("data.csv")

# Check and use cache
if cache.exists(fingerprint):
    profile = cache.get(fingerprint)
    print("Cache hit!")
else:
    profile = profiler.profile_file("data.csv")
    cache.set(fingerprint, profile)
    print("Cache miss, computed and cached")

get_or_compute Pattern

from truthound.profiler.caching import ProfileCache

cache = ProfileCache()

# Automatic computation and storage on cache miss
profile = cache.get_or_compute(
    key=cache_key,
    compute_fn=lambda: profiler.profile_file("data.csv"),
)
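
get_or_compute folds the check/compute/store steps from the previous section into one call. The underlying pattern, written generically (a sketch that assumes get() returns None on a miss, not truthound's actual implementation):

def get_or_compute(cache, key, compute_fn):
    # Return the cached value if present; otherwise compute it, store it, and return it.
    cached = cache.get(key)
    if cached is not None:
        return cached
    value = compute_fn()
    cache.set(key, value)
    return value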

TTL (Time-To-Live)

from truthound.profiler.caching import FileCacheBackend
from datetime import timedelta

cache = FileCacheBackend(
    cache_dir=".truthound/cache",
    default_ttl=timedelta(days=7),
)

# Specify individual TTL
cache.set(cache_key, profile, ttl=timedelta(hours=1))

# Clean up expired cache
cache.cleanup_expired()
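
Expiry is decided from the entry's write time: an entry becomes stale once its age exceeds the TTL it was stored with. The check, in generic form (a sketch of the concept, not the backend's internals):

from datetime import datetime, timedelta

def is_expired(stored_at: datetime, ttl: timedelta, now: datetime | None = None) -> bool:
    # Stale once more than `ttl` has elapsed since the entry was written.
    now = now or datetime.now()
    return now - stored_at > ttl

print(is_expired(datetime.now() - timedelta(hours=2), timedelta(hours=1)))  # True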

Incremental Profiling

Incremental profiling compares a file against the previous profile and re-profiles only the parts that changed.

from truthound.profiler import IncrementalProfiler

inc_profiler = IncrementalProfiler(cache=cache)

# Initial profile
profile_v1 = inc_profiler.profile("data_v1.csv")

# Incremental update (only re-profile changed columns)
profile_v2 = inc_profiler.update(
    "data_v2.csv",
    previous=profile_v1,
)

print(f"Columns re-profiled: {profile_v2.columns_updated}")
print(f"Columns reused: {profile_v2.columns_cached}")
print(f"Time saved: {profile_v2.time_saved_ms}ms")

Cache Statistics

from truthound.profiler.caching import CacheStatistics

# Retrieve cache statistics
stats = cache.get_statistics()

print(f"Total entries: {stats.total_entries}")
print(f"Cache hits: {stats.hit_count}")
print(f"Cache misses: {stats.miss_count}")
print(f"Hit ratio: {stats.hit_ratio:.2%}")
print(f"Total size: {stats.total_size_bytes / 1024 / 1024:.2f} MB")

Cache Invalidation

# Invalidate specific key
cache.invalidate(cache_key)

# Pattern-based invalidation
cache.invalidate_pattern("data_*.csv")

# Clear all
cache.clear()

# Clean up only expired entries
cache.cleanup_expired()
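
The pattern presumably uses shell-style globbing against the cached file path; the same matching is available in the standard library for checking which entries a pattern would touch:

import fnmatch

print(fnmatch.fnmatch("data_2024.csv", "data_*.csv"))  # True
print(fnmatch.fnmatch("other.csv", "data_*.csv"))      # False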

Cache Chaining

Chain multiple cache backends together.

from truthound.profiler.caching import CacheChain, MemoryCacheBackend, FileCacheBackend

# Memory -> File system chain
cache = CacheChain([
    MemoryCacheBackend(max_size=50),      # L1: Fast memory
    FileCacheBackend(".cache"),           # L2: Persistent storage
])

# Lookup L1 -> on miss lookup L2 -> copy to L1
profile = cache.get(cache_key)
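
A chained lookup is read-through with promotion: each backend is consulted in order, and a hit is copied back into the faster tiers in front of it. Sketched generically (assuming each backend's get() returns None on a miss; this is the concept, not CacheChain's exact code):

def chain_get(backends, key):
    for i, backend in enumerate(backends):
        value = backend.get(key)
        if value is not None:
            # Promote the hit into the faster tiers ahead of this one.
            for faster in backends[:i]:
                faster.set(key, value)
            return value
    return None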

CLI Usage

# Profile with caching
th profile data.csv --cache

# Specify cache directory
th profile data.csv --cache --cache-dir .my_cache

# Ignore cache (force re-profiling)
th profile data.csv --no-cache

# View cache statistics
th cache stats

# Clear cache
th cache clear

# Clean up only expired cache
th cache cleanup

Environment Variables

Variable                     Description              Default
TRUTHOUND_CACHE_DIR          Cache directory          .truthound/cache
TRUTHOUND_CACHE_TTL_DAYS     Cache TTL (days)         30
TRUTHOUND_CACHE_MAX_SIZE_MB  Maximum cache size (MB)  1000
TRUTHOUND_REDIS_URL          Redis URL                None

Next Steps