Test Data Management
Create and maintain test data efficiently without compromising privacy or realism.
TL;DR
Test data is expensive to create and maintain. Use fixtures (static, pre-built data), factories (dynamic generation), and synthetic data (fake but realistic). Never use raw production data in tests (GDPR risk); if you must copy production data, mask sensitive fields first. Keep test databases small and fast, and seed them consistently so tests are repeatable. Use factories to generate variation quickly (100 users with different patterns). Data isolation: each test creates its own data and cleans up afterward, even on failure.
Learning Objectives
- Design efficient test data strategies
- Use fixtures and factories to generate test data
- Create realistic synthetic data
- Mask sensitive data for GDPR compliance
- Maintain test database performance
- Ensure data isolation and repeatability
Motivating Scenario
A test requires 100 realistic users, but creating them manually takes hours; a factory generates 100 users with varied attributes programmatically in seconds. A test touches production data and a user's SSN leaks; proper data masking prevents this. Tests run in parallel but share database state: Test A creates user "Alice", Test B modifies "Alice", and Test A fails. The solution is data isolation.
Core Concepts
- Test Data Strategies
- Factory Pattern (Detailed)
- Data Isolation & Cleanup
- Test Database Performance
| Strategy | Use When | Pros | Cons |
|---|---|---|---|
| Fixtures | Small, stable data | Fast, reproducible | Brittle to changes |
| Factories | Dynamic variation | Flexible, quick | Overhead of generation |
| Synthetic Data | Realistic but fake | Safe, realistic patterns | Time to generate |
| Production Snapshot | Complex real scenarios | Realistic | GDPR risk, slow |
Fixtures (Static Data)
```yaml
# users.yaml - pre-built test data
users:
  - id: 1
    name: "Alice"
    email: "alice@example.com"
    created_at: "2025-01-01T00:00:00Z"
  - id: 2
    name: "Bob"
    email: "bob@example.com"
    created_at: "2025-01-02T00:00:00Z"
```
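Fixtures live in version control and are loaded at test startup. A minimal loader sketch, using a JSON fixture for illustration (the YAML above works the same way with a YAML parser); `load_fixture` and the file layout here are illustrative, not from a specific framework:

```python
import json
import tempfile
from pathlib import Path

def load_fixture(path: Path) -> list[dict]:
    """Load pre-built test data from a checked-in fixture file."""
    with open(path) as f:
        return json.load(f)["users"]

# Illustration: write the fixture to a temp file, then load it
fixture = {"users": [
    {"id": 1, "name": "Alice", "email": "alice@example.com"},
    {"id": 2, "name": "Bob", "email": "bob@example.com"},
]}
path = Path(tempfile.mkdtemp()) / "users.json"
path.write_text(json.dumps(fixture))

users = load_fixture(path)
assert [u["name"] for u in users] == ["Alice", "Bob"]
```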
Factories (Dynamic Generation)
```python
from factory import Factory, LazyFunction, Sequence
from faker import Faker

fake = Faker()

class UserFactory(Factory):
    class Meta:
        model = User

    id = Sequence(lambda n: n)
    # LazyFunction defers the call, so every user gets fresh data;
    # `name = fake.name()` would bake one value in at class-definition time.
    name = LazyFunction(fake.name)
    email = LazyFunction(fake.email)
    created_at = LazyFunction(fake.date_time)

# Generate test users
user1 = UserFactory()                  # Auto-generated data
users = UserFactory.create_batch(100)  # 100 users instantly
```
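Under the hood, a factory is little more than a counter plus per-call defaults. A dependency-free sketch of the same idea (all names here are illustrative):

```python
import itertools

_ids = itertools.count(1)  # module-level sequence, like factory.Sequence

def user_factory(**overrides) -> dict:
    """Build a user dict with fresh defaults; keyword args pin fields."""
    uid = next(_ids)
    user = {"id": uid, "name": f"user-{uid}", "is_active": True}
    user.update(overrides)
    return user

u1 = user_factory()
u2 = user_factory(name="Bob")
assert u1["id"] != u2["id"]  # sequence advances on every call
assert u2["name"] == "Bob"   # overrides win over defaults
```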
Synthetic Data
```python
from faker import Faker

fake = Faker()

# Realistic fake data (GDPR safe)
users = [
    {
        "id": i,
        "name": fake.name(),
        "email": fake.email(),
        "phone": fake.phone_number(),
        "address": fake.address(),
        "created_at": fake.date_time(),
    }
    for i in range(100)
]
```
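The checklist below asks for reproducible seeds (same seed = same data). Faker supports this via `Faker.seed(n)`; the same principle, sketched with the standard library's `random.Random` so it runs anywhere (function and field names are illustrative):

```python
import random

def synth_users(seed: int, n: int = 5) -> list[dict]:
    """Deterministic synthetic users: the same seed yields the same data."""
    rng = random.Random(seed)
    names = ["Alice", "Bob", "Carol", "Dave"]
    return [
        {"id": i, "name": rng.choice(names), "age": rng.randint(18, 80)}
        for i in range(n)
    ]

assert synth_users(42) == synth_users(42)  # reproducible across runs
```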
Data Masking (Production Snapshot Anonymization)
```python
from faker import Faker

fake = Faker()

def mask_production_data(user: dict) -> dict:
    """Mask sensitive fields for testing."""
    return {
        "id": user["id"],
        "name": fake.name(),                        # Replace with fake name
        "email": f"test+{user['id']}@example.com",  # Anonymize email
        "ssn": None,                                # Remove SSN entirely
        "phone": fake.phone_number(),               # Replace with fake phone
        "created_at": user["created_at"],           # Keep timestamp (non-sensitive)
    }

# Load production users, mask sensitive fields
prod_users = fetch_production_users()
masked_users = [mask_production_data(u) for u in prod_users]
# Now safe to use in tests
```
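When masked data must stay consistent across tables (the same production user appears in both `users` and `orders`), random replacement breaks joins. A hashing-based sketch that maps each real value to the same fake one on every run (the function name and salt are illustrative assumptions):

```python
import hashlib

def pseudonymize_email(real_email: str, salt: str = "test-salt") -> str:
    """Deterministically map a real email to a fake one, preserving
    referential integrity without exposing the original value."""
    digest = hashlib.sha256((salt + real_email).encode()).hexdigest()[:12]
    return f"user-{digest}@example.com"

masked = pseudonymize_email("alice@corp.example")
assert masked == pseudonymize_email("alice@corp.example")  # stable mapping
assert "alice" not in masked                               # original value hidden
```

Keep the salt out of version control; anyone holding it could re-link pseudonyms to real values by hashing candidates.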
```python
# Python: factory_boy
import random
from datetime import datetime, timedelta

import factory
import pytest
from faker import Faker

fake = Faker()

class UserFactory(factory.Factory):
    """Generate test users dynamically."""
    class Meta:
        model = User

    id = factory.Sequence(lambda n: n + 1)
    name = factory.LazyFunction(fake.name)
    email = factory.LazyFunction(fake.email)
    age = factory.Faker('random_int', min=18, max=80)
    # Note: factory.Faker(...) is a declaration, not a value, so it cannot
    # be called inside a lambda; use plain `random` for derived values.
    created_at = factory.LazyFunction(
        lambda: datetime.now() - timedelta(days=random.randint(1, 365))
    )
    is_active = True

class OrderFactory(factory.Factory):
    """Generate test orders."""
    class Meta:
        model = Order

    id = factory.Sequence(lambda n: n + 1000)
    user = factory.SubFactory(UserFactory)  # Nested factory
    total = factory.Faker('pydecimal', left_digits=3, right_digits=2, positive=True)
    status = factory.Faker('random_element', elements=['pending', 'completed', 'cancelled'])
    created_at = factory.LazyFunction(datetime.now)

# Usage
user = UserFactory()                                # Single user with random data
users = UserFactory.create_batch(100)               # 100 users
user_with_custom = UserFactory(name="Bob", age=30)  # Override specific fields
order = OrderFactory()                              # Auto-generates user + order
orders = OrderFactory.create_batch(50)              # 50 orders with 50 users

# Pytest integration
@pytest.fixture
def sample_user():
    return UserFactory()

@pytest.fixture
def sample_orders():
    return OrderFactory.create_batch(10)

def test_user_validation(sample_user):
    assert sample_user.name is not None
    assert "@" in sample_user.email

def test_order_total(sample_orders):
    assert len(sample_orders) == 10
    for order in sample_orders:
        assert order.total > 0
```
```python
# Ensure test isolation: each test gets a clean slate
import pytest
from contextlib import contextmanager

@contextmanager
def database_transaction():
    """Wrap a test in a database transaction, roll back afterwards."""
    db.begin()
    try:
        yield
    finally:
        db.rollback()  # Undo all changes, even if the test fails

class TestUserCreate:
    @pytest.fixture(autouse=True)
    def setup_teardown(self):
        """Auto-run before/after each test."""
        # Setup: create a fresh schema
        db.create_all()
        yield
        # Teardown: clean up (even if the test fails)
        db.drop_all()

    def test_create_user(self):
        """Test is isolated: own database, own data."""
        user = UserFactory()
        db.add(user)
        db.commit()
        result = db.query(User).filter(User.id == user.id).first()
        assert result is not None

    def test_list_users(self):
        """This test starts with a clean database."""
        # The previous test's data is gone
        users = UserFactory.create_batch(5)
        db.add_all(users)
        db.commit()
        count = db.query(User).count()
        assert count == 5
```
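The transaction-rollback pattern above can be seen end to end with the standard library's sqlite3 and an in-memory database (a minimal sketch; `run_isolated` is an illustrative helper, not a library API):

```python
import sqlite3

def run_isolated(conn: sqlite3.Connection, work) -> None:
    """Run `work` in a transaction and always roll back, leaving the
    database exactly as it was -- even if `work` raises."""
    try:
        work(conn)
    finally:
        conn.rollback()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT)")
conn.commit()  # the schema is the shared baseline

run_isolated(conn, lambda c: c.execute("INSERT INTO users (name) VALUES ('Alice')"))

count = conn.execute("SELECT COUNT(*) FROM users").fetchone()[0]
assert count == 0  # the insert was rolled back; the next test sees a clean slate
```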
Parallel test execution: with pytest-xdist, multiple workers run tests concurrently, and each worker gets its own test database, so there is no interference.
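pytest-xdist exposes a `worker_id` fixture ("gw0", "gw1", ..., or "master" when xdist is not active); deriving a per-worker database name from it keeps workers from colliding. The naming scheme below is an assumption for illustration, not a pytest-xdist convention:

```python
def db_name_for_worker(worker_id: str) -> str:
    """One database per xdist worker; plain (non-parallel) runs use the default."""
    return "test_db" if worker_id == "master" else f"test_db_{worker_id}"

# In conftest.py this would typically back a session-scoped fixture:
#   @pytest.fixture(scope="session")
#   def test_db_name(worker_id):
#       return db_name_for_worker(worker_id)

assert db_name_for_worker("master") == "test_db"
assert db_name_for_worker("gw0") == "test_db_gw0"
```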
```python
# Keep the test database small and fast

# Bad: full production schema (~500 GB) -- slow to create, slow to query
# Good: minimal schema with only what tests need (~100 MB) -- fast to create and query

# Bad: all tests share one live database -> they queue up and run serially
# Good: in-memory database (SQLite) for unit tests
import sqlite3
db = sqlite3.connect(":memory:")
# Or: database containers (Docker) for integration tests --
# fast to spin up, easy to parallelize

# Optimize test queries
# Bad: the test creates 1,000,000 users; queries take minutes
def test_user_slow():
    users = [UserFactory() for _ in range(1_000_000)]
    db.add_all(users)
    db.commit()
    # Query is slow!

# Good: the test creates only the data it needs
def test_user():
    user = UserFactory()
    db.add(user)
    db.commit()
    # Query is instant

# Database indices in the test database:
# add the same indices as production so test query
# performance matches production behavior
```
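Index parity can even be asserted: SQLite's `EXPLAIN QUERY PLAN` reports whether a query uses an index, so a test can check that the test schema matches production's access paths. A minimal sketch (the table and index names are illustrative; exact plan text varies slightly across SQLite versions):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, email TEXT)")
conn.execute("CREATE INDEX idx_users_email ON users(email)")  # same index as production

plan = conn.execute(
    "EXPLAIN QUERY PLAN SELECT * FROM users WHERE email = ?", ("a@example.com",)
).fetchall()
plan_text = " ".join(str(row) for row in plan)
assert "idx_users_email" in plan_text  # the query really hits the index
```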
When to Use / When NOT to Use
- DO: Use Factories for Variation: Generate varied test users (100 different names, ages, emails). Catches edge cases.
- DO: Mask Production Data: Copy production schema (realistic), but replace names/emails/SSNs with fakes. GDPR safe.
- DO: Isolate Test Data: Each test creates own data, cleans up after. Tests don't interfere.
- DO: Use Fixtures for Stable Data: Reference data (countries, currencies) in fixtures. Checked into version control.
- DO: Keep Test Database Small: In-memory SQLite for unit tests, Docker containers for integration tests. Fast test runs.
- DO: Reset Database Consistently: Each test starts with same initial data. Seeding is reproducible.
- DON'T: Hardcode a single user "Alice": tests cover only one scenario; use factories for variation.
- DON'T: Copy production data as-is: SSNs and emails leak; that is a GDPR violation and invites lawsuits.
- DON'T: Share a test database between tests: Test A modifies data, Test B sees unexpected state, and tests fail intermittently.
- DON'T: Hardcode reference data inline: it drifts from production when the reference data changes.
- DON'T: Create and drop a full production-sized database for each test: that costs minutes per test.
- DON'T: Start tests from a random initial state: tests pass sometimes and fail other times.
Design Review Checklist
- Is test data generated programmatically (factories/fixtures, not manual)?
- Are factories used for variation (100s of test cases with different data)?
- Is production data never used in tests (GDPR compliance)?
- Are sensitive fields masked (no real SSNs, emails, passwords)?
- Is each test data isolated (own data, not shared)?
- Does each test clean up after itself (even on failure)?
- Is test database small (MB, not GB)?
- Is test database fast to create/destroy (seconds, not minutes)?
- Are database seeds reproducible (same seed = same data)?
- Are fixtures version-controlled (checked into git)?
- Is synthetic data realistic (matches production patterns)?
- Can tests run in parallel (database isolation)?
- Are test data patterns documented (what data for what test)?
- Is data factory maintenance easy (update in one place)?
- Are GDPR requirements enforced (no real PII in tests)?
Self-Check
- Right now, how long does it take to create test data for a new test? If > 5 minutes, that's slow.
- Can a test run in isolation (no shared state)? If not, tests are fragile.
- Do you use production data in tests? If yes, that's a GDPR risk.
- Can tests run in parallel? If not, test suite is slow.
Next Steps
- Audit current test data — How is it created? Is it manual?
- Build factories — For each major entity (User, Order, Product)
- Add Faker — Generate realistic fake data
- Implement isolation — Each test: setup fresh data, teardown after
- Optimize database — Use in-memory for unit tests
- Mask production data — If copying prod, anonymize PII
- Document patterns — What data for what test
References
- Factory Boy: Python Test Data
- Faker: Generate Fake Data
- PostgreSQL: Testing
- Testcontainers: Docker for Tests