Tokenization & Pseudonymization

// Tokenization: replace sensitive data with random token
const crypto = require('crypto');

/**
 * In-memory token vault: swaps a sensitive value for a random token and
 * keeps the token → value mapping privately in `store`.
 */
class TokenVault {
  /** Creates an empty vault. */
  constructor() {
    // In production: secure database
    this.store = new Map();
  }

  /**
   * Stores `sensitiveData` under a freshly generated random token.
   *
   * @param {*} sensitiveData - Value to protect (e.g. a card number).
   * @returns {Promise<string>} 32-character hex token. It is random, so the
   *   original value cannot be derived from it.
   */
  async tokenize(sensitiveData) {
    // 16 random bytes rendered as hex
    const token = crypto.randomBytes(16).toString('hex');

    // The mapping lives in the vault only; the caller receives just the token
    const entry = {
      data: sensitiveData,
      createdAt: new Date(),
      lastAccessed: new Date()
    };
    this.store.set(`token_${token}`, entry);

    return token;
  }

  /**
   * Looks up the original value for a token.
   *
   * @param {string} token - Token previously returned by `tokenize`.
   * @returns {Promise<*>} The value stored for that token.
   * @throws {Error} 'Token not found' when the vault has no such token.
   */
  async detokenize(token) {
    const entry = this.store.get(`token_${token}`);

    if (entry) {
      // Record the access time (for audit) before handing the value back
      entry.lastAccessed = new Date();
      return entry.data;
    }

    throw new Error('Token not found');
  }
}

// Usage: Payment Processing
/**
 * Walks through the tokenization flow for one payment: the card number goes
 * into a vault, the application-side payment record keeps only the token, and
 * the number is later looked up again from that token.
 *
 * @param {{number: *}} creditCard - Object whose `number` property is tokenized.
 * @returns {Promise<void>} Resolves once the round trip has completed.
 */
async function processPayment(creditCard) {
  const tokenVault = new TokenVault();

  // Step 1: swap the card number for a token (done once, at the payment gateway).
  // The vault now maps e.g. token_abc123 → 4111111111111111.
  const cardToken = await tokenVault.tokenize(creditCard.number);

  // Step 2: the application-side record carries the token, never the card number.
  const paymentRecord = {
    amount: 99.99,
    currency: 'USD',
    cardToken
  };

  // Step 3: later, the payment processor exchanges the token for the real number.
  const cardNumber = await tokenVault.detokenize(cardToken);
}

// Real-world flow:
// User inputs card → Browser sends to payment gateway (Stripe, Square)
// Payment gateway tokenizes → Application gets token
// Application stores token (safe for PCI compliance)
// Processor keeps vault (high security, limited access)

// Pseudonymization: deterministic hash for consistent identifiers
const crypto = require('crypto');

/**
 * Builds deterministic pseudonyms by hashing an identifier together with a
 * secret salt, so equal identifiers always map to the same pseudonym.
 */
class PseudonymBuilder {
  /**
   * @param {string} salt - Secret salt; it must stay the same across runs for
   *   pseudonyms to remain comparable.
   * @throws {TypeError} If the salt is null, undefined or an empty string.
   */
  constructor(salt) {
    // Fail fast: a missing salt (e.g. an unset environment variable) would
    // otherwise be concatenated as the text "undefined", yielding pseudonyms
    // that anyone can recompute from small identifiers.
    if (salt == null || salt === '') {
      throw new TypeError('PseudonymBuilder requires a non-empty secret salt');
    }
    this.salt = salt;
  }

  /**
   * Derives the pseudonym for an identifier.
   *
   * @param {*} identifier - Value to pseudonymize; its `toString()` form is hashed.
   * @returns {string} `pseudonym_` followed by the first 12 hex characters of
   *   SHA-256(identifier + salt).
   * @throws {TypeError} If the identifier is null or undefined.
   */
  pseudonymize(identifier) {
    if (identifier == null) {
      throw new TypeError('identifier must not be null or undefined');
    }

    // Deterministic: same input always produces same output.
    // The concatenation form is kept as-is so existing pseudonyms stay stable.
    const hash = crypto.createHash('sha256')
      .update(identifier.toString() + this.salt)
      .digest('hex');

    // Truncated to 12 hex characters to keep the pseudonym short and readable
    return `pseudonym_${hash.slice(0, 12)}`;
  }
}

// Usage: running analytics over user activity without exposing identities
const pb = new PseudonymBuilder(process.env.PSEUDONYM_SALT);

// Raw records, including the direct identifiers (name, email) to protect:
const userData = [
  { user_id: 5, name: 'Alice Johnson', email: 'alice@example.com', feature_used: 'reports' },
  { user_id: 7, name: 'Bob Smith', email: 'bob@example.com', feature_used: 'analytics' },
  { user_id: 5, name: 'Alice Johnson', email: 'alice@example.com', feature_used: 'dashboard' }
];

// Analytics-safe projection: only the pseudonym and the feature are carried
// over, so name and email never reach the output.
const safeData = userData.map(({ user_id, feature_used }) => ({
  user_pseudonym: pb.pseudonymize(user_id), // e.g. pseudonym_a3f2e1d6b5c9
  feature_used
}));

// Result:
// [
//   { user_pseudonym: 'pseudonym_a3f2e1d6b5c9', feature_used: 'reports' },
//   { user_pseudonym: 'pseudonym_c7d4e2f1a9b3', feature_used: 'analytics' },
//   { user_pseudonym: 'pseudonym_a3f2e1d6b5c9', feature_used: 'dashboard' }
// ]

// Benefits:
// - Data scientist can't see actual user identities
// - Can still correlate records (same pseudonym = same user)
// - Analytics questions answerable: "which features do users combine?"
// - Supports GDPR compliance: pseudonymized data still counts as personal data (PII), but pseudonymization is a recognized, practical safeguard

// Key properties:
// - Deterministic: user_id=5 always → same pseudonym
// - Same-input-same-output enables table joins
// - Can't reverse (can't get user_id from pseudonym without salt)

// Format-Preserving Encryption: encrypt while preserving format
// Credit card remains 16 digits, SSN remains 9 digits, etc.

/**
 * Simplified format-preserving encryption for decimal digit strings: the
 * ciphertext has the same length as the plaintext and contains only digits.
 *
 * Implementation: a keyed Feistel network whose round function is
 * HMAC-SHA256. This is an illustrative scheme, NOT the NIST FF1/FF3-1
 * standard and not security-reviewed; use a vetted FPE library in production.
 *
 * Relies on the file-level `crypto` module binding.
 */
class FPEEncryption {
  // Number of Feistel rounds. Must be even so that, with an odd-length input,
  // the two halves end up back in their original positions.
  static #ROUNDS = 10;

  /**
   * @param {string|Buffer} key - Secret key used for every round's HMAC.
   * @throws {TypeError} If the key is null, undefined or an empty string.
   */
  constructor(key) {
    if (key == null || key === '') {
      throw new TypeError('FPEEncryption requires a non-empty secret key');
    }
    this.key = key;
  }

  /**
   * Encrypts a 16-digit card number into another 16-digit string.
   * Example shape: 4111111111111111 → 5923847362912456 (illustrative value).
   * Advantage: no schema changes, the value still looks like a card number.
   * The Luhn check digit is not preserved.
   *
   * @param {string} plaintext - Exactly 16 decimal digits.
   * @returns {string} 16-digit ciphertext.
   * @throws {TypeError} If the input is not a 16-digit string.
   */
  encryptCC(plaintext) {
    FPEEncryption.#assertCardNumber(plaintext);
    return this.encrypt(plaintext);
  }

  /**
   * Decrypts a value produced by `encryptCC` back to the original number.
   *
   * @param {string} ciphertext - Exactly 16 decimal digits.
   * @returns {string} The original 16-digit card number.
   * @throws {TypeError} If the input is not a 16-digit string.
   */
  decryptCC(ciphertext) {
    FPEEncryption.#assertCardNumber(ciphertext);
    return this.decrypt(ciphertext);
  }

  /**
   * Encrypts a digit string of any length >= 2, preserving length and alphabet.
   *
   * @param {string} plaintext - Decimal digits only.
   * @returns {string} Digit string of the same length.
   * @throws {TypeError} If the input is not a string of at least 2 digits.
   */
  encrypt(plaintext) {
    return this.#feistel(plaintext, false);
  }

  /**
   * Inverse of `encrypt` under the same key.
   *
   * @param {string} ciphertext - Decimal digits only.
   * @returns {string} The original digit string.
   * @throws {TypeError} If the input is not a string of at least 2 digits.
   */
  decrypt(ciphertext) {
    return this.#feistel(ciphertext, true);
  }

  // Rejects anything that is not a string of exactly 16 decimal digits.
  static #assertCardNumber(value) {
    if (typeof value !== 'string' || !/^\d{16}$/.test(value)) {
      throw new TypeError('Card number must be a string of exactly 16 digits');
    }
  }

  // Keyed round function: HMAC over (round, total length, half) as a BigInt.
  // Round and length are mixed in so each round and each input size gets an
  // independent pseudo-random function.
  #roundValue(round, totalLength, half) {
    const digest = crypto
      .createHmac('sha256', this.key)
      .update(`${round}:${totalLength}:${half}`)
      .digest('hex');
    return BigInt(`0x${digest}`);
  }

  // Runs the Feistel network forwards (encrypt) or backwards (decrypt).
  #feistel(digits, reverse) {
    if (typeof digits !== 'string' || !/^\d{2,}$/.test(digits)) {
      throw new TypeError('Input must be a string of at least 2 decimal digits');
    }

    const rounds = FPEEncryption.#ROUNDS;
    const total = digits.length;
    const leftLen = Math.floor(total / 2);
    const rightLen = total - leftLen;
    let left = digits.slice(0, leftLen);
    let right = digits.slice(leftLen);

    for (let step = 0; step < rounds; step += 1) {
      // Decryption replays the rounds in the opposite order
      const round = reverse ? rounds - 1 - step : step;
      // The half being rewritten alternates, and so does its digit width
      const width = round % 2 === 0 ? leftLen : rightLen;
      const modulus = 10n ** BigInt(width);

      if (reverse) {
        const mask = this.#roundValue(round, total, left);
        // Double modulo keeps the result non-negative after subtraction
        const value = (((BigInt(right) - mask) % modulus) + modulus) % modulus;
        right = left;
        left = value.toString().padStart(width, '0');
      } else {
        const mask = this.#roundValue(round, total, right);
        const value = (BigInt(left) + mask) % modulus;
        left = right;
        right = value.toString().padStart(width, '0');
      }
    }

    return left + right;
  }
}

// Usage: round-trip a card number through the FPE wrapper.
// The key is read from the FPE_KEY environment variable.
const fpe = new FPEEncryption(process.env.FPE_KEY);

const originalCC = '4111111111111111';

// Intended result: a value that still looks like a card number, e.g. 5923847362912456
const encrypted = fpe.encryptCC(originalCC);

// Intended result: the original number again, 4111111111111111
const decrypted = fpe.decryptCC(encrypted);

// FPE vs Tokenization:
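// FPE: Reversible with the key (no lookup table needed), format preserved, slower, good for analytics
// Tokenization: Random token with no mathematical link to the data (reversible only via a vault lookup), random format, faster, good for payment systems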

Tokenization & Pseudonymization

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Tokenization vs Pseudonymization

Implementation Patterns

Practical Examples

Patterns and Pitfalls

Self-Check

Design Review Checklist

Next Steps

References

Tokenization & Pseudonymization

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Tokenization vs Pseudonymization

Implementation Patterns

Practical Examples

Patterns and Pitfalls

Self-Check

Design Review Checklist

Next Steps

References

TL;DR

Learning Objectives

Motivating Scenario

Core Concepts

Tokenization vs Pseudonymization

Implementation Patterns

Practical Examples

Patterns and Pitfalls

Self-Check

Design Review Checklist

Next Steps

References