Initial
This commit is contained in:
@@ -0,0 +1,406 @@
|
||||
"""
|
||||
SQLite Buffer Pool Space-Time Tradeoff Experiment
|
||||
|
||||
Demonstrates how SQLite's page cache size affects query performance,
|
||||
validating Williams' √n space-time tradeoff in a real production database.
|
||||
|
||||
Key parameters:
|
||||
- cache_size: Number of pages in memory (default 2000)
|
||||
- page_size: Size of each page (default 4096 bytes)
|
||||
|
||||
This experiment shows:
|
||||
1. Full cache (O(n) space): Fast queries
|
||||
2. √n cache: Moderate slowdown
|
||||
3. Minimal cache: Extreme slowdown
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import time
|
||||
import os
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from typing import Dict, List, Tuple
|
||||
import json
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
class SQLiteExperiment:
    """Test SQLite query performance under different page-cache sizes.

    Builds a throwaway database (a ``users`` table plus a ``posts`` table
    with three posts per user, and typical indexes) inside a temporary
    directory, then times representative query patterns while varying
    ``PRAGMA cache_size``.
    """

    def __init__(self, num_rows: int, page_size: int = 4096):
        # num_rows: number of user rows to generate (3 posts per user).
        # page_size: SQLite page size in bytes; only effective if applied
        # before the first write, which setup_database() honors.
        self.num_rows = num_rows
        self.page_size = page_size
        self.temp_dir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.temp_dir, 'test.db')

    def cleanup(self):
        """Remove the temporary directory holding the database."""
        # ignore_errors: teardown must never raise, even if the directory
        # is already gone or partially removed.
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def setup_database(self):
        """Create and populate the test database.

        Returns:
            int: size of the database file in bytes.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # page_size only takes effect before the database's first write.
            conn.execute(f'PRAGMA page_size = {self.page_size}')
            conn.commit()

            # Schema simulating a small web application.
            conn.execute('''
                CREATE TABLE users (
                    id INTEGER PRIMARY KEY,
                    name TEXT,
                    email TEXT,
                    created_at INTEGER,
                    data BLOB
                )
            ''')

            conn.execute('''
                CREATE TABLE posts (
                    id INTEGER PRIMARY KEY,
                    user_id INTEGER,
                    title TEXT,
                    content TEXT,
                    created_at INTEGER,
                    FOREIGN KEY (user_id) REFERENCES users(id)
                )
            ''')

            print(f"Populating database with {self.num_rows:,} users...")

            # Batch insert for efficiency.
            batch_size = 1000
            for i in range(0, self.num_rows, batch_size):
                batch = []
                for j in range(min(batch_size, self.num_rows - i)):
                    user_id = i + j
                    # Pad each row with random bytes so pages fill realistically.
                    data = os.urandom(200)  # 200 bytes of data per user
                    batch.append((
                        user_id,
                        f'User {user_id}',
                        f'user{user_id}@example.com',
                        int(time.time()) - user_id,
                        data
                    ))

                conn.executemany(
                    'INSERT INTO users VALUES (?, ?, ?, ?, ?)',
                    batch
                )

                # Three posts per user; post ids derive from user ids so
                # they remain unique across batches.
                post_batch = []
                for user in batch:
                    user_id = user[0]
                    for k in range(3):
                        post_batch.append((
                            user_id * 3 + k,
                            user_id,
                            f'Post {k} by user {user_id}',
                            f'Content of post {k}' * 10,  # Make content larger
                            int(time.time()) - user_id + k
                        ))

                conn.executemany(
                    'INSERT INTO posts VALUES (?, ?, ?, ?, ?)',
                    post_batch
                )

            # Indexes typical of a production schema.
            conn.execute('CREATE INDEX idx_users_email ON users(email)')
            conn.execute('CREATE INDEX idx_posts_user ON posts(user_id)')
            conn.execute('CREATE INDEX idx_posts_created ON posts(created_at)')

            conn.commit()
        finally:
            # Always release the connection, even if population fails.
            conn.close()

        db_size = os.path.getsize(self.db_path)
        print(f"Database size: {db_size / 1024 / 1024:.1f} MB")
        return db_size

    def run_queries(self, cache_size: int, num_queries: int = 100) -> Dict:
        """Time point lookups, range scans, joins and aggregations.

        Args:
            cache_size: value for ``PRAGMA cache_size`` (number of pages).
            num_queries: number of point lookups; the other query patterns
                run proportionally fewer iterations.

        Returns:
            Dict with the raw per-query timings and per-pattern averages
            (averages are 0.0 for patterns that ran zero iterations).
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # Set cache size (in pages).
            conn.execute(f'PRAGMA cache_size = {cache_size}')

            # Best-effort attempt to displace the OS page cache by churning
            # through unrelated memory.
            dummy_data = os.urandom(50 * 1024 * 1024)  # 50MB
            del dummy_data

            # Actual cache footprint in bytes.
            cache_bytes = cache_size * self.page_size

            # Query patterns that simulate real usage.
            query_times = {
                'point_lookups': [],
                'range_scans': [],
                'joins': [],
                'aggregations': []
            }

            # Warm up the connection.
            conn.execute('SELECT COUNT(*) FROM users').fetchone()

            # 1. Point lookups (random access pattern). User ids run
            # 0..num_rows-1, so draw from the full range (the original
            # randint(1, num_rows) never touched id 0).
            for _ in range(num_queries):
                user_id = np.random.randint(0, self.num_rows)
                start = time.time()
                conn.execute(
                    'SELECT * FROM users WHERE id = ?',
                    (user_id,)
                ).fetchone()
                query_times['point_lookups'].append(time.time() - start)

            # 2. Range scans (fewer iterations).
            for _ in range(num_queries // 10):
                max_start = max(1, self.num_rows - 100)
                start_id = np.random.randint(1, max_start + 1)
                start = time.time()
                conn.execute(
                    'SELECT * FROM users WHERE id BETWEEN ? AND ?',
                    (start_id, min(start_id + 100, self.num_rows))
                ).fetchall()
                query_times['range_scans'].append(time.time() - start)

            # 3. Joins (most expensive; fewest iterations).
            for _ in range(num_queries // 20):
                user_id = np.random.randint(0, self.num_rows)
                start = time.time()
                conn.execute('''
                    SELECT u.*, p.*
                    FROM users u
                    JOIN posts p ON u.id = p.user_id
                    WHERE u.id = ?
                ''', (user_id,)).fetchall()
                query_times['joins'].append(time.time() - start)

            # 4. Aggregations over a random time window.
            for _ in range(num_queries // 20):
                start_time = int(time.time()) - np.random.randint(0, self.num_rows)
                start = time.time()
                conn.execute('''
                    SELECT COUNT(*), AVG(LENGTH(content))
                    FROM posts
                    WHERE created_at > ?
                ''', (start_time,)).fetchone()
                query_times['aggregations'].append(time.time() - start)
        finally:
            conn.close()

        # NOTE: the original code ran 'PRAGMA cache_stats', which is not a
        # real SQLite pragma (it yields no rows, so fetchone() was None and
        # the value was never used). Cache hit/miss counters are only
        # exposed through the C API (sqlite3_db_status), so it was dropped.

        def _avg(samples):
            # Guard against empty sample lists (num_queries < 20) so we
            # report 0.0 instead of NaN from np.mean([]).
            return float(np.mean(samples)) if samples else 0.0

        return {
            'cache_size': cache_size,
            'cache_bytes': cache_bytes,
            'query_times': query_times,
            'avg_point_lookup': _avg(query_times['point_lookups']),
            'avg_range_scan': _avg(query_times['range_scans']),
            'avg_join': _avg(query_times['joins']),
            'avg_aggregation': _avg(query_times['aggregations'])
        }

    def analyze_page_distribution(self) -> Dict:
        """Return basic statistics on pages and row counts."""
        conn = sqlite3.connect(self.db_path)
        try:
            # Total number of pages currently in the database file.
            page_count = conn.execute('PRAGMA page_count').fetchone()[0]

            stats = {
                'page_count': page_count,
                'page_size': self.page_size,
                'total_size': page_count * self.page_size,
                'users_count': conn.execute('SELECT COUNT(*) FROM users').fetchone()[0],
                'posts_count': conn.execute('SELECT COUNT(*) FROM posts').fetchone()[0]
            }
        finally:
            conn.close()
        return stats
|
||||
|
||||
def run_sqlite_experiment():
    """Run the complete SQLite buffer pool experiment.

    For each database size, builds a fresh database, measures query
    latencies under three cache configurations (full, sqrt(n), minimal),
    renders plots, and writes 'sqlite_results.json'.

    Returns:
        Dict mapping user count -> {'stats': ..., 'experiments': [...]}.
    """

    print("="*60)
    print("SQLite Buffer Pool Space-Time Tradeoff Experiment")
    print("="*60)

    # Test with different database sizes (number of users).
    sizes = [10000, 50000, 100000]
    results = {}

    for num_users in sizes:
        print(f"\n{'='*40}")
        print(f"Testing with {num_users:,} users")
        print(f"{'='*40}")

        exp = SQLiteExperiment(num_users)
        try:
            exp.setup_database()
            stats = exp.analyze_page_distribution()

            print(f"Database pages: {stats['page_count']:,}")
            print(f"Page size: {stats['page_size']} bytes")

            # Cache configurations: all pages in memory, √n pages, almost none.
            cache_configs = [
                ('Full O(n)', stats['page_count']),
                ('√n cache', int(np.sqrt(stats['page_count']))),
                ('Minimal', 10)
            ]

            user_results = []

            for label, cache_size in cache_configs:
                # Report the cache footprint with the actual page size
                # (the original hard-coded 4096 here, which is wrong for
                # any non-default page_size).
                cache_kb = cache_size * exp.page_size / 1024
                print(f"\nTesting {label}: {cache_size} pages ({cache_kb:.1f} KB)")

                result = exp.run_queries(cache_size, num_queries=50)
                result['label'] = label
                user_results.append(result)

                print(f" Point lookups: {result['avg_point_lookup']*1000:.2f} ms")
                print(f" Range scans: {result['avg_range_scan']*1000:.2f} ms")
                print(f" Joins: {result['avg_join']*1000:.2f} ms")

            results[num_users] = {
                'stats': stats,
                'experiments': user_results
            }
        finally:
            # Always remove the temp database, even if a run fails.
            exp.cleanup()

    # Create visualizations.
    create_sqlite_plots(results)

    # Save results.
    with open('sqlite_results.json', 'w') as f:
        def convert(o):
            # json cannot serialize numpy scalars/arrays natively.
            if isinstance(o, np.integer):
                return int(o)
            if isinstance(o, np.floating):
                return float(o)
            if isinstance(o, np.ndarray):
                return o.tolist()
            return o

        json.dump(results, f, indent=2, default=convert)

    print("\n" + "="*60)
    print("EXPERIMENT COMPLETE")
    print("Generated files:")
    print(" - sqlite_results.json")
    print(" - sqlite_buffer_pool_analysis.png")
    print("="*60)

    return results
|
||||
|
||||
def create_sqlite_plots(results: Dict):
    """Create publication-quality plots for the SQLite experiment.

    Renders a 2x2 figure and saves it to
    'sqlite_buffer_pool_analysis.png'.

    Args:
        results: mapping of user count -> {'stats': ..., 'experiments': [...]}
            as produced by run_sqlite_experiment(); each experiment dict
            carries a 'label' naming its cache configuration.
    """

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

    sizes = sorted(results.keys())

    # Plot 1: Point lookup performance vs cache size
    for size in sizes:
        experiments = results[size]['experiments']
        cache_sizes = [e['cache_size'] for e in experiments]
        point_times = [e['avg_point_lookup'] * 1000 for e in experiments]  # ms

        ax1.plot(cache_sizes, point_times, 'o-', label=f'{size:,} users',
                 linewidth=2, markersize=8)

    ax1.set_xlabel('Cache Size (pages)', fontsize=12)
    ax1.set_ylabel('Avg Point Lookup Time (ms)', fontsize=12)
    ax1.set_title('Point Lookup Performance vs Cache Size', fontsize=14)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Slowdown factors relative to the full cache.
    # Prefer the middle size as reference, but fall back gracefully when
    # fewer than two sizes were tested (sizes[1] used to raise IndexError).
    base_size = sizes[min(1, len(sizes) - 1)]
    base_results = results[base_size]['experiments']

    # Guard against a zero baseline so the ratios below cannot divide by 0.
    full_cache_time = base_results[0]['avg_point_lookup'] or 1e-12
    sqrt_cache_time = base_results[1]['avg_point_lookup']
    min_cache_time = base_results[2]['avg_point_lookup']

    categories = ['Full\nO(n)', '√n\nCache', 'Minimal\nO(1)']
    slowdowns = [1, sqrt_cache_time/full_cache_time, min_cache_time/full_cache_time]

    bars = ax2.bar(categories, slowdowns, color=['green', 'orange', 'red'])
    ax2.set_ylabel('Slowdown Factor', fontsize=12)
    ax2.set_title(f'Query Slowdown vs Cache Size ({base_size:,} users)', fontsize=14)

    # Annotate each bar with its slowdown factor.
    for bar, val in zip(bars, slowdowns):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                 f'{val:.1f}×', ha='center', va='bottom', fontsize=11)

    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Memory usage efficiency (throughput per MB of cache).
    for size in sizes:
        experiments = results[size]['experiments']
        cache_mb = [e['cache_bytes'] / 1024 / 1024 for e in experiments]
        # Queries per second; clamp the denominator to avoid division by zero.
        query_speed = [1 / max(e['avg_point_lookup'], 1e-12) for e in experiments]

        ax3.plot(cache_mb, query_speed, 's-', label=f'{size:,} users',
                 linewidth=2, markersize=8)

    ax3.set_xlabel('Cache Size (MB)', fontsize=12)
    ax3.set_ylabel('Queries per Second', fontsize=12)
    ax3.set_title('Memory Efficiency: Speed vs Cache Size', fontsize=14)
    ax3.set_xscale('log')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: Different query types, one bar group per cache configuration.
    # Labels come from each stored result rather than the module-level
    # cache_configs global (whose first two sizes are None placeholders).
    query_types = ['Point\nLookup', 'Range\nScan', 'Join\nQuery']
    x = np.arange(len(query_types))  # hoisted: also needed after the loop
    width = 0.25

    for i, result in enumerate(base_results[:3]):
        times = [
            result['avg_point_lookup'] * 1000,
            result['avg_range_scan'] * 1000,
            result['avg_join'] * 1000
        ]
        ax4.bar(x + i*width, times, width, label=result.get('label', f'Config {i}'))

    ax4.set_xlabel('Query Type', fontsize=12)
    ax4.set_ylabel('Average Time (ms)', fontsize=12)
    ax4.set_title('Query Performance by Type and Cache Size', fontsize=14)
    ax4.set_xticks(x + width)
    ax4.set_xticklabels(query_types)
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')
    ax4.set_yscale('log')

    plt.suptitle('SQLite Buffer Pool: Space-Time Tradeoffs', fontsize=16)
    plt.tight_layout()
    plt.savefig('sqlite_buffer_pool_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
|
||||
|
||||
# Helper to get theoretical cache configs
|
||||
# Module-level fallback listing of the cache configurations; the first two
# page counts are None placeholders because they depend on the database's
# actual page count and are computed per run — only the minimal
# configuration has a fixed size of 10 pages.
cache_configs = [('Full O(n)', None), ('√n cache', None), ('Minimal', 10)]
|
||||
|
||||
# Script entry point: run the full experiment when executed directly.
if __name__ == "__main__":
    run_sqlite_experiment()
|
||||
Reference in New Issue
Block a user