Initial

experiments/database_buffer_pool/sqlite_heavy_experiment.py (new file, 406 lines added)

@@ -0,0 +1,406 @@
"""
SQLite experiment with heavier workload to demonstrate space-time tradeoffs
Uses larger data and more complex queries to stress the buffer pool
"""

import sqlite3
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import json
import tempfile
import shutil
import gc


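# Overall design, as laid out in this file: build a database much larger than the
# smaller cache configurations, then repeat the same four query classes (point
# reads, scans, joins, aggregations) while PRAGMA cache_size is varied from O(n)
# down to O(1) pages, so that giving up memory shows up directly as extra I/O time.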
class SQLiteHeavyExperiment:
    """SQLite experiment with larger data to force real I/O"""

    def __init__(self, scale_factor: int = 100000):
        self.scale_factor = scale_factor
        self.temp_dir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.temp_dir, 'heavy.db')

    def cleanup(self):
        """Clean up temporary files"""
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def setup_database(self):
        """Create a database that's too large for small caches"""
        conn = sqlite3.connect(self.db_path)

        # Use larger pages for efficiency
        conn.execute('PRAGMA page_size = 8192')
        conn.execute('PRAGMA journal_mode = WAL')  # Write-ahead logging
        conn.commit()

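        # Note: SQLite only honors PRAGMA page_size before the first table is
        # written, and the page cache set later via PRAGMA cache_size is measured
        # in these 8 KiB pages. A quick sanity check (illustrative, not part of
        # the original script) would be:
        #   assert conn.execute('PRAGMA page_size').fetchone()[0] == 8192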
        # Create tables that simulate real-world complexity
        conn.execute('''
            CREATE TABLE documents (
                id INTEGER PRIMARY KEY,
                user_id INTEGER,
                title TEXT,
                content TEXT,
                tags TEXT,
                created_at INTEGER,
                updated_at INTEGER,
                view_count INTEGER,
                data BLOB
            )
        ''')

        conn.execute('''
            CREATE TABLE analytics (
                id INTEGER PRIMARY KEY,
                doc_id INTEGER,
                event_type TEXT,
                user_id INTEGER,
                timestamp INTEGER,
                metadata TEXT,
                FOREIGN KEY (doc_id) REFERENCES documents(id)
            )
        ''')

print(f"Populating database (this will take a moment)...")
|
||||
|
||||
# Insert documents with realistic data
|
||||
batch_size = 1000
|
||||
total_docs = self.scale_factor
|
||||
|
||||
for i in range(0, total_docs, batch_size):
|
||||
batch = []
|
||||
for j in range(min(batch_size, total_docs - i)):
|
||||
doc_id = i + j
|
||||
# Create variable-length content to simulate real documents
|
||||
content_length = np.random.randint(100, 2000)
|
||||
content = 'x' * content_length # Simplified for speed
|
||||
|
||||
# Random binary data to increase row size
|
||||
data_size = np.random.randint(500, 2000)
|
||||
data = os.urandom(data_size)
|
||||
|
||||
batch.append((
|
||||
doc_id,
|
||||
np.random.randint(1, 10000), # user_id
|
||||
f'Document {doc_id}',
|
||||
content,
|
||||
f'tag{doc_id % 100},tag{doc_id % 50}',
|
||||
int(time.time()) - doc_id,
|
||||
int(time.time()) - doc_id // 2,
|
||||
np.random.randint(0, 10000),
|
||||
data
|
||||
))
|
||||
|
||||
conn.executemany(
|
||||
'INSERT INTO documents VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
|
||||
batch
|
||||
)
|
||||
|
||||
# Insert analytics events (3-5 per document)
|
||||
analytics_batch = []
|
||||
for doc in batch:
|
||||
doc_id = doc[0]
|
||||
num_events = np.random.randint(3, 6)
|
||||
for k in range(num_events):
|
||||
analytics_batch.append((
|
||||
doc_id * 5 + k,
|
||||
doc_id,
|
||||
np.random.choice(['view', 'click', 'share', 'like']),
|
||||
np.random.randint(1, 10000),
|
||||
int(time.time()) - np.random.randint(0, 86400 * 30),
|
||||
f'{{"source": "web", "version": {k}}}'
|
||||
))
|
||||
|
||||
conn.executemany(
|
||||
'INSERT INTO analytics VALUES (?, ?, ?, ?, ?, ?)',
|
||||
analytics_batch
|
||||
)
|
||||
|
||||
if (i + batch_size) % 10000 == 0:
|
||||
print(f" Inserted {i + batch_size:,} / {total_docs:,} documents...")
|
||||
conn.commit()
|
||||
|
||||
        # Create indexes to make queries more realistic
        print("Creating indexes...")
        conn.execute('CREATE INDEX idx_docs_user ON documents(user_id)')
        conn.execute('CREATE INDEX idx_docs_created ON documents(created_at)')
        conn.execute('CREATE INDEX idx_analytics_doc ON analytics(doc_id)')
        conn.execute('CREATE INDEX idx_analytics_time ON analytics(timestamp)')

        conn.commit()

        # Analyze to update statistics
        conn.execute('ANALYZE')
        conn.close()

        # Get database size
        db_size = os.path.getsize(self.db_path)
        print(f"Database size: {db_size / 1024 / 1024:.1f} MB")

        return db_size

    def force_cache_clear(self):
        """Try to clear OS cache"""
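        # This is indirect: a 100 MB scratch allocation pressures the OS page cache
        # but may not evict a database file of comparable size. SQLite's own page
        # cache is per-connection, so each run_heavy_queries() call starts cold
        # regardless of whether this eviction succeeds.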
        # Allocate and access large memory to evict cache
        try:
            dummy = np.zeros((100, 1024, 1024), dtype=np.uint8)  # 100MB
            dummy[:] = np.random.randint(0, 256, size=dummy.shape, dtype=np.uint8)
            del dummy
            gc.collect()
        except Exception:
            pass

    def run_heavy_queries(self, cache_pages: int) -> dict:
        """Run queries that stress the cache"""
        conn = sqlite3.connect(self.db_path)

        # Set cache size (negative value = size in KiB)
        conn.execute(f'PRAGMA cache_size = -{cache_pages * 8}')
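        # A negative cache_size is a budget in KiB, so with 8 KiB pages the value
        # -(cache_pages * 8) corresponds to roughly cache_pages pages, e.g.
        # cache_pages=10 -> PRAGMA cache_size = -80 -> 80 KiB of page cache.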

        # Read-only connection: these runs should never modify the database
        conn.execute('PRAGMA query_only = ON')

        results = {
            'random_reads': [],
            'sequential_scan': [],
            'complex_join': [],
            'aggregation': []
        }

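        # Random point lookups land on scattered B-tree leaf pages, so once the
        # cache is much smaller than the table nearly every probe is a page miss;
        # this is the workload expected to be most sensitive to cache size.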
        # 1. Random point queries (cache-unfriendly)
        print("  Running random reads...")
        for _ in range(50):
            # int() cast: bind a plain Python int, not a numpy integer
            doc_id = int(np.random.randint(1, self.scale_factor))
            start = time.time()
            conn.execute(
                'SELECT * FROM documents WHERE id = ?',
                (doc_id,)
            ).fetchone()
            results['random_reads'].append(time.time() - start)

        # 2. Sequential scan with filter
        print("  Running sequential scans...")
        for _ in range(5):
            min_views = int(np.random.randint(1000, 5000))
            start = time.time()
            conn.execute(
                'SELECT COUNT(*) FROM documents WHERE view_count > ?',
                (min_views,)
            ).fetchone()
            results['sequential_scan'].append(time.time() - start)

        # 3. Complex join queries
        print("  Running complex joins...")
        for _ in range(5):
            user_id = int(np.random.randint(1, 10000))
            start = time.time()
            conn.execute('''
                SELECT d.*, COUNT(a.id) as events
                FROM documents d
                LEFT JOIN analytics a ON d.id = a.doc_id
                WHERE d.user_id = ?
                GROUP BY d.id
                LIMIT 10
            ''', (user_id,)).fetchall()
            results['complex_join'].append(time.time() - start)

        # 4. Time-based aggregation
        print("  Running aggregations...")
        for _ in range(5):
            days_back = int(np.random.randint(1, 30))
            start_time = int(time.time()) - (days_back * 86400)
            start = time.time()
            conn.execute('''
                SELECT
                    event_type,
                    COUNT(*) as count,
                    COUNT(DISTINCT user_id) as unique_users
                FROM analytics
                WHERE timestamp > ?
                GROUP BY event_type
            ''', (start_time,)).fetchall()
            results['aggregation'].append(time.time() - start)

        conn.close()

        return {
            'cache_pages': cache_pages,
            'avg_random_read': np.mean(results['random_reads']),
            'avg_sequential': np.mean(results['sequential_scan']),
            'avg_join': np.mean(results['complex_join']),
            'avg_aggregation': np.mean(results['aggregation']),
            'p95_random_read': np.percentile(results['random_reads'], 95),
            'raw_results': results
        }


def run_heavy_experiment():
    """Run the heavy SQLite experiment"""

    print("="*60)
    print("SQLite Heavy Workload Experiment")
    print("Demonstrating space-time tradeoffs with real I/O pressure")
    print("="*60)

    # Create large database
    scale = 50000  # 50k documents = ~200MB database
    exp = SQLiteHeavyExperiment(scale)

    db_size = exp.setup_database()

    # Calculate page count
    page_size = 8192
    total_pages = db_size // page_size

print(f"\nDatabase created:")
|
||||
print(f" Documents: {scale:,}")
|
||||
print(f" Size: {db_size / 1024 / 1024:.1f} MB")
|
||||
print(f" Pages: {total_pages:,}")
|
||||
|
||||
# Test different cache sizes
|
||||
cache_configs = [
|
||||
('O(n) Full', min(total_pages, 10000)), # Cap at 10k pages for memory
|
||||
('O(√n)', int(np.sqrt(total_pages))),
|
||||
('O(log n)', int(np.log2(total_pages))),
|
||||
('O(1)', 10)
|
||||
]
|
||||
|
||||
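    # Rough sizes, assuming the database comes out near 200 MB (~25,600 pages of
    # 8 KiB each): O(n) caps at 10,000 pages (~78 MB), O(√n) is about 160 pages
    # (~1.3 MB), O(log n) is about 14 pages (~112 KiB), and O(1) is 80 KiB.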
    results = []

    for label, cache_pages in cache_configs:
        cache_mb = cache_pages * page_size / 1024 / 1024
        print(f"\nTesting {label}: {cache_pages} pages ({cache_mb:.1f} MB)")

        # Clear cache between runs
        exp.force_cache_clear()
        time.sleep(1)  # Let system settle

        result = exp.run_heavy_queries(cache_pages)
        result['label'] = label
        result['cache_mb'] = cache_mb
        results.append(result)

        print(f"  Random read: {result['avg_random_read']*1000:.2f} ms")
        print(f"  Sequential: {result['avg_sequential']*1000:.2f} ms")
        print(f"  Complex join: {result['avg_join']*1000:.2f} ms")

    # Create visualization
    create_heavy_experiment_plot(results, db_size)

    # Calculate slowdowns
    base = results[0]['avg_random_read']
    for r in results:
        r['slowdown'] = r['avg_random_read'] / base

    # Save results
    with open('sqlite_heavy_results.json', 'w') as f:
        save_data = {
            'scale_factor': scale,
            'db_size_mb': db_size / 1024 / 1024,
            'results': [
                {
                    'label': r['label'],
                    'cache_mb': r['cache_mb'],
                    'avg_random_ms': r['avg_random_read'] * 1000,
                    'slowdown': r['slowdown']
                }
                for r in results
            ]
        }
        json.dump(save_data, f, indent=2)

    exp.cleanup()

print("\n" + "="*60)
|
||||
print("RESULTS SUMMARY")
|
||||
print("="*60)
|
||||
for r in results:
|
||||
print(f"{r['label']:15} | Slowdown: {r['slowdown']:6.1f}x | "
|
||||
f"Random: {r['avg_random_read']*1000:6.2f} ms | "
|
||||
f"Join: {r['avg_join']*1000:6.2f} ms")
|
||||
|
||||
print("\nFiles generated:")
|
||||
print(" - sqlite_heavy_experiment.png")
|
||||
print(" - sqlite_heavy_results.json")
|
||||
print("="*60)
|
||||
|
||||
def create_heavy_experiment_plot(results, db_size):
    """Create plot for heavy experiment"""

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

    # Extract data
    labels = [r['label'] for r in results]
    cache_mb = [r['cache_mb'] for r in results]
    random_times = [r['avg_random_read'] * 1000 for r in results]
    join_times = [r['avg_join'] * 1000 for r in results]

    # Plot 1: Random read performance
    colors = ['green', 'orange', 'red', 'darkred']
    ax1.bar(labels, random_times, color=colors, edgecolor='black', linewidth=1.5)
    ax1.set_ylabel('Time (ms)', fontsize=12)
    ax1.set_title('Random Read Performance', fontsize=14)
    ax1.grid(True, alpha=0.3, axis='y')

    # Add value labels
    for bar, val in zip(ax1.patches, random_times):
        ax1.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                 f'{val:.1f}', ha='center', va='bottom', fontsize=10)

    # Plot 2: Join query performance
    ax2.bar(labels, join_times, color=colors, edgecolor='black', linewidth=1.5)
    ax2.set_ylabel('Time (ms)', fontsize=12)
    ax2.set_title('Complex Join Performance', fontsize=14)
    ax2.grid(True, alpha=0.3, axis='y')

    # Plot 3: Cache efficiency
    db_mb = db_size / 1024 / 1024
    cache_pct = [(c / db_mb) * 100 for c in cache_mb]
    slowdowns = [r['avg_random_read'] / results[0]['avg_random_read'] for r in results]

    ax3.scatter(cache_pct, slowdowns, s=200, c=colors, edgecolor='black', linewidth=2)

    # Add theoretical √n curve
    x_theory = np.linspace(0.1, 100, 100)
    y_theory = 1 / np.sqrt(x_theory / 100)
    ax3.plot(x_theory, y_theory, 'b--', alpha=0.5, label='Theoretical 1/√x')

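    # The reference curve assumes lookup time grows roughly as
    # sqrt(database pages / cached pages); with the cache expressed as x% of the
    # database and the full-cache case normalized to 1, that is 1 / sqrt(x / 100).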
    ax3.set_xlabel('Cache Size (% of Database)', fontsize=12)
    ax3.set_ylabel('Slowdown Factor', fontsize=12)
    ax3.set_title('Space-Time Tradeoff', fontsize=14)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: All query types comparison
    query_types = ['Random\nRead', 'Sequential\nScan', 'Complex\nJoin', 'Aggregation']

    x = np.arange(len(query_types))
    width = 0.2

    for i, r in enumerate(results):
        times = [
            r['avg_random_read'] * 1000,
            r['avg_sequential'] * 1000,
            r['avg_join'] * 1000,
            r['avg_aggregation'] * 1000
        ]
        ax4.bar(x + i*width, times, width, label=r['label'], color=colors[i])

    ax4.set_xlabel('Query Type', fontsize=12)
    ax4.set_ylabel('Time (ms)', fontsize=12)
    ax4.set_title('Performance by Query Type', fontsize=14)
    ax4.set_xticks(x + width * 1.5)
    ax4.set_xticklabels(query_types)
    ax4.legend(fontsize=10)
    ax4.grid(True, alpha=0.3, axis='y')
    ax4.set_yscale('log')

    plt.suptitle('SQLite Buffer Pool: Heavy Workload Analysis', fontsize=16)
    plt.tight_layout()
    plt.savefig('sqlite_heavy_experiment.png', dpi=300, bbox_inches='tight')
    plt.close()


if __name__ == "__main__":
    run_heavy_experiment()