This commit is contained in:
2025-07-20 04:11:04 -04:00
commit 69b521b549
40 changed files with 7781 additions and 0 deletions

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Ubiquity SpaceTime Test Suite

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env python3
"""
Tests for external algorithms with memory pressure.
"""
import unittest
import random
import gc
import psutil
import time
from sqrtspace_spacetime import external_sort, external_groupby, SpaceTimeConfig
class TestExternalAlgorithms(unittest.TestCase):
    """Test external algorithms under memory constraints.

    Exercises ``external_sort`` and ``external_groupby`` from
    ``sqrtspace_spacetime`` on datasets up to one million items, sampling
    this process's resident-set size (RSS) via psutil before and after
    each operation to assert the memory increase stays far below the
    full dataset size.
    """

    def setUp(self):
        """Set up test environment."""
        # Configure a process-wide memory budget and sqrt(n) chunking
        # for every test in this class.
        SpaceTimeConfig.set_defaults(
            memory_limit=100 * 1024 * 1024,  # 100MB limit
            chunk_strategy='sqrt_n'
        )
        # Handle used throughout to read RSS in MB.
        self.process = psutil.Process()

    def test_external_sort_small(self):
        """Test external sort with small dataset."""
        data = [random.randint(1, 1000) for _ in range(1000)]
        sorted_data = external_sort(data)
        # Verify sorting: same length, non-decreasing order.
        self.assertEqual(len(sorted_data), len(data))
        for i in range(len(sorted_data) - 1):
            self.assertLessEqual(sorted_data[i], sorted_data[i + 1])
        # Verify all elements present (result is a permutation of the input).
        self.assertEqual(sorted(data), sorted_data)

    def test_external_sort_large_with_memory_tracking(self):
        """Test external sort with large dataset and memory tracking."""
        n = 1_000_000  # 1 million items
        # Generate data
        print(f"\nGenerating {n:,} random integers...")
        data = [random.randint(1, 10_000_000) for _ in range(n)]
        # Track memory before sorting; collect first so the baseline is stable.
        gc.collect()
        memory_before = self.process.memory_info().rss / 1024 / 1024
        peak_memory = memory_before
        # Sort with memory tracking
        print("Sorting with external_sort...")
        start_time = time.time()
        # Create a custom monitoring function; each call records one RSS sample.
        memory_samples = []
        def monitor_memory():
            current = self.process.memory_info().rss / 1024 / 1024
            memory_samples.append(current)
            return current
        # Sort data
        # NOTE(review): monitor_memory is only invoked in the verification
        # loop below, so peak_memory reflects memory *after* the sort, not
        # while external_sort itself is running.
        sorted_data = external_sort(data)
        # Measure final state
        gc.collect()
        memory_after = self.process.memory_info().rss / 1024 / 1024
        elapsed = time.time() - start_time
        # Sample memory during verification: order is spot-checked on a
        # stride of 10,000 and RSS sampled every 100,000 items.
        for i in range(0, len(sorted_data) - 1, 10000):
            self.assertLessEqual(sorted_data[i], sorted_data[i + 1])
            if i % 100000 == 0:
                peak_memory = max(peak_memory, monitor_memory())
        # Calculate statistics
        memory_increase = memory_after - memory_before
        theoretical_sqrt_n = int(n ** 0.5)
        print(f"\nExternal Sort Statistics:")
        print(f" Items sorted: {n:,}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Memory before: {memory_before:.1f} MB")
        print(f" Memory after: {memory_after:.1f} MB")
        print(f" Peak memory: {peak_memory:.1f} MB")
        print(f" Memory increase: {memory_increase:.1f} MB")
        print(f" Theoretical √n: {theoretical_sqrt_n:,} items")
        print(f" Items per MB: {n / max(memory_increase, 0.1):,.0f}")
        # Verify memory efficiency
        # With 1M items, sqrt(n) = 1000, so memory should be much less than full dataset
        self.assertLess(memory_increase, 50, f"Memory increase {memory_increase:.1f} MB is too high")
        # Verify correctness on sample (1000 random adjacent pairs).
        sample_indices = random.sample(range(len(sorted_data) - 1), min(1000, len(sorted_data) - 1))
        for i in sample_indices:
            self.assertLessEqual(sorted_data[i], sorted_data[i + 1])

    def test_external_groupby_memory_efficiency(self):
        """Test external groupby with memory tracking."""
        n = 100_000
        # Generate data with limited number of groups (100 categories).
        print(f"\nGenerating {n:,} items for groupby...")
        categories = [f"category_{i}" for i in range(100)]
        data = [
            {
                "id": i,
                "category": random.choice(categories),
                "value": random.randint(1, 1000),
                "data": f"data_{i}" * 10  # Make items larger
            }
            for i in range(n)
        ]
        # Track memory
        gc.collect()
        memory_before = self.process.memory_info().rss / 1024 / 1024
        # Group by category
        print("Grouping by category...")
        start_time = time.time()
        grouped = external_groupby(data, key_func=lambda x: x["category"])
        elapsed = time.time() - start_time
        # Measure memory
        gc.collect()
        memory_after = self.process.memory_info().rss / 1024 / 1024
        memory_increase = memory_after - memory_before
        print(f"\nExternal GroupBy Statistics:")
        print(f" Items grouped: {n:,}")
        print(f" Groups created: {len(grouped)}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Memory increase: {memory_increase:.1f} MB")
        print(f" Items per MB: {n / max(memory_increase, 0.1):,.0f}")
        # Verify correctness: one group per category, no items lost.
        self.assertEqual(len(grouped), len(categories))
        total_items = sum(len(group) for group in grouped.values())
        self.assertEqual(total_items, n)
        # Verify grouping
        for category, items in grouped.items():
            for item in items[:10]:  # Check first 10 items in each group
                self.assertEqual(item["category"], category)
        # Memory should be reasonable
        self.assertLess(memory_increase, 100, f"Memory increase {memory_increase:.1f} MB is too high")

    def test_stress_test_combined_operations(self):
        """Stress test with combined operations.

        Pipeline: group -> sort each group by value -> take top 10 per
        group -> final descending sort of the selected items by score.
        """
        n = 50_000
        print(f"\nRunning stress test with {n:,} items...")
        # Generate complex data (50 groups via i % 50).
        data = []
        for i in range(n):
            data.append({
                "id": i,
                "group": f"group_{i % 50}",
                "value": random.randint(1, 1000),
                "score": random.random(),
                "text": f"This is item {i} with some text" * 5
            })
        # Track initial memory
        gc.collect()
        initial_memory = self.process.memory_info().rss / 1024 / 1024
        # Operation 1: Group by
        print(" 1. Grouping data...")
        grouped = external_groupby(data, key_func=lambda x: x["group"])
        # Operation 2: Sort each group
        print(" 2. Sorting each group...")
        for group_key, group_items in grouped.items():
            # Sort by value
            sorted_items = external_sort(
                group_items,
                key=lambda x: x["value"]
            )
            grouped[group_key] = sorted_items
        # Operation 3: Extract top items from each group
        print(" 3. Extracting top items...")
        top_items = []
        for group_items in grouped.values():
            # Get top 10 by value (groups are sorted ascending, so take the tail).
            top_items.extend(group_items[-10:])
        # Operation 4: Final sort
        print(" 4. Final sort of top items...")
        final_sorted = external_sort(
            top_items,
            key=lambda x: x["score"],
            reverse=True
        )
        # Measure final memory
        gc.collect()
        final_memory = self.process.memory_info().rss / 1024 / 1024
        total_memory_increase = final_memory - initial_memory
        print(f"\nStress Test Results:")
        print(f" Initial memory: {initial_memory:.1f} MB")
        print(f" Final memory: {final_memory:.1f} MB")
        print(f" Total increase: {total_memory_increase:.1f} MB")
        print(f" Groups processed: {len(grouped)}")
        print(f" Top items selected: {len(top_items)}")
        # Verify results
        self.assertEqual(len(grouped), 50)  # 50 groups
        self.assertEqual(len(top_items), 50 * 10)  # Top 10 from each
        self.assertEqual(len(final_sorted), len(top_items))
        # Verify sorting (descending by score).
        for i in range(len(final_sorted) - 1):
            self.assertGreaterEqual(
                final_sorted[i]["score"],
                final_sorted[i + 1]["score"]
            )
        # Memory should still be reasonable after all operations
        self.assertLess(
            total_memory_increase,
            150,
            f"Memory increase {total_memory_increase:.1f} MB is too high"
        )
# Allow running this test module directly (outside a test runner).
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
Memory pressure tests to verify √n behavior under constrained memory.
"""
import unittest
import gc
import os
import psutil
import resource
import tempfile
import shutil
import random
import time
from sqrtspace_spacetime import (
SpaceTimeArray, SpaceTimeDict, external_sort,
external_groupby, SpaceTimeConfig
)
class TestMemoryPressure(unittest.TestCase):
    """Test √n memory behavior under real memory constraints.

    Each test constrains the library to a 50MB budget (with gzip
    compression and sqrt(n) chunking, spilling to a per-test temp dir)
    and then measures RSS growth while loading data into SpaceTimeArray,
    SpaceTimeDict, or the external algorithms.

    NOTE(review): several tests read private attributes of the library
    collections (``_hot_data``, ``_cold_indices``, ``_cold_keys``) --
    they will break if the library's internal layout changes.
    """

    def setUp(self):
        """Set up test environment."""
        self.temp_dir = tempfile.mkdtemp()
        self.process = psutil.Process()
        # Configure strict memory limits; spillover goes to the temp dir.
        SpaceTimeConfig.set_defaults(
            storage_path=self.temp_dir,
            memory_limit=50 * 1024 * 1024,  # 50MB limit
            chunk_strategy='sqrt_n',
            compression='gzip'
        )

    def tearDown(self):
        """Clean up test environment."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_array_under_memory_pressure(self):
        """Test SpaceTimeArray behavior when memory is constrained."""
        print("\n=== Testing SpaceTimeArray under memory pressure ===")
        # Create large objects that will force spillover
        large_object_size = 1024  # 1KB per object
        n_objects = 100_000  # Total: ~100MB if all in memory
        array = SpaceTimeArray(threshold='auto')
        # Track metrics
        spillovers = 0
        max_memory = 0
        start_time = time.time()
        # Add objects and monitor memory
        for i in range(n_objects):
            # Create a large object
            obj = {
                'id': i,
                'data': 'x' * large_object_size,
                'timestamp': time.time()
            }
            array.append(obj)
            # Monitor every 1000 items
            if i % 1000 == 0:
                gc.collect()
                current_memory = self.process.memory_info().rss / 1024 / 1024
                max_memory = max(max_memory, current_memory)
                if i > 0:
                    # Private internals: in-memory vs spilled item counts.
                    hot_count = len(array._hot_data)
                    cold_count = len(array._cold_indices)
                    print(f" Items: {i:,} | Memory: {current_memory:.1f}MB | "
                    f"Hot: {hot_count} | Cold: {cold_count}")
                    # Check if spillover is happening (tracks the max cold count seen).
                    if cold_count > spillovers:
                        spillovers = cold_count
        elapsed = time.time() - start_time
        # Verify all data is accessible
        print("\nVerifying data accessibility...")
        sample_indices = random.sample(range(n_objects), min(100, n_objects))
        for idx in sample_indices:
            obj = array[idx]
            self.assertEqual(obj['id'], idx)
            self.assertEqual(len(obj['data']), large_object_size)
        # Calculate statistics
        theoretical_sqrt_n = int(n_objects ** 0.5)
        actual_hot_items = len(array._hot_data)
        print(f"\nResults:")
        print(f" Total items: {n_objects:,}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Max memory used: {max_memory:.1f} MB")
        print(f" Theoretical √n: {theoretical_sqrt_n:,}")
        print(f" Actual hot items: {actual_hot_items:,}")
        print(f" Cold items: {len(array._cold_indices):,}")
        print(f" Memory efficiency: {n_objects / max_memory:.0f} items/MB")
        # Assertions
        self.assertEqual(len(array), n_objects)
        self.assertLess(max_memory, 150)  # Should use much less than 100MB
        self.assertGreater(spillovers, 0)  # Should have spilled to disk
        self.assertLessEqual(actual_hot_items, theoretical_sqrt_n * 2)  # Within 2x of √n

    def test_dict_with_memory_limit(self):
        """Test SpaceTimeDict with strict memory limit."""
        print("\n=== Testing SpaceTimeDict under memory pressure ===")
        # Create dictionary with explicit threshold
        cache = SpaceTimeDict(threshold=1000)  # Keep only 1000 items in memory
        n_items = 50_000
        value_size = 500  # 500 bytes per value
        # Track evictions
        evictions = 0
        start_time = time.time()
        # Add items
        for i in range(n_items):
            key = f"key_{i:06d}"
            value = {
                'id': i,
                'data': 'v' * value_size,
                'accessed': 0
            }
            cache[key] = value
            # Check for evictions (max cold-key count seen so far).
            if i % 1000 == 0 and i > 0:
                current_hot = len(cache._hot_data)
                current_cold = len(cache._cold_keys)
                if current_cold > evictions:
                    evictions = current_cold
                print(f" Items: {i:,} | Hot: {current_hot} | Cold: {current_cold}")
        elapsed = time.time() - start_time
        # Test access patterns (LRU behavior)
        print("\nTesting LRU behavior...")
        # Access some old items
        # NOTE(review): mutating the fetched dict assumes the write persists
        # in the cache even for items served from cold storage -- confirm
        # against SpaceTimeDict's semantics.
        for i in range(0, 100, 10):
            key = f"key_{i:06d}"
            value = cache[key]
            value['accessed'] += 1
        # Add more items to trigger eviction
        for i in range(n_items, n_items + 1000):
            cache[f"key_{i:06d}"] = {'id': i, 'data': 'x' * value_size}
        # Recent items should still be hot
        stats = cache.get_stats()
        print(f"\nResults:")
        print(f" Total items: {len(cache):,}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Hot items: {len(cache._hot_data)}")
        print(f" Cold items: {len(cache._cold_keys)}")
        print(f" Stats: {stats}")
        # Verify all items accessible (100 random original keys).
        sample_keys = random.sample([f"key_{i:06d}" for i in range(n_items)], 100)
        for key in sample_keys:
            self.assertIn(key, cache)
            value = cache[key]
            self.assertIsNotNone(value)

    def test_algorithm_memory_scaling(self):
        """Test that algorithms scale with √n memory usage."""
        print("\n=== Testing algorithm memory scaling ===")
        # Sizes chosen so √n doubles each step: √n = 100, 200, 300, 400.
        datasets = [10_000, 40_000, 90_000, 160_000]  # n, 4n, 9n, 16n
        results = []
        for n in datasets:
            print(f"\nTesting with n = {n:,}")
            # Generate data
            data = [random.randint(1, 1_000_000) for _ in range(n)]
            # Measure memory for sorting
            gc.collect()
            mem_before = self.process.memory_info().rss / 1024 / 1024
            sorted_data = external_sort(data)
            gc.collect()
            mem_after = self.process.memory_info().rss / 1024 / 1024
            mem_used = mem_after - mem_before
            # Verify correctness (first 1000 adjacent pairs).
            self.assertEqual(len(sorted_data), n)
            for i in range(min(1000, len(sorted_data) - 1)):
                self.assertLessEqual(sorted_data[i], sorted_data[i + 1])
            sqrt_n = int(n ** 0.5)
            results.append({
                'n': n,
                'sqrt_n': sqrt_n,
                'memory_used': mem_used,
                'ratio': mem_used / max(sqrt_n * 8 / 1024 / 1024, 0.001)  # 8 bytes per int
            })
            print(f" √n = {sqrt_n:,}")
            print(f" Memory used: {mem_used:.2f} MB")
            print(f" Ratio to theoretical: {results[-1]['ratio']:.2f}x")
        # Verify √n scaling
        print("\nScaling Analysis:")
        print("n | √n | Memory (MB) | Ratio")
        print("---------|---------|-------------|-------")
        for r in results:
            print(f"{r['n']:8,} | {r['sqrt_n']:7,} | {r['memory_used']:11.2f} | {r['ratio']:6.2f}x")
        # Memory should scale roughly with √n
        # As n increases 4x, memory should increase ~2x
        for i in range(1, len(results)):
            n_ratio = results[i]['n'] / results[i-1]['n']
            mem_ratio = results[i]['memory_used'] / max(results[i-1]['memory_used'], 0.1)
            expected_ratio = n_ratio ** 0.5
            print(f"\nn increased {n_ratio:.1f}x, memory increased {mem_ratio:.1f}x "
            f"(expected ~{expected_ratio:.1f}x)")
            # Allow some variance due to overheads
            self.assertLess(mem_ratio, expected_ratio * 3,
            f"Memory scaling worse than √n: {mem_ratio:.1f}x vs {expected_ratio:.1f}x")

    def test_concurrent_memory_pressure(self):
        """Test behavior under concurrent access with memory pressure."""
        print("\n=== Testing concurrent access under memory pressure ===")
        import threading
        import queue
        array = SpaceTimeArray(threshold=500)
        errors = queue.Queue()  # thread-safe collector for worker exceptions
        n_threads = 4
        items_per_thread = 25_000

        def worker(thread_id, start_idx):
            # Appends its slice of items; every 100th iteration also reads a
            # random element to mix reads with writes.
            try:
                for i in range(items_per_thread):
                    item = {
                        'thread': thread_id,
                        'index': start_idx + i,
                        'data': f"thread_{thread_id}_item_{i}" * 50
                    }
                    array.append(item)
                    # Occasionally read random items
                    if i % 100 == 0 and len(array) > 10:
                        idx = random.randint(0, len(array) - 1)
                        _ = array[idx]
            except Exception as e:
                errors.put((thread_id, str(e)))
        # Start threads
        threads = []
        start_time = time.time()
        for i in range(n_threads):
            t = threading.Thread(
                target=worker,
                args=(i, i * items_per_thread)
            )
            threads.append(t)
            t.start()
        # Monitor memory while threads run.
        # NOTE(review): if all workers finish before this loop starts,
        # max_memory stays 0 and the memory assertion passes trivially.
        max_memory = 0
        while any(t.is_alive() for t in threads):
            current_memory = self.process.memory_info().rss / 1024 / 1024
            max_memory = max(max_memory, current_memory)
            time.sleep(0.1)
        # Wait for completion
        for t in threads:
            t.join()
        elapsed = time.time() - start_time
        # Check for errors
        error_list = []
        while not errors.empty():
            error_list.append(errors.get())
        print(f"\nResults:")
        print(f" Threads: {n_threads}")
        print(f" Total items: {n_threads * items_per_thread:,}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Max memory: {max_memory:.1f} MB")
        print(f" Errors: {len(error_list)}")
        print(f" Final array size: {len(array):,}")
        # Assertions
        self.assertEqual(len(error_list), 0, f"Thread errors: {error_list}")
        self.assertEqual(len(array), n_threads * items_per_thread)
        self.assertLess(max_memory, 200)  # Should handle memory pressure
# Allow running this test module directly (outside a test runner).
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Tests for SpaceTimeArray with memory pressure simulation.
"""
import unittest
import tempfile
import shutil
import os
import gc
import psutil
from sqrtspace_spacetime import SpaceTimeArray, SpaceTimeConfig
class TestSpaceTimeArray(unittest.TestCase):
    """Test SpaceTimeArray functionality.

    Covers basic list-like operations, automatic spillover to disk,
    behavior under memory pressure, √n hot-set sizing, persistence of
    spilled data across instances, and thread-safe concurrent access.

    NOTE(review): some tests touch private internals (``_hot_data``,
    ``_cold_indices``, ``_check_and_spill``) and will break if the
    library's internal layout changes.
    """

    def setUp(self):
        """Set up test environment."""
        self.temp_dir = tempfile.mkdtemp()
        # Spillover storage goes to the per-test temp dir.
        SpaceTimeConfig.set_defaults(
            storage_path=self.temp_dir,
            memory_limit=50 * 1024 * 1024,  # 50MB for testing
            chunk_strategy='sqrt_n'
        )

    def tearDown(self):
        """Clean up test environment."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_basic_operations(self):
        """Test basic array operations."""
        array = SpaceTimeArray(threshold=100)
        # Test append
        for i in range(50):
            array.append(f"item_{i}")
        self.assertEqual(len(array), 50)
        self.assertEqual(array[0], "item_0")
        self.assertEqual(array[49], "item_49")
        # Test negative indexing
        self.assertEqual(array[-1], "item_49")
        self.assertEqual(array[-50], "item_0")
        # Test slice
        slice_result = array[10:20]
        self.assertEqual(len(slice_result), 10)
        self.assertEqual(slice_result[0], "item_10")

    def test_automatic_spillover(self):
        """Test automatic spillover to disk."""
        # Create array with small threshold so spillover triggers quickly.
        array = SpaceTimeArray(threshold=10)
        # Add more items than threshold
        for i in range(100):
            array.append(f"value_{i}")
        # Check that spillover happened: some items cold, hot set bounded.
        self.assertEqual(len(array), 100)
        self.assertGreater(len(array._cold_indices), 0)
        self.assertLessEqual(len(array._hot_data), array.threshold)
        # Verify all items are accessible (including spilled ones).
        for i in range(100):
            self.assertEqual(array[i], f"value_{i}")

    def test_memory_pressure_handling(self):
        """Test behavior under memory pressure."""
        # Create array with auto threshold
        array = SpaceTimeArray()
        # Generate large data items
        large_item = "x" * 10000  # 10KB string
        # Add items until memory pressure detected
        for i in range(1000):
            array.append(f"{large_item}_{i}")
            # Check memory usage periodically
            if i % 100 == 0:
                process = psutil.Process()
                memory_mb = process.memory_info().rss / 1024 / 1024
                # Ensure we're not using excessive memory
                self.assertLess(memory_mb, 200, f"Memory usage too high at iteration {i}")
        # Verify all items still accessible (check the id suffixes).
        self.assertEqual(len(array), 1000)
        self.assertTrue(array[0].endswith("_0"))
        self.assertTrue(array[999].endswith("_999"))

    def test_large_dataset_sqrt_n_memory(self):
        """Test √n memory usage with large dataset."""
        # Configure for sqrt_n strategy
        SpaceTimeConfig.set_defaults(chunk_strategy='sqrt_n')
        n = 10000  # Total items
        sqrt_n = int(n ** 0.5)  # Expected memory items (100 for n=10000)
        array = SpaceTimeArray()
        # Track initial memory
        gc.collect()
        process = psutil.Process()
        initial_memory = process.memory_info().rss
        # Add n items
        for i in range(n):
            array.append({"id": i, "data": f"item_{i}" * 10})
        # Force garbage collection
        gc.collect()
        # Check memory usage
        final_memory = process.memory_info().rss
        memory_increase_mb = (final_memory - initial_memory) / 1024 / 1024
        # Verify sqrt_n behavior: hot set within 2x of √n, rest spilled.
        self.assertEqual(len(array), n)
        self.assertLessEqual(len(array._hot_data), sqrt_n * 2)  # Allow some buffer
        self.assertGreater(len(array._cold_indices), n - sqrt_n * 2)
        # Memory should be much less than storing all items
        # Rough estimate: each item ~100 bytes, so n items = ~1MB
        # With sqrt_n, should use ~10KB in memory
        self.assertLess(memory_increase_mb, 10, f"Memory increase {memory_increase_mb}MB is too high")
        # Verify random access still works
        import random
        for _ in range(100):
            idx = random.randint(0, n - 1)
            self.assertEqual(array[idx]["id"], idx)

    def test_persistence_across_sessions(self):
        """Test data persistence when array is recreated."""
        storage_path = os.path.join(self.temp_dir, "persist_test")
        # Create and populate array
        array1 = SpaceTimeArray(threshold=10, storage_path=storage_path)
        for i in range(50):
            array1.append(f"persistent_{i}")
        # Force spillover
        # NOTE(review): relies on the private _check_and_spill() hook to
        # flush hot items to disk before the instance is dropped.
        array1._check_and_spill()
        del array1
        # Create new array with same storage path
        array2 = SpaceTimeArray(threshold=10, storage_path=storage_path)
        # Data should be accessible
        self.assertEqual(len(array2), 50)
        for i in range(50):
            self.assertEqual(array2[i], f"persistent_{i}")

    def test_concurrent_access(self):
        """Test thread-safe access to array."""
        import threading
        array = SpaceTimeArray(threshold=100)
        errors = []  # NOTE(review): plain list append is GIL-atomic in CPython

        def writer(start, count):
            # Appends `count` items; any exception is recorded, not raised.
            try:
                for i in range(start, start + count):
                    array.append(f"thread_{i}")
            except Exception as e:
                errors.append(e)

        def reader(count):
            # Repeatedly reads the first element while writers run.
            try:
                for _ in range(count):
                    if len(array) > 0:
                        _ = array[0]  # Just access, don't verify
            except Exception as e:
                errors.append(e)
        # Create threads: 5 writers (100 items each) and 3 readers.
        threads = []
        for i in range(5):
            t = threading.Thread(target=writer, args=(i * 100, 100))
            threads.append(t)
        for i in range(3):
            t = threading.Thread(target=reader, args=(50,))
            threads.append(t)
        # Run threads
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        # Check for errors
        self.assertEqual(len(errors), 0, f"Thread errors: {errors}")
        self.assertEqual(len(array), 500)
# Allow running this test module directly (outside a test runner).
if __name__ == "__main__":
    unittest.main()