Initial
This commit is contained in:
234
tests/test_external_algorithms.py
Normal file
234
tests/test_external_algorithms.py
Normal file
@@ -0,0 +1,234 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for external algorithms with memory pressure.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
import random
|
||||
import gc
|
||||
import psutil
|
||||
import time
|
||||
from sqrtspace_spacetime import external_sort, external_groupby, SpaceTimeConfig
|
||||
|
||||
|
||||
class TestExternalAlgorithms(unittest.TestCase):
    """Test external algorithms under memory constraints.

    Every test builds its input in memory, runs ``external_sort`` and/or
    ``external_groupby`` on it, and compares the process's resident set
    size (RSS, read through ``psutil``) before and after the operation.
    The RSS thresholds asserted here are coarse upper bounds in megabytes.
    """

    # Fixed seed so that a failing run can be reproduced exactly.
    RANDOM_SEED = 1234

    def setUp(self):
        """Set up test environment.

        Applies the library defaults used by every test, attaches a
        ``psutil.Process`` handle for RSS readings and reseeds ``random``.
        """
        # NOTE(review): set_defaults presumably changes process-wide state and
        # nothing restores it after the test -- confirm whether a tearDown
        # that resets the configuration is needed.
        SpaceTimeConfig.set_defaults(
            memory_limit=100 * 1024 * 1024,  # 100MB limit
            chunk_strategy='sqrt_n'
        )
        self.process = psutil.Process()
        random.seed(self.RANDOM_SEED)

    def _rss_mb(self):
        """Return the current resident set size of this process in MB."""
        return self.process.memory_info().rss / 1024 / 1024

    def test_external_sort_small(self):
        """Test external sort with small dataset."""
        data = [random.randint(1, 1000) for _ in range(1000)]
        sorted_data = external_sort(data)

        # Checked first because a length mismatch produces a far shorter
        # failure message than a full list diff.
        self.assertEqual(len(sorted_data), len(data))

        # Equality with the built-in sort proves both the ordering and that
        # no element was lost or duplicated, so no pairwise loop is needed.
        self.assertEqual(sorted(data), sorted_data)

    def test_external_sort_large_with_memory_tracking(self):
        """Test external sort with large dataset and memory tracking.

        A background thread samples RSS while ``external_sort`` runs so that
        the reported peak reflects memory used *during* the sort, not only
        the readings taken before and after it.
        """
        # Imported locally: this is the only test that needs a sampler thread.
        import threading

        n = 1_000_000  # 1 million items

        # Generate data
        print(f"\nGenerating {n:,} random integers...")
        data = [random.randint(1, 10_000_000) for _ in range(n)]

        # Track memory before sorting
        gc.collect()
        memory_before = self._rss_mb()
        memory_samples = [memory_before]

        def monitor_memory():
            """Record one RSS reading and return it."""
            current = self._rss_mb()
            memory_samples.append(current)
            return current

        stop_sampling = threading.Event()

        def sample_until_stopped():
            """Take an RSS reading every 50 ms until the event is set."""
            # Event.wait() returns False on timeout and True once the event
            # is set, so the loop ends promptly when the sort finishes.
            while not stop_sampling.wait(0.05):
                monitor_memory()

        sampler = threading.Thread(target=sample_until_stopped, daemon=True)

        # Sort with memory tracking
        print("Sorting with external_sort...")
        start_time = time.perf_counter()
        sampler.start()
        try:
            sorted_data = external_sort(data)
            elapsed = time.perf_counter() - start_time
        finally:
            # Always stop the sampler, even if the sort raises.
            stop_sampling.set()
            sampler.join()

        # One more reading before garbage collection can release anything.
        monitor_memory()

        # Measure final state
        gc.collect()
        memory_after = self._rss_mb()
        peak_memory = max(memory_samples)

        # Guard against a truncated result before indexing into it.
        self.assertEqual(len(sorted_data), n)

        # Spot-check ordering on every 10,000th adjacent pair.
        for i in range(0, n - 1, 10000):
            self.assertLessEqual(sorted_data[i], sorted_data[i + 1])

        # Calculate statistics
        memory_increase = memory_after - memory_before
        theoretical_sqrt_n = int(n ** 0.5)

        print("\nExternal Sort Statistics:")
        print(f" Items sorted: {n:,}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Memory before: {memory_before:.1f} MB")
        print(f" Memory after: {memory_after:.1f} MB")
        print(f" Peak memory: {peak_memory:.1f} MB")
        print(f" Memory increase: {memory_increase:.1f} MB")
        print(f" Theoretical √n: {theoretical_sqrt_n:,} items")
        # Clamp the divisor so a zero or negative increase cannot divide by 0.
        print(f" Items per MB: {n / max(memory_increase, 0.1):,.0f}")

        # Verify memory efficiency
        # With 1M items, sqrt(n) = 1000, so memory should be much less than full dataset
        self.assertLess(memory_increase, 50, f"Memory increase {memory_increase:.1f} MB is too high")

        # Verify correctness on a random sample of adjacent pairs
        sample_indices = random.sample(range(n - 1), min(1000, n - 1))
        for i in sample_indices:
            self.assertLessEqual(sorted_data[i], sorted_data[i + 1])

    def test_external_groupby_memory_efficiency(self):
        """Test external groupby with memory tracking."""
        n = 100_000

        # Generate data with limited number of groups
        print(f"\nGenerating {n:,} items for groupby...")
        categories = [f"category_{i}" for i in range(100)]
        data = [
            {
                "id": i,
                "category": random.choice(categories),
                "value": random.randint(1, 1000),
                "data": f"data_{i}" * 10,  # Make items larger
            }
            for i in range(n)
        ]

        # Track memory
        gc.collect()
        memory_before = self._rss_mb()

        # Group by category
        print("Grouping by category...")
        start_time = time.perf_counter()
        grouped = external_groupby(data, key_func=lambda x: x["category"])
        elapsed = time.perf_counter() - start_time

        # Measure memory
        gc.collect()
        memory_after = self._rss_mb()
        memory_increase = memory_after - memory_before

        print("\nExternal GroupBy Statistics:")
        print(f" Items grouped: {n:,}")
        print(f" Groups created: {len(grouped)}")
        print(f" Time taken: {elapsed:.2f} seconds")
        print(f" Memory increase: {memory_increase:.1f} MB")
        print(f" Items per MB: {n / max(memory_increase, 0.1):,.0f}")

        # Verify correctness: every category appears and no item is lost
        self.assertEqual(len(grouped), len(categories))
        total_items = sum(len(group) for group in grouped.values())
        self.assertEqual(total_items, n)

        # Verify grouping
        for category, items in grouped.items():
            for item in items[:10]:  # Check first 10 items in each group
                self.assertEqual(item["category"], category)

        # Memory should be reasonable
        self.assertLess(memory_increase, 100, f"Memory increase {memory_increase:.1f} MB is too high")

    def test_stress_test_combined_operations(self):
        """Stress test with combined operations.

        Groups 50,000 records into 50 groups, sorts each group by ``value``,
        keeps the last 10 records of every sorted group and finally sorts
        those by ``score`` in descending order.
        """
        n = 50_000

        print(f"\nRunning stress test with {n:,} items...")

        # Generate complex data
        data = [
            {
                "id": i,
                "group": f"group_{i % 50}",
                "value": random.randint(1, 1000),
                "score": random.random(),
                "text": f"This is item {i} with some text" * 5,
            }
            for i in range(n)
        ]

        # Track initial memory
        gc.collect()
        initial_memory = self._rss_mb()

        # Operation 1: Group by
        print(" 1. Grouping data...")
        grouped = external_groupby(data, key_func=lambda x: x["group"])

        # Operation 2: Sort each group
        print(" 2. Sorting each group...")
        # Iterate over a snapshot of the keys so that rebinding values cannot
        # interfere with iteration, whatever mapping type is returned.
        for group_key in list(grouped):
            # Sort by value
            grouped[group_key] = external_sort(
                grouped[group_key],
                key=lambda x: x["value"]
            )

        # Operation 3: Extract top items from each group
        print(" 3. Extracting top items...")
        top_items = []
        for group_items in grouped.values():
            # Groups are sorted ascending, so the top 10 by value are the last 10
            top_items.extend(group_items[-10:])

        # Operation 4: Final sort
        print(" 4. Final sort of top items...")
        final_sorted = external_sort(
            top_items,
            key=lambda x: x["score"],
            reverse=True
        )

        # Measure final memory
        gc.collect()
        final_memory = self._rss_mb()
        total_memory_increase = final_memory - initial_memory

        print("\nStress Test Results:")
        print(f" Initial memory: {initial_memory:.1f} MB")
        print(f" Final memory: {final_memory:.1f} MB")
        print(f" Total increase: {total_memory_increase:.1f} MB")
        print(f" Groups processed: {len(grouped)}")
        print(f" Top items selected: {len(top_items)}")

        # Verify results
        self.assertEqual(len(grouped), 50)  # 50 groups
        self.assertEqual(len(top_items), 50 * 10)  # Top 10 from each
        self.assertEqual(len(final_sorted), len(top_items))

        # Verify sorting: scores must be non-increasing
        for current, following in zip(final_sorted, final_sorted[1:]):
            self.assertGreaterEqual(current["score"], following["score"])

        # Memory should still be reasonable after all operations
        self.assertLess(
            total_memory_increase,
            150,
            f"Memory increase {total_memory_increase:.1f} MB is too high"
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the suite with unittest's command-line runner when this file is
    # executed directly rather than imported by a test collector.
    unittest.main()
|
||||
Reference in New Issue
Block a user