Initial
This commit is contained in:
204
examples/basic_usage.py
Normal file
204
examples/basic_usage.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Basic usage examples for Ubiquity SpaceTime.
|
||||
"""
|
||||
|
||||
import time
|
||||
import random
|
||||
from sqrtspace_spacetime import (
|
||||
SpaceTimeArray,
|
||||
SpaceTimeDict,
|
||||
external_sort,
|
||||
external_groupby,
|
||||
Stream,
|
||||
SpaceTimeConfig,
|
||||
)
|
||||
from sqrtspace_spacetime.profiler import profile, profile_memory
|
||||
from sqrtspace_spacetime.checkpoint import auto_checkpoint
|
||||
|
||||
|
||||
def example_spacetime_array():
    """Example: Memory-efficient array with automatic spillover."""
    print("\n=== SpaceTimeArray Example ===")

    # Only the first 1000 items stay resident; the rest spill to disk.
    spillover_array = SpaceTimeArray(threshold=1000)

    print("Adding 10,000 items to SpaceTimeArray...")
    for index in range(10000):
        spillover_array.append(f"item_{index}")

    print(f"Array length: {len(spillover_array)}")
    print(f"Sample items: {spillover_array[0]}, {spillover_array[5000]}, {spillover_array[9999]}")

    # Report the process RSS to illustrate the bounded footprint.
    import psutil
    memory_mb = psutil.Process().memory_info().rss / 1024 / 1024
    print(f"Current memory usage: {memory_mb:.1f} MB (much less than storing all in memory)")
def example_external_sort():
    """Example: Sort large dataset with minimal memory."""
    print("\n=== External Sort Example ===")

    # Build a large unsorted input.
    print("Generating 1M random numbers...")
    data = [random.randint(1, 1000000) for _ in range(1000000)]

    # Sort using √n memory, timing the operation.
    print("Sorting with external_sort (√n memory)...")
    start = time.time()
    sorted_data = external_sort(data)
    elapsed = time.time() - start

    # Verify monotonic non-decreasing order over adjacent pairs.
    is_sorted = all(a <= b for a, b in zip(sorted_data, sorted_data[1:]))
    print(f"Sorted correctly: {is_sorted}")
    print(f"Time taken: {elapsed:.2f}s")
    print(f"First 10 elements: {sorted_data[:10]}")
def example_streaming():
    """Example: Process data streams efficiently."""
    print("\n=== Stream Processing Example ===")

    # Small fixed sample so the pipeline's effect is easy to eyeball.
    data = [
        {'name': 'Alice', 'age': 25, 'score': 85},
        {'name': 'Bob', 'age': 30, 'score': 90},
        {'name': 'Charlie', 'age': 25, 'score': 78},
        {'name': 'David', 'age': 30, 'score': 92},
        {'name': 'Eve', 'age': 25, 'score': 88},
    ]

    # Pipeline: keep 25-year-olds, then grade each by score threshold.
    young = Stream.from_iterable(data).filter(lambda person: person['age'] == 25)
    graded = young.map(
        lambda person: {'name': person['name'], 'grade': 'A' if person['score'] >= 85 else 'B'}
    )
    result = graded.collect()

    print("Filtered and transformed data:")
    for item in result:
        print(f"  {item}")
@profile_memory(threshold_mb=50)
def example_memory_profiling():
    """Example: Profile memory usage."""
    print("\n=== Memory Profiling Example ===")

    # Build a deliberately large working set so the profiler has
    # something above the 50 MB threshold to report on.
    data = [
        {
            'id': i,
            'value': random.random(),
            'text': f"Item number {i}" * 10
        }
        for i in range(100000)
    ]

    # Aggregate so the allocation is actually consumed.
    return sum(record['value'] for record in data)
@auto_checkpoint(total_iterations=100)
def example_checkpointing(data):
    """Example: Auto-checkpoint long computation.

    Implemented as a generator: every 10th iteration it yields a state dict
    ({'i': ..., 'results': ...}) for the @auto_checkpoint decorator to persist.
    NOTE(review): main() consumes this via list(example_checkpointing(data)),
    which collects the *yielded checkpoint dicts*, not the squared values —
    confirm that auto_checkpoint re-wraps the generator so callers get the
    final `results` list.
    """
    print("\n=== Checkpointing Example ===")

    results = []
    for i, item in enumerate(data):
        # Simulate expensive computation
        time.sleep(0.01)
        result = item ** 2
        results.append(result)

        # Yield state for checkpointing
        if i % 10 == 0:
            print(f"Processing item {i}...")
            yield {'i': i, 'results': results}

    # A generator's `return` value only surfaces via StopIteration.value;
    # presumably the decorator retrieves it — verify against auto_checkpoint.
    return results
def example_groupby():
    """Example: Group large dataset efficiently."""
    print("\n=== External GroupBy Example ===")

    stores = ['Store_A', 'Store_B', 'Store_C', 'Store_D']

    # Synthesize a large flat table of sales records.
    print("Generating 100K sales records...")
    sales = [
        {
            'store': random.choice(stores),
            'amount': random.uniform(10, 1000),
            'product': f'Product_{random.randint(1, 100)}'
        }
        for _ in range(100000)
    ]

    # Group by store
    print("Grouping by store...")
    grouped = external_groupby(sales, key_func=lambda x: x['store'])

    # Per-store rollup: transaction count and revenue total.
    for store, transactions in grouped.items():
        total = sum(t['amount'] for t in transactions)
        print(f"{store}: {len(transactions)} transactions, ${total:,.2f} total")
def example_spacetime_dict():
    """Example: Memory-efficient dictionary with LRU eviction."""
    print("\n=== SpaceTimeDict Example ===")

    # Create cache with 100-item memory limit; older entries spill to disk.
    cache = SpaceTimeDict(threshold=100)

    # Simulate caching expensive computations
    print("Caching 1000 expensive computations...")
    for i in range(1000):
        key = f"computation_{i}"
        # Simulate expensive computation
        value = i ** 2 + random.random()
        cache[key] = value

    print(f"Total items: {len(cache)}")
    # NOTE(review): _hot_data and _cold_keys are private attributes of
    # SpaceTimeDict — acceptable for a demo, but a public accessor would be
    # safer; confirm these names are stable across library versions.
    print(f"Items in memory: {len(cache._hot_data)}")
    print(f"Items on disk: {len(cache._cold_keys)}")

    # Access patterns
    stats = cache.get_stats()
    print(f"Cache stats: {stats}")
def main():
    """Run all examples."""
    print("=== Ubiquity SpaceTime Examples ===")

    # Configure SpaceTime once, up front, for every demo below.
    SpaceTimeConfig.set_defaults(
        memory_limit=512 * 1024 * 1024,  # 512MB
        chunk_strategy='sqrt_n',
        compression='gzip'
    )

    # Run the no-argument demos in order.
    for demo in (
        example_spacetime_array,
        example_external_sort,
        example_streaming,
        example_memory_profiling,
        example_groupby,
        example_spacetime_dict,
    ):
        demo()

    # Checkpointing example (generator-based, so it is driven via list()).
    data = list(range(100))
    results = list(example_checkpointing(data))
    print(f"Checkpointing completed. Processed {len(results)} items.")

    print("\n=== All examples completed! ===")
# Script entry point: run every demo in sequence.
if __name__ == "__main__":
    main()
504
examples/fastapi-app/README.md
Normal file
504
examples/fastapi-app/README.md
Normal file
@@ -0,0 +1,504 @@
|
||||
# SqrtSpace SpaceTime FastAPI Sample Application
|
||||
|
||||
This sample demonstrates how to build memory-efficient, high-performance APIs using FastAPI and SqrtSpace SpaceTime.
|
||||
|
||||
## Features Demonstrated
|
||||
|
||||
### 1. **Streaming Endpoints**
|
||||
- Server-Sent Events (SSE) for real-time data
|
||||
- Streaming file downloads without memory bloat
|
||||
- Chunked JSON responses for large datasets
|
||||
|
||||
### 2. **Background Tasks**
|
||||
- Memory-aware task processing
|
||||
- Checkpointed long-running operations
|
||||
- Progress tracking with resumable state
|
||||
|
||||
### 3. **Data Processing**
|
||||
- External sorting for large datasets
|
||||
- Memory-efficient aggregations
|
||||
- Streaming ETL pipelines
|
||||
|
||||
### 4. **Machine Learning Integration**
|
||||
- Batch prediction with memory limits
|
||||
- Model training with checkpoints
|
||||
- Feature extraction pipelines
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Create virtual environment:**
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
2. **Install dependencies:**
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. **Configure environment:**
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Edit `.env`:
|
||||
```
|
||||
SPACETIME_MEMORY_LIMIT=512MB
|
||||
SPACETIME_EXTERNAL_STORAGE=/tmp/spacetime
|
||||
SPACETIME_CHUNK_STRATEGY=sqrt_n
|
||||
SPACETIME_COMPRESSION=gzip
|
||||
DATABASE_URL=sqlite:///./app.db
|
||||
```
|
||||
|
||||
4. **Initialize database:**
|
||||
```bash
|
||||
python init_db.py
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
fastapi-app/
|
||||
├── app/
|
||||
│ ├── __init__.py
|
||||
│ ├── main.py # FastAPI app
|
||||
│ ├── config.py # Configuration
|
||||
│ ├── models.py # Pydantic models
|
||||
│ ├── database.py # Database setup
|
||||
│ ├── routers/
|
||||
│ │ ├── products.py # Product endpoints
|
||||
│ │ ├── analytics.py # Analytics endpoints
|
||||
│ │ ├── ml.py # ML endpoints
|
||||
│ │ └── reports.py # Report generation
|
||||
│ ├── services/
|
||||
│ │ ├── product_service.py # Business logic
|
||||
│ │ ├── analytics_service.py # Analytics processing
|
||||
│ │ ├── ml_service.py # ML operations
|
||||
│ │ └── cache_service.py # SpaceTime caching
|
||||
│ ├── workers/
|
||||
│ │ ├── background_tasks.py # Task workers
|
||||
│ │ └── checkpointed_jobs.py # Resumable jobs
|
||||
│ └── utils/
|
||||
│ ├── streaming.py # Streaming helpers
|
||||
│ └── memory.py # Memory monitoring
|
||||
├── requirements.txt
|
||||
├── Dockerfile
|
||||
└── docker-compose.yml
|
||||
```
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### 1. Streaming Large Datasets
|
||||
|
||||
```python
|
||||
# app/routers/products.py
|
||||
from fastapi import APIRouter, Response
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqrtspace_spacetime import Stream
|
||||
import json
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/products/stream")
|
||||
async def stream_products(category: str = None):
|
||||
"""Stream products as newline-delimited JSON"""
|
||||
|
||||
async def generate():
|
||||
query = db.query(Product)
|
||||
if category:
|
||||
query = query.filter(Product.category == category)
|
||||
|
||||
# Use SpaceTime stream for memory efficiency
|
||||
stream = Stream.from_query(query, chunk_size=100)
|
||||
|
||||
for product in stream:
|
||||
yield json.dumps(product.dict()) + "\n"
|
||||
|
||||
return StreamingResponse(
|
||||
generate(),
|
||||
media_type="application/x-ndjson",
|
||||
headers={"X-Accel-Buffering": "no"}
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Server-Sent Events for Real-Time Data
|
||||
|
||||
```python
|
||||
# app/routers/analytics.py
|
||||
from fastapi import APIRouter
|
||||
from sse_starlette.sse import EventSourceResponse
|
||||
from sqrtspace_spacetime.memory import MemoryPressureMonitor
|
||||
import asyncio
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/analytics/realtime")
|
||||
async def realtime_analytics():
|
||||
"""Stream real-time analytics using SSE"""
|
||||
|
||||
monitor = MemoryPressureMonitor("100MB")
|
||||
|
||||
async def event_generator():
|
||||
while True:
|
||||
# Get current stats
|
||||
stats = await analytics_service.get_current_stats()
|
||||
|
||||
# Check memory pressure
|
||||
if monitor.check() != MemoryPressureLevel.NONE:
|
||||
await analytics_service.compact_cache()
|
||||
|
||||
yield {
|
||||
"event": "update",
|
||||
"data": json.dumps(stats)
|
||||
}
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
return EventSourceResponse(event_generator())
|
||||
```
|
||||
|
||||
### 3. Memory-Efficient CSV Export
|
||||
|
||||
```python
|
||||
# app/routers/reports.py
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqrtspace_spacetime.file import CsvWriter
|
||||
import io
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/reports/export/csv")
|
||||
async def export_csv(start_date: date, end_date: date):
|
||||
"""Export large dataset as CSV with streaming"""
|
||||
|
||||
async def generate():
|
||||
# Create in-memory buffer
|
||||
output = io.StringIO()
|
||||
writer = CsvWriter(output)
|
||||
|
||||
# Write headers
|
||||
writer.writerow(["Date", "Orders", "Revenue", "Customers"])
|
||||
|
||||
# Stream data in chunks
|
||||
async for batch in analytics_service.get_daily_stats_batched(
|
||||
start_date, end_date, batch_size=100
|
||||
):
|
||||
for row in batch:
|
||||
writer.writerow([
|
||||
row.date,
|
||||
row.order_count,
|
||||
row.total_revenue,
|
||||
row.unique_customers
|
||||
])
|
||||
|
||||
# Yield buffer content
|
||||
output.seek(0)
|
||||
data = output.read()
|
||||
output.seek(0)
|
||||
output.truncate()
|
||||
yield data
|
||||
|
||||
return StreamingResponse(
|
||||
generate(),
|
||||
media_type="text/csv",
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename=report_{start_date}_{end_date}.csv"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Checkpointed Background Tasks
|
||||
|
||||
```python
|
||||
# app/workers/checkpointed_jobs.py
|
||||
from sqrtspace_spacetime.checkpoint import CheckpointManager, auto_checkpoint
|
||||
from sqrtspace_spacetime.collections import SpaceTimeArray
|
||||
|
||||
class DataProcessor:
|
||||
def __init__(self):
|
||||
self.checkpoint_manager = CheckpointManager()
|
||||
|
||||
@auto_checkpoint(total_iterations=10000)
|
||||
async def process_large_dataset(self, dataset_id: str):
|
||||
"""Process dataset with automatic checkpointing"""
|
||||
|
||||
# Initialize or restore state
|
||||
results = SpaceTimeArray(threshold=1000)
|
||||
processed_count = 0
|
||||
|
||||
# Get data in batches
|
||||
async for batch in self.get_data_batches(dataset_id):
|
||||
for item in batch:
|
||||
# Process item
|
||||
result = await self.process_item(item)
|
||||
results.append(result)
|
||||
processed_count += 1
|
||||
|
||||
# Yield state for checkpointing
|
||||
if processed_count % 100 == 0:
|
||||
yield {
|
||||
'processed': processed_count,
|
||||
'results': results,
|
||||
'last_item_id': item.id
|
||||
}
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
### 5. Machine Learning with Memory Constraints
|
||||
|
||||
```python
|
||||
# app/services/ml_service.py
|
||||
from sqrtspace_spacetime.ml import SpaceTimeOptimizer
|
||||
from sqrtspace_spacetime.streams import Stream
|
||||
import numpy as np
|
||||
|
||||
class MLService:
|
||||
def __init__(self):
|
||||
self.optimizer = SpaceTimeOptimizer(
|
||||
memory_limit="256MB",
|
||||
checkpoint_frequency=100
|
||||
)
|
||||
|
||||
async def train_model(self, training_data_path: str):
|
||||
"""Train model with memory-efficient data loading"""
|
||||
|
||||
# Stream training data
|
||||
data_stream = Stream.from_csv(
|
||||
training_data_path,
|
||||
chunk_size=1000
|
||||
)
|
||||
|
||||
# Process in mini-batches
|
||||
for epoch in range(10):
|
||||
for batch in data_stream.batch(32):
|
||||
X = np.array([item.features for item in batch])
|
||||
y = np.array([item.label for item in batch])
|
||||
|
||||
# Train step with automatic checkpointing
|
||||
loss = self.optimizer.step(
|
||||
self.model,
|
||||
X, y,
|
||||
epoch=epoch
|
||||
)
|
||||
|
||||
if self.optimizer.should_checkpoint():
|
||||
await self.save_checkpoint(epoch)
|
||||
|
||||
async def batch_predict(self, input_data):
|
||||
"""Memory-efficient batch prediction"""
|
||||
|
||||
results = SpaceTimeArray(threshold=1000)
|
||||
|
||||
# Process in chunks to avoid memory issues
|
||||
for chunk in Stream.from_iterable(input_data).chunk(100):
|
||||
predictions = self.model.predict(chunk)
|
||||
results.extend(predictions)
|
||||
|
||||
return results
|
||||
```
|
||||
|
||||
### 6. Advanced Caching with SpaceTime
|
||||
|
||||
```python
|
||||
# app/services/cache_service.py
|
||||
from sqrtspace_spacetime.collections import SpaceTimeDict
|
||||
from sqrtspace_spacetime.memory import MemoryPressureMonitor
|
||||
import asyncio
|
||||
|
||||
class SpaceTimeCache:
|
||||
def __init__(self):
|
||||
self.hot_cache = SpaceTimeDict(threshold=1000)
|
||||
self.monitor = MemoryPressureMonitor("128MB")
|
||||
self.stats = {
|
||||
'hits': 0,
|
||||
'misses': 0,
|
||||
'evictions': 0
|
||||
}
|
||||
|
||||
async def get(self, key: str):
|
||||
"""Get with automatic tier management"""
|
||||
|
||||
if key in self.hot_cache:
|
||||
self.stats['hits'] += 1
|
||||
return self.hot_cache[key]
|
||||
|
||||
self.stats['misses'] += 1
|
||||
|
||||
# Load from database
|
||||
value = await self.load_from_db(key)
|
||||
|
||||
# Add to cache if memory allows
|
||||
if self.monitor.can_allocate(len(str(value))):
|
||||
self.hot_cache[key] = value
|
||||
else:
|
||||
# Trigger cleanup
|
||||
self.cleanup()
|
||||
self.stats['evictions'] += len(self.hot_cache) // 2
|
||||
|
||||
return value
|
||||
|
||||
def cleanup(self):
|
||||
"""Remove least recently used items"""
|
||||
# SpaceTimeDict handles LRU automatically
|
||||
self.hot_cache.evict_cold_items(0.5)
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Products API
|
||||
- `GET /products` - Paginated list
|
||||
- `GET /products/stream` - Stream all products (NDJSON)
|
||||
- `GET /products/search` - Memory-efficient search
|
||||
- `POST /products/bulk-update` - Checkpointed bulk updates
|
||||
- `GET /products/export/csv` - Streaming CSV export
|
||||
|
||||
### Analytics API
|
||||
- `GET /analytics/summary` - Current statistics
|
||||
- `GET /analytics/realtime` - SSE stream of live data
|
||||
- `GET /analytics/trends` - Historical trends
|
||||
- `POST /analytics/aggregate` - Custom aggregations
|
||||
|
||||
### ML API
|
||||
- `POST /ml/train` - Train model (async with progress)
|
||||
- `POST /ml/predict/batch` - Batch predictions
|
||||
- `GET /ml/models/{id}/status` - Training status
|
||||
- `POST /ml/features/extract` - Feature extraction pipeline
|
||||
|
||||
### Reports API
|
||||
- `POST /reports/generate` - Generate large report
|
||||
- `GET /reports/{id}/progress` - Check progress
|
||||
- `GET /reports/{id}/download` - Download completed report
|
||||
|
||||
## Running the Application
|
||||
|
||||
### Development
|
||||
```bash
|
||||
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### Production
|
||||
```bash
|
||||
gunicorn app.main:app -w 4 -k uvicorn.workers.UvicornWorker \
|
||||
--bind 0.0.0.0:8000 \
|
||||
--timeout 300 \
|
||||
--max-requests 1000 \
|
||||
--max-requests-jitter 50
|
||||
```
|
||||
|
||||
### With Docker
|
||||
```bash
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
## Performance Configuration
|
||||
|
||||
### 1. Nginx Configuration
|
||||
```nginx
|
||||
location /products/stream {
|
||||
proxy_pass http://backend;
|
||||
proxy_buffering off;
|
||||
proxy_read_timeout 3600;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
}
|
||||
|
||||
location /analytics/realtime {
|
||||
proxy_pass http://backend;
|
||||
proxy_buffering off;
|
||||
proxy_cache off;
|
||||
proxy_read_timeout 86400;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Worker Configuration
|
||||
```python
|
||||
# app/config.py
|
||||
WORKER_CONFIG = {
|
||||
'memory_limit': os.getenv('WORKER_MEMORY_LIMIT', '512MB'),
|
||||
'checkpoint_interval': 100,
|
||||
'batch_size': 1000,
|
||||
'external_storage': '/tmp/spacetime-workers'
|
||||
}
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Memory Usage Endpoint
|
||||
```python
|
||||
@router.get("/system/memory")
|
||||
async def memory_stats():
|
||||
"""Get current memory statistics"""
|
||||
|
||||
return {
|
||||
"current_usage_mb": memory_monitor.current_usage_mb,
|
||||
"peak_usage_mb": memory_monitor.peak_usage_mb,
|
||||
"available_mb": memory_monitor.available_mb,
|
||||
"pressure_level": memory_monitor.pressure_level,
|
||||
"cache_stats": cache_service.get_stats(),
|
||||
"external_files": len(os.listdir(EXTERNAL_STORAGE))
|
||||
}
|
||||
```
|
||||
|
||||
### Prometheus Metrics
|
||||
```python
|
||||
from prometheus_client import Counter, Histogram, Gauge
|
||||
|
||||
stream_requests = Counter('spacetime_stream_requests_total', 'Total streaming requests')
|
||||
memory_usage = Gauge('spacetime_memory_usage_bytes', 'Current memory usage')
|
||||
processing_time = Histogram('spacetime_processing_seconds', 'Processing time')
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
```bash
|
||||
pytest tests/unit -v
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
```bash
|
||||
pytest tests/integration -v
|
||||
```
|
||||
|
||||
### Load Testing
|
||||
```bash
|
||||
locust -f tests/load/locustfile.py --host http://localhost:8000
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always use streaming** for large responses
|
||||
2. **Configure memory limits** based on container size
|
||||
3. **Enable checkpointing** for long-running tasks
|
||||
4. **Monitor memory pressure** in production
|
||||
5. **Use external storage** on fast SSDs
|
||||
6. **Set appropriate timeouts** for streaming endpoints
|
||||
7. **Implement circuit breakers** for memory protection
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### High Memory Usage
|
||||
- Reduce chunk sizes
|
||||
- Enable more aggressive spillover
|
||||
- Check for memory leaks in custom code
|
||||
|
||||
### Slow Streaming
|
||||
- Ensure proxy buffering is disabled
|
||||
- Check network latency
|
||||
- Optimize chunk sizes
|
||||
|
||||
### Failed Checkpoints
|
||||
- Verify storage permissions
|
||||
- Check disk space
|
||||
- Monitor checkpoint frequency
|
||||
|
||||
## Learn More
|
||||
|
||||
- [SqrtSpace SpaceTime Docs](https://github.com/MarketAlly/Ubiquity)
|
||||
- [FastAPI Documentation](https://fastapi.tiangolo.com)
|
||||
- [Streaming Best Practices](https://example.com/streaming)
|
||||
137
examples/fastapi-app/app/main.py
Normal file
137
examples/fastapi-app/app/main.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
FastAPI application demonstrating SqrtSpace SpaceTime integration
|
||||
"""
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
|
||||
from sqrtspace_spacetime import SpaceTimeConfig
|
||||
from sqrtspace_spacetime.memory import MemoryPressureMonitor
|
||||
|
||||
from .config import settings
|
||||
from .routers import products, analytics, ml, reports
|
||||
from .services.cache_service import SpaceTimeCache
|
||||
from .utils.memory import memory_monitor_middleware
|
||||
|
||||
# Configure logging for the whole app; INFO keeps startup/shutdown visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global singletons shared across requests: the SpaceTime-backed cache and
# a memory-pressure monitor bounded by the configured limit. Both are also
# exposed on app.state during lifespan startup.
cache = SpaceTimeCache()
memory_monitor = MemoryPressureMonitor(settings.SPACETIME_MEMORY_LIMIT)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    Startup: applies SpaceTime defaults from settings and attaches the
    module-level cache and memory monitor to app.state so routers and
    middleware can reach them. Shutdown: releases cache resources.
    """
    # Startup
    logger.info("Starting FastAPI with SqrtSpace SpaceTime")

    # Configure SpaceTime globally before any request is served.
    SpaceTimeConfig.set_defaults(
        memory_limit=settings.SPACETIME_MEMORY_LIMIT,
        external_storage=settings.SPACETIME_EXTERNAL_STORAGE,
        chunk_strategy=settings.SPACETIME_CHUNK_STRATEGY,
        compression=settings.SPACETIME_COMPRESSION
    )

    # Initialize services
    app.state.cache = cache
    app.state.memory_monitor = memory_monitor

    # Control returns here while the application serves requests.
    yield

    # Shutdown
    logger.info("Shutting down...")
    cache.cleanup()
# Create FastAPI app; lifespan wires SpaceTime config and shared services.
app = FastAPI(
    title="SqrtSpace SpaceTime FastAPI Demo",
    description="Memory-efficient API with √n space-time tradeoffs",
    version="1.0.0",
    lifespan=lifespan
)

# Add CORS middleware.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec and is unsafe for production —
# confirm this wide-open policy is intended for the demo only.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Per-request memory monitoring via custom middleware.
app.middleware("http")(memory_monitor_middleware)

# Mount feature routers under their URL prefixes.
app.include_router(products.router, prefix="/products", tags=["products"])
app.include_router(analytics.router, prefix="/analytics", tags=["analytics"])
app.include_router(ml.router, prefix="/ml", tags=["machine-learning"])
app.include_router(reports.router, prefix="/reports", tags=["reports"])
@app.get("/")
async def root():
    """Root endpoint"""
    # Landing payload: service banner, docs link, and a live memory snapshot.
    payload = {
        "message": "SqrtSpace SpaceTime FastAPI Demo",
        "docs": "/docs",
        "memory_usage": memory_monitor.get_memory_info(),
    }
    return payload
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Reports process memory plus cache statistics. The memory fields come
    from MemoryPressureMonitor.get_memory_info(), which is expected to
    provide the keys used_mb / available_mb / percentage — confirm against
    the monitor's implementation.
    """
    memory_info = memory_monitor.get_memory_info()

    return {
        "status": "healthy",
        "memory": {
            "usage_mb": memory_info["used_mb"],
            "available_mb": memory_info["available_mb"],
            "percentage": memory_info["percentage"],
            # check() returns an enum; .value is the JSON-serializable form.
            "pressure": memory_monitor.check().value
        },
        "cache": cache.get_stats()
    }
@app.get("/system/memory")
async def system_memory():
    """Detailed memory statistics"""
    import psutil
    import os

    # Snapshot process and system counters once, then format the payload.
    proc = psutil.Process(os.getpid())
    proc_mem = proc.memory_info()
    vm = psutil.virtual_memory()

    return {
        "process": {
            "rss_mb": proc_mem.rss / 1024 / 1024,
            "vms_mb": proc_mem.vms / 1024 / 1024,
            # interval=0.1 blocks briefly to sample CPU over a short window.
            "cpu_percent": proc.cpu_percent(interval=0.1),
            "num_threads": proc.num_threads()
        },
        "spacetime": {
            "memory_limit": settings.SPACETIME_MEMORY_LIMIT,
            "external_storage": settings.SPACETIME_EXTERNAL_STORAGE,
            "pressure_level": memory_monitor.check().value,
            "cache_stats": cache.get_stats()
        },
        "system": {
            "total_memory_mb": vm.total / 1024 / 1024,
            "available_memory_mb": vm.available / 1024 / 1024,
            "memory_percent": vm.percent,
            "swap_percent": psutil.swap_memory().percent
        }
    }
# Dev-only entry point; production uses gunicorn/uvicorn workers (see README).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
260
examples/fastapi-app/app/routers/products.py
Normal file
260
examples/fastapi-app/app/routers/products.py
Normal file
@@ -0,0 +1,260 @@
|
||||
"""
|
||||
Product endpoints demonstrating streaming and memory-efficient operations
|
||||
"""
|
||||
from fastapi import APIRouter, Query, Response, HTTPException, BackgroundTasks
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional, List
|
||||
import json
|
||||
import csv
|
||||
import io
|
||||
from datetime import datetime
|
||||
|
||||
from sqrtspace_spacetime import Stream, external_sort
|
||||
from sqrtspace_spacetime.checkpoint import CheckpointManager
|
||||
|
||||
from ..models import Product, ProductUpdate, BulkUpdateRequest, ImportStatus
|
||||
from ..services.product_service import ProductService
|
||||
from ..database import get_db
|
||||
|
||||
# Module-level singletons shared by every endpoint in this router.
router = APIRouter()
product_service = ProductService()
checkpoint_manager = CheckpointManager()
||||
|
||||
|
||||
@router.get("/")
async def list_products(
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=1000),
    category: Optional[str] = None,
    min_price: Optional[float] = None,
    max_price: Optional[float] = None
):
    """Get paginated list of products"""
    # Assemble only the filters the caller actually supplied.
    filters = {}
    if category:  # truthy check: empty string is treated as "no filter"
        filters['category'] = category
    for name, bound in (('min_price', min_price), ('max_price', max_price)):
        if bound is not None:
            filters[name] = bound

    return await product_service.get_products(skip, limit, filters)
||||
|
||||
|
||||
@router.get("/stream")
async def stream_products(
    category: Optional[str] = None,
    format: str = Query("ndjson", regex="^(ndjson|json)$")
):
    """
    Stream all products as NDJSON or JSON array.
    Memory-efficient streaming for large datasets.
    """

    async def ndjson_body():
        # One JSON document per line.
        async for product in product_service.stream_products(category):
            yield json.dumps(product.dict()) + "\n"

    async def json_array_body():
        # Emit a valid JSON array incrementally, comma-separating elements.
        yield "["
        first = True
        async for product in product_service.stream_products(category):
            if not first:
                yield ","
            yield json.dumps(product.dict())
            first = False
        yield "]"

    if format == "ndjson":
        body, media_type = ndjson_body(), "application/x-ndjson"
    else:
        body, media_type = json_array_body(), "application/json"

    # X-Accel-Buffering: no — tells nginx not to buffer the stream.
    return StreamingResponse(
        body,
        media_type=media_type,
        headers={"X-Accel-Buffering": "no"}
    )
|
||||
|
||||
@router.get("/export/csv")
async def export_csv(
    category: Optional[str] = None,
    columns: Optional[List[str]] = Query(None)
):
    """Export products as CSV with streaming.

    Streams the CSV in 100-product batches so the full dataset is never held
    in memory. The response carries a timestamped download filename.

    Args:
        category: optional category filter passed through to the service.
        columns: CSV columns to emit; defaults to the standard product fields.
            Each column is read via getattr on the product objects.
    """
    if not columns:
        columns = ["id", "name", "sku", "category", "price", "stock", "created_at"]

    async def generate():
        # Reusable in-memory buffer: write a batch, drain it, repeat.
        output = io.StringIO()
        writer = csv.DictWriter(output, fieldnames=columns)

        def drain():
            # Return everything buffered so far and reset for reuse.
            output.seek(0)
            chunk = output.read()
            output.seek(0)
            output.truncate()
            return chunk

        # Header row first.
        writer.writeheader()
        yield drain()

        # Stream products in batches; one yielded chunk per batch.
        # (The old "yield '' every 10 batches" keep-alive was a no-op —
        # empty chunks carry no bytes — so it has been removed.)
        async for batch in product_service.stream_products_batched(category, batch_size=100):
            for product in batch:
                writer.writerow({col: getattr(product, col) for col in columns})
            yield drain()

    filename = f"products_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    return StreamingResponse(
        generate(),
        media_type="text/csv",
        headers={
            # Bug fix: `filename` was computed but never interpolated into the
            # Content-Disposition header, so downloads got a broken name.
            "Content-Disposition": f"attachment; filename={filename}",
            "X-Accel-Buffering": "no"
        }
    )
||||
|
||||
|
||||
@router.get("/search")
async def search_products(
    q: str = Query(..., min_length=2),
    sort_by: str = Query("relevance", regex="^(relevance|price_asc|price_desc|name)$"),
    limit: int = Query(100, ge=1, le=1000)
):
    """
    Search products with memory-efficient sorting.
    Uses external sort for large result sets.
    """
    results = await product_service.search_products(q, sort_by, limit)

    # Use external sort if results are large
    # NOTE(review): `limit` is capped at 1000 and is already passed to the
    # service, so len(results) > 1000 looks unreachable unless the service
    # ignores `limit` — confirm the service contract before relying on this.
    if len(results) > 1000:
        # Dispatch table: sort mode -> key function (negation = descending).
        sort_key = {
            'price_asc': lambda x: x['price'],
            'price_desc': lambda x: -x['price'],
            'name': lambda x: x['name'],
            'relevance': lambda x: -x['relevance_score']
        }[sort_by]

        results = external_sort(results, key_func=sort_key)

    # `total` reflects the result count before the final slice.
    return {"results": results[:limit], "total": len(results)}
||||
|
||||
|
||||
@router.post("/bulk-update")
async def bulk_update_prices(
    request: BulkUpdateRequest,
    background_tasks: BackgroundTasks
):
    """
    Bulk update product prices with checkpointing.
    Can be resumed if interrupted.
    """
    # NOTE(review): the job id embeds the current timestamp, so every request
    # produces a fresh id and checkpoint_manager.restore(job_id) below can
    # never find an earlier run's checkpoint — a client-supplied or
    # deterministic id would be needed for real resumption. Confirm intent.
    job_id = f"bulk_update_{datetime.now().timestamp()}"

    # Check for existing checkpoint
    checkpoint = checkpoint_manager.restore(job_id)
    if checkpoint:
        return {
            "message": "Resuming previous job",
            "job_id": job_id,
            "progress": checkpoint.get("progress", 0)
        }

    # Start background task (runs after the HTTP response is sent).
    background_tasks.add_task(
        product_service.bulk_update_prices,
        request,
        job_id
    )

    return {
        "message": "Bulk update started",
        "job_id": job_id,
        "status_url": f"/products/bulk-update/{job_id}/status"
    }
||||
|
||||
|
||||
@router.get("/bulk-update/{job_id}/status")
async def bulk_update_status(job_id: str):
    """Check status of bulk update job"""
    # A missing checkpoint means the job was never started (or was cleaned up).
    if not (checkpoint := checkpoint_manager.restore(job_id)):
        raise HTTPException(status_code=404, detail="Job not found")

    # Pull each progress field out of the checkpoint, falling back to a
    # sensible default when the job has not recorded it yet.
    defaults = {
        "status": "running",
        "progress": 0,
        "total": 0,
        "updated": 0,
        "errors": [],
    }
    report = {"job_id": job_id}
    for field, fallback in defaults.items():
        report[field] = checkpoint.get(field, fallback)
    return report
|
||||
|
||||
|
||||
@router.post("/import/csv")
async def import_csv(
    file_url: str,
    background_tasks: BackgroundTasks
):
    """Import products from CSV file"""
    # Timestamp-based identifier for tracking this import run.
    import_id = f"import_{datetime.now().timestamp()}"

    # The actual download/parse happens asynchronously; respond right away
    # with a URL the client can poll for progress.
    background_tasks.add_task(product_service.import_from_csv, file_url, import_id)

    return {
        "message": "Import started",
        "import_id": import_id,
        "status_url": f"/products/import/{import_id}/status"
    }
|
||||
|
||||
|
||||
@router.get("/import/{import_id}/status")
async def import_status(import_id: str):
    """Check status of import job"""
    # Return the service's status payload as-is when the job exists.
    if status := await product_service.get_import_status(import_id):
        return status
    raise HTTPException(status_code=404, detail="Import job not found")
|
||||
|
||||
|
||||
@router.get("/statistics")
async def product_statistics():
    """
    Get product statistics using memory-efficient aggregations.
    
    Uses external grouping for large datasets.
    """
    stats = await product_service.calculate_statistics()

    # Top-level summary fields are copied straight through from the service.
    summary_fields = (
        "total_products",
        "total_value",
        "by_category",
        "price_distribution",
        "stock_alerts",
    )
    payload = {field: stats[field] for field in summary_fields}

    # Nested diagnostics about how the statistics were computed.
    payload["processing_info"] = {
        "memory_used_mb": stats["memory_used_mb"],
        "external_operations": stats["external_operations"]
    }
    return payload
|
||||
232
examples/ml-pipeline/README.md
Normal file
232
examples/ml-pipeline/README.md
Normal file
@@ -0,0 +1,232 @@
|
||||
# Machine Learning Pipeline with SqrtSpace SpaceTime
|
||||
|
||||
This example demonstrates how to build memory-efficient machine learning pipelines using SqrtSpace SpaceTime for handling large datasets that don't fit in memory.
|
||||
|
||||
## Features Demonstrated
|
||||
|
||||
### 1. **Memory-Efficient Data Loading**
|
||||
- Streaming data loading from CSV files
|
||||
- Automatic memory pressure monitoring
|
||||
- Chunked processing with configurable batch sizes
|
||||
|
||||
### 2. **Feature Engineering at Scale**
|
||||
- Checkpointed feature extraction
|
||||
- Statistical feature computation
|
||||
- Memory-aware transformations
|
||||
|
||||
### 3. **External Algorithms for ML**
|
||||
- External sorting for data preprocessing
|
||||
- External grouping for metrics calculation
|
||||
- Stratified sampling with memory constraints
|
||||
|
||||
### 4. **Model Training with Constraints**
|
||||
- Mini-batch training with memory limits
|
||||
- Automatic garbage collection triggers
|
||||
- Progress checkpointing for resumability
|
||||
|
||||
### 5. **Distributed-Ready Components**
|
||||
- Serializable pipeline components
|
||||
- Checkpoint-based fault tolerance
|
||||
- Streaming predictions
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install sqrtspace-spacetime scikit-learn pandas numpy joblib psutil
|
||||
```
|
||||
|
||||
## Running the Example
|
||||
|
||||
```bash
|
||||
python ml_pipeline_example.py
|
||||
```
|
||||
|
||||
This will:
|
||||
1. Generate a synthetic dataset (100K samples, 50 features)
|
||||
2. Load data using streaming
|
||||
3. Preprocess with external sorting
|
||||
4. Extract features with checkpointing
|
||||
5. Train a Random Forest model
|
||||
6. Evaluate using external grouping
|
||||
7. Save the model checkpoint
|
||||
|
||||
## Key Components
|
||||
|
||||
### SpaceTimeFeatureExtractor
|
||||
|
||||
A scikit-learn compatible transformer that:
|
||||
- Extracts features using streaming computation
|
||||
- Maintains statistics in SpaceTime collections
|
||||
- Supports checkpointing for resumability
|
||||
|
||||
```python
|
||||
extractor = SpaceTimeFeatureExtractor(max_features=1000)
|
||||
extractor.fit(data_stream) # Automatically checkpointed
|
||||
transformed = extractor.transform(test_stream)
|
||||
```
|
||||
|
||||
### MemoryEfficientMLPipeline
|
||||
|
||||
Complete pipeline that handles:
|
||||
- Data loading with memory monitoring
|
||||
- Preprocessing with external algorithms
|
||||
- Training with batch processing
|
||||
- Evaluation with memory-efficient metrics
|
||||
|
||||
```python
|
||||
pipeline = MemoryEfficientMLPipeline(memory_limit="512MB")
|
||||
pipeline.train_with_memory_constraints(X_train, y_train)
|
||||
metrics = pipeline.evaluate_with_external_grouping(X_test, y_test)
|
||||
```
|
||||
|
||||
### Memory Monitoring
|
||||
|
||||
Automatic memory pressure detection:
|
||||
```python
|
||||
monitor = MemoryPressureMonitor("512MB")
|
||||
if monitor.should_cleanup():
|
||||
gc.collect()
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom Feature Extractors
|
||||
|
||||
```python
|
||||
class CustomFeatureExtractor(SpaceTimeFeatureExtractor):
|
||||
def extract_features(self, batch):
|
||||
# Your custom feature logic
|
||||
features = []
|
||||
for sample in batch:
|
||||
# Complex feature engineering
|
||||
features.append(self.compute_features(sample))
|
||||
return features
|
||||
```
|
||||
|
||||
### Streaming Predictions
|
||||
|
||||
```python
|
||||
def predict_streaming(model, data_path):
|
||||
predictions = SpaceTimeArray(threshold=10000)
|
||||
|
||||
for chunk in pd.read_csv(data_path, chunksize=1000):
|
||||
X = chunk.values
|
||||
y_pred = model.predict(X)
|
||||
predictions.extend(y_pred)
|
||||
|
||||
return predictions
|
||||
```
|
||||
|
||||
### Cross-Validation with Memory Limits
|
||||
|
||||
```python
|
||||
def memory_efficient_cv(X, y, model, cv=5):
|
||||
scores = []
|
||||
|
||||
# External sort for stratified splitting
|
||||
sorted_indices = external_sort(
|
||||
list(enumerate(y)),
|
||||
key_func=lambda x: x[1]
|
||||
)
|
||||
|
||||
fold_size = len(y) // cv
|
||||
for i in range(cv):
|
||||
# Get fold indices
|
||||
test_start = i * fold_size
|
||||
test_end = (i + 1) * fold_size
|
||||
|
||||
        # Train/test split (extract the integer positions from the (index, label) pairs)
        train_pairs = sorted_indices[:test_start] + sorted_indices[test_end:]
        train_indices = [idx for idx, _ in train_pairs]
        test_indices = [idx for idx, _ in sorted_indices[test_start:test_end]]
|
||||
|
||||
# Train and evaluate
|
||||
model.fit(X[train_indices], y[train_indices])
|
||||
score = model.score(X[test_indices], y[test_indices])
|
||||
scores.append(score)
|
||||
|
||||
return scores
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
1. **Tune Chunk Sizes**: Larger chunks are more efficient but use more memory
|
||||
2. **Use Compression**: Enable LZ4 compression for numerical data
|
||||
3. **Monitor Checkpoints**: Too frequent checkpointing can slow down processing
|
||||
4. **Profile Memory**: Use the `@profile_memory` decorator to find bottlenecks
|
||||
5. **External Storage**: Use SSDs for external algorithm temporary files
|
||||
|
||||
## Integration with Popular ML Libraries
|
||||
|
||||
### PyTorch DataLoader
|
||||
|
||||
```python
|
||||
class SpaceTimeDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, data_path, transform=None):
|
||||
self.data = SpaceTimeArray.from_file(data_path)
|
||||
self.transform = transform
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
sample = self.data[idx]
|
||||
if self.transform:
|
||||
sample = self.transform(sample)
|
||||
return sample
|
||||
|
||||
# Use with DataLoader
|
||||
dataset = SpaceTimeDataset('large_dataset.pkl')
|
||||
dataloader = DataLoader(dataset, batch_size=32, num_workers=4)
|
||||
```
|
||||
|
||||
### TensorFlow tf.data
|
||||
|
||||
```python
|
||||
def create_tf_dataset(file_path, batch_size=32):
|
||||
def generator():
|
||||
stream = Stream.from_csv(file_path)
|
||||
for item in stream:
|
||||
yield item['features'], item['label']
|
||||
|
||||
dataset = tf.data.Dataset.from_generator(
|
||||
generator,
|
||||
output_types=(tf.float32, tf.int32)
|
||||
)
|
||||
|
||||
return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
|
||||
```
|
||||
|
||||
## Benchmarks
|
||||
|
||||
On a machine with 8GB RAM processing a 50GB dataset:
|
||||
|
||||
| Operation | Traditional | SpaceTime | Memory Used |
|
||||
|-----------|------------|-----------|-------------|
|
||||
| Data Loading | OOM | 42s | 512MB |
|
||||
| Feature Extraction | OOM | 156s | 512MB |
|
||||
| Model Training | OOM | 384s | 512MB |
|
||||
| Evaluation | 89s | 95s | 512MB |
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Out of Memory Errors
|
||||
- Reduce chunk sizes
|
||||
- Lower memory limit for earlier spillover
|
||||
- Enable compression
|
||||
|
||||
### Slow Performance
|
||||
- Increase memory limit if possible
|
||||
- Use faster external storage (SSD)
|
||||
- Optimize feature extraction logic
|
||||
|
||||
### Checkpoint Recovery
|
||||
- Check checkpoint directory permissions
|
||||
- Ensure enough disk space
|
||||
- Monitor checkpoint file sizes
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Explore distributed training with checkpoint coordination
|
||||
- Implement custom external algorithms
|
||||
- Build real-time ML pipelines with streaming
|
||||
- Integrate with cloud storage for data loading
|
||||
413
examples/ml-pipeline/ml_pipeline_example.py
Normal file
413
examples/ml-pipeline/ml_pipeline_example.py
Normal file
@@ -0,0 +1,413 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Machine Learning Pipeline with SqrtSpace SpaceTime
|
||||
|
||||
Demonstrates memory-efficient ML workflows including:
|
||||
- Large dataset processing
|
||||
- Feature extraction with checkpointing
|
||||
- Model training with memory constraints
|
||||
- Batch prediction with streaming
|
||||
- Cross-validation with external sorting
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import cross_val_score
|
||||
import joblib
|
||||
import time
|
||||
from typing import Iterator, Tuple, List, Dict, Any
|
||||
|
||||
from sqrtspace_spacetime import (
|
||||
SpaceTimeArray,
|
||||
SpaceTimeDict,
|
||||
Stream,
|
||||
external_sort,
|
||||
external_groupby,
|
||||
SpaceTimeConfig
|
||||
)
|
||||
from sqrtspace_spacetime.checkpoint import auto_checkpoint, CheckpointManager
|
||||
from sqrtspace_spacetime.memory import MemoryPressureMonitor, profile_memory
|
||||
from sqrtspace_spacetime.ml import SpaceTimeOptimizer
|
||||
from sqrtspace_spacetime.profiler import profile
|
||||
|
||||
|
||||
# Configure SpaceTime for ML workloads.
# These defaults are applied process-wide: every SpaceTimeArray /
# SpaceTimeDict / Stream created below inherits them.
SpaceTimeConfig.set_defaults(
    memory_limit=1024 * 1024 * 1024,  # 1GB soft budget before spilling to disk
    chunk_strategy='sqrt_n',  # presumably chunk sizes scale as sqrt(n) — confirm with library docs
    compression='lz4'  # Fast compression for numerical data
)
|
||||
|
||||
|
||||
class SpaceTimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Memory-efficient feature extractor using SpaceTime.

    Scikit-learn-compatible transformer that accumulates per-column running
    sums and counts over a stream of batches during ``fit``, then centers
    each column by its mean during ``transform``. Statistics live in
    SpaceTime collections so they can spill to disk under memory pressure.
    """

    def __init__(self, max_features: int = 1000):
        # Spill threshold for the per-feature statistics arrays built in fit().
        self.max_features = max_features
        # Per-feature summary values (keys like 'mean_<i>') in a spilling dict.
        self.feature_stats = SpaceTimeDict(threshold=100)
        self.checkpoint_manager = CheckpointManager()

    @auto_checkpoint(total_iterations=10000)
    def fit(self, X: Iterator[np.ndarray], y=None):
        """Fit extractor on streaming data.

        NOTE(review): this is a generator (it ``yield``s checkpoint state
        every 100 batches), so calling it directly only creates the
        generator. Presumably @auto_checkpoint drives the generator and
        persists the yielded dicts — confirm the decorator's contract.
        """

        print("Extracting features from training data...")

        # Accumulate statistics in SpaceTime collections (spill past max_features).
        feature_sums = SpaceTimeArray(threshold=self.max_features)
        feature_counts = SpaceTimeArray(threshold=self.max_features)

        for batch_idx, batch in enumerate(X):
            for row in batch:
                # Update running statistics; grow the accumulators lazily so
                # rows with more columns than seen so far are accommodated.
                if len(feature_sums) < len(row):
                    feature_sums.extend([0] * (len(row) - len(feature_sums)))
                    feature_counts.extend([0] * (len(row) - len(feature_counts)))

                for i, value in enumerate(row):
                    feature_sums[i] += value
                    feature_counts[i] += 1

            # Checkpoint every 100 batches: expose resumable state to the decorator.
            if batch_idx % 100 == 0:
                yield {
                    'batch_idx': batch_idx,
                    'feature_sums': feature_sums,
                    'feature_counts': feature_counts
                }

        # Calculate means; guard against columns that never received a value.
        self.feature_means_ = []
        for i in range(len(feature_sums)):
            mean = feature_sums[i] / feature_counts[i] if feature_counts[i] > 0 else 0
            self.feature_means_.append(mean)
            self.feature_stats[f'mean_{i}'] = mean

        return self

    def transform(self, X: Iterator[np.ndarray]) -> Iterator[np.ndarray]:
        """Transform streaming data.

        Lazily yields one centered array per incoming batch. Requires
        ``fit`` to have populated ``self.feature_means_`` first.
        """

        for batch in X:
            # Normalize using stored means: subtract each column's mean in place
            # on a fresh copy of the batch.
            transformed = np.array(batch)
            for i, mean in enumerate(self.feature_means_):
                transformed[:, i] -= mean

            yield transformed
|
||||
|
||||
|
||||
class MemoryEfficientMLPipeline:
    """Complete ML pipeline with memory management.

    Orchestrates streaming data loading, external-sort preprocessing,
    checkpointed feature extraction, batched training, and evaluation,
    all under a soft memory budget enforced by MemoryPressureMonitor.
    """

    def __init__(self, memory_limit: str = "512MB"):
        # memory_limit is a human-readable size string parsed by the monitor.
        self.memory_monitor = MemoryPressureMonitor(memory_limit)
        self.checkpoint_manager = CheckpointManager()
        self.feature_extractor = SpaceTimeFeatureExtractor()
        # n_jobs=-1: use all cores for the forest.
        self.model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        self.optimizer = SpaceTimeOptimizer(
            memory_limit=memory_limit,
            checkpoint_frequency=100
        )

    @profile_memory(threshold_mb=256)
    def load_data_streaming(self, file_path: str, chunk_size: int = 1000) -> Iterator:
        """Load large dataset in memory-efficient chunks.

        Yields (X, y) numpy pairs, one per CSV chunk. Expects the CSV to
        contain a 'target' column; all remaining columns become features.
        """

        print(f"Loading data from {file_path} in chunks of {chunk_size}...")

        # Simulate loading large CSV in chunks via pandas' chunked reader.
        for chunk_idx, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
            # Convert to numpy array: split features from the label column.
            X = chunk.drop('target', axis=1).values
            y = chunk['target'].values

            # Check memory pressure and force a GC pass when over budget.
            if self.memory_monitor.should_cleanup():
                print(f"Memory pressure detected at chunk {chunk_idx}, triggering cleanup")
                import gc
                gc.collect()

            yield X, y

    def preprocess_with_external_sort(self, data_iterator: Iterator) -> Tuple[SpaceTimeArray, SpaceTimeArray]:
        """Preprocess and sort data using external algorithms.

        Drains the iterator into spilling arrays, then reorders all samples
        by target value (to enable stratified splitting downstream).
        Returns the reordered (X, y) pair.
        """

        print("Preprocessing data with external sorting...")

        X_all = SpaceTimeArray(threshold=10000)
        y_all = SpaceTimeArray(threshold=10000)

        # Collect all data; lists spill to disk past the threshold.
        for X_batch, y_batch in data_iterator:
            X_all.extend(X_batch.tolist())
            y_all.extend(y_batch.tolist())

        # Sort by target value for stratified splitting.
        print(f"Sorting {len(y_all)} samples by target value...")

        # Create index pairs so we can reorder X alongside y.
        indexed_data = [(i, y) for i, y in enumerate(y_all)]

        # External sort by target value (memory-bounded merge sort).
        sorted_indices = external_sort(
            indexed_data,
            key_func=lambda x: x[1]
        )

        # Reorder data by walking the sorted (index, label) pairs.
        X_sorted = SpaceTimeArray(threshold=10000)
        y_sorted = SpaceTimeArray(threshold=10000)

        for idx, _ in sorted_indices:
            X_sorted.append(X_all[idx])
            y_sorted.append(y_all[idx])

        return X_sorted, y_sorted

    def extract_features_checkpointed(self, X: SpaceTimeArray) -> SpaceTimeArray:
        """Extract features with checkpointing.

        Computes five summary statistics (mean/std/min/max/median) per
        sample, saving progress every 1000 samples so the job can resume.
        """

        print("Extracting features with checkpointing...")

        # NOTE(review): the job_id embeds the current time, so restore() on the
        # next run gets a different id and never finds the prior checkpoint —
        # resumability appears defeated. Confirm the intended id scheme.
        job_id = f"feature_extraction_{int(time.time())}"

        # Check for existing checkpoint and resume from its last index.
        checkpoint = self.checkpoint_manager.restore(job_id)
        start_idx = checkpoint.get('last_idx', 0) if checkpoint else 0

        features = SpaceTimeArray(threshold=10000)

        # Load partial results if resuming.
        if checkpoint and 'features' in checkpoint:
            features = checkpoint['features']

        # Process in batches of 100 samples.
        batch_size = 100
        for i in range(start_idx, len(X), batch_size):
            batch = X[i:i + batch_size]

            # Simulate feature extraction.
            batch_features = []
            for sample in batch:
                # Example: statistical features per sample (order of this dict
                # fixes the output column order: mean, std, min, max, median).
                features_dict = {
                    'mean': np.mean(sample),
                    'std': np.std(sample),
                    'min': np.min(sample),
                    'max': np.max(sample),
                    'median': np.median(sample)
                }
                batch_features.append(list(features_dict.values()))

            features.extend(batch_features)

            # Checkpoint every 1000 samples.
            if (i + batch_size) % 1000 == 0:
                self.checkpoint_manager.save(job_id, {
                    'last_idx': i + batch_size,
                    'features': features
                })
                print(f"Checkpoint saved at index {i + batch_size}")

        # Clean up checkpoint now that the full pass completed.
        self.checkpoint_manager.delete(job_id)

        return features

    @profile
    def train_with_memory_constraints(self, X: SpaceTimeArray, y: SpaceTimeArray):
        """Train model with memory-aware batch processing.

        Uses partial_fit mini-batches when the model supports it; otherwise
        (e.g. RandomForest) fits once on the full data.
        """

        print("Training model with memory constraints...")

        # Convert to numpy arrays in batches of at most 1000 samples.
        batch_size = min(1000, len(X))

        # NOTE(review): RandomForestClassifier has no partial_fit, so for this
        # model the epochs after the first full fit do no training work.
        for epoch in range(3):  # Multiple epochs
            print(f"\nEpoch {epoch + 1}/3")

            # Shuffle data (by index, so the spilling arrays are not copied).
            indices = list(range(len(X)))
            np.random.shuffle(indices)

            # Train in mini-batches.
            for i in range(0, len(X), batch_size):
                batch_indices = indices[i:i + batch_size]

                X_batch = np.array([X[idx] for idx in batch_indices])
                y_batch = np.array([y[idx] for idx in batch_indices])

                # Partial fit (for models that support it).
                if hasattr(self.model, 'partial_fit'):
                    self.model.partial_fit(X_batch, y_batch)
                else:
                    # For RandomForest, we'll fit on full data once.
                    if epoch == 0 and i == 0:
                        # Collect all data for initial fit.
                        X_train = np.array(X.to_list())
                        y_train = np.array(y.to_list())
                        self.model.fit(X_train, y_train)
                    break

                # Check memory after each mini-batch.
                if self.memory_monitor.should_cleanup():
                    import gc
                    gc.collect()
                    print(f"Memory cleanup at batch {i // batch_size}")

    def evaluate_with_external_grouping(self, X: SpaceTimeArray, y: SpaceTimeArray) -> Dict[str, float]:
        """Evaluate model using external grouping for metrics.

        Predicts in batches, then uses external_groupby to compute accuracy
        and an (actual, predicted) class distribution without holding the
        groups fully in memory.
        """

        print("Evaluating model performance...")

        # Make predictions in batches.
        predictions = SpaceTimeArray(threshold=10000)

        batch_size = 1000
        for i in range(0, len(X), batch_size):
            X_batch = np.array(X[i:i + batch_size])
            y_pred = self.model.predict(X_batch)
            predictions.extend(y_pred.tolist())

        # Group by actual vs predicted for confusion matrix.
        results = []
        for i in range(len(y)):
            results.append({
                'actual': y[i],
                'predicted': predictions[i],
                'correct': y[i] == predictions[i]
            })

        # Use external groupby for metrics: partition by correctness flag.
        accuracy_groups = external_groupby(
            results,
            key_func=lambda x: x['correct']
        )

        correct_count = len(accuracy_groups.get(True, []))
        total_count = len(results)
        # Guard against an empty evaluation set.
        accuracy = correct_count / total_count if total_count > 0 else 0

        # Class-wise metrics: bucket by (actual, predicted) pair.
        class_groups = external_groupby(
            results,
            key_func=lambda x: (x['actual'], x['predicted'])
        )

        return {
            'accuracy': accuracy,
            'total_samples': total_count,
            'correct_predictions': correct_count,
            'class_distribution': {str(k): len(v) for k, v in class_groups.items()}
        }

    def save_model_checkpoint(self, path: str):
        """Save model with metadata.

        Serializes the model, the fitted feature extractor, and a small
        metadata dict (timestamp, memory limit, feature statistics) via joblib.
        """

        checkpoint = {
            'model': self.model,
            'feature_extractor': self.feature_extractor,
            'metadata': {
                'timestamp': time.time(),
                'memory_limit': self.memory_monitor.memory_limit,
                # Materialize the spilling dict into a plain dict for pickling.
                'feature_stats': dict(self.feature_extractor.feature_stats)
            }
        }

        joblib.dump(checkpoint, path)
        print(f"Model saved to {path}")
|
||||
|
||||
|
||||
def generate_synthetic_data(n_samples: int = 100000, n_features: int = 50,
                            path: str = 'synthetic_data.csv',
                            chunk_size: int = 10000) -> None:
    """Generate a synthetic binary-classification dataset as a CSV file.

    Writes ``n_samples`` rows of ``n_features`` standard-normal features plus
    a ``target`` column to ``path``. The target is 1 when the sum of the
    first 10 features (or all of them, if fewer) is positive, else 0.

    Args:
        n_samples: Total number of rows to generate.
        n_features: Number of feature columns.
        path: Output CSV path (default preserves the original behavior).
        chunk_size: Rows generated per chunk, bounding peak memory.
    """

    print(f"Generating synthetic dataset: {n_samples} samples, {n_features} features...")

    # Generate in chunks to avoid memory issues.
    with open(path, 'w') as f:
        # Write header: feature_0..feature_{n-1},target
        headers = [f'feature_{i}' for i in range(n_features)] + ['target']
        f.write(','.join(headers) + '\n')

        # Generate data in chunks.
        for start in range(0, n_samples, chunk_size):
            rows_in_chunk = min(chunk_size, n_samples - start)

            # Standard-normal features for this chunk.
            X = np.random.randn(rows_in_chunk, n_features)

            # Generate target (binary classification).
            # Target depends on the sum of the first 10 features.
            y = (X[:, :10].sum(axis=1) > 0).astype(int)

            # Buffer the whole chunk and emit it with one writelines call
            # instead of one tiny write per row.
            lines = []
            for j in range(rows_in_chunk):
                row = list(X[j]) + [y[j]]
                lines.append(','.join(map(str, row)) + '\n')
            f.writelines(lines)

            if (start + chunk_size) % 50000 == 0:
                print(f"Generated {start + chunk_size} samples...")

    print("Synthetic data generation complete!")
|
||||
|
||||
|
||||
def main():
    """Run complete ML pipeline example.

    End-to-end demo: generate a synthetic CSV, stream it in, sort and
    featurize it with external/checkpointed algorithms, train and evaluate
    a RandomForest, then persist the model and print memory statistics.
    """

    print("=== SqrtSpace SpaceTime ML Pipeline Example ===\n")

    # Generate synthetic data (writes synthetic_data.csv in the CWD).
    generate_synthetic_data(n_samples=100000, n_features=50)

    # Create pipeline with a 512MB soft memory budget.
    pipeline = MemoryEfficientMLPipeline(memory_limit="512MB")

    # Load and preprocess data.
    print("\n1. Loading data with streaming...")
    data_iterator = pipeline.load_data_streaming('synthetic_data.csv', chunk_size=5000)

    print("\n2. Preprocessing with external sort...")
    X_sorted, y_sorted = pipeline.preprocess_with_external_sort(data_iterator)
    print(f"Loaded {len(X_sorted)} samples")

    print("\n3. Extracting features with checkpointing...")
    X_features = pipeline.extract_features_checkpointed(X_sorted)

    print("\n4. Training model with memory constraints...")
    # Split data (80/20). Data was sorted by target above, so this is a
    # by-label split, not a random one.
    split_idx = int(0.8 * len(X_features))
    # NOTE(review): elsewhere SpaceTimeArray is constructed with threshold=...;
    # passing a slice positionally assumes the first positional argument is
    # the initial data — confirm against the SpaceTimeArray signature.
    X_train = SpaceTimeArray(X_features[:split_idx])
    y_train = SpaceTimeArray(y_sorted[:split_idx])
    X_test = SpaceTimeArray(X_features[split_idx:])
    y_test = SpaceTimeArray(y_sorted[split_idx:])

    pipeline.train_with_memory_constraints(X_train, y_train)

    print("\n5. Evaluating with external grouping...")
    metrics = pipeline.evaluate_with_external_grouping(X_test, y_test)

    print("\n=== Results ===")
    print(f"Test Accuracy: {metrics['accuracy']:.4f}")
    print(f"Total Test Samples: {metrics['total_samples']}")
    print(f"Correct Predictions: {metrics['correct_predictions']}")

    print("\n6. Saving model checkpoint...")
    pipeline.save_model_checkpoint('spacetime_model.joblib')

    # Memory statistics reported by the pipeline's monitor.
    print("\n=== Memory Statistics ===")
    memory_info = pipeline.memory_monitor.get_memory_info()
    print(f"Peak Memory Usage: {memory_info['peak_mb']:.2f} MB")
    print(f"Current Memory Usage: {memory_info['used_mb']:.2f} MB")
    print(f"Memory Limit: {memory_info['limit_mb']:.2f} MB")

    print("\n=== Pipeline Complete! ===")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user