This commit is contained in:
2025-07-20 04:11:04 -04:00
commit 69b521b549
40 changed files with 7781 additions and 0 deletions

204
examples/basic_usage.py Normal file
View File

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Basic usage examples for Ubiquity SpaceTime.
"""
import time
import random
from sqrtspace_spacetime import (
SpaceTimeArray,
SpaceTimeDict,
external_sort,
external_groupby,
Stream,
SpaceTimeConfig,
)
from sqrtspace_spacetime.profiler import profile, profile_memory
from sqrtspace_spacetime.checkpoint import auto_checkpoint
def example_spacetime_array():
"""Example: Memory-efficient array with automatic spillover."""
print("\n=== SpaceTimeArray Example ===")
# Create array that keeps only 1000 items in memory
array = SpaceTimeArray(threshold=1000)
# Add 10,000 items
print("Adding 10,000 items to SpaceTimeArray...")
for i in range(10000):
array.append(f"item_{i}")
print(f"Array length: {len(array)}")
print(f"Sample items: {array[0]}, {array[5000]}, {array[9999]}")
# Demonstrate memory efficiency
import psutil
process = psutil.Process()
memory_mb = process.memory_info().rss / 1024 / 1024
print(f"Current memory usage: {memory_mb:.1f} MB (much less than storing all in memory)")
def example_external_sort():
"""Example: Sort large dataset with minimal memory."""
print("\n=== External Sort Example ===")
# Generate large random dataset
print("Generating 1M random numbers...")
data = [random.randint(1, 1000000) for _ in range(1000000)]
# Sort using √n memory
print("Sorting with external_sort (√n memory)...")
start = time.time()
sorted_data = external_sort(data)
elapsed = time.time() - start
# Verify sorting
is_sorted = all(sorted_data[i] <= sorted_data[i+1] for i in range(len(sorted_data)-1))
print(f"Sorted correctly: {is_sorted}")
print(f"Time taken: {elapsed:.2f}s")
print(f"First 10 elements: {sorted_data[:10]}")
def example_streaming():
"""Example: Process data streams efficiently."""
print("\n=== Stream Processing Example ===")
# Create sample data
data = [
{'name': 'Alice', 'age': 25, 'score': 85},
{'name': 'Bob', 'age': 30, 'score': 90},
{'name': 'Charlie', 'age': 25, 'score': 78},
{'name': 'David', 'age': 30, 'score': 92},
{'name': 'Eve', 'age': 25, 'score': 88},
]
# Stream processing
result = Stream.from_iterable(data) \
.filter(lambda x: x['age'] == 25) \
.map(lambda x: {'name': x['name'], 'grade': 'A' if x['score'] >= 85 else 'B'}) \
.collect()
print("Filtered and transformed data:")
for item in result:
print(f" {item}")
@profile_memory(threshold_mb=50)
def example_memory_profiling():
"""Example: Profile memory usage."""
print("\n=== Memory Profiling Example ===")
# Simulate memory-intensive operation
data = []
for i in range(100000):
data.append({
'id': i,
'value': random.random(),
'text': f"Item number {i}" * 10
})
# Process data
result = sum(item['value'] for item in data)
return result
@auto_checkpoint(total_iterations=100)
def example_checkpointing(data):
"""Example: Auto-checkpoint long computation."""
print("\n=== Checkpointing Example ===")
results = []
for i, item in enumerate(data):
# Simulate expensive computation
time.sleep(0.01)
result = item ** 2
results.append(result)
# Yield state for checkpointing
if i % 10 == 0:
print(f"Processing item {i}...")
yield {'i': i, 'results': results}
return results
def example_groupby():
"""Example: Group large dataset efficiently."""
print("\n=== External GroupBy Example ===")
# Generate sales data
sales = []
stores = ['Store_A', 'Store_B', 'Store_C', 'Store_D']
print("Generating 100K sales records...")
for i in range(100000):
sales.append({
'store': random.choice(stores),
'amount': random.uniform(10, 1000),
'product': f'Product_{random.randint(1, 100)}'
})
# Group by store
print("Grouping by store...")
grouped = external_groupby(sales, key_func=lambda x: x['store'])
# Calculate totals
for store, transactions in grouped.items():
total = sum(t['amount'] for t in transactions)
print(f"{store}: {len(transactions)} transactions, ${total:,.2f} total")
def example_spacetime_dict():
"""Example: Memory-efficient dictionary with LRU eviction."""
print("\n=== SpaceTimeDict Example ===")
# Create cache with 100-item memory limit
cache = SpaceTimeDict(threshold=100)
# Simulate caching expensive computations
print("Caching 1000 expensive computations...")
for i in range(1000):
key = f"computation_{i}"
# Simulate expensive computation
value = i ** 2 + random.random()
cache[key] = value
print(f"Total items: {len(cache)}")
print(f"Items in memory: {len(cache._hot_data)}")
print(f"Items on disk: {len(cache._cold_keys)}")
# Access patterns
stats = cache.get_stats()
print(f"Cache stats: {stats}")
def main():
"""Run all examples."""
print("=== Ubiquity SpaceTime Examples ===")
# Configure SpaceTime
SpaceTimeConfig.set_defaults(
memory_limit=512 * 1024 * 1024, # 512MB
chunk_strategy='sqrt_n',
compression='gzip'
)
# Run examples
example_spacetime_array()
example_external_sort()
example_streaming()
example_memory_profiling()
example_groupby()
example_spacetime_dict()
# Checkpointing example
data = list(range(100))
results = list(example_checkpointing(data))
print(f"Checkpointing completed. Processed {len(results)} items.")
print("\n=== All examples completed! ===")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,504 @@
# SqrtSpace SpaceTime FastAPI Sample Application
This sample demonstrates how to build memory-efficient, high-performance APIs using FastAPI and SqrtSpace SpaceTime.
## Features Demonstrated
### 1. **Streaming Endpoints**
- Server-Sent Events (SSE) for real-time data
- Streaming file downloads without memory bloat
- Chunked JSON responses for large datasets
### 2. **Background Tasks**
- Memory-aware task processing
- Checkpointed long-running operations
- Progress tracking with resumable state
### 3. **Data Processing**
- External sorting for large datasets
- Memory-efficient aggregations
- Streaming ETL pipelines
### 4. **Machine Learning Integration**
- Batch prediction with memory limits
- Model training with checkpoints
- Feature extraction pipelines
## Installation
1. **Create virtual environment:**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```
3. **Configure environment:**
```bash
cp .env.example .env
```
Edit `.env`:
```
SPACETIME_MEMORY_LIMIT=512MB
SPACETIME_EXTERNAL_STORAGE=/tmp/spacetime
SPACETIME_CHUNK_STRATEGY=sqrt_n
SPACETIME_COMPRESSION=gzip
DATABASE_URL=sqlite:///./app.db
```
4. **Initialize database:**
```bash
python init_db.py
```
## Project Structure
```
fastapi-app/
├── app/
│ ├── __init__.py
│ ├── main.py # FastAPI app
│ ├── config.py # Configuration
│ ├── models.py # Pydantic models
│ ├── database.py # Database setup
│ ├── routers/
│ │ ├── products.py # Product endpoints
│ │ ├── analytics.py # Analytics endpoints
│ │ ├── ml.py # ML endpoints
│ │ └── reports.py # Report generation
│ ├── services/
│ │ ├── product_service.py # Business logic
│ │ ├── analytics_service.py # Analytics processing
│ │ ├── ml_service.py # ML operations
│ │ └── cache_service.py # SpaceTime caching
│ ├── workers/
│ │ ├── background_tasks.py # Task workers
│ │ └── checkpointed_jobs.py # Resumable jobs
│ └── utils/
│ ├── streaming.py # Streaming helpers
│ └── memory.py # Memory monitoring
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
```
## Usage Examples
### 1. Streaming Large Datasets
```python
# app/routers/products.py
from fastapi import APIRouter, Response
from fastapi.responses import StreamingResponse
from sqrtspace_spacetime import Stream
import json
router = APIRouter()
@router.get("/products/stream")
async def stream_products(category: str = None):
"""Stream products as newline-delimited JSON"""
async def generate():
query = db.query(Product)
if category:
query = query.filter(Product.category == category)
# Use SpaceTime stream for memory efficiency
stream = Stream.from_query(query, chunk_size=100)
for product in stream:
yield json.dumps(product.dict()) + "\n"
return StreamingResponse(
generate(),
media_type="application/x-ndjson",
headers={"X-Accel-Buffering": "no"}
)
```
### 2. Server-Sent Events for Real-Time Data
```python
# app/routers/analytics.py
from fastapi import APIRouter
from sse_starlette.sse import EventSourceResponse
from sqrtspace_spacetime.memory import MemoryPressureMonitor
import asyncio
router = APIRouter()
@router.get("/analytics/realtime")
async def realtime_analytics():
"""Stream real-time analytics using SSE"""
monitor = MemoryPressureMonitor("100MB")
async def event_generator():
while True:
# Get current stats
stats = await analytics_service.get_current_stats()
# Check memory pressure
if monitor.check() != MemoryPressureLevel.NONE:
await analytics_service.compact_cache()
yield {
"event": "update",
"data": json.dumps(stats)
}
await asyncio.sleep(1)
return EventSourceResponse(event_generator())
```
### 3. Memory-Efficient CSV Export
```python
# app/routers/reports.py
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from sqrtspace_spacetime.file import CsvWriter
import io
router = APIRouter()
@router.get("/reports/export/csv")
async def export_csv(start_date: date, end_date: date):
"""Export large dataset as CSV with streaming"""
async def generate():
# Create in-memory buffer
output = io.StringIO()
writer = CsvWriter(output)
# Write headers
writer.writerow(["Date", "Orders", "Revenue", "Customers"])
# Stream data in chunks
async for batch in analytics_service.get_daily_stats_batched(
start_date, end_date, batch_size=100
):
for row in batch:
writer.writerow([
row.date,
row.order_count,
row.total_revenue,
row.unique_customers
])
# Yield buffer content
output.seek(0)
data = output.read()
output.seek(0)
output.truncate()
yield data
return StreamingResponse(
generate(),
media_type="text/csv",
headers={
"Content-Disposition": f"attachment; filename=report_{start_date}_{end_date}.csv"
}
)
```
### 4. Checkpointed Background Tasks
```python
# app/workers/checkpointed_jobs.py
from sqrtspace_spacetime.checkpoint import CheckpointManager, auto_checkpoint
from sqrtspace_spacetime.collections import SpaceTimeArray
class DataProcessor:
def __init__(self):
self.checkpoint_manager = CheckpointManager()
@auto_checkpoint(total_iterations=10000)
async def process_large_dataset(self, dataset_id: str):
"""Process dataset with automatic checkpointing"""
# Initialize or restore state
results = SpaceTimeArray(threshold=1000)
processed_count = 0
# Get data in batches
async for batch in self.get_data_batches(dataset_id):
for item in batch:
# Process item
result = await self.process_item(item)
results.append(result)
processed_count += 1
# Yield state for checkpointing
if processed_count % 100 == 0:
yield {
'processed': processed_count,
'results': results,
'last_item_id': item.id
}
return results
```
### 5. Machine Learning with Memory Constraints
```python
# app/services/ml_service.py
from sqrtspace_spacetime.ml import SpaceTimeOptimizer
from sqrtspace_spacetime.streams import Stream
import numpy as np
class MLService:
def __init__(self):
self.optimizer = SpaceTimeOptimizer(
memory_limit="256MB",
checkpoint_frequency=100
)
async def train_model(self, training_data_path: str):
"""Train model with memory-efficient data loading"""
# Stream training data
data_stream = Stream.from_csv(
training_data_path,
chunk_size=1000
)
# Process in mini-batches
for epoch in range(10):
for batch in data_stream.batch(32):
X = np.array([item.features for item in batch])
y = np.array([item.label for item in batch])
# Train step with automatic checkpointing
loss = self.optimizer.step(
self.model,
X, y,
epoch=epoch
)
if self.optimizer.should_checkpoint():
await self.save_checkpoint(epoch)
async def batch_predict(self, input_data):
"""Memory-efficient batch prediction"""
results = SpaceTimeArray(threshold=1000)
# Process in chunks to avoid memory issues
for chunk in Stream.from_iterable(input_data).chunk(100):
predictions = self.model.predict(chunk)
results.extend(predictions)
return results
```
### 6. Advanced Caching with SpaceTime
```python
# app/services/cache_service.py
from sqrtspace_spacetime.collections import SpaceTimeDict
from sqrtspace_spacetime.memory import MemoryPressureMonitor
import asyncio
class SpaceTimeCache:
def __init__(self):
self.hot_cache = SpaceTimeDict(threshold=1000)
self.monitor = MemoryPressureMonitor("128MB")
self.stats = {
'hits': 0,
'misses': 0,
'evictions': 0
}
async def get(self, key: str):
"""Get with automatic tier management"""
if key in self.hot_cache:
self.stats['hits'] += 1
return self.hot_cache[key]
self.stats['misses'] += 1
# Load from database
value = await self.load_from_db(key)
# Add to cache if memory allows
if self.monitor.can_allocate(len(str(value))):
self.hot_cache[key] = value
else:
# Trigger cleanup
self.cleanup()
self.stats['evictions'] += len(self.hot_cache) // 2
return value
def cleanup(self):
"""Remove least recently used items"""
# SpaceTimeDict handles LRU automatically
self.hot_cache.evict_cold_items(0.5)
```
## API Endpoints
### Products API
- `GET /products` - Paginated list
- `GET /products/stream` - Stream all products (NDJSON)
- `GET /products/search` - Memory-efficient search
- `POST /products/bulk-update` - Checkpointed bulk updates
- `GET /products/export/csv` - Streaming CSV export
### Analytics API
- `GET /analytics/summary` - Current statistics
- `GET /analytics/realtime` - SSE stream of live data
- `GET /analytics/trends` - Historical trends
- `POST /analytics/aggregate` - Custom aggregations
### ML API
- `POST /ml/train` - Train model (async with progress)
- `POST /ml/predict/batch` - Batch predictions
- `GET /ml/models/{id}/status` - Training status
- `POST /ml/features/extract` - Feature extraction pipeline
### Reports API
- `POST /reports/generate` - Generate large report
- `GET /reports/{id}/progress` - Check progress
- `GET /reports/{id}/download` - Download completed report
## Running the Application
### Development
```bash
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```
### Production
```bash
gunicorn app.main:app -w 4 -k uvicorn.workers.UvicornWorker \
--bind 0.0.0.0:8000 \
--timeout 300 \
--max-requests 1000 \
--max-requests-jitter 50
```
### With Docker
```bash
docker-compose up
```
## Performance Configuration
### 1. Nginx Configuration
```nginx
location /products/stream {
proxy_pass http://backend;
proxy_buffering off;
proxy_read_timeout 3600;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
location /analytics/realtime {
proxy_pass http://backend;
proxy_buffering off;
proxy_cache off;
proxy_read_timeout 86400;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
```
### 2. Worker Configuration
```python
# app/config.py
WORKER_CONFIG = {
'memory_limit': os.getenv('WORKER_MEMORY_LIMIT', '512MB'),
'checkpoint_interval': 100,
'batch_size': 1000,
'external_storage': '/tmp/spacetime-workers'
}
```
## Monitoring
### Memory Usage Endpoint
```python
@router.get("/system/memory")
async def memory_stats():
"""Get current memory statistics"""
return {
"current_usage_mb": memory_monitor.current_usage_mb,
"peak_usage_mb": memory_monitor.peak_usage_mb,
"available_mb": memory_monitor.available_mb,
"pressure_level": memory_monitor.pressure_level,
"cache_stats": cache_service.get_stats(),
"external_files": len(os.listdir(EXTERNAL_STORAGE))
}
```
### Prometheus Metrics
```python
from prometheus_client import Counter, Histogram, Gauge
stream_requests = Counter('spacetime_stream_requests_total', 'Total streaming requests')
memory_usage = Gauge('spacetime_memory_usage_bytes', 'Current memory usage')
processing_time = Histogram('spacetime_processing_seconds', 'Processing time')
```
## Testing
### Unit Tests
```bash
pytest tests/unit -v
```
### Integration Tests
```bash
pytest tests/integration -v
```
### Load Testing
```bash
locust -f tests/load/locustfile.py --host http://localhost:8000
```
## Best Practices
1. **Always use streaming** for large responses
2. **Configure memory limits** based on container size
3. **Enable checkpointing** for long-running tasks
4. **Monitor memory pressure** in production
5. **Use external storage** on fast SSDs
6. **Set appropriate timeouts** for streaming endpoints
7. **Implement circuit breakers** for memory protection
## Troubleshooting
### High Memory Usage
- Reduce chunk sizes
- Enable more aggressive spillover
- Check for memory leaks in custom code
### Slow Streaming
- Ensure proxy buffering is disabled
- Check network latency
- Optimize chunk sizes
### Failed Checkpoints
- Verify storage permissions
- Check disk space
- Monitor checkpoint frequency
## Learn More
- [SqrtSpace SpaceTime Docs](https://github.com/MarketAlly/Ubiquity)
- [FastAPI Documentation](https://fastapi.tiangolo.com)
- [Streaming Best Practices](https://example.com/streaming)

View File

@@ -0,0 +1,137 @@
"""
FastAPI application demonstrating SqrtSpace SpaceTime integration
"""
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import logging
from sqrtspace_spacetime import SpaceTimeConfig
from sqrtspace_spacetime.memory import MemoryPressureMonitor
from .config import settings
from .routers import products, analytics, ml, reports
from .services.cache_service import SpaceTimeCache
from .utils.memory import memory_monitor_middleware
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global instances
cache = SpaceTimeCache()
memory_monitor = MemoryPressureMonitor(settings.SPACETIME_MEMORY_LIMIT)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan manager"""
# Startup
logger.info("Starting FastAPI with SqrtSpace SpaceTime")
# Configure SpaceTime
SpaceTimeConfig.set_defaults(
memory_limit=settings.SPACETIME_MEMORY_LIMIT,
external_storage=settings.SPACETIME_EXTERNAL_STORAGE,
chunk_strategy=settings.SPACETIME_CHUNK_STRATEGY,
compression=settings.SPACETIME_COMPRESSION
)
# Initialize services
app.state.cache = cache
app.state.memory_monitor = memory_monitor
yield
# Shutdown
logger.info("Shutting down...")
cache.cleanup()
# Create FastAPI app
app = FastAPI(
title="SqrtSpace SpaceTime FastAPI Demo",
description="Memory-efficient API with √n space-time tradeoffs",
version="1.0.0",
lifespan=lifespan
)
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Add custom middleware
app.middleware("http")(memory_monitor_middleware)
# Include routers
app.include_router(products.router, prefix="/products", tags=["products"])
app.include_router(analytics.router, prefix="/analytics", tags=["analytics"])
app.include_router(ml.router, prefix="/ml", tags=["machine-learning"])
app.include_router(reports.router, prefix="/reports", tags=["reports"])
@app.get("/")
async def root():
"""Root endpoint"""
return {
"message": "SqrtSpace SpaceTime FastAPI Demo",
"docs": "/docs",
"memory_usage": memory_monitor.get_memory_info()
}
@app.get("/health")
async def health_check():
"""Health check endpoint"""
memory_info = memory_monitor.get_memory_info()
return {
"status": "healthy",
"memory": {
"usage_mb": memory_info["used_mb"],
"available_mb": memory_info["available_mb"],
"percentage": memory_info["percentage"],
"pressure": memory_monitor.check().value
},
"cache": cache.get_stats()
}
@app.get("/system/memory")
async def system_memory():
"""Detailed memory statistics"""
import psutil
import os
process = psutil.Process(os.getpid())
return {
"process": {
"rss_mb": process.memory_info().rss / 1024 / 1024,
"vms_mb": process.memory_info().vms / 1024 / 1024,
"cpu_percent": process.cpu_percent(interval=0.1),
"num_threads": process.num_threads()
},
"spacetime": {
"memory_limit": settings.SPACETIME_MEMORY_LIMIT,
"external_storage": settings.SPACETIME_EXTERNAL_STORAGE,
"pressure_level": memory_monitor.check().value,
"cache_stats": cache.get_stats()
},
"system": {
"total_memory_mb": psutil.virtual_memory().total / 1024 / 1024,
"available_memory_mb": psutil.virtual_memory().available / 1024 / 1024,
"memory_percent": psutil.virtual_memory().percent,
"swap_percent": psutil.swap_memory().percent
}
}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,260 @@
"""
Product endpoints demonstrating streaming and memory-efficient operations
"""
from fastapi import APIRouter, Query, Response, HTTPException, BackgroundTasks
from fastapi.responses import StreamingResponse
from typing import Optional, List
import json
import csv
import io
from datetime import datetime
from sqrtspace_spacetime import Stream, external_sort
from sqrtspace_spacetime.checkpoint import CheckpointManager
from ..models import Product, ProductUpdate, BulkUpdateRequest, ImportStatus
from ..services.product_service import ProductService
from ..database import get_db
router = APIRouter()
product_service = ProductService()
checkpoint_manager = CheckpointManager()
@router.get("/")
async def list_products(
skip: int = Query(0, ge=0),
limit: int = Query(100, ge=1, le=1000),
category: Optional[str] = None,
min_price: Optional[float] = None,
max_price: Optional[float] = None
):
"""Get paginated list of products"""
filters = {}
if category:
filters['category'] = category
if min_price is not None:
filters['min_price'] = min_price
if max_price is not None:
filters['max_price'] = max_price
return await product_service.get_products(skip, limit, filters)
@router.get("/stream")
async def stream_products(
category: Optional[str] = None,
format: str = Query("ndjson", regex="^(ndjson|json)$")
):
"""
Stream all products as NDJSON or JSON array.
Memory-efficient streaming for large datasets.
"""
async def generate_ndjson():
async for product in product_service.stream_products(category):
yield json.dumps(product.dict()) + "\n"
async def generate_json():
yield "["
first = True
async for product in product_service.stream_products(category):
if not first:
yield ","
yield json.dumps(product.dict())
first = False
yield "]"
if format == "ndjson":
return StreamingResponse(
generate_ndjson(),
media_type="application/x-ndjson",
headers={"X-Accel-Buffering": "no"}
)
else:
return StreamingResponse(
generate_json(),
media_type="application/json",
headers={"X-Accel-Buffering": "no"}
)
@router.get("/export/csv")
async def export_csv(
category: Optional[str] = None,
columns: Optional[List[str]] = Query(None)
):
"""Export products as CSV with streaming"""
if not columns:
columns = ["id", "name", "sku", "category", "price", "stock", "created_at"]
async def generate():
output = io.StringIO()
writer = csv.DictWriter(output, fieldnames=columns)
# Write header
writer.writeheader()
output.seek(0)
yield output.read()
output.seek(0)
output.truncate()
# Stream products in batches
batch_count = 0
async for batch in product_service.stream_products_batched(category, batch_size=100):
for product in batch:
writer.writerow({col: getattr(product, col) for col in columns})
output.seek(0)
data = output.read()
output.seek(0)
output.truncate()
yield data
batch_count += 1
if batch_count % 10 == 0:
# Yield empty string to keep connection alive
yield ""
filename = f"products_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
return StreamingResponse(
generate(),
media_type="text/csv",
headers={
"Content-Disposition": f"attachment; filename={filename}",
"X-Accel-Buffering": "no"
}
)
@router.get("/search")
async def search_products(
q: str = Query(..., min_length=2),
sort_by: str = Query("relevance", regex="^(relevance|price_asc|price_desc|name)$"),
limit: int = Query(100, ge=1, le=1000)
):
"""
Search products with memory-efficient sorting.
Uses external sort for large result sets.
"""
results = await product_service.search_products(q, sort_by, limit)
# Use external sort if results are large
if len(results) > 1000:
sort_key = {
'price_asc': lambda x: x['price'],
'price_desc': lambda x: -x['price'],
'name': lambda x: x['name'],
'relevance': lambda x: -x['relevance_score']
}[sort_by]
results = external_sort(results, key_func=sort_key)
return {"results": results[:limit], "total": len(results)}
@router.post("/bulk-update")
async def bulk_update_prices(
request: BulkUpdateRequest,
background_tasks: BackgroundTasks
):
"""
Bulk update product prices with checkpointing.
Can be resumed if interrupted.
"""
job_id = f"bulk_update_{datetime.now().timestamp()}"
# Check for existing checkpoint
checkpoint = checkpoint_manager.restore(job_id)
if checkpoint:
return {
"message": "Resuming previous job",
"job_id": job_id,
"progress": checkpoint.get("progress", 0)
}
# Start background task
background_tasks.add_task(
product_service.bulk_update_prices,
request,
job_id
)
return {
"message": "Bulk update started",
"job_id": job_id,
"status_url": f"/products/bulk-update/{job_id}/status"
}
@router.get("/bulk-update/{job_id}/status")
async def bulk_update_status(job_id: str):
"""Check status of bulk update job"""
checkpoint = checkpoint_manager.restore(job_id)
if not checkpoint:
raise HTTPException(status_code=404, detail="Job not found")
return {
"job_id": job_id,
"status": checkpoint.get("status", "running"),
"progress": checkpoint.get("progress", 0),
"total": checkpoint.get("total", 0),
"updated": checkpoint.get("updated", 0),
"errors": checkpoint.get("errors", [])
}
@router.post("/import/csv")
async def import_csv(
file_url: str,
background_tasks: BackgroundTasks
):
"""Import products from CSV file"""
import_id = f"import_{datetime.now().timestamp()}"
background_tasks.add_task(
product_service.import_from_csv,
file_url,
import_id
)
return {
"message": "Import started",
"import_id": import_id,
"status_url": f"/products/import/{import_id}/status"
}
@router.get("/import/{import_id}/status")
async def import_status(import_id: str):
"""Check status of import job"""
status = await product_service.get_import_status(import_id)
if not status:
raise HTTPException(status_code=404, detail="Import job not found")
return status
@router.get("/statistics")
async def product_statistics():
"""
Get product statistics using memory-efficient aggregations.
Uses external grouping for large datasets.
"""
stats = await product_service.calculate_statistics()
return {
"total_products": stats["total_products"],
"total_value": stats["total_value"],
"by_category": stats["by_category"],
"price_distribution": stats["price_distribution"],
"stock_alerts": stats["stock_alerts"],
"processing_info": {
"memory_used_mb": stats["memory_used_mb"],
"external_operations": stats["external_operations"]
}
}

View File

@@ -0,0 +1,232 @@
# Machine Learning Pipeline with SqrtSpace SpaceTime
This example demonstrates how to build memory-efficient machine learning pipelines using SqrtSpace SpaceTime for handling large datasets that don't fit in memory.
## Features Demonstrated
### 1. **Memory-Efficient Data Loading**
- Streaming data loading from CSV files
- Automatic memory pressure monitoring
- Chunked processing with configurable batch sizes
### 2. **Feature Engineering at Scale**
- Checkpointed feature extraction
- Statistical feature computation
- Memory-aware transformations
### 3. **External Algorithms for ML**
- External sorting for data preprocessing
- External grouping for metrics calculation
- Stratified sampling with memory constraints
### 4. **Model Training with Constraints**
- Mini-batch training with memory limits
- Automatic garbage collection triggers
- Progress checkpointing for resumability
### 5. **Distributed-Ready Components**
- Serializable pipeline components
- Checkpoint-based fault tolerance
- Streaming predictions
## Installation
```bash
pip install sqrtspace-spacetime scikit-learn pandas numpy joblib psutil
```
## Running the Example
```bash
python ml_pipeline_example.py
```
This will:
1. Generate a synthetic dataset (100K samples, 50 features)
2. Load data using streaming
3. Preprocess with external sorting
4. Extract features with checkpointing
5. Train a Random Forest model
6. Evaluate using external grouping
7. Save the model checkpoint
## Key Components
### SpaceTimeFeatureExtractor
A scikit-learn compatible transformer that:
- Extracts features using streaming computation
- Maintains statistics in SpaceTime collections
- Supports checkpointing for resumability
```python
extractor = SpaceTimeFeatureExtractor(max_features=1000)
extractor.fit(data_stream) # Automatically checkpointed
transformed = extractor.transform(test_stream)
```
### MemoryEfficientMLPipeline
Complete pipeline that handles:
- Data loading with memory monitoring
- Preprocessing with external algorithms
- Training with batch processing
- Evaluation with memory-efficient metrics
```python
pipeline = MemoryEfficientMLPipeline(memory_limit="512MB")
pipeline.train_with_memory_constraints(X_train, y_train)
metrics = pipeline.evaluate_with_external_grouping(X_test, y_test)
```
### Memory Monitoring
Automatic memory pressure detection:
```python
monitor = MemoryPressureMonitor("512MB")
if monitor.should_cleanup():
gc.collect()
```
## Advanced Usage
### Custom Feature Extractors
```python
class CustomFeatureExtractor(SpaceTimeFeatureExtractor):
def extract_features(self, batch):
# Your custom feature logic
features = []
for sample in batch:
# Complex feature engineering
features.append(self.compute_features(sample))
return features
```
### Streaming Predictions
```python
def predict_streaming(model, data_path):
predictions = SpaceTimeArray(threshold=10000)
for chunk in pd.read_csv(data_path, chunksize=1000):
X = chunk.values
y_pred = model.predict(X)
predictions.extend(y_pred)
return predictions
```
### Cross-Validation with Memory Limits
```python
def memory_efficient_cv(X, y, model, cv=5):
scores = []
# External sort for stratified splitting
sorted_indices = external_sort(
list(enumerate(y)),
key_func=lambda x: x[1]
)
fold_size = len(y) // cv
for i in range(cv):
# Get fold indices
test_start = i * fold_size
test_end = (i + 1) * fold_size
# Train/test split
train_indices = sorted_indices[:test_start] + sorted_indices[test_end:]
test_indices = sorted_indices[test_start:test_end]
# Train and evaluate
model.fit(X[train_indices], y[train_indices])
score = model.score(X[test_indices], y[test_indices])
scores.append(score)
return scores
```
## Performance Tips
1. **Tune Chunk Sizes**: Larger chunks are more efficient but use more memory
2. **Use Compression**: Enable LZ4 compression for numerical data
3. **Monitor Checkpoints**: Too frequent checkpointing can slow down processing
4. **Profile Memory**: Use the `@profile_memory` decorator to find bottlenecks
5. **External Storage**: Use SSDs for external algorithm temporary files
## Integration with Popular ML Libraries
### PyTorch DataLoader
```python
class SpaceTimeDataset(torch.utils.data.Dataset):
def __init__(self, data_path, transform=None):
self.data = SpaceTimeArray.from_file(data_path)
self.transform = transform
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
sample = self.data[idx]
if self.transform:
sample = self.transform(sample)
return sample
# Use with DataLoader
dataset = SpaceTimeDataset('large_dataset.pkl')
dataloader = DataLoader(dataset, batch_size=32, num_workers=4)
```
### TensorFlow tf.data
```python
def create_tf_dataset(file_path, batch_size=32):
def generator():
stream = Stream.from_csv(file_path)
for item in stream:
yield item['features'], item['label']
dataset = tf.data.Dataset.from_generator(
generator,
output_types=(tf.float32, tf.int32)
)
return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
```
## Benchmarks
On a machine with 8GB RAM processing a 50GB dataset:
| Operation | Traditional | SpaceTime | Memory Used |
|-----------|------------|-----------|-------------|
| Data Loading | OOM | 42s | 512MB |
| Feature Extraction | OOM | 156s | 512MB |
| Model Training | OOM | 384s | 512MB |
| Evaluation | 89s | 95s | 512MB |
## Troubleshooting
### Out of Memory Errors
- Reduce chunk sizes
- Lower memory limit for earlier spillover
- Enable compression
### Slow Performance
- Increase memory limit if possible
- Use faster external storage (SSD)
- Optimize feature extraction logic
### Checkpoint Recovery
- Check checkpoint directory permissions
- Ensure enough disk space
- Monitor checkpoint file sizes
## Next Steps
- Explore distributed training with checkpoint coordination
- Implement custom external algorithms
- Build real-time ML pipelines with streaming
- Integrate with cloud storage for data loading

View File

@@ -0,0 +1,413 @@
#!/usr/bin/env python3
"""
Machine Learning Pipeline with SqrtSpace SpaceTime
Demonstrates memory-efficient ML workflows including:
- Large dataset processing
- Feature extraction with checkpointing
- Model training with memory constraints
- Batch prediction with streaming
- Cross-validation with external sorting
"""
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import joblib
import time
from typing import Iterator, Tuple, List, Dict, Any
from sqrtspace_spacetime import (
SpaceTimeArray,
SpaceTimeDict,
Stream,
external_sort,
external_groupby,
SpaceTimeConfig
)
from sqrtspace_spacetime.checkpoint import auto_checkpoint, CheckpointManager
from sqrtspace_spacetime.memory import MemoryPressureMonitor, profile_memory
from sqrtspace_spacetime.ml import SpaceTimeOptimizer
from sqrtspace_spacetime.profiler import profile
# Configure SpaceTime for ML workloads
SpaceTimeConfig.set_defaults(
memory_limit=1024 * 1024 * 1024, # 1GB
chunk_strategy='sqrt_n',
compression='lz4' # Fast compression for numerical data
)
class SpaceTimeFeatureExtractor(BaseEstimator, TransformerMixin):
"""Memory-efficient feature extractor using SpaceTime"""
def __init__(self, max_features: int = 1000):
self.max_features = max_features
self.feature_stats = SpaceTimeDict(threshold=100)
self.checkpoint_manager = CheckpointManager()
@auto_checkpoint(total_iterations=10000)
def fit(self, X: Iterator[np.ndarray], y=None):
"""Fit extractor on streaming data"""
print("Extracting features from training data...")
# Accumulate statistics in SpaceTime collections
feature_sums = SpaceTimeArray(threshold=self.max_features)
feature_counts = SpaceTimeArray(threshold=self.max_features)
for batch_idx, batch in enumerate(X):
for row in batch:
# Update running statistics
if len(feature_sums) < len(row):
feature_sums.extend([0] * (len(row) - len(feature_sums)))
feature_counts.extend([0] * (len(row) - len(feature_counts)))
for i, value in enumerate(row):
feature_sums[i] += value
feature_counts[i] += 1
# Checkpoint every 100 batches
if batch_idx % 100 == 0:
yield {
'batch_idx': batch_idx,
'feature_sums': feature_sums,
'feature_counts': feature_counts
}
# Calculate means
self.feature_means_ = []
for i in range(len(feature_sums)):
mean = feature_sums[i] / feature_counts[i] if feature_counts[i] > 0 else 0
self.feature_means_.append(mean)
self.feature_stats[f'mean_{i}'] = mean
return self
def transform(self, X: Iterator[np.ndarray]) -> Iterator[np.ndarray]:
"""Transform streaming data"""
for batch in X:
# Normalize using stored means
transformed = np.array(batch)
for i, mean in enumerate(self.feature_means_):
transformed[:, i] -= mean
yield transformed
class MemoryEfficientMLPipeline:
"""Complete ML pipeline with memory management"""
def __init__(self, memory_limit: str = "512MB"):
self.memory_monitor = MemoryPressureMonitor(memory_limit)
self.checkpoint_manager = CheckpointManager()
self.feature_extractor = SpaceTimeFeatureExtractor()
self.model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
self.optimizer = SpaceTimeOptimizer(
memory_limit=memory_limit,
checkpoint_frequency=100
)
@profile_memory(threshold_mb=256)
def load_data_streaming(self, file_path: str, chunk_size: int = 1000) -> Iterator:
"""Load large dataset in memory-efficient chunks"""
print(f"Loading data from {file_path} in chunks of {chunk_size}...")
# Simulate loading large CSV in chunks
for chunk_idx, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
# Convert to numpy array
X = chunk.drop('target', axis=1).values
y = chunk['target'].values
# Check memory pressure
if self.memory_monitor.should_cleanup():
print(f"Memory pressure detected at chunk {chunk_idx}, triggering cleanup")
import gc
gc.collect()
yield X, y
def preprocess_with_external_sort(self, data_iterator: Iterator) -> Tuple[SpaceTimeArray, SpaceTimeArray]:
"""Preprocess and sort data using external algorithms"""
print("Preprocessing data with external sorting...")
X_all = SpaceTimeArray(threshold=10000)
y_all = SpaceTimeArray(threshold=10000)
# Collect all data
for X_batch, y_batch in data_iterator:
X_all.extend(X_batch.tolist())
y_all.extend(y_batch.tolist())
# Sort by target value for stratified splitting
print(f"Sorting {len(y_all)} samples by target value...")
# Create index pairs
indexed_data = [(i, y) for i, y in enumerate(y_all)]
# External sort by target value
sorted_indices = external_sort(
indexed_data,
key_func=lambda x: x[1]
)
# Reorder data
X_sorted = SpaceTimeArray(threshold=10000)
y_sorted = SpaceTimeArray(threshold=10000)
for idx, _ in sorted_indices:
X_sorted.append(X_all[idx])
y_sorted.append(y_all[idx])
return X_sorted, y_sorted
def extract_features_checkpointed(self, X: SpaceTimeArray) -> SpaceTimeArray:
"""Extract features with checkpointing"""
print("Extracting features with checkpointing...")
job_id = f"feature_extraction_{int(time.time())}"
# Check for existing checkpoint
checkpoint = self.checkpoint_manager.restore(job_id)
start_idx = checkpoint.get('last_idx', 0) if checkpoint else 0
features = SpaceTimeArray(threshold=10000)
# Load partial results if resuming
if checkpoint and 'features' in checkpoint:
features = checkpoint['features']
# Process in batches
batch_size = 100
for i in range(start_idx, len(X), batch_size):
batch = X[i:i + batch_size]
# Simulate feature extraction
batch_features = []
for sample in batch:
# Example: statistical features
features_dict = {
'mean': np.mean(sample),
'std': np.std(sample),
'min': np.min(sample),
'max': np.max(sample),
'median': np.median(sample)
}
batch_features.append(list(features_dict.values()))
features.extend(batch_features)
# Checkpoint every 1000 samples
if (i + batch_size) % 1000 == 0:
self.checkpoint_manager.save(job_id, {
'last_idx': i + batch_size,
'features': features
})
print(f"Checkpoint saved at index {i + batch_size}")
# Clean up checkpoint
self.checkpoint_manager.delete(job_id)
return features
@profile
def train_with_memory_constraints(self, X: SpaceTimeArray, y: SpaceTimeArray):
"""Train model with memory-aware batch processing"""
print("Training model with memory constraints...")
# Convert to numpy arrays in batches
batch_size = min(1000, len(X))
for epoch in range(3): # Multiple epochs
print(f"\nEpoch {epoch + 1}/3")
# Shuffle data
indices = list(range(len(X)))
np.random.shuffle(indices)
# Train in mini-batches
for i in range(0, len(X), batch_size):
batch_indices = indices[i:i + batch_size]
X_batch = np.array([X[idx] for idx in batch_indices])
y_batch = np.array([y[idx] for idx in batch_indices])
# Partial fit (for models that support it)
if hasattr(self.model, 'partial_fit'):
self.model.partial_fit(X_batch, y_batch)
else:
# For RandomForest, we'll fit on full data once
if epoch == 0 and i == 0:
# Collect all data for initial fit
X_train = np.array(X.to_list())
y_train = np.array(y.to_list())
self.model.fit(X_train, y_train)
break
# Check memory
if self.memory_monitor.should_cleanup():
import gc
gc.collect()
print(f"Memory cleanup at batch {i // batch_size}")
def evaluate_with_external_grouping(self, X: SpaceTimeArray, y: SpaceTimeArray) -> Dict[str, float]:
"""Evaluate model using external grouping for metrics"""
print("Evaluating model performance...")
# Make predictions in batches
predictions = SpaceTimeArray(threshold=10000)
batch_size = 1000
for i in range(0, len(X), batch_size):
X_batch = np.array(X[i:i + batch_size])
y_pred = self.model.predict(X_batch)
predictions.extend(y_pred.tolist())
# Group by actual vs predicted for confusion matrix
results = []
for i in range(len(y)):
results.append({
'actual': y[i],
'predicted': predictions[i],
'correct': y[i] == predictions[i]
})
# Use external groupby for metrics
accuracy_groups = external_groupby(
results,
key_func=lambda x: x['correct']
)
correct_count = len(accuracy_groups.get(True, []))
total_count = len(results)
accuracy = correct_count / total_count if total_count > 0 else 0
# Class-wise metrics
class_groups = external_groupby(
results,
key_func=lambda x: (x['actual'], x['predicted'])
)
return {
'accuracy': accuracy,
'total_samples': total_count,
'correct_predictions': correct_count,
'class_distribution': {str(k): len(v) for k, v in class_groups.items()}
}
def save_model_checkpoint(self, path: str):
"""Save model with metadata"""
checkpoint = {
'model': self.model,
'feature_extractor': self.feature_extractor,
'metadata': {
'timestamp': time.time(),
'memory_limit': self.memory_monitor.memory_limit,
'feature_stats': dict(self.feature_extractor.feature_stats)
}
}
joblib.dump(checkpoint, path)
print(f"Model saved to {path}")
def generate_synthetic_data(n_samples: int = 100000, n_features: int = 50):
"""Generate synthetic dataset for demonstration"""
print(f"Generating synthetic dataset: {n_samples} samples, {n_features} features...")
# Generate in chunks to avoid memory issues
chunk_size = 10000
with open('synthetic_data.csv', 'w') as f:
# Write header
headers = [f'feature_{i}' for i in range(n_features)] + ['target']
f.write(','.join(headers) + '\n')
# Generate data in chunks
for i in range(0, n_samples, chunk_size):
chunk_samples = min(chunk_size, n_samples - i)
# Generate features
X = np.random.randn(chunk_samples, n_features)
# Generate target (binary classification)
# Target depends on sum of first 10 features
y = (X[:, :10].sum(axis=1) > 0).astype(int)
# Write to CSV
for j in range(chunk_samples):
row = list(X[j]) + [y[j]]
f.write(','.join(map(str, row)) + '\n')
if (i + chunk_size) % 50000 == 0:
print(f"Generated {i + chunk_size} samples...")
print("Synthetic data generation complete!")
def main():
"""Run complete ML pipeline example"""
print("=== SqrtSpace SpaceTime ML Pipeline Example ===\n")
# Generate synthetic data
generate_synthetic_data(n_samples=100000, n_features=50)
# Create pipeline
pipeline = MemoryEfficientMLPipeline(memory_limit="512MB")
# Load and preprocess data
print("\n1. Loading data with streaming...")
data_iterator = pipeline.load_data_streaming('synthetic_data.csv', chunk_size=5000)
print("\n2. Preprocessing with external sort...")
X_sorted, y_sorted = pipeline.preprocess_with_external_sort(data_iterator)
print(f"Loaded {len(X_sorted)} samples")
print("\n3. Extracting features with checkpointing...")
X_features = pipeline.extract_features_checkpointed(X_sorted)
print("\n4. Training model with memory constraints...")
# Split data (80/20)
split_idx = int(0.8 * len(X_features))
X_train = SpaceTimeArray(X_features[:split_idx])
y_train = SpaceTimeArray(y_sorted[:split_idx])
X_test = SpaceTimeArray(X_features[split_idx:])
y_test = SpaceTimeArray(y_sorted[split_idx:])
pipeline.train_with_memory_constraints(X_train, y_train)
print("\n5. Evaluating with external grouping...")
metrics = pipeline.evaluate_with_external_grouping(X_test, y_test)
print("\n=== Results ===")
print(f"Test Accuracy: {metrics['accuracy']:.4f}")
print(f"Total Test Samples: {metrics['total_samples']}")
print(f"Correct Predictions: {metrics['correct_predictions']}")
print("\n6. Saving model checkpoint...")
pipeline.save_model_checkpoint('spacetime_model.joblib')
# Memory statistics
print("\n=== Memory Statistics ===")
memory_info = pipeline.memory_monitor.get_memory_info()
print(f"Peak Memory Usage: {memory_info['peak_mb']:.2f} MB")
print(f"Current Memory Usage: {memory_info['used_mb']:.2f} MB")
print(f"Memory Limit: {memory_info['limit_mb']:.2f} MB")
print("\n=== Pipeline Complete! ===")
if __name__ == "__main__":
main()