This commit is contained in:
2025-07-20 04:11:04 -04:00
commit 69b521b549
40 changed files with 7781 additions and 0 deletions

View File

@@ -0,0 +1,504 @@
# SqrtSpace SpaceTime FastAPI Sample Application
This sample demonstrates how to build memory-efficient, high-performance APIs using FastAPI and SqrtSpace SpaceTime.
## Features Demonstrated
### 1. **Streaming Endpoints**
- Server-Sent Events (SSE) for real-time data
- Streaming file downloads without memory bloat
- Chunked JSON responses for large datasets
### 2. **Background Tasks**
- Memory-aware task processing
- Checkpointed long-running operations
- Progress tracking with resumable state
### 3. **Data Processing**
- External sorting for large datasets
- Memory-efficient aggregations
- Streaming ETL pipelines
### 4. **Machine Learning Integration**
- Batch prediction with memory limits
- Model training with checkpoints
- Feature extraction pipelines
## Installation
1. **Create virtual environment:**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```
3. **Configure environment:**
```bash
cp .env.example .env
```
Edit `.env`:
```
SPACETIME_MEMORY_LIMIT=512MB
SPACETIME_EXTERNAL_STORAGE=/tmp/spacetime
SPACETIME_CHUNK_STRATEGY=sqrt_n
SPACETIME_COMPRESSION=gzip
DATABASE_URL=sqlite:///./app.db
```
4. **Initialize database:**
```bash
python init_db.py
```
## Project Structure
```
fastapi-app/
├── app/
│ ├── __init__.py
│ ├── main.py # FastAPI app
│ ├── config.py # Configuration
│ ├── models.py # Pydantic models
│ ├── database.py # Database setup
│ ├── routers/
│ │ ├── products.py # Product endpoints
│ │ ├── analytics.py # Analytics endpoints
│ │ ├── ml.py # ML endpoints
│ │ └── reports.py # Report generation
│ ├── services/
│ │ ├── product_service.py # Business logic
│ │ ├── analytics_service.py # Analytics processing
│ │ ├── ml_service.py # ML operations
│ │ └── cache_service.py # SpaceTime caching
│ ├── workers/
│ │ ├── background_tasks.py # Task workers
│ │ └── checkpointed_jobs.py # Resumable jobs
│ └── utils/
│ ├── streaming.py # Streaming helpers
│ └── memory.py # Memory monitoring
├── requirements.txt
├── Dockerfile
└── docker-compose.yml
```
## Usage Examples
### 1. Streaming Large Datasets
```python
# app/routers/products.py
from fastapi import APIRouter, Response
from fastapi.responses import StreamingResponse
from sqrtspace_spacetime import Stream
import json
router = APIRouter()
@router.get("/products/stream")
async def stream_products(category: str = None):
"""Stream products as newline-delimited JSON"""
async def generate():
query = db.query(Product)
if category:
query = query.filter(Product.category == category)
# Use SpaceTime stream for memory efficiency
stream = Stream.from_query(query, chunk_size=100)
for product in stream:
yield json.dumps(product.dict()) + "\n"
return StreamingResponse(
generate(),
media_type="application/x-ndjson",
headers={"X-Accel-Buffering": "no"}
)
```
### 2. Server-Sent Events for Real-Time Data
```python
# app/routers/analytics.py
from fastapi import APIRouter
from sse_starlette.sse import EventSourceResponse
from sqrtspace_spacetime.memory import MemoryPressureMonitor
import asyncio
import json
router = APIRouter()
@router.get("/analytics/realtime")
async def realtime_analytics():
"""Stream real-time analytics using SSE"""
monitor = MemoryPressureMonitor("100MB")
async def event_generator():
while True:
# Get current stats
stats = await analytics_service.get_current_stats()
# Check memory pressure
if monitor.check() != MemoryPressureLevel.NONE:
await analytics_service.compact_cache()
yield {
"event": "update",
"data": json.dumps(stats)
}
await asyncio.sleep(1)
return EventSourceResponse(event_generator())
```
### 3. Memory-Efficient CSV Export
```python
# app/routers/reports.py
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from sqrtspace_spacetime.file import CsvWriter
import io
from datetime import date
router = APIRouter()
@router.get("/reports/export/csv")
async def export_csv(start_date: date, end_date: date):
"""Export large dataset as CSV with streaming"""
async def generate():
# Create in-memory buffer
output = io.StringIO()
writer = CsvWriter(output)
# Write headers
writer.writerow(["Date", "Orders", "Revenue", "Customers"])
# Stream data in chunks
async for batch in analytics_service.get_daily_stats_batched(
start_date, end_date, batch_size=100
):
for row in batch:
writer.writerow([
row.date,
row.order_count,
row.total_revenue,
row.unique_customers
])
# Yield buffer content
output.seek(0)
data = output.read()
output.seek(0)
output.truncate()
yield data
return StreamingResponse(
generate(),
media_type="text/csv",
headers={
"Content-Disposition": f"attachment; filename=report_{start_date}_{end_date}.csv"
}
)
```
### 4. Checkpointed Background Tasks
```python
# app/workers/checkpointed_jobs.py
from sqrtspace_spacetime.checkpoint import CheckpointManager, auto_checkpoint
from sqrtspace_spacetime.collections import SpaceTimeArray
class DataProcessor:
def __init__(self):
self.checkpoint_manager = CheckpointManager()
@auto_checkpoint(total_iterations=10000)
async def process_large_dataset(self, dataset_id: str):
"""Process dataset with automatic checkpointing"""
# Initialize or restore state
results = SpaceTimeArray(threshold=1000)
processed_count = 0
# Get data in batches
async for batch in self.get_data_batches(dataset_id):
for item in batch:
# Process item
result = await self.process_item(item)
results.append(result)
processed_count += 1
# Yield state for checkpointing
if processed_count % 100 == 0:
yield {
'processed': processed_count,
'results': results,
'last_item_id': item.id
}
return results
```
### 5. Machine Learning with Memory Constraints
```python
# app/services/ml_service.py
from sqrtspace_spacetime.ml import SpaceTimeOptimizer
from sqrtspace_spacetime.collections import SpaceTimeArray
from sqrtspace_spacetime.streams import Stream
import numpy as np
class MLService:
def __init__(self):
self.optimizer = SpaceTimeOptimizer(
memory_limit="256MB",
checkpoint_frequency=100
)
async def train_model(self, training_data_path: str):
"""Train model with memory-efficient data loading"""
# Stream training data
data_stream = Stream.from_csv(
training_data_path,
chunk_size=1000
)
# Process in mini-batches
for epoch in range(10):
for batch in data_stream.batch(32):
X = np.array([item.features for item in batch])
y = np.array([item.label for item in batch])
# Train step with automatic checkpointing
loss = self.optimizer.step(
self.model,
X, y,
epoch=epoch
)
if self.optimizer.should_checkpoint():
await self.save_checkpoint(epoch)
async def batch_predict(self, input_data):
"""Memory-efficient batch prediction"""
results = SpaceTimeArray(threshold=1000)
# Process in chunks to avoid memory issues
for chunk in Stream.from_iterable(input_data).chunk(100):
predictions = self.model.predict(chunk)
results.extend(predictions)
return results
```
### 6. Advanced Caching with SpaceTime
```python
# app/services/cache_service.py
from sqrtspace_spacetime.collections import SpaceTimeDict
from sqrtspace_spacetime.memory import MemoryPressureMonitor
import asyncio
class SpaceTimeCache:
def __init__(self):
self.hot_cache = SpaceTimeDict(threshold=1000)
self.monitor = MemoryPressureMonitor("128MB")
self.stats = {
'hits': 0,
'misses': 0,
'evictions': 0
}
async def get(self, key: str):
"""Get with automatic tier management"""
if key in self.hot_cache:
self.stats['hits'] += 1
return self.hot_cache[key]
self.stats['misses'] += 1
# Load from database
value = await self.load_from_db(key)
# Add to cache if memory allows
if self.monitor.can_allocate(len(str(value))):
self.hot_cache[key] = value
else:
# Trigger cleanup
self.cleanup()
self.stats['evictions'] += len(self.hot_cache) // 2
return value
def cleanup(self):
"""Remove least recently used items"""
# SpaceTimeDict handles LRU automatically
self.hot_cache.evict_cold_items(0.5)
```
## API Endpoints
### Products API
- `GET /products` - Paginated list
- `GET /products/stream` - Stream all products (NDJSON)
- `GET /products/search` - Memory-efficient search
- `POST /products/bulk-update` - Checkpointed bulk updates
- `GET /products/export/csv` - Streaming CSV export
### Analytics API
- `GET /analytics/summary` - Current statistics
- `GET /analytics/realtime` - SSE stream of live data
- `GET /analytics/trends` - Historical trends
- `POST /analytics/aggregate` - Custom aggregations
### ML API
- `POST /ml/train` - Train model (async with progress)
- `POST /ml/predict/batch` - Batch predictions
- `GET /ml/models/{id}/status` - Training status
- `POST /ml/features/extract` - Feature extraction pipeline
### Reports API
- `POST /reports/generate` - Generate large report
- `GET /reports/{id}/progress` - Check progress
- `GET /reports/{id}/download` - Download completed report
## Running the Application
### Development
```bash
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```
### Production
```bash
gunicorn app.main:app -w 4 -k uvicorn.workers.UvicornWorker \
--bind 0.0.0.0:8000 \
--timeout 300 \
--max-requests 1000 \
--max-requests-jitter 50
```
### With Docker
```bash
docker-compose up
```
## Performance Configuration
### 1. Nginx Configuration
```nginx
location /products/stream {
proxy_pass http://backend;
proxy_buffering off;
proxy_read_timeout 3600;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
location /analytics/realtime {
proxy_pass http://backend;
proxy_buffering off;
proxy_cache off;
proxy_read_timeout 86400;
proxy_http_version 1.1;
proxy_set_header Connection "";
}
```
### 2. Worker Configuration
```python
# app/config.py
import os
WORKER_CONFIG = {
'memory_limit': os.getenv('WORKER_MEMORY_LIMIT', '512MB'),
'checkpoint_interval': 100,
'batch_size': 1000,
'external_storage': '/tmp/spacetime-workers'
}
```
## Monitoring
### Memory Usage Endpoint
```python
@router.get("/system/memory")
async def memory_stats():
"""Get current memory statistics"""
return {
"current_usage_mb": memory_monitor.current_usage_mb,
"peak_usage_mb": memory_monitor.peak_usage_mb,
"available_mb": memory_monitor.available_mb,
"pressure_level": memory_monitor.pressure_level,
"cache_stats": cache_service.get_stats(),
"external_files": len(os.listdir(EXTERNAL_STORAGE))
}
```
### Prometheus Metrics
```python
from prometheus_client import Counter, Histogram, Gauge
stream_requests = Counter('spacetime_stream_requests_total', 'Total streaming requests')
memory_usage = Gauge('spacetime_memory_usage_bytes', 'Current memory usage')
processing_time = Histogram('spacetime_processing_seconds', 'Processing time')
```
## Testing
### Unit Tests
```bash
pytest tests/unit -v
```
### Integration Tests
```bash
pytest tests/integration -v
```
### Load Testing
```bash
locust -f tests/load/locustfile.py --host http://localhost:8000
```
## Best Practices
1. **Always use streaming** for large responses
2. **Configure memory limits** based on container size
3. **Enable checkpointing** for long-running tasks
4. **Monitor memory pressure** in production
5. **Use external storage** on fast SSDs
6. **Set appropriate timeouts** for streaming endpoints
7. **Implement circuit breakers** for memory protection
## Troubleshooting
### High Memory Usage
- Reduce chunk sizes
- Enable more aggressive spillover
- Check for memory leaks in custom code
### Slow Streaming
- Ensure proxy buffering is disabled
- Check network latency
- Optimize chunk sizes
### Failed Checkpoints
- Verify storage permissions
- Check disk space
- Monitor checkpoint frequency
## Learn More
- [SqrtSpace SpaceTime Docs](https://github.com/MarketAlly/Ubiquity)
- [FastAPI Documentation](https://fastapi.tiangolo.com)
- [Streaming Best Practices](https://example.com/streaming)

View File

@@ -0,0 +1,137 @@
"""
FastAPI application demonstrating SqrtSpace SpaceTime integration
"""
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import logging
from sqrtspace_spacetime import SpaceTimeConfig
from sqrtspace_spacetime.memory import MemoryPressureMonitor
from .config import settings
from .routers import products, analytics, ml, reports
from .services.cache_service import SpaceTimeCache
from .utils.memory import memory_monitor_middleware
# Configure logging
# The root logger is configured at INFO level when this module is imported;
# `logger` is the logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global instances
# Both objects are created once at import time. The lifespan hook attaches
# them to `app.state`, and the endpoints below read them directly.
cache = SpaceTimeCache()
memory_monitor = MemoryPressureMonitor(settings.SPACETIME_MEMORY_LIMIT)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    On startup the SpaceTime defaults are applied from ``settings`` and the
    shared cache and memory monitor are attached to ``app.state``. On
    shutdown ``cache.cleanup()`` is called.

    Args:
        app: The FastAPI application whose ``state`` receives the shared
            service instances.

    Yields:
        None. Control is handed back to the framework while the application
        is serving requests.
    """
    # Startup
    logger.info("Starting FastAPI with SqrtSpace SpaceTime")

    # Configure SpaceTime
    SpaceTimeConfig.set_defaults(
        memory_limit=settings.SPACETIME_MEMORY_LIMIT,
        external_storage=settings.SPACETIME_EXTERNAL_STORAGE,
        chunk_strategy=settings.SPACETIME_CHUNK_STRATEGY,
        compression=settings.SPACETIME_COMPRESSION
    )

    # Initialize services
    app.state.cache = cache
    app.state.memory_monitor = memory_monitor

    try:
        yield
    finally:
        # Shutdown. Running this in ``finally`` guarantees the cache cleanup
        # also happens when an exception or cancellation is thrown into the
        # context manager at the ``yield`` point.
        logger.info("Shutting down...")
        cache.cleanup()
# Create FastAPI app
app = FastAPI(
    title="SqrtSpace SpaceTime FastAPI Demo",
    description="Memory-efficient API with √n space-time tradeoffs",
    version="1.0.0",
    lifespan=lifespan
)

# Add CORS middleware
# Credentials are disabled because every origin is allowed: wildcard origins
# must not be combined with credentialed requests. To support cookies or
# Authorization headers across origins, list the trusted origins explicitly
# and only then set allow_credentials=True.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Add custom middleware
app.middleware("http")(memory_monitor_middleware)

# Include routers
app.include_router(products.router, prefix="/products", tags=["products"])
app.include_router(analytics.router, prefix="/analytics", tags=["analytics"])
app.include_router(ml.router, prefix="/ml", tags=["machine-learning"])
app.include_router(reports.router, prefix="/reports", tags=["reports"])
@app.get("/")
async def root():
    """Return the service banner, the docs path and current memory info."""
    banner = {
        "message": "SqrtSpace SpaceTime FastAPI Demo",
        "docs": "/docs",
    }
    # Memory info is read from the module-level monitor on every request.
    banner["memory_usage"] = memory_monitor.get_memory_info()
    return banner
@app.get("/health")
async def health_check():
    """Report service health together with memory and cache statistics.

    Returns:
        A dict with a fixed ``"healthy"`` status, a ``memory`` section built
        from the monitor's memory info and pressure check, and the cache
        statistics.
    """
    info = memory_monitor.get_memory_info()
    memory_section = {
        "usage_mb": info["used_mb"],
        "available_mb": info["available_mb"],
        "percentage": info["percentage"],
        "pressure": memory_monitor.check().value,
    }
    return {
        "status": "healthy",
        "memory": memory_section,
        "cache": cache.get_stats(),
    }
@app.get("/system/memory")
async def system_memory():
    """Return detailed process, SpaceTime and system memory statistics.

    Returns:
        A dict with three sections:

        * ``process``: RSS and VMS in MB, CPU percent sampled over 0.1 s and
          the thread count of the current process.
        * ``spacetime``: configured memory limit and external storage, the
          monitor's current pressure level and the cache statistics.
        * ``system``: total and available memory in MB, memory percent and
          swap percent.
    """
    import asyncio
    import os

    import psutil

    process = psutil.Process(os.getpid())

    # cpu_percent() with a positive interval sleeps for that long, so it is
    # run in the default executor to avoid stalling the event loop.
    loop = asyncio.get_running_loop()
    cpu_percent = await loop.run_in_executor(None, process.cpu_percent, 0.1)

    # Take each snapshot once so the values reported within a section are
    # consistent with each other.
    process_memory = process.memory_info()
    virtual_memory = psutil.virtual_memory()

    return {
        "process": {
            "rss_mb": process_memory.rss / 1024 / 1024,
            "vms_mb": process_memory.vms / 1024 / 1024,
            "cpu_percent": cpu_percent,
            "num_threads": process.num_threads()
        },
        "spacetime": {
            "memory_limit": settings.SPACETIME_MEMORY_LIMIT,
            "external_storage": settings.SPACETIME_EXTERNAL_STORAGE,
            "pressure_level": memory_monitor.check().value,
            "cache_stats": cache.get_stats()
        },
        "system": {
            "total_memory_mb": virtual_memory.total / 1024 / 1024,
            "available_memory_mb": virtual_memory.available / 1024 / 1024,
            "memory_percent": virtual_memory.percent,
            "swap_percent": psutil.swap_memory().percent
        }
    }
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8000.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)

View File

@@ -0,0 +1,260 @@
"""
Product endpoints demonstrating streaming and memory-efficient operations
"""
from fastapi import APIRouter, Query, Response, HTTPException, BackgroundTasks
from fastapi.responses import StreamingResponse
from typing import Optional, List
import json
import csv
import io
from datetime import datetime
from sqrtspace_spacetime import Stream, external_sort
from sqrtspace_spacetime.checkpoint import CheckpointManager
from ..models import Product, ProductUpdate, BulkUpdateRequest, ImportStatus
from ..services.product_service import ProductService
from ..database import get_db
# Router for the product endpoints. The service and checkpoint manager are
# created once at import time and used by the handlers in this module.
router = APIRouter()
product_service = ProductService()
checkpoint_manager = CheckpointManager()
@router.get("/")
async def list_products(
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=1000),
    category: Optional[str] = None,
    min_price: Optional[float] = None,
    max_price: Optional[float] = None
):
    """Return one page of products, optionally filtered.

    Args:
        skip: Number of products to skip (>= 0).
        limit: Page size, between 1 and 1000.
        category: Category filter; ignored when empty.
        min_price: Lower price bound; ignored when ``None``.
        max_price: Upper price bound; ignored when ``None``.

    Returns:
        Whatever ``product_service.get_products`` returns for the page.
    """
    # The category is included only when truthy; the price bounds are included
    # whenever they were supplied, so a bound of 0 is still applied.
    criteria = {'category': category} if category else {}
    price_bounds = {'min_price': min_price, 'max_price': max_price}
    criteria.update(
        {name: bound for name, bound in price_bounds.items() if bound is not None}
    )
    return await product_service.get_products(skip, limit, criteria)
@router.get("/stream")
async def stream_products(
    category: Optional[str] = None,
    format: str = Query("ndjson", regex="^(ndjson|json)$")
):
    """Stream all products as NDJSON or as a single JSON array.

    Products are serialized one at a time as the service yields them, so the
    full result set is never built in this handler.

    Args:
        category: Optional category passed to the product service.
        format: ``"ndjson"`` (default) or ``"json"``.

    Returns:
        A StreamingResponse with response buffering disabled via the
        ``X-Accel-Buffering`` header.
    """
    async def ndjson_lines():
        # One JSON document per line.
        async for item in product_service.stream_products(category):
            yield json.dumps(item.dict()) + "\n"

    async def json_array():
        # Emit "[", comma-separated documents, then "]".
        yield "["
        needs_separator = False
        async for item in product_service.stream_products(category):
            if needs_separator:
                yield ","
            yield json.dumps(item.dict())
            needs_separator = True
        yield "]"

    if format == "ndjson":
        body, media_type = ndjson_lines(), "application/x-ndjson"
    else:
        body, media_type = json_array(), "application/json"

    return StreamingResponse(
        body,
        media_type=media_type,
        headers={"X-Accel-Buffering": "no"},
    )
@router.get("/export/csv")
async def export_csv(
    category: Optional[str] = None,
    columns: Optional[List[str]] = Query(None)
):
    """Export products as a streamed CSV attachment.

    Args:
        category: Optional category passed to the product service.
        columns: Column names to export, in output order. When omitted or
            empty, a default set of columns is used.

    Returns:
        A StreamingResponse of media type ``text/csv`` whose filename carries
        the current local timestamp.

    Raises:
        HTTPException: 400 when a requested column is not exportable. This is
            checked before streaming starts, because an error raised from the
            generator would occur after the response headers were sent.
    """
    default_columns = ["id", "name", "sku", "category", "price", "stock", "created_at"]
    if not columns:
        columns = default_columns
    else:
        # Column names come from the query string and are passed to getattr()
        # below, so restrict them to known fields.
        # NOTE(review): assumes Product is a Pydantic model exposing
        # `model_fields` (v2) or `__fields__` (v1) - confirm against models.py.
        model_fields = (
            getattr(Product, "model_fields", None)
            or getattr(Product, "__fields__", None)
            or {}
        )
        exportable = set(default_columns) | set(model_fields)
        unknown = [col for col in columns if col not in exportable]
        if unknown:
            raise HTTPException(
                status_code=400,
                detail=f"Unknown columns: {', '.join(unknown)}"
            )

    async def generate():
        output = io.StringIO()
        writer = csv.DictWriter(output, fieldnames=columns)

        def drain():
            # Return the buffered text and reset the buffer for reuse.
            data = output.getvalue()
            output.seek(0)
            output.truncate()
            return data

        # Write header
        writer.writeheader()
        yield drain()

        # Stream products in batches, emitting one chunk per batch
        batch_count = 0
        async for batch in product_service.stream_products_batched(category, batch_size=100):
            for product in batch:
                writer.writerow({col: getattr(product, col) for col in columns})
            yield drain()
            batch_count += 1
            if batch_count % 10 == 0:
                # Yield empty string to keep connection alive
                # NOTE(review): confirm the server actually transmits an
                # empty chunk; it may be a no-op.
                yield ""

    filename = f"products_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    return StreamingResponse(
        generate(),
        media_type="text/csv",
        headers={
            "Content-Disposition": f"attachment; filename={filename}",
            "X-Accel-Buffering": "no"
        }
    )
@router.get("/search")
async def search_products(
    q: str = Query(..., min_length=2),
    sort_by: str = Query("relevance", regex="^(relevance|price_asc|price_desc|name)$"),
    limit: int = Query(100, ge=1, le=1000)
):
    """Search products and return at most ``limit`` results.

    Args:
        q: Search text, at least two characters.
        sort_by: One of ``relevance``, ``price_asc``, ``price_desc``, ``name``.
        limit: Maximum number of results to return (1-1000).

    Returns:
        A dict with the (possibly truncated) ``results`` and the ``total``
        number of results obtained from the service.
    """
    matches = await product_service.search_products(q, sort_by, limit)

    # Result sets above 1000 entries are re-sorted with the external sort.
    if len(matches) > 1000:
        def price_ascending(item):
            return item['price']

        def price_descending(item):
            return -item['price']

        def by_name(item):
            return item['name']

        def by_relevance(item):
            return -item['relevance_score']

        key_funcs = {
            'price_asc': price_ascending,
            'price_desc': price_descending,
            'name': by_name,
            'relevance': by_relevance,
        }
        matches = external_sort(matches, key_func=key_funcs[sort_by])

    return {"results": matches[:limit], "total": len(matches)}
@router.post("/bulk-update")
async def bulk_update_prices(
    request: BulkUpdateRequest,
    background_tasks: BackgroundTasks,
    job_id: Optional[str] = None
):
    """Start a checkpointed bulk price update, or look up an existing job.

    Args:
        request: The bulk update payload, forwarded to the product service.
        background_tasks: FastAPI background task queue used to run the update.
        job_id: Optional id of a previously started job. When omitted, a new
            job with a freshly generated id is started.

    Returns:
        For a new job: a message, the new ``job_id`` and its ``status_url``.
        For an existing job: a message, the ``job_id`` and the progress stored
        in its checkpoint.

    Raises:
        HTTPException: 404 when ``job_id`` is given but no checkpoint exists
            for it.
    """
    if job_id is not None:
        # Check for existing checkpoint. The id must come from the caller:
        # a freshly generated id can never match a stored checkpoint.
        checkpoint = checkpoint_manager.restore(job_id)
        if not checkpoint:
            raise HTTPException(status_code=404, detail="Job not found")
        # NOTE(review): this branch only reports the stored progress and does
        # not schedule the task again - confirm whether the service should be
        # re-invoked here to continue an interrupted job.
        return {
            "message": "Resuming previous job",
            "job_id": job_id,
            "progress": checkpoint.get("progress", 0)
        }

    # Random ids avoid the collisions possible with timestamp-derived ids,
    # which made a second concurrent request look like a resumed job.
    from uuid import uuid4
    job_id = f"bulk_update_{uuid4().hex}"

    # Start background task
    background_tasks.add_task(
        product_service.bulk_update_prices,
        request,
        job_id
    )
    return {
        "message": "Bulk update started",
        "job_id": job_id,
        "status_url": f"/products/bulk-update/{job_id}/status"
    }
@router.get("/bulk-update/{job_id}/status")
async def bulk_update_status(job_id: str):
    """Return the checkpointed state of a bulk update job.

    Args:
        job_id: Identifier returned when the job was started.

    Returns:
        A dict with the job id plus status, progress, total, updated and
        errors read from the checkpoint, each with a fallback value.

    Raises:
        HTTPException: 404 when no checkpoint exists for ``job_id``.
    """
    state = checkpoint_manager.restore(job_id)
    if not state:
        raise HTTPException(status_code=404, detail="Job not found")

    # (field, fallback) pairs, in the order they appear in the response.
    fallbacks = (
        ("status", "running"),
        ("progress", 0),
        ("total", 0),
        ("updated", 0),
        ("errors", []),
    )
    report = {"job_id": job_id}
    for field, fallback in fallbacks:
        report[field] = state.get(field, fallback)
    return report
@router.post("/import/csv")
async def import_csv(
    file_url: str,
    background_tasks: BackgroundTasks
):
    """Schedule a background CSV import and return its tracking details.

    Args:
        file_url: Location of the CSV file, forwarded to the product service.
        background_tasks: FastAPI background task queue used to run the import.

    Returns:
        A dict with a message, the generated ``import_id`` and the URL for
        polling the import status.
    """
    # The import id is derived from the current timestamp.
    started_at = datetime.now().timestamp()
    import_id = f"import_{started_at}"

    background_tasks.add_task(product_service.import_from_csv, file_url, import_id)

    response = {"message": "Import started", "import_id": import_id}
    response["status_url"] = f"/products/import/{import_id}/status"
    return response
@router.get("/import/{import_id}/status")
async def import_status(import_id: str):
    """Return the status of an import job.

    Args:
        import_id: Identifier returned when the import was started.

    Raises:
        HTTPException: 404 when the service has no status for ``import_id``.
    """
    job_state = await product_service.get_import_status(import_id)
    if job_state:
        return job_state
    raise HTTPException(status_code=404, detail="Import job not found")
@router.get("/statistics")
async def product_statistics():
    """Return product statistics computed by the product service.

    Returns:
        A dict with the headline statistics at the top level and the
        processing metrics grouped under ``processing_info``.
    """
    stats = await product_service.calculate_statistics()

    headline_keys = (
        "total_products",
        "total_value",
        "by_category",
        "price_distribution",
        "stock_alerts",
    )
    processing_keys = ("memory_used_mb", "external_operations")

    summary = {key: stats[key] for key in headline_keys}
    summary["processing_info"] = {key: stats[key] for key in processing_keys}
    return summary