Initial

2025-07-20 04:04:41 -04:00
commit 89909d5b20
27 changed files with 11534 additions and 0 deletions
--- a/distsys/shuffle_optimizer.py
+++ b/distsys/shuffle_optimizer.py
@@ -0,0 +1,636 @@
+#!/usr/bin/env python3
+"""
+Distributed Shuffle Optimizer: Optimize shuffle operations in distributed computing
+
+Features:
+- Buffer Sizing: Calculate optimal buffer sizes per node
+- Spill Strategy: Decide when to spill based on memory pressure
+- Aggregation Trees: Build √n-height aggregation trees
+- Network Awareness: Consider network topology in optimization
+- AI Explanations: Clear reasoning for optimization decisions
+"""
+
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import numpy as np
+import json
+import time
+import psutil
+import socket
+from dataclasses import dataclass, asdict
+from typing import Dict, List, Tuple, Optional, Any, Union
+from enum import Enum
+import heapq
+import zlib
+
+# Import core components
+from core.spacetime_core import (
+    MemoryHierarchy,
+    SqrtNCalculator,
+    OptimizationStrategy,
+    MemoryProfiler
+)
+
+
+class ShuffleStrategy(Enum):
+    """Shuffle strategies for distributed systems.
+
+    ``ShuffleOptimizer._choose_strategy`` selects one member per task. The
+    member's string value is what shows up in plan explanations and in the
+    example output at the bottom of this module.
+    """
+    ALL_TO_ALL = "all_to_all"              # Every node to every node
+    TREE_AGGREGATE = "tree_aggregate"       # Hierarchical aggregation
+    HASH_PARTITION = "hash_partition"       # Hash-based partitioning
+    RANGE_PARTITION = "range_partition"     # Range-based partitioning
+    COMBINER_BASED = "combiner_based"       # Local combining first
+
+
+class CompressionType(Enum):
+    """Compression algorithms for shuffle data.
+
+    The code visible in this module only uses these members as labels with
+    assumed throughput and size ratios; it never compresses data itself.
+    """
+    NONE = "none"
+    SNAPPY = "snappy"    # Fast, moderate compression
+    ZLIB = "zlib"        # Slower, better compression
+    LZ4 = "lz4"          # Very fast, light compression
+
+
+@dataclass
+class NodeInfo:
+    """Information about a compute node.
+
+    Instances are keyed by ``node_id`` in the topology and in every per-node
+    dictionary of a shuffle plan.
+    """
+    node_id: str  # Key used for all per-node dictionaries
+    hostname: str
+    cpu_cores: int  # Summed over all nodes when estimating CPU time
+    memory_gb: float  # Converted to bytes with 1e9 bytes per GB
+    network_bandwidth_gbps: float  # Link speed in Gbps
+    storage_type: str  # 'ssd' or 'hdd'
+    rack_id: Optional[str] = None  # None is treated as the 'default' rack
+
+
+@dataclass
+class ShuffleTask:
+    """A shuffle task specification.
+
+    Describes the data to be shuffled; the optimizer reads these fields to
+    pick a strategy, size buffers and choose compression.
+    """
+    task_id: str
+    input_partitions: int  # Used as a divisor when estimating bytes per partition
+    output_partitions: int  # Number of partitions assigned to nodes
+    data_size_gb: float  # Total data size; converted with 1e9 bytes per GB
+    key_distribution: str  # 'uniform', 'skewed', 'heavy_hitters'
+    value_size_avg: int    # Average value size in bytes
+    # Any truthy value makes the optimizer prefer COMBINER_BASED once the
+    # data is at least 1 GB (see ShuffleOptimizer._choose_strategy).
+    combiner_function: Optional[str] = None  # 'sum', 'max', 'collect', etc.
+
+
+@dataclass
+class ShufflePlan:
+    """Optimized shuffle execution plan.
+
+    Built by ``ShuffleOptimizer.optimize_shuffle``, which first creates the
+    plan with placeholder estimates and then fills in the time, network,
+    memory and explanation fields.
+    """
+    strategy: ShuffleStrategy
+    buffer_sizes: Dict[str, int]  # node_id -> buffer_size (bytes)
+    spill_thresholds: Dict[str, float]  # node_id -> threshold (80% of the buffer size, in bytes)
+    aggregation_tree: Optional[Dict[str, List[str]]]  # parent -> children; None unless TREE_AGGREGATE
+    compression: CompressionType
+    partition_assignment: Dict[int, str]  # partition -> node_id
+    estimated_time: float  # Seconds, from CostModel.estimate_shuffle_time
+    estimated_network_usage: float  # Estimated bytes sent over the network
+    memory_usage: Dict[str, float]  # node_id -> bytes (buffer + overhead + compression buffers)
+    explanation: str  # Human-readable summary of the decisions above
+
+
+@dataclass
+class ShuffleMetrics:
+    """Metrics from shuffle execution.
+
+    In this module they are produced by ``ShuffleOptimizer.execute_shuffle``,
+    which simulates a run rather than measuring a real one.
+    """
+    total_time: float  # Elapsed wall-clock seconds
+    network_bytes: int
+    disk_spills: int  # Number of nodes whose buffer is smaller than their share of the data
+    memory_peak: int  # Largest per-node memory usage, in bytes
+    compression_ratio: float  # Uncompressed size / compressed size; 1.0 without compression
+    skew_factor: float  # Max/avg partition size
+
+
+class NetworkTopology:
+    """Rack-aware view of the cluster network.
+
+    Attributes:
+        nodes: Mapping of node id to the node description.
+        racks: Mapping of rack id to the ids of the nodes it holds; nodes
+            without a rack id are filed under 'default'.
+        bandwidth_matrix: Mapping of (source, destination) node-id pairs to
+            the bandwidth between them in Gbps.
+    """
+
+    def __init__(self, nodes: List[NodeInfo]):
+        self.nodes = {info.node_id: info for info in nodes}
+        self.racks = self._group_by_rack(nodes)
+        self.bandwidth_matrix = self._build_bandwidth_matrix()
+
+    def _group_by_rack(self, nodes: List[NodeInfo]) -> Dict[str, List[str]]:
+        """Return rack id -> node ids, in the order the nodes were given."""
+        members = {}
+        for info in nodes:
+            members.setdefault(info.rack_id or 'default', []).append(info.node_id)
+        return members
+
+    def _build_bandwidth_matrix(self) -> Dict[Tuple[str, str], float]:
+        """Return the bandwidth in Gbps for every ordered pair of nodes.
+
+        A node paired with itself is local and gets infinite bandwidth. Any
+        other pair is limited by the slower of the two nodes, halved when the
+        nodes sit in different racks.
+        """
+        links = {}
+        for src, src_info in self.nodes.items():
+            for dst, dst_info in self.nodes.items():
+                if src == dst:
+                    links[(src, dst)] = float('inf')
+                    continue
+                slowest = min(
+                    src_info.network_bandwidth_gbps,
+                    dst_info.network_bandwidth_gbps,
+                )
+                if self._same_rack(src, dst):
+                    links[(src, dst)] = slowest
+                else:
+                    links[(src, dst)] = slowest * 0.5
+        return links
+
+    def _same_rack(self, node1: str, node2: str) -> bool:
+        """Tell whether both nodes resolve to the same rack id."""
+        first = self.nodes[node1].rack_id or 'default'
+        second = self.nodes[node2].rack_id or 'default'
+        return first == second
+
+    def get_bandwidth(self, src: str, dst: str) -> float:
+        """Return the bandwidth from src to dst in Gbps (1.0 for unknown pairs)."""
+        try:
+            return self.bandwidth_matrix[(src, dst)]
+        except KeyError:
+            return 1.0
+
+
+class CostModel:
+    """Cost model for shuffle operations.
+
+    Produces rough time estimates in seconds for a shuffle plan from three
+    components: network transfer, spill I/O, and CPU work for serialization
+    and compression.
+    """
+
+    # Assumed spill throughput in bytes per second.
+    _SSD_BYTES_PER_SEC = 500e6
+    _HDD_BYTES_PER_SEC = 200e6
+
+    def __init__(self, topology: NetworkTopology):
+        """Store the topology and detect the local memory hierarchy."""
+        self.topology = topology
+        self.hierarchy = MemoryHierarchy.detect_system()
+
+    def estimate_shuffle_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
+        """Estimate shuffle execution time in seconds.
+
+        Network transfer and spill I/O are assumed to overlap, so only the
+        slower of the two counts; 10% of the CPU time is added on top.
+        """
+        network_time = self._estimate_network_time(task, plan)
+        io_time = self._estimate_io_time(task, plan)
+        cpu_time = self._estimate_cpu_time(task, plan)
+        return max(network_time, io_time) + cpu_time * 0.1
+
+    def _average_link_bytes_per_sec(self) -> Optional[float]:
+        """Return the mean inter-node bandwidth in bytes per second.
+
+        Same-node entries of the bandwidth matrix are infinite and would turn
+        the mean into infinity, so only links between distinct nodes are
+        averaged. Returns None when there are no such links (for example a
+        single-node cluster).
+        """
+        link_gbps = [
+            gbps
+            for (src, dst), gbps in self.topology.bandwidth_matrix.items()
+            if src != dst
+        ]
+        if not link_gbps:
+            return None
+        # Gbps is gigabits per second: 1 Gbps = 1e9 / 8 bytes per second.
+        return float(np.mean(link_gbps)) * 1e9 / 8
+
+    def _estimate_network_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
+        """Estimate network transfer time in seconds for the plan's strategy."""
+        link_rate = self._average_link_bytes_per_sec()
+        if link_rate is None:
+            # No inter-node links: nothing has to cross the network.
+            return 0.0
+        if link_rate == 0:
+            return float('inf')
+
+        total_bytes = task.data_size_gb * 1e9
+
+        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
+            # Every partition to every node
+            return total_bytes / link_rate
+
+        if plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
+            # Log(n) levels in tree; at least two nodes exist here because
+            # an inter-node link was found, so the height is >= 1.
+            tree_height = np.log2(len(self.topology.nodes))
+            bytes_per_level = total_bytes / tree_height
+            return float(tree_height * bytes_per_level / link_rate)
+
+        # Hash/range partition: each partition to one node
+        bytes_per_partition = total_bytes / task.input_partitions
+        return bytes_per_partition * task.output_partitions / link_rate
+
+    def _estimate_io_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
+        """Estimate disk I/O time in seconds spent on spilling.
+
+        Each node is assumed to receive an equal share of the data; whatever
+        exceeds its buffer is spilled at the speed of that node's storage
+        type. The per-node times are summed.
+        """
+        node_count = len(self.topology.nodes)
+        if node_count == 0:
+            return 0.0
+
+        node_data = task.data_size_gb * 1e9 / node_count
+        io_time = 0.0
+
+        for node_id in plan.spill_thresholds:
+            spill_amount = node_data - plan.buffer_sizes[node_id]
+            if spill_amount <= 0:
+                continue
+            storage = str(self.topology.nodes[node_id].storage_type).lower()
+            if storage == 'ssd':
+                io_speed = self._SSD_BYTES_PER_SEC
+            else:
+                io_speed = self._HDD_BYTES_PER_SEC
+            io_time += spill_amount / io_speed
+
+        return io_time
+
+    def _estimate_cpu_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
+        """Estimate CPU time in seconds for serialization and compression.
+
+        The work is assumed to spread evenly over every core in the cluster.
+        """
+        total_cores = sum(n.cpu_cores for n in self.topology.nodes.values())
+        total_bytes = task.data_size_gb * 1e9
+
+        # Serialization cost: 1GB/s per core
+        serialize_time = total_bytes / (1e9 * total_cores)
+
+        if plan.compression == CompressionType.NONE:
+            return serialize_time
+
+        # Compression throughput per core; anything else is costed like LZ4.
+        compress_rates = {
+            CompressionType.ZLIB: 100e6,    # 100MB/s per core
+            CompressionType.SNAPPY: 500e6,  # 500MB/s per core
+        }
+        compress_rate = compress_rates.get(plan.compression, 1e9)  # LZ4: 1GB/s
+        compress_time = total_bytes / (compress_rate * total_cores)
+
+        return serialize_time + compress_time
+
+
+class ShuffleOptimizer:
+    """Main distributed shuffle optimizer"""
+    
+    def __init__(self, nodes: List[NodeInfo], memory_limit_fraction: float = 0.5):
+        """Set up the topology, cost model and √n calculator for a cluster.
+
+        Args:
+            nodes: Compute nodes that take part in the shuffle.
+            memory_limit_fraction: Share of each node's memory that shuffle
+                buffers may use (read by ``_calculate_buffer_sizes``).
+        """
+        self.topology = NetworkTopology(nodes)
+        # The cost model is built on the topology created on the line above.
+        self.cost_model = CostModel(self.topology)
+        self.memory_limit_fraction = memory_limit_fraction
+        self.sqrt_calc = SqrtNCalculator()
+    
+    def optimize_shuffle(self, task: ShuffleTask) -> ShufflePlan:
+        """Build a complete shuffle plan for ``task``.
+
+        The structural decisions (strategy, buffers, spill thresholds,
+        optional aggregation tree, compression, partition placement) are
+        made first. The plan is then created with empty estimates, which are
+        filled in afterwards because the estimators take the plan as input.
+        """
+        chosen = self._choose_strategy(task)
+        buffers = self._calculate_buffer_sizes(task)
+        thresholds = self._calculate_spill_thresholds(task, buffers)
+
+        # Only the tree strategy needs an aggregation tree.
+        tree = (
+            self._build_aggregation_tree()
+            if chosen == ShuffleStrategy.TREE_AGGREGATE
+            else None
+        )
+
+        plan = ShufflePlan(
+            strategy=chosen,
+            buffer_sizes=buffers,
+            spill_thresholds=thresholds,
+            aggregation_tree=tree,
+            compression=self._choose_compression(task),
+            partition_assignment=self._assign_partitions(task, chosen),
+            estimated_time=0.0,
+            estimated_network_usage=0.0,
+            memory_usage={},
+            explanation="",
+        )
+
+        # Estimates first, explanation last: the explanation quotes them.
+        plan.estimated_time = self.cost_model.estimate_shuffle_time(task, plan)
+        plan.estimated_network_usage = self._estimate_network_usage(task, plan)
+        plan.memory_usage = self._estimate_memory_usage(task, plan)
+        plan.explanation = self._generate_explanation(task, plan)
+
+        return plan
+    
+    def _choose_strategy(self, task: ShuffleTask) -> ShuffleStrategy:
+        """Pick the shuffle strategy for ``task``.
+
+        Rules are checked in priority order and the first match wins:
+        under 1 GB of data, a combiner function, more than 10 nodes, skewed
+        keys. Hash partitioning is the fallback.
+        """
+        if task.data_size_gb < 1:
+            # Small data: a full exchange is cheap enough
+            choice = ShuffleStrategy.ALL_TO_ALL
+        elif task.combiner_function:
+            # A combiner allows local pre-aggregation
+            choice = ShuffleStrategy.COMBINER_BASED
+        elif len(self.topology.nodes) > 10:
+            # Many nodes: aggregate through a tree
+            choice = ShuffleStrategy.TREE_AGGREGATE
+        elif task.key_distribution == 'skewed':
+            # Skewed keys: range partitioning
+            choice = ShuffleStrategy.RANGE_PARTITION
+        else:
+            choice = ShuffleStrategy.HASH_PARTITION
+        return choice
+    
+    def _calculate_buffer_sizes(self, task: ShuffleTask) -> Dict[str, int]:
+        """Return the shuffle buffer size in bytes for every node.
+
+        Each node is assumed to hold an equal share of the data. A node whose
+        share fits into its memory budget buffers the whole share; otherwise
+        the buffer is the √n interval of its record count times the average
+        value size, capped at the budget.
+        """
+        node_count = len(self.topology.nodes)
+        sizes = {}
+
+        for node_id, node in self.topology.nodes.items():
+            # Memory this node may spend on shuffle buffers
+            budget = node.memory_gb * 1e9 * self.memory_limit_fraction
+            # Equal share of the data per node
+            share = task.data_size_gb * 1e9 / node_count
+
+            if share > budget:
+                record_count = int(share / task.value_size_avg)
+                interval = self.sqrt_calc.calculate_interval(record_count)
+                sqrt_bytes = interval * task.value_size_avg
+                sizes[node_id] = min(int(sqrt_bytes), int(budget))
+            else:
+                sizes[node_id] = int(share)
+
+        return sizes
+    
+    def _calculate_spill_thresholds(self, task: ShuffleTask,
+                                  buffer_sizes: Dict[str, int]) -> Dict[str, float]:
+        """Return the spill threshold for every node.
+
+        A node spills once 80% of its buffer is used, which leaves headroom
+        before the buffer is full. ``task`` is not consulted.
+        """
+        return {
+            node_id: capacity * 0.8
+            for node_id, capacity in buffer_sizes.items()
+        }
+    
+    def _build_aggregation_tree(self) -> Dict[str, List[str]]:
+        """Build an aggregation tree over all nodes.
+
+        The branching factor is chosen so that the tree is at most about √n
+        levels high. Nodes are grouped level by level; the first node of each
+        group becomes the parent of the others and moves up to the next
+        level, until a single root remains.
+
+        Returns:
+            Mapping of parent node id to all of its children, accumulated
+            over every level. Empty for clusters with fewer than two nodes.
+        """
+        nodes = list(self.topology.nodes.keys())
+        n = len(nodes)
+        if n < 2:
+            # Nothing to aggregate; also avoids dividing by a zero height.
+            return {}
+
+        # Calculate branching factor for √n height
+        height = int(np.sqrt(n))
+        # At least 2, otherwise a level would never shrink.
+        branching_factor = max(2, int(np.ceil(n ** (1 / height))))
+
+        tree: Dict[str, List[str]] = {}
+        current_level = nodes
+
+        while len(current_level) > 1:
+            next_level = []
+
+            for i in range(0, len(current_level), branching_factor):
+                group = current_level[i:i + branching_factor]
+                parent, children = group[0], group[1:]
+                if children:
+                    # Extend instead of assign: the parent may already own
+                    # children from a lower level, which must not be lost.
+                    tree.setdefault(parent, []).extend(children)
+                next_level.append(parent)
+
+            current_level = next_level
+
+        return tree
+    
+    def _choose_compression(self, task: ShuffleTask) -> CompressionType:
+        """Pick a compression codec from node bandwidth and value size.
+
+        Uses the mean per-node bandwidth in Gbps. Above 10 Gbps nothing is
+        compressed; otherwise values larger than 1000 bytes get ZLIB, a mean
+        above 1 Gbps gets SNAPPY, and everything else gets LZ4. Both
+        bandwidth comparisons are strict.
+        """
+        node_gbps = [
+            node.network_bandwidth_gbps
+            for node in self.topology.nodes.values()
+        ]
+        mean_gbps = np.mean(node_gbps)
+
+        if mean_gbps > 10:
+            # High bandwidth: no compression
+            codec = CompressionType.NONE
+        elif task.value_size_avg > 1000:
+            # Large values: use better compression
+            codec = CompressionType.ZLIB
+        elif mean_gbps > 1:
+            # Medium bandwidth: balanced compression
+            codec = CompressionType.SNAPPY
+        else:
+            # Low bandwidth: fast compression
+            codec = CompressionType.LZ4
+        return codec
+    
+    def _assign_partitions(self, task: ShuffleTask,
+                         strategy: ShuffleStrategy) -> Dict[int, str]:
+        """Assign every output partition to a node.
+
+        Range partitioning gives each node one contiguous block of
+        partitions. Block sizes differ by at most one: the remainder is
+        spread over the first nodes instead of being piled onto the last
+        one. Every other strategy uses round-robin assignment.
+
+        Returns:
+            Mapping of partition index to node id.
+        """
+        nodes = list(self.topology.nodes.keys())
+
+        if strategy == ShuffleStrategy.RANGE_PARTITION:
+            base, extra = divmod(task.output_partitions, len(nodes))
+            assignment = {}
+            start = 0
+            for i, node in enumerate(nodes):
+                # The first `extra` nodes take one additional partition.
+                end = start + base + (1 if i < extra else 0)
+                for p in range(start, end):
+                    assignment[p] = node
+                start = end
+            return assignment
+
+        # Hash partitioning and all remaining strategies: round-robin
+        return {
+            p: nodes[p % len(nodes)]
+            for p in range(task.output_partitions)
+        }
+    
+    def _estimate_network_usage(self, task: ShuffleTask, plan: ShufflePlan) -> float:
+        """Estimate the number of bytes the shuffle sends over the network.
+
+        Starts from the raw data size, scales it by the assumed compressed
+        fraction of the plan's codec, then by a strategy multiplier:
+        (n - 1) / n for all-to-all and log2(n) for tree aggregation, where n
+        is the node count.
+        """
+        # Fraction of the bytes that remain after compression
+        remaining_fraction = {
+            CompressionType.ZLIB: 0.3,    # ~70% compression
+            CompressionType.SNAPPY: 0.5,  # ~50% compression
+            CompressionType.LZ4: 0.7,     # ~30% compression
+        }
+        wire_bytes = task.data_size_gb * 1e9
+        wire_bytes = wire_bytes * remaining_fraction.get(plan.compression, 1.0)
+
+        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
+            # Each node sends to n-1 others
+            node_count = len(self.topology.nodes)
+            return wire_bytes * ((node_count - 1) / node_count)
+
+        if plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
+            # Log(n) levels
+            return wire_bytes * np.log2(len(self.topology.nodes))
+
+        return wire_bytes
+    
+    def _estimate_memory_usage(self, task: ShuffleTask, plan: ShufflePlan) -> Dict[str, float]:
+        """Estimate the memory each node needs for the shuffle.
+
+        Per node this is the buffer itself, 10% overhead for metadata and
+        indices, and, when compression is enabled, a compression buffer of
+        10% of the buffer size capped at 100 MiB. ``task`` is not consulted.
+        """
+        compressing = plan.compression != CompressionType.NONE
+        compress_cap = 100 * 1024 * 1024  # Max 100MB
+
+        usage = {}
+        for node_id in self.topology.nodes:
+            buffer_bytes = plan.buffer_sizes[node_id]
+            metadata_bytes = buffer_bytes * 0.1
+            if compressing:
+                codec_bytes = min(buffer_bytes * 0.1, compress_cap)
+            else:
+                codec_bytes = 0
+            usage[node_id] = buffer_bytes + metadata_bytes + codec_bytes
+
+        return usage
+    
+    def _generate_explanation(self, task: ShuffleTask, plan: ShufflePlan) -> str:
+        """Generate a human-readable explanation of the plan.
+
+        The text covers the chosen strategy, the average buffer size, the
+        compression codec (if any) and the estimated time and network
+        volume. ``task`` is kept for interface compatibility.
+
+        Returns:
+            The explanation sentences joined by single spaces.
+        """
+        # Fraction of the bytes left after compression. These mirror the
+        # factors applied in _estimate_network_usage. The saving is computed
+        # from them directly because the plan's network estimate also
+        # contains the strategy multiplier, which is not a compression effect.
+        compressed_fraction = {
+            CompressionType.ZLIB: 0.3,
+            CompressionType.SNAPPY: 0.5,
+            CompressionType.LZ4: 0.7,
+        }
+
+        explanations = []
+
+        # Strategy explanation
+        strategy_reasons = {
+            ShuffleStrategy.ALL_TO_ALL: "small data size allows full exchange",
+            ShuffleStrategy.TREE_AGGREGATE: f"√n-height tree reduces network hops to {int(np.sqrt(len(self.topology.nodes)))}",
+            ShuffleStrategy.HASH_PARTITION: "uniform data distribution suits hash partitioning",
+            ShuffleStrategy.RANGE_PARTITION: "skewed data benefits from range partitioning",
+            ShuffleStrategy.COMBINER_BASED: "combiner function enables local aggregation"
+        }
+
+        explanations.append(
+            f"Using {plan.strategy.value} strategy because {strategy_reasons[plan.strategy]}."
+        )
+
+        # Buffer sizing
+        avg_buffer_mb = np.mean(list(plan.buffer_sizes.values())) / 1e6
+        explanations.append(
+            f"Allocated {avg_buffer_mb:.0f}MB buffers per node using √n principle "
+            f"to balance memory usage and I/O."
+        )
+
+        # Compression
+        if plan.compression != CompressionType.NONE:
+            saved_pct = (1 - compressed_fraction.get(plan.compression, 1.0)) * 100
+            explanations.append(
+                f"Applied {plan.compression.value} compression to reduce network "
+                f"traffic by ~{saved_pct:.0f}%."
+            )
+
+        # Performance estimate
+        explanations.append(
+            f"Estimated completion time: {plan.estimated_time:.1f}s with "
+            f"{plan.estimated_network_usage / 1e9:.1f}GB network transfer."
+        )
+
+        return " ".join(explanations)
+    
+    def execute_shuffle(self, task: ShuffleTask, plan: ShufflePlan) -> ShuffleMetrics:
+        """Simulate shuffle execution (for testing).
+
+        No data is moved: the method sleeps briefly and derives the metrics
+        from the plan's estimates.
+
+        Returns:
+            Metrics whose time is the measured wall-clock duration of the
+            simulation. ``memory_peak`` is truncated to whole bytes and is 0
+            when the plan has no memory estimate.
+        """
+        start_time = time.time()
+
+        # Simulate execution
+        time.sleep(0.1)  # Simulate some work
+
+        # Assumed uncompressed/compressed size ratio per codec
+        assumed_ratio = {
+            CompressionType.ZLIB: 3.3,
+            CompressionType.SNAPPY: 2.0,
+            CompressionType.LZ4: 1.4,
+        }
+
+        return ShuffleMetrics(
+            total_time=time.time() - start_time,
+            network_bytes=int(plan.estimated_network_usage),
+            # A node spills when its buffer is smaller than its data share
+            disk_spills=sum(1 for b in plan.buffer_sizes.values()
+                            if b < task.data_size_gb * 1e9 / len(self.topology.nodes)),
+            # The field is declared as int, while the estimates are floats
+            memory_peak=int(max(plan.memory_usage.values(), default=0)),
+            compression_ratio=assumed_ratio.get(plan.compression, 1.0),
+            skew_factor=1.0
+        )
+
+
+def create_test_cluster(num_nodes: int = 4) -> List[NodeInfo]:
+    """Return ``num_nodes`` identical SSD nodes for demos and tests.
+
+    Every node has 16 cores, 64 GB of memory and a 10 Gbps link. Nodes are
+    named ``node0``, ``node1``, ... and placed two per rack.
+    """
+    return [
+        NodeInfo(
+            node_id=f"node{index}",
+            hostname=f"worker{index}.cluster.local",
+            cpu_cores=16,
+            memory_gb=64,
+            network_bandwidth_gbps=10.0,
+            storage_type='ssd',
+            rack_id=f"rack{index // 2}",  # 2 nodes per rack
+        )
+        for index in range(num_nodes)
+    ]
+
+
+# Example usage
+if __name__ == "__main__":
+    # Demo: plan three shuffles of different shape and simulate one of them.
+    print("Distributed Shuffle Optimizer Example")
+    print("="*60)
+
+    # Create test cluster
+    nodes = create_test_cluster(4)
+    optimizer = ShuffleOptimizer(nodes)
+
+    # Example 1: Small uniform shuffle (under 1 GB, so all-to-all is chosen)
+    print("\nExample 1: Small uniform shuffle")
+    task1 = ShuffleTask(
+        task_id="shuffle_1",
+        input_partitions=100,
+        output_partitions=100,
+        data_size_gb=0.5,
+        key_distribution='uniform',
+        value_size_avg=100
+    )
+
+    plan1 = optimizer.optimize_shuffle(task1)
+    print(f"Strategy: {plan1.strategy.value}")
+    print(f"Compression: {plan1.compression.value}")
+    print(f"Estimated time: {plan1.estimated_time:.2f}s")
+    print(f"Explanation: {plan1.explanation}")
+
+    # Example 2: Large skewed shuffle (the combiner takes priority over skew)
+    print("\n\nExample 2: Large skewed shuffle")
+    task2 = ShuffleTask(
+        task_id="shuffle_2",
+        input_partitions=1000,
+        output_partitions=500,
+        data_size_gb=100,
+        key_distribution='skewed',
+        value_size_avg=1000,
+        combiner_function='sum'
+    )
+
+    plan2 = optimizer.optimize_shuffle(task2)
+    print(f"Strategy: {plan2.strategy.value}")
+    print(f"Buffer sizes: {list(plan2.buffer_sizes.values())[0] / 1e9:.1f}GB per node")
+    print(f"Network usage: {plan2.estimated_network_usage / 1e9:.1f}GB")
+    print(f"Explanation: {plan2.explanation}")
+
+    # Example 3: Many nodes with aggregation
+    print("\n\nExample 3: Many nodes with tree aggregation")
+    large_cluster = create_test_cluster(16)
+    large_optimizer = ShuffleOptimizer(large_cluster)
+
+    # No combiner_function here: a combiner makes the optimizer pick the
+    # combiner-based strategy before it looks at the node count, and the
+    # aggregation tree this example is meant to show would never be built.
+    task3 = ShuffleTask(
+        task_id="shuffle_3",
+        input_partitions=10000,
+        output_partitions=16,
+        data_size_gb=50,
+        key_distribution='uniform',
+        value_size_avg=200
+    )
+
+    plan3 = large_optimizer.optimize_shuffle(task3)
+    print(f"Strategy: {plan3.strategy.value}")
+    if plan3.aggregation_tree:
+        print(f"Tree height: {int(np.sqrt(len(large_cluster)))}")
+        print(f"Tree structure sample: {list(plan3.aggregation_tree.items())[:3]}")
+    print(f"Explanation: {plan3.explanation}")
+
+    # Simulate execution
+    print("\n\nSimulating shuffle execution...")
+    metrics = optimizer.execute_shuffle(task1, plan1)
+    print(f"Execution time: {metrics.total_time:.3f}s")
+    print(f"Network bytes: {metrics.network_bytes / 1e6:.1f}MB")
+    print(f"Compression ratio: {metrics.compression_ratio:.1f}x")