Initial
This commit is contained in:
636
distsys/shuffle_optimizer.py
Normal file
636
distsys/shuffle_optimizer.py
Normal file
@@ -0,0 +1,636 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Distributed Shuffle Optimizer: Optimize shuffle operations in distributed computing
|
||||
|
||||
Features:
|
||||
- Buffer Sizing: Calculate optimal buffer sizes per node
|
||||
- Spill Strategy: Decide when to spill based on memory pressure
|
||||
- Aggregation Trees: Build √n-height aggregation trees
|
||||
- Network Awareness: Consider network topology in optimization
|
||||
- AI Explanations: Clear reasoning for optimization decisions
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import numpy as np
|
||||
import json
|
||||
import time
|
||||
import psutil
|
||||
import socket
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Dict, List, Tuple, Optional, Any, Union
|
||||
from enum import Enum
|
||||
import heapq
|
||||
import zlib
|
||||
|
||||
# Import core components
|
||||
from core.spacetime_core import (
|
||||
MemoryHierarchy,
|
||||
SqrtNCalculator,
|
||||
OptimizationStrategy,
|
||||
MemoryProfiler
|
||||
)
|
||||
|
||||
|
||||
class ShuffleStrategy(Enum):
    """Data-movement strategies the optimizer can pick for a shuffle."""

    # Every node exchanges data with every other node.
    ALL_TO_ALL = "all_to_all"

    # Data is merged hierarchically through an aggregation tree.
    TREE_AGGREGATE = "tree_aggregate"

    # Keys are routed to partitions by hash.
    HASH_PARTITION = "hash_partition"

    # Keys are routed to partitions by range.
    RANGE_PARTITION = "range_partition"

    # Values are combined locally before being exchanged.
    COMBINER_BASED = "combiner_based"
|
||||
|
||||
|
||||
class CompressionType(Enum):
    """Codecs that may be applied to shuffle data before it is transferred."""

    # Send the data uncompressed.
    NONE = "none"

    # Fast, moderate compression.
    SNAPPY = "snappy"

    # Slower, better compression.
    ZLIB = "zlib"

    # Very fast, light compression.
    LZ4 = "lz4"
|
||||
|
||||
|
||||
@dataclass
class NodeInfo:
    """Static description of one compute node in the cluster.

    ``rack_id`` is optional; it defaults to ``None`` when the rack is not
    specified.
    """

    # Identity
    node_id: str
    hostname: str

    # Compute and memory capacity
    cpu_cores: int
    memory_gb: float

    # Network interface bandwidth
    network_bandwidth_gbps: float

    # Either 'ssd' or 'hdd'
    storage_type: str

    # Rack the node sits in, if known
    rack_id: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class ShuffleTask:
    """Specification of a single shuffle to be planned."""

    task_id: str

    # Partition counts on each side of the shuffle
    input_partitions: int
    output_partitions: int

    # Total amount of data to shuffle, in gigabytes
    data_size_gb: float

    # One of 'uniform', 'skewed', 'heavy_hitters'
    key_distribution: str

    # Average value size in bytes
    value_size_avg: int

    # Optional combiner name, e.g. 'sum', 'max', 'collect'
    combiner_function: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class ShufflePlan:
    """Execution plan for one shuffle, together with its cost estimates."""

    # Chosen data-movement strategy
    strategy: ShuffleStrategy

    # node_id -> buffer_size
    buffer_sizes: Dict[str, int]

    # node_id -> threshold
    spill_thresholds: Dict[str, float]

    # parent -> children; None when no aggregation tree is used
    aggregation_tree: Optional[Dict[str, List[str]]]

    # Codec applied to the shuffled data
    compression: CompressionType

    # partition -> node_id
    partition_assignment: Dict[int, str]

    # Estimates filled in by the optimizer
    estimated_time: float
    estimated_network_usage: float
    memory_usage: Dict[str, float]

    # Human-readable reasoning behind the plan
    explanation: str
|
||||
|
||||
|
||||
@dataclass
class ShuffleMetrics:
    """Measurements reported after a shuffle has been executed."""

    # Elapsed time of the shuffle
    total_time: float

    # Volume counters
    network_bytes: int
    disk_spills: int
    memory_peak: int

    compression_ratio: float

    # Max/avg partition size
    skew_factor: float
|
||||
|
||||
|
||||
class NetworkTopology:
    """Rack-aware model of the cluster network.

    Attributes:
        nodes: node_id -> node description.
        racks: rack id -> list of node ids (nodes without a rack are filed
            under ``'default'``).
        bandwidth_matrix: (src, dst) -> bandwidth for every ordered node pair.
    """

    def __init__(self, nodes: List[NodeInfo]):
        self.nodes = {info.node_id: info for info in nodes}
        self.racks = self._group_by_rack(nodes)
        self.bandwidth_matrix = self._build_bandwidth_matrix()

    def _group_by_rack(self, nodes: List[NodeInfo]) -> Dict[str, List[str]]:
        """Return a mapping from rack id to the ids of the nodes it holds."""
        members_by_rack = {}
        for info in nodes:
            rack_key = info.rack_id or 'default'
            members_by_rack.setdefault(rack_key, []).append(info.node_id)
        return members_by_rack

    def _build_bandwidth_matrix(self) -> Dict[Tuple[str, str], float]:
        """Compute the bandwidth of every ordered (src, dst) node pair.

        A node talking to itself is treated as unlimited. Otherwise the link
        is capped by the slower of the two nodes, and halved when the nodes
        sit in different racks.
        """
        links = {}
        for src, src_info in self.nodes.items():
            for dst, dst_info in self.nodes.items():
                if src == dst:
                    # Local transfer: no network involved.
                    links[(src, dst)] = float('inf')
                    continue

                slower_nic = min(
                    src_info.network_bandwidth_gbps,
                    dst_info.network_bandwidth_gbps,
                )
                if self._same_rack(src, dst):
                    links[(src, dst)] = slower_nic
                else:
                    # Cross-rack traffic is assumed to get 50% of node bandwidth.
                    links[(src, dst)] = slower_nic * 0.5
        return links

    def _same_rack(self, node1: str, node2: str) -> bool:
        """Tell whether both nodes belong to the same rack."""
        first_rack, second_rack = (
            self.nodes[node_id].rack_id or 'default'
            for node_id in (node1, node2)
        )
        return first_rack == second_rack

    def get_bandwidth(self, src: str, dst: str) -> float:
        """Look up the bandwidth between two nodes in Gbps (1.0 if unknown)."""
        try:
            return self.bandwidth_matrix[(src, dst)]
        except KeyError:
            return 1.0
|
||||
|
||||
|
||||
class CostModel:
    """Analytical cost model for shuffle operations.

    Combines three estimates -- network transfer, disk spill I/O and CPU work
    (serialization plus compression) -- into one wall-clock figure.
    """

    # Assumed spill throughput in bytes/second: 500MB/s for SSD, 200MB/s for HDD.
    _SSD_BYTES_PER_SEC = 500e6
    _HDD_BYTES_PER_SEC = 200e6

    def __init__(self, topology: NetworkTopology):
        self.topology = topology
        self.hierarchy = MemoryHierarchy.detect_system()

    def estimate_shuffle_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate shuffle execution time in seconds.

        Args:
            task: The shuffle being costed.
            plan: The candidate plan (strategy, buffers, compression).

        Returns:
            ``max(network, io)`` plus 10% of the CPU estimate.
        """
        network_time = self._estimate_network_time(task, plan)
        io_time = self._estimate_io_time(task, plan)
        cpu_time = self._estimate_cpu_time(task, plan)

        # Take max as network and disk can overlap; only a tenth of the CPU
        # cost is charged on top.
        return max(network_time, io_time) + cpu_time * 0.1

    def _average_link_bandwidth(self) -> float:
        """Return the mean bandwidth over inter-node links.

        Self-links are stored as ``inf`` in the bandwidth matrix; including
        them would make the mean infinite and every network estimate zero, so
        only finite entries are averaged. Returns ``inf`` when there is no
        finite link at all (e.g. a single-node cluster).
        """
        finite_links = [
            bandwidth
            for bandwidth in self.topology.bandwidth_matrix.values()
            if np.isfinite(bandwidth)
        ]
        if not finite_links:
            return float('inf')
        return float(np.mean(finite_links))

    def _estimate_network_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate network transfer time in seconds for the plan's strategy."""
        avg_bandwidth = self._average_link_bandwidth()
        if avg_bandwidth == float('inf'):
            # No inter-node links: nothing has to cross the network.
            return 0.0
        if avg_bandwidth <= 0:
            # Links exist but carry no bandwidth: the transfer never finishes.
            return float('inf')

        # NOTE(review): the bandwidth figure is named "gbps" but is applied to
        # a byte count without a bits->bytes conversion -- confirm the unit.
        transfer_rate = avg_bandwidth * 1e9
        total_bytes = task.data_size_gb * 1e9

        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            # Every partition to every node
            return total_bytes / transfer_rate

        if plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # Log(n) levels in tree
            num_nodes = len(self.topology.nodes)
            tree_height = float(np.log2(num_nodes)) if num_nodes > 1 else 0.0
            if tree_height <= 0:
                # A single node has no tree levels to traverse.
                return 0.0
            bytes_per_level = total_bytes / tree_height
            return tree_height * bytes_per_level / transfer_rate

        # Hash/range partition: each partition to one node
        bytes_per_partition = total_bytes / task.input_partitions
        return bytes_per_partition * task.output_partitions / transfer_rate

    def _estimate_io_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate disk I/O time in seconds caused by spilling.

        Data is assumed to be spread evenly over the nodes. A node spills
        whatever exceeds its buffer, at the throughput of its own storage
        type. Per-node times are summed.
        """
        num_nodes = len(self.topology.nodes)
        if num_nodes == 0:
            return 0.0

        node_data = task.data_size_gb * 1e9 / num_nodes
        io_time = 0.0

        for node_id in plan.spill_thresholds:
            spill_amount = node_data - plan.buffer_sizes[node_id]
            if spill_amount <= 0:
                continue

            # Pick the disk speed from the node itself, not from the plan.
            storage = str(self.topology.nodes[node_id].storage_type).lower()
            if 'ssd' in storage:
                io_speed = self._SSD_BYTES_PER_SEC
            else:
                io_speed = self._HDD_BYTES_PER_SEC
            io_time += spill_amount / io_speed

        return io_time

    def _estimate_cpu_time(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate CPU time in seconds for serialization and compression."""
        total_cores = sum(n.cpu_cores for n in self.topology.nodes.values())
        total_bytes = task.data_size_gb * 1e9

        # Serialization cost: 1GB/s per core
        serialize_rate = 1e9
        serialize_time = total_bytes / (serialize_rate * total_cores)

        # Compression cost
        if plan.compression == CompressionType.NONE:
            compress_time = 0
        else:
            if plan.compression == CompressionType.ZLIB:
                compress_rate = 100e6  # 100MB/s per core
            elif plan.compression == CompressionType.SNAPPY:
                compress_rate = 500e6  # 500MB/s per core
            else:  # LZ4
                compress_rate = 1e9  # 1GB/s per core
            compress_time = total_bytes / (compress_rate * total_cores)

        return serialize_time + compress_time
|
||||
|
||||
|
||||
class ShuffleOptimizer:
    """Main distributed shuffle optimizer.

    Given a fixed set of cluster nodes, produces a ``ShufflePlan`` for a
    ``ShuffleTask``: strategy, per-node buffers and spill thresholds, an
    optional aggregation tree, compression codec, partition placement and
    cost estimates, plus a textual explanation.
    """

    def __init__(self, nodes: List[NodeInfo], memory_limit_fraction: float = 0.5):
        """Build the topology and cost model for ``nodes``.

        Args:
            nodes: The cluster's compute nodes.
            memory_limit_fraction: Share of each node's memory that shuffle
                buffers may use.
        """
        self.topology = NetworkTopology(nodes)
        self.cost_model = CostModel(self.topology)
        self.memory_limit_fraction = memory_limit_fraction
        self.sqrt_calc = SqrtNCalculator()

    def optimize_shuffle(self, task: ShuffleTask) -> ShufflePlan:
        """Generate an optimized shuffle plan for ``task``."""
        # Choose strategy based on task characteristics
        strategy = self._choose_strategy(task)

        # Calculate buffer sizes using √n principle
        buffer_sizes = self._calculate_buffer_sizes(task)

        # Determine spill thresholds
        spill_thresholds = self._calculate_spill_thresholds(task, buffer_sizes)

        # Build aggregation tree if needed
        aggregation_tree = None
        if strategy == ShuffleStrategy.TREE_AGGREGATE:
            aggregation_tree = self._build_aggregation_tree()

        # Choose compression
        compression = self._choose_compression(task)

        # Assign partitions to nodes
        partition_assignment = self._assign_partitions(task, strategy)

        # The estimates need a plan object, so create it with placeholders
        # first and fill them in afterwards.
        plan = ShufflePlan(
            strategy=strategy,
            buffer_sizes=buffer_sizes,
            spill_thresholds=spill_thresholds,
            aggregation_tree=aggregation_tree,
            compression=compression,
            partition_assignment=partition_assignment,
            estimated_time=0.0,
            estimated_network_usage=0.0,
            memory_usage={},
            explanation=""
        )

        # Calculate estimates
        plan.estimated_time = self.cost_model.estimate_shuffle_time(task, plan)
        plan.estimated_network_usage = self._estimate_network_usage(task, plan)
        plan.memory_usage = self._estimate_memory_usage(task, plan)

        # Generate explanation (reads the estimates set above)
        plan.explanation = self._generate_explanation(task, plan)

        return plan

    @staticmethod
    def _wire_fraction(compression: CompressionType) -> float:
        """Return the assumed fraction of bytes left after compression."""
        if compression == CompressionType.ZLIB:
            return 0.3  # ~70% compression
        if compression == CompressionType.SNAPPY:
            return 0.5  # ~50% compression
        if compression == CompressionType.LZ4:
            return 0.7  # ~30% compression
        return 1.0

    def _choose_strategy(self, task: ShuffleTask) -> ShuffleStrategy:
        """Choose shuffle strategy based on task characteristics.

        Rules are checked in order; the first match wins.
        """
        # Small data: all-to-all is fine
        if task.data_size_gb < 1:
            return ShuffleStrategy.ALL_TO_ALL

        # Has combiner: use combining strategy
        if task.combiner_function:
            return ShuffleStrategy.COMBINER_BASED

        # Many nodes: use tree aggregation
        if len(self.topology.nodes) > 10:
            return ShuffleStrategy.TREE_AGGREGATE

        # Skewed data: use range partitioning
        if task.key_distribution == 'skewed':
            return ShuffleStrategy.RANGE_PARTITION

        # Default: hash partitioning
        return ShuffleStrategy.HASH_PARTITION

    def _calculate_buffer_sizes(self, task: ShuffleTask) -> Dict[str, int]:
        """Calculate per-node buffer sizes in bytes using the √n principle.

        A node whose share of the data fits into its shuffle memory buffers
        all of it. Otherwise the buffer is derived from
        ``sqrt_calc.calculate_interval`` over the record count, capped at the
        available memory.
        """
        nodes = self.topology.nodes
        if not nodes:
            return {}

        # Data is assumed to be spread evenly, so this is loop-invariant.
        data_per_node = task.data_size_gb * 1e9 / len(nodes)
        # Guard against a zero/negative average value size.
        record_size = max(1, task.value_size_avg)

        buffer_sizes = {}
        for node_id, node in nodes.items():
            # Available memory for shuffle
            available_memory = node.memory_gb * 1e9 * self.memory_limit_fraction

            if data_per_node <= available_memory:
                # Can fit all data
                buffer_size = int(data_per_node)
            else:
                # Use √n buffer
                sqrt_buffer = self.sqrt_calc.calculate_interval(
                    int(data_per_node / record_size)
                ) * record_size
                buffer_size = min(int(sqrt_buffer), int(available_memory))

            buffer_sizes[node_id] = buffer_size

        return buffer_sizes

    def _calculate_spill_thresholds(self, task: ShuffleTask,
                                    buffer_sizes: Dict[str, int]) -> Dict[str, float]:
        """Calculate the buffer fill level at which each node spills.

        ``task`` is currently not consulted.
        """
        # Spill at 80% of buffer to leave headroom
        return {
            node_id: buffer_size * 0.8
            for node_id, buffer_size in buffer_sizes.items()
        }

    def _build_aggregation_tree(self) -> Dict[str, List[str]]:
        """Build the aggregation tree as a parent -> children mapping.

        Nodes are grouped level by level; the first node of each group is its
        parent and moves up to the next level. A node that is a parent on
        several levels accumulates children from all of them.

        Returns:
            The tree, or ``{}`` for clusters with fewer than two nodes.
        """
        nodes = list(self.topology.nodes.keys())
        n = len(nodes)
        if n < 2:
            # Nothing to aggregate (also avoids dividing by a zero height).
            return {}

        # Calculate branching factor for √n height; never below 2 so that
        # every level strictly shrinks.
        height = int(np.sqrt(n))
        branching_factor = max(2, int(np.ceil(n ** (1 / height))))

        tree = {}
        current_level = nodes

        while len(current_level) > 1:
            next_level = []
            for i in range(0, len(current_level), branching_factor):
                parent, *children = current_level[i:i + branching_factor]
                if children:
                    # Extend rather than assign: the parent may already own
                    # children from a lower level.
                    tree.setdefault(parent, []).extend(children)
                next_level.append(parent)
            current_level = next_level

        return tree

    def _choose_compression(self, task: ShuffleTask) -> CompressionType:
        """Choose compression based on data characteristics and network."""
        # Average network bandwidth
        avg_bandwidth = np.mean([
            n.network_bandwidth_gbps for n in self.topology.nodes.values()
        ])

        # High bandwidth: no compression
        if avg_bandwidth > 10:  # 10+ Gbps
            return CompressionType.NONE

        # Large values: use better compression
        if task.value_size_avg > 1000:
            return CompressionType.ZLIB

        # Medium bandwidth: balanced compression
        if avg_bandwidth > 1:  # 1-10 Gbps
            return CompressionType.SNAPPY

        # Low bandwidth: fast compression
        return CompressionType.LZ4

    def _assign_partitions(self, task: ShuffleTask,
                           strategy: ShuffleStrategy) -> Dict[int, str]:
        """Assign output partitions to nodes.

        Range partitioning gives each node one contiguous block, with sizes
        differing by at most one. Every other strategy uses round-robin.
        """
        nodes = list(self.topology.nodes.keys())
        assignment = {}

        if strategy == ShuffleStrategy.RANGE_PARTITION:
            # Spread the remainder over the first nodes instead of piling it
            # all onto the last one.
            base, extra = divmod(task.output_partitions, len(nodes))
            start = 0
            for i, node in enumerate(nodes):
                end = start + base + (1 if i < extra else 0)
                for p in range(start, end):
                    assignment[p] = node
                start = end
        else:
            # Hash partitioning and default: round-robin assignment
            for i in range(task.output_partitions):
                assignment[i] = nodes[i % len(nodes)]

        return assignment

    def _estimate_network_usage(self, task: ShuffleTask, plan: ShufflePlan) -> float:
        """Estimate total network bytes for the plan."""
        # Apply compression ratio
        base_bytes = task.data_size_gb * 1e9 * self._wire_fraction(plan.compression)

        # Apply strategy multiplier
        if plan.strategy == ShuffleStrategy.ALL_TO_ALL:
            n = len(self.topology.nodes)
            base_bytes *= (n - 1) / n  # Each node sends to n-1 others
        elif plan.strategy == ShuffleStrategy.TREE_AGGREGATE:
            # Log(n) levels
            base_bytes *= np.log2(len(self.topology.nodes))

        return base_bytes

    def _estimate_memory_usage(self, task: ShuffleTask, plan: ShufflePlan) -> Dict[str, float]:
        """Estimate memory usage per node in bytes.

        Each node is charged its buffer, 10% overhead and, when compression
        is on, a compression buffer of 10% of the buffer capped at 100MB.
        """
        memory_usage = {}

        for node_id in self.topology.nodes:
            # Buffer memory
            buffer_mem = plan.buffer_sizes[node_id]

            # Overhead (metadata, indices)
            overhead = buffer_mem * 0.1

            # Compression buffers if used
            compress_mem = 0
            if plan.compression != CompressionType.NONE:
                compress_mem = min(buffer_mem * 0.1, 100 * 1024 * 1024)  # Max 100MB

            memory_usage[node_id] = buffer_mem + overhead + compress_mem

        return memory_usage

    def _generate_explanation(self, task: ShuffleTask, plan: ShufflePlan) -> str:
        """Generate a human-readable explanation of the plan.

        Expects ``plan.estimated_time`` and ``plan.estimated_network_usage``
        to be filled in already.
        """
        explanations = []

        # Strategy explanation
        strategy_reasons = {
            ShuffleStrategy.ALL_TO_ALL: "small data size allows full exchange",
            ShuffleStrategy.TREE_AGGREGATE: f"√n-height tree reduces network hops to {int(np.sqrt(len(self.topology.nodes)))}",
            ShuffleStrategy.HASH_PARTITION: "uniform data distribution suits hash partitioning",
            ShuffleStrategy.RANGE_PARTITION: "skewed data benefits from range partitioning",
            ShuffleStrategy.COMBINER_BASED: "combiner function enables local aggregation"
        }

        explanations.append(
            f"Using {plan.strategy.value} strategy because {strategy_reasons[plan.strategy]}."
        )

        # Buffer sizing
        avg_buffer_mb = np.mean(list(plan.buffer_sizes.values())) / 1e6
        explanations.append(
            f"Allocated {avg_buffer_mb:.0f}MB buffers per node using √n principle "
            f"to balance memory usage and I/O."
        )

        # Compression: report the codec's own saving. Deriving it from the
        # network estimate would mix in the strategy multiplier and can even
        # yield a negative "reduction".
        if plan.compression != CompressionType.NONE:
            saved_percent = (1 - self._wire_fraction(plan.compression)) * 100
            explanations.append(
                f"Applied {plan.compression.value} compression to reduce network "
                f"traffic by ~{saved_percent:.0f}%."
            )

        # Performance estimate
        explanations.append(
            f"Estimated completion time: {plan.estimated_time:.1f}s with "
            f"{plan.estimated_network_usage / 1e9:.1f}GB network transfer."
        )

        return " ".join(explanations)

    def execute_shuffle(self, task: ShuffleTask, plan: ShufflePlan) -> ShuffleMetrics:
        """Simulate shuffle execution (for testing).

        No data is moved: the method sleeps briefly and derives the metrics
        from the plan's estimates.
        """
        start_time = time.time()

        # Simulate execution
        time.sleep(0.1)  # Simulate some work

        # A node counts as spilling when its buffer is smaller than its even
        # share of the data.
        data_per_node = task.data_size_gb * 1e9 / len(self.topology.nodes)
        spilling_nodes = sum(
            1 for buffer_size in plan.buffer_sizes.values()
            if buffer_size < data_per_node
        )

        # Calculate metrics
        metrics = ShuffleMetrics(
            total_time=time.time() - start_time,
            network_bytes=int(plan.estimated_network_usage),
            disk_spills=spilling_nodes,
            # memory_peak is declared as int; tolerate an empty estimate.
            memory_peak=int(max(plan.memory_usage.values(), default=0)),
            compression_ratio=1.0,
            skew_factor=1.0
        )

        if plan.compression == CompressionType.ZLIB:
            metrics.compression_ratio = 3.3
        elif plan.compression == CompressionType.SNAPPY:
            metrics.compression_ratio = 2.0
        elif plan.compression == CompressionType.LZ4:
            metrics.compression_ratio = 1.4

        return metrics
|
||||
|
||||
|
||||
def create_test_cluster(num_nodes: int = 4) -> List[NodeInfo]:
    """Build a homogeneous cluster of ``num_nodes`` identical nodes for testing.

    Every node has 16 cores, 64 GB of memory, 10.0 Gbps of network bandwidth
    and SSD storage. Consecutive pairs of nodes share a rack.
    """
    return [
        NodeInfo(
            node_id=f"node{index}",
            hostname=f"worker{index}.cluster.local",
            cpu_cores=16,
            memory_gb=64,
            network_bandwidth_gbps=10.0,
            storage_type='ssd',
            rack_id=f"rack{index // 2}",  # 2 nodes per rack
        )
        for index in range(num_nodes)
    ]
|
||||
|
||||
|
||||
# Example usage
|
||||
# Example usage: plans three shuffles on synthetic clusters and prints the
# resulting plans, then simulates execution of the first one.
if __name__ == "__main__":
    print("Distributed Shuffle Optimizer Example")
    print("="*60)

    # Create test cluster
    nodes = create_test_cluster(4)
    optimizer = ShuffleOptimizer(nodes)

    # Example 1: Small uniform shuffle
    print("\nExample 1: Small uniform shuffle")
    task1 = ShuffleTask(
        task_id="shuffle_1",
        input_partitions=100,
        output_partitions=100,
        data_size_gb=0.5,
        key_distribution='uniform',
        value_size_avg=100
    )

    plan1 = optimizer.optimize_shuffle(task1)
    print(f"Strategy: {plan1.strategy.value}")
    print(f"Compression: {plan1.compression.value}")
    print(f"Estimated time: {plan1.estimated_time:.2f}s")
    print(f"Explanation: {plan1.explanation}")

    # Example 2: Large skewed shuffle
    print("\n\nExample 2: Large skewed shuffle")
    task2 = ShuffleTask(
        task_id="shuffle_2",
        input_partitions=1000,
        output_partitions=500,
        data_size_gb=100,
        key_distribution='skewed',
        value_size_avg=1000,
        combiner_function='sum'
    )

    plan2 = optimizer.optimize_shuffle(task2)
    print(f"Strategy: {plan2.strategy.value}")
    print(f"Buffer sizes: {list(plan2.buffer_sizes.values())[0] / 1e9:.1f}GB per node")
    print(f"Network usage: {plan2.estimated_network_usage / 1e9:.1f}GB")
    print(f"Explanation: {plan2.explanation}")

    # Example 3: Many nodes with aggregation
    print("\n\nExample 3: Many nodes with tree aggregation")
    large_cluster = create_test_cluster(16)
    large_optimizer = ShuffleOptimizer(large_cluster)

    # No combiner here on purpose: the optimizer checks for a combiner before
    # it checks the node count, so passing one would select the combiner
    # strategy and this example would never build an aggregation tree.
    task3 = ShuffleTask(
        task_id="shuffle_3",
        input_partitions=10000,
        output_partitions=16,
        data_size_gb=50,
        key_distribution='uniform',
        value_size_avg=200
    )

    plan3 = large_optimizer.optimize_shuffle(task3)
    print(f"Strategy: {plan3.strategy.value}")
    if plan3.aggregation_tree:
        print(f"Tree height: {int(np.sqrt(len(large_cluster)))}")
        print(f"Tree structure sample: {list(plan3.aggregation_tree.items())[:3]}")
    print(f"Explanation: {plan3.explanation}")

    # Simulate execution
    print("\n\nSimulating shuffle execution...")
    metrics = optimizer.execute_shuffle(task1, plan1)
    print(f"Execution time: {metrics.total_time:.3f}s")
    print(f"Network bytes: {metrics.network_bytes / 1e6:.1f}MB")
    print(f"Compression ratio: {metrics.compression_ratio:.1f}x")
|
||||
Reference in New Issue
Block a user