Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 82 additions & 30 deletions src/madengine/mad_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,44 +459,85 @@ def _process_batch_manifest_entries(
)


def display_results_table(summary: Dict, title: str, show_gpu_arch: bool = False) -> None:
    """Display results in a formatted table with each model as a separate row.

    Args:
        summary: Result summary; read via ``successful_builds``/``successful_runs``
            and ``failed_builds``/``failed_runs`` keys. Entries may be dicts
            (build/run result records) or plain strings.
        title: Title rendered above the table.
        show_gpu_arch: When True (multi-arch builds), add a "GPU Architecture"
            column to every row.
    """
    table = Table(title=title, show_header=True, header_style="bold magenta")
    table.add_column("Index", justify="right", style="dim")
    table.add_column("Status", style="bold")
    table.add_column("Model", style="cyan")

    # Add GPU Architecture column only if a multi-arch build was requested.
    if show_gpu_arch:
        table.add_column("GPU Architecture", style="yellow")

    successful = summary.get("successful_builds", summary.get("successful_runs", []))
    failed = summary.get("failed_builds", summary.get("failed_runs", []))

    def extract_model_name(item):
        """Best-effort model name for one result entry (dict or plain value)."""
        if isinstance(item, dict):
            if "docker_image" in item:
                # Build results: derive the model name from the image tag, e.g.
                # "ci-dummy_dummy.ubuntu.amd" -> "dummy"
                # "ci-dummy_dummy.ubuntu.amd_gfx908" -> "dummy"
                docker_image = item["docker_image"]
                if docker_image.startswith("ci-"):
                    parts = docker_image[3:].split("_")
                    # First "_"-separated part is the model name; fall back to
                    # the raw image name if the tag is empty after the prefix.
                    return parts[0] if parts else docker_image
                return docker_image
            # Run results carry the model name directly.
            if "model" in item:
                return item["model"]
            if "name" in item:
                return item["name"]
        return str(item)[:20]  # Fallback for unexpected entry shapes

    def extract_gpu_arch(item):
        """GPU architecture recorded on a result entry, or 'N/A'."""
        if isinstance(item, dict) and "gpu_architecture" in item:
            return item["gpu_architecture"]
        return "N/A"

    # One row per successful build/run, numbered continuously with failures.
    row_index = 1
    for item in successful:
        model_name = extract_model_name(item)
        if show_gpu_arch:
            table.add_row(str(row_index), "✅ Success", model_name, extract_gpu_arch(item))
        else:
            table.add_row(str(row_index), "✅ Success", model_name)
        row_index += 1

    # One row per failed build/run.
    for item in failed:
        if isinstance(item, dict):
            model_name = item.get("model", "Unknown")
            if show_gpu_arch:
                # Failed entries historically stored the arch under
                # "architecture"; accept the canonical key as well.
                gpu_arch = item.get("gpu_architecture", item.get("architecture", "N/A"))
                table.add_row(str(row_index), "❌ Failed", model_name, gpu_arch)
            else:
                table.add_row(str(row_index), "❌ Failed", model_name)
        else:
            # String entries (legacy build results) are shown verbatim.
            if show_gpu_arch:
                table.add_row(str(row_index), "❌ Failed", str(item), "N/A")
            else:
                table.add_row(str(row_index), "❌ Failed", str(item))
        row_index += 1

    # Show an explicit empty state rather than a bare header-only table.
    if not successful and not failed:
        if show_gpu_arch:
            table.add_row("1", "ℹ️ No items", "", "")
        else:
            table.add_row("1", "ℹ️ No items", "")

    console.print(table)

Expand All @@ -507,6 +548,14 @@ def build(
List[str],
typer.Option("--tags", "-t", help="Model tags to build (can specify multiple)"),
] = [],
target_archs: Annotated[
List[str],
typer.Option(
"--target-archs",
"-a",
help="Target GPU architectures to build for (e.g., gfx908,gfx90a,gfx942). If not specified, builds single image with MAD_SYSTEM_GPU_ARCHITECTURE from additional_context or detected GPU architecture."
),
] = [],
registry: Annotated[
Optional[str],
typer.Option("--registry", "-r", help="Docker registry to push images to"),
Expand Down Expand Up @@ -658,6 +707,7 @@ def build(
# Create arguments object
args = create_args_namespace(
tags=effective_tags,
target_archs=target_archs,
registry=registry,
additional_context=additional_context,
additional_context_file=additional_context_file,
Expand Down Expand Up @@ -716,7 +766,9 @@ def build(
)

# Display results
display_results_table(build_summary, "Build Results")
# Check if target_archs was used to show GPU architecture column
show_gpu_arch = bool(target_archs)
display_results_table(build_summary, "Build Results", show_gpu_arch)

# Save summary
save_summary_with_feedback(build_summary, summary_output, "Build")
Expand Down
100 changes: 100 additions & 0 deletions src/madengine/tools/distributed_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,17 @@ def build_phase(
else ""
)

# Get target architectures from args if provided
target_archs = getattr(self.args, "target_archs", [])

# Handle comma-separated architectures in a single string
if target_archs:
processed_archs = []
for arch_arg in target_archs:
# Split comma-separated values and add to list
processed_archs.extend([arch.strip() for arch in arch_arg.split(',') if arch.strip()])
target_archs = processed_archs

# If batch_build_metadata is provided, use it to set per-model registry/registry_image
build_summary = builder.build_all_models(
models,
Expand All @@ -189,6 +200,7 @@ def build_phase(
registry,
phase_suffix,
batch_build_metadata=batch_build_metadata,
target_archs=target_archs,
)

# Export build manifest with registry information
Expand Down Expand Up @@ -389,6 +401,52 @@ def run_phase(

print(f"Loaded manifest with {len(manifest['built_images'])} images")

# Filter images by GPU architecture compatibility
try:
runtime_gpu_arch = self.context.get_system_gpu_architecture()
print(f"Runtime GPU architecture detected: {runtime_gpu_arch}")

# Filter manifest images by GPU architecture compatibility
compatible_images = self._filter_images_by_gpu_architecture(
manifest["built_images"], runtime_gpu_arch
)

if not compatible_images:
available_archs = list(set(
img.get('gpu_architecture', 'unknown')
for img in manifest['built_images'].values()
))
available_archs = [arch for arch in available_archs if arch != 'unknown']

if available_archs:
error_msg = (
f"No compatible Docker images found for runtime GPU architecture '{runtime_gpu_arch}'. "
f"Available image architectures: {available_archs}. "
f"Please build images for the target architecture using: "
f"--target-archs {runtime_gpu_arch}"
)
else:
error_msg = (
f"No compatible Docker images found for runtime GPU architecture '{runtime_gpu_arch}'. "
f"The manifest contains legacy images without architecture information. "
f"These will be treated as compatible for backward compatibility."
)

raise RuntimeError(error_msg)

# Update manifest to only include compatible images
manifest["built_images"] = compatible_images
print(f"Filtered to {len(compatible_images)} compatible images for GPU architecture '{runtime_gpu_arch}'")

except Exception as e:
# If GPU architecture detection fails, proceed with all images for backward compatibility
self.rich_console.print(
f"[yellow]Warning: GPU architecture filtering failed: {e}[/yellow]"
)
self.rich_console.print(
"[yellow]Proceeding with all available images (backward compatibility mode)[/yellow]"
)

# Registry is now per-image; CLI registry is fallback
if registry:
print(f"Using registry from CLI: {registry}")
Expand Down Expand Up @@ -789,6 +847,48 @@ def _copy_scripts(self) -> None:
self.console.sh(f"cp -vLR --preserve=all {scripts_path} .")
print(f"Scripts copied to {os.getcwd()}/scripts")

def _filter_images_by_gpu_architecture(self, built_images: typing.Dict, runtime_arch: str) -> typing.Dict:
"""Filter built images by GPU architecture compatibility.

Args:
built_images: Dictionary of built images from manifest
runtime_arch: Runtime GPU architecture (e.g., 'gfx908')

Returns:
dict: Filtered dictionary containing only compatible images
"""
compatible = {}

self.rich_console.print(f"[cyan]Filtering images for runtime GPU architecture: {runtime_arch}[/cyan]")

for image_name, image_info in built_images.items():
image_arch = image_info.get("gpu_architecture")

if not image_arch:
# Legacy images without architecture info - assume compatible for backward compatibility
self.rich_console.print(
f"[yellow] Warning: Image {image_name} has no architecture info, assuming compatible (legacy mode)[/yellow]"
)
compatible[image_name] = image_info
elif image_arch == runtime_arch:
# Exact architecture match
self.rich_console.print(
f"[green] ✓ Compatible: {image_name} (architecture: {image_arch})[/green]"
)
compatible[image_name] = image_info
else:
# Architecture mismatch
self.rich_console.print(
f"[red] ✗ Incompatible: {image_name} (architecture: {image_arch}, runtime: {runtime_arch})[/red]"
)

if not compatible:
self.rich_console.print(f"[red]No compatible images found for runtime architecture: {runtime_arch}[/red]")
else:
self.rich_console.print(f"[green]Found {len(compatible)} compatible image(s)[/green]")

return compatible

def cleanup(self) -> None:
"""Cleanup the scripts/common directory."""
# check the directory exists
Expand Down
Loading