From 364bef4c4ba45036c785b4dca907a0334245ea44 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 8 Aug 2025 10:42:38 -0400 Subject: [PATCH 1/7] Implemented the core multi-GPU architectures support for docker image building --- src/madengine/mad_cli.py | 9 + .../tools/distributed_orchestrator.py | 12 + src/madengine/tools/docker_builder.py | 824 ++++++++++++++---- 3 files changed, 674 insertions(+), 171 deletions(-) diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index 0e707c59..705db264 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -507,6 +507,14 @@ def build( List[str], typer.Option("--tags", "-t", help="Model tags to build (can specify multiple)"), ] = [], + target_archs: Annotated[ + List[str], + typer.Option( + "--target-archs", + "-a", + help="Target GPU architectures to build for (e.g., gfx908,gfx90a,gfx942). If not specified, builds single image with MAD_SYSTEM_GPU_ARCHITECTURE from additional_context or detected GPU architecture." + ), + ] = [], registry: Annotated[ Optional[str], typer.Option("--registry", "-r", help="Docker registry to push images to"), @@ -658,6 +666,7 @@ def build( # Create arguments object args = create_args_namespace( tags=effective_tags, + target_archs=target_archs, registry=registry, additional_context=additional_context, additional_context_file=additional_context_file, diff --git a/src/madengine/tools/distributed_orchestrator.py b/src/madengine/tools/distributed_orchestrator.py index 2af532cb..ad13655a 100644 --- a/src/madengine/tools/distributed_orchestrator.py +++ b/src/madengine/tools/distributed_orchestrator.py @@ -181,6 +181,17 @@ def build_phase( else "" ) + # Get target architectures from args if provided + target_archs = getattr(self.args, "target_archs", []) + + # Handle comma-separated architectures in a single string + if target_archs: + processed_archs = [] + for arch_arg in target_archs: + # Split comma-separated values and add to list + 
processed_archs.extend([arch.strip() for arch in arch_arg.split(',') if arch.strip()]) + target_archs = processed_archs + # If batch_build_metadata is provided, use it to set per-model registry/registry_image build_summary = builder.build_all_models( models, @@ -189,6 +200,7 @@ def build_phase( registry, phase_suffix, batch_build_metadata=batch_build_metadata, + target_archs=target_archs, ) # Export build manifest with registry information diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 021f8e5e..12833482 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -10,6 +10,7 @@ import os import time import json +import re import typing from contextlib import redirect_stdout, redirect_stderr from rich.console import Console as RichConsole @@ -21,6 +22,15 @@ class DockerBuilder: """Class responsible for building Docker images for models.""" + # GPU architecture variables used in MAD/DLM Dockerfiles + GPU_ARCH_VARIABLES = [ + "MAD_SYSTEM_GPU_ARCHITECTURE", + "PYTORCH_ROCM_ARCH", + "GPU_TARGETS", + "GFX_COMPILATION_ARCH", + "GPU_ARCHS" + ] + def __init__( self, context: Context, console: Console = None, live_output: bool = False ): @@ -87,6 +97,8 @@ def build_image( credentials: typing.Dict = None, clean_cache: bool = False, phase_suffix: str = "", + additional_build_args: typing.Dict[str, str] = None, + override_image_name: str = None, ) -> typing.Dict: """Build a Docker image for the given model. @@ -96,18 +108,22 @@ def build_image( credentials: Optional credentials dictionary clean_cache: Whether to use --no-cache phase_suffix: Suffix for log file name (e.g., ".build" or "") + additional_build_args: Additional build arguments to pass to Docker + override_image_name: Override the generated image name Returns: dict: Build information including image name, build duration, etc. 
""" # Generate image name first - image_docker_name = ( - model_info["name"].replace("/", "_").lower() - + "_" - + os.path.basename(dockerfile).replace(".Dockerfile", "") - ) - - docker_image = "ci-" + image_docker_name + if override_image_name: + docker_image = override_image_name + else: + image_docker_name = ( + model_info["name"].replace("/", "_").lower() + + "_" + + os.path.basename(dockerfile).replace(".Dockerfile", "") + ) + docker_image = "ci-" + image_docker_name # Create log file for this build cur_docker_file_basename = os.path.basename(dockerfile).replace( @@ -143,6 +159,10 @@ def build_image( for key_cred, value_cred in credentials[model_info["cred"]].items(): run_build_arg[model_info["cred"] + "_" + key_cred.upper()] = value_cred + # Add additional build args if provided (for multi-architecture builds) + if additional_build_args: + run_build_arg.update(additional_build_args) + build_args = self.get_build_arg(run_build_arg) use_cache_str = "--no-cache" if clean_cache else "" @@ -444,8 +464,9 @@ def build_all_models( registry: str = None, phase_suffix: str = "", batch_build_metadata: typing.Optional[dict] = None, + target_archs: typing.List[str] = None, # New parameter ) -> typing.Dict: - """Build images for all models. + """Build images for all models, with optional multi-architecture support. 
Args: models: List of model information dictionaries @@ -453,11 +474,18 @@ def build_all_models( clean_cache: Whether to use --no-cache registry: Optional registry to push images to phase_suffix: Suffix for log file name (e.g., ".build" or "") + batch_build_metadata: Optional batch build metadata + target_archs: Optional list of target GPU architectures for multi-arch builds Returns: dict: Summary of all built images """ self.rich_console.print(f"[bold blue]Building Docker images for {len(models)} models...[/bold blue]") + + if target_archs: + self.rich_console.print(f"[bold cyan]Multi-architecture build mode enabled for: {', '.join(target_archs)}[/bold cyan]") + else: + self.rich_console.print(f"[bold cyan]Single architecture build mode[/bold cyan]") build_summary = { "successful_builds": [], @@ -466,180 +494,479 @@ def build_all_models( "successful_pushes": [], "failed_pushes": [], } - + for model_info in models: - try: - # If batch_build_metadata is provided, override registry and registry_image for this model - model_registry = registry - model_registry_image = None - if batch_build_metadata and model_info["name"] in batch_build_metadata: - meta = batch_build_metadata[model_info["name"]] - if meta.get("registry"): - model_registry = meta["registry"] - if meta.get("registry_image"): - model_registry_image = meta["registry_image"] - - # Find dockerfiles for this model - all_dockerfiles = self.console.sh( - f"ls {model_info['dockerfile']}.*" - ).split("\n") - - dockerfiles = {} - for cur_docker_file in all_dockerfiles: - # Get context of dockerfile - dockerfiles[cur_docker_file] = self.console.sh( - f"head -n5 {cur_docker_file} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is provided in additional_context + # This overrides --target-archs and uses default flow + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + self.rich_console.print(f"[yellow]Info: 
MAD_SYSTEM_GPU_ARCHITECTURE provided in additional_context, " + f"disabling --target-archs and using default flow for model {model_info['name']}[/yellow]") + # Use single architecture build mode regardless of target_archs + try: + single_build_info = self._build_model_single_arch( + model_info, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata ) - - # Filter dockerfiles based on context - dockerfiles = self.context.filter(dockerfiles) - - if not dockerfiles: - self.rich_console.print( - f"[yellow]No matching dockerfiles found for model {model_info['name']}[/yellow]" + build_summary["successful_builds"].extend(single_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in single_build_info ) - continue - - # Build each dockerfile - - for dockerfile in dockerfiles.keys(): + except Exception as e: + build_summary["failed_builds"].append({ + "model": model_info["name"], + "error": str(e) + }) + elif target_archs: + # Multi-architecture build mode with Dockerfile validation + for arch in target_archs: try: - build_info = self.build_image( - model_info, - dockerfile, - credentials, - clean_cache, - phase_suffix, - ) - - # Determine registry image name for push/tag - registry_image = None - if model_registry_image: - registry_image = model_registry_image - elif model_registry: - registry_image = self._determine_registry_image_name( - build_info["docker_image"], model_registry, credentials - ) - # Always use registry_image from batch_build_metadata if present - if ( - batch_build_metadata - and model_info["name"] in batch_build_metadata - ): - meta = batch_build_metadata[model_info["name"]] - if meta.get("registry_image"): - registry_image = meta["registry_image"] - if registry_image: - build_info["registry_image"] = registry_image - if build_info["docker_image"] in self.built_images: - self.built_images[build_info["docker_image"]][ - "registry_image" - ] = registry_image - - # Now attempt to push to 
registry if registry is set - if model_registry and registry_image: - explicit_registry_image = registry_image - try: - # Use registry_image from batch_build_metadata for push/tag if present - actual_registry_image = self.push_image( - build_info["docker_image"], - model_registry, - credentials, - explicit_registry_image, + # Check if model's Dockerfile has GPU variables + has_gpu_vars, dockerfile_path = self._check_dockerfile_has_gpu_variables(model_info) + + if has_gpu_vars: + # Validate target architecture against model's Dockerfile + if not self._validate_target_arch_against_dockerfile(model_info, arch): + raise ValueError( + f"Target GPU architecture '{arch}' does not match model '{model_info['name']}' " + f"Dockerfile GPU architecture requirements. Cannot build image." ) - if actual_registry_image != registry_image: - self.rich_console.print( - f"[yellow]Warning: Pushed image name {actual_registry_image} differs from intended {registry_image}[/yellow]" - ) - - # Track successful push - build_summary["successful_pushes"].append({ - "model": model_info["name"], - "dockerfile": dockerfile, - "local_image": build_info["docker_image"], - "registry_image": actual_registry_image, - "registry": model_registry - }) - - except Exception as push_error: - self.rich_console.print( - f"[red]Failed to push {build_info['docker_image']} to registry: {push_error}[/red]" - ) - build_info["push_failed"] = True - build_info["push_error"] = str(push_error) - if build_info["docker_image"] in self.built_images: - self.built_images[build_info["docker_image"]][ - "push_failed" - ] = True - self.built_images[build_info["docker_image"]][ - "push_error" - ] = str(push_error) - - # Track failed push - build_summary["failed_pushes"].append({ - "model": model_info["name"], - "dockerfile": dockerfile, - "local_image": build_info["docker_image"], - "intended_registry_image": registry_image, - "registry": model_registry, - "error": str(push_error) - }) - - 
build_summary["successful_builds"].append( - { - "model": model_info["name"], - "dockerfile": dockerfile, - "build_info": build_info, - } + # Build with architecture suffix + arch_build_info = self._build_model_for_arch( + model_info, arch, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + else: + # No GPU variables - run normal build using existing flow + self.rich_console.print(f"[yellow]Info: No GPU architecture variables found in {dockerfile_path}, " + f"using normal build flow without architecture suffix for model {model_info['name']}[/yellow]") + arch_build_info = self._build_model_single_arch( + model_info, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + + build_summary["successful_builds"].extend(arch_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in arch_build_info ) - - build_summary["total_build_time"] += build_info[ - "build_duration" - ] - except Exception as e: - self.rich_console.print( - f"[red]Failed to build {dockerfile} for model {model_info['name']}: {e}[/red]" - ) - build_summary["failed_builds"].append( - { - "model": model_info["name"], - "dockerfile": dockerfile, - "error": str(e), - } - ) + build_summary["failed_builds"].append({ + "model": model_info["name"], + "architecture": arch, + "error": str(e) + }) + else: + # Single architecture build mode (existing behavior - no validation needed) + try: + single_build_info = self._build_model_single_arch( + model_info, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + build_summary["successful_builds"].extend(single_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in single_build_info + ) + except Exception as e: + build_summary["failed_builds"].append({ + "model": model_info["name"], + "error": str(e) + }) + + return build_summary - except Exception as e: - self.rich_console.print(f"[red]Error processing model 
{model_info['name']}: {e}[/red]") - build_summary["failed_builds"].append( - {"model": model_info["name"], "error": str(e)} + def _check_dockerfile_has_gpu_variables(self, model_info: typing.Dict) -> typing.Tuple[bool, str]: + """ + Check if model's Dockerfile contains GPU architecture variables. + Returns (has_gpu_vars, dockerfile_path) + """ + try: + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + for dockerfile_path in dockerfiles: + with open(dockerfile_path, 'r') as f: + dockerfile_content = f.read() + + # Parse GPU architecture variables from Dockerfile + dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + + if dockerfile_gpu_vars: + return True, dockerfile_path + else: + return False, dockerfile_path + + # No dockerfiles found + return False, "No Dockerfile found" + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error checking GPU variables for model {model_info['name']}: {e}[/yellow]") + return False, "Error reading Dockerfile" + + def _get_dockerfiles_for_model(self, model_info: typing.Dict) -> typing.List[str]: + """Get dockerfiles for a model.""" + try: + all_dockerfiles = self.console.sh( + f"ls {model_info['dockerfile']}.*" + ).split("\n") + + dockerfiles = {} + for cur_docker_file in all_dockerfiles: + # Get context of dockerfile + dockerfiles[cur_docker_file] = self.console.sh( + f"head -n5 {cur_docker_file} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" ) - self.rich_console.print(f"\n[bold]Build Summary:[/bold]") - self.rich_console.print(f" [green]Successful builds: {len(build_summary['successful_builds'])}[/green]") - self.rich_console.print(f" [red]Failed builds: {len(build_summary['failed_builds'])}[/red]") - self.rich_console.print(f" [blue]Total build time: {build_summary['total_build_time']:.2f} seconds[/blue]") - - # Display push statistics if any pushes were attempted - total_pushes = len(build_summary['successful_pushes']) + 
len(build_summary['failed_pushes']) - if total_pushes > 0: - self.rich_console.print(f"\n[bold]Registry Push Summary:[/bold]") - self.rich_console.print(f" [green]Successful pushes: {len(build_summary['successful_pushes'])}[/green]") - self.rich_console.print(f" [red]Failed pushes: {len(build_summary['failed_pushes'])}[/red]") - - # Show successful pushes - if build_summary['successful_pushes']: - self.rich_console.print(f"\n[bold green]Successfully pushed images:[/bold green]") - for push in build_summary['successful_pushes']: - self.rich_console.print(f" [green]✅ {push['model']} -> {push['registry_image']}[/green]") - - # Show failed pushes with errors - if build_summary['failed_pushes']: - self.rich_console.print(f"\n[bold red]Failed to push images:[/bold red]") - for push in build_summary['failed_pushes']: - self.rich_console.print(f" [red]❌ {push['model']} -> {push['intended_registry_image']}[/red]") - self.rich_console.print(f" [dim red]Error: {push['error']}[/dim red]") + # Filter dockerfiles based on context + dockerfiles = self.context.filter(dockerfiles) + + return list(dockerfiles.keys()) + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error finding dockerfiles for model {model_info['name']}: {e}[/yellow]") + return [] - return build_summary + def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, target_arch: str) -> bool: + """ + Validate that target architecture is compatible with model's Dockerfile GPU variables. + Called during build phase when --target-archs is provided. 
+ """ + try: + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + for dockerfile_path in dockerfiles: + with open(dockerfile_path, 'r') as f: + dockerfile_content = f.read() + + # Parse GPU architecture variables from Dockerfile + dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + + if not dockerfile_gpu_vars: + # No GPU variables found - target arch is acceptable + self.rich_console.print(f"[cyan]Info: No GPU architecture variables found in {dockerfile_path}, " + f"target architecture '{target_arch}' is acceptable[/cyan]") + continue + + # Validate target architecture against each GPU variable + for var_name, var_values in dockerfile_gpu_vars.items(): + if not self._is_target_arch_compatible_with_variable( + var_name, var_values, target_arch + ): + self.rich_console.print(f"[red]Error: Target architecture '{target_arch}' is not compatible " + f"with {var_name}={var_values} in {dockerfile_path}[/red]") + return False + + self.rich_console.print(f"[cyan]Info: Target architecture '{target_arch}' validated successfully " + f"against {dockerfile_path}[/cyan]") + + return True + + except FileNotFoundError as e: + self.rich_console.print(f"[yellow]Warning: Dockerfile not found for model {model_info['name']}: {e}[/yellow]") + return True # Assume compatible if Dockerfile not found + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error validating target architecture for model {model_info['name']}: {e}[/yellow]") + return True # Assume compatible on parsing errors + + def _parse_dockerfile_gpu_variables(self, dockerfile_content: str) -> typing.Dict[str, typing.List[str]]: + """Parse GPU architecture variables from Dockerfile content.""" + gpu_variables = {} + + for var_name in self.GPU_ARCH_VARIABLES: + # Look for ARG declarations + arg_pattern = rf"ARG\s+{var_name}=([^\s\n]+)" + arg_matches = re.findall(arg_pattern, dockerfile_content, re.IGNORECASE) + + # Look for ENV 
declarations + env_pattern = rf"ENV\s+{var_name}[=\s]+([^\s\n]+)" + env_matches = re.findall(env_pattern, dockerfile_content, re.IGNORECASE) + + # Process found values + all_matches = arg_matches + env_matches + if all_matches: + # Take the last defined value (in case of multiple definitions) + raw_value = all_matches[-1].strip('"\'') + parsed_values = self._parse_gpu_variable_value(var_name, raw_value) + if parsed_values: + gpu_variables[var_name] = parsed_values + + return gpu_variables + + def _parse_gpu_variable_value(self, var_name: str, raw_value: str) -> typing.List[str]: + """Parse GPU variable value based on variable type and format.""" + architectures = [] + + # Handle different variable formats + if var_name in ["GPU_TARGETS", "GPU_ARCHS", "PYTORCH_ROCM_ARCH"]: + # These often contain multiple architectures separated by semicolons or commas + if ";" in raw_value: + architectures = [arch.strip() for arch in raw_value.split(";") if arch.strip()] + elif "," in raw_value: + architectures = [arch.strip() for arch in raw_value.split(",") if arch.strip()] + else: + architectures = [raw_value.strip()] + else: + # Single architecture value (MAD_SYSTEM_GPU_ARCHITECTURE, GFX_COMPILATION_ARCH) + architectures = [raw_value.strip()] + + # Normalize architecture names + normalized_archs = [] + for arch in architectures: + normalized = self._normalize_architecture_name(arch) + if normalized: + normalized_archs.append(normalized) + + return normalized_archs + + def _normalize_architecture_name(self, arch: str) -> str: + """Normalize architecture name to standard format.""" + arch = arch.lower().strip() + + # Handle common variations and aliases + if arch.startswith("gfx"): + return arch + elif arch in ["mi100", "mi-100"]: + return "gfx908" + elif arch in ["mi200", "mi-200", "mi210", "mi250"]: + return "gfx90a" + elif arch in ["mi300", "mi-300", "mi300a"]: + return "gfx940" + elif arch in ["mi300x", "mi-300x"]: + return "gfx942" + elif arch.startswith("mi"): + # Unknown 
MI series - return as is for potential future support + return arch + + return arch if arch else None + + def _is_target_arch_compatible_with_variable( + self, + var_name: str, + var_values: typing.List[str], + target_arch: str + ) -> bool: + """ + Validate that target architecture is compatible with a specific GPU variable. + Used during build phase validation. + """ + if var_name == "MAD_SYSTEM_GPU_ARCHITECTURE": + # MAD_SYSTEM_GPU_ARCHITECTURE will be overridden by target_arch, so always compatible + return True + + elif var_name in ["PYTORCH_ROCM_ARCH", "GPU_TARGETS", "GPU_ARCHS"]: + # Multi-architecture variables - target arch must be in the list + return target_arch in var_values + + elif var_name == "GFX_COMPILATION_ARCH": + # Compilation architecture should be compatible with target arch + return len(var_values) == 1 and ( + var_values[0] == target_arch or + self._is_compilation_arch_compatible(var_values[0], target_arch) + ) + + # Unknown variable - assume compatible + return True + + def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: + """Check if compilation architecture is compatible with target architecture.""" + # Define compatibility rules for compilation + compatibility_matrix = { + "gfx908": ["gfx908"], # MI100 - exact match only + "gfx90a": ["gfx90a"], # MI200 - exact match only + "gfx940": ["gfx940"], # MI300A - exact match only + "gfx941": ["gfx941"], # MI300X - exact match only + "gfx942": ["gfx942"], # MI300X - exact match only + } + + compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) + return target_arch in compatible_archs + + def _build_model_for_arch( + self, + model_info: typing.Dict, + gpu_arch: str, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model for specific GPU architecture with smart image naming.""" + + # Find dockerfiles + dockerfiles = 
self._get_dockerfiles_for_model(model_info) + + arch_results = [] + for dockerfile in dockerfiles: + # Smart image naming: add architecture suffix only if Dockerfile has GPU variables + has_gpu_vars, _ = self._check_dockerfile_has_gpu_variables(model_info) + + if has_gpu_vars: + # Create architecture-specific image name + base_image_name = self._create_base_image_name(model_info, dockerfile) + arch_image_name = f"{base_image_name}_{gpu_arch}" + else: + # Use existing docker image name (no suffix) + arch_image_name = self._create_base_image_name(model_info, dockerfile) + + # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build + arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} + + # Build the image + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix, + additional_build_args=arch_build_args, + override_image_name=arch_image_name + ) + + # Add architecture metadata + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push with architecture-specific tagging + if registry: + if has_gpu_vars: + registry_image = self._create_arch_registry_image_name( + arch_image_name, gpu_arch, registry, batch_build_metadata, model_info + ) + else: + registry_image = self._create_registry_image_name( + arch_image_name, registry, batch_build_metadata, model_info + ) + try: + self.push_image(arch_image_name, registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + arch_results.append(build_info) + + return arch_results + + def _build_model_single_arch( + self, + model_info: typing.Dict, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model using existing single architecture flow.""" + + # Use existing build logic - MAD_SYSTEM_GPU_ARCHITECTURE comes from additional_context + # or Dockerfile defaults 
+ dockerfiles = self._get_dockerfiles_for_model(model_info) + + results = [] + for dockerfile in dockerfiles: + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix + ) + + # Extract GPU architecture from build args or context for manifest + gpu_arch = self._get_effective_gpu_architecture(model_info, dockerfile) + if gpu_arch: + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push (existing logic) + if registry: + try: + registry_image = self._create_registry_image_name( + build_info["docker_image"], registry, batch_build_metadata, model_info + ) + self.push_image(build_info["docker_image"], registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + results.append(build_info) + + return results + + def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_path: str) -> str: + """Get effective GPU architecture for single arch builds.""" + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is in build args from additional_context + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + return self.context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Try to extract from Dockerfile defaults + try: + with open(dockerfile_path, 'r') as f: + content = f.read() + + # Look for ARG or ENV declarations + patterns = [ + r"ARG\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", + r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)" + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip('"\'') + except Exception: + pass + + return None + + def _create_base_image_name(self, model_info: typing.Dict, dockerfile: str) -> str: + """Create base image name from model info and dockerfile.""" + # Extract dockerfile context suffix (e.g., "ubuntu.amd" from 
"dummy.ubuntu.amd.Dockerfile") + dockerfile_name = os.path.basename(dockerfile) + if '.' in dockerfile_name: + # Remove the .Dockerfile extension and get context + context_parts = dockerfile_name.replace('.Dockerfile', '').split('.')[1:] # Skip model name + context_suffix = '.'.join(context_parts) if context_parts else 'default' + else: + context_suffix = 'default' + + # Create base image name: ci-{model}_{model}.{context} + return f"ci-{model_info['name']}_{model_info['name']}.{context_suffix}" + + def _create_registry_image_name( + self, + image_name: str, + registry: str, + batch_build_metadata: typing.Optional[dict], + model_info: typing.Dict + ) -> str: + """Create registry image name.""" + if batch_build_metadata and model_info["name"] in batch_build_metadata: + meta = batch_build_metadata[model_info["name"]] + if meta.get("registry_image"): + return meta["registry_image"] + + # Default registry naming + return self._determine_registry_image_name(image_name, registry) + + def _create_arch_registry_image_name( + self, + image_name: str, + gpu_arch: str, + registry: str, + batch_build_metadata: typing.Optional[dict], + model_info: typing.Dict + ) -> str: + """Create architecture-specific registry image name.""" + # For multi-arch builds, add architecture to the tag + if batch_build_metadata and model_info["name"] in batch_build_metadata: + meta = batch_build_metadata[model_info["name"]] + if meta.get("registry_image"): + # Append architecture to existing registry image + return f"{meta['registry_image']}_{gpu_arch}" + + # Default arch-specific registry naming + base_registry_name = self._determine_registry_image_name(image_name, registry) + return f"{base_registry_name}" # Architecture already in image_name def _determine_registry_image_name( self, docker_image: str, registry: str, credentials: typing.Dict = None @@ -685,3 +1012,158 @@ def _determine_registry_image_name( registry_image = f"{registry}/{docker_image}" return registry_image + + def 
_is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: + """Check if compilation architecture is compatible with target architecture.""" + # Define compatibility rules for compilation + compatibility_matrix = { + "gfx908": ["gfx908"], # MI100 - exact match only + "gfx90a": ["gfx90a"], # MI200 - exact match only + "gfx940": ["gfx940"], # MI300A - exact match only + "gfx941": ["gfx941"], # MI300X - exact match only + "gfx942": ["gfx942"], # MI300X - exact match only + } + + compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) + return target_arch in compatible_archs + + def _build_model_for_arch( + self, + model_info: typing.Dict, + gpu_arch: str, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model for specific GPU architecture with smart image naming.""" + + # Find dockerfiles + dockerfiles = self._get_dockerfiles_for_model(model_info) + + arch_results = [] + for dockerfile in dockerfiles: + # Smart image naming: add architecture suffix only if Dockerfile has GPU variables + has_gpu_vars, _ = self._check_dockerfile_has_gpu_variables(model_info) + + if has_gpu_vars: + # Create architecture-specific image name + base_image_name = self._create_base_image_name(model_info, dockerfile) + arch_image_name = f"{base_image_name}_{gpu_arch}" + else: + # Use existing docker image name (no suffix) + arch_image_name = self._create_base_image_name(model_info, dockerfile) + + # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build + arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} + + # Build the image + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix, + additional_build_args=arch_build_args, + override_image_name=arch_image_name + ) + + # Add architecture metadata + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push with 
architecture-specific tagging + if registry: + registry_image = self._determine_registry_image_name( + arch_image_name, registry, credentials + ) + try: + self.push_image(arch_image_name, registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + arch_results.append(build_info) + + return arch_results + + def _build_model_single_arch( + self, + model_info: typing.Dict, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model using existing single architecture flow.""" + + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + results = [] + for dockerfile in dockerfiles: + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix + ) + + # Extract GPU architecture from build args or context for manifest + gpu_arch = self._get_effective_gpu_architecture(model_info, dockerfile) + if gpu_arch: + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push (existing logic) + if registry: + registry_image = self._determine_registry_image_name( + build_info["docker_image"], registry, credentials + ) + try: + self.push_image(build_info["docker_image"], registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + results.append(build_info) + + return results + + def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_path: str) -> str: + """Get effective GPU architecture for single arch builds.""" + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is in build args from additional_context + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + return 
self.context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Try to extract from Dockerfile defaults + try: + with open(dockerfile_path, 'r') as f: + content = f.read() + + # Look for ARG or ENV declarations + patterns = [ + r"ARG\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", + r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)" + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip('"\'') + except Exception: + pass + + return None + + def _create_base_image_name(self, model_info: typing.Dict, dockerfile_path: str) -> str: + """Create base image name for a model.""" + # Use existing image naming logic from build_image method + # This is a simplified version - we may need to extract more from build_image + model_name = model_info["name"] + dockerfile_context = self.console.sh( + f"head -n5 {dockerfile_path} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" + ) + return f"ci-{model_name}_{dockerfile_context}" From 156bcfe7eb5a89b25c54efa2206c3eb1fdeb1a0a Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 8 Aug 2025 15:29:10 -0400 Subject: [PATCH 2/7] Implemented unit tests for the feature of multi-gpu arch --- tests/test_multi_gpu_arch.py | 148 +++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 tests/test_multi_gpu_arch.py diff --git a/tests/test_multi_gpu_arch.py b/tests/test_multi_gpu_arch.py new file mode 100644 index 00000000..0f3f9673 --- /dev/null +++ b/tests/test_multi_gpu_arch.py @@ -0,0 +1,148 @@ +"""Comprehensive unit tests for multi-GPU architecture support in MADEngine. + +Covers: +- Multi-arch DockerBuilder logic (image naming, manifest, legacy/override) +- Dockerfile GPU variable parsing/validation +- Target architecture normalization and compatibility +- Run-phase manifest filtering by gpu_architecture + +All tests are logic/unit tests and do not require GPU hardware. 
+""" +import pytest +from unittest.mock import MagicMock, patch +from madengine.tools.docker_builder import DockerBuilder +from madengine.tools.distributed_orchestrator import DistributedOrchestrator + +class TestMultiGPUArch: + def setup_method(self): + self.context = MagicMock() + self.console = MagicMock() + self.builder = DockerBuilder(self.context, self.console) + self.orchestrator = DistributedOrchestrator(MagicMock()) + + # --- DockerBuilder Multi-Arch Logic --- + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_build_image_naming(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + # GPU variable present + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["docker_image"].endswith("_gfx908") + # GPU variable absent + mock_check_gpu_vars.return_value = (False, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert not result[0]["docker_image"].endswith("_gfx908") + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + 
mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["gpu_architecture"] == "gfx908" + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "build_image") + def test_legacy_single_arch_build(self, mock_build_image, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_single_arch(model_info, None, False, None, "", None) + assert result[0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + @patch.object(DockerBuilder, "_build_model_single_arch") + def test_additional_context_overrides_target_archs(self, mock_single_arch): + self.context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"}} + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_single_arch.return_value = [{"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0}] + result = self.builder.build_all_models([model_info], target_archs=["gfx908", "gfx90a"]) + assert result["successful_builds"][0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + # --- Dockerfile GPU Variable Parsing/Validation --- + def test_parse_dockerfile_gpu_variables(self): + dockerfile_content = """ + ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx908 + ENV PYTORCH_ROCM_ARCH=gfx908;gfx90a + ARG GPU_TARGETS=gfx908,gfx942 + ENV GFX_COMPILATION_ARCH=gfx908 + ARG GPU_ARCHS=gfx908;gfx90a;gfx942 + """ + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert 
result["MAD_SYSTEM_GPU_ARCHITECTURE"] == ["gfx908"] + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + assert result["GPU_TARGETS"] == ["gfx908", "gfx942"] + assert result["GFX_COMPILATION_ARCH"] == ["gfx908"] + assert result["GPU_ARCHS"] == ["gfx908", "gfx90a", "gfx942"] + + def test_parse_dockerfile_gpu_variables_env_delimiter(self): + dockerfile_content = "ENV PYTORCH_ROCM_ARCH = gfx908,gfx90a" + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + + def test_parse_malformed_dockerfile(self): + dockerfile_content = "ENV BAD_LINE\nARG MAD_SYSTEM_GPU_ARCHITECTURE=\nENV PYTORCH_ROCM_ARCH=\n" + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert isinstance(result, dict) + + # --- Target Architecture Normalization/Compatibility --- + def test_normalize_architecture_name(self): + cases = { + "gfx908": "gfx908", + "GFX908": "gfx908", + "mi100": "gfx908", + "mi-100": "gfx908", + "mi200": "gfx90a", + "mi-200": "gfx90a", + "mi210": "gfx90a", + "mi250": "gfx90a", + "mi300": "gfx940", + "mi-300": "gfx940", + "mi300a": "gfx940", + "mi300x": "gfx942", + "mi-300x": "gfx942", + "unknown": "unknown", + "": None, + } + for inp, expected in cases.items(): + assert self.builder._normalize_architecture_name(inp) == expected + + def test_is_target_arch_compatible_with_variable(self): + assert self.builder._is_target_arch_compatible_with_variable("MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942") + assert self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942") + assert not self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942") + assert self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx908") + assert not self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx942") + assert 
self.builder._is_target_arch_compatible_with_variable("UNKNOWN_VAR", ["foo"], "bar") + + def test_is_compilation_arch_compatible(self): + assert self.builder._is_compilation_arch_compatible("gfx908", "gfx908") + assert not self.builder._is_compilation_arch_compatible("gfx908", "gfx942") + assert self.builder._is_compilation_arch_compatible("foo", "foo") + + # --- Run-Phase Manifest Filtering --- + def test_filter_images_by_gpu_architecture(self): + orch = self.orchestrator + orch.context = MagicMock() + orch.context.get_system_gpu_architecture.return_value = "gfx908" + # Exact match + built_images = {"img1": {"gpu_architecture": "gfx908"}, "img2": {"gpu_architecture": "gfx90a"}} + filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert "img1" in filtered and "img2" not in filtered + # Legacy image (no arch field) + built_images = {"img1": {}, "img2": {"gpu_architecture": "gfx90a"}} + filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert "img1" in filtered + # No match, error message includes available archs (simulate run_phase error) + built_images = {"img1": {"gpu_architecture": "gfx90a"}, "img2": {"gpu_architecture": "gfx942"}} + try: + orch._filter_images_by_gpu_architecture(built_images, "gfx908") + except Exception: + pass From 8457257435f346177334fb4c0ca3de8eb054ceda Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 8 Aug 2025 16:58:20 -0400 Subject: [PATCH 3/7] Debug and fix the unit test of multi gpu arch --- .../tools/distributed_orchestrator.py | 88 +++++++++++++++++++ tests/test_multi_gpu_arch.py | 36 +++++--- 2 files changed, 113 insertions(+), 11 deletions(-) diff --git a/src/madengine/tools/distributed_orchestrator.py b/src/madengine/tools/distributed_orchestrator.py index ad13655a..f3353273 100644 --- a/src/madengine/tools/distributed_orchestrator.py +++ b/src/madengine/tools/distributed_orchestrator.py @@ -401,6 +401,52 @@ def run_phase( print(f"Loaded manifest with 
{len(manifest['built_images'])} images") + # Filter images by GPU architecture compatibility + try: + runtime_gpu_arch = self.context.get_system_gpu_architecture() + print(f"Runtime GPU architecture detected: {runtime_gpu_arch}") + + # Filter manifest images by GPU architecture compatibility + compatible_images = self._filter_images_by_gpu_architecture( + manifest["built_images"], runtime_gpu_arch + ) + + if not compatible_images: + available_archs = list(set( + img.get('gpu_architecture', 'unknown') + for img in manifest['built_images'].values() + )) + available_archs = [arch for arch in available_archs if arch != 'unknown'] + + if available_archs: + error_msg = ( + f"No compatible Docker images found for runtime GPU architecture '{runtime_gpu_arch}'. " + f"Available image architectures: {available_archs}. " + f"Please build images for the target architecture using: " + f"--target-archs {runtime_gpu_arch}" + ) + else: + error_msg = ( + f"No compatible Docker images found for runtime GPU architecture '{runtime_gpu_arch}'. " + f"The manifest contains legacy images without architecture information. " + f"These will be treated as compatible for backward compatibility." 
+ ) + + raise RuntimeError(error_msg) + + # Update manifest to only include compatible images + manifest["built_images"] = compatible_images + print(f"Filtered to {len(compatible_images)} compatible images for GPU architecture '{runtime_gpu_arch}'") + + except Exception as e: + # If GPU architecture detection fails, proceed with all images for backward compatibility + self.rich_console.print( + f"[yellow]Warning: GPU architecture filtering failed: {e}[/yellow]" + ) + self.rich_console.print( + "[yellow]Proceeding with all available images (backward compatibility mode)[/yellow]" + ) + # Registry is now per-image; CLI registry is fallback if registry: print(f"Using registry from CLI: {registry}") @@ -801,6 +847,48 @@ def _copy_scripts(self) -> None: self.console.sh(f"cp -vLR --preserve=all {scripts_path} .") print(f"Scripts copied to {os.getcwd()}/scripts") + def _filter_images_by_gpu_architecture(self, built_images: typing.Dict, runtime_arch: str) -> typing.Dict: + """Filter built images by GPU architecture compatibility. 
+ + Args: + built_images: Dictionary of built images from manifest + runtime_arch: Runtime GPU architecture (e.g., 'gfx908') + + Returns: + dict: Filtered dictionary containing only compatible images + """ + compatible = {} + + self.rich_console.print(f"[cyan]Filtering images for runtime GPU architecture: {runtime_arch}[/cyan]") + + for image_name, image_info in built_images.items(): + image_arch = image_info.get("gpu_architecture") + + if not image_arch: + # Legacy images without architecture info - assume compatible for backward compatibility + self.rich_console.print( + f"[yellow] Warning: Image {image_name} has no architecture info, assuming compatible (legacy mode)[/yellow]" + ) + compatible[image_name] = image_info + elif image_arch == runtime_arch: + # Exact architecture match + self.rich_console.print( + f"[green] ✓ Compatible: {image_name} (architecture: {image_arch})[/green]" + ) + compatible[image_name] = image_info + else: + # Architecture mismatch + self.rich_console.print( + f"[red] ✗ Incompatible: {image_name} (architecture: {image_arch}, runtime: {runtime_arch})[/red]" + ) + + if not compatible: + self.rich_console.print(f"[red]No compatible images found for runtime architecture: {runtime_arch}[/red]") + else: + self.rich_console.print(f"[green]Found {len(compatible)} compatible image(s)[/green]") + + return compatible + def cleanup(self) -> None: """Cleanup the scripts/common directory.""" # check the directory exists diff --git a/tests/test_multi_gpu_arch.py b/tests/test_multi_gpu_arch.py index 0f3f9673..e46d8e10 100644 --- a/tests/test_multi_gpu_arch.py +++ b/tests/test_multi_gpu_arch.py @@ -18,7 +18,16 @@ def setup_method(self): self.context = MagicMock() self.console = MagicMock() self.builder = DockerBuilder(self.context, self.console) - self.orchestrator = DistributedOrchestrator(MagicMock()) + + # Mock args for DistributedOrchestrator to avoid file reading issues + mock_args = MagicMock() + mock_args.additional_context = None + 
mock_args.additional_context_file = None + mock_args.live_output = True + mock_args.data_config_file_name = "data.json" + + # Create orchestrator with mocked args and build_only_mode to avoid GPU detection + self.orchestrator = DistributedOrchestrator(mock_args, build_only_mode=True) # --- DockerBuilder Multi-Arch Logic --- @patch.object(DockerBuilder, "_get_dockerfiles_for_model") @@ -130,19 +139,24 @@ def test_is_compilation_arch_compatible(self): # --- Run-Phase Manifest Filtering --- def test_filter_images_by_gpu_architecture(self): orch = self.orchestrator - orch.context = MagicMock() - orch.context.get_system_gpu_architecture.return_value = "gfx908" - # Exact match + + # Test exact match built_images = {"img1": {"gpu_architecture": "gfx908"}, "img2": {"gpu_architecture": "gfx90a"}} filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") assert "img1" in filtered and "img2" not in filtered - # Legacy image (no arch field) + + # Test legacy image (no arch field) built_images = {"img1": {}, "img2": {"gpu_architecture": "gfx90a"}} filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") - assert "img1" in filtered - # No match, error message includes available archs (simulate run_phase error) + assert "img1" in filtered # Legacy images should be included for backward compatibility + assert "img2" not in filtered + + # Test no match case built_images = {"img1": {"gpu_architecture": "gfx90a"}, "img2": {"gpu_architecture": "gfx942"}} - try: - orch._filter_images_by_gpu_architecture(built_images, "gfx908") - except Exception: - pass + filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert len(filtered) == 0 + + # Test all matching case + built_images = {"img1": {"gpu_architecture": "gfx908"}, "img2": {"gpu_architecture": "gfx908"}} + filtered = orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert len(filtered) == 2 From 3a0b4c75acab09ab514df0f81b533b418504f014 Mon Sep 17 00:00:00 
2001
From: Stephen Shao
Date: Fri, 8 Aug 2025 17:26:26 -0400
Subject: [PATCH 4/7] Fix the display issue in the results table

---
 src/madengine/mad_cli.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py
index 705db264..6278c505 100644
--- a/src/madengine/mad_cli.py
+++ b/src/madengine/mad_cli.py
@@ -477,11 +477,32 @@ def get_display_names(items, limit=5):
         display_items = []
         for item in items[:limit]:
             if isinstance(item, dict):
-                # For dictionary items (run results), use model name or name field
-                name = item.get("model", item.get("name", str(item)[:20]))
-                display_items.append(name)
+                # For build results, prioritize docker_image extraction for model name
+                if "docker_image" in item:
+                    # Extract model name from docker image name
+                    # e.g., "ci-dummy_dummy.ubuntu.amd" -> "dummy"
+                    # e.g., "ci-dummy_dummy.ubuntu.amd_gfx908" -> "dummy"
+                    docker_image = item["docker_image"]
+                    if docker_image.startswith("ci-"):
+                        # Remove ci- prefix and extract model name
+                        parts = docker_image[3:].split("_")
+                        if len(parts) >= 2:
+                            model_name = parts[0]  # First part is the model name
+                        else:
+                            model_name = parts[0] if parts else docker_image
+                    else:
+                        model_name = docker_image
+                    display_items.append(model_name)
+                # For run results, use model name or name field
+                elif "model" in item:
+                    display_items.append(item["model"])
+                elif "name" in item:
+                    display_items.append(item["name"])
+                else:
+                    # Fallback to truncated string representation
+                    display_items.append(str(item)[:20])
             else:
-                # For string items (build results), use as-is
+                # For string items, use as-is
                 display_items.append(str(item))
 
         result = ", ".join(display_items)

From 682bec2ee7f57f5fe4ba0815b256562cc2ceee5b Mon Sep 17 00:00:00 2001
From: Stephen Shao
Date: Fri, 8 Aug 2025 19:48:50 -0400
Subject: [PATCH 5/7] Enhanced the results table and improved the handling of the GPU arch suffix in the Docker image name

---
 src/madengine/mad_cli.py 
| 42 ++++++++++++-- src/madengine/tools/docker_builder.py | 79 ++------------------------- 2 files changed, 41 insertions(+), 80 deletions(-) diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index 6278c505..42a446d8 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -459,12 +459,16 @@ def _process_batch_manifest_entries( ) -def display_results_table(summary: Dict, title: str) -> None: +def display_results_table(summary: Dict, title: str, show_gpu_arch: bool = False) -> None: """Display results in a formatted table.""" table = Table(title=title, show_header=True, header_style="bold magenta") table.add_column("Status", style="bold") table.add_column("Count", justify="right") table.add_column("Items", style="dim") + + # Add GPU Architecture column if multi-arch build was used + if show_gpu_arch: + table.add_column("GPU Architecture", style="cyan") successful = summary.get("successful_builds", summary.get("successful_runs", [])) failed = summary.get("failed_builds", summary.get("failed_runs", [])) @@ -510,14 +514,40 @@ def get_display_names(items, limit=5): result += "..." return result + # Helper function to extract GPU architectures from items + def get_gpu_architectures(items, limit=5): + if not items: + return "" + + gpu_archs = [] + for item in items[:limit]: + if isinstance(item, dict) and "gpu_architecture" in item: + gpu_archs.append(item["gpu_architecture"]) + else: + gpu_archs.append("N/A") + + result = ", ".join(gpu_archs) + if len(items) > limit: + result += "..." 
+ return result + if successful: - table.add_row("✅ Success", str(len(successful)), get_display_names(successful)) + if show_gpu_arch: + table.add_row("✅ Success", str(len(successful)), get_display_names(successful), get_gpu_architectures(successful)) + else: + table.add_row("✅ Success", str(len(successful)), get_display_names(successful)) if failed: - table.add_row("❌ Failed", str(len(failed)), get_display_names(failed)) + if show_gpu_arch: + table.add_row("❌ Failed", str(len(failed)), get_display_names(failed), get_gpu_architectures(failed)) + else: + table.add_row("❌ Failed", str(len(failed)), get_display_names(failed)) if not successful and not failed: - table.add_row("ℹ️ No items", "0", "") + if show_gpu_arch: + table.add_row("ℹ️ No items", "0", "", "") + else: + table.add_row("ℹ️ No items", "0", "") console.print(table) @@ -746,7 +776,9 @@ def build( ) # Display results - display_results_table(build_summary, "Build Results") + # Check if target_archs was used to show GPU architecture column + show_gpu_arch = bool(target_archs) + display_results_table(build_summary, "Build Results", show_gpu_arch) # Save summary save_summary_with_feedback(build_summary, summary_output, "Build") diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 12833482..198d2fda 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -781,71 +781,6 @@ def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) - compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) return target_arch in compatible_archs - def _build_model_for_arch( - self, - model_info: typing.Dict, - gpu_arch: str, - credentials: typing.Dict, - clean_cache: bool, - registry: str, - phase_suffix: str, - batch_build_metadata: typing.Optional[dict] - ) -> typing.List[typing.Dict]: - """Build model for specific GPU architecture with smart image naming.""" - - # Find dockerfiles - dockerfiles = 
self._get_dockerfiles_for_model(model_info) - - arch_results = [] - for dockerfile in dockerfiles: - # Smart image naming: add architecture suffix only if Dockerfile has GPU variables - has_gpu_vars, _ = self._check_dockerfile_has_gpu_variables(model_info) - - if has_gpu_vars: - # Create architecture-specific image name - base_image_name = self._create_base_image_name(model_info, dockerfile) - arch_image_name = f"{base_image_name}_{gpu_arch}" - else: - # Use existing docker image name (no suffix) - arch_image_name = self._create_base_image_name(model_info, dockerfile) - - # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build - arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} - - # Build the image - build_info = self.build_image( - model_info, - dockerfile, - credentials, - clean_cache, - phase_suffix, - additional_build_args=arch_build_args, - override_image_name=arch_image_name - ) - - # Add architecture metadata - build_info["gpu_architecture"] = gpu_arch - - # Handle registry push with architecture-specific tagging - if registry: - if has_gpu_vars: - registry_image = self._create_arch_registry_image_name( - arch_image_name, gpu_arch, registry, batch_build_metadata, model_info - ) - else: - registry_image = self._create_registry_image_name( - arch_image_name, registry, batch_build_metadata, model_info - ) - try: - self.push_image(arch_image_name, registry, credentials, registry_image) - build_info["registry_image"] = registry_image - except Exception as e: - build_info["push_error"] = str(e) - - arch_results.append(build_info) - - return arch_results - def _build_model_single_arch( self, model_info: typing.Dict, @@ -1044,16 +979,10 @@ def _build_model_for_arch( arch_results = [] for dockerfile in dockerfiles: - # Smart image naming: add architecture suffix only if Dockerfile has GPU variables - has_gpu_vars, _ = self._check_dockerfile_has_gpu_variables(model_info) - - if has_gpu_vars: - # Create architecture-specific image name - base_image_name = 
self._create_base_image_name(model_info, dockerfile) - arch_image_name = f"{base_image_name}_{gpu_arch}" - else: - # Use existing docker image name (no suffix) - arch_image_name = self._create_base_image_name(model_info, dockerfile) + # When using --target-archs, always add architecture suffix regardless of GPU variables + # This ensures consistent naming for multi-architecture builds + base_image_name = self._create_base_image_name(model_info, dockerfile) + arch_image_name = f"{base_image_name}_{gpu_arch}" # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} From 89784ca11eb1e639cb13afcb39b669a3cf6bca4a Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 8 Aug 2025 20:58:54 -0400 Subject: [PATCH 6/7] Creates architecture-specific images with proper naming and metadata, regardless of the underlying Dockerfile configuration. --- src/madengine/mad_cli.py | 130 ++++++++++++-------------- src/madengine/tools/docker_builder.py | 30 ++---- 2 files changed, 66 insertions(+), 94 deletions(-) diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index 42a446d8..93756380 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -460,94 +460,84 @@ def _process_batch_manifest_entries( def display_results_table(summary: Dict, title: str, show_gpu_arch: bool = False) -> None: - """Display results in a formatted table.""" + """Display results in a formatted table with each model as a separate row.""" table = Table(title=title, show_header=True, header_style="bold magenta") + table.add_column("Index", justify="right", style="dim") table.add_column("Status", style="bold") - table.add_column("Count", justify="right") - table.add_column("Items", style="dim") + table.add_column("Model", style="cyan") # Add GPU Architecture column if multi-arch build was used if show_gpu_arch: - table.add_column("GPU Architecture", style="cyan") + table.add_column("GPU Architecture", style="yellow") successful = 
summary.get("successful_builds", summary.get("successful_runs", [])) failed = summary.get("failed_builds", summary.get("failed_runs", [])) - # Helper function to extract display names from items - def get_display_names(items, limit=5): - if not items: - return "" - - display_items = [] - for item in items[:limit]: - if isinstance(item, dict): - # For build results, prioritize docker_image extraction for model name - if "docker_image" in item: - # Extract model name from docker image name - # e.g., "ci-dummy_dummy.ubuntu.amd" -> "dummy" - # e.g., "ci-dummy_dummy.ubuntu.amd_gfx908" -> "dummy" - docker_image = item["docker_image"] - if docker_image.startswith("ci-"): - # Remove ci- prefix and extract model name - parts = docker_image[3:].split("_") - if len(parts) >= 2: - model_name = parts[0] # First part is the model name - else: - model_name = parts[0] if parts else docker_image + # Helper function to extract model name from build result + def extract_model_name(item): + if isinstance(item, dict): + # For build results, prioritize docker_image extraction for model name + if "docker_image" in item: + # Extract model name from docker image name + # e.g., "ci-dummy_dummy.ubuntu.amd" -> "dummy" + # e.g., "ci-dummy_dummy.ubuntu.amd_gfx908" -> "dummy" + docker_image = item["docker_image"] + if docker_image.startswith("ci-"): + # Remove ci- prefix and extract model name + parts = docker_image[3:].split("_") + if len(parts) >= 2: + model_name = parts[0] # First part is the model name else: - model_name = docker_image - display_items.append(model_name) - # For run results, use model name or name field - elif "model" in item: - display_items.append(item["model"]) - elif "name" in item: - display_items.append(item["name"]) + model_name = parts[0] if parts else docker_image else: - # Fallback to truncated string representation - display_items.append(str(item)[:20]) - else: - # For string items, use as-is - display_items.append(str(item)) - - result = ", ".join(display_items) - 
if len(items) > limit: - result += "..." - return result - - # Helper function to extract GPU architectures from items - def get_gpu_architectures(items, limit=5): - if not items: - return "" - - gpu_archs = [] - for item in items[:limit]: - if isinstance(item, dict) and "gpu_architecture" in item: - gpu_archs.append(item["gpu_architecture"]) - else: - gpu_archs.append("N/A") - - result = ", ".join(gpu_archs) - if len(items) > limit: - result += "..." - return result - - if successful: + model_name = docker_image + return model_name + # For run results, use model name or name field + elif "model" in item: + return item["model"] + elif "name" in item: + return item["name"] + return str(item)[:20] # Fallback + + # Helper function to extract GPU architecture + def extract_gpu_arch(item): + if isinstance(item, dict) and "gpu_architecture" in item: + return item["gpu_architecture"] + return "N/A" + + # Add successful builds/runs + row_index = 1 + for item in successful: + model_name = extract_model_name(item) if show_gpu_arch: - table.add_row("✅ Success", str(len(successful)), get_display_names(successful), get_gpu_architectures(successful)) + gpu_arch = extract_gpu_arch(item) + table.add_row(str(row_index), "✅ Success", model_name, gpu_arch) else: - table.add_row("✅ Success", str(len(successful)), get_display_names(successful)) - - if failed: - if show_gpu_arch: - table.add_row("❌ Failed", str(len(failed)), get_display_names(failed), get_gpu_architectures(failed)) + table.add_row(str(row_index), "✅ Success", model_name) + row_index += 1 + + # Add failed builds/runs + for item in failed: + if isinstance(item, dict): + model_name = item.get("model", "Unknown") + if show_gpu_arch: + gpu_arch = item.get("architecture", "N/A") + table.add_row(str(row_index), "❌ Failed", model_name, gpu_arch) + else: + table.add_row(str(row_index), "❌ Failed", model_name) else: - table.add_row("❌ Failed", str(len(failed)), get_display_names(failed)) + if show_gpu_arch: + 
table.add_row(str(row_index), "❌ Failed", str(item), "N/A") + else: + table.add_row(str(row_index), "❌ Failed", str(item)) + row_index += 1 + # Show empty state if no results if not successful and not failed: if show_gpu_arch: - table.add_row("ℹ️ No items", "0", "", "") + table.add_row("1", "ℹ️ No items", "", "") else: - table.add_row("ℹ️ No items", "0", "") + table.add_row("1", "ℹ️ No items", "") console.print(table) diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 198d2fda..7eaee5a0 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -518,32 +518,14 @@ def build_all_models( "error": str(e) }) elif target_archs: - # Multi-architecture build mode with Dockerfile validation + # Multi-architecture build mode - always use architecture suffix for arch in target_archs: try: - # Check if model's Dockerfile has GPU variables - has_gpu_vars, dockerfile_path = self._check_dockerfile_has_gpu_variables(model_info) - - if has_gpu_vars: - # Validate target architecture against model's Dockerfile - if not self._validate_target_arch_against_dockerfile(model_info, arch): - raise ValueError( - f"Target GPU architecture '{arch}' does not match model '{model_info['name']}' " - f"Dockerfile GPU architecture requirements. Cannot build image." 
- ) - # Build with architecture suffix - arch_build_info = self._build_model_for_arch( - model_info, arch, credentials, clean_cache, - registry, phase_suffix, batch_build_metadata - ) - else: - # No GPU variables - run normal build using existing flow - self.rich_console.print(f"[yellow]Info: No GPU architecture variables found in {dockerfile_path}, " - f"using normal build flow without architecture suffix for model {model_info['name']}[/yellow]") - arch_build_info = self._build_model_single_arch( - model_info, credentials, clean_cache, - registry, phase_suffix, batch_build_metadata - ) + # Always build with architecture suffix when --target-archs is used + arch_build_info = self._build_model_for_arch( + model_info, arch, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) build_summary["successful_builds"].extend(arch_build_info) build_summary["total_build_time"] += sum( From 23bbf573e7d7e2d4dd096a04743e54341e07db00 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 8 Aug 2025 21:05:01 -0400 Subject: [PATCH 7/7] Fixed the syntax error --- src/madengine/tools/docker_builder.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 7eaee5a0..fd6b0c29 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -1068,13 +1068,3 @@ def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_pa pass return None - - def _create_base_image_name(self, model_info: typing.Dict, dockerfile_path: str) -> str: - """Create base image name for a model.""" - # Use existing image naming logic from build_image method - # This is a simplified version - we may need to extract more from build_image - model_name = model_info["name"] - dockerfile_context = self.console.sh( - f"head -n5 {dockerfile_path} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" - ) - return f"ci-{model_name}_{dockerfile_context}"