Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/madengine/mad_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,9 +736,14 @@ def build(
except typer.Exit:
raise
except Exception as e:
from madengine.core.errors import handle_error
from madengine.core.errors import handle_error, create_error_context

handle_error(e, context={"operation": "build", "phase": "build"})
context = create_error_context(
operation="build",
phase="build",
component="build_command"
)
handle_error(e, context=context)
raise typer.Exit(ExitCode.FAILURE)


Expand Down
60 changes: 30 additions & 30 deletions src/madengine/tools/container_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import typing
import warnings
import re
from rich.console import Console as RichConsole
from contextlib import redirect_stdout, redirect_stderr
from madengine.core.console import Console
from madengine.core.context import Context
Expand Down Expand Up @@ -45,6 +46,7 @@ def __init__(
self.data = data
self.console = console or Console(live_output=live_output)
self.live_output = live_output
self.rich_console = RichConsole()
self.credentials = None
self.perf_csv_path = "perf.csv" # Default output path

Expand Down Expand Up @@ -150,7 +152,7 @@ def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> N
credentials: Optional credentials dictionary containing username/password
"""
if not credentials:
print("No credentials provided for registry login")
self.rich_console.print("[yellow]No credentials provided for registry login[/yellow]")
return

# Check if registry credentials are available
Expand Down Expand Up @@ -207,9 +209,9 @@ def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> N

try:
self.console.sh(login_command, secret=True)
print(f"Successfully logged in to registry: {registry or 'DockerHub'}")
self.rich_console.print(f"[green]✅ Successfully logged in to registry: {registry or 'DockerHub'}[/green]")
except Exception as e:
print(f"Failed to login to registry {registry}: {e}")
self.rich_console.print(f"[red]❌ Failed to login to registry {registry}: {e}[/red]")
# Don't raise exception here, as public images might still be pullable

def pull_image(
Expand All @@ -234,7 +236,7 @@ def pull_image(
if registry and credentials:
self.login_to_registry(registry, credentials)

print(f"\n📥 Starting docker pull from registry...")
self.rich_console.print(f"\n[bold blue]📥 Starting docker pull from registry...[/bold blue]")
print(f"📍 Registry: {registry or 'Default'}")
print(f"🏷️ Image: {registry_image}")
try:
Expand All @@ -243,16 +245,16 @@ def pull_image(
if local_name:
self.console.sh(f"docker tag {registry_image} {local_name}")
print(f"🏷️ Tagged as: {local_name}")
print(f"✅ Successfully pulled and tagged image")
print(f"{'='*80}")
self.rich_console.print(f"[bold green]✅ Successfully pulled and tagged image[/bold green]")
self.rich_console.print(f"[dim]{'='*80}[/dim]")
return local_name

print(f"✅ Successfully pulled image: {registry_image}")
print(f"{'='*80}")
self.rich_console.print(f"[bold green]✅ Successfully pulled image:[/bold green] [cyan]{registry_image}[/cyan]")
self.rich_console.print(f"[dim]{'='*80}[/dim]")
return registry_image

except Exception as e:
print(f"Failed to pull image {registry_image}: {e}")
self.rich_console.print(f"[red]❌ Failed to pull image {registry_image}: {e}[/red]")
raise

def get_gpu_arg(self, requested_gpus: str) -> str:
Expand Down Expand Up @@ -503,7 +505,7 @@ def run_container(
Returns:
dict: Execution results including performance metrics
"""
print(f"Running model {model_info['name']} in container {docker_image}")
self.rich_console.print(f"[bold green]🏃 Running model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan] [dim]in container[/dim] [yellow]{docker_image}[/yellow]")

# Create log file for this run
# Extract dockerfile part from docker image name (remove "ci-" prefix and model name prefix)
Expand Down Expand Up @@ -639,12 +641,12 @@ def run_container(
# set timeout
print(f"⏰ Setting timeout to {str(timeout)} seconds.")

print(f"\n🏃 Starting Docker container execution...")
self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]")
print(f"🏷️ Image: {docker_image}")
print(f"📦 Container: {container_name}")
print(f"📝 Log file: {log_file_path}")
print(f"🎮 GPU Vendor: {gpu_vendor}")
print(f"{'='*80}")
self.rich_console.print(f"[dim]{'='*80}[/dim]")

# Run the container with logging
try:
Expand All @@ -668,12 +670,10 @@ def run_container(
# Show GPU info
if gpu_vendor.find("AMD") != -1:
print(f"🎮 Checking AMD GPU status...")
smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true")
print(smi)
model_docker.sh("/opt/rocm/bin/rocm-smi || true")
elif gpu_vendor.find("NVIDIA") != -1:
print(f"🎮 Checking NVIDIA GPU status...")
smi = model_docker.sh("/usr/bin/nvidia-smi || true")
print(smi)
model_docker.sh("/usr/bin/nvidia-smi || true")

# Prepare model directory
model_dir = "run_directory"
Expand Down Expand Up @@ -785,7 +785,7 @@ def run_container(

# Run the model
test_start_time = time.time()
print("Running model...")
self.rich_console.print("[bold blue]Running model...[/bold blue]")

model_args = self.context.ctx.get(
"model_args", model_info["args"]
Expand Down Expand Up @@ -828,8 +828,8 @@ def run_container(
)
break
except Exception as e:
print(
f"Warning: Could not validate multiple results file: {e}"
self.rich_console.print(
f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]"
)
run_results["performance"] = None
else:
Expand Down Expand Up @@ -909,20 +909,20 @@ def run_container(

if has_errors:
run_results["status"] = "FAILURE"
print(
f"Status: FAILURE (error patterns detected in logs)"
self.rich_console.print(
f"[red]Status: FAILURE (error patterns detected in logs)[/red]"
)
elif has_performance:
run_results["status"] = "SUCCESS"
print(
f"Status: SUCCESS (performance metrics found, no errors)"
self.rich_console.print(
f"[green]Status: SUCCESS (performance metrics found, no errors)[/green]"
)
else:
run_results["status"] = "FAILURE"
print(f"Status: FAILURE (no performance metrics)")
self.rich_console.print(f"[red]Status: FAILURE (no performance metrics)[/red]")

except Exception as e:
print(f"Warning: Error in status determination: {e}")
self.rich_console.print(f"[yellow]Warning: Error in status determination: {e}[/yellow]")
# Fallback to simple performance check
run_results["status"] = (
"SUCCESS"
Expand Down Expand Up @@ -988,7 +988,7 @@ def run_container(
)

except Exception as e:
print(f"Warning: Could not update perf.csv: {e}")
self.rich_console.print(f"[yellow]Warning: Could not update perf.csv: {e}[/yellow]")

# Cleanup if not keeping alive
if not keep_alive:
Expand All @@ -1003,12 +1003,12 @@ def run_container(
del model_docker

except Exception as e:
print("===== EXCEPTION =====")
print("Exception: ", e)
self.rich_console.print("[bold red]===== EXCEPTION =====[/bold red]")
self.rich_console.print(f"[red]Exception: {e}[/red]")
import traceback

traceback.print_exc()
print("=============== =====")
self.rich_console.print("[bold red]=============== =====[/bold red]")
run_results["status"] = "FAILURE"

# Also update perf.csv for failures
Expand All @@ -1033,7 +1033,7 @@ def run_container(
)

except Exception as csv_e:
print(f"Warning: Could not update perf.csv with exception: {csv_e}")
self.rich_console.print(f"[yellow]Warning: Could not update perf.csv with exception: {csv_e}[/yellow]")

return run_results

Expand Down
24 changes: 15 additions & 9 deletions src/madengine/tools/discover_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import importlib.util
import typing
from dataclasses import dataclass, field, asdict
from rich.console import Console as RichConsole


@dataclass
Expand Down Expand Up @@ -53,6 +54,7 @@ def __init__(self, args: argparse.Namespace):
args (argparse.Namespace): Arguments passed to the script.
"""
self.args = args
self.rich_console = RichConsole()
# list of models from models.json and scripts/model_dir/models.json
self.models: typing.List[dict] = []
# list of custom models from scripts/model_dir/get_models_json.py
Expand All @@ -77,13 +79,13 @@ def _setup_model_dir_if_needed(self) -> None:
import subprocess

cwd_path = os.getcwd()
print(f"MODEL_DIR environment variable detected: {model_dir_env}")
self.rich_console.print(f"[bold cyan]📁 MODEL_DIR environment variable detected:[/bold cyan] [yellow]{model_dir_env}[/yellow]")
print(f"Copying contents to current working directory: {cwd_path}")

try:
# Check if source directory exists
if not os.path.exists(model_dir_env):
print(f"Warning: MODEL_DIR path does not exist: {model_dir_env}")
self.rich_console.print(f"[yellow]⚠️ Warning: MODEL_DIR path does not exist: {model_dir_env}[/yellow]")
return

# Use cp command similar to the original implementation
Expand All @@ -92,20 +94,20 @@ def _setup_model_dir_if_needed(self) -> None:
result = subprocess.run(
cmd, shell=True, capture_output=True, text=True, check=True
)
print(f"Successfully copied MODEL_DIR contents")
self.rich_console.print(f"[green]✅ Successfully copied MODEL_DIR contents[/green]")
# Only show verbose output if there are not too many files
if result.stdout and len(result.stdout.splitlines()) < 20:
print(result.stdout)
elif result.stdout:
print(f"Copied {len(result.stdout.splitlines())} files/directories")
print(f"Model dir: {model_dir_env} → current dir: {cwd_path}")
except subprocess.CalledProcessError as e:
print(f"Warning: Failed to copy MODEL_DIR contents: {e}")
self.rich_console.print(f"[yellow]⚠️ Warning: Failed to copy MODEL_DIR contents: {e}[/yellow]")
if e.stderr:
print(f"Error details: {e.stderr}")
# Continue execution even if copy fails
except Exception as e:
print(f"Warning: Unexpected error copying MODEL_DIR: {e}")
self.rich_console.print(f"[yellow]⚠️ Warning: Unexpected error copying MODEL_DIR: {e}[/yellow]")
# Continue execution even if copy fails

def discover_models(self) -> None:
Expand All @@ -125,6 +127,7 @@ def discover_models(self) -> None:
self.models = model_dict_list
self.model_list = [model_dict["name"] for model_dict in model_dict_list]
else:
self.rich_console.print("[red]❌ models.json file not found.[/red]")
raise FileNotFoundError("models.json file not found.")

# walk through the subdirs in model_dir/scripts directory to find the models.json file
Expand All @@ -134,6 +137,7 @@ def discover_models(self) -> None:
files = os.listdir(root)

if "models.json" in files and "get_models_json.py" in files:
self.rich_console.print(f"[red]❌ Both models.json and get_models_json.py found in {root}.[/red]")
raise ValueError(
f"Both models.json and get_models_json.py found in {root}."
)
Expand Down Expand Up @@ -179,8 +183,8 @@ def discover_models(self) -> None:
self.custom_models.append(custom_model)
self.model_list.append(custom_model.name)
except AssertionError:
print(
"See madengine/tests/fixtures/dummy/scripts/dummy3/get_models_json.py for an example."
self.rich_console.print(
"[yellow]💡 See madengine/tests/fixtures/dummy/scripts/dummy3/get_models_json.py for an example.[/yellow]"
)
raise

Expand Down Expand Up @@ -240,6 +244,7 @@ def select_models(self) -> None:
tag_models.append(model_dict)

if not tag_models:
self.rich_console.print(f"[red]❌ No models found corresponding to the given tag: {tag}[/red]")
raise ValueError(
f"No models found corresponding to the given tag: {tag}"
)
Expand All @@ -249,12 +254,13 @@ def select_models(self) -> None:
def print_models(self) -> None:
if self.selected_models:
# print selected models using parsed tags and adding backslash-separated extra args
self.rich_console.print(f"[bold green]📋 Selected Models ({len(self.selected_models)} models):[/bold green]")
print(json.dumps(self.selected_models, indent=4))
else:
# print list of all model names
print(f"Number of models in total: {len(self.model_list)}")
self.rich_console.print(f"[bold cyan]📊 Available Models ({len(self.model_list)} total):[/bold cyan]")
for model_name in self.model_list:
print(f"{model_name}")
print(f" {model_name}")

def run(self, live_output: bool = True):

Expand Down
Loading