Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions src/madengine/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,12 @@ def extract_model_name(item):

# Helper function to format numbers
def format_number(value):
    """Format a numeric value for table display with thousands separators.

    Args:
        value: Raw cell value — may be a number, a numeric string, an empty
            placeholder (None, "", "-"), or arbitrary non-numeric text.

    Returns:
        str: "-" for missing/empty placeholders, a comma-grouped integer
        string (e.g. "1,234") when the value parses as a float, otherwise
        the value stringified as-is (or "-" if it is falsy, so empty
        containers also render as the missing-value marker).
    """
    # Normalize all "missing" placeholders to a single display marker.
    if value is None or value == "-" or value == "":
        return "-"
    try:
        # Round to whole number and group thousands: 2500 -> "2,500".
        return f"{float(value):,.0f}"
    except (ValueError, TypeError):
        # Non-numeric text is shown verbatim; falsy non-numeric values
        # (e.g. empty list) fall back to the missing-value marker.
        return str(value) if value else "-"

# Add successful builds/runs
row_index = 1
Expand Down Expand Up @@ -424,7 +424,12 @@ def format_performance(perf):
is_current_run = (session_start_row is not None and idx >= session_start_row)
run_marker = "[bold green]➤[/]" if is_current_run else "" # Arrow marker for current runs

model = str(row.get("model", "Unknown"))
model_val = row.get("model", "Unknown")
model = (
"Unknown"
if (pd.isna(model_val) or model_val == "" or str(model_val).strip() == "nan")
else str(model_val)
)
dataname = str(row.get("dataname", "")) if not pd.isna(row.get("dataname")) and row.get("dataname") != "" else "N/A"
data_provider_type = str(row.get("data_provider_type", "")) if not pd.isna(row.get("data_provider_type")) and row.get("data_provider_type") != "" else "N/A"

Expand Down
62 changes: 39 additions & 23 deletions src/madengine/deployment/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2963,10 +2963,12 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
)
if record:
self._write_to_perf_csv(record)
# Use nodes=[] so display shows perf_data.performance/metric per row
# (not pod-level nodes whose perf came from log and is empty for multiple_results)
results["successful_runs"].append({
"model": item["model"],
"perf_data": record,
"nodes": results["nodes"],
"nodes": [],
"per_node_metrics": [item],
})
self.console.print(
Expand Down Expand Up @@ -3779,26 +3781,26 @@ def _aggregate_node_metrics(
def _write_to_perf_csv(self, perf_data: Dict):
"""
Write performance data to local perf.csv file.

Uses the same format as local execution for consistency.
Matches the schema from container_runner.py's create_run_details_dict().

When appending to an existing file, uses the file's existing header so that
column order matches (e.g. MAD-private uses a different schema with model/performance
in different positions). Values are mapped by column name so model, performance,
metric, status etc. always land in the correct columns.
"""
import csv
from pathlib import Path

perf_csv_path = Path("perf.csv")

# Check if file exists to determine if we need headers
file_exists = perf_csv_path.exists()

# CSV headers matching local execution format EXACTLY
# This is the same order as in container_runner.py line 69
# Enhanced with topology fields for multi-node tracking
headers = [

# CSV headers for NEW files (madengine standard order)
default_headers = [
"model",
"n_gpus",
"nnodes", # NEW: Number of nodes
"gpus_per_node", # NEW: GPUs per node
"nnodes",
"gpus_per_node",
"training_precision",
"pipeline",
"args",
Expand All @@ -3810,7 +3812,7 @@ def _write_to_perf_csv(self, perf_data: Dict):
"git_commit",
"machine_name",
"deployment_type",
"launcher", # Execution launcher (native, docker, torchrun, etc.)
"launcher",
"gpu_architecture",
"performance",
"metric",
Expand All @@ -3825,17 +3827,31 @@ def _write_to_perf_csv(self, perf_data: Dict):
"build_number",
"additional_docker_run_options",
]

# Write to CSV
with open(perf_csv_path, 'a', newline='') as f:
writer = csv.DictWriter(f, fieldnames=headers, extrasaction='ignore')

# Write headers if new file

file_exists = perf_csv_path.exists()
existing_header = None
if file_exists:
# Read existing header so we write in the same column order (fixes MAD-private
# and other repos that use a different perf.csv schema with more/different columns)
with open(perf_csv_path, "r", newline="", encoding="utf-8", errors="replace") as rf:
reader = csv.reader(rf)
existing_header = next(reader, None)
headers = existing_header if existing_header else default_headers
if file_exists and existing_header:
# Build row by name so model/performance/metric/status go to correct columns
Comment on lines +3831 to +3841
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If perf.csv exists but is empty (or the first row is blank), perf_csv_path.exists() will be True while existing_header is None/empty, which skips writeheader() and appends a data row without any header line. Consider treating an empty/invalid header as a new file (e.g., check stat().st_size == 0 or validate existing_header) and write default_headers before appending rows.

Copilot uses AI. Check for mistakes.
row_by_name = {k: perf_data.get(k, "") for k in headers}
row_to_write = [str(row_by_name.get(h, "")) for h in headers]
else:
row_to_write = perf_data

with open(perf_csv_path, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=headers, extrasaction="ignore")
if not file_exists:
writer.writeheader()

# Write data row (only fields in headers will be written)
writer.writerow(perf_data)
if file_exists and existing_header:
csv.writer(f).writerow(row_to_write)
else:
writer.writerow(row_to_write)
Comment on lines +3847 to +3854
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch builds writer = csv.DictWriter(...) but then writes rows using csv.writer(f).writerow(...). Mixing DictWriter and writer can lead to subtle differences in quoting/dialect and makes the code harder to follow. Consider consistently using DictWriter.writerow(...) (it already respects fieldnames order and fills missing keys with restval) to write both new and existing-header rows.

Copilot uses AI. Check for mistakes.

def cleanup(self, deployment_id: str) -> bool:
"""Delete Job, ConfigMap, Service and associated pods."""
Expand Down