Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions charts/model-engine/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ endpoint_name: ${ENDPOINT_NAME}
{{- define "modelEngine.jobTemplateLabels" -}}
{{- /* Pod-template labels for launch jobs: the shared base labels plus the
       job id and tags.datadoghq.com/* keys (service, user_id, team,
       request_id — presumably Datadog unified service tagging; confirm
       against the Datadog agent config). NOTE(review): the ${...} tokens
       are NOT Helm values — they appear to be placeholders substituted at
       job-creation time by the model-engine service (see the Python
       substitution_kwargs that set JOB_ID/REQUEST_ID/etc.). */ -}}
{{- include "modelEngine.baseTemplateLabels" . | printf "%s\n" -}}
launch_job_id: ${JOB_ID}
tags.datadoghq.com/request_id: ${REQUEST_ID}
tags.datadoghq.com/service: ${JOB_ID}
tags.datadoghq.com/user_id: ${OWNER}
tags.datadoghq.com/team: ${TEAM}
{{- end }}

{{- define "modelEngine.serviceTemplateAsyncAnnotations" -}}
Expand Down
1 change: 1 addition & 0 deletions model-engine/model_engine_server/common/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def get_boolean_env_var(name: str) -> bool:
)
"""The path to the config map containing the Launch service template.
"""
logger.info(f"{LAUNCH_SERVICE_TEMPLATE_CONFIG_MAP_PATH=}")

LAUNCH_SERVICE_TEMPLATE_FOLDER: Optional[str] = os.environ.get("LAUNCH_SERVICE_TEMPLATE_FOLDER")
"""The path to the folder containing the Launch service template. If set, this overrides
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from kubernetes_asyncio.client.rest import ApiException
from model_engine_server.common.config import hmi_config
from model_engine_server.common.env_vars import GIT_TAG
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.core.loggers import (
LoggerTagKey,
LoggerTagManager,
filename_wo_ext,
make_logger,
)
from model_engine_server.domain.entities import BatchJobSerializationFormat
from model_engine_server.domain.exceptions import EndpointResourceInfraException
from model_engine_server.infra.gateways import BatchJobOrchestrationGateway
Expand Down Expand Up @@ -55,6 +60,7 @@ async def create_batch_job_orchestrator(
BATCH_JOB_MAX_RUNTIME=int(timeout_seconds + SHUTDOWN_GRACE_PERIOD),
BATCH_JOB_TTL_SECONDS_AFTER_FINISHED=BATCH_JOB_TTL_SECONDS_AFTER_FINISHED,
GIT_TAG=GIT_TAG,
REQUEST_ID=LoggerTagManager.get(LoggerTagKey.REQUEST_ID) or "",
)
resource_key = "batch-job-orchestration-job.yaml"
deployment_spec = load_k8s_yaml(resource_key, substitution_kwargs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
from model_engine_server.common.dtos.batch_jobs import CreateDockerImageBatchJobResourceRequests
from model_engine_server.common.serialization_utils import python_json_to_b64
from model_engine_server.core.config import infra_config
from model_engine_server.core.loggers import filename_wo_ext, make_logger
from model_engine_server.core.loggers import (
LoggerTagKey,
LoggerTagManager,
filename_wo_ext,
make_logger,
)
from model_engine_server.domain.entities.batch_job_entity import BatchJobStatus, DockerImageBatchJob
from model_engine_server.domain.exceptions import EndpointResourceInfraException
from model_engine_server.domain.gateways.docker_image_batch_job_gateway import (
Expand Down Expand Up @@ -231,6 +236,7 @@ def _generate_job_spec(
# GPU Arguments
GPU_TYPE=resource_requests.gpu_type.value,
GPUS=resource_requests.gpus or 1,
REQUEST_ID=LoggerTagManager.get(LoggerTagKey.REQUEST_ID) or "",
)
else:
resource_key = "docker-image-batch-job-cpu.yaml"
Expand Down Expand Up @@ -259,6 +265,7 @@ def _generate_job_spec(
LOCAL_FILE_NAME=mount_location,
FILE_CONTENTS_B64ENCODED=job_config_b64encoded,
AWS_ROLE=infra_config().profile_ml_inference_worker,
REQUEST_ID=LoggerTagManager.get(LoggerTagKey.REQUEST_ID) or "",
)

resource_spec = load_k8s_yaml(resource_key, substitution_kwargs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ class _JobArguments(_BaseResourceArguments):
JOB_ID: str
BATCH_JOB_MAX_RUNTIME: int
BATCH_JOB_TTL_SECONDS_AFTER_FINISHED: int
REQUEST_ID: str


class _DockerImageBatchJobArguments(_JobArguments):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any, Dict, List, Optional

from model_engine_server.common.dtos.batch_jobs import CreateDockerImageBatchJobResourceRequests
from model_engine_server.core.loggers import logger_name, make_logger
from model_engine_server.domain.entities import FineTuneHparamValueType
from model_engine_server.domain.entities.batch_job_entity import DockerImageBatchJob
from model_engine_server.domain.exceptions import (
Expand All @@ -17,6 +18,8 @@
from model_engine_server.domain.services import LLMFineTuningService
from model_engine_server.infra.repositories.llm_fine_tune_repository import LLMFineTuneRepository

logger = make_logger(logger_name())


class DockerImageBatchJobLLMFineTuningService(LLMFineTuningService):
def __init__(
Expand Down Expand Up @@ -76,6 +79,9 @@ async def create_fine_tune(
# TODO: Pass user-defined labels
labels = dict(team="egp", product="llm-fine-tune")

logger.info(
f"Using bundle {di_batch_job_bundle.id} for fine-tune job: {di_batch_job_bundle.image_repository=}, {di_batch_job_bundle.image_tag=}"
)
batch_job_id = await self.docker_image_batch_job_gateway.create_docker_image_batch_job(
created_by=created_by,
owner=owner,
Expand Down