Skip to content

Commit 5e4d662

Browse files
authored
Fine-tuning e2e integration test (#372)
* make test work
* add status checking
* fix
* test
* wget fix
* final fixes
* move namespace
1 parent 4e2ea6c commit 5e4d662

4 files changed

Lines changed: 64 additions & 23 deletions

File tree

.circleci/config.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,16 @@ jobs:
121121
command: |
122122
sudo apt-get update && sudo apt-get install -y expect
123123
pushd $HOME/project/.circleci/resources
124+
kubectl create namespace model-engine
124125
kubectl apply -f redis-k8s.yaml
125126
kubectl apply -f postgres-k8s.yaml
126127
kubectl create secret generic model-engine-postgres-credentials --from-literal=database_url=postgresql://postgres:circle_test@postgres.default:5432/circle_test
128+
kubectl create secret generic model-engine-postgres-credentials --from-literal=database_url=postgresql://postgres:circle_test@postgres.default:5432/circle_test -n model-engine
127129
export ISTIO_VERSION=1.15.0
128130
popd
129131
curl -L https://istio.io/downloadIstio | TARGET_ARCH=x86_64 sh -
130132
install istio-${ISTIO_VERSION}/bin/istioctl $HOME/bin
131133
$HOME/bin/istioctl install --set profile=demo -y
132-
kubectl create namespace model-engine
133134
kubectl create configmap default-config --from-literal=config="$(cat $HOME/project/.circleci/resources/.minikube-config-map | envsubst)"
134135
kubectl create configmap default-config --namespace model-engine --from-literal=config="$(cat $HOME/project/.circleci/resources/.minikube-config-map | envsubst)"
135136
cat $HOME/project/.circleci/resources/.minikube-registry-creds | envsubst | expect
@@ -142,7 +143,7 @@ jobs:
142143
name: Pre-load integration test images to minikube
143144
command: |
144145
docker build -f model-engine/model_engine_server/inference/pytorch_or_tf.base.Dockerfile \
145-
--build-arg BASE_IMAGE=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime \
146+
--build-arg BASE_IMAGE=python:3.8-slim \
146147
--build-arg REQUIREMENTS_FILE="$CIRCLE_SHA1-base-requirements.txt" \
147148
-t temp:1.7.1-cuda11.0-cudnn8-runtime-$CIRCLE_SHA1 .
148149
@@ -179,7 +180,10 @@ jobs:
179180
command: |
180181
pushd $HOME/project
181182
kubectl port-forward svc/model-engine 5001:80 &
182-
GIT_TAG=$CIRCLE_SHA1 pytest integration_tests
183+
export AWS_ACCESS_KEY_ID=$CIRCLECI_AWS_ACCESS_KEY
184+
export AWS_SECRET_ACCESS_KEY=$CIRCLECI_AWS_SECRET_KEY
185+
export GIT_TAG=$CIRCLE_SHA1
186+
pytest integration_tests
183187
184188
executors:
185189
ubuntu-large:

charts/model-engine/values_circleci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ config:
140140
141141
billing_queue_arn: none
142142
cache_redis_url: redis://redis-message-broker-master.default/15
143-
s3_file_llm_fine_tune_repository: "s3://$CIRCLECI_AWS_S3_BUCKET"
143+
s3_file_llm_fine_tune_repository: "s3://$CIRCLECI_AWS_S3_BUCKET/fine_tune_repository"
144144
dd_trace_enabled: false
145145
istio_enabled: true
146146
tgi_repository: "text-generation-inference"
Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,68 @@
1-
import pytest
1+
import json
2+
import os
3+
import time
24

3-
from .rest_api_utils import ( # CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, CREATE_FINE_TUNE_REQUEST, USER_ID_0, cancel_fine_tune_by_id, create_docker_image_batch_job_bundle, create_fine_tune, get_fine_tune_by_id,
5+
import boto3
6+
import smart_open
7+
8+
from .rest_api_utils import (
9+
CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST,
10+
CREATE_FINE_TUNE_REQUEST,
411
USER_ID_0,
12+
cancel_fine_tune_by_id,
13+
create_docker_image_batch_job_bundle,
14+
create_fine_tune,
15+
get_fine_tune_by_id,
516
list_fine_tunes,
617
)
718

19+
MAX_RETRIES = 10
20+
821

9-
@pytest.mark.skip(reason="test doesn't currently work, needs to be implemented correctly")
1022
def test_fine_tunes() -> None:
11-
# TODO: get this test to work (move LLM fine tune repository to database rather than in S3)
23+
di_batch_job_id = create_docker_image_batch_job_bundle(
24+
CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, USER_ID_0
25+
)["docker_image_batch_job_bundle_id"]
26+
data = {
27+
"test_base_model-lora": {
28+
"docker_image_batch_job_bundle_id": di_batch_job_id,
29+
"launch_bundle_config": {},
30+
"launch_endpoint_config": {},
31+
"default_hparams": {},
32+
"required_params": [],
33+
}
34+
}
1235

13-
# di_batch_job_id = create_docker_image_batch_job_bundle(
14-
# CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, USER_ID_0
15-
# )["docker_image_batch_job_bundle_id"]
36+
if os.getenv("CIRCLECI") == "true":
37+
session = boto3.Session()
38+
aws_s3_bucket = os.getenv("CIRCLECI_AWS_S3_BUCKET")
39+
client = session.client("s3")
40+
with smart_open.open(
41+
f"s3://{aws_s3_bucket}/fine_tune_repository",
42+
"w",
43+
transport_params={"client": client},
44+
) as f:
45+
json.dump(data, f)
1646

17-
# create_response = create_fine_tune(CREATE_FINE_TUNE_REQUEST, USER_ID_0)
18-
# fine_tune_id = create_response["id"]
47+
create_response = create_fine_tune(CREATE_FINE_TUNE_REQUEST, USER_ID_0)
48+
fine_tune_id = create_response["id"]
1949

20-
# get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
21-
# assert get_response["id"] == fine_tune_id
50+
get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
51+
num_retries = 0
52+
while get_response["status"] not in ["SUCCESS", "FAILURE"]:
53+
if num_retries >= MAX_RETRIES:
54+
raise Exception("Fine tune job did not complete in time.")
55+
num_retries += 1
56+
get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
57+
time.sleep(10)
58+
assert get_response["id"] == fine_tune_id
59+
assert get_response["status"] == "SUCCESS"
2260

23-
# list_response_0_before = list_fine_tunes(USER_ID_0)
24-
# num_jobs = len(list_response_0_before["jobs"])
25-
# assert num_jobs >= 1
61+
list_response_0_before = list_fine_tunes(USER_ID_0)
62+
num_jobs = len(list_response_0_before["jobs"])
63+
assert num_jobs >= 1
2664

27-
list_response_1 = list_fine_tunes(USER_ID_0)
28-
assert len(list_response_1["jobs"]) == 0
65+
cancel_fine_tune_by_id(fine_tune_id, USER_ID_0)
2966

30-
# list_response_0_after = list_fine_tunes(USER_ID_0)
31-
# assert len(list_response_0_after["jobs"]) == num_jobs - 1
67+
list_response_0_after = list_fine_tunes(USER_ID_0)
68+
assert len(list_response_0_after["jobs"]) == num_jobs - 1

model-engine/model_engine_server/inference/pytorch_or_tf.base.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ RUN apt-get update && apt-get install -y \
2727
&& rm -rf /var/lib/apt/lists/*
2828

2929
# Apparently wget has a vulnerability so we remove it here
30-
RUN apt-get remove wget -y
30+
RUN dpkg -l | grep wget && apt-get remove wget -y || echo "wget not installed, skipping removal"
3131

3232
# Create a virtualenv for python so we install our packages in the right place
3333
# Not sure how useful the existing contents of the pytorch image are anymore :/ Maybe it's used for cuda/cudnn installs

0 commit comments

Comments
 (0)