Skip to content

Commit 5e4d662

Browse files
authored
Fine-tuning e2e integration test (#372)
* make test work
* add status checking
* fix
* test
* wget fix
* final fixes
* move namespace
1 parent 4e2ea6c commit 5e4d662

4 files changed

Lines changed: 64 additions & 23 deletions

File tree

.circleci/config.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,16 @@ jobs:
121121
command: |
122122
sudo apt-get update && sudo apt-get install -y expect
123123
pushd $HOME/project/.circleci/resources
124+
kubectl create namespace model-engine
124125
kubectl apply -f redis-k8s.yaml
125126
kubectl apply -f postgres-k8s.yaml
126127
kubectl create secret generic model-engine-postgres-credentials --from-literal=database_url=postgresql://postgres:circle_test@postgres.default:5432/circle_test
128+
kubectl create secret generic model-engine-postgres-credentials --from-literal=database_url=postgresql://postgres:circle_test@postgres.default:5432/circle_test -n model-engine
127129
export ISTIO_VERSION=1.15.0
128130
popd
129131
curl -L https://istio.io/downloadIstio | TARGET_ARCH=x86_64 sh -
130132
install istio-${ISTIO_VERSION}/bin/istioctl $HOME/bin
131133
$HOME/bin/istioctl install --set profile=demo -y
132-
kubectl create namespace model-engine
133134
kubectl create configmap default-config --from-literal=config="$(cat $HOME/project/.circleci/resources/.minikube-config-map | envsubst)"
134135
kubectl create configmap default-config --namespace model-engine --from-literal=config="$(cat $HOME/project/.circleci/resources/.minikube-config-map | envsubst)"
135136
cat $HOME/project/.circleci/resources/.minikube-registry-creds | envsubst | expect
@@ -142,7 +143,7 @@ jobs:
142143
name: Pre-load integration test images to minikube
143144
command: |
144145
docker build -f model-engine/model_engine_server/inference/pytorch_or_tf.base.Dockerfile \
145-
--build-arg BASE_IMAGE=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime \
146+
--build-arg BASE_IMAGE=python:3.8-slim \
146147
--build-arg REQUIREMENTS_FILE="$CIRCLE_SHA1-base-requirements.txt" \
147148
-t temp:1.7.1-cuda11.0-cudnn8-runtime-$CIRCLE_SHA1 .
148149
@@ -179,7 +180,10 @@ jobs:
179180
command: |
180181
pushd $HOME/project
181182
kubectl port-forward svc/model-engine 5001:80 &
182-
GIT_TAG=$CIRCLE_SHA1 pytest integration_tests
183+
export AWS_ACCESS_KEY_ID=$CIRCLECI_AWS_ACCESS_KEY
184+
export AWS_SECRET_ACCESS_KEY=$CIRCLECI_AWS_SECRET_KEY
185+
export GIT_TAG=$CIRCLE_SHA1
186+
pytest integration_tests
183187
184188
executors:
185189
ubuntu-large:

charts/model-engine/values_circleci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ config:
140140
141141
billing_queue_arn: none
142142
cache_redis_url: redis://redis-message-broker-master.default/15
143-
s3_file_llm_fine_tune_repository: "s3://$CIRCLECI_AWS_S3_BUCKET"
143+
s3_file_llm_fine_tune_repository: "s3://$CIRCLECI_AWS_S3_BUCKET/fine_tune_repository"
144144
dd_trace_enabled: false
145145
istio_enabled: true
146146
tgi_repository: "text-generation-inference"
Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,68 @@
1-
import pytest
1+
import json
2+
import os
3+
import time
24

3-
from .rest_api_utils import ( # CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, CREATE_FINE_TUNE_REQUEST, USER_ID_0, cancel_fine_tune_by_id, create_docker_image_batch_job_bundle, create_fine_tune, get_fine_tune_by_id,
5+
import boto3
6+
import smart_open
7+
8+
from .rest_api_utils import (
9+
CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST,
10+
CREATE_FINE_TUNE_REQUEST,
411
USER_ID_0,
12+
cancel_fine_tune_by_id,
13+
create_docker_image_batch_job_bundle,
14+
create_fine_tune,
15+
get_fine_tune_by_id,
516
list_fine_tunes,
617
)
718

19+
MAX_RETRIES = 10
20+
821

9-
@pytest.mark.skip(reason="test doesn't currently work, needs to be implemented correctly")
1022
def test_fine_tunes() -> None:
11-
# TODO: get this test to work (move LLM fine tune repository to database rather than in S3)
23+
di_batch_job_id = create_docker_image_batch_job_bundle(
24+
CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, USER_ID_0
25+
)["docker_image_batch_job_bundle_id"]
26+
data = {
27+
"test_base_model-lora": {
28+
"docker_image_batch_job_bundle_id": di_batch_job_id,
29+
"launch_bundle_config": {},
30+
"launch_endpoint_config": {},
31+
"default_hparams": {},
32+
"required_params": [],
33+
}
34+
}
1235

13-
# di_batch_job_id = create_docker_image_batch_job_bundle(
14-
# CREATE_FINE_TUNE_DI_BATCH_JOB_BUNDLE_REQUEST, USER_ID_0
15-
# )["docker_image_batch_job_bundle_id"]
36+
if os.getenv("CIRCLECI") == "true":
37+
session = boto3.Session()
38+
aws_s3_bucket = os.getenv("CIRCLECI_AWS_S3_BUCKET")
39+
client = session.client("s3")
40+
with smart_open.open(
41+
f"s3://{aws_s3_bucket}/fine_tune_repository",
42+
"w",
43+
transport_params={"client": client},
44+
) as f:
45+
json.dump(data, f)
1646

17-
# create_response = create_fine_tune(CREATE_FINE_TUNE_REQUEST, USER_ID_0)
18-
# fine_tune_id = create_response["id"]
47+
create_response = create_fine_tune(CREATE_FINE_TUNE_REQUEST, USER_ID_0)
48+
fine_tune_id = create_response["id"]
1949

20-
# get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
21-
# assert get_response["id"] == fine_tune_id
50+
get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
51+
num_retries = 0
52+
while get_response["status"] not in ["SUCCESS", "FAILURE"]:
53+
if num_retries >= MAX_RETRIES:
54+
raise Exception("Fine tune job did not complete in time.")
55+
num_retries += 1
56+
get_response = get_fine_tune_by_id(fine_tune_id, USER_ID_0)
57+
time.sleep(10)
58+
assert get_response["id"] == fine_tune_id
59+
assert get_response["status"] == "SUCCESS"
2260

23-
# list_response_0_before = list_fine_tunes(USER_ID_0)
24-
# num_jobs = len(list_response_0_before["jobs"])
25-
# assert num_jobs >= 1
61+
list_response_0_before = list_fine_tunes(USER_ID_0)
62+
num_jobs = len(list_response_0_before["jobs"])
63+
assert num_jobs >= 1
2664

27-
list_response_1 = list_fine_tunes(USER_ID_0)
28-
assert len(list_response_1["jobs"]) == 0
65+
cancel_fine_tune_by_id(fine_tune_id, USER_ID_0)
2966

30-
# list_response_0_after = list_fine_tunes(USER_ID_0)
31-
# assert len(list_response_0_after["jobs"]) == num_jobs - 1
67+
list_response_0_after = list_fine_tunes(USER_ID_0)
68+
assert len(list_response_0_after["jobs"]) == num_jobs - 1

model-engine/model_engine_server/inference/pytorch_or_tf.base.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ RUN apt-get update && apt-get install -y \
2727
&& rm -rf /var/lib/apt/lists/*
2828

2929
# Apparently wget has a vulnerability so we remove it here
30-
RUN apt-get remove wget -y
30+
RUN dpkg -l | grep wget && apt-get remove wget -y || echo "wget not installed, skipping removal"
3131

3232
# Create a virtualenv for python so we install our packages in the right place
3333
# Not sure how useful the existing contents of the pytorch image are anymore :/ Maybe it's used for cuda/cudnn installs

0 commit comments

Comments
 (0)