diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..db580678d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +models/ +.ipynb_checkpoints +*.gz +*.csv diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..0ff9cf764 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/home/yasha/anaconda3/bin/python" +} \ No newline at end of file diff --git a/README.md b/README.md index d4aab6e1a..7915ecc59 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,14 @@ # Function Templates and Examples -This repo stores ML and data processing related functions and examples +This repo stores ML and data processing related functions and examples: + +## open archive +Open an archive and extract its contents. + +## archive to parquet +Retrieve an archive, extract its contents and store as a parquet file. + +## xgboost model server +Load a serialized xgboost model and deploy as a model server. + diff --git a/datagen/binary_classes/binary.py b/datagen/binary_classes/binary.py new file mode 100644 index 000000000..715ab3d4c --- /dev/null +++ b/datagen/binary_classes/binary.py @@ -0,0 +1,78 @@ +# Copyright 2019 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def create_binary_classification(
    context: MLClientCtx = None,
    n_samples: int = 100_000,
    m_features: int = 20,
    features_hdr: Optional[List[str]] = None,
    weight: float = 0.50,
    random_state=1,
    filename: Optional[str] = None,
    target_path: str = "",
    key: str = "",
    **sk_params,
):
    """Create a binary classification sample dataset and save it as parquet.

    If no filename is given, one is generated inside ``target_path`` as
    ``simdata-{n_samples:0.0e}X{m_features}.parquet`` with any ``+`` removed
    (e.g. ``simdata-1e05X20.parquet`` for the defaults).  Note that this
    format keeps a single significant digit of ``n_samples``, so different
    sample counts can map to the same generated name.

    An explicitly given ``filename`` is used as-is; it is NOT joined with
    ``target_path``.

    All of the scikit-learn parameters can be set using **sk_params.

    :param context:      function context; when None the artifact is not
                         logged and the file is only written to disk
    :param n_samples:    number of rows/samples
    :param m_features:   number of cols/features
    :param features_hdr: header for features array, must contain exactly
                         ``m_features`` names
    :param weight:       passed as the single entry of ``weights`` to
                         ``make_classification`` (fraction of the first class)
    :param random_state: rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param filename:     optional name (including path) for stored data file
    :param target_path:  destination folder for a generated file name
    :param key:          key of data in artifact store
    :param sk_params:    keyword arguments for scikit-learn's 'make_classification'

    :returns: filename of created data (includes path).
    :raises ValueError: if ``features_hdr`` does not hold ``m_features`` names
    """
    # fail before generating any data if the header cannot fit the features
    if features_hdr and len(features_hdr) != m_features:
        raise ValueError(
            f"features_hdr has {len(features_hdr)} names, expected {m_features}"
        )

    # os.makedirs("") raises FileNotFoundError, so only create a real folder
    if target_path:
        os.makedirs(target_path, exist_ok=True)
    if not filename:
        name = f"simdata-{n_samples:0.0e}X{m_features}.parquet".replace("+", "")
        filename = os.path.join(target_path, name)

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=m_features,
        weights=[weight],
        n_classes=2,
        random_state=random_state,
        **sk_params,
    )

    # make dataframes, add column names, concatenate (X, y)
    X = pd.DataFrame(features)
    if not features_hdr:
        X.columns = ["feat_" + str(x) for x in range(m_features)]
    else:
        X.columns = features_hdr

    y = pd.DataFrame(labels, columns=["labels"])
    data = pd.concat([X, y], axis=1)

    pq.write_table(pa.Table.from_pandas(data), filename)

    # context defaults to None, so logging has to be optional
    if context is not None:
        context.log_artifact(key, target_path=filename)

    return filename
def arc_to_parquet(
    context: MLClientCtx,
    archive_url: Union[str, Path, IO[AnyStr]],
    header: Optional[List[str]] = None,
    target_path: str = "",
    name: str = "",
    chunksize: int = 10_000,
    log_data: bool = True,
    add_uid: bool = False,
    key: str = "raw_data",
) -> None:
    """Open a file/object archive and save as a parquet file.

    The archive is read in chunks with ``pandas.read_csv`` and appended to a
    single parquet file.  If the destination file already exists nothing is
    downloaded.  If reading or writing fails part-way, the incomplete
    destination file is removed so that a later run does not mistake it for
    a finished download.

    :param context:     function context
    :param archive_url: any valid string path consistent with the path variable
                        of pandas.read_csv, including strings as file paths, as urls,
                        pathlib.Path objects, etc...
    :param header:      column names
    :param target_path: destination folder of table
    :param name:        name of the file to be saved locally; ".parquet" is
                        appended when missing
    :param chunksize:   (10_000) row size retrieved per iteration
    :param log_data:    (True) if True, log the data so that it is available
                        at the next step
    :param add_uid:     (False) add the metadata uid to the target_path so that
                        runs can be identified
    :param key:         key in artifact store (when log_data=True)
    """
    if not name.endswith(".parquet"):
        name += ".parquet"

    uid = context.uid if add_uid else ""

    dest_dir = os.path.join(target_path, uid)
    dest_path = os.path.join(target_path, uid, name)
    # with empty target_path and uid there is no folder to create, and
    # os.makedirs("") would raise FileNotFoundError
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    if not os.path.isfile(dest_path):
        context.logger.info("destination file does not exist, downloading")
        pqwriter = None
        completed = False
        try:
            for df in pd.read_csv(archive_url, chunksize=chunksize, names=header):
                table = pa.Table.from_pandas(df)
                if pqwriter is None:
                    # the first chunk fixes the schema of the whole file
                    pqwriter = pq.ParquetWriter(dest_path, table.schema)
                pqwriter.write_table(table)
            completed = True
        finally:
            # always release the file handle, also when a chunk fails
            if pqwriter is not None:
                pqwriter.close()
            # a truncated file would pass the isfile() check on the next run
            if not completed and os.path.isfile(dest_path):
                os.remove(dest_path)

        context.logger.info(f"saved table to {dest_path}")
    else:
        context.logger.info("destination file already exists")

    if log_data:
        context.logger.info(f"assign data to {key} in artifact store")
        context.log_artifact(key, target_path=dest_path)
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDIwLTAxLTA5IDE3OjA3CgppbXBvcnQgb3MKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIHR5cGluZyBpbXBvcnQgSU8sIEFueVN0ciwgVW5pb24sIExpc3QKZnJvbSBwYXRobGliIGltcG9ydCBQYXRoCgppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCBweWFycm93LnBhcnF1ZXQgYXMgcHEKaW1wb3J0IHB5YXJyb3cgYXMgcGEKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgYXJjaGl2ZV91cmw6IFVuaW9uW3N0ciwgUGF0aCwgSU9bQW55U3RyXV0sCiAgICBoZWFkZXI6IFVuaW9uW05vbmUsIExpc3Rbc3RyXV0gPSBOb25lLAogICAgdGFyZ2V0X3BhdGg6IHN0ciA9ICIiLAogICAgbmFtZTogc3RyID0gIiIsCiAgICBjaHVua3NpemU6IGludCA9IDEwXzAwMCwKICAgIGxvZ19kYXRhOiBib29sID0gVHJ1ZSwKICAgIGtleTogc3RyID0gJ3Jhd19kYXRhJwopIC0+IE5vbmU6CiAgICAiIiJPcGVuIGEgZmlsZS9vYmplY3QgYXJjaGl2ZSBhbmQgc2F2ZSBhcyBhIHBhcnF1ZXQgZmlsZS4KICAgIAogICAgQXJnczoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGFyY2hpdmVfdXJsOiBhbnkgdmFsaWQgc3RyaW5nIHBhdGggY29uc2lzdGVudCB3aXRoIHRoZSBwYXRoIHZhcmlhYmxlCiAgICAgICAgICAgICAgICAgICAgICAgIG9mIHBhbmRhcy5yZWFkX2Nzdi4gbmNsdWRpbmcgc3RyaW5ncyBhcyBmaWxlIHBhdGhzLCBhcyB1cmxzLCAKICAgICAgICAgICAgICAgICAgICAgICAgcGF0aGxpYi5QYXRoIG9iamVjdHMsIGV0Yy4uLgogICAgOnBhcmFtIGhlYWRlcjogICAgICBjb2x1bW4gbmFtZXMKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogZGVzdGluYXRpb24gZm9sZGVyIG9mIHRhYmxlCiAgICA6cGFyYW0gbmFtZTogICAgICAgIG5hbWUgZmlsZSB0byBiZSBzYXZlZCBsb2NhbGx5LCBhbHNvCiAgICA6cGFyYW0gY2h1bmtzaXplOiAgICgwKSByb3cgc2l6ZSByZXRyaWV2ZWQgcGVyIGl0ZXJhdGlvbgogICAgOnBhcmFtIGxvZ19kYXRhOiAgICAoVHJ1ZSkgaWYgVHJ1ZSwgbG9nIHRoZSBkYXRhIHNvIHRoYXQgaXQgaXMgYXZhaWxhYmxlCiAgICAgICAgICAgICAgICAgICAgICAgIGF0IHRoZSBuZXh0IHN0ZXAKICAgICIiIgogICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGgsIGV4aXN0X29rPVRydWUpCgogICAgaWYgbm90IG5hbWUuZW5kc3dpdGgoIi5wYXJxdWV0Iik6CiAgICAgICAgbmFtZSArPSAiLnBhcnF1ZXQiCgogICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lKQoKICAgIGlmIG5vdCBvcy5wYXRoLmlzZmlsZShkZXN0X3BhdGgpOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oImRlc3RpbmF0aW9uIGZpbGUgZG9lcyBub3QgZXhpc3QsIGRvd25sb2FkaW5nIikKICAgICAgICBwcXdyaXRlciA9IE5v
bmUKICAgICAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKAogICAgICAgICAgICBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCwgY2h1bmtzaXplPWNodW5rc2l6ZSwgbmFtZXM9aGVhZGVyKQogICAgICAgICk6CiAgICAgICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgICAgIGlmIGkgPT0gMDoKICAgICAgICAgICAgICAgIHBxd3JpdGVyID0gcHEuUGFycXVldFdyaXRlcihkZXN0X3BhdGgsIHRhYmxlLnNjaGVtYSkKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCgogICAgICAgIGlmIHBxd3JpdGVyOgogICAgICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJzYXZlZCB0YWJsZSB0byB7ZGVzdF9wYXRofSIpCiAgICBlbHNlOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oImRlc3RpbmF0aW9uIGZpbGUgZXhpc3RzIikKCiAgICBpZiBsb2dfZGF0YToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYibG9nZ2luZyB7ZGVzdF9wYXRofSB0byBjb250ZXh0IikKICAgICAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChrZXksIHRhcmdldF9wYXRoPWRlc3RfcGF0aCkKCg== + base_image: python:3.6-jessie + commands: + - pip install -q mlrun + - pip install -q pyarrow + - pip install -q numpy + - pip install -q pandas \ No newline at end of file diff --git a/fileutils/README.md b/fileutils/open_archive/README.md similarity index 77% rename from fileutils/README.md rename to fileutils/open_archive/README.md index 9d319610a..91c275ef6 100644 --- a/fileutils/README.md +++ b/fileutils/open_archive/README.md @@ -2,23 +2,23 @@ ## open_archive -Example function which can open a remote zip archive into a local target folder, [see source](file_utils.py). +Open a remote zip archive into a local target folder, [ource](file_utils.py). 
Usage example: ```python # load function from Github -xfn = mlrun.import_function('https://raw.githubusercontent.com/mlrun/functions/master/fileutils/function.yaml') +xfn = mlrun.import_function('https://raw.githubusercontent.com/mlrun/functions/master/fileutils/open_archive/function.yaml') # configute it: mount on iguazio fabric, set as interactive (return stdout) xfn.apply(mlrun.mount_v3io()) xfn.interactive = True # create and run the task -images_path = '/User/mlrun/examples/images' +images_path = '/User/mlrun/functions/images' open_archive_task = mlrun.NewTask('download', handler='open_archive', - params={'target_dir': images_path}, - inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'}) + params={'target_dir': images_path}, + inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'}) # run run = xfn.run(open_archive_task) @@ -40,4 +40,4 @@ type result.show() to see detailed results/progress or use CLI: !mlrun get run --uid 2ec277feb3b644e2a45c92ce8cb2537a [mlrun] 2019-10-28 22:31:03,699 run executed, status=completed -``` +``` \ No newline at end of file diff --git a/fileutils/file_utils.py b/fileutils/open_archive/file_utils.py similarity index 99% rename from fileutils/file_utils.py rename to fileutils/open_archive/file_utils.py index 4128d225e..b8cae15f1 100644 --- a/fileutils/file_utils.py +++ b/fileutils/open_archive/file_utils.py @@ -21,4 +21,4 @@ def open_archive(context, context.logger.info(f'extracted archive to {target_dir}') context.log_artifact('content', target_path=target_dir) - + \ No newline at end of file diff --git a/fileutils/function.yaml b/fileutils/open_archive/function.yaml similarity index 100% rename from fileutils/function.yaml rename to fileutils/open_archive/function.yaml diff --git a/serving/xgboost/xgb_serving.ipynb b/serving/xgboost/xgb_serving.ipynb new file mode 100644 index 000000000..0d0314c65 --- /dev/null +++ b/serving/xgboost/xgb_serving.ipynb @@ -0,0 +1,363 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deploy a Serverless Model Server with Nuclio-KFServing\n", + " --------------------------------------------------------------------\n", + "\n", + "The following notebook demonstrates how to deploy an XGBoost model using nuclio + KFServing (a.k.a Nuclio-serving)\n", + "\n", + "#### **notebook how-to's**\n", + "* Write and test model serving (KFServing) class in a notebook.\n", + "* Deploy the model server as a Nuclio-serving function.\n", + "* Invoke and test the serving function." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### **steps**\n", + "**[define a new function and its dependencies](#define-function)**
\n", + "**[test the model serving class locally](#test-locally)**
\n", + "**[deploy our serving class as a serverless function](#deploy)**
\n", + "**[test our model server using HTTP request](#test-model-server)**
BOOSTER_FILE = "model.bst"


class XGBoostModel(kfserving.KFModel):
    """KFServing model wrapper around an XGBoost ``Booster``.

    The booster is either handed in at construction time or read by
    ``load()`` from ``<model_dir>/model.bst``.
    """

    def __init__(self, name: str, model_dir: str, booster: xgb.Booster = None):
        """Store the model location and, optionally, a ready booster.

        :param name:      model name, passed on to ``kfserving.KFModel``
        :param model_dir: folder/URL holding the ``model.bst`` file
        :param booster:   optional pre-loaded booster; when given the model
                          is marked ready without calling ``load()``
        """
        super().__init__(name)
        self.name = name
        self.model_dir = model_dir
        # always define the attribute so predict() before load() fails clearly
        self._booster = booster
        if booster is not None:
            self.ready = True

    def load(self):
        """Download the model folder and read the booster file from it."""
        model_file = os.path.join(
            kfserving.Storage.download(self.model_dir), BOOSTER_FILE)
        self._booster = xgb.Booster(model_file=model_file)
        self.ready = True

    def predict(self, body):
        """Run the booster on ``body['instances']`` and return a nested list.

        :param body: request dict with the input rows under ``'instances'``
        :raises Exception: wrapping any error raised while predicting
        """
        try:
            # Use of list as input is deprecated see https://github.com/dmlc/xgboost/pull/3970
            # so hand DMatrix a numpy array instead of the raw list
            dmatrix = xgb.DMatrix(np.asarray(body['instances']))
            result: np.ndarray = self._booster.predict(dmatrix)
            return result.tolist()
        except Exception as e:
            # keep the original traceback attached to the wrapped error
            raise Exception("Failed to predict %s" % e) from e
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[0.789408266544342,\n", + " 0.02588181011378765,\n", + " 0.02631426602602005,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943],\n", + " [0.789408266544342,\n", + " 0.02588181011378765,\n", + " 0.02631426602602005,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943,\n", + " 0.022627953439950943]]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_server.predict({\"instances\":[[5], [10]]})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### **deploy our serving class using as a serverless function**\n", + "in the following section we create a new model serving function which wraps our class , and specify model and other resources.\n", + "\n", + "the `models` dict store model names and the assosiated model **dir** URL (the URL can start with `S3://` and other blob store options), the faster way is to use a shared file volume, we use `.apply(mount_v3io())` to attach a v3io (iguazio data fabric) volume to our function. 
By default v3io will mount the current user home into the `\\User` function path.\n", + "\n", + "**verify the model dir does contain a valid `model.bst` file**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlrun import new_model_server, mount_v3io\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn = new_model_server('iris-srv', \n", + " models={'iris_v1': model_dir}, \n", + " model_class='XGBoostModel')\n", + "\n", + "fn.apply(mount_v3io()) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[mlrun] 2019-12-25 21:00:29,593 deploy started\n", + "[nuclio] 2019-12-25 21:00:30,683 (info) Building processor image\n", + "[nuclio] 2019-12-25 21:00:36,743 (info) Build complete\n", + "[nuclio] 2019-12-25 21:00:42,816 (info) Function deploy complete\n", + "[nuclio] 2019-12-25 21:00:42,826 done creating iris-srv, function address: 13.58.34.174:32590\n" + ] + } + ], + "source": [ + "addr = fn.deploy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### **test our model server using HTTP request**\n", + "\n", + "\n", + "We invoke our model serving function using test data, the data vector is specified in the `instances` attribute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# KFServing protocol event\n", + "event_data = {\"instances\":[[5], [10]]}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.789408266544342, 0.02588181011378765, 0.02631426602602005, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943], [0.789408266544342, 0.02588181011378765, 0.02631426602602005, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943, 0.022627953439950943]]\n" + ] + } + ], + "source": [ + "import json\n", + "resp = requests.put(addr + '/iris_v1/predict', json=json.dumps(event_data))\n", + "print(resp.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**[back to top](#top)**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/file-utils.ipynb b/tests/file-utils.ipynb new file mode 100644 index 000000000..d3fdb407f --- /dev/null +++ b/tests/file-utils.ipynb @@ -0,0 +1,880 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import mlrun\n", + "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# archive to folder" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + 
"outputs": [], + "source": [ + "import urllib3\n", + "urllib3.disable_warnings()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[mlrun] 2020-01-20 08:36:14,989 starting run download uid=79a5b0f103c24367961cf8c107126dd2 -> http://mlrun-api:8080\n", + "[mlrun] 2020-01-20 08:36:15,069 Job is running in the background, pod: download-6mg4q\n", + "[mlrun] 2020-01-20 08:36:19,610 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp\n", + "[mlrun] 2020-01-20 08:36:21,218 Verified directories\n", + "[mlrun] 2020-01-20 08:36:21,218 Extracting zip\n", + "[mlrun] 2020-01-20 08:36:22,988 extracted archive to content\n", + "[mlrun] 2020-01-20 08:36:23,001 log artifact content at content, size: None, db: Y\n", + "\n", + "[mlrun] 2020-01-20 08:36:23,011 run executed, status=completed\n", + "final state: succeeded\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uiditerstartstatenamelabelsinputsparametersresultsartifacts
...126dd2
0Jan 20 08:36:19completedfile_utils
host=download-6mg4q
kind=job
owner=admin
archive_url
key=contents
target_path=/User/mlrun/functions/images
content
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 79a5b0f103c24367961cf8c107126dd2 , !mlrun logs 79a5b0f103c24367961cf8c107126dd2 \n", + "[mlrun] 2020-01-20 08:36:24,208 run executed, status=completed\n" + ] + } + ], + "source": [ + "# load function from Github\n", + "xfn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/open_archive/function.yaml')\n", + "\n", + "# configute it: mount on iguazio fabric, set as interactive (return stdout)\n", + "xfn.apply(mlrun.mount_v3io())\n", + "xfn.interactive = True\n", + "\n", + "# create and run the task\n", + "\n", + "images_path = '/User/mlrun/functions/images'\n", + "\n", + "open_archive_task = mlrun.NewTask(\n", + " 'download',\n", + " handler='open_archive', \n", + " params={'target_path': images_path,\n", + " 'key' : 'contents'},\n", + " inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'}\n", + ")\n", + "\n", + "# run\n", + "run = xfn.run(open_archive_task)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_________" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# archive to parquet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### load and configure function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# load function from Github\n", + "xfn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/arc_to_parquet/arc_to_parquet.yaml')\n", + "\n", + "# configure function: mount on the Iguazio data fabric, set as interactive (return stdout)\n", + "xfn.apply(mlrun.mount_v3io())\n", + "xfn.interactive = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "#### deploy / build" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following triggers a build when run for the first time using specs found in the yaml file above. Unless that file changes, this only needs to be run once, even after the notebook has been restarted:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[mlrun] 2020-01-20 05:35:07,015 starting remote build, image: .mlrun/func-default-arc_to_parquet-latest\n", + "\u001b[36mINFO\u001b[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory \n", + "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n", + "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory \n", + "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n", + "\u001b[36mINFO\u001b[0m[0001] Unpacking rootfs as cmd RUN pip install -q mlrun requires it. \n", + "\u001b[36mINFO\u001b[0m[0011] Taking snapshot of full filesystem... 
\n", + "\u001b[36mINFO\u001b[0m[0018] RUN pip install -q mlrun \n", + "\u001b[36mINFO\u001b[0m[0018] cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0018] args: [-c pip install -q mlrun] \n", + "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n", + "\u001b[36mINFO\u001b[0m[0065] Taking snapshot of full filesystem... \n", + "\u001b[36mINFO\u001b[0m[0082] RUN pip install -q pyarrow \n", + "\u001b[36mINFO\u001b[0m[0082] cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0082] args: [-c pip install -q pyarrow] \n", + "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n", + "\u001b[36mINFO\u001b[0m[0086] Taking snapshot of full filesystem... \n", + "\u001b[36mINFO\u001b[0m[0095] RUN pip install -q numpy \n", + "\u001b[36mINFO\u001b[0m[0095] cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0095] args: [-c pip install -q numpy] \n", + "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n", + "\u001b[36mINFO\u001b[0m[0096] Taking snapshot of full filesystem... \n", + "\u001b[36mINFO\u001b[0m[0099] RUN pip install -q pandas \n", + "\u001b[36mINFO\u001b[0m[0099] cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0099] args: [-c pip install -q pandas] \n", + "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n", + "\u001b[36mINFO\u001b[0m[0100] Taking snapshot of full filesystem... 
\n", + "\u001b[36mINFO\u001b[0m[0102] RUN pip install mlrun \n", + "\u001b[36mINFO\u001b[0m[0102] cmd: /bin/sh \n", + "\u001b[36mINFO\u001b[0m[0102] args: [-c pip install mlrun] \n", + "Requirement already satisfied: mlrun in /usr/local/lib/python3.6/site-packages (0.4.3)\n", + "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (7.0)\n", + "Requirement already satisfied: gunicorn==19.9.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (19.9.0)\n", + "Requirement already satisfied: requests>=2.20.1 in /usr/local/lib/python3.6/site-packages (from mlrun) (2.22.0)\n", + "Requirement already satisfied: aiohttp>=3.5.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (3.6.2)\n", + "Requirement already satisfied: sqlalchemy==1.3.11 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.3.11)\n", + "Requirement already satisfied: gevent==1.4.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.4.0)\n", + "Requirement already satisfied: nuclio-jupyter>=0.8.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.8.0)\n", + "Requirement already satisfied: kfp>=0.1.29 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.1.40)\n", + "Requirement already satisfied: GitPython>=2.1.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (3.0.5)\n", + "Requirement already satisfied: Flask>=1.1.1 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.1.1)\n", + "Requirement already satisfied: pandas>=0.23.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.25.3)\n", + "Requirement already satisfied: tabulate<=0.8.3,>=0.8.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.8.3)\n", + "Requirement already satisfied: boto3>=1.9 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.11.5)\n", + "Requirement already satisfied: pyyaml>=5.1.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (5.3)\n", + "Requirement already satisfied: nest-asyncio>=1.0.0 in 
/usr/local/lib/python3.6/site-packages (from mlrun) (1.2.2)\n", + "Requirement already satisfied: nuclio-sdk>=0.0.3 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.0.5)\n", + "Requirement already satisfied: croniter==0.3.31 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.3.31)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (2019.11.28)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (3.0.4)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (2.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (1.24.3)\n", + "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.1)\n", + "Requirement already satisfied: idna-ssl>=1.0; python_version < \"3.7\" in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (1.1.0)\n", + "Requirement already satisfied: multidict<5.0,>=4.5 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (4.7.4)\n", + "Requirement already satisfied: typing-extensions>=3.6.5; python_version < \"3.7\" in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (3.7.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (1.4.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (19.3.0)\n", + "Requirement already satisfied: greenlet>=0.4.14; platform_python_implementation == \"CPython\" in /usr/local/lib/python3.6/site-packages (from gevent==1.4.0->mlrun) (0.4.15)\n", + "Requirement already satisfied: notebook>=5.7.2 in 
/usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (6.0.2)\n", + "Requirement already satisfied: jupyterlab>=0.35.4 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (1.2.5)\n", + "Requirement already satisfied: tornado<6,>=5 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.1.1)\n", + "Requirement already satisfied: ipython>=7.2 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (7.11.1)\n", + "Requirement already satisfied: nbconvert>=5.4 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.6.1)\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.8.1)\n", + "Requirement already satisfied: argo-models==2.2.1a in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.2.1a0)\n", + "Requirement already satisfied: cloudpickle==1.1.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.1.1)\n", + "Requirement already satisfied: kubernetes<=10.0.0,>=8.0.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (10.0.0)\n", + "Requirement already satisfied: Deprecated in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.2.7)\n", + "Requirement already satisfied: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.25.0)\n", + "Requirement already satisfied: google-auth>=1.6.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.10.1)\n", + "Requirement already satisfied: requests-toolbelt>=0.8.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (0.9.1)\n", + "Requirement already satisfied: PyJWT>=1.6.4 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.7.1)\n", + "Requirement already satisfied: jsonschema>=3.0.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (3.2.0)\n", + 
"Requirement already satisfied: cryptography>=2.4.2 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.8)\n", + "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.14.0)\n", + "Requirement already satisfied: kfp-server-api<=0.1.40,>=0.1.18 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (0.1.40)\n", + "Requirement already satisfied: gitdb2>=2.0.0 in /usr/local/lib/python3.6/site-packages (from GitPython>=2.1.0->mlrun) (2.0.6)\n", + "Requirement already satisfied: Werkzeug>=0.15 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (0.16.0)\n", + "Requirement already satisfied: itsdangerous>=0.24 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (1.1.0)\n", + "Requirement already satisfied: Jinja2>=2.10.1 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (2.10.3)\n", + "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/site-packages (from pandas>=0.23.0->mlrun) (1.18.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/site-packages (from pandas>=0.23.0->mlrun) (2019.3)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (0.9.4)\n", + "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (0.3.1)\n", + "Requirement already satisfied: botocore<1.15.0,>=1.14.5 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (1.14.5)\n", + "Requirement already satisfied: jupyter-client>=5.3.4 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.3.4)\n", + "Requirement already satisfied: traitlets>=4.2.1 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (4.3.3)\n", + "Requirement already satisfied: Send2Trash in 
/usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (1.5.0)\n", + "Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.8.3)\n", + "Requirement already satisfied: pyzmq>=17 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (18.1.1)\n", + "Requirement already satisfied: prometheus-client in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.1)\n", + "Requirement already satisfied: jupyter-core>=4.6.0 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (4.6.1)\n", + "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.0.3)\n", + "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.1.3)\n", + "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.2.0)\n", + "Requirement already satisfied: jupyterlab-server~=1.0.0 in /usr/local/lib/python3.6/site-packages (from jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (1.0.6)\n", + "Requirement already satisfied: jedi>=0.10 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.15.2)\n", + "Requirement already satisfied: pygments in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (2.5.2)\n", + "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (3.0.2)\n", + "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (41.0.1)\n", + 
"Requirement already satisfied: backcall in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.0)\n", + "Requirement already satisfied: decorator in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.4.1)\n", + "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.7.0)\n", + "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.5)\n", + "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (1.4.2)\n", + "Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.3)\n", + "Requirement already satisfied: testpath in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.4.4)\n", + "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.4)\n", + "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n", + "Requirement already satisfied: bleach in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (3.1.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (0.57.0)\n", + "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (1.3.0)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.6/site-packages (from 
Deprecated->kfp>=0.1.29->mlrun) (1.11.2)\n", + "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (0.5.0)\n", + "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.2.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.2.8)\n", + "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0.0)\n", + "Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (0.15.7)\n", + "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (1.4.0)\n", + "Requirement already satisfied: cffi!=1.11.3,>=1.8 in /usr/local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (1.13.2)\n", + "Requirement already satisfied: smmap2>=2.0.0 in /usr/local/lib/python3.6/site-packages (from gitdb2>=2.0.0->GitPython>=2.1.0->mlrun) (2.0.5)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/site-packages (from Jinja2>=2.10.1->Flask>=1.1.1->mlrun) (1.1.1)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/site-packages (from botocore<1.15.0,>=1.14.5->boto3>=1.9->mlrun) (0.15.2)\n", + "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/site-packages (from terminado>=0.8.1->notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n", + 
"Requirement already satisfied: json5 in /usr/local/lib/python3.6/site-packages (from jupyterlab-server~=1.0.0->jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.5)\n", + "Requirement already satisfied: parso>=0.5.2 in /usr/local/lib/python3.6/site-packages (from jedi>=0.10->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.5.2)\n", + "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.8)\n", + "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/site-packages (from bleach->nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.5.1)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/site-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (3.1.0)\n", + "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.16.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.4.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/site-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (2.0.0)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp>=0.1.29->mlrun) (2.19)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.51.0)\n", + "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/site-packages (from 
google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (3.11.2)\n", + "Requirement already satisfied: more-itertools in /usr/local/lib/python3.6/site-packages (from zipp>=0.5->importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (8.1.0)\n", + "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n", + "\u001b[36mINFO\u001b[0m[0103] Taking snapshot of full filesystem... \n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xfn.deploy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also note that the build time can be reduced if you specify a pre-built image with all required packages." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# useful constants\n", + "target_path = '/User/mlrun/functions/parquet'\n", + "archive = 'https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz'\n", + "parquet_file = 'x_test_50.parquet' # the file extension is not necessary\n", + "parquet_file_path = target_path + \"/\" + parquet_file\n", + "artifact_key = 'raw_data'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[mlrun] 2020-01-20 05:38:21,743 starting run arc2parq uid=42af41d93f294cd09aace4942d25b106 -> http://mlrun-api:8080\n", + "[mlrun] 2020-01-20 05:38:21,823 Job is running in the background, pod: arc2parq-96gmq\n", + "[mlrun] 2020-01-20 05:38:37,072 destination file exists\n", + "[mlrun] 2020-01-20 05:38:37,072 logging /User/mlrun/functions/parquet/x_test_50.parquet to context\n", + "[mlrun] 2020-01-20 05:38:37,083 log artifact raw_data at 
/User/mlrun/functions/parquet/x_test_50.parquet, size: None, db: Y\n", + "\n", + "[mlrun] 2020-01-20 05:38:37,094 run executed, status=completed\n", + "final state: succeeded\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uiditerstartstatenamelabelsinputsparametersresultsartifacts
...25b106
0Jan 20 05:38:37completedarc_to_parquet
host=arc2parq-96gmq
kind=job
owner=admin
archive_url=https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz
key=raw_data
name=x_test_50.parquet
target_path=/User/mlrun/functions/parquet
raw_data
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 42af41d93f294cd09aace4942d25b106 , !mlrun logs 42af41d93f294cd09aace4942d25b106 \n", + "[mlrun] 2020-01-20 05:38:40,974 run executed, status=completed\n" + ] + } + ], + "source": [ + "# create and run the task\n", + "arc_to_parq_task = mlrun.NewTask(\n", + " 'arc2parq', \n", + " handler='arc_to_parquet', # a string since we are calling this 'remotely', outside this notebook\n", + " params={\n", + " 'target_path': target_path,\n", + " 'name' : parquet_file, \n", + " 'key' : artifact_key,\n", + " 'archive_url': archive},\n", + " outputs=[artifact_key])\n", + "\n", + "# run\n", + "run = xfn.run(arc_to_parq_task)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### tests" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# add more context tests\n", + "# convert these to real tests" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "assert artifact_key in run.outputs.keys(), f\"mlrun.functions: key {artifact_key} not fond in outputs\"\n", + "assert os.path.isfile(parquet_file_path), f\"mlrun.functions: artifact source not found at {parquet_file_path}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "original = pd.read_csv(archive)\n", + "copied = pd.read_parquet(parquet_file_path, engine=\"pyarrow\")\n", + "assert np.array_equal(original, copied), \"mlrun.functions: 
original and copied data not equal\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "os.remove(parquet_file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}