diff --git a/.gitignore b/.gitignore
index db580678d..eb4858686 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@ models/
.ipynb_checkpoints
*.gz
*.csv
+*.pyc
+*.swp
+dask-worker-space
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 0ff9cf764..000000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
- "python.pythonPath": "/home/yasha/anaconda3/bin/python"
-}
\ No newline at end of file
diff --git a/datagen/README.md b/datagen/README.md
new file mode 100644
index 000000000..a0bafad15
--- /dev/null
+++ b/datagen/README.md
@@ -0,0 +1,9 @@
+# data generators
+
+## classification
+
+**`binary`** generates a sample binary classification dataset and saves it as a parquet file
+
+## splitters
+
+**`train_valid_test`** given a raw dataset, creates train, validation and test splits and saves the results
\ No newline at end of file
diff --git a/datagen/binary_classes/binary.py b/datagen/binary/function.py
similarity index 84%
rename from datagen/binary_classes/binary.py
rename to datagen/binary/function.py
index 715ab3d4c..d6806f007 100644
--- a/datagen/binary_classes/binary.py
+++ b/datagen/binary/function.py
@@ -1,4 +1,4 @@
-n_samp# Copyright 2019 Iguazio
+# Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -22,21 +22,21 @@
def create_binary_classification(
- context: MLClientCtx = None,
- n_samples: int = 100_000,
- m_features: int = 20,
- features_hdr: Optional[List[str]] = None,
- weight: float = 0.50,
- random_state=1,
- filename: Optional[str] = None,
- target_path: str = "",
- key: str = "",
- **sk_params,
+ context : MLClientCtx = None,
+ n_samples : int = 100_000,
+ m_features : int = 20,
+ features_hdr : Optional[List[str]] = None,
+ weight : float = 0.50,
+ random_state : int =1,
+ filename : Optional[str] = None,
+ target_path : str = "",
+ key : str = ""
):
"""Create a binary classification sample dataset and save.
If no filename is given it will default to:
'simdata-{n_samples}X{m_features}.parquet'.
All of the scikit-learn parameters can be set using **sk_params
+
:param context: function context
:param n_samples: number of rows/samples
:param m_features: number of cols/features
@@ -46,7 +46,6 @@ def create_binary_classification(
:param filename: optional name for stored data file
:param target_path: destimation for file
:param key: key of data in artifact store
- :param sk_params: keyword arguments for scikit-learn's 'make_classification'
Returns filename of created data (includes path).
"""
# check directories exist and create filename if None:
@@ -54,15 +53,15 @@ def create_binary_classification(
if not filename:
name = f"simdata-{n_samples:0.0e}X{m_features}.parquet".replace("+", "")
filename = os.path.join(target_path, name)
-
+ else:
+ filename = os.path.join(target_path, filename)
+
features, labels = make_classification(
n_samples=n_samples,
n_features=m_features,
weights=[weight], # False
n_classes=2,
- random_state=random_state,
- **sk_params,
- )
+ random_state=random_state)
# make dataframes, add column names, concatenate (X, y)
X = pd.DataFrame(features)
diff --git a/datagen/binary/function.yaml b/datagen/binary/function.yaml
new file mode 100644
index 000000000..4f9721e1a
--- /dev/null
+++ b/datagen/binary/function.yaml
@@ -0,0 +1,18 @@
+kind: job
+metadata:
+ name: binary
+ tag: ''
+ hash: 0527f27939f7f6b39d435d9e62d484c0bab308c8
+ project: ''
+spec:
+ command: ''
+ args: []
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: ''
+ build:
+ functionSourceCode: IyBDb3B5cmlnaHQgMjAxOSBJZ3VhemlvCiMKIyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKIyB5b3UgbWF5IG5vdCB1c2UgdGhpcyBmaWxlIGV4Y2VwdCBpbiBjb21wbGlhbmNlIHdpdGggdGhlIExpY2Vuc2UuCiMgWW91IG1heSBvYnRhaW4gYSBjb3B5IG9mIHRoZSBMaWNlbnNlIGF0CiMKIyAgIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMAojCiMgVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQojIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuICJBUyBJUyIgQkFTSVMsCiMgV0lUSE9VVCBXQVJSQU5USUVTIE9SIENPTkRJVElPTlMgT0YgQU5ZIEtJTkQsIGVpdGhlciBleHByZXNzIG9yIGltcGxpZWQuCiMgU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAojIGxpbWl0YXRpb25zIHVuZGVyIHRoZSBMaWNlbnNlLgppbXBvcnQgb3MKaW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgcHlhcnJvdyBhcyBwYQppbXBvcnQgcHlhcnJvdy5wYXJxdWV0IGFzIHBxCmZyb20gdHlwaW5nIGltcG9ydCBPcHRpb25hbCwgTGlzdCwgQW55CmZyb20gc2tsZWFybi5kYXRhc2V0cyBpbXBvcnQgbWFrZV9jbGFzc2lmaWNhdGlvbgoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CgoKZGVmIGNyZWF0ZV9iaW5hcnlfY2xhc3NpZmljYXRpb24oCiAgICBjb250ZXh0IDogTUxDbGllbnRDdHggPSBOb25lLAogICAgbl9zYW1wbGVzIDogaW50ID0gMTAwXzAwMCwKICAgIG1fZmVhdHVyZXMgOiBpbnQgPSAyMCwKICAgIGZlYXR1cmVzX2hkciA6IE9wdGlvbmFsW0xpc3Rbc3RyXV0gPSBOb25lLAogICAgd2VpZ2h0IDogZmxvYXQgPSAwLjUwLAogICAgcmFuZG9tX3N0YXRlIDogaW50ID0xLAogICAgZmlsZW5hbWUgOiBPcHRpb25hbFtzdHJdID0gTm9uZSwKICAgIHRhcmdldF9wYXRoIDogc3RyID0gIiIsCiAgICBrZXkgOiBzdHIgPSAiIgopOgogICAgIiIiQ3JlYXRlIGEgYmluYXJ5IGNsYXNzaWZpY2F0aW9uIHNhbXBsZSBkYXRhc2V0IGFuZCBzYXZlLgogICAgSWYgbm8gZmlsZW5hbWUgaXMgZ2l2ZW4gaXQgd2lsbCBkZWZhdWx0IHRvOgogICAgJ3NpbWRhdGEte25fc2FtcGxlc31Ye21fZmVhdHVyZXN9LnBhcnF1ZXQnLgogICAgQWxsIG9mIHRoZSBzY2lraXQtbGVhcm4gcGFyYW1ldGVycyBjYW4gYmUgc2V0IHVzaW5nICoqc2tfcGFyYW1zCiAgICAKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gbl9zYW1wbGVzOiAgICAgbnVtYmVyIG9mIHJvd3Mvc2FtcGxlcwogICAgOnBhcmFtIG1fZmVhdHVyZXM6ICAgIG51bWJlciBvZiBjb2xzL2ZlYXR1cmVzCiAgICA6cGFyYW0gZmVhdH
VyZXNfaGRyOiAgaGVhZGVyIGZvciBmZWF0dXJlcyBhcnJheQogICAgOnBhcmFtIHdlaWdodDogICAgICAgIGZyYWN0aW9uIG9mIHNhbXBsZSAobmVnKQogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogIHJuZyBzZWVkIChzZWUgaHR0cHM6Ly9zY2lraXQtbGVhcm4ub3JnL3N0YWJsZS9nbG9zc2FyeS5odG1sI3Rlcm0tcmFuZG9tLXN0YXRlKQogICAgOnBhcmFtIGZpbGVuYW1lOiAgICAgIG9wdGlvbmFsIG5hbWUgZm9yIHN0b3JlZCBkYXRhIGZpbGUKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogICBkZXN0aW1hdGlvbiBmb3IgZmlsZQogICAgOnBhcmFtIGtleTogICAgICAgICAgIGtleSBvZiBkYXRhIGluIGFydGlmYWN0IHN0b3JlCiAgICBSZXR1cm5zIGZpbGVuYW1lIG9mIGNyZWF0ZWQgZGF0YSAoaW5jbHVkZXMgcGF0aCkuCiAgICAiIiIKICAgICMgY2hlY2sgZGlyZWN0b3JpZXMgZXhpc3QgYW5kIGNyZWF0ZSBmaWxlbmFtZSBpZiBOb25lOgogICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGgsIGV4aXN0X29rPVRydWUpCiAgICBpZiBub3QgZmlsZW5hbWU6CiAgICAgICAgbmFtZSA9IGYic2ltZGF0YS17bl9zYW1wbGVzOjAuMGV9WHttX2ZlYXR1cmVzfS5wYXJxdWV0Ii5yZXBsYWNlKCIrIiwgIiIpCiAgICAgICAgZmlsZW5hbWUgPSBvcy5wYXRoLmpvaW4odGFyZ2V0X3BhdGgsIG5hbWUpCiAgICBlbHNlOgogICAgICAgIGZpbGVuYW1lID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBmaWxlbmFtZSkKICAgIAogICAgZmVhdHVyZXMsIGxhYmVscyA9IG1ha2VfY2xhc3NpZmljYXRpb24oCiAgICAgICAgbl9zYW1wbGVzPW5fc2FtcGxlcywKICAgICAgICBuX2ZlYXR1cmVzPW1fZmVhdHVyZXMsCiAgICAgICAgd2VpZ2h0cz1bd2VpZ2h0XSwgICMgRmFsc2UKICAgICAgICBuX2NsYXNzZXM9MiwKICAgICAgICByYW5kb21fc3RhdGU9cmFuZG9tX3N0YXRlKQoKICAgICMgbWFrZSBkYXRhZnJhbWVzLCBhZGQgY29sdW1uIG5hbWVzLCBjb25jYXRlbmF0ZSAoWCwgeSkKICAgIFggPSBwZC5EYXRhRnJhbWUoZmVhdHVyZXMpCiAgICBpZiBub3QgZmVhdHVyZXNfaGRyOgogICAgICAgIFguY29sdW1ucyA9IFsiZmVhdF8iICsgc3RyKHgpIGZvciB4IGluIHJhbmdlKG1fZmVhdHVyZXMpXQogICAgZWxzZToKICAgICAgICBYLmNvbHVtbnMgPSBmZWF0dXJlc19oZHIKCiAgICB5ID0gcGQuRGF0YUZyYW1lKGxhYmVscywgY29sdW1ucz1bImxhYmVscyJdKQogICAgZGF0YSA9IHBkLmNvbmNhdChbWCwgeV0sIGF4aXM9MSkKCiAgICBwcS53cml0ZV90YWJsZShwYS5UYWJsZS5mcm9tX3BhbmRhcyhkYXRhKSwgZmlsZW5hbWUpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChrZXksIHRhcmdldF9wYXRoPWZpbGVuYW1lKQo=
+ base_image: yjbds/mlrun-intel:dev
+ commands: []
+ code_origin: https://github.com/yjb-ds/functions.git#e4d74d784d42fb25cc75cbcab6d817bb1d2b150c:/User/repos/functions/datagen/classification/binary.py
diff --git a/datagen/train_valid_test/function.py b/datagen/train_valid_test/function.py
new file mode 100644
index 000000000..9a8976331
--- /dev/null
+++ b/datagen/train_valid_test/function.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import os
+import numpy as np
+import pyarrow.parquet as pq
+import pyarrow as pa
+from cloudpickle import dump
+
+import pyarrow.parquet as pq
+import pyarrow as pa
+
+from sklearn.model_selection import train_test_split
+from typing import Optional, Union
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def train_valid_test_splitter(
+    context: Optional[MLClientCtx] = None,
+    src_file: Union[DataItem, str] = '',
+    header: Union[DataItem, str, list] = '',
+    sample: int = -1,
+    label_column: str = 'labels',
+    test_size: float = 0.1,
+    train_val_split: float = 0.75,
+    target_path: str = '',
+    name: str = '',
+    key: str = '',
+    random_state = 1
+) -> None:
+    """Split raw data input into train, validation and test sets.
+
+    Reads a parquet file, separates the label column, splits the rows
+    twice and writes the column header plus six parquet files, logging
+    each of them as an artifact on `context`.
+
+    Output files are `<prefix>header.pkl` and
+    `<prefix>{x,y}{train,valid,test}.pqt` inside `target_path`, where
+    `<prefix>` is '-' + `name` when a name is given and empty otherwise.
+
+    :param context:         the function context, used to log artifacts
+    :param src_file:        name of the raw parquet file, joined onto
+                            `target_path`
+    :param header:          not used by this function
+    :param sample:          (-1) -1 keeps every row; n >= 1 keeps the first
+                            n rows; n < -1 draws a random sample of -n rows
+                            from the entire file
+    :param label_column:    ('labels') column holding ground-truth (y) labels
+    :param test_size:       (0.1) test set size
+    :param train_val_split: (0.75) once the test set has been removed the
+                            training set gets this proportion
+    :param target_path:     folder location of the source and output files
+    :param name:            optional prefix for the output file names
+    :param key:             not used by this function
+    :param random_state:    (1) rng seed for the random sample and splits
+    """
+    srcfilepath = os.path.join(target_path, str(src_file))
+    raw = pq.read_table(srcfilepath).to_pandas()
+
+    if sample != -1 and sample < 1:
+        # grab a random sample, seeded so that reruns are reproducible
+        raw = raw.sample(sample * -1, random_state=random_state)
+
+    labels = raw.pop(label_column)
+
+    if sample >= 1:
+        # contiguous sample starting at the first row. sample == -1 means
+        # "all rows" and must not reach this slice: [:-1] drops the last row
+        raw = raw.iloc[:sample, :]
+        labels = labels.iloc[:sample]
+
+    # double split to generate 3 data sets: train, validation and test
+    x, xtest, y, ytest = train_test_split(
+        raw, labels, test_size=test_size, random_state=random_state)
+
+    xtrain, xvalid, ytrain, yvalid = train_test_split(
+        x, y, train_size=train_val_split, random_state=random_state)
+
+    if name:
+        name = '-' + name
+
+    # save header; the context manager closes the file even if dump fails
+    f = os.path.join(target_path, name + 'header.pkl')
+    with open(f, 'wb') as fp:
+        dump(raw.columns.values, fp)
+    context.log_artifact('header', target_path=f)
+
+    # save data sets: features are written as they are, labels are wrapped
+    # in a single-column 'labels' frame; one file and artifact per split
+    splits = (
+        ('xtrain', xtrain),
+        ('xvalid', xvalid),
+        ('xtest', xtest),
+        ('ytrain', pd.DataFrame({'labels': ytrain})),
+        ('yvalid', pd.DataFrame({'labels': yvalid})),
+        ('ytest', pd.DataFrame({'labels': ytest})),
+    )
+    for split_key, frame in splits:
+        f = os.path.join(target_path, name + split_key + '.pqt')
+        frame.to_parquet(f)
+        context.log_artifact(split_key, target_path=f)
diff --git a/datagen/train_valid_test/function.yaml b/datagen/train_valid_test/function.yaml
new file mode 100644
index 000000000..f413e1840
--- /dev/null
+++ b/datagen/train_valid_test/function.yaml
@@ -0,0 +1,18 @@
+kind: job
+metadata:
+ name: train-valid-test
+ tag: ''
+ hash: a20a8322b51297f4491727c3a2beb3b3ec505999
+ project: ''
+spec:
+ command: ''
+ args: []
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: ''
+ build:
+ functionSourceCode: aW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgb3MKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBweWFycm93LnBhcnF1ZXQgYXMgcHEKaW1wb3J0IHB5YXJyb3cgYXMgcGEKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgZHVtcAoKaW1wb3J0IHB5YXJyb3cucGFycXVldCBhcyBwcQppbXBvcnQgcHlhcnJvdyBhcyBwYQoKZnJvbSBza2xlYXJuLm1vZGVsX3NlbGVjdGlvbiBpbXBvcnQgdHJhaW5fdGVzdF9zcGxpdApmcm9tIHR5cGluZyBpbXBvcnQgT3B0aW9uYWwsIFVuaW9uCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIG1scnVuLmRhdGFzdG9yZSBpbXBvcnQgRGF0YUl0ZW0KCmltcG9ydCB3YXJuaW5ncwp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSdpZ25vcmUnLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKZGVmIHRyYWluX3ZhbGlkX3Rlc3Rfc3BsaXR0ZXIoCiAgICBjb250ZXh0OiBPcHRpb25hbFtNTENsaWVudEN0eF0gPSBOb25lLAogICAgc3JjX2ZpbGU6IFVuaW9uW0RhdGFJdGVtLCBzdHJdID0gJycsCiAgICBoZWFkZXI6IFVuaW9uW0RhdGFJdGVtLCBzdHIsIGxpc3RdID0gJycsCiAgICBzYW1wbGU6IGludCA9IC0xLAogICAgbGFiZWxfY29sdW1uOiBzdHIgPSAnbGFiZWxzJywKICAgIHRlc3Rfc2l6ZTogZmxvYXQgPSAwLjEsCiAgICB0cmFpbl92YWxfc3BsaXQ6IGZsb2F0ID0gMC43NSwKICAgIHRhcmdldF9wYXRoOiBzdHIgPSAnJywKICAgIG5hbWU6IHN0ciA9ICcnLAogICAga2V5OiBzdHIgPSAnJywKICAgIHJhbmRvbV9zdGF0ZSA9IDEKKSAtPiBOb25lOgogICAgIiIiU3BsaXQgcmF3IGRhdGEgaW5wdXQgaW50byB0cmFpbiwgdmFsaWRhdGlvbiBhbmQgdGVzdCBzZXRzLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIHRoZSBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gc3JjX2ZpbGU6ICAgICAgICAoJ3JhdycpIG5hbWUgb2YgcmF3IGRhdGEgZmlsZQogICAgOnBhcmFtIGhlYWRlcjogICAgICAgICAgKE5vbmUpIGhlYWRlciBhcnRpZmFjdCBvciBsaXN0IG9mIGNvbHVtbiBuYW1lcy4KICAgIDpwYXJhbSBzYW1wbGU6ICAgICAgICAgICgtMSkuIFNlbGVjdHMgdGhlIGZpcnN0IG4gcm93cywgb3Igc2VsZWN0IGEgc2FtcGxlIHN0YXJ0aW5nCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBmcm9tIHRoZSBmaXJzdC4gSWYgbmVnYXRpdmUgPC0xLCBzZWxlY3QgYSByYW5kb20gc2FtcGxlIGZyb20gCiAgICAgICAgICAgICAgICAgICAgICAgICAgICB0aGUgZW50aXJlIGZpbGUKICAgIDpwYXJhbSBsYWJlbF9jb2x1bW46ICAgIGdyb3VuZC10cnV0aCAoeSkgbGFiZWxzCiAgICA6cGFyYW0gdGVzdF9zaXplOiAgICAgICAoMC4xKSB0ZXN0IHNldCBzaXplCiAgICA6cGFyYW0gdHJhaW5fdmFsX3NwbGl0OiAoMC43NSkgT25jZSB0aGUgdGVzdCBzZXQgaGFzIGJlZW4gcmVtb3ZlZCB0aGUgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbm
luZyBzZXQgZ2V0cyB0aGlzIHByb3BvcnRpb24uCiAgICA6cGFyYW0gdGFyZ2V0X3BhdGg6ICAgICBmb2xkZXIgbG9jYXRpb24gb2YgZmlsZXMKICAgIDpwYXJhbSBuYW1lOiAgICAgICAgICAgIGRlc3RpbmF0aW9uIHByZWZpeCBuYW1lIGZvciBtb2RlbCBmaWxlcwogICAgOnBhcmFtIGtleTogICAgICAgICAgICAga2V5IGZvciBtb2RlbCBhcnRpZmFjdAogICAgOnBhcmFtIHJhbmRvbV9zdGF0ZTogICAgKDEpIHNrbGVhcm4gcm5nIHNlZWQKICAgICIiIgogICAgc3JjZmlsZXBhdGggPSBvcy5wYXRoLmpvaW4odGFyZ2V0X3BhdGgsIHN0cihzcmNfZmlsZSkpCgogICAgaWYgKHNhbXBsZSA9PSAtMSkgb3IgKHNhbXBsZSA+PSAxKToKICAgICAgICAjIGdldCBhbGwgcm93cywgb3IgY29udGlndW91cyBzYW1wbGUgc3RhcnRpbmcgYXQgcm93IDEuCiAgICAgICAgcmF3ID0gcHEucmVhZF90YWJsZShzcmNmaWxlcGF0aCkudG9fcGFuZGFzKCkKICAgICAgICBsYWJlbHMgPSByYXcucG9wKGxhYmVsX2NvbHVtbikKICAgICAgICByYXcgPSByYXcuaWxvY1s6c2FtcGxlLCA6XQogICAgICAgIGxhYmVscyA9IGxhYmVscy5pbG9jWzpzYW1wbGVdCiAgICBlbHNlOgogICAgICAgICMgZ3JhYiBhIHJhbmRvbSBzYW1wbGUKICAgICAgICAjcmF3ID0gcGQucmVhZF9wYXJxdWV0KHNyY2ZpbGVwYXRoLCBlbmdpbmU9J3B5YXJyb3cnKS5zYW1wbGUoc2FtcGxlKi0xKQogICAgICAgIHJhdyA9IHBxLnJlYWRfdGFibGUoc3JjZmlsZXBhdGgpLnRvX3BhbmRhcygpLnNhbXBsZShzYW1wbGUqLTEpCiAgICAgICAgbGFiZWxzID0gcmF3LnBvcChsYWJlbF9jb2x1bW4pCiAgICAKICAgICMgZG91YmxlIHNwbGl0IHRwIGdlbmVyYXRlIDMgZGF0YSBzZXRzOiB0cmFpbiwgdmFsaWRhdGlvbiBhbmQgdGVzdAogICAgeCwgeHRlc3QsIHksIHl0ZXN0ID0gdHJhaW5fdGVzdF9zcGxpdChyYXcsIGxhYmVscywgdGVzdF9zaXplPXRlc3Rfc2l6ZSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHJhbmRvbV9zdGF0ZT1yYW5kb21fc3RhdGUpCiAgIAogICAgeHRyYWluLCB4dmFsaWQsIHl0cmFpbiwgeXZhbGlkID0gdHJhaW5fdGVzdF9zcGxpdCh4LCB5LCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdHJhaW5fc2l6ZT10cmFpbl92YWxfc3BsaXQsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICByYW5kb21fc3RhdGU9cmFuZG9tX3N0YXRlKSAgICAgICAgCgogICAgaWYgbmFtZToKICAgICAgICBuYW1lID0gJy0nICsgbmFtZQogICAgCiAgICAjIHNhdmUgaGVhZGVyCiAgICBmID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lICsgJ2hlYWRlci5wa2wnKQogICAgZHVtcChyYXcuY29sdW1ucy52YWx1ZXMsIG9wZW4oZiwgJ3diJykpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgnaGVhZGVyJywgdGFyZ2V0X3BhdGg9ZikKICAgIAogICAgIy
BzYXZlIGRhdGEgc2V0cwogICAgZiA9IG9zLnBhdGguam9pbih0YXJnZXRfcGF0aCwgbmFtZSArICd4dHJhaW4ucHF0JykKICAgIHh0cmFpbi50b19wYXJxdWV0KGYpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgneHRyYWluJywgdGFyZ2V0X3BhdGg9ZikKICAgIAogICAgZiA9IG9zLnBhdGguam9pbih0YXJnZXRfcGF0aCwgbmFtZSArICd4dmFsaWQucHF0JykKICAgIHh2YWxpZC50b19wYXJxdWV0KGYpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdCgneHZhbGlkJywgdGFyZ2V0X3BhdGg9ZikKICAgIAogICAgZiA9IG9zLnBhdGguam9pbih0YXJnZXRfcGF0aCwgbmFtZSArICd4dGVzdC5wcXQnKQogICAgeHRlc3QudG9fcGFycXVldChmKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoJ3h0ZXN0JywgdGFyZ2V0X3BhdGg9ZikKICAgIAogICAgZiA9IG9zLnBhdGguam9pbih0YXJnZXRfcGF0aCwgbmFtZSArICd5dHJhaW4ucHF0JykKICAgIHBkLkRhdGFGcmFtZSh7J2xhYmVscyc6IHl0cmFpbn0pLnRvX3BhcnF1ZXQoZikKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KCd5dHJhaW4nLCB0YXJnZXRfcGF0aD1mKQogICAgCiAgICBmID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lICsgJ3l2YWxpZC5wcXQnKQogICAgcGQuRGF0YUZyYW1lKHsnbGFiZWxzJzogeXZhbGlkfSkudG9fcGFycXVldChmKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoJ3l2YWxpZCcsIHRhcmdldF9wYXRoPWYpCiAgICAKICAgIGYgPSBvcy5wYXRoLmpvaW4odGFyZ2V0X3BhdGgsIG5hbWUgKyAneXRlc3QucHF0JykKICAgIHBkLkRhdGFGcmFtZSh7J2xhYmVscyc6IHl0ZXN0fSkudG9fcGFycXVldChmKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoJ3l0ZXN0JywgdGFyZ2V0X3BhdGg9ZikKICAgIAogICAgY29udGV4dC5sb2dnZXIuaW5mbygnbnVtcHknLCBucC5fX3ZlcnNpb25fXykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ3BhbmRhcyAnLCBwZC5fX3ZlcnNpb25fXykKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ3B5YXJyb3cnLCBwYS5fX3ZlcnNpb25fXyk=
+ base_image: yjbds/mlrun-intel:dev
+ commands: []
+ code_origin: https://github.com/yjb-ds/functions.git#e613e55761fd1ed325ad88155877924aa5b49ccc:/User/repos/functions/datagen/splitters/train_valid_test.py
diff --git a/evaluation/test-classifier.py b/evaluation/test-classifier.py
new file mode 100644
index 000000000..80a71ad03
--- /dev/null
+++ b/evaluation/test-classifier.py
@@ -0,0 +1,177 @@
+import os
+import importlib
+from cloudpickle import load
+
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+
+from sklearn.metrics import (roc_curve, confusion_matrix)
+from sklearn.model_selection import train_test_split
+
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure
+import seaborn as sns
+
+from typing import Optional, Union, List
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+from mlrun.artifacts import TableArtifact, PlotArtifact
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def test_model(
+    context: Optional[MLClientCtx],
+    model: Union[DataItem, str],
+    xtest,
+    ytest,
+    target_path: str = '',
+    name: str = '',
+    key: str = '',
+    random_state = 1
+) -> None:
+    """Test a classifier model
+
+    Using held-out test features, predicts labels with the unpickled model
+    and logs evaluation plots: always a confusion matrix, an ROC curve when
+    the model has a callable `predict_proba`, and feature importances when
+    it has a `feature_importances_` attribute.
+
+    Can be part of a kubeflow pipeline as a test step or called directly.
+
+    :param context:         the function context
+    :param model:           estimated model file name as artifact store item
+                            or pickle file name
+    :param xtest:           test features, artifact store item or parquet file
+    :param ytest:           test labels, artifact store item or parquet file
+    :param target_path:     folder location of files
+    :param name:            not used by this function
+    :param key:             not used by this function
+    :param random_state:    not used by this function
+    """
+    # load model; NOTE(review): unpickling runs code, load trusted files only
+    with open(str(model), 'rb') as fp:
+        clf = load(fp)
+
+    if isinstance(xtest, DataItem):
+        xtest, ytest = str(xtest), str(ytest)
+    xtest = pd.read_parquet(xtest)
+    ytest = pd.read_parquet(ytest)
+
+    # getattr needs a default here: without one a model lacking predict_proba
+    # raises AttributeError and the predict() fallback below is unreachable
+    if callable(getattr(clf, 'predict_proba', None)):
+        ypred_probs = clf.predict_proba(xtest)[:, 1]
+        ypred = np.where(ypred_probs >= 0.5, 1, 0)
+        plot_roc(context, ytest, ypred_probs, target_path)
+    else:
+        ypred = clf.predict(xtest)
+
+    plot_confusion_matrix(context, ytest, ypred, target_path)
+
+    if hasattr(clf, 'feature_importances_'):
+        plot_importance(context, clf, xtest.columns.values, target_path)
+
+def _gcf_clear(plt):
+    """Reset plotting state: clear the axes, clear the figure, close it."""
+    for step in ('cla', 'clf', 'close'):
+        getattr(plt, step)()
+
+def plot_roc(
+    context: MLClientCtx,
+    y_labels,
+    y_probs,
+    target_path: str = '',
+    name='roc.png',
+    key='roc',
+    fmt='png'):
+    """Draw the ROC curve for a set of predictions, save it and log it.
+
+    :param context:     function context, receives the plot artifact
+    :param y_labels:    test data labels
+    :param y_probs:     scores passed to `roc_curve` with the labels
+    :param target_path: folder the image file is written to
+    :param name:        ('roc.png') image file name
+    :param key:         ('roc') key of the logged plot artifact
+    :param fmt:         ('png') image format handed to `savefig`
+    """
+    false_pos, true_pos, _thresholds = roc_curve(y_labels, y_probs)
+    plt.plot([0, 1], [0, 1], "k--")  # dashed diagonal for reference
+    plt.plot(false_pos, true_pos, label="roc")
+    plt.title("roc curve")
+    plt.xlabel("false positive rate")
+    plt.ylabel("true positive rate")
+    plt.legend(loc="best")
+    figure = plt.gcf()
+    figure.savefig(os.path.join(target_path, name), format=fmt)
+    context.log_artifact(PlotArtifact(key, body=figure))
+    _gcf_clear(plt)
+
+def plot_confusion_matrix(
+    context: MLClientCtx,
+    labels,
+    predictions,
+    target_path: str = '',
+    name: str = "confusion.png",
+    key: str = 'confusion_matrix',
+    fmt: str = 'png'
+):
+    """Render a confusion matrix heatmap, save it and log it.
+
+    The matrix is computed with `normalize='all'` before it is drawn.
+
+    :param context:     function context, receives the plot artifact
+    :param labels:      test data labels
+    :param predictions: test data predictions
+    :param target_path: folder the image file is written to
+    :param name:        ("confusion.png") image file name
+    :param key:         ('confusion_matrix') key of the plot artifact
+    :param fmt:         ('png') image format handed to `savefig`
+    """
+    matrix = confusion_matrix(
+        labels, predictions, sample_weight=None, normalize='all')
+    sns.heatmap(matrix, annot=True, cmap="Blues")
+    figure = plt.gcf()
+    figure.savefig(os.path.join(target_path, name), format=fmt)
+    context.log_artifact(PlotArtifact(key, body=figure))
+    _gcf_clear(plt)
+
+def plot_importance(
+    context,
+    model,
+    header: List = [],
+    target_path: str = '',
+    name: str = 'feature-importances.png',
+    key: str = 'feature-importances',
+    fmt = 'png'
+):
+    """Plot feature importances and save them as a table, logging both.
+
+    :param context:     function context, receives both artifacts
+    :param model:       fitted model with a `feature_importances_` attribute
+    :param header:      list of feature names, zipped with the importances
+    :param target_path: folder the plot and table files are written to
+    :param name:        ('feature-importances.png') plot file name
+    :param key:         ('feature-importances') prefix of both artifact keys
+    :param fmt:         ('png') image format of the saved plot
+    """
+    zipped = zip(model.feature_importances_, header)
+    feature_imp = pd.DataFrame(sorted(zipped), columns=['freq', 'feature']
+                               ).sort_values(by="freq", ascending=False)
+
+    plt.figure(figsize=(20, 10))
+    sns.barplot(x="freq", y="feature", data=feature_imp)
+    plt.title('LightGBM Features')
+    plt.tight_layout()
+    fig = plt.gcf()
+    # honour `fmt` like the other plot helpers instead of forcing 'png'
+    fig.savefig(os.path.join(target_path, name), format=fmt)
+    context.log_artifact(PlotArtifact(key + '-plot', body=fig))
+
+    # feature importances are also saved as a table:
+    tablepath = os.path.join(target_path, key + '-table.csv')
+    feature_imp.to_csv(tablepath)
+    context.log_artifact(TableArtifact(key + '-table', target_path=tablepath))
+    _gcf_clear(plt)
diff --git a/evaluation/test-classifier.yaml b/evaluation/test-classifier.yaml
new file mode 100644
index 000000000..73f7cd29a
--- /dev/null
+++ b/evaluation/test-classifier.yaml
@@ -0,0 +1,19 @@
+kind: job
+metadata:
+ name: test-classifier
+ tag: ''
+ hash: 2946ecad9f24c488575cc7b4476528df09027080
+ project: ''
+spec:
+ command: ''
+ args: []
+ image: yjbds/mlrun-daskboost:latest
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: ''
+ build:
+ functionSourceCode: aW1wb3J0IG9zCmltcG9ydCBpbXBvcnRsaWIKZnJvbSBjbG91ZHBpY2tsZSBpbXBvcnQgbG9hZAoKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IGxpZ2h0Z2JtIGFzIGxnYgoKZnJvbSBza2xlYXJuLm1ldHJpY3MgaW1wb3J0IChyb2NfY3VydmUsIGNvbmZ1c2lvbl9tYXRyaXgpCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKCmltcG9ydCBtYXRwbG90bGliLnB5cGxvdCBhcyBwbHQKZnJvbSBtYXRwbG90bGliLmZpZ3VyZSBpbXBvcnQgRmlndXJlCmltcG9ydCBzZWFib3JuIGFzIHNucwoKZnJvbSB0eXBpbmcgaW1wb3J0IE9wdGlvbmFsLCBVbmlvbiwgTGlzdAoKZnJvbSBtbHJ1bi5leGVjdXRpb24gaW1wb3J0IE1MQ2xpZW50Q3R4CmZyb20gbWxydW4uZGF0YXN0b3JlIGltcG9ydCBEYXRhSXRlbQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgVGFibGVBcnRpZmFjdCwgUGxvdEFydGlmYWN0CgppbXBvcnQgd2FybmluZ3MKd2FybmluZ3Muc2ltcGxlZmlsdGVyKGFjdGlvbj0naWdub3JlJywgY2F0ZWdvcnk9RnV0dXJlV2FybmluZykKCmRlZiB0ZXN0X21vZGVsKAogICAgY29udGV4dDogT3B0aW9uYWxbTUxDbGllbnRDdHhdLAogICAgbW9kZWw6IFVuaW9uW0RhdGFJdGVtLCBzdHJdLAogICAgeHRlc3QsIAogICAgeXRlc3QsCiAgICB0YXJnZXRfcGF0aDogc3RyID0gJycsCiAgICBuYW1lOiBzdHIgPSAnJywKICAgIGtleTogc3RyID0gJycsCiAgICByYW5kb21fc3RhdGUgPSAxCikgLT4gTm9uZToKICAgICIiIlRlc3QgYSBjbGFzc2lmaWVyIG1vZGVsCiAgICAKICAgIFVzaW5nIGhlbGQtb3V0IHRlc3QgZmVhdHVyZXMsIGNhbGxzIGBtb2RlbC5wcmVkaWN0KHh0ZXN0KWAgYW5kIGV2YWx1YXRlcyB0aGUgYWNjdXJhY3kgb2YgdGhlIAogICAgZXN0aW1hdGVkIG1vZGVsLgogICAgCiAgICBDYW4gYmUgcGFydCBvZiBhIGt1YmVmbG93IHBpcGVsaW5lIGFzIGEgdGVzdCBzdGVwIG9yIGNhbGxlZAogICAgCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIG1vZGVsOiAgICAgICAgICAgZXN0aW1hdGVkIG1vZGVsIGZpbGUgbmFtZSBhcyBhcnRpZmFjdCBzdG9yZSBpdGVtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBvciBwaWNrbGUgZmlsZSBuYW1lCiAgICA6cGFyYW0geHRlc3Q6ICAgICAgICAgICB0ZXN0IGZlYXR1cmVzIGZpbGUgbmFtZSBhcyBhcnRpZmFjdCBzdG9yZSBpdGVtCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBvciBwaWNrbGUgZmlsZSBuYW1lCiAgICA6cGFyYW0gaGVhZGVyOiAgICAgICAgICAoT3B0aW9uYWwpIHVzZSBpZiB4dGVzdCBkb2VzIG5vdCBoYXZlIGEgaGVhZGVyCiAgICA6cGFyYW0geXRlc3Q6ICAgICAgICAgICB0ZXN0IGxhYmVscyBmaWxlIG5hbWUgYXMgYXJ0aWZhY3Qgc3RvcmUgCiAgICAgICAgICAgICAgICAgICAgICAgICAgIC
BpdGVtIG9yIHBpY2tsZSBmaWxlIG5hbWUKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogICAgIGZvbGRlciBsb2NhdGlvbiBvZiBmaWxlcwogICAgOnBhcmFtIG5hbWU6ICAgICAgICAgICAgZGVzdGluYXRpb24gbmFtZSBmb3IgdGVzdCByZXN1bHRzCiAgICA6cGFyYW0ga2V5OiAgICAgICAgICAgICBrZXkgZm9yIG1vZGVsIGFydGlmYWN0CiAgICAiIiIKICAgICMgbG9hZCBtb2RlbCBhbmQgZGF0YQogICAgaWYgaXNpbnN0YW5jZShtb2RlbCwgRGF0YUl0ZW0pOgogICAgICAgIGNsZiA9IGxvYWQob3BlbihzdHIobW9kZWwpLCAncmInKSkKICAgIGVsc2U6CiAgICAgICAgY2xmID0gbG9hZChvcGVuKG1vZGVsLCAncmInKSkKCiAgICBpZiBpc2luc3RhbmNlKHh0ZXN0LCBEYXRhSXRlbSk6CiAgICAgICAgeHRlc3QgPSBwZC5yZWFkX3BhcnF1ZXQoc3RyKHh0ZXN0KSkKICAgICAgICB5dGVzdCA9IHBkLnJlYWRfcGFycXVldChzdHIoeXRlc3QpKQogICAgZWxzZToKICAgICAgICB4dGVzdCA9IHBkLnJlYWRfcGFycXVldCh4dGVzdCkKICAgICAgICB5dGVzdCA9IHBkLnJlYWRfcGFycXVldCh5dGVzdCkKICAgIAogICAgaWYgY2FsbGFibGUoZ2V0YXR0cihjbGYsICdwcmVkaWN0X3Byb2JhJykpOgogICAgICAgIHlwcmVkX3Byb2JzID0gY2xmLnByZWRpY3RfcHJvYmEoeHRlc3QpWzosIDFdCiAgICAgICAgeXByZWQgPSBucC53aGVyZSh5cHJlZF9wcm9icyA+PSAwLjUsIDEsIDApCiAgICAgICAgcGxvdF9yb2MoY29udGV4dCwgeXRlc3QsIHlwcmVkX3Byb2JzLCB0YXJnZXRfcGF0aCkKICAgIGVsc2U6CiAgICAgICAgeXByZWQgPSBjbGYucHJlZGljdCh4dGVzdCkKICAgICAgICB5cHJlZF9wcm9icyA9IE5vbmUKICAgIAogICAgcGxvdF9jb25mdXNpb25fbWF0cml4KGNvbnRleHQsIHl0ZXN0LCB5cHJlZCwgdGFyZ2V0X3BhdGgpCgogICAgaWYgaGFzYXR0cihjbGYsICdmZWF0dXJlX2ltcG9ydGFuY2VzXycpOgogICAgICAgIHBsb3RfaW1wb3J0YW5jZShjb250ZXh0LCBjbGYsIHh0ZXN0LmNvbHVtbnMudmFsdWVzLCB0YXJnZXRfcGF0aCkKCmRlZiBfZ2NmX2NsZWFyKHBsdCk6CiAgICBwbHQuY2xhKCkKICAgIHBsdC5jbGYoKQogICAgcGx0LmNsb3NlKCkgICAgICAgIAoKZGVmIHBsb3Rfcm9jKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsIAogICAgeV9sYWJlbHMsCiAgICB5X3Byb2JzLAogICAgdGFyZ2V0X3BhdGg6IHN0ciA9ICcnLAogICAgbmFtZT0ncm9jLnBuZycsCiAgICBrZXk9J3JvYycsCiAgICBmbXQ9J3BuZycKKToKICAgICIiIlBsb3QgYW4gUk9DIGN1cnZlIGZyb20gdGVzdCBkYXRhIHNhdmVkIGluIGFuIGFydGlmYWN0IHN0b3JlLgogICAgCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0geV9sYWJlbHM6ICAgICAgICB0ZXN0IGRhdGEgbGFiZWxzCiAgICA6cGFyYW0geV9wcm9iczogICAgICAgICB0ZXN0IGRhdGEgCiAgICAiIiIKICAgIGZwcl94ZywgdHByX3hnLCBfID0gcm9jX2
N1cnZlKHlfbGFiZWxzLCB5X3Byb2JzKQogICAgcGx0LnBsb3QoWzAsIDFdLCBbMCwgMV0sICJrLS0iKQogICAgcGx0LnBsb3QoZnByX3hnLCB0cHJfeGcsIGxhYmVsPSJyb2MiKQogICAgcGx0LnhsYWJlbCgiZmFsc2UgcG9zaXRpdmUgcmF0ZSIpCiAgICBwbHQueWxhYmVsKCJ0cnVlIHBvc2l0aXZlIHJhdGUiKQogICAgcGx0LnRpdGxlKCJyb2MgY3VydmUiKQogICAgcGx0LmxlZ2VuZChsb2M9ImJlc3QiKQogICAgZmlnID0gcGx0LmdjZigpCgogICAgcGxvdHBhdGggPSBvcy5wYXRoLmpvaW4odGFyZ2V0X3BhdGgsIG5hbWUpCiAgICBmaWcuc2F2ZWZpZyhwbG90cGF0aCwgZm9ybWF0PWZtdCkKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KFBsb3RBcnRpZmFjdChrZXksIGJvZHk9ZmlnKSkKCiAgICBfZ2NmX2NsZWFyKHBsdCkKCmRlZiBwbG90X2NvbmZ1c2lvbl9tYXRyaXgoCiAgICBjb250ZXh0OiBNTENsaWVudEN0eCwgCiAgICBsYWJlbHMsIAogICAgcHJlZGljdGlvbnMsCiAgICB0YXJnZXRfcGF0aDogc3RyID0gJycsIAogICAgbmFtZTogc3RyID0iY29uZnVzaW9uLnBuZyIsIAogICAga2V5OiBzdHIgPSdjb25mdXNpb25fbWF0cml4JywKICAgIGZtdDogc3RyID0gJ3BuZycKKToKICAgICIiIkNyZWF0ZSBhIGNvbmZ1c2lvbiBtYXRyaXguCiAgICBQbG90IGFuZCBzYXZlIGEgY29uZnVzaW9uIG1hdHJpeCB1c2luZyB0ZXN0IGRhdGEgZnJvbSBhCiAgICBwaXBlbGluZSBzdGVwLgoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgICAgIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSBsYWJlbHM6ICAgICAgICAgIHRlc3QgZGF0YSBsYWJlbHMKICAgIDpwYXJhbSBwcmVkaWN0aW9uczogICAgIHRlc3QgZGF0YSBwcmVkaWN0aW9ucwogICAgIiIiCiAgICBjbSA9IGNvbmZ1c2lvbl9tYXRyaXgobGFiZWxzLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgcHJlZGljdGlvbnMsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBzYW1wbGVfd2VpZ2h0PU5vbmUsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBub3JtYWxpemU9J2FsbCcpCiAgICBzbnMuaGVhdG1hcChjbSwgYW5ub3Q9VHJ1ZSwgY21hcD0iQmx1ZXMiKQogICAgcGxvdHBhdGggPSBvcy5wYXRoLmpvaW4odGFyZ2V0X3BhdGgsIG5hbWUpCiAgICBmaWcgPSBwbHQuZ2NmKCkKICAgIGZpZy5zYXZlZmlnKHBsb3RwYXRoLCBmb3JtYXQ9Zm10KQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoUGxvdEFydGlmYWN0KGtleSwgYm9keT1maWcpKQoKICAgIF9nY2ZfY2xlYXIocGx0KQoKZGVmIHBsb3RfaW1wb3J0YW5jZSgKICAgIGNvbnRleHQsCiAgICBtb2RlbCwKICAgIGhlYWRlcjogTGlzdCA9IFtdLAogICAgdGFyZ2V0X3BhdGg6IHN0ciA9ICcnLAogICAgbmFtZTogc3RyID0gJ2ZlYXR1cmUtaW1wb3J0YW5jZXMucG5nJywKICAgIGtleTogc3RyID0gJ2ZlYXR1cmUtaW1wb3J0YW5jZXMnLAogICAgZm10ID0gJ3BuZycKKToKICAgICIiIkRpc3BsYXkgZXN0aW1hdGVkIGZlYXR1cm
UgaW1wb3J0YW5jZXMuCgogICAgOnBhcmFtIGNvbnRleHQ6ICAgICBmdW5jdGlvbiBjb250ZXh0CiAgICA6cGFyYW0gbW9kZWw6ICAgICAgIGZpdHRlZCBsaWdodGdibSBtb2RlbAogICAgOnBhcmFtIGhlYWRlcjogICAgICBsaXN0IG9mIGZlYXR1cmUgbmFtZXMKICAgICIiIgogICAgIyBjcmVhdGUgYSBmZWF0dXJlIGltcG9ydGFuY2UgdGFibGUgd2l0aCBkZXNpcmVkIGxhYmVscwogICAgemlwcGVkID0gemlwKG1vZGVsLmZlYXR1cmVfaW1wb3J0YW5jZXNfLCBoZWFkZXIpCgogICAgZmVhdHVyZV9pbXAgPSBwZC5EYXRhRnJhbWUoc29ydGVkKHppcHBlZCksIGNvbHVtbnM9WydmcmVxJywnZmVhdHVyZSddCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgKS5zb3J0X3ZhbHVlcyhieT0iZnJlcSIsIGFzY2VuZGluZz1GYWxzZSkKCiAgICBwbHQuZmlndXJlKGZpZ3NpemU9KDIwLCAxMCkpCiAgICBzbnMuYmFycGxvdCh4PSJmcmVxIiwgeT0iZmVhdHVyZSIsIGRhdGE9ZmVhdHVyZV9pbXApCiAgICBwbHQudGl0bGUoJ0xpZ2h0R0JNIEZlYXR1cmVzJykKICAgIHBsdC50aWdodF9sYXlvdXQoKQogICAgZmlnID0gcGx0LmdjZigpCiAgICBwbG90cGF0aCA9IG9zLnBhdGguam9pbih0YXJnZXRfcGF0aCwgbmFtZSkKICAgIGZpZy5zYXZlZmlnKHBsb3RwYXRoLCBmb3JtYXQ9J3BuZycpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChQbG90QXJ0aWZhY3Qoa2V5ICsgJy1wbG90JywgYm9keT1maWcpKQoKICAgICMgZmVhdHVyZSBpbXBvcnRhbmNlcyBhcmUgYWxzbyBzYXZlZCBhcyBhIHRhYmxlOgogICAgdGFibGVwYXRoID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBrZXkgKyAnLXRhYmxlLmNzdicpCiAgICBmZWF0dXJlX2ltcC50b19jc3YodGFibGVwYXRoKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoVGFibGVBcnRpZmFjdChrZXkgKyAnLXRhYmxlJywgdGFyZ2V0X3BhdGg9dGFibGVwYXRoKSkKCiAgICAjIHRvIGVuc3VyZSB3ZSBkb24ndCBvdmVyd3JpdGUgdGhpcyBmaWd1cmUgd2hlbiBjcmVhdGluZyB0aGUgbmV4dDoKICAgIF9nY2ZfY2xlYXIocGx0KQo=
+ base_image: yjbds/mlrun-daskboost:dev
+ commands: []
+ code_origin: https://github.com/yjb-ds/functions.git#e613e55761fd1ed325ad88155877924aa5b49ccc:/User/repos/functions/evaluation/test-classifier.py
diff --git a/fileutils/README.md b/fileutils/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/fileutils/arc_to_parquet/arc_to_parquet.py b/fileutils/arc_to_parquet/arc_to_parquet.py
deleted file mode 100644
index 80e2a3b5a..000000000
--- a/fileutils/arc_to_parquet/arc_to_parquet.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-from pathlib import Path
-import pandas as pd
-import pyarrow.parquet as pq
-import pyarrow as pa
-
-from mlrun.execution import MLClientCtx
-from typing import IO, AnyStr, Union, List, Optional
-
-
-def arc_to_parquet(
- context: MLClientCtx,
- archive_url: Union[str, Path, IO[AnyStr]],
- header: Optional[List[str]] = None,
- target_path: str = "",
- name: str = "",
- chunksize: int = 10_000,
- log_data: bool = True,
- add_uid: bool = False,
- key: str = "raw_data",
-) -> None:
- """Open a file/object archive and save as a parquet file.
-
- :param context: function context
- :param archive_url: any valid string path consistent with the path variable
- of pandas.read_csv, including strings as file paths, as urls,
- pathlib.Path objects, etc...
- :param header: column names
- :param target_path: destination folder of table
- :param name: name file to be saved locally, also
- :param chunksize: (0) row size retrieved per iteration
- :param log_data: (True) if True, log the data so that it is available
- at the next step
- :param add_uid: (False) add the metadata uid to the target_path so that
- runs can be identified
- :param key: key in artifact store (when log_data=True)
- """
- if not name.endswith(".parquet"):
- name += ".parquet"
-
- if not add_uid:
- uid = ""
- else:
- uid = context.uid
-
- dest_path = os.path.join(target_path, uid, name)
- os.makedirs(os.path.join(target_path, uid), exist_ok=True)
-
- if not os.path.isfile(dest_path):
- context.logger.info("destination file does not exist, downloading")
- pqwriter = None
- for i, df in enumerate(
- pd.read_csv(archive_url, chunksize=chunksize, names=header)
- ):
- table = pa.Table.from_pandas(df)
- if i == 0:
- pqwriter = pq.ParquetWriter(dest_path, table.schema)
- pqwriter.write_table(table)
-
- if pqwriter:
- pqwriter.close()
-
- context.logger.info(f"saved table to {dest_path}")
- else:
- context.logger.info("destination file already exists")
-
- if log_data:
- context.logger.info(f"assign data to {key} in artifact store")
- context.log_artifact(key, target_path=dest_path)
diff --git a/fileutils/arc_to_parquet/arc_to_parquet.yaml b/fileutils/arc_to_parquet/arc_to_parquet.yaml
deleted file mode 100644
index 28e73eca8..000000000
--- a/fileutils/arc_to_parquet/arc_to_parquet.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-kind: job
-metadata:
- name: arc_to_parquet
-spec:
- description: 'archive to parquet and log'
- build:
- functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDIwLTAxLTA5IDE3OjA3CgppbXBvcnQgb3MKCmZyb20gbWxydW4uZXhlY3V0aW9uIGltcG9ydCBNTENsaWVudEN0eApmcm9tIHR5cGluZyBpbXBvcnQgSU8sIEFueVN0ciwgVW5pb24sIExpc3QKZnJvbSBwYXRobGliIGltcG9ydCBQYXRoCgppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCBweWFycm93LnBhcnF1ZXQgYXMgcHEKaW1wb3J0IHB5YXJyb3cgYXMgcGEKCmRlZiBhcmNfdG9fcGFycXVldCgKICAgIGNvbnRleHQ6IE1MQ2xpZW50Q3R4LAogICAgYXJjaGl2ZV91cmw6IFVuaW9uW3N0ciwgUGF0aCwgSU9bQW55U3RyXV0sCiAgICBoZWFkZXI6IFVuaW9uW05vbmUsIExpc3Rbc3RyXV0gPSBOb25lLAogICAgdGFyZ2V0X3BhdGg6IHN0ciA9ICIiLAogICAgbmFtZTogc3RyID0gIiIsCiAgICBjaHVua3NpemU6IGludCA9IDEwXzAwMCwKICAgIGxvZ19kYXRhOiBib29sID0gVHJ1ZSwKICAgIGtleTogc3RyID0gJ3Jhd19kYXRhJwopIC0+IE5vbmU6CiAgICAiIiJPcGVuIGEgZmlsZS9vYmplY3QgYXJjaGl2ZSBhbmQgc2F2ZSBhcyBhIHBhcnF1ZXQgZmlsZS4KICAgIAogICAgQXJnczoKICAgIDpwYXJhbSBjb250ZXh0OiAgICAgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIGFyY2hpdmVfdXJsOiBhbnkgdmFsaWQgc3RyaW5nIHBhdGggY29uc2lzdGVudCB3aXRoIHRoZSBwYXRoIHZhcmlhYmxlCiAgICAgICAgICAgICAgICAgICAgICAgIG9mIHBhbmRhcy5yZWFkX2Nzdi4gbmNsdWRpbmcgc3RyaW5ncyBhcyBmaWxlIHBhdGhzLCBhcyB1cmxzLCAKICAgICAgICAgICAgICAgICAgICAgICAgcGF0aGxpYi5QYXRoIG9iamVjdHMsIGV0Yy4uLgogICAgOnBhcmFtIGhlYWRlcjogICAgICBjb2x1bW4gbmFtZXMKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogZGVzdGluYXRpb24gZm9sZGVyIG9mIHRhYmxlCiAgICA6cGFyYW0gbmFtZTogICAgICAgIG5hbWUgZmlsZSB0byBiZSBzYXZlZCBsb2NhbGx5LCBhbHNvCiAgICA6cGFyYW0gY2h1bmtzaXplOiAgICgwKSByb3cgc2l6ZSByZXRyaWV2ZWQgcGVyIGl0ZXJhdGlvbgogICAgOnBhcmFtIGxvZ19kYXRhOiAgICAoVHJ1ZSkgaWYgVHJ1ZSwgbG9nIHRoZSBkYXRhIHNvIHRoYXQgaXQgaXMgYXZhaWxhYmxlCiAgICAgICAgICAgICAgICAgICAgICAgIGF0IHRoZSBuZXh0IHN0ZXAKICAgICIiIgogICAgb3MubWFrZWRpcnModGFyZ2V0X3BhdGgsIGV4aXN0X29rPVRydWUpCgogICAgaWYgbm90IG5hbWUuZW5kc3dpdGgoIi5wYXJxdWV0Iik6CiAgICAgICAgbmFtZSArPSAiLnBhcnF1ZXQiCgogICAgZGVzdF9wYXRoID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lKQoKICAgIGlmIG5vdCBvcy5wYXRoLmlzZmlsZShkZXN0X3BhdGgpOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oImRlc3RpbmF0aW9uIGZpbGUgZG9lcyBub3QgZXhpc3QsIGRvd25sb2FkaW5nIikKICAgIC
AgICBwcXdyaXRlciA9IE5vbmUKICAgICAgICBmb3IgaSwgZGYgaW4gZW51bWVyYXRlKAogICAgICAgICAgICBwZC5yZWFkX2NzdihhcmNoaXZlX3VybCwgY2h1bmtzaXplPWNodW5rc2l6ZSwgbmFtZXM9aGVhZGVyKQogICAgICAgICk6CiAgICAgICAgICAgIHRhYmxlID0gcGEuVGFibGUuZnJvbV9wYW5kYXMoZGYpCiAgICAgICAgICAgIGlmIGkgPT0gMDoKICAgICAgICAgICAgICAgIHBxd3JpdGVyID0gcHEuUGFycXVldFdyaXRlcihkZXN0X3BhdGgsIHRhYmxlLnNjaGVtYSkKICAgICAgICAgICAgcHF3cml0ZXIud3JpdGVfdGFibGUodGFibGUpCgogICAgICAgIGlmIHBxd3JpdGVyOgogICAgICAgICAgICBwcXdyaXRlci5jbG9zZSgpCgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZiJzYXZlZCB0YWJsZSB0byB7ZGVzdF9wYXRofSIpCiAgICBlbHNlOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oImRlc3RpbmF0aW9uIGZpbGUgZXhpc3RzIikKCiAgICBpZiBsb2dfZGF0YToKICAgICAgICBjb250ZXh0LmxvZ2dlci5pbmZvKGYibG9nZ2luZyB7ZGVzdF9wYXRofSB0byBjb250ZXh0IikKICAgICAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChrZXksIHRhcmdldF9wYXRoPWRlc3RfcGF0aCkKCg==
- base_image: python:3.6-jessie
- commands:
- - pip install -q mlrun
- - pip install -q pyarrow
- - pip install -q numpy
- - pip install -q pandas
\ No newline at end of file
diff --git a/fileutils/arc_to_parquet/function.py b/fileutils/arc_to_parquet/function.py
new file mode 100644
index 000000000..8b261c944
--- /dev/null
+++ b/fileutils/arc_to_parquet/function.py
@@ -0,0 +1,117 @@
+# Copyright 2018 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ssl
+
+# NOTE(review): this block swaps ssl's default HTTPS context factory for the
+# unverified one, i.e. it turns off certificate verification for code in this
+# process that builds its HTTPS context through that factory (the PEP 476
+# opt-out recipe). Presumably it is here so the archive download further down
+# works against hosts with untrusted certificates -- confirm it is still
+# needed, because it leaves those downloads open to tampering.
+try:
+ _create_unverified_https_context = ssl._create_unverified_context
+except AttributeError:
+ # Legacy Python that doesn't verify HTTPS certificates by default
+ pass
+else:
+ # Handle target environment that doesn't support HTTPS verification
+ ssl._create_default_https_context = _create_unverified_https_context
+
+import os
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+import pyarrow as pa
+from pickle import dump, load
+
+from mlrun.execution import MLClientCtx
+from typing import IO, AnyStr, Union, List, Optional
+
+
+def arc_to_parquet(
+    context: MLClientCtx,
+    archive_url: Union[str, Path, IO[AnyStr]],
+    header: Optional[List[str]] = None,
+    inc_cols: Optional[List[str]] = None,
+    target_path: str = "",
+    name: str = "",
+    chunksize: int = 10_000,
+    dtype=None,
+    encoding: str = 'latin-1',
+    key: str = 'data',
+    dataset: Optional[str] = None,
+    partition_cols: Optional[List[str]] = None,
+) -> None:
+    """Open a file/object archive and save as a parquet file or dataset.
+
+    Without ``dataset`` the chunks read from the archive are appended to the
+    single file ``target_path/name``.  With ``dataset`` every chunk is
+    written to the dataset folder ``target_path/dataset`` and the schema
+    alone is saved to ``target_path/header-only.pqt``.
+
+    Partitioning requires precise specification of column types.
+
+    Nothing is downloaded when the destination already holds data (an
+    existing file, or a dataset folder that is not empty).  In every case
+    the destination is logged as an artifact under ``key``.
+
+    :param context:        function context
+    :param archive_url:    any valid string path consistent with the path
+                           variable of pandas.read_csv, including strings as
+                           file paths, as urls, pathlib.Path objects, etc...
+    :param header:         column names
+    :param inc_cols:       include only these columns
+    :param target_path:    destination folder of table
+    :param name:           name of the file saved in target_path ('.pqt' is
+                           appended when missing), used when no dataset is
+                           given
+    :param chunksize:      (10_000) rows retrieved per iteration
+    :param dtype:          destination data type of specified columns
+    :param encoding:       ('latin-1') file encoding
+    :param key:            ('data') key in artifact store
+    :param dataset:        (None) if set then 'target_path/dataset'
+                           is folder for partitioned files
+    :param partition_cols: (None) list of partitioning columns
+    """
+    if not name.endswith(".pqt"):
+        name += ".pqt"
+
+    # one test decides both the destination and how chunks are written, so an
+    # empty dataset name consistently means "single file"
+    as_dataset = bool(dataset)
+    partition_cols = list(partition_cols) if partition_cols else []
+
+    if as_dataset:
+        dest_path = os.path.join(target_path, dataset)
+        os.makedirs(dest_path, exist_ok=True)
+        # a dataset is a folder, it counts as existing once it has content
+        dest_exists = bool(os.listdir(dest_path))
+    else:
+        # os.makedirs('') raises and '' is the default target_path
+        if target_path:
+            os.makedirs(target_path, exist_ok=True)
+        dest_path = os.path.join(target_path, name)
+        dest_exists = os.path.isfile(dest_path)
+
+    if dest_exists:
+        context.logger.info("destination file already exists")
+    else:
+        context.logger.info("destination file does not exist, downloading")
+        chunks = pd.read_csv(archive_url,
+                             chunksize=chunksize,
+                             names=header,
+                             encoding=encoding,
+                             usecols=inc_cols,
+                             dtype=dtype)
+        pqwriter = None
+        try:
+            for i, df in enumerate(chunks):
+                table = pa.Table.from_pandas(df)
+                if as_dataset:
+                    if i == 0:
+                        # just write header here; closing the writer
+                        # finalizes the file and releases the handle
+                        filepath = os.path.join(target_path,
+                                                'header-only.pqt')
+                        pq.ParquetWriter(filepath, table.schema).close()
+                    pq.write_to_dataset(table,
+                                        root_path=dest_path,
+                                        partition_cols=partition_cols)
+                else:
+                    if pqwriter is None:
+                        # start writing file, the first chunk fixes the schema
+                        pqwriter = pq.ParquetWriter(dest_path, table.schema)
+                    pqwriter.write_table(table)
+        finally:
+            # close the file writer even when a chunk fails
+            if pqwriter is not None:
+                pqwriter.close()
+
+        context.logger.info(f"saved table to {dest_path}")
+
+    context.log_artifact(key, target_path=dest_path)
diff --git a/fileutils/arc_to_parquet/function.yaml b/fileutils/arc_to_parquet/function.yaml
new file mode 100644
index 000000000..73628d357
--- /dev/null
+++ b/fileutils/arc_to_parquet/function.yaml
@@ -0,0 +1,16 @@
+kind: job
+metadata:
+ name: function
+ hash: 0a17345fa693f3b0fd5671a8f94e09f97676ded2
+ project: default
+spec:
+ command: https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/parquet_to_dask/function.py
+ args: []
+ image: ''
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: retrieve archive table and save as parquet file
+ build:
+ base_image: yjbds/mlrun-daskboost:dev
+ commands: []
diff --git a/fileutils/open_archive/file_utils.py b/fileutils/open_archive/file_utils.py
index b8cae15f1..10c8afbe3 100644
--- a/fileutils/open_archive/file_utils.py
+++ b/fileutils/open_archive/file_utils.py
@@ -1,24 +1,48 @@
+# Copyright 2018 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import os
import zipfile
+import urllib
+import tarfile
import json
from tempfile import mktemp
-
def open_archive(context,
target_dir='content',
archive_url=''):
- """Open a file/object archive into a target directory"""
+ """Open a file/object archive into a target directory
+
+ Currently supports zip and tar.gz; the type is chosen from the
+ suffix of archive_url.
+
+ :param context: function context, used for logging and for
+ logging the 'content' artifact
+ :param target_dir: folder the archive is extracted into, created
+ when missing
+ :param archive_url: location of the archive
+ """
# Define locations
os.makedirs(target_dir, exist_ok=True)
context.logger.info('Verified directories')
- # Extract dataset from zip
- context.logger.info('Extracting zip')
- zip_ref = zipfile.ZipFile(archive_url, 'r')
- zip_ref.extractall(target_dir)
- zip_ref.close()
+ # NOTE(review): the archive type is decided by the url suffix alone; any
+ # other suffix leaves `ref` unassigned, so `ref.extractall` below fails
+ # with an unbound-name error rather than a clear message.
+ splits = archive_url.split('.')
+ if ('.'.join(splits[-2:]) == 'tar.gz'):
+ # Extract dataset from tar
+ context.logger.info('opening tar_gz')
+ # NOTE(review): needs the `urllib.request` submodule to be loaded, while
+ # the file-level import is plain `import urllib` -- confirm this resolves.
+ ftpstream = urllib.request.urlopen(archive_url)
+ # "r|gz" reads the gzip-compressed tar as a sequential stream
+ ref = tarfile.open(fileobj=ftpstream, mode="r|gz")
+ elif splits[-1] == 'zip':
+ # Extract dataset from zip
+ context.logger.info('opening zip')
+ # NOTE(review): unlike the tar branch nothing is downloaded first --
+ # presumably only local zip paths work here; verify against callers.
+ ref = zipfile.ZipFile(archive_url, 'r')
+
+ ref.extractall(target_dir)
+ ref.close()
context.logger.info(f'extracted archive to {target_dir}')
context.log_artifact('content', target_path=target_dir)
-
\ No newline at end of file
diff --git a/fileutils/open_archive/function.yaml b/fileutils/open_archive/function.yaml
index 5a4547ece..1c80bf091 100644
--- a/fileutils/open_archive/function.yaml
+++ b/fileutils/open_archive/function.yaml
@@ -1,8 +1,9 @@
kind: job
metadata:
- name: file_utils
+ name: open-archive
spec:
- image: mlrun/mlrun:latest
- description: 'file utilities'
+ description: 'retrieve archive and extract all'
build:
- functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDE5LTEwLTI4IDIyOjAzCgppbXBvcnQgb3MKaW1wb3J0IHppcGZpbGUKCmRlZiBvcGVuX2FyY2hpdmUoY29udGV4dCwgCiAgICAgICAgICAgICAgICAgdGFyZ2V0X2Rpcj0nY29udGVudCcsCiAgICAgICAgICAgICAgICAgYXJjaGl2ZV91cmw9JycpOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkiIiIKICAgICAgICAKICAgIG9zLm1ha2VkaXJzKHRhcmdldF9kaXIsIGV4aXN0X29rPVRydWUpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCdWZXJpZmllZCBkaXJlY3RvcmllcycpCiAgICAKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ0V4dHJhY3RpbmcgemlwJykKICAgIHppcF9yZWYgPSB6aXBmaWxlLlppcEZpbGUoYXJjaGl2ZV91cmwsICdyJykKICAgIHppcF9yZWYuZXh0cmFjdGFsbCh0YXJnZXRfZGlyKQogICAgemlwX3JlZi5jbG9zZSgpCiAgICAKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oZidleHRyYWN0ZWQgYXJjaGl2ZSB0byB7dGFyZ2V0X2Rpcn0nKQogICAgY29udGV4dC5sb2dfYXJ0aWZhY3QoJ2NvbnRlbnQnLCB0YXJnZXRfcGF0aD10YXJnZXRfZGlyKQoK
+ functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDIwLTAxLTIxIDA5OjQ3CgppbXBvcnQgbWxydW4KbWxydW4ubWxjb25mLmRicGF0aCA9ICdodHRwOi8vbWxydW4tYXBpOjgwODAnCgppbXBvcnQgdXJsbGliLnJlcXVlc3QKCmltcG9ydCBvcwppbXBvcnQgemlwZmlsZQppbXBvcnQgdXJsbGliCmltcG9ydCB0YXJmaWxlCmltcG9ydCBqc29uCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKCmRlZiBvcGVuX2FyY2hpdmUoY29udGV4dDogTUxDbGllbnRDdHgsIAogICAgICAgICAgICAgICAgIHRhcmdldF9kaXI6IHN0ciA9ICdjb250ZW50JywKICAgICAgICAgICAgICAgICBhcmNoaXZlX3VybDogc3RyID0gJycpOgogICAgIiIiT3BlbiBhIGZpbGUvb2JqZWN0IGFyY2hpdmUgaW50byBhIHRhcmdldCBkaXJlY3RvcnkKICAgIAogICAgQ3VycmVudGx5IHN1cHBvcnRzIHppcCBhbmQgdGFyLmd6CiAgICAiIiIKICAgIG9zLm1ha2VkaXJzKHRhcmdldF9kaXIsIGV4aXN0X29rPVRydWUpCiAgICBjb250ZXh0LmxvZ2dlci5pbmZvKCdWZXJpZmllZCBkaXJlY3RvcmllcycpCiAgICBwcmludChhcmNoaXZlX3VybCkKICAgIHNwbGl0cyA9IGFyY2hpdmVfdXJsLnNwbGl0KCcuJykKICAgIHByaW50KHNwbGl0cykKICAgIGlmIChzcGxpdHNbLTFdID09ICdneicpOgogICAgICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ29wZW5pbmcgdGFyX2d6JykKICAgICAgICByZWYgPSB0YXJmaWxlLm9wZW4oZmlsZW9iaj11cmxsaWIucmVxdWVzdC51cmxvcGVuKGFyY2hpdmVfdXJsKSwgbW9kZT0ncnxneicpCiAgICBlbGlmIHNwbGl0c1stMV0gPT0gJ3ppcCc6CiAgICAgICAgY29udGV4dC5sb2dnZXIuaW5mbygnb3BlbmluZyB6aXAnKQogICAgICAgIHJlZiA9IHppcGZpbGUuWmlwRmlsZShhcmNoaXZlX3VybCwgJ3InKQoKICAgIHJlZi5leHRyYWN0YWxsKHRhcmdldF9kaXIpCiAgICByZWYuY2xvc2UoKQoKICAgIGNvbnRleHQubG9nX2FydGlmYWN0KCdjb250ZW50JywgdGFyZ2V0X3BhdGg9dGFyZ2V0X2RpcikKCg==
+ build_image: yjbds/mlrun-base:dev
+ commands: []
diff --git a/fileutils/parquet_to_dask/function.py b/fileutils/parquet_to_dask/function.py
new file mode 100644
index 000000000..f5530d1dd
--- /dev/null
+++ b/fileutils/parquet_to_dask/function.py
@@ -0,0 +1,82 @@
+# Copyright 2018 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+import dask
+import dask.dataframe as dd
+from dask.distributed import Client, LocalCluster
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+
+from typing import IO, AnyStr, Union, List, Optional
+
+def parquet_to_dask(
+    context: MLClientCtx,
+    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
+    inc_cols: Optional[List[str]] = None,
+    index_cols: Optional[List[str]] = None,
+    shards: int = 4,
+    threads_per: int = 4,
+    processes: bool = False,
+    memory_limit: str = '2GB',
+    persist: bool = True,
+    dask_key: str = 'my_dask_dataframe',
+    target_path: str = ''
+) -> None:
+    """Load parquet dataset into dask cluster
+
+    If no cluster is found loads a new one and persist the data to it. It
+    should not be necessary to create a new cluster when the function
+    is run as a 'dask' job.
+
+    :param context:      the function context
+    :param parquet_url:  url of the parquet file or partitioned dataset as
+                         either artifact DataItem, string, or path object
+                         (see pandas read_csv); it is handed to
+                         dask.dataframe.read_parquet unchanged
+    :param inc_cols:     include only these columns (very fast)
+    :param index_cols:   list of index column names (can be a long-running
+                         process)
+    :param shards:       number of workers to launch
+    :param threads_per:  number of threads per worker
+    :param processes:    (False) passed to LocalCluster as ``processes``
+    :param memory_limit: ('2GB') passed to LocalCluster as ``memory_limit``
+    :param persist:      (True) persist the dataframe on the cluster,
+                         publish it under ``dask_key`` and log the scheduler
+                         file as the 'scheduler' artifact
+    :param dask_key:     name the persisted dataframe is published under
+    :param target_path:  folder where 'scheduler.json' is written
+    """
+    if hasattr(context, 'dask_client'):
+        context.logger.info('found cluster...')
+        dask_client = context.dask_client
+    else:
+        context.logger.info('starting new cluster...')
+        cluster = LocalCluster(n_workers=shards,
+                               threads_per_worker=threads_per,
+                               processes=processes,
+                               memory_limit=memory_limit)
+        dask_client = Client(cluster)
+
+    context.logger.info(dask_client)
+
+    # None for columns/index keeps read_parquet's own defaults
+    df = dd.read_parquet(parquet_url, columns=inc_cols, index=index_cols)
+
+    if persist and context:
+        df = dask_client.persist(df)
+        # the dataset name is the keyword itself, so it has to be expanded
+        # from the parameter: `publish_dataset(dask_key=df)` would publish
+        # under the literal name 'dask_key'
+        dask_client.publish_dataset(**{dask_key: df})
+        context.dask_client = dask_client
+
+        # share the scheduler
+        filepath = os.path.join(target_path, 'scheduler.json')
+        dask_client.write_scheduler_file(filepath)
+        context.log_artifact('scheduler', target_path=filepath)
+
+        print(df.head())
diff --git a/fileutils/parquet_to_dask/function.yaml b/fileutils/parquet_to_dask/function.yaml
new file mode 100644
index 000000000..c40e87dcd
--- /dev/null
+++ b/fileutils/parquet_to_dask/function.yaml
@@ -0,0 +1,22 @@
+kind: dask
+metadata:
+ name: function
+ hash: 4ed6e4dfc23b35ca9a7a6029b1f08b9a1d786885
+ project: default
+spec:
+ command: https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/parquet_to_dask/function.py
+ args: []
+ image: ''
+ volumes: []
+ volume_mounts: []
+ env: []
+ build:
+ base_image: yjbds/mlrun-daskboost:dev
+ commands: []
+ description: ''
+ replicas: 4
+ remote: true
+ service_type: NodePort
+ nthreads: 1
+ min_replicas: 0
+ max_replicas: 4
diff --git a/serving/README.md b/serving/README.md
new file mode 100644
index 000000000..d6f2e8a36
--- /dev/null
+++ b/serving/README.md
@@ -0,0 +1,4 @@
+# serving models
+
+- **`xgboost/xgb-serving.ipynb`** deploy an xgboost server model
+- **`classifier_server.ipynb`** deploy any classifier model that has been pickled (cloudpickle)
\ No newline at end of file
diff --git a/serving/classifier_server.ipynb b/serving/classifier_server.ipynb
new file mode 100644
index 000000000..71e835ceb
--- /dev/null
+++ b/serving/classifier_server.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Deploy a Serverless Model Server with Nuclio-KFServing\n",
+ " --------------------------------------------------------------------\n",
+ "\n",
+ "The following notebook demonstrates how to deploy **any pickled model** using **[nuclio](https://github.com/nuclio/nuclio)** + **[KFServing](https://github.com/kubeflow/kfserving)** (a.k.a Nuclio-serving)\n",
+ "\n",
+ "#### **notebook how-to's**\n",
+ "* Write and test model serving (KFServing) class in a notebook.\n",
+ "* Deploy the model server as a Nuclio-serving function.\n",
+ "* Invoke and test the serving function."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "#### **steps**\n",
+ "**[define a new function and its dependencies](#define-function)**
\n",
+ "**[test the model serving class locally](#test-locally)**
\n",
+ "**[deploy our serving class as a serverless function](#deploy)**
\n",
+ "**[test our model server using HTTP request](#test-model-server)**
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nuclio: ignore\n",
+ "# if the nuclio-jupyter package is not installed run !pip install nuclio-jupyter\n",
+ "import nuclio"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### **define a new function and its dependencies**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "%nuclio: setting kind to 'nuclio:serving'\n",
+ "%nuclio: setting 'MODEL_CLASS' environment variable\n"
+ ]
+ }
+ ],
+ "source": [
+ "%nuclio config kind=\"nuclio:serving\"\n",
+ "%nuclio env MODEL_CLASS=ClassifierModel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%nuclio cmd -c\n",
+ "pip install -U -q kfserving\n",
+ "pip install -U -q azure\n",
+ "pip install -U -q mlrun"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %nuclio config spec.build.baseImage = \"yjbds/mlrun-files:latest\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import kfserving\n",
+ "import os\n",
+ "import numpy as np\n",
+ "from cloudpickle import load as pload"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TARGET_PATH = '/User/mlrun/models'\n",
+ "MODEL_FILE = 'lgb-classifier.pkl'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class ClassifierModel(kfserving.KFModel):\n",
+ " def __init__(self, name: str, model_dir: str, model = None):\n",
+ " super().__init__(name)\n",
+ " self.name = name\n",
+ " self.model_dir = model_dir\n",
+ " if not model is None:\n",
+ " self.classifier = model\n",
+ " self.ready = True\n",
+ "\n",
+ " def load(self):\n",
+ " model_file = os.path.join(\n",
+ " kfserving.Storage.download(self.model_dir), MODEL_FILE)\n",
+ " self.classifier = pload(open(model_file, 'rb'))\n",
+ " self.ready = True\n",
+ "\n",
+ " def predict(self, body):\n",
+ " try:\n",
+ " feats = np.asarray(body['instances'])\n",
+ " result: np.ndarray = self.classifier.predict(feats)\n",
+ " return result.tolist()\n",
+ " except Exception as e:\n",
+ " raise Exception(\"Failed to predict %s\" % e)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. _**Please do not remove this cell**_:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nuclio: end-code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "______________________________________________"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### **test the model serving class locally**\n",
+ "The class above can be tested locally. Just instantiate the class, `.load()` will load the model to a local dir.\n",
+ "\n",
+ "> **Verify there is a `lgb-classifier.pkl` file in the model_dir path (generated by the training notebook)**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[I 200122 18:51:48 storage:35] Copying contents of /User/mlrun/models to local\n"
+ ]
+ }
+ ],
+ "source": [
+ "my_server = ClassifierModel('classifier', model_dir='/User/mlrun/models')\n",
+ "my_server.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### _data_\n",
+ "Make some classification data using scikit learn's `make_classification`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import make_classification\n",
+ "n_samples = 10\n",
+ "train_size = 0.7\n",
+ "X, y = make_classification(\n",
+ " n_samples=n_samples,\n",
+ " n_features=28, \n",
+ " random_state = np.random.RandomState(1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "event = {\"instances\": X.tolist()}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "We can use the `.predict(body)` method to test the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "my_server.predict(event)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### **deploy our serving class as a serverless function**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mlrun import new_model_server, mount_v3io\n",
+ "import requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fn = new_model_server('generic', \n",
+ " models={'classifier_gen': TARGET_PATH}, \n",
+ " model_class='ClassifierModel').apply(mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fn.spec.no_cache = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-22 18:51:52,843 deploy started\n"
+ ]
+ }
+ ],
+ "source": [
+ "addr = fn.deploy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### **test our model server using HTTP request**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import requests\n",
+ "\n",
+ "resp = requests.post(addr + '/classifier_gen/predict', json=event)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resp.__dict__['_content'] "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "json.loads(resp.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**[back to top](#top)**"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/arc_to_parquet-airlines.ipynb b/tests/arc_to_parquet-airlines.ipynb
new file mode 100644
index 000000000..bf8ab37f7
--- /dev/null
+++ b/tests/arc_to_parquet-airlines.ipynb
@@ -0,0 +1,890 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# archive to parquet - partitioned data\n",
+ "\n",
+ "Airlines data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FUNCTION = 'arc_to_parquet'\n",
+ "DESCRIPTION = 'retrieve archive table and save as partitioned parquet dataset'\n",
+ "\n",
+ "BASE_IMAGE = 'yjbds/mlrun-base:dev'\n",
+ "JOB_KIND = 'dask'\n",
+ "TASK_NAME = 'user-task-arc-to-part-parq'\n",
+ "# https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/tests/describe.py\n",
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'\n",
+ "\n",
+ "ARCHIVE_BIG = \"https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv\"\n",
+ "ARCHIVE = \"https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv\"\n",
+ "ARCHIVE_SMALL = \"https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv\"\n",
+ "\n",
+ "USE_ARCHIVE = ARCHIVE_SMALL\n",
+ "TARGET_PATH = '/User/mlrun/airlines/dataset-small'\n",
+ "\n",
+ "FILE_SHAPE = (123_534_969, 21) # (rows, cols)\n",
+ "SMALL_FILE_SHAPE = (43_978, 21) # (rows, cols)\n",
+ "\n",
+ "FILE_NAME = 'airlines.pqt'\n",
+ "KEY = 'airlines'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "PARTITIONS_DEST = 'partitions'\n",
+ "PARTITION_COLS = ['Year', 'Month']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HEADER = ['Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime',\n",
+ " 'UniqueCarrier','FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime',\n",
+ " 'ArrDelay','DepDelay','Origin','Dest','Distance','TaxiIn','TaxiOut','Cancelled',\n",
+ " 'CancellationCode','Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay',\n",
+ " 'LateAircraftDelay']\n",
+ "\n",
+ "INC_COLS = ['Year','Month','DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime',\n",
+ " 'UniqueCarrier','FlightNum', 'CRSElapsedTime','AirTime',\n",
+ " 'Origin','Dest','Distance', 'TaxiIn', 'TaxiOut','Cancelled',\n",
+ " 'CarrierDelay','WeatherDelay','NASDelay','SecurityDelay',\n",
+ " 'LateAircraftDelay']\n",
+ "\n",
+ "ENCODING = 'latin-1'\n",
+ "\n",
+ "DTYPES_COLS = {\n",
+ " 'CRSElapsedTime': 'float32', \n",
+ " 'TailNum': 'str', \n",
+ " 'Distance': 'float32',\n",
+ " 'TaxiIn' : 'float32',\n",
+ " 'TaxiOut': 'float32',\n",
+ " 'ArrTime': 'float32',\n",
+ " 'AirTime': 'float32',\n",
+ " 'DepTime':'float32', \n",
+ " 'CarrierDelay': 'float32', \n",
+ " 'WeatherDelay': 'float32', \n",
+ " 'NASDelay':'float32', \n",
+ " 'SecurityDelay':'float32', \n",
+ " 'LateAircraftDelay':'float32'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "LABEL_COLUMN = \"IsArrDelayed\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.makedirs(os.path.join(TARGET_PATH, PARTITIONS_DEST), exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### load function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 01:19:26,578 starting remote build, image: .mlrun/func-default-function-latest\n",
+ "\u001b[36mINFO\u001b[0m[0000] Resolved base name yjbds/mlrun-base:dev to yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0000] Resolved base name yjbds/mlrun-base:dev to yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:2bbe9095ff126252340957bde01f8d26d7742ee802d9b07a1490ad87e13ea3eb: no such file or directory \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0001] Built cross stage deps: map[] \n",
+ "\u001b[36mINFO\u001b[0m[0001] Downloading base image yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0001] Error while retrieving image from cache: getting file info: stat /cache/sha256:2bbe9095ff126252340957bde01f8d26d7742ee802d9b07a1490ad87e13ea3eb: no such file or directory \n",
+ "\u001b[36mINFO\u001b[0m[0001] Downloading base image yjbds/mlrun-base:dev \n",
+ "\u001b[36mINFO\u001b[0m[0001] Unpacking rootfs as cmd RUN pip install mlrun requires it. \n",
+ "\u001b[36mINFO\u001b[0m[0021] Taking snapshot of full filesystem... \n",
+ "\u001b[36mINFO\u001b[0m[0031] RUN pip install mlrun \n",
+ "\u001b[36mINFO\u001b[0m[0031] cmd: /bin/sh \n",
+ "\u001b[36mINFO\u001b[0m[0031] args: [-c pip install mlrun] \n",
+ "Requirement already satisfied: mlrun in /opt/conda/lib/python3.7/site-packages (0.4.4)\n",
+ "Requirement already satisfied: nuclio-sdk>=0.0.3 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.0.7)\n",
+ "Requirement already satisfied: sqlalchemy==1.3.11 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.3.11)\n",
+ "Requirement already satisfied: pandas>=0.23.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.25.3)\n",
+ "Requirement already satisfied: aiohttp>=3.5.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (3.6.2)\n",
+ "Requirement already satisfied: requests>=2.20.1 in /opt/conda/lib/python3.7/site-packages (from mlrun) (2.22.0)\n",
+ "Requirement already satisfied: kfp>=0.1.29 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.2.0)\n",
+ "Requirement already satisfied: Flask>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.1.1)\n",
+ "Requirement already satisfied: gunicorn==19.9.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (19.9.0)\n",
+ "Requirement already satisfied: croniter==0.3.31 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.3.31)\n",
+ "Requirement already satisfied: nest-asyncio>=1.0.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.2.3)\n",
+ "Requirement already satisfied: boto3>=1.9 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.11.9)\n",
+ "Requirement already satisfied: pyyaml>=5.1.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (5.3)\n",
+ "Requirement already satisfied: tabulate<=0.8.3,>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.8.3)\n",
+ "Requirement already satisfied: click>=7.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (7.0)\n",
+ "Requirement already satisfied: nuclio-jupyter>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.8.1)\n",
+ "Requirement already satisfied: GitPython>=2.1.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (3.0.5)\n",
+ "Requirement already satisfied: gevent==1.4.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.4.0)\n",
+ "Requirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (1.18.1)\n",
+ "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (2019.3)\n",
+ "Requirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (2.8.1)\n",
+ "Requirement already satisfied: multidict<5.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (4.7.4)\n",
+ "Requirement already satisfied: chardet<4.0,>=2.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.4)\n",
+ "Requirement already satisfied: async-timeout<4.0,>=3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (19.3.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (1.4.2)\n",
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (1.24.3)\n",
+ "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (2.8)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (2019.9.11)\n",
+ "Requirement already satisfied: six>=1.10 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.12.0)\n",
+ "Requirement already satisfied: argo-models==2.2.1a in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (2.2.1a0)\n",
+ "Requirement already satisfied: kfp-server-api<=0.1.40,>=0.1.18 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (0.1.40)\n",
+ "Requirement already satisfied: kubernetes<=10.0.0,>=8.0.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (10.0.0)\n",
+ "Requirement already satisfied: Deprecated in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.2.7)\n",
+ "Requirement already satisfied: google-cloud-storage>=1.13.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.25.0)\n",
+ "Requirement already satisfied: jsonschema>=3.0.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (3.2.0)\n",
+ "Requirement already satisfied: cloudpickle==1.1.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.1.1)\n",
+ "Requirement already satisfied: google-auth>=1.6.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.11.0)\n",
+ "Requirement already satisfied: cryptography>=2.4.2 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (2.7)\n",
+ "Requirement already satisfied: requests-toolbelt>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (0.9.1)\n",
+ "Requirement already satisfied: PyJWT>=1.6.4 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.7.1)\n",
+ "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (2.11.0)\n",
+ "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (1.1.0)\n",
+ "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (0.16.1)\n",
+ "Requirement already satisfied: botocore<1.15.0,>=1.14.9 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (1.14.9)\n",
+ "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (0.9.4)\n",
+ "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (0.3.2)\n",
+ "Requirement already satisfied: nbconvert>=5.4 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.6.1)\n",
+ "Requirement already satisfied: jupyterlab>=0.35.4 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (1.2.6)\n",
+ "Requirement already satisfied: ipython>=7.2 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (7.11.1)\n",
+ "Requirement already satisfied: notebook>=5.7.2 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (6.0.3)\n",
+ "Requirement already satisfied: tornado>=5 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (6.0.3)\n",
+ "Requirement already satisfied: gitdb2>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from GitPython>=2.1.0->mlrun) (2.0.6)\n",
+ "Requirement already satisfied: greenlet>=0.4.14; platform_python_implementation == \"CPython\" in /opt/conda/lib/python3.7/site-packages (from gevent==1.4.0->mlrun) (0.4.15)\n",
+ "Requirement already satisfied: requests-oauthlib in /opt/conda/lib/python3.7/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (1.3.0)\n",
+ "Requirement already satisfied: setuptools>=21.0.0 in /opt/conda/lib/python3.7/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (41.4.0)\n",
+ "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.7/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (0.57.0)\n",
+ "Requirement already satisfied: wrapt<2,>=1.10 in /opt/conda/lib/python3.7/site-packages (from Deprecated->kfp>=0.1.29->mlrun) (1.11.2)\n",
+ "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (0.5.0)\n",
+ "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.2.0)\n",
+ "Requirement already satisfied: pyrsistent>=0.14.0 in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (0.15.7)\n",
+ "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (1.5.0)\n",
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.2.8)\n",
+ "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0.0)\n",
+ "Requirement already satisfied: rsa<4.1,>=3.1.4 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0)\n",
+ "Requirement already satisfied: asn1crypto>=0.21.0 in /opt/conda/lib/python3.7/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (1.0.1)\n",
+ "Requirement already satisfied: cffi!=1.11.3,>=1.8 in /opt/conda/lib/python3.7/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (1.12.3)\n",
+ "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/lib/python3.7/site-packages (from Jinja2>=2.10.1->Flask>=1.1.1->mlrun) (1.1.1)\n",
+ "Requirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/lib/python3.7/site-packages (from botocore<1.15.0,>=1.14.9->boto3>=1.9->mlrun) (0.15.2)\n",
+ "Requirement already satisfied: pygments in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (2.5.2)\n",
+ "Requirement already satisfied: nbformat>=4.4 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (5.0.4)\n",
+ "Requirement already satisfied: traitlets>=4.2 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (4.3.3)\n",
+ "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
+ "Requirement already satisfied: mistune<2,>=0.8.1 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.4)\n",
+ "Requirement already satisfied: bleach in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (3.1.0)\n",
+ "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (1.4.2)\n",
+ "Requirement already satisfied: jupyter-core in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (4.6.1)\n",
+ "Requirement already satisfied: entrypoints>=0.2.2 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.3)\n",
+ "Requirement already satisfied: testpath in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.4.4)\n",
+ "Requirement already satisfied: jupyterlab-server~=1.0.0 in /opt/conda/lib/python3.7/site-packages (from jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (1.0.6)\n",
+ "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (3.0.3)\n",
+ "Requirement already satisfied: pickleshare in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.5)\n",
+ "Requirement already satisfied: jedi>=0.10 in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.16.0)\n",
+ "Requirement already satisfied: decorator in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.4.1)\n",
+ "Requirement already satisfied: backcall in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.0)\n",
+ "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.8.0)\n",
+ "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.1)\n",
+ "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.1.4)\n",
+ "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.2.0)\n",
+ "Requirement already satisfied: pyzmq>=17 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (18.1.1)\n",
+ "Requirement already satisfied: Send2Trash in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (1.5.0)\n",
+ "Requirement already satisfied: terminado>=0.8.1 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.8.3)\n",
+ "Requirement already satisfied: jupyter-client>=5.3.4 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.3.4)\n",
+ "Requirement already satisfied: smmap2>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from gitdb2>=2.0.0->GitPython>=2.1.0->mlrun) (2.0.5)\n",
+ "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.7/site-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (3.1.0)\n",
+ "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.16.0)\n",
+ "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (2.1.0)\n",
+ "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.4.8)\n",
+ "Requirement already satisfied: pycparser in /opt/conda/lib/python3.7/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp>=0.1.29->mlrun) (2.19)\n",
+ "Requirement already satisfied: webencodings in /opt/conda/lib/python3.7/site-packages (from bleach->nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.5.1)\n",
+ "Requirement already satisfied: json5 in /opt/conda/lib/python3.7/site-packages (from jupyterlab-server~=1.0.0->jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.5)\n",
+ "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.8)\n",
+ "Requirement already satisfied: parso>=0.5.2 in /opt/conda/lib/python3.7/site-packages (from jedi>=0.10->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
+ "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.7/site-packages (from pexpect; sys_platform != \"win32\"->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
+ "Requirement already satisfied: protobuf>=3.4.0 in /opt/conda/lib/python3.7/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (3.11.2)\n",
+ "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /opt/conda/lib/python3.7/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.51.0)\n",
+ "\u001b[36mINFO\u001b[0m[0033] Taking snapshot of full filesystem... \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')\n",
+ "\n",
+ "arctoparq = mlrun.import_function(func_yaml)\n",
+ "\n",
+ "arctoparq.apply(mlrun.mount_v3io())\n",
+ "\n",
+ "arctoparq.deploy() #skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 01:20:30,226 starting run user-task-arc-to-part-parq uid=e98743f403fc4c1aabb5fd293ae16613 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-30 01:20:30,314 Job is running in the background, pod: user-task-arc-to-part-parq-km9tw\n",
+ "[mlrun] 2020-01-30 01:20:36,058 destination file does not exist, downloading\n",
+ "[mlrun] 2020-01-30 01:20:36,537 saved table to /User/mlrun/airlines/dataset-small/partitions\n",
+ "[mlrun] 2020-01-30 01:20:36,564 log artifact airlines at /User/mlrun/airlines/dataset-small/partitions, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-30 01:20:36,578 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...e16613 | \n",
+ " 0 | \n",
+ " Jan 30 01:20:36 | \n",
+ " completed | \n",
+ " function | \n",
+ " host=user-task-arc-to-part-parq-km9tw kind=job owner=admin | \n",
+ " | \n",
+ " archive_url=https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv dataset=partitions dtype={'AirTime': 'float32', 'ArrTime': 'float32', 'CRSElapsedTime': 'float32', 'CarrierDelay': 'float32', 'DepTime': 'float32', 'Distance': 'float32', 'LateAircraftDelay': 'float32', 'NASDelay': 'float32', 'SecurityDelay': 'float32', 'TailNum': 'str', 'TaxiIn': 'float32', 'TaxiOut': 'float32', 'WeatherDelay': 'float32'} encoding=latin-1 inc_cols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'] key=airlines name=airlines.pqt part_cols=['Year', 'Month'] target_path=/User/mlrun/airlines/dataset-small | \n",
+ " | \n",
+ " airlines | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run e98743f403fc4c1aabb5fd293ae16613 , !mlrun logs e98743f403fc4c1aabb5fd293ae16613 \n",
+ "[mlrun] 2020-01-30 01:20:39,512 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# create and run the task\n",
+ "arc_to_parq_task = mlrun.NewTask(\n",
+ " TASK_NAME, \n",
+ " handler=FUNCTION, \n",
+ " params={\n",
+ " 'target_path': TARGET_PATH,\n",
+ " 'name' : FILE_NAME, \n",
+ " 'key' : KEY,\n",
+ " 'archive_url': USE_ARCHIVE,\n",
+ " 'dataset' : PARTITIONS_DEST,\n",
+ " 'part_cols' : PARTITION_COLS,\n",
+ " 'encoding' : ENCODING,\n",
+ " 'inc_cols' : INC_COLS,\n",
+ " 'dtype' : DTYPES_COLS})\n",
+ "# run\n",
+ "run = arctoparq.run(arc_to_parq_task)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### a partitioned parquet table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pandas as pd\n",
+ "import pyarrow.parquet as pq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = pq.ParquetDataset(os.path.join(TARGET_PATH, PARTITIONS_DEST))\n",
+ "df = dataset.read().to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.set_index(PARTITION_COLS, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " DayofMonth | \n",
+ " DayOfWeek | \n",
+ " DepTime | \n",
+ " CRSDepTime | \n",
+ " ArrTime | \n",
+ " CRSArrTime | \n",
+ " UniqueCarrier | \n",
+ " FlightNum | \n",
+ " CRSElapsedTime | \n",
+ " AirTime | \n",
+ " ... | \n",
+ " Dest | \n",
+ " Distance | \n",
+ " TaxiIn | \n",
+ " TaxiOut | \n",
+ " Cancelled | \n",
+ " CarrierDelay | \n",
+ " WeatherDelay | \n",
+ " NASDelay | \n",
+ " SecurityDelay | \n",
+ " LateAircraftDelay | \n",
+ "
\n",
+ " \n",
+ " | Year | \n",
+ " Month | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1987 | \n",
+ " 10 | \n",
+ " 14 | \n",
+ " 3 | \n",
+ " 741.0 | \n",
+ " 730 | \n",
+ " 912.0 | \n",
+ " 849 | \n",
+ " PS | \n",
+ " 1451 | \n",
+ " 79.0 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " SFO | \n",
+ " 447.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 15 | \n",
+ " 4 | \n",
+ " 729.0 | \n",
+ " 730 | \n",
+ " 903.0 | \n",
+ " 849 | \n",
+ " PS | \n",
+ " 1451 | \n",
+ " 79.0 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " SFO | \n",
+ " 447.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 17 | \n",
+ " 6 | \n",
+ " 741.0 | \n",
+ " 730 | \n",
+ " 918.0 | \n",
+ " 849 | \n",
+ " PS | \n",
+ " 1451 | \n",
+ " 79.0 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " SFO | \n",
+ " 447.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 18 | \n",
+ " 7 | \n",
+ " 729.0 | \n",
+ " 730 | \n",
+ " 847.0 | \n",
+ " 849 | \n",
+ " PS | \n",
+ " 1451 | \n",
+ " 79.0 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " SFO | \n",
+ " 447.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 19 | \n",
+ " 1 | \n",
+ " 749.0 | \n",
+ " 730 | \n",
+ " 922.0 | \n",
+ " 849 | \n",
+ " PS | \n",
+ " 1451 | \n",
+ " 79.0 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " SFO | \n",
+ " 447.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime \\\n",
+ "Year Month \n",
+ "1987 10 14 3 741.0 730 912.0 849 \n",
+ " 10 15 4 729.0 730 903.0 849 \n",
+ " 10 17 6 741.0 730 918.0 849 \n",
+ " 10 18 7 729.0 730 847.0 849 \n",
+ " 10 19 1 749.0 730 922.0 849 \n",
+ "\n",
+ " UniqueCarrier FlightNum CRSElapsedTime AirTime ... Dest \\\n",
+ "Year Month ... \n",
+ "1987 10 PS 1451 79.0 NaN ... SFO \n",
+ " 10 PS 1451 79.0 NaN ... SFO \n",
+ " 10 PS 1451 79.0 NaN ... SFO \n",
+ " 10 PS 1451 79.0 NaN ... SFO \n",
+ " 10 PS 1451 79.0 NaN ... SFO \n",
+ "\n",
+ " Distance TaxiIn TaxiOut Cancelled CarrierDelay WeatherDelay \\\n",
+ "Year Month \n",
+ "1987 10 447.0 NaN NaN 0 NaN NaN \n",
+ " 10 447.0 NaN NaN 0 NaN NaN \n",
+ " 10 447.0 NaN NaN 0 NaN NaN \n",
+ " 10 447.0 NaN NaN 0 NaN NaN \n",
+ " 10 447.0 NaN NaN 0 NaN NaN \n",
+ "\n",
+ " NASDelay SecurityDelay LateAircraftDelay \n",
+ "Year Month \n",
+ "1987 10 NaN NaN NaN \n",
+ " 10 NaN NaN NaN \n",
+ " 10 NaN NaN NaN \n",
+ " 10 NaN NaN NaN \n",
+ " 10 NaN NaN NaN \n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "AssertionError",
+ "evalue": "(87956, 21)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mFILE_SHAPE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mUSE_ARCHIVE\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mARCHIVE_SMALL\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0mSMALL_FILE_SHAPE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"{df.shape}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mAssertionError\u001b[0m: (87956, 21)"
+ ]
+ }
+ ],
+ "source": [
+ "if USE_ARCHIVE == ARCHIVE:\n",
+ " assert df.shape==FILE_SHAPE\n",
+ "if USE_ARCHIVE == ARCHIVE_SMALL:\n",
+ " assert df.shape==SMALL_FILE_SHAPE, f\"{df.shape}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## cleanup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import shutil\n",
+ "# shutil.rmtree(TARGET_PATH)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/arc_to_parquet.ipynb b/tests/arc_to_parquet.ipynb
new file mode 100644
index 000000000..ddd165cac
--- /dev/null
+++ b/tests/arc_to_parquet.ipynb
@@ -0,0 +1,820 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# archive to parquet\n",
+ "\n",
+ "HIGGS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FUNCTION = 'arc_to_parquet'\n",
+ "DESCRIPTION = 'retrieve archive table and save as parquet file'\n",
+ "\n",
+ "BASE_IMAGE = 'yjbds/mlrun_dev-files:latest'\n",
+ "JOB_KIND = 'job'\n",
+ "TASK_NAME = 'user-task-arc-to-parq'\n",
+ "\n",
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'\n",
+ "\n",
+ "TARGET_PATH = '/User/mlrun/models'\n",
+ "\n",
+ "ARCHIVE_SAMPLE = \"https://fpsignals-public.s3.amazonaws.com/higgs-small.tar.gz\"\n",
+ "ARCHIVE = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz\"\n",
+ "\n",
+ "FILE_NAME = 'higgs.pqt'\n",
+ "KEY = 'higgs'\n",
+ "\n",
+ "HEADER = ['labels', 'lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', \n",
+ " 'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', \n",
+ " 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt', 'jet_3_eta',\n",
+ " 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',\n",
+ " 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.makedirs(TARGET_PATH, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### load and configure function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "func_py = os.path.join(CODE_BASE, FUNCTION, 'function.py')\n",
+ "func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')\n",
+ "\n",
+ "arctoparq = mlrun.new_function(command=func_py, kind=JOB_KIND)\n",
+ "\n",
+ "arctoparq.spec.description = DESCRIPTION\n",
+ "arctoparq.spec.build.base_image = BASE_IMAGE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-29 12:23:04,377 function spec saved to path: /User/repos/functions/fileutils/arc_to_parquet/function.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "arctoparq.export(func_yaml)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arctoparq.apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### ...or load from yaml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# arctoparq = mlrun.import_function(func_yaml).apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### deploy / build"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following triggers a build when run for the first time using specs found in the yaml file above. Unless that file changes, this only needs to be run once, even after the notebook has been restarted:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ready'"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arctoparq.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-29 12:23:17,789 starting run user-task-arc-to-parq uid=c3c3a9ade23d413781b1f62fba0f7593 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-29 12:23:17,864 Job is running in the background, pod: user-task-arc-to-parq-nx92p\n",
+ "[mlrun] 2020-01-29 12:23:22,149 destination file does not exist, downloading\n",
+ "[mlrun] 2020-01-29 12:28:19,478 saved table to /User/mlrun/models/higgs.pqt\n",
+ "[mlrun] 2020-01-29 12:28:19,492 log artifact higgs at /User/mlrun/models/higgs.pqt, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-29 12:28:19,514 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...0f7593 | \n",
+ " 0 | \n",
+ " Jan 29 12:23:22 | \n",
+ " completed | \n",
+ " function | \n",
+ " host=user-task-arc-to-parq-nx92p kind=job owner=admin | \n",
+ " | \n",
+ " archive_url=https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz header=None key=higgs name=higgs.pqt target_path=/User/mlrun/models | \n",
+ " | \n",
+ " higgs | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run c3c3a9ade23d413781b1f62fba0f7593 , !mlrun logs c3c3a9ade23d413781b1f62fba0f7593 \n",
+ "[mlrun] 2020-01-29 12:28:28,186 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# create and run the task\n",
+ "arc_to_parq_task = mlrun.NewTask(\n",
+ " TASK_NAME,\n",
+ " handler=FUNCTION, \n",
+ " params={\n",
+ " 'target_path': TARGET_PATH,\n",
+ " 'name' : FILE_NAME, \n",
+ " 'key' : KEY,\n",
+ " 'archive_url': ARCHIVE,\n",
+ " 'header' : None},\n",
+ " outputs=[KEY])\n",
+ "\n",
+ "# run\n",
+ "rn = arctoparq.run(arc_to_parq_task)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'higgs': '/User/mlrun/models/higgs.pqt'}"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rn.outputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "___"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add more context tests\n",
+ "# convert these to real tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert KEY in rn.outputs.keys(), f\"mlrun.functions: key {KEY} not found in outputs\"\n",
+ "assert os.path.isfile(TARGET_PATH+'/'+ FILE_NAME), f\"mlrun.functions: artifact source not found at {TARGET_PATH+'/'+ FILE_NAME}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "copied = pd.read_parquet(TARGET_PATH+'/'+ FILE_NAME, engine=\"pyarrow\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 1.000000000000000000e+00 | \n",
+ " 8.692932128906250000e-01 | \n",
+ " -6.350818276405334473e-01 | \n",
+ " 2.256902605295181274e-01 | \n",
+ " 3.274700641632080078e-01 | \n",
+ " -6.899932026863098145e-01 | \n",
+ " 7.542022466659545898e-01 | \n",
+ " -2.485731393098831177e-01 | \n",
+ " -1.092063903808593750e+00 | \n",
+ " 0.000000000000000000e+00 | \n",
+ " ... | \n",
+ " -1.045456994324922562e-02 | \n",
+ " -4.576716944575309753e-02 | \n",
+ " 3.101961374282836914e+00 | \n",
+ " 1.353760004043579102e+00 | \n",
+ " 9.795631170272827148e-01 | \n",
+ " 9.780761599540710449e-01 | \n",
+ " 9.200048446655273438e-01 | \n",
+ " 7.216574549674987793e-01 | \n",
+ " 9.887509346008300781e-01 | \n",
+ " 8.766783475875854492e-01 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " 0.907542 | \n",
+ " 0.329147 | \n",
+ " 0.359412 | \n",
+ " 1.497970 | \n",
+ " -0.313010 | \n",
+ " 1.095531 | \n",
+ " -0.557525 | \n",
+ " -1.588230 | \n",
+ " 2.173076 | \n",
+ " ... | \n",
+ " -1.138930 | \n",
+ " -0.000819 | \n",
+ " 0.000000 | \n",
+ " 0.302220 | \n",
+ " 0.833048 | \n",
+ " 0.985700 | \n",
+ " 0.978098 | \n",
+ " 0.779732 | \n",
+ " 0.992356 | \n",
+ " 0.798343 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 0.798835 | \n",
+ " 1.470639 | \n",
+ " -1.635975 | \n",
+ " 0.453773 | \n",
+ " 0.425629 | \n",
+ " 1.104875 | \n",
+ " 1.282322 | \n",
+ " 1.381664 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 1.128848 | \n",
+ " 0.900461 | \n",
+ " 0.000000 | \n",
+ " 0.909753 | \n",
+ " 1.108330 | \n",
+ " 0.985692 | \n",
+ " 0.951331 | \n",
+ " 0.803252 | \n",
+ " 0.865924 | \n",
+ " 0.780118 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 1.344385 | \n",
+ " -0.876626 | \n",
+ " 0.935913 | \n",
+ " 1.992050 | \n",
+ " 0.882454 | \n",
+ " 1.786066 | \n",
+ " -1.646778 | \n",
+ " -0.942383 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " -0.678379 | \n",
+ " -1.360356 | \n",
+ " 0.000000 | \n",
+ " 0.946652 | \n",
+ " 1.028704 | \n",
+ " 0.998656 | \n",
+ " 0.728281 | \n",
+ " 0.869200 | \n",
+ " 1.026736 | \n",
+ " 0.957904 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " 1.105009 | \n",
+ " 0.321356 | \n",
+ " 1.522401 | \n",
+ " 0.882808 | \n",
+ " -1.205349 | \n",
+ " 0.681466 | \n",
+ " -1.070464 | \n",
+ " -0.921871 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " -0.373566 | \n",
+ " 0.113041 | \n",
+ " 0.000000 | \n",
+ " 0.755856 | \n",
+ " 1.361057 | \n",
+ " 0.986610 | \n",
+ " 0.838085 | \n",
+ " 1.133295 | \n",
+ " 0.872245 | \n",
+ " 0.808487 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.0 | \n",
+ " 1.595839 | \n",
+ " -0.607811 | \n",
+ " 0.007075 | \n",
+ " 1.818450 | \n",
+ " -0.111906 | \n",
+ " 0.847550 | \n",
+ " -0.566437 | \n",
+ " 1.581239 | \n",
+ " 2.173076 | \n",
+ " ... | \n",
+ " -0.654227 | \n",
+ " -1.274345 | \n",
+ " 3.101961 | \n",
+ " 0.823761 | \n",
+ " 0.938191 | \n",
+ " 0.971758 | \n",
+ " 0.789176 | \n",
+ " 0.430553 | \n",
+ " 0.961357 | \n",
+ " 0.957818 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 1.000000000000000000e+00 8.692932128906250000e-01 \\\n",
+ "0 1.0 0.907542 \n",
+ "1 1.0 0.798835 \n",
+ "2 0.0 1.344385 \n",
+ "3 1.0 1.105009 \n",
+ "4 0.0 1.595839 \n",
+ "\n",
+ " -6.350818276405334473e-01 2.256902605295181274e-01 \\\n",
+ "0 0.329147 0.359412 \n",
+ "1 1.470639 -1.635975 \n",
+ "2 -0.876626 0.935913 \n",
+ "3 0.321356 1.522401 \n",
+ "4 -0.607811 0.007075 \n",
+ "\n",
+ " 3.274700641632080078e-01 -6.899932026863098145e-01 \\\n",
+ "0 1.497970 -0.313010 \n",
+ "1 0.453773 0.425629 \n",
+ "2 1.992050 0.882454 \n",
+ "3 0.882808 -1.205349 \n",
+ "4 1.818450 -0.111906 \n",
+ "\n",
+ " 7.542022466659545898e-01 -2.485731393098831177e-01 \\\n",
+ "0 1.095531 -0.557525 \n",
+ "1 1.104875 1.282322 \n",
+ "2 1.786066 -1.646778 \n",
+ "3 0.681466 -1.070464 \n",
+ "4 0.847550 -0.566437 \n",
+ "\n",
+ " -1.092063903808593750e+00 0.000000000000000000e+00 ... \\\n",
+ "0 -1.588230 2.173076 ... \n",
+ "1 1.381664 0.000000 ... \n",
+ "2 -0.942383 0.000000 ... \n",
+ "3 -0.921871 0.000000 ... \n",
+ "4 1.581239 2.173076 ... \n",
+ "\n",
+ " -1.045456994324922562e-02 -4.576716944575309753e-02 \\\n",
+ "0 -1.138930 -0.000819 \n",
+ "1 1.128848 0.900461 \n",
+ "2 -0.678379 -1.360356 \n",
+ "3 -0.373566 0.113041 \n",
+ "4 -0.654227 -1.274345 \n",
+ "\n",
+ " 3.101961374282836914e+00 1.353760004043579102e+00 \\\n",
+ "0 0.000000 0.302220 \n",
+ "1 0.000000 0.909753 \n",
+ "2 0.000000 0.946652 \n",
+ "3 0.000000 0.755856 \n",
+ "4 3.101961 0.823761 \n",
+ "\n",
+ " 9.795631170272827148e-01 9.780761599540710449e-01 \\\n",
+ "0 0.833048 0.985700 \n",
+ "1 1.108330 0.985692 \n",
+ "2 1.028704 0.998656 \n",
+ "3 1.361057 0.986610 \n",
+ "4 0.938191 0.971758 \n",
+ "\n",
+ " 9.200048446655273438e-01 7.216574549674987793e-01 \\\n",
+ "0 0.978098 0.779732 \n",
+ "1 0.951331 0.803252 \n",
+ "2 0.728281 0.869200 \n",
+ "3 0.838085 1.133295 \n",
+ "4 0.789176 0.430553 \n",
+ "\n",
+ " 9.887509346008300781e-01 8.766783475875854492e-01 \n",
+ "0 0.992356 0.798343 \n",
+ "1 0.865924 0.780118 \n",
+ "2 1.026736 0.957904 \n",
+ "3 0.872245 0.808487 \n",
+ "4 0.961357 0.957818 \n",
+ "\n",
+ "[5 rows x 29 columns]"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "copied.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(10999999, 29)"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "copied.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### cleanup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# os.remove(parquet_file_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/create_binary_data.ipynb b/tests/create_binary_data.ipynb
new file mode 100644
index 000000000..54bfa384c
--- /dev/null
+++ b/tests/create_binary_data.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/datagen/classification'\n",
+ "FUNCTION = ''\n",
+ "N_SAMPLES = 100_000\n",
+ "M_FEATURES = 28\n",
+ "NEG_WEIGHT = 0.5\n",
+ "TARGET_DATA_PATH = '/User/mlrun/models'\n",
+ "KEY = 'simdata'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "binarydatagen = mlrun.import_function(os.path.join(CODE_BASE, 'function.yaml'))\n",
+ "binarydatagen.apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 14:32:09,075 starting remote build, image: .mlrun/func-default-binary-latest\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "binarydatagen.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 14:32:15,900 starting run create_binary_classification uid=9dd358cd04554c9aa138275c0ec080aa -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-26 14:32:15,989 Job is running in the background, pod: create-binary-classification-crlf9\n",
+ "[mlrun] 2020-01-26 14:32:26,865 log artifact simdata at /User/mlrun/models/simdata-1e05X28.parquet, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-26 14:32:26,877 run executed, status=completed\n",
+ "/opt/conda/lib/python3.7/site-packages/pyarrow/pandas_compat.py:114: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n",
+ " result = infer_dtype(pandas_collection)\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...c080aa | \n",
+ " 0 | \n",
+ " Jan 26 14:32:26 | \n",
+ " completed | \n",
+ " binary | \n",
+ " host=create-binary-classification-crlf9 kind=job owner=admin | \n",
+ " | \n",
+ " key=simdata m_features=28 n_samples=100000 target_path=/User/mlrun/models weight=0.5 | \n",
+ " | \n",
+ " simdata | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 9dd358cd04554c9aa138275c0ec080aa , !mlrun logs 9dd358cd04554c9aa138275c0ec080aa \n",
+ "[mlrun] 2020-01-26 14:32:35,166 run executed, status=completed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "task = mlrun.NewTask()\n",
+ "task.with_params(\n",
+ " n_samples=N_SAMPLES,\n",
+ " m_features=M_FEATURES,\n",
+ " weight=NEG_WEIGHT,\n",
+ " target_path=TARGET_DATA_PATH,\n",
+ " key=KEY)\n",
+ "\n",
+ "binarydatagen.run(task, handler='create_binary_classification')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "fn = f\"simdata-{N_SAMPLES:0.0e}X{M_FEATURES}.parquet\".replace(\"+\", \"\")\n",
+ "df = pd.read_parquet(os.path.join(TARGET_DATA_PATH, fn), engine='pyarrow')\n",
+ "assert df.shape == (N_SAMPLES, M_FEATURES + 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/describe.py b/tests/describe.py
new file mode 100644
index 000000000..d680b6651
--- /dev/null
+++ b/tests/describe.py
@@ -0,0 +1,55 @@
+# Copyright 2018 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+import dask
+import dask.dataframe as dd
+from dask.distributed import Client
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+from mlrun.artifacts import ChartArtifact, TableArtifact, PlotArtifact
+
+from typing import IO, AnyStr, Union, List, Optional
+
+def table_summary(
+ context: MLClientCtx,
+ dask_client: Union[DataItem, str],
+ dask_key: str = 'my_dask_dataframe',
+ target_path: str = '',
+ name: str = 'table_summary.csv',
+ key: str = 'table_summary'
+) -> None:
+ """Summarize a table
+
+ :param context: the function context
+ :param dask_client: path to the dask client scheduler json file, as
+ string or artifact
+ :param dask_key: key of dataframe in dask client 'datasets' attribute
+ :param target_path: destimation folder for table summary file
+ :param name: name of table summary file (with extension like .csv)
+ :param key: key of table summary in artifact store
+ """
+ context.dask_client = Client(scheduler_file=str(dask_client))
+ df = context.dask_client.get_dataset('dask_key')
+ dscr = df.describe()
+
+ filepath = os.path.join(target_path, name)
+ dd.to_csv(dscr, filepath, single_file=True, index=False)
+ context.log_artifact(key, target_path=filepath)
+
\ No newline at end of file
diff --git a/tests/describe.yaml b/tests/describe.yaml
new file mode 100644
index 000000000..7095b5597
--- /dev/null
+++ b/tests/describe.yaml
@@ -0,0 +1,15 @@
+kind: job
+metadata:
+ name: describe
+ project: default
+spec:
+ command: https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/tests/describe.py
+ args: []
+ image: ''
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: ''
+ build:
+ base_image: yjbds/mlrun-daskboost:dev
+ commands: []
diff --git a/tests/features_engineer.ipynb b/tests/features_engineer.ipynb
new file mode 100644
index 000000000..9fa519d53
--- /dev/null
+++ b/tests/features_engineer.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "import numpy as np\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CODE_BASE = '/User/repos/functions/' \n",
+ "TARGET_PATH = '/User/mlrun/models'\n",
+ "\n",
+ "SRC_FILE = 'higgs.pqt'\n",
+ "RNG = 1\n",
+ "\n",
+ "MODEL_KEY = 'model'\n",
+ "FEATURES_KEY = 'lgb-classifier.pkl'\n",
+ "VERBOSE = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 21:04:35,571 function spec saved to path: /User/repos/functions/datagen/features/features-engineer.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "testfn = mlrun.code_to_function(\n",
+ " kind='job', \n",
+ " filename=os.path.join(CODE_BASE, 'datagen/features', 'features-engineer.py'))\n",
+ "testfn.build_config(base_image='yjbds/mlrun-ds:latest', commands=[])\n",
+ "testfn.export(os.path.join(CODE_BASE, 'datagen/features', 'features-engineer.yaml'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "engineer = mlrun.import_function(\n",
+ " os.path.join(CODE_BASE, 'datagen/features', 'features-engineer.yaml')\n",
+ ").apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ready'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "engineer.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 21:05:17,440 starting run features_engineer uid=2f1baafc36b44bbea796fe5276c0e27d -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-26 21:05:17,525 Job is running in the background, pod: features-engineer-wpkc5\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ERROR:root:Internal Python error in the inspect module.\n",
+ "Below is the traceback from this internal error.\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 3319, in run_code\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " File \"\", line 6, in \n",
+ " engtsk = engineer.run(task, handler='features_engineer')\n",
+ " File \"/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/runtimes/base.py\", line 262, in run\n",
+ " runspec.logs(True, self._get_db())\n",
+ " File \"/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/model.py\", line 352, in logs\n",
+ " watch=watch)\n",
+ " File \"/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/db/httpdb.py\", line 115, in watch_log\n",
+ " time.sleep(10)\n",
+ "KeyboardInterrupt\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py\", line 2034, in showtraceback\n",
+ " stb = value._render_traceback_()\n",
+ "AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/IPython/core/ultratb.py\", line 1151, in get_records\n",
+ " return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n",
+ " File \"/conda/lib/python3.6/site-packages/IPython/core/ultratb.py\", line 319, in wrapped\n",
+ " return f(*args, **kwargs)\n",
+ " File \"/conda/lib/python3.6/site-packages/IPython/core/ultratb.py\", line 353, in _fixed_getinnerframes\n",
+ " records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n",
+ " File \"/conda/lib/python3.6/inspect.py\", line 1490, in getinnerframes\n",
+ " frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)\n",
+ " File \"/conda/lib/python3.6/inspect.py\", line 1448, in getframeinfo\n",
+ " filename = getsourcefile(frame) or getfile(frame)\n",
+ " File \"/conda/lib/python3.6/inspect.py\", line 696, in getsourcefile\n",
+ " if getattr(getmodule(object, filename), '__loader__', None) is not None:\n",
+ " File \"/conda/lib/python3.6/inspect.py\", line 742, in getmodule\n",
+ " os.path.realpath(f)] = module.__name__\n",
+ " File \"/conda/lib/python3.6/posixpath.py\", line 395, in realpath\n",
+ " path, ok = _joinrealpath(filename[:0], filename, {})\n",
+ " File \"/conda/lib/python3.6/posixpath.py\", line 429, in _joinrealpath\n",
+ " if not islink(newpath):\n",
+ " File \"/conda/lib/python3.6/posixpath.py\", line 171, in islink\n",
+ " st = os.lstat(path)\n",
+ "KeyboardInterrupt\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "task = mlrun.NewTask()\n",
+ "task.with_params(\n",
+ " X='higgs.pqt',\n",
+ " target_path=TARGET_PATH)\n",
+ "\n",
+ "engtsk = engineer.run(task, handler='features_engineer')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/file-utils.ipynb b/tests/file-utils.ipynb
deleted file mode 100644
index d3fdb407f..000000000
--- a/tests/file-utils.ipynb
+++ /dev/null
@@ -1,880 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "import mlrun\n",
- "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# archive to folder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "import urllib3\n",
- "urllib3.disable_warnings()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[mlrun] 2020-01-20 08:36:14,989 starting run download uid=79a5b0f103c24367961cf8c107126dd2 -> http://mlrun-api:8080\n",
- "[mlrun] 2020-01-20 08:36:15,069 Job is running in the background, pod: download-6mg4q\n",
- "[mlrun] 2020-01-20 08:36:19,610 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp\n",
- "[mlrun] 2020-01-20 08:36:21,218 Verified directories\n",
- "[mlrun] 2020-01-20 08:36:21,218 Extracting zip\n",
- "[mlrun] 2020-01-20 08:36:22,988 extracted archive to content\n",
- "[mlrun] 2020-01-20 08:36:23,001 log artifact content at content, size: None, db: Y\n",
- "\n",
- "[mlrun] 2020-01-20 08:36:23,011 run executed, status=completed\n",
- "final state: succeeded\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | uid | \n",
- " iter | \n",
- " start | \n",
- " state | \n",
- " name | \n",
- " labels | \n",
- " inputs | \n",
- " parameters | \n",
- " results | \n",
- " artifacts | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ...126dd2 | \n",
- " 0 | \n",
- " Jan 20 08:36:19 | \n",
- " completed | \n",
- " file_utils | \n",
- " host=download-6mg4q kind=job owner=admin | \n",
- " archive_url | \n",
- " key=contents target_path=/User/mlrun/functions/images | \n",
- " | \n",
- " content | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "to track results use .show() or .logs() or in CLI: \n",
- "!mlrun get run 79a5b0f103c24367961cf8c107126dd2 , !mlrun logs 79a5b0f103c24367961cf8c107126dd2 \n",
- "[mlrun] 2020-01-20 08:36:24,208 run executed, status=completed\n"
- ]
- }
- ],
- "source": [
- "# load function from Github\n",
- "xfn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/open_archive/function.yaml')\n",
- "\n",
- "# configute it: mount on iguazio fabric, set as interactive (return stdout)\n",
- "xfn.apply(mlrun.mount_v3io())\n",
- "xfn.interactive = True\n",
- "\n",
- "# create and run the task\n",
- "\n",
- "images_path = '/User/mlrun/functions/images'\n",
- "\n",
- "open_archive_task = mlrun.NewTask(\n",
- " 'download',\n",
- " handler='open_archive', \n",
- " params={'target_path': images_path,\n",
- " 'key' : 'contents'},\n",
- " inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'}\n",
- ")\n",
- "\n",
- "# run\n",
- "run = xfn.run(open_archive_task)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "_________"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# archive to parquet"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### load and configure function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load function from Github\n",
- "xfn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/arc2parq/fileutils/arc_to_parquet/arc_to_parquet.yaml')\n",
- "\n",
- "# configure function: mount on the Iguazio data fabric, set as interactive (return stdout)\n",
- "xfn.apply(mlrun.mount_v3io())\n",
- "xfn.interactive = True"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### deploy / build"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following triggers a build when run for the first time using specs found in the yaml file above. Unless that file changes, this only needs to be run once, even after the notebook has been restarted:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": true,
- "jupyter": {
- "outputs_hidden": true
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[mlrun] 2020-01-20 05:35:07,015 starting remote build, image: .mlrun/func-default-arc_to_parquet-latest\n",
- "\u001b[36mINFO\u001b[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory \n",
- "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n",
- "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory \n",
- "\u001b[36mINFO\u001b[0m[0000] Downloading base image python:3.6-jessie \n",
- "\u001b[36mINFO\u001b[0m[0001] Unpacking rootfs as cmd RUN pip install -q mlrun requires it. \n",
- "\u001b[36mINFO\u001b[0m[0011] Taking snapshot of full filesystem... \n",
- "\u001b[36mINFO\u001b[0m[0018] RUN pip install -q mlrun \n",
- "\u001b[36mINFO\u001b[0m[0018] cmd: /bin/sh \n",
- "\u001b[36mINFO\u001b[0m[0018] args: [-c pip install -q mlrun] \n",
- "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n",
- "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
- "\u001b[36mINFO\u001b[0m[0065] Taking snapshot of full filesystem... \n",
- "\u001b[36mINFO\u001b[0m[0082] RUN pip install -q pyarrow \n",
- "\u001b[36mINFO\u001b[0m[0082] cmd: /bin/sh \n",
- "\u001b[36mINFO\u001b[0m[0082] args: [-c pip install -q pyarrow] \n",
- "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n",
- "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
- "\u001b[36mINFO\u001b[0m[0086] Taking snapshot of full filesystem... \n",
- "\u001b[36mINFO\u001b[0m[0095] RUN pip install -q numpy \n",
- "\u001b[36mINFO\u001b[0m[0095] cmd: /bin/sh \n",
- "\u001b[36mINFO\u001b[0m[0095] args: [-c pip install -q numpy] \n",
- "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n",
- "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
- "\u001b[36mINFO\u001b[0m[0096] Taking snapshot of full filesystem... \n",
- "\u001b[36mINFO\u001b[0m[0099] RUN pip install -q pandas \n",
- "\u001b[36mINFO\u001b[0m[0099] cmd: /bin/sh \n",
- "\u001b[36mINFO\u001b[0m[0099] args: [-c pip install -q pandas] \n",
- "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n",
- "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
- "\u001b[36mINFO\u001b[0m[0100] Taking snapshot of full filesystem... \n",
- "\u001b[36mINFO\u001b[0m[0102] RUN pip install mlrun \n",
- "\u001b[36mINFO\u001b[0m[0102] cmd: /bin/sh \n",
- "\u001b[36mINFO\u001b[0m[0102] args: [-c pip install mlrun] \n",
- "Requirement already satisfied: mlrun in /usr/local/lib/python3.6/site-packages (0.4.3)\n",
- "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (7.0)\n",
- "Requirement already satisfied: gunicorn==19.9.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (19.9.0)\n",
- "Requirement already satisfied: requests>=2.20.1 in /usr/local/lib/python3.6/site-packages (from mlrun) (2.22.0)\n",
- "Requirement already satisfied: aiohttp>=3.5.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (3.6.2)\n",
- "Requirement already satisfied: sqlalchemy==1.3.11 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.3.11)\n",
- "Requirement already satisfied: gevent==1.4.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.4.0)\n",
- "Requirement already satisfied: nuclio-jupyter>=0.8.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.8.0)\n",
- "Requirement already satisfied: kfp>=0.1.29 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.1.40)\n",
- "Requirement already satisfied: GitPython>=2.1.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (3.0.5)\n",
- "Requirement already satisfied: Flask>=1.1.1 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.1.1)\n",
- "Requirement already satisfied: pandas>=0.23.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.25.3)\n",
- "Requirement already satisfied: tabulate<=0.8.3,>=0.8.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.8.3)\n",
- "Requirement already satisfied: boto3>=1.9 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.11.5)\n",
- "Requirement already satisfied: pyyaml>=5.1.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (5.3)\n",
- "Requirement already satisfied: nest-asyncio>=1.0.0 in /usr/local/lib/python3.6/site-packages (from mlrun) (1.2.2)\n",
- "Requirement already satisfied: nuclio-sdk>=0.0.3 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.0.5)\n",
- "Requirement already satisfied: croniter==0.3.31 in /usr/local/lib/python3.6/site-packages (from mlrun) (0.3.31)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (2019.11.28)\n",
- "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (3.0.4)\n",
- "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (2.8)\n",
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/site-packages (from requests>=2.20.1->mlrun) (1.24.3)\n",
- "Requirement already satisfied: async-timeout<4.0,>=3.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.1)\n",
- "Requirement already satisfied: idna-ssl>=1.0; python_version < \"3.7\" in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (1.1.0)\n",
- "Requirement already satisfied: multidict<5.0,>=4.5 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (4.7.4)\n",
- "Requirement already satisfied: typing-extensions>=3.6.5; python_version < \"3.7\" in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (3.7.4.1)\n",
- "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (1.4.2)\n",
- "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.6/site-packages (from aiohttp>=3.5.0->mlrun) (19.3.0)\n",
- "Requirement already satisfied: greenlet>=0.4.14; platform_python_implementation == \"CPython\" in /usr/local/lib/python3.6/site-packages (from gevent==1.4.0->mlrun) (0.4.15)\n",
- "Requirement already satisfied: notebook>=5.7.2 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (6.0.2)\n",
- "Requirement already satisfied: jupyterlab>=0.35.4 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (1.2.5)\n",
- "Requirement already satisfied: tornado<6,>=5 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.1.1)\n",
- "Requirement already satisfied: ipython>=7.2 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (7.11.1)\n",
- "Requirement already satisfied: nbconvert>=5.4 in /usr/local/lib/python3.6/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.6.1)\n",
- "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.8.1)\n",
- "Requirement already satisfied: argo-models==2.2.1a in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.2.1a0)\n",
- "Requirement already satisfied: cloudpickle==1.1.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.1.1)\n",
- "Requirement already satisfied: kubernetes<=10.0.0,>=8.0.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (10.0.0)\n",
- "Requirement already satisfied: Deprecated in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.2.7)\n",
- "Requirement already satisfied: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.25.0)\n",
- "Requirement already satisfied: google-auth>=1.6.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.10.1)\n",
- "Requirement already satisfied: requests-toolbelt>=0.8.0 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (0.9.1)\n",
- "Requirement already satisfied: PyJWT>=1.6.4 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.7.1)\n",
- "Requirement already satisfied: jsonschema>=3.0.1 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (3.2.0)\n",
- "Requirement already satisfied: cryptography>=2.4.2 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (2.8)\n",
- "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (1.14.0)\n",
- "Requirement already satisfied: kfp-server-api<=0.1.40,>=0.1.18 in /usr/local/lib/python3.6/site-packages (from kfp>=0.1.29->mlrun) (0.1.40)\n",
- "Requirement already satisfied: gitdb2>=2.0.0 in /usr/local/lib/python3.6/site-packages (from GitPython>=2.1.0->mlrun) (2.0.6)\n",
- "Requirement already satisfied: Werkzeug>=0.15 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (0.16.0)\n",
- "Requirement already satisfied: itsdangerous>=0.24 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (1.1.0)\n",
- "Requirement already satisfied: Jinja2>=2.10.1 in /usr/local/lib/python3.6/site-packages (from Flask>=1.1.1->mlrun) (2.10.3)\n",
- "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/site-packages (from pandas>=0.23.0->mlrun) (1.18.1)\n",
- "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/site-packages (from pandas>=0.23.0->mlrun) (2019.3)\n",
- "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (0.9.4)\n",
- "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (0.3.1)\n",
- "Requirement already satisfied: botocore<1.15.0,>=1.14.5 in /usr/local/lib/python3.6/site-packages (from boto3>=1.9->mlrun) (1.14.5)\n",
- "Requirement already satisfied: jupyter-client>=5.3.4 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.3.4)\n",
- "Requirement already satisfied: traitlets>=4.2.1 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (4.3.3)\n",
- "Requirement already satisfied: Send2Trash in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (1.5.0)\n",
- "Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.8.3)\n",
- "Requirement already satisfied: pyzmq>=17 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (18.1.1)\n",
- "Requirement already satisfied: prometheus-client in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.1)\n",
- "Requirement already satisfied: jupyter-core>=4.6.0 in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (4.6.1)\n",
- "Requirement already satisfied: nbformat in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.0.3)\n",
- "Requirement already satisfied: ipykernel in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.1.3)\n",
- "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.2.0)\n",
- "Requirement already satisfied: jupyterlab-server~=1.0.0 in /usr/local/lib/python3.6/site-packages (from jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (1.0.6)\n",
- "Requirement already satisfied: jedi>=0.10 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.15.2)\n",
- "Requirement already satisfied: pygments in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (2.5.2)\n",
- "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (3.0.2)\n",
- "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (41.0.1)\n",
- "Requirement already satisfied: backcall in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.0)\n",
- "Requirement already satisfied: decorator in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.4.1)\n",
- "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.7.0)\n",
- "Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.5)\n",
- "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (1.4.2)\n",
- "Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.3)\n",
- "Requirement already satisfied: testpath in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.4.4)\n",
- "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.4)\n",
- "Requirement already satisfied: defusedxml in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
- "Requirement already satisfied: bleach in /usr/local/lib/python3.6/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (3.1.0)\n",
- "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (0.57.0)\n",
- "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (1.3.0)\n",
- "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.6/site-packages (from Deprecated->kfp>=0.1.29->mlrun) (1.11.2)\n",
- "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (0.5.0)\n",
- "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.2.0)\n",
- "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.2.8)\n",
- "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0)\n",
- "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0.0)\n",
- "Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (0.15.7)\n",
- "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (1.4.0)\n",
- "Requirement already satisfied: cffi!=1.11.3,>=1.8 in /usr/local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (1.13.2)\n",
- "Requirement already satisfied: smmap2>=2.0.0 in /usr/local/lib/python3.6/site-packages (from gitdb2>=2.0.0->GitPython>=2.1.0->mlrun) (2.0.5)\n",
- "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/site-packages (from Jinja2>=2.10.1->Flask>=1.1.1->mlrun) (1.1.1)\n",
- "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/site-packages (from botocore<1.15.0,>=1.14.5->boto3>=1.9->mlrun) (0.15.2)\n",
- "Requirement already satisfied: ptyprocess; os_name != \"nt\" in /usr/local/lib/python3.6/site-packages (from terminado>=0.8.1->notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
- "Requirement already satisfied: json5 in /usr/local/lib/python3.6/site-packages (from jupyterlab-server~=1.0.0->jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.5)\n",
- "Requirement already satisfied: parso>=0.5.2 in /usr/local/lib/python3.6/site-packages (from jedi>=0.10->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.5.2)\n",
- "Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.8)\n",
- "Requirement already satisfied: webencodings in /usr/local/lib/python3.6/site-packages (from bleach->nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.5.1)\n",
- "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/site-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (3.1.0)\n",
- "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/site-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.16.0)\n",
- "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.4.8)\n",
- "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/site-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (2.0.0)\n",
- "Requirement already satisfied: pycparser in /usr/local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp>=0.1.29->mlrun) (2.19)\n",
- "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.51.0)\n",
- "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (3.11.2)\n",
- "Requirement already satisfied: more-itertools in /usr/local/lib/python3.6/site-packages (from zipp>=0.5->importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (8.1.0)\n",
- "WARNING: You are using pip version 19.1.1, however version 19.3.1 is available.\n",
- "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
- "\u001b[36mINFO\u001b[0m[0103] Taking snapshot of full filesystem... \n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "xfn.deploy()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Also note that the build time can be reduced if you specifiy a pre-built image with all required packages."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# useful constants\n",
- "target_path = '/User/mlrun/functions/parquet'\n",
- "archive = 'https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz'\n",
- "parquet_file = 'x_test_50.parquet' # the file extension is not necessary\n",
- "parquet_file_path = target_path + \"/\" + parquet_file\n",
- "artifact_key = 'raw_data'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[mlrun] 2020-01-20 05:38:21,743 starting run arc2parq uid=42af41d93f294cd09aace4942d25b106 -> http://mlrun-api:8080\n",
- "[mlrun] 2020-01-20 05:38:21,823 Job is running in the background, pod: arc2parq-96gmq\n",
- "[mlrun] 2020-01-20 05:38:37,072 destination file exists\n",
- "[mlrun] 2020-01-20 05:38:37,072 logging /User/mlrun/functions/parquet/x_test_50.parquet to context\n",
- "[mlrun] 2020-01-20 05:38:37,083 log artifact raw_data at /User/mlrun/functions/parquet/x_test_50.parquet, size: None, db: Y\n",
- "\n",
- "[mlrun] 2020-01-20 05:38:37,094 run executed, status=completed\n",
- "final state: succeeded\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | uid | \n",
- " iter | \n",
- " start | \n",
- " state | \n",
- " name | \n",
- " labels | \n",
- " inputs | \n",
- " parameters | \n",
- " results | \n",
- " artifacts | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ...25b106 | \n",
- " 0 | \n",
- " Jan 20 05:38:37 | \n",
- " completed | \n",
- " arc_to_parquet | \n",
- " host=arc2parq-96gmq kind=job owner=admin | \n",
- " | \n",
- " archive_url=https://fpsignals-public.s3.amazonaws.com/x_test_50.csv.gz key=raw_data name=x_test_50.parquet target_path=/User/mlrun/functions/parquet | \n",
- " | \n",
- " raw_data | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "to track results use .show() or .logs() or in CLI: \n",
- "!mlrun get run 42af41d93f294cd09aace4942d25b106 , !mlrun logs 42af41d93f294cd09aace4942d25b106 \n",
- "[mlrun] 2020-01-20 05:38:40,974 run executed, status=completed\n"
- ]
- }
- ],
- "source": [
- "# create and run the task\n",
- "arc_to_parq_task = mlrun.NewTask(\n",
- " 'arc2parq', \n",
- " handler='arc_to_parquet', # a string since we are calling this 'remotely', outside this notebook\n",
- " params={\n",
- " 'target_path': target_path,\n",
- " 'name' : parquet_file, \n",
- " 'key' : artifact_key,\n",
- " 'archive_url': archive},\n",
- " outputs=[artifact_key])\n",
- "\n",
- "# run\n",
- "run = xfn.run(arc_to_parq_task)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "___"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### tests"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import numpy as np\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# add more context tests\n",
- "# convert these to real tests"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "assert artifact_key in run.outputs.keys(), f\"mlrun.functions: key {artifact_key} not fond in outputs\"\n",
- "assert os.path.isfile(parquet_file_path), f\"mlrun.functions: artifact source not found at {parquet_file_path}\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "original = pd.read_csv(archive)\n",
- "copied = pd.read_parquet(parquet_file_path, engine=\"pyarrow\")\n",
- "assert np.array_equal(original, copied), \"mlrun.functions: original and copied data not equal\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### cleanup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "os.remove(parquet_file_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/tests/open_archive.ipynb b/tests/open_archive.ipynb
new file mode 100644
index 000000000..42ff00d0e
--- /dev/null
+++ b/tests/open_archive.ipynb
@@ -0,0 +1,1001 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !python -m pip uninstall -y mlrun\n",
+ "# !python -m pip install mlrun"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# archive to folder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nuclio: ignore\n",
+ "import nuclio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import urllib.request\n",
+ "# urllib.disable_warnings()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import zipfile\n",
+ "import urllib\n",
+ "import tarfile\n",
+ "import json\n",
+ "\n",
+ "from mlrun.execution import MLClientCtx\n",
+ "\n",
+ "def open_archive(context: MLClientCtx, \n",
+ " target_dir: str = 'content',\n",
+ " archive_url: str = ''):\n",
+ " \"\"\"Open a file/object archive into a target directory\n",
+ " \n",
+ " Currently supports zip and tar.gz\n",
+ " \"\"\"\n",
+ " # Define locations\n",
+ " os.makedirs(target_dir, exist_ok=True)\n",
+ " context.logger.info('Verified directories')\n",
+ " print(archive_url)\n",
+ " splits = archive_url.split('.')\n",
+ " print(splits)\n",
+ " if (splits[-1] == 'gz'):\n",
+ " # Extract dataset from tar\n",
+ " context.logger.info('opening tar_gz')\n",
+ " ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')\n",
+ " elif splits[-1] == 'zip':\n",
+ " # Extract dataset from zip\n",
+ " context.logger.info('opening zip')\n",
+ " ref = zipfile.ZipFile(archive_url, 'r')\n",
+ "\n",
+ " ref.extractall(target_dir)\n",
+ " ref.close()\n",
+ "\n",
+ " context.log_artifact('content', target_path=target_dir)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# nuclio: end-code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create job function object from notebook code\n",
+ "fn = mlrun.code_to_function(\n",
+ " 'open_archive', \n",
+ " runtime='job', \n",
+ " handler=open_archive, \n",
+ " image='mlrun/mlrun:latest')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# export function yaml\n",
+ "# fn.export('/User/repos/functions/fileutils/open_archive/function.yaml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import function yaml\n",
+ "# fn = mlrun.import_function('/User/repos/functions/fileutils/open_archive/function.yaml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
+ " InsecureRequestWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load function from Github\n",
+ "fn = mlrun.import_function('https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils/open_archive/function.yaml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# configure it: mount on iguazio fabric, set as interactive (return stdout)\n",
+ "fn.apply(mlrun.mount_v3io())\n",
+ "fn.interactive = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### zip file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the task (it is run in the next cell)\n",
+ "images_path = '/User/mlrun/functions/images'\n",
+ "\n",
+ "open_archive_task = mlrun.NewTask(\n",
+ " 'download-zip',\n",
+ " handler='open_archive', \n",
+ " params={'target_dir' : images_path,\n",
+ " 'key' : 'contents'},\n",
+ " inputs={'archive_url': 'http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-21 19:19:43,612 starting run download uid=31c5db9ef8174d40ac94c6dad0258069 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-21 19:19:43,808 Job is running in the background, pod: download-tcrfc\n",
+ "[mlrun] 2020-01-21 19:20:04,079 downloading http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip to local tmp\n",
+ "[mlrun] 2020-01-21 19:20:05,501 Verified directories\n",
+ "/tmp/tmp4_eoapfc.zip\n",
+ "['/tmp/tmp4_eoapfc', 'zip']\n",
+ "[mlrun] 2020-01-21 19:20:05,501 opening zip\n",
+ "[mlrun] 2020-01-21 19:20:13,406 log artifact content at /User/mlrun/functions/images, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-21 19:20:13,416 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...258069 | \n",
+ " 0 | \n",
+ " Jan 21 19:20:04 | \n",
+ " completed | \n",
+ " open-archive | \n",
+ " host=download-tcrfc kind=job owner=admin | \n",
+ " archive_url | \n",
+ " key=contents target_dir=/User/mlrun/functions/images | \n",
+ " | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 31c5db9ef8174d40ac94c6dad0258069 , !mlrun logs 31c5db9ef8174d40ac94c6dad0258069 \n",
+ "[mlrun] 2020-01-21 19:20:16,127 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# run\n",
+ "run = fn.run(open_archive_task)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### tar.gz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the task (it is run in a separate cell)\n",
+ "images_path = '/User/mlrun/functions/images-from-tar'\n",
+ "\n",
+ "open_archive_task = mlrun.NewTask(\n",
+ " 'download-tar',\n",
+ " handler='open_archive', \n",
+ " params={'target_dir' : images_path,\n",
+ " 'key' : 'contents',\n",
+ " 'archive_url': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-21 19:22:37,587 starting run download-tar uid=500c634fd1c546c5a58292d37f50320f -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-21 19:22:37,659 Job is running in the background, pod: download-tar-zh72r\n",
+ "[mlrun] 2020-01-21 19:22:42,412 Verified directories\n",
+ "https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz\n",
+ "['https://fpsignals-public', 's3', 'amazonaws', 'com/catsndogs', 'tar', 'gz']\n",
+ "[mlrun] 2020-01-21 19:22:42,412 opening tar_gz\n",
+ "[mlrun] 2020-01-21 19:22:57,936 log artifact content at /User/mlrun/functions/images-from-tar, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-21 19:22:57,948 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...50320f | \n",
+ " 0 | \n",
+ " Jan 21 19:22:42 | \n",
+ " completed | \n",
+ " open-archive | \n",
+ " host=download-tar-zh72r kind=job owner=admin | \n",
+ " | \n",
+ " archive_url=https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz key=contents target_dir=/User/mlrun/functions/images-from-tar | \n",
+ " | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 500c634fd1c546c5a58292d37f50320f , !mlrun logs 500c634fd1c546c5a58292d37f50320f \n",
+ "[mlrun] 2020-01-21 19:23:06,873 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# run\n",
+ "run = fn.run(open_archive_task)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create and run the task\n",
+ "images_path = '/User/mlrun/functions/images-from-tar-as-inputs'\n",
+ "\n",
+ "open_archive_task = mlrun.NewTask(\n",
+ " 'download',\n",
+ " handler='open_archive', \n",
+ " params={'target_dir' : images_path,\n",
+ " 'key' : 'contents'},\n",
+ " inputs={'archive_url': 'https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-21 19:23:39,448 starting run download uid=c163869b83cd49cc888f5e9126301911 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-21 19:23:39,535 Job is running in the background, pod: download-7qf2w\n",
+ "[mlrun] 2020-01-21 19:23:44,057 downloading https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz to local tmp\n",
+ "[mlrun] 2020-01-21 19:23:44,877 Verified directories\n",
+ "/tmp/tmptshxsk7d.gz\n",
+ "['/tmp/tmptshxsk7d', 'gz']\n",
+ "[mlrun] 2020-01-21 19:23:44,877 opening tar_gz\n",
+ "[mlrun] 2020-01-21 19:23:44,879 Traceback (most recent call last):\n",
+ " File \"/usr/local/lib/python3.6/site-packages/mlrun-0.4.3-py3.6.egg/mlrun/runtimes/local.py\", line 174, in exec_from_params\n",
+ " val = handler(*args_list)\n",
+ " File \"main.py\", line 30, in open_archive\n",
+ " ref = tarfile.open(fileobj=urllib.request.urlopen(archive_url), mode='r|gz')\n",
+ " File \"/usr/local/lib/python3.6/urllib/request.py\", line 223, in urlopen\n",
+ " return opener.open(url, data, timeout)\n",
+ " File \"/usr/local/lib/python3.6/urllib/request.py\", line 511, in open\n",
+ " req = Request(fullurl, data)\n",
+ " File \"/usr/local/lib/python3.6/urllib/request.py\", line 329, in __init__\n",
+ " self.full_url = url\n",
+ " File \"/usr/local/lib/python3.6/urllib/request.py\", line 355, in full_url\n",
+ " self._parse()\n",
+ " File \"/usr/local/lib/python3.6/urllib/request.py\", line 384, in _parse\n",
+ " raise ValueError(\"unknown url type: %r\" % self.full_url)\n",
+ "ValueError: unknown url type: '/tmp/tmptshxsk7d.gz'\n",
+ "\n",
+ "\n",
+ "[mlrun] 2020-01-21 19:23:44,891 exec error - unknown url type: '/tmp/tmptshxsk7d.gz'\n",
+ "[mlrun] 2020-01-21 19:23:44,917 run executed, status=error\n",
+ "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
+ " InsecureRequestWarning)\n",
+ "unknown url type: '/tmp/tmptshxsk7d.gz'\n",
+ "runtime error: unknown url type: '/tmp/tmptshxsk7d.gz'\n",
+ "final state: failed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...301911 | \n",
+ " 0 | \n",
+ " Jan 21 19:23:44 | \n",
+ " error | \n",
+ " open-archive | \n",
+ " host=download-7qf2w kind=job owner=admin | \n",
+ " archive_url | \n",
+ " key=contents target_dir=/User/mlrun/functions/images-from-tar-as-inputs | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run c163869b83cd49cc888f5e9126301911 , !mlrun logs c163869b83cd49cc888f5e9126301911 \n",
+ "[mlrun] 2020-01-21 19:23:48,687 run executed, status=error\n",
+ "runtime error: unknown url type: '/tmp/tmptshxsk7d.gz'\n"
+ ]
+ },
+ {
+ "ename": "RunError",
+ "evalue": "unknown url type: '/tmp/tmptshxsk7d.gz'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mRunError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# run\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mrun\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen_archive_task\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/runtimes/base.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, runspec, handler, name, project, params, inputs, out_path, workdir, watch, schedule)\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_post_run\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrunspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_wrap_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrunspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 268\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_wrap_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrunspec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 269\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_remote\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_api_server\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkfp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/runtimes/base.py\u001b[0m in \u001b[0;36m_wrap_result\u001b[0;34m(self, result, runspec, err)\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_remote\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_child\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'runtime error: {}'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRunError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mRunError\u001b[0m: unknown url type: '/tmp/tmptshxsk7d.gz'"
+ ]
+ }
+ ],
+ "source": [
+ "# run\n",
+ "run = fn.run(open_archive_task)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/parquet_to_dask.ipynb b/tests/parquet_to_dask.ipynb
new file mode 100644
index 000000000..69c667816
--- /dev/null
+++ b/tests/parquet_to_dask.ipynb
@@ -0,0 +1,1158 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# parquet to dask\n",
+ "load a parquet dataset into a dask cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'\n",
+ "mlrun.mlconf.remote_host = '3.133.8.252' "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FUNCTION = 'parquet_to_dask'\n",
+ "DESCRIPTION = 'load parquet dataset into a dask cluster'\n",
+ "\n",
+ "BASE_IMAGE = 'yjbds/mlrun-daskboost:dev'\n",
+ "JOB_KIND = 'dask'\n",
+ "TASK_NAME = 'user-task-parq-to-dask'\n",
+ "\n",
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/fileutils'\n",
+ "\n",
+ "SRC_PATH = '/User/mlrun/airlines/dataset-small/partitions'\n",
+ "\n",
+ "PARTITION_COLS = ['Year', 'Month']\n",
+ "\n",
+ "DASK_SHARDS = 4\n",
+ "DASK_THREADS_PER = 4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## load and configure function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "func_py = os.path.join(CODE_BASE, FUNCTION, 'function.py')\n",
+ "func_yaml = os.path.join(CODE_BASE, FUNCTION, 'function.yaml')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**If running for the first time, create the function:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load function from a local Python file\n",
+ "parq2dask = mlrun.new_function(command=func_py, kind=JOB_KIND)\n",
+ "\n",
+ "parq2dask.spec.remote = True\n",
+ "parq2dask.spec.replicas = 4 \n",
+ "parq2dask.spec.max_replicas = 4\n",
+ "parq2dask.spec.service_type = 'NodePort'\n",
+ "parq2dask.spec.build.base_image = BASE_IMAGE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 09:38:16,699 function spec saved to path: /User/repos/functions/fileutils/parquet_to_dask/function.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "parq2dask.export(func_yaml)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**otherwise load it:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 09:38:18,145 starting remote build, image: .mlrun/func-default-function-latest\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "parq2dask = mlrun.import_function(func_yaml)\n",
+ "\n",
+ "parq2dask.apply(mlrun.mount_v3io())\n",
+ "\n",
+ "parq2dask.deploy() # skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 09:38:20,399 starting run user-task-parq-to-dask uid=8d780f8755984477975fb16927110af1 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-30 09:38:21,436 saving function: function, tag: latest\n",
+ "[mlrun] 2020-01-30 09:38:27,297 using remote dask scheduler (mlrun-function-90bd99ce-2) at: 3.133.8.252:30417\n",
+ "[mlrun] 2020-01-30 09:38:27,298 remote dashboard (node) port: 3.133.8.252:30164\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/conda/lib/python3.6/site-packages/distributed/client.py:1074: VersionMismatchWarning: Mismatched versions found\n",
+ "\n",
+ "dask\n",
+ "+-----------+---------+\n",
+ "| | version |\n",
+ "+-----------+---------+\n",
+ "| client | 2.9.2 |\n",
+ "| scheduler | 2.10.0 |\n",
+ "+-----------+---------+\n",
+ "\n",
+ "distributed\n",
+ "+-----------+---------+\n",
+ "| | version |\n",
+ "+-----------+---------+\n",
+ "| client | 2.9.3 |\n",
+ "| scheduler | 2.10.0 |\n",
+ "+-----------+---------+\n",
+ "\n",
+ "msgpack\n",
+ "+-----------+---------+\n",
+ "| | version |\n",
+ "+-----------+---------+\n",
+ "| client | 0.6.2 |\n",
+ "| scheduler | 0.6.1 |\n",
+ "+-----------+---------+\n",
+ " warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 09:38:27,301 found cluster...\n",
+ "[mlrun] 2020-01-30 09:38:27,301 \n",
+ "[mlrun] 2020-01-30 09:38:27,636 log artifact scheduler at /User/mlrun/models/scheduler.json, size: None, db: Y\n",
+ " Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime \\\n",
+ "0 1997 1 7 2 1020.0 1020 1123.0 \n",
+ "1 1997 1 8 3 1107.0 1020 1205.0 \n",
+ "2 1997 1 9 4 1020.0 1020 1130.0 \n",
+ "3 1997 1 10 5 1020.0 1020 1123.0 \n",
+ "4 1997 1 12 7 1020.0 1020 1134.0 \n",
+ "\n",
+ " CRSArrTime UniqueCarrier FlightNum ... Dest Distance TaxiIn TaxiOut \\\n",
+ "0 1130 WN 1293 ... PHX 328.0 2.0 5.0 \n",
+ "1 1130 WN 1293 ... PHX 328.0 3.0 9.0 \n",
+ "2 1130 WN 1293 ... PHX 328.0 3.0 8.0 \n",
+ "3 1130 WN 1293 ... PHX 328.0 2.0 5.0 \n",
+ "4 1130 WN 1293 ... PHX 328.0 2.0 7.0 \n",
+ "\n",
+ " Cancelled CarrierDelay WeatherDelay NASDelay SecurityDelay \\\n",
+ "0 0 NaN NaN NaN NaN \n",
+ "1 0 NaN NaN NaN NaN \n",
+ "2 0 NaN NaN NaN NaN \n",
+ "3 0 NaN NaN NaN NaN \n",
+ "4 0 NaN NaN NaN NaN \n",
+ "\n",
+ " LateAircraftDelay \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 NaN \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "\n",
+ "[5 rows x 23 columns]\n",
+ "\n",
+ "[mlrun] 2020-01-30 09:38:35,050 run ended with state \n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...110af1 | \n",
+ " 0 | \n",
+ " Jan 30 09:38:21 | \n",
+ " completed | \n",
+ " user-task-parq-to-dask | \n",
+ " kind=dask owner=admin host=jupyter-1-6ccccd5fdf-mz2ld | \n",
+ " | \n",
+ " parquet_url=/User/mlrun/airlines/dataset-small/partitions index_cols=['Year', 'Month'] shards=4 threads_per=4 persist=True dask_key=testdf1 target_path=/User/mlrun/models | \n",
+ " | \n",
+ " scheduler | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 8d780f8755984477975fb16927110af1 , !mlrun logs 8d780f8755984477975fb16927110af1 \n",
+ "[mlrun] 2020-01-30 09:38:35,085 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# create and run the task\n",
+ "parq_to_dask_task = mlrun.NewTask(\n",
+ " TASK_NAME, \n",
+ " handler=FUNCTION, \n",
+ " params={\n",
+ " 'parquet_url': SRC_PATH,\n",
+ " 'index_cols' : PARTITION_COLS,\n",
+ " 'shards' : DASK_SHARDS,\n",
+ " 'threads_per': DASK_THREADS_PER,\n",
+ " 'persist' : True,\n",
+ " 'dask_key' : 'testdf1',\n",
+ " 'target_path': '/User/mlrun/models'})\n",
+ "# run\n",
+ "rn = parq2dask.run(parq_to_dask_task)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'/User/mlrun/models/scheduler.json'"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rn.outputs['scheduler']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### What's the scheduler address?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'type': 'Scheduler',\n",
+ " 'id': 'Scheduler-e216939d-7eaf-4946-98dc-29a0b571b1e2',\n",
+ " 'address': 'tcp://10.233.64.55:8786',\n",
+ " 'services': {},\n",
+ " 'workers': {}}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import json\n",
+ "json.load(open(rn.outputs['scheduler']))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### create a component 'on the fly' to summarise the table\n",
+ "\n",
+ "The nice thing about having a dask cluster loaded with all your data is that you can write _quick and dirty_ jobs either in your notebook, a local file, or a GitHub repo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-30 09:38:53,769 function spec saved to path: /User/repos/functions/tests/describe.yaml\n",
+ "[mlrun] 2020-01-30 09:38:53,822 starting run user-task-my-sum uid=5a52e1a6009647848d71dd211b741ee8 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-30 09:38:53,905 Job is running in the background, pod: user-task-my-sum-k5nsk\n",
+ "[mlrun] 2020-01-30 09:39:04,332 log artifact table-summary at /User/mlrun/models/table-summary.csv, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-30 09:39:04,347 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...741ee8 | \n",
+ " 0 | \n",
+ " Jan 30 09:39:02 | \n",
+ " completed | \n",
+ " describe | \n",
+ " host=user-task-my-sum-k5nsk kind=job owner=admin | \n",
+ " | \n",
+ " dask_client=/User/mlrun/models/scheduler.json dask_key=testdf1 key=table-summary name=table-summary.csv target_path=/User/mlrun/models | \n",
+ " | \n",
+ " table-summary | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 5a52e1a6009647848d71dd211b741ee8 , !mlrun logs 5a52e1a6009647848d71dd211b741ee8 \n",
+ "[mlrun] 2020-01-30 09:39:13,120 run executed, status=completed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'table-summary': '/User/mlrun/models/table-summary.csv'}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write up function in local directory\n",
+ "summ = mlrun.new_function(command='https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/tests/describe.py', \n",
+ " kind='job')\n",
+ "# specify a base image\n",
+ "summ.spec.build.base_image = BASE_IMAGE\n",
+ "\n",
+ "# (optional) export it as yaml\n",
+ "summ.export('/User/repos/functions/tests/describe.yaml')\n",
+ "\n",
+ "# mount it on iguazio data fabric\n",
+ "summ.apply(mlrun.mount_v3io())\n",
+ "\n",
+ "# deploy the function\n",
+ "summ.deploy(skip_deployed=True, with_mlrun=False)\n",
+ "\n",
+ "# create the task\n",
+ "summ_task = mlrun.NewTask(\n",
+ " 'user-task-my-sum', \n",
+ " handler='table_summary', \n",
+ " params={\n",
+ " 'dask_key' : 'testdf1',\n",
+ " 'dask_client': rn.outputs['scheduler'],\n",
+ " 'target_path': '/User/mlrun/models',\n",
+ " 'name' : 'table-summary.csv',\n",
+ " 'key' : 'table-summary'})\n",
+ "\n",
+ "# run\n",
+ "rn2 = summ.run(summ_task)\n",
+ "\n",
+ "rn2.outputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## our cluster"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dask.distributed import Client, LocalCluster\n",
+ "\n",
+ "client = Client(scheduler_file='/User/mlrun/models/scheduler.json') # Client(scheduler_file=rn.outputs['scheduler'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = client.get_dataset('dask_key')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "175912"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape[0].compute()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{\"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 7)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 6)\": 1159726,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 19)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 14)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 16)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 8)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 9)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 11)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 3)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 5)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 15)\": 1159726,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 0)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 1)\": 1159726,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 12)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 4)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 18)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 10)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 2)\": 2912128,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 17)\": 1159726,\n",
+ " \"('read-parquet-d8c9ad5a8e529c3979516dc1ad71970c', 13)\": 2912128}"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "client.nbytes(summary=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'tcp://10.233.64.56:38718': 1,\n",
+ " 'tcp://10.233.64.57:36325': 1,\n",
+ " 'tcp://10.233.64.58:38383': 1,\n",
+ " 'tcp://10.233.64.59:44139': 1}"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "client.ncores()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'tcp://10.233.64.56:38718': 1,\n",
+ " 'tcp://10.233.64.57:36325': 1,\n",
+ " 'tcp://10.233.64.58:38383': 1,\n",
+ " 'tcp://10.233.64.59:44139': 1}"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "client.nthreads()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'tcp://10.233.64.56:38718': (),\n",
+ " 'tcp://10.233.64.57:36325': (),\n",
+ " 'tcp://10.233.64.58:38383': (),\n",
+ " 'tcp://10.233.64.59:44139': ()}"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "distributed.client - ERROR - Failed to reconnect to scheduler after 3.00 seconds, closing client\n",
+ "distributed.client - ERROR - Failed to reconnect to scheduler after 3.00 seconds, closing client\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1025, in _reconnect\n",
+ " await self._close()\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1025, in _reconnect\n",
+ " await self._close()\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client\n",
+ "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client\n",
+ "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1025, in _reconnect\n",
+ " await self._close()\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1025, in _reconnect\n",
+ " await self._close()\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n",
+ "distributed.utils - ERROR - \n",
+ "Traceback (most recent call last):\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/utils.py\", line 662, in log_errors\n",
+ " yield\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1025, in _reconnect\n",
+ " await self._close()\n",
+ " File \"/conda/lib/python3.6/site-packages/distributed/client.py\", line 1311, in _close\n",
+ " await gen.with_timeout(timedelta(seconds=2), list(coroutines))\n",
+ " File \"/conda/lib/python3.6/asyncio/tasks.py\", line 250, in _wakeup\n",
+ " future.result()\n",
+ "concurrent.futures._base.CancelledError\n"
+ ]
+ }
+ ],
+ "source": [
+ "client.processing()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "____\n",
+ "\n",
+ "# tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import pyarrow as pa\n",
+ "import pyarrow.parquet as pq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import dask\n",
+ "import dask.dataframe as dd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = pq.ParquetDataset(os.path.join(SRC_PATH))\n",
+ "df = dataset.read().to_pandas()\n",
+ "\n",
+ "\n",
+ "ddf = dd.read_parquet(SRC_PATH) #+'/*.parquet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ddf = ddf.persist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ddf.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ddf.shape[0].compute()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/test_classifier.ipynb b/tests/test_classifier.ipynb
new file mode 100644
index 000000000..2bba2b6c0
--- /dev/null
+++ b/tests/test_classifier.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# test a model\n",
+ "\n",
+ "Test your model right after training in a kubeflow pipeline, or run this function independently. In addition, the plotting components in **[test_classifier.py]()** can also be run independently."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "import numpy as np\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving'\n",
+ "\n",
+ "MODEL_FILE = '/User/mlrun/models/lgb-classifier.pkl'\n",
+ "\n",
+ "TARGET_DATA_PATH = '/User/mlrun/models'\n",
+ "XTEST_FILE = '/User/mlrun/models/xtest.pqt'\n",
+ "YTEST_FILE = '/User/mlrun/models/ytest.pqt'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## run tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:24:09,737 function spec saved to path: /User/repos/functions/evaluation/test-classifier.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "testfn = mlrun.code_to_function(\n",
+ " kind='job', \n",
+ " image='yjbds/mlrun-ds:latest',\n",
+ " filename=os.path.join(CODE_BASE, 'evaluation', 'test-classifier.py'))\n",
+ "testfn.build_config(base_image='yjbds/mlrun-ds:latest', commands=[])\n",
+ "\n",
+ "testfn.export(os.path.join(CODE_BASE, 'evaluation', 'test-classifier.yaml'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testfn = mlrun.import_function(\n",
+ " os.path.join(CODE_BASE, 'evaluation', 'test-classifier.yaml')\n",
+ ").apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ready'"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "testfn.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "task = mlrun.NewTask()\n",
+ "task.with_params(\n",
+ " model=MODEL_FILE,\n",
+ " xtest=XTEST_FILE,\n",
+ " ytest=YTEST_FILE,\n",
+ " target_path=TARGET_DATA_PATH)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:27:16,202 starting run test_model uid=60d2146665834f8ba1ca829227156ac8 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-26 19:27:16,305 Job is running in the background, pod: test-model-w96c5\n",
+ "[mlrun] 2020-01-26 19:27:27,264 log artifact roc.html at roc.html, size: 41071, db: Y\n",
+ "[mlrun] 2020-01-26 19:27:28,585 log artifact confusion_matrix.html at confusion_matrix.html, size: 14016, db: Y\n",
+ "[mlrun] 2020-01-26 19:27:29,158 log artifact feature-importances-plot.html at feature-importances-plot.html, size: 71976, db: Y\n",
+ "[mlrun] 2020-01-26 19:27:29,177 log artifact feature-importances-table at /User/mlrun/models/feature-importances-table.csv, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-26 19:27:29,251 run executed, status=completed\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...156ac8 | \n",
+ " 0 | \n",
+ " Jan 26 19:27:24 | \n",
+ " completed | \n",
+ " test-classifier | \n",
+ " host=test-model-w96c5 kind=job owner=admin | \n",
+ " | \n",
+ " model=/User/mlrun/models/lgb-classifier.pkl target_path=/User/mlrun/models xtest=/User/mlrun/models/xtest.pqt ytest=/User/mlrun/models/ytest.pqt | \n",
+ " | \n",
+ " roc.html confusion_matrix.html feature-importances-plot.html feature-importances-table | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 60d2146665834f8ba1ca829227156ac8 , !mlrun logs 60d2146665834f8ba1ca829227156ac8 \n",
+ "[mlrun] 2020-01-26 19:27:35,599 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "tsk_run = testfn.run(task, handler='test_model')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/train_classifier.ipynb b/tests/train_classifier.ipynb
new file mode 100644
index 000000000..ee5e62e20
--- /dev/null
+++ b/tests/train_classifier.ipynb
@@ -0,0 +1,573 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# deploying yaml on optimized python images\n",
+ "\n",
+ "* one node\n",
+ "* lightgbm\n",
+ "* 10 million samples / 20 features\n",
+ "* code stored as yaml in github\n",
+ "* precompiled images using CPU-optimized python libraries \n",
+ " * **[yjbds/mlrun-ds](https://hub.docker.com/repository/docker/yjbds/mlrun-ds)** a data science stack\n",
+ " * **[yjbds/mlrun-files](https://hub.docker.com/repository/docker/yjbds/mlrun-files)** a parquet/pandas stack"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "import numpy as np\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/' \n",
+ "TARGET_DATA_PATH = '/User/mlrun/models'\n",
+ "\n",
+ "SKLEARN_CLASSIFIER = 'lightgbm.sklearn.LGBMClassifier'\n",
+ "MODEL_KEY = 'model'\n",
+ "MODEL_NAME = 'lgb-classifier.pkl'\n",
+ "VERBOSE = False"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "_____\n",
+ "## train a classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:17:00,914 function spec saved to path: /User/repos/functions/train/sklearn-classifier.yaml\n"
+ ]
+ }
+ ],
+ "source": [
+ "testfn = mlrun.code_to_function(\n",
+ " kind='job', \n",
+ " filename=os.path.join(CODE_BASE, 'train', 'sklearn-classifier.py'))\n",
+ "testfn.build_config(base_image='yjbds/mlrun-ds:latest', commands=[])\n",
+ "testfn.export(os.path.join(CODE_BASE, 'train', 'sklearn-classifier.yaml'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainfn = mlrun.import_function(\n",
+ " os.path.join(CODE_BASE+'train/sklearn-classifier.yaml')\n",
+ ").apply(mlrun.mount_v3io())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:17:15,462 starting remote build, image: .mlrun/func-default-sklearn-classifier-latest\n",
+ "\u001b[36mINFO\u001b[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Resolved base name yjbds/mlrun-ds:latest to yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:d7724d11d33770dd3f65bee87ce7bf9f182428e96d53343f82ab5fce506f875b: no such file or directory \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Built cross stage deps: map[] \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:d7724d11d33770dd3f65bee87ce7bf9f182428e96d53343f82ab5fce506f875b: no such file or directory \n",
+ "\u001b[36mINFO\u001b[0m[0000] Downloading base image yjbds/mlrun-ds:latest \n",
+ "\u001b[36mINFO\u001b[0m[0000] Unpacking rootfs as cmd RUN pip install mlrun requires it. \n",
+ "\u001b[36mINFO\u001b[0m[0048] Taking snapshot of full filesystem... \n",
+ "\u001b[36mINFO\u001b[0m[0063] RUN pip install mlrun \n",
+ "\u001b[36mINFO\u001b[0m[0063] cmd: /bin/sh \n",
+ "\u001b[36mINFO\u001b[0m[0063] args: [-c pip install mlrun] \n",
+ "Requirement already satisfied: mlrun in /opt/conda/lib/python3.7/site-packages (0.4.3)\n",
+ "Requirement already satisfied: nuclio-jupyter>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.8.0)\n",
+ "Requirement already satisfied: nuclio-sdk>=0.0.3 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.0.5)\n",
+ "Requirement already satisfied: pandas>=0.23.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.25.3)\n",
+ "Requirement already satisfied: gevent==1.4.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.4.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (5.1.1)\n",
+ "Requirement already satisfied: aiohttp>=3.5.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (3.6.2)\n",
+ "Requirement already satisfied: click>=7.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (7.0)\n",
+ "Requirement already satisfied: requests>=2.20.1 in /opt/conda/lib/python3.7/site-packages (from mlrun) (2.20.1)\n",
+ "Requirement already satisfied: nest-asyncio>=1.0.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.2.2)\n",
+ "Requirement already satisfied: sqlalchemy==1.3.11 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.3.11)\n",
+ "Requirement already satisfied: GitPython>=2.1.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (3.0.5)\n",
+ "Requirement already satisfied: tabulate<=0.8.3,>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.8.3)\n",
+ "Requirement already satisfied: gunicorn==19.9.0 in /opt/conda/lib/python3.7/site-packages (from mlrun) (19.9.0)\n",
+ "Requirement already satisfied: Flask>=1.1.1 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.1.1)\n",
+ "Requirement already satisfied: kfp>=0.1.29 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.2.0)\n",
+ "Requirement already satisfied: croniter==0.3.31 in /opt/conda/lib/python3.7/site-packages (from mlrun) (0.3.31)\n",
+ "Requirement already satisfied: boto3>=1.9 in /opt/conda/lib/python3.7/site-packages (from mlrun) (1.11.9)\n",
+ "Requirement already satisfied: jupyterlab>=0.35.4 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (1.2.6)\n",
+ "Requirement already satisfied: nbconvert>=5.4 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.6.1)\n",
+ "Requirement already satisfied: ipython>=7.2 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (7.11.1)\n",
+ "Requirement already satisfied: notebook>=5.7.2 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (6.0.3)\n",
+ "Requirement already satisfied: tornado<6,>=5 in /opt/conda/lib/python3.7/site-packages (from nuclio-jupyter>=0.8.0->mlrun) (5.1.1)\n",
+ "Requirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (1.17.4)\n",
+ "Requirement already satisfied: python-dateutil>=2.6.1 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (2.8.0)\n",
+ "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.23.0->mlrun) (2019.1)\n",
+ "Requirement already satisfied: greenlet>=0.4.14; platform_python_implementation == \"CPython\" in /opt/conda/lib/python3.7/site-packages (from gevent==1.4.0->mlrun) (0.4.15)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (19.3.0)\n",
+ "Requirement already satisfied: async-timeout<4.0,>=3.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.1)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (1.4.2)\n",
+ "Requirement already satisfied: multidict<5.0,>=4.5 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (4.7.4)\n",
+ "Requirement already satisfied: chardet<4.0,>=2.0 in /opt/conda/lib/python3.7/site-packages (from aiohttp>=3.5.0->mlrun) (3.0.4)\n",
+ "Requirement already satisfied: idna<2.8,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (2.6)\n",
+ "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (1.24.1)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests>=2.20.1->mlrun) (2019.9.11)\n",
+ "Requirement already satisfied: gitdb2>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from GitPython>=2.1.0->mlrun) (2.0.6)\n",
+ "Requirement already satisfied: Werkzeug>=0.15 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (0.16.0)\n",
+ "Requirement already satisfied: Jinja2>=2.10.1 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (2.10.3)\n",
+ "Requirement already satisfied: itsdangerous>=0.24 in /opt/conda/lib/python3.7/site-packages (from Flask>=1.1.1->mlrun) (1.1.0)\n",
+ "Requirement already satisfied: cloudpickle==1.1.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.1.1)\n",
+ "Requirement already satisfied: argo-models==2.2.1a in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (2.2.1a0)\n",
+ "Requirement already satisfied: requests-toolbelt>=0.8.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (0.9.1)\n",
+ "Requirement already satisfied: kfp-server-api<=0.1.40,>=0.1.18 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (0.1.40)\n",
+ "Requirement already satisfied: Deprecated in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.2.7)\n",
+ "Requirement already satisfied: jsonschema>=3.0.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (3.2.0)\n",
+ "Requirement already satisfied: cryptography>=2.4.2 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (2.7)\n",
+ "Requirement already satisfied: google-cloud-storage>=1.13.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.25.0)\n",
+ "Requirement already satisfied: PyJWT>=1.6.4 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.7.1)\n",
+ "Requirement already satisfied: kubernetes<=10.0.0,>=8.0.0 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (10.0.0)\n",
+ "Requirement already satisfied: six>=1.10 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.12.0)\n",
+ "Requirement already satisfied: google-auth>=1.6.1 in /opt/conda/lib/python3.7/site-packages (from kfp>=0.1.29->mlrun) (1.11.0)\n",
+ "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (0.9.4)\n",
+ "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (0.3.2)\n",
+ "Requirement already satisfied: botocore<1.15.0,>=1.14.9 in /opt/conda/lib/python3.7/site-packages (from boto3>=1.9->mlrun) (1.14.9)\n",
+ "Requirement already satisfied: jupyterlab-server~=1.0.0 in /opt/conda/lib/python3.7/site-packages (from jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (1.0.6)\n",
+ "Requirement already satisfied: defusedxml in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
+ "Requirement already satisfied: traitlets>=4.2 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (4.3.3)\n",
+ "Requirement already satisfied: pandocfilters>=1.4.1 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (1.4.2)\n",
+ "Requirement already satisfied: pygments in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (2.5.2)\n",
+ "Requirement already satisfied: nbformat>=4.4 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (5.0.4)\n",
+ "Requirement already satisfied: bleach in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (3.1.0)\n",
+ "Requirement already satisfied: entrypoints>=0.2.2 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.3)\n",
+ "Requirement already satisfied: jupyter-core in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (4.6.1)\n",
+ "Requirement already satisfied: testpath in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.4.4)\n",
+ "Requirement already satisfied: mistune<2,>=0.8.1 in /opt/conda/lib/python3.7/site-packages (from nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.4)\n",
+ "Requirement already satisfied: decorator in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.4.1)\n",
+ "Requirement already satisfied: jedi>=0.10 in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.16.0)\n",
+ "Requirement already satisfied: setuptools>=18.5 in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (41.0.1.post20191122)\n",
+ "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (3.0.2)\n",
+ "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (4.8.0)\n",
+ "Requirement already satisfied: pickleshare in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.5)\n",
+ "Requirement already satisfied: backcall in /opt/conda/lib/python3.7/site-packages (from ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.0)\n",
+ "Requirement already satisfied: ipykernel in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.1.3)\n",
+ "Requirement already satisfied: terminado>=0.8.1 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.8.3)\n",
+ "Requirement already satisfied: ipython-genutils in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.2.0)\n",
+ "Requirement already satisfied: prometheus-client in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (0.7.1)\n",
+ "Requirement already satisfied: jupyter-client>=5.3.4 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (5.3.4)\n",
+ "Requirement already satisfied: pyzmq>=17 in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (18.1.1)\n",
+ "Requirement already satisfied: Send2Trash in /opt/conda/lib/python3.7/site-packages (from notebook>=5.7.2->nuclio-jupyter>=0.8.0->mlrun) (1.5.0)\n",
+ "Requirement already satisfied: smmap2>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from gitdb2>=2.0.0->GitPython>=2.1.0->mlrun) (2.0.5)\n",
+ "Requirement already satisfied: MarkupSafe>=0.23 in /opt/conda/lib/python3.7/site-packages (from Jinja2>=2.10.1->Flask>=1.1.1->mlrun) (1.1.1)\n",
+ "Requirement already satisfied: wrapt<2,>=1.10 in /opt/conda/lib/python3.7/site-packages (from Deprecated->kfp>=0.1.29->mlrun) (1.11.2)\n",
+ "Requirement already satisfied: pyrsistent>=0.14.0 in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (0.15.7)\n",
+ "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /opt/conda/lib/python3.7/site-packages (from jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (1.4.0)\n",
+ "Requirement already satisfied: asn1crypto>=0.21.0 in /opt/conda/lib/python3.7/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (0.24.0)\n",
+ "Requirement already satisfied: cffi!=1.11.3,>=1.8 in /opt/conda/lib/python3.7/site-packages (from cryptography>=2.4.2->kfp>=0.1.29->mlrun) (1.12.3)\n",
+ "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.2.0)\n",
+ "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (0.5.0)\n",
+ "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.7/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (0.57.0)\n",
+ "Requirement already satisfied: requests-oauthlib in /opt/conda/lib/python3.7/site-packages (from kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (1.3.0)\n",
+ "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0.0)\n",
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.2.8)\n",
+ "Requirement already satisfied: rsa<4.1,>=3.1.4 in /opt/conda/lib/python3.7/site-packages (from google-auth>=1.6.1->kfp>=0.1.29->mlrun) (4.0)\n",
+ "Requirement already satisfied: docutils<0.16,>=0.10 in /opt/conda/lib/python3.7/site-packages (from botocore<1.15.0,>=1.14.9->boto3>=1.9->mlrun) (0.15.2)\n",
+ "Requirement already satisfied: json5 in /opt/conda/lib/python3.7/site-packages (from jupyterlab-server~=1.0.0->jupyterlab>=0.35.4->nuclio-jupyter>=0.8.0->mlrun) (0.8.5)\n",
+ "Requirement already satisfied: webencodings in /opt/conda/lib/python3.7/site-packages (from bleach->nbconvert>=5.4->nuclio-jupyter>=0.8.0->mlrun) (0.5.1)\n",
+ "Requirement already satisfied: parso>=0.5.2 in /opt/conda/lib/python3.7/site-packages (from jedi>=0.10->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.5.2)\n",
+ "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.7/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.1.8)\n",
+ "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.7/site-packages (from pexpect; sys_platform != \"win32\"->ipython>=7.2->nuclio-jupyter>=0.8.0->mlrun) (0.6.0)\n",
+ "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp>=0.1.29->mlrun) (2.1.0)\n",
+ "Requirement already satisfied: pycparser in /opt/conda/lib/python3.7/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp>=0.1.29->mlrun) (2.18)\n",
+ "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /opt/conda/lib/python3.7/site-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.16.0)\n",
+ "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.7/site-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp>=0.1.29->mlrun) (3.1.0)\n",
+ "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp>=0.1.29->mlrun) (0.4.8)\n",
+ "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /opt/conda/lib/python3.7/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (1.51.0)\n",
+ "Requirement already satisfied: protobuf>=3.4.0 in /opt/conda/lib/python3.7/site-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp>=0.1.29->mlrun) (3.11.2)\n",
+ "\u001b[36mINFO\u001b[0m[0066] Taking snapshot of full filesystem... \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainfn.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "task = mlrun.NewTask()\n",
+ "task.with_params(\n",
+ " SKClassifier=SKLEARN_CLASSIFIER,\n",
+ " callbacks = [],\n",
+ " xtrain='/User/mlrun/models/xtrain.pqt',\n",
+ " ytrain='/User/mlrun/models/ytrain.pqt',\n",
+ " xvalid='/User/mlrun/models/xvalid.pqt',\n",
+ " yvalid='/User/mlrun/models/yvalid.pqt',\n",
+ " target_path='/User/mlrun/models',\n",
+ " name=MODEL_NAME,\n",
+ " key=MODEL_KEY,\n",
+ " verbose=VERBOSE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:20:00,276 starting run train uid=984d1c76d3744b8593395d4cec4c06e7 -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-26 19:20:00,370 Job is running in the background, pod: train-7mmh2\n",
+ "[LightGBM] [Warning] Starting from the 2.1.2 version, default value for the \"boost_from_average\" parameter in \"binary\" objective is true.\n",
+ "This may cause significantly different results comparing to the previous versions of LightGBM.\n",
+ "Try to set boost_from_average=false, if your old models produce bad results\n",
+ "[LightGBM] [Warning] Cannot change bin_construct_sample_cnt after constructed Dataset handle.\n",
+ "[mlrun] 2020-01-26 19:21:48,490 log artifact training-validation-plot.html at training-validation-plot.html, size: 36420, db: Y\n",
+ "[mlrun] 2020-01-26 19:21:48,562 log artifact model at /User/mlrun/models/lgb-classifier.pkl, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-26 19:21:48,603 run executed, status=completed\n",
+ "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+ " y = column_or_1d(y, warn=True)\n",
+ "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:268: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+ " y = column_or_1d(y, warn=True)\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...4c06e7 | \n",
+ " 0 | \n",
+ " Jan 26 19:20:08 | \n",
+ " completed | \n",
+ " sklearn-classifier | \n",
+ " host=train-7mmh2 kind=job owner=admin | \n",
+ " | \n",
+ " SKClassifier=lightgbm.sklearn.LGBMClassifier callbacks=[] key=model name=lgb-classifier.pkl target_path=/User/mlrun/models verbose=False xtrain=/User/mlrun/models/xtrain.pqt xvalid=/User/mlrun/models/xvalid.pqt ytrain=/User/mlrun/models/ytrain.pqt yvalid=/User/mlrun/models/yvalid.pqt | \n",
+ " train_accuracy=0.732269862931968 | \n",
+ " training-validation-plot.html model | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 984d1c76d3744b8593395d4cec4c06e7 , !mlrun logs 984d1c76d3744b8593395d4cec4c06e7 \n",
+ "[mlrun] 2020-01-26 19:21:52,483 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "tsk = trainfn.run(task, handler='train')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'train_accuracy': 0.732269862931968,\n",
+ " 'training-validation-plot.html': 'training-validation-plot.html',\n",
+ " 'model': '/User/mlrun/models/lgb-classifier.pkl'}"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tsk.outputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/train_valid_test_split.ipynb b/tests/train_valid_test_split.ipynb
new file mode 100644
index 000000000..891423a4e
--- /dev/null
+++ b/tests/train_valid_test_split.ipynb
@@ -0,0 +1,576 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## split data into train, validation and test sets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mlrun\n",
+ "import os\n",
+ "import numpy as np\n",
+ "mlrun.mlconf.dbpath = 'http://mlrun-api:8080'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## parameters\n",
+ "\n",
+ "**Please be sure to run the notebook [arc_to_parquet](arc_to_parquet.ipynb) before running this one.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FUNCTION = 'train_valid_test'\n",
+ "DESCRIPTION = 'split data into train, validation and test splits'\n",
+ "\n",
+ "BASE_IMAGE = 'yjbds/mlrun-intel:dev'\n",
+ "JOB_KIND = 'job'\n",
+ "TASK_NAME = 'user-task-data-splits'\n",
+ "\n",
+ "CODE_BASE = 'https://raw.githubusercontent.com/yjb-ds/functions/lgbm-serving/datagen/splitters'\n",
+ "PROJECT = 'splitters'\n",
+ "\n",
+ "RNG = 1\n",
+ "TARGET_DATA_PATH = '/User/mlrun/models'\n",
+ "SRC_FILE = 'higgs.pqt'\n",
+ "KEY = 'higgs'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## split the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "splitter = mlrun.import_function(\n",
+ " os.path.join(CODE_BASE, FUNCTION, 'function.yaml'))\n",
+ "\n",
+ "splitter.apply(mlrun.mount_v3io())\n",
+ "\n",
+ "splitter.deploy(skip_deployed=True, with_mlrun=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[mlrun] 2020-01-26 19:13:38,909 starting run train_valid_test_splitter uid=2238f0c5856e4359a068cd881dc0c62b -> http://mlrun-api:8080\n",
+ "[mlrun] 2020-01-26 19:13:39,003 Job is running in the background, pod: train-valid-test-splitter-zlfcj\n",
+ "[mlrun] 2020-01-26 19:14:07,630 log artifact header at /User/mlrun/models/header.pkl, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:17,951 log artifact xtrain at /User/mlrun/models/xtrain.pqt, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:21,585 log artifact xvalid at /User/mlrun/models/xvalid.pqt, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:23,245 log artifact xtest at /User/mlrun/models/xtest.pqt, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:24,139 log artifact ytrain at /User/mlrun/models/ytrain.pqt, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:24,519 log artifact yvalid at /User/mlrun/models/yvalid.pqt, size: None, db: Y\n",
+ "[mlrun] 2020-01-26 19:14:24,755 log artifact ytest at /User/mlrun/models/ytest.pqt, size: None, db: Y\n",
+ "\n",
+ "[mlrun] 2020-01-26 19:14:25,528 run executed, status=completed\n",
+ "--- Logging error ---\n",
+ "Traceback (most recent call last):\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 1025, in emit\n",
+ " msg = self.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 869, in format\n",
+ " return fmt.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 608, in format\n",
+ " record.message = record.getMessage()\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 369, in getMessage\n",
+ " msg = msg % self.args\n",
+ "TypeError: not all arguments converted during string formatting\n",
+ "Call stack:\n",
+ " File \"/opt/conda/bin/mlrun\", line 10, in \n",
+ " sys.exit(main())\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 764, in __call__\n",
+ " return self.main(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 717, in main\n",
+ " rv = self.invoke(ctx)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 1137, in invoke\n",
+ " return _process_result(sub_ctx.command.invoke(sub_ctx))\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 956, in invoke\n",
+ " return ctx.invoke(self.callback, **ctx.params)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 555, in invoke\n",
+ " return callback(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/__main__.py\", line 167, in run\n",
+ " resp = fn.run(runobj, watch=watch, schedule=schedule)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/base.py\", line 294, in run\n",
+ " resp = self._run(runspec, execution)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 89, in _run\n",
+ " sout, serr = exec_from_params(fn, runobj, context)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 174, in exec_from_params\n",
+ " val = handler(*args_list)\n",
+ " File \"main.py\", line 104, in train_valid_test_splitter\n",
+ " context.logger.info('numpy', np.__version__)\n",
+ "Message: 'numpy'\n",
+ "Arguments: ('1.17.4',)\n",
+ "--- Logging error ---\n",
+ "Traceback (most recent call last):\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 1025, in emit\n",
+ " msg = self.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 869, in format\n",
+ " return fmt.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 608, in format\n",
+ " record.message = record.getMessage()\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 369, in getMessage\n",
+ " msg = msg % self.args\n",
+ "TypeError: not all arguments converted during string formatting\n",
+ "Call stack:\n",
+ " File \"/opt/conda/bin/mlrun\", line 10, in \n",
+ " sys.exit(main())\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 764, in __call__\n",
+ " return self.main(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 717, in main\n",
+ " rv = self.invoke(ctx)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 1137, in invoke\n",
+ " return _process_result(sub_ctx.command.invoke(sub_ctx))\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 956, in invoke\n",
+ " return ctx.invoke(self.callback, **ctx.params)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 555, in invoke\n",
+ " return callback(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/__main__.py\", line 167, in run\n",
+ " resp = fn.run(runobj, watch=watch, schedule=schedule)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/base.py\", line 294, in run\n",
+ " resp = self._run(runspec, execution)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 89, in _run\n",
+ " sout, serr = exec_from_params(fn, runobj, context)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 174, in exec_from_params\n",
+ " val = handler(*args_list)\n",
+ " File \"main.py\", line 105, in train_valid_test_splitter\n",
+ " context.logger.info('pandas ', pd.__version__)\n",
+ "Message: 'pandas '\n",
+ "Arguments: ('0.25.3',)\n",
+ "--- Logging error ---\n",
+ "Traceback (most recent call last):\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 1025, in emit\n",
+ " msg = self.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 869, in format\n",
+ " return fmt.format(record)\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 608, in format\n",
+ " record.message = record.getMessage()\n",
+ " File \"/opt/conda/lib/python3.7/logging/__init__.py\", line 369, in getMessage\n",
+ " msg = msg % self.args\n",
+ "TypeError: not all arguments converted during string formatting\n",
+ "Call stack:\n",
+ " File \"/opt/conda/bin/mlrun\", line 10, in \n",
+ " sys.exit(main())\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 764, in __call__\n",
+ " return self.main(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 717, in main\n",
+ " rv = self.invoke(ctx)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 1137, in invoke\n",
+ " return _process_result(sub_ctx.command.invoke(sub_ctx))\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 956, in invoke\n",
+ " return ctx.invoke(self.callback, **ctx.params)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/click/core.py\", line 555, in invoke\n",
+ " return callback(*args, **kwargs)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/__main__.py\", line 167, in run\n",
+ " resp = fn.run(runobj, watch=watch, schedule=schedule)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/base.py\", line 294, in run\n",
+ " resp = self._run(runspec, execution)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 89, in _run\n",
+ " sout, serr = exec_from_params(fn, runobj, context)\n",
+ " File \"/opt/conda/lib/python3.7/site-packages/mlrun/runtimes/local.py\", line 174, in exec_from_params\n",
+ " val = handler(*args_list)\n",
+ " File \"main.py\", line 106, in train_valid_test_splitter\n",
+ " context.logger.info('pyarrow', pa.__version__)\n",
+ "Message: 'pyarrow'\n",
+ "Arguments: ('0.15.1',)\n",
+ "final state: succeeded\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | uid | \n",
+ " iter | \n",
+ " start | \n",
+ " state | \n",
+ " name | \n",
+ " labels | \n",
+ " inputs | \n",
+ " parameters | \n",
+ " results | \n",
+ " artifacts | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ...c0c62b | \n",
+ " 0 | \n",
+ " Jan 26 19:13:46 | \n",
+ " completed | \n",
+ " train-valid-test | \n",
+ " host=train-valid-test-splitter-zlfcj kind=job owner=admin | \n",
+ " | \n",
+ " random_state=1 src_file=higgs.pqt target_path=/User/mlrun/models | \n",
+ " | \n",
+ " header xtrain xvalid xtest ytrain yvalid ytest | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to track results use .show() or .logs() or in CLI: \n",
+ "!mlrun get run 2238f0c5856e4359a068cd881dc0c62b , !mlrun logs 2238f0c5856e4359a068cd881dc0c62b \n",
+ "[mlrun] 2020-01-26 19:14:28,394 run executed, status=completed\n"
+ ]
+ }
+ ],
+ "source": [
+ "task2 = mlrun.NewTask()\n",
+ "task2.with_params(\n",
+ " src_file='higgs.pqt',\n",
+ " target_path=TARGET_DATA_PATH,\n",
+ " random_state=RNG)\n",
+ "\n",
+ "tsk2 = splitter.run(task2, handler='train_valid_test_splitter')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'header': '/User/mlrun/models/header.pkl',\n",
+ " 'xtrain': '/User/mlrun/models/xtrain.pqt',\n",
+ " 'xvalid': '/User/mlrun/models/xvalid.pqt',\n",
+ " 'xtest': '/User/mlrun/models/xtest.pqt',\n",
+ " 'ytrain': '/User/mlrun/models/ytrain.pqt',\n",
+ " 'yvalid': '/User/mlrun/models/yvalid.pqt',\n",
+ " 'ytest': '/User/mlrun/models/ytest.pqt'}"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tsk2.outputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n_samples, n_features = pd.read_parquet(os.path.join(TARGET_DATA_PATH, SRC_FILE), engine='pyarrow').shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xtrain_shape = pd.read_parquet(tsk2.outputs['xtrain'], engine='pyarrow').shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rounding_err = -1\n",
+ "assert (int(n_samples*.75*.9)+rounding_err, M_FEATURES) == xtrain_shape, \"xtrain doesn't have the expected shape\"\n",
+ "assert ytrain_shape[0] == xtrain_shape[0], \"ytrain and xtrain have different shapes\"\n",
+ "assert ytrain_shape[1] == 1, \"ytrain (labels) has more than 1 column\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xtest_shape = pd.read_parquet(tsk2.outputs['xtest'], engine='pyarrow').shape\n",
+ "ytest_shape = pd.read_parquet(tsk2.outputs['ytest'], engine='pyarrow').shape\n",
+ "assert (int(n_samples*.1), M_FEATURES) == xtest_shape, \"xtest doesn't have the expected shape\"\n",
+ "assert ytest_shape[0] == xtest_shape[0], \"ytest and xtest have different shapes\"\n",
+ "assert ytest_shape[1] == 1, \"ytest (test labels) has more than 1 column\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from cloudpickle import load"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert len(load(open(tsk2.outputs['header'], 'rb'))) == n_features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(load(open(tsk2.outputs['header'], 'rb')))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/train/README.md b/train/README.md
new file mode 100644
index 000000000..6aa7c6d4b
--- /dev/null
+++ b/train/README.md
@@ -0,0 +1,5 @@
+# training functions
+
+1. **`sklearn-classifier`**
+train any sklearn classifier model
+
\ No newline at end of file
diff --git a/train/sklearn-classifier.py b/train/sklearn-classifier.py
new file mode 100644
index 000000000..eae4def1d
--- /dev/null
+++ b/train/sklearn-classifier.py
@@ -0,0 +1,164 @@
+import numpy as np
+import pandas as pd
+
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure
+import seaborn as sns
+
+from typing import Optional, Union
+import os
+import importlib
+from cloudpickle import dump
+
+from mlrun.execution import MLClientCtx
+from mlrun.datastore import DataItem
+from mlrun.artifacts import TableArtifact, PlotArtifact
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def train(
+ context: Optional[MLClientCtx] = None,
+ SKClassifier: str = '',
+ callbacks = [],
+ xtrain: Union[DataItem, str] = '',
+ ytrain: Union[DataItem, str] = '',
+ xvalid: Union[DataItem, str] = '',
+ yvalid: Union[DataItem, str] = '',
+ target_path: str = '',
+ name: str = '',
+ key: str = '',
+ verbose: bool = False,
+ random_state = 1
+) -> None:
+ """Train and save an Scikitlearn model.
+
+ The data source can either be a string file name or an artifact item.
+
+ The header is eith a list of column names, an artifact header item, or None.
+
+
+ :param context: the function context
+ :param SKClassifier: string module and classname of classifier
+ :param callbacks: sklearn classifier fit function callbacks
+ :param xtrain:
+ :param ytrain:
+ :param xvalid:
+ :param yvalid:
+ :param target_path: folder location of files
+ :param name: destination name for model file
+ :param key: key for model artifact
+ :param verbose : (False) show metrics for training/validation steps.
+ :param random_state: (1) sklearn rng seed
+
+ example callbacks:
+ ```
+ from lightgbm import record_evaluation
+ eval_results = dict()
+ callbacks = [record_evaluation(eval_results)]
+ ```
+ """
+ # load data
+ xtrain = pd.read_parquet(str(xtrain), engine='pyarrow')
+ ytrain = pd.read_parquet(str(ytrain), engine='pyarrow')
+ xvalid = pd.read_parquet(str(xvalid), engine='pyarrow')
+ yvalid = pd.read_parquet(str(yvalid), engine='pyarrow')
+
+ # create classifier class from string and instantiate
+ splits = SKClassifier.split(".")
+ clfclass = getattr(importlib.import_module(".".join(splits[:-1])), splits[-1])
+ model = clfclass(random_state=random_state, verbose=int(verbose == True))
+
+ model.fit(xtrain,
+ ytrain,
+ eval_set=[(xvalid, yvalid), (xtrain, ytrain)],
+ eval_names=['valid', 'train'],
+ callbacks=callbacks,
+ verbose=verbose)
+
+ context.log_result("train_accuracy", float(model.score(xtrain, ytrain)))
+
+ # plot train and validation history, save and log
+ loss = np.asarray(model.evals_result_['train']['binary_logloss'], dtype=np.float)
+ val_loss = np.asarray(model.evals_result_['valid']['binary_logloss'], dtype=np.float)
+ plot_validation(context, loss, val_loss, target_path)
+
+ # save model
+ filepath = os.path.join(target_path, name)
+ dump(model, open(filepath, 'wb'))
+ context.log_artifact(key, target_path=filepath)
+
+def plot_validation(
+ context: MLClientCtx,
+ train_metric,
+ valid_metric,
+ target_path: str = '',
+ name: str = "history.png",
+ key: str = 'training-validation-plot'
+):
+ """Plot train and validation loss curves
+
+ These curves represent the training round losses from the training
+ and validation sets.
+
+ :param context: the function context
+ :param train_metric: train metric
+ :param valid_metric: validation metric
+ :param target_path: destinatin path for train/volidation history plot artifact
+ """
+ # generate plot
+ plt.plot(train_metric)
+ plt.plot(valid_metric)
+ plt.title("training validation results")
+ plt.xlabel("epoch")
+ plt.ylabel("")
+ plt.legend(["train", "valid"])
+ fig = plt.gcf()
+
+ # save figure and log artifact
+ plotpath = os.path.join(target_path, name)
+ plt.savefig(plotpath)
+ context.log_artifact(PlotArtifact(key, body=fig))
+
+ # plot cleanup
+ plt.cla()
+ plt.clf()
+ plt.close()
+
+
+
+def keras_classifier_generator(
+ metrics: list = [],
+ input_size: int = 20,
+ dropout: float = 0.5,
+ output_bias: float = None,
+ learning_rate: float = 1e-3
+):
+ """Generate a super simple classifier
+
+ :param metrics: select metrics to be evaluated
+ :param output_bias: layer initializer
+ :param input_size: number of features, size of input
+ :param dropout: dropout frequency
+ :param learning_rate:
+
+ returns a compiled keras model used as input to the KerasClassifer wrapper
+ """
+ if output_bias is not None:
+ output_bias = Constant(output_bias)
+
+ model = Sequential(
+ [
+ Dense(16, activation="relu", input_shape=(input_size,)),
+ Dropout(dropout),
+ Dense(1, activation="sigmoid", bias_initializer=output_bias),
+ ]
+ )
+
+ model.compile(
+ optimizer=Adam(lr=learning_rate),
+ loss=BinaryCrossentropy(),
+ metrics=metrics
+ )
+
+ return model
\ No newline at end of file
diff --git a/train/sklearn-classifier.yaml b/train/sklearn-classifier.yaml
new file mode 100644
index 000000000..bbe34c8c3
--- /dev/null
+++ b/train/sklearn-classifier.yaml
@@ -0,0 +1,18 @@
+kind: job
+metadata:
+ name: sklearn-classifier
+ tag: ''
+ hash: 14f1603a259311e015900f245810d9bf474dbb20
+ project: ''
+spec:
+ command: ''
+ args: []
+ volumes: []
+ volume_mounts: []
+ env: []
+ description: ''
+ build:
+ functionSourceCode: aW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBwYW5kYXMgYXMgcGQKCmltcG9ydCBtYXRwbG90bGliLnB5cGxvdCBhcyBwbHQKZnJvbSBtYXRwbG90bGliLmZpZ3VyZSBpbXBvcnQgRmlndXJlCmltcG9ydCBzZWFib3JuIGFzIHNucwoKZnJvbSB0eXBpbmcgaW1wb3J0IE9wdGlvbmFsLCBVbmlvbgppbXBvcnQgb3MKaW1wb3J0IGltcG9ydGxpYgpmcm9tIGNsb3VkcGlja2xlIGltcG9ydCBkdW1wCgpmcm9tIG1scnVuLmV4ZWN1dGlvbiBpbXBvcnQgTUxDbGllbnRDdHgKZnJvbSBtbHJ1bi5kYXRhc3RvcmUgaW1wb3J0IERhdGFJdGVtCmZyb20gbWxydW4uYXJ0aWZhY3RzIGltcG9ydCBUYWJsZUFydGlmYWN0LCBQbG90QXJ0aWZhY3QKCmltcG9ydCB3YXJuaW5ncwp3YXJuaW5ncy5zaW1wbGVmaWx0ZXIoYWN0aW9uPSdpZ25vcmUnLCBjYXRlZ29yeT1GdXR1cmVXYXJuaW5nKQoKZGVmIHRyYWluKAogICAgY29udGV4dDogT3B0aW9uYWxbTUxDbGllbnRDdHhdID0gTm9uZSwKICAgIFNLQ2xhc3NpZmllcjogc3RyICA9ICcnLAogICAgY2FsbGJhY2tzICA9IFtdLAogICAgeHRyYWluOiBVbmlvbltEYXRhSXRlbSwgc3RyXSA9ICcnLAogICAgeXRyYWluOiBVbmlvbltEYXRhSXRlbSwgc3RyXSA9ICcnLAogICAgeHZhbGlkOiBVbmlvbltEYXRhSXRlbSwgc3RyXSA9ICcnLAogICAgeXZhbGlkOiBVbmlvbltEYXRhSXRlbSwgc3RyXSA9ICcnLAogICAgdGFyZ2V0X3BhdGg6IHN0ciA9ICcnLAogICAgbmFtZTogc3RyID0gJycsCiAgICBrZXk6IHN0ciA9ICcnLAogICAgdmVyYm9zZTogYm9vbCA9IEZhbHNlLAogICAgcmFuZG9tX3N0YXRlID0gMQopIC0+IE5vbmU6CiAgICAiIiJUcmFpbiBhbmQgc2F2ZSBhbiBTY2lraXRsZWFybiBtb2RlbC4KICAgIAogICAgVGhlIGRhdGEgc291cmNlIGNhbiBlaXRoZXIgYmUgYSBzdHJpbmcgZmlsZSBuYW1lIG9yIGFuIGFydGlmYWN0IGl0ZW0uCiAgICAKICAgIFRoZSBoZWFkZXIgaXMgZWl0aCBhIGxpc3Qgb2YgY29sdW1uIG5hbWVzLCBhbiBhcnRpZmFjdCBoZWFkZXIgaXRlbSwgb3IgTm9uZS4KICAgIAogICAgCiAgICA6cGFyYW0gY29udGV4dDogICAgICAgICB0aGUgZnVuY3Rpb24gY29udGV4dAogICAgOnBhcmFtIFNLQ2xhc3NpZmllcjogICAgc3RyaW5nIG1vZHVsZSBhbmQgY2xhc3NuYW1lIG9mIGNsYXNzaWZpZXIKICAgIDpwYXJhbSBjYWxsYmFja3M6ICAgICAgIHNrbGVhcm4gY2xhc3NpZmllciBmaXQgZnVuY3Rpb24gY2FsbGJhY2tzCiAgICA6cGFyYW0geHRyYWluOiAgICAgICAgICAKICAgIDpwYXJhbSB5dHJhaW46CiAgICA6cGFyYW0geHZhbGlkOgogICAgOnBhcmFtIHl2YWxpZDoKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogICAgIGZvbGRlciBsb2NhdGlvbiBvZiBmaWxlcwogICAgOnBhcmFtIG5hbWU6ICAgICAgICAgICAgZGVzdGluYXRpb24gbmFtZSBmb3IgbW9kZWwgZmlsZQogICAgOnBhcmFtIGtleTogICAgICAgICAgICAga2V5IGZvciBtb2RlbCBhcnRpZmFjdAogIC
AgOnBhcmFtIHZlcmJvc2UgOiAgICAgICAgKEZhbHNlKSBzaG93IG1ldHJpY3MgZm9yIHRyYWluaW5nL3ZhbGlkYXRpb24gc3RlcHMuCiAgICA6cGFyYW0gcmFuZG9tX3N0YXRlOiAgICAoMSkgc2tsZWFybiBybmcgc2VlZAogICAgCiAgICBleGFtcGxlIGNhbGxiYWNrczoKICAgIGBgYAogICAgZnJvbSBsaWdodGdibSBpbXBvcnQgcmVjb3JkX2V2YWx1YXRpb24KICAgIGV2YWxfcmVzdWx0cyA9IGRpY3QoKQogICAgY2FsbGJhY2tzID0gW3JlY29yZF9ldmFsdWF0aW9uKGV2YWxfcmVzdWx0cyldCiAgICBgYGAKICAgICIiIgogICAgIyBsb2FkIGRhdGEKICAgIHh0cmFpbiA9IHBkLnJlYWRfcGFycXVldChzdHIoeHRyYWluKSwgZW5naW5lPSdweWFycm93JykKICAgIHl0cmFpbiA9IHBkLnJlYWRfcGFycXVldChzdHIoeXRyYWluKSwgZW5naW5lPSdweWFycm93JykKICAgIHh2YWxpZCA9IHBkLnJlYWRfcGFycXVldChzdHIoeHZhbGlkKSwgZW5naW5lPSdweWFycm93JykKICAgIHl2YWxpZCA9IHBkLnJlYWRfcGFycXVldChzdHIoeXZhbGlkKSwgZW5naW5lPSdweWFycm93JykKCiAgICAjIGNyZWF0ZSBjbGFzc2lmaWVyIGNsYXNzIGZyb20gc3RyaW5nIGFuZCBpbnN0YW50aWF0ZQogICAgc3BsaXRzID0gU0tDbGFzc2lmaWVyLnNwbGl0KCIuIikKICAgIGNsZmNsYXNzID0gZ2V0YXR0cihpbXBvcnRsaWIuaW1wb3J0X21vZHVsZSgiLiIuam9pbihzcGxpdHNbOi0xXSkpLCBzcGxpdHNbLTFdKQogICAgbW9kZWwgPSBjbGZjbGFzcyhyYW5kb21fc3RhdGU9cmFuZG9tX3N0YXRlLCB2ZXJib3NlPWludCh2ZXJib3NlID09IFRydWUpKQoKICAgIG1vZGVsLmZpdCh4dHJhaW4sIAogICAgICAgICAgICAgIHl0cmFpbiwKICAgICAgICAgICAgICBldmFsX3NldD1bKHh2YWxpZCwgeXZhbGlkKSwgKHh0cmFpbiwgeXRyYWluKV0sCiAgICAgICAgICAgICAgZXZhbF9uYW1lcz1bJ3ZhbGlkJywgJ3RyYWluJ10sCiAgICAgICAgICAgICAgY2FsbGJhY2tzPWNhbGxiYWNrcywKICAgICAgICAgICAgICB2ZXJib3NlPXZlcmJvc2UpCiAgICAgCiAgICBjb250ZXh0LmxvZ19yZXN1bHQoInRyYWluX2FjY3VyYWN5IiwgZmxvYXQobW9kZWwuc2NvcmUoeHRyYWluLCB5dHJhaW4pKSkKICAgIAogICAgIyBwbG90IHRyYWluIGFuZCB2YWxpZGF0aW9uIGhpc3RvcnksIHNhdmUgYW5kIGxvZwogICAgbG9zcyA9IG5wLmFzYXJyYXkobW9kZWwuZXZhbHNfcmVzdWx0X1sndHJhaW4nXVsnYmluYXJ5X2xvZ2xvc3MnXSwgZHR5cGU9bnAuZmxvYXQpCiAgICB2YWxfbG9zcyA9IG5wLmFzYXJyYXkobW9kZWwuZXZhbHNfcmVzdWx0X1sndmFsaWQnXVsnYmluYXJ5X2xvZ2xvc3MnXSwgZHR5cGU9bnAuZmxvYXQpCiAgICBwbG90X3ZhbGlkYXRpb24oY29udGV4dCwgbG9zcywgdmFsX2xvc3MsIHRhcmdldF9wYXRoKQogICAgCiAgICAjIHNhdmUgbW9kZWwKICAgIGZpbGVwYXRoID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lKQogICAgZHVtcChtb2RlbCwgb3BlbihmaWxlcG
F0aCwgJ3diJykpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChrZXksIHRhcmdldF9wYXRoPWZpbGVwYXRoKQogICAgICAgIApkZWYgcGxvdF92YWxpZGF0aW9uKAogICAgY29udGV4dDogTUxDbGllbnRDdHgsCiAgICB0cmFpbl9tZXRyaWMsCiAgICB2YWxpZF9tZXRyaWMsCiAgICB0YXJnZXRfcGF0aDogc3RyID0gJycsCiAgICBuYW1lOiBzdHIgPSAiaGlzdG9yeS5wbmciLAogICAga2V5OiBzdHIgPSAndHJhaW5pbmctdmFsaWRhdGlvbi1wbG90JwopOgogICAgIiIiUGxvdCB0cmFpbiBhbmQgdmFsaWRhdGlvbiBsb3NzIGN1cnZlcwoKICAgIFRoZXNlIGN1cnZlcyByZXByZXNlbnQgdGhlIHRyYWluaW5nIHJvdW5kIGxvc3NlcyBmcm9tIHRoZSB0cmFpbmluZwogICAgYW5kIHZhbGlkYXRpb24gc2V0cy4KICAgIAogICAgOnBhcmFtIGNvbnRleHQ6ICAgICAgICAgdGhlIGZ1bmN0aW9uIGNvbnRleHQKICAgIDpwYXJhbSB0cmFpbl9tZXRyaWM6ICAgIHRyYWluIG1ldHJpYwogICAgOnBhcmFtIHZhbGlkX21ldHJpYzogICAgdmFsaWRhdGlvbiBtZXRyaWMKICAgIDpwYXJhbSB0YXJnZXRfcGF0aDogICAgIGRlc3RpbmF0aW4gcGF0aCBmb3IgdHJhaW4vdm9saWRhdGlvbiBoaXN0b3J5IHBsb3QgYXJ0aWZhY3QKICAgICIiIgogICAgIyBnZW5lcmF0ZSBwbG90CiAgICBwbHQucGxvdCh0cmFpbl9tZXRyaWMpCiAgICBwbHQucGxvdCh2YWxpZF9tZXRyaWMpCiAgICBwbHQudGl0bGUoInRyYWluaW5nIHZhbGlkYXRpb24gcmVzdWx0cyIpCiAgICBwbHQueGxhYmVsKCJlcG9jaCIpCiAgICBwbHQueWxhYmVsKCIiKQogICAgcGx0LmxlZ2VuZChbInRyYWluIiwgInZhbGlkIl0pCiAgICBmaWcgPSBwbHQuZ2NmKCkKCiAgICAjIHNhdmUgZmlndXJlIGFuZCBsb2cgYXJ0aWZhY3QKICAgIHBsb3RwYXRoID0gb3MucGF0aC5qb2luKHRhcmdldF9wYXRoLCBuYW1lKQogICAgcGx0LnNhdmVmaWcocGxvdHBhdGgpCiAgICBjb250ZXh0LmxvZ19hcnRpZmFjdChQbG90QXJ0aWZhY3Qoa2V5LCBib2R5PWZpZykpCgogICAgIyBwbG90IGNsZWFudXAKICAgIHBsdC5jbGEoKQogICAgcGx0LmNsZigpCiAgICBwbHQuY2xvc2UoKSAgICAgICAgCg==
+ base_image: yjbds/mlrun-daskboost:dev
+ commands: []
+ code_origin: https://github.com/yjb-ds/functions.git#e613e55761fd1ed325ad88155877924aa5b49ccc:/User/repos/functions/train/sklearn-classifier.py