{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pipelines using Dask, Kubeflow and MLRun" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a project to host functions, jobs and artifacts\n", "\n", "Projects are used to package multiple functions, workflows, and artifacts. Project code and definitions are usually stored in a Git archive.\n", "\n", "The following code creates a new project in a local dir and initializes git tracking on it." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "> 2021-01-24 16:39:27,665 [warning] Failed resolving version info. Ignoring and using defaults\n", "> 2021-01-24 16:39:29,248 [warning] Unable to parse server or client version. Assuming compatible: {'server_version': 'unstable', 'client_version': 'unstable'}\n" ] } ], "source": [ "import os\n", "import mlrun\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "# set project name and dir\n", "project_name = 'sk-project-dask'\n", "project_dir = './project'\n", "\n", "# specify artifacts target location\n", "_, artifact_path = mlrun.set_environment(project=project_name)\n", "\n", "# set project\n", "sk_dask_proj = mlrun.new_project(project_name, project_dir, init_git=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Init Dask cluster" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "> 2021-01-24 16:39:36,831 [info] using in-cluster config.\n" ] } ], "source": [ "import mlrun\n", "# set up function from local file\n", "dsf = mlrun.new_function(name=\"mydask\", kind=\"dask\", image=\"mlrun/ml-models\")\n", "\n", "# set up function specs for dask\n", "dsf.spec.remote = True\n", "dsf.spec.replicas = 5\n", "dsf.spec.service_type = 'NodePort'\n", "dsf.with_limits(mem=\"6G\")\n", "dsf.spec.nthreads = 5" ] }, { "cell_type": "code", 
"execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply mount_v3io to the function so that the k8s pods that run it\n", "# can access the shared data\n", "dsf.apply(mlrun.mount_v3io())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'52f5dcddb916b12943e9d44e9e2b75f48e286ec7'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dsf.save()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "> 2021-01-24 20:15:37,716 [info] trying dask client at: tcp://mlrun-mydask-997e6385-a.default-tenant:8786\n", "> 2021-01-24 20:15:48,564 [warning] remote scheduler at tcp://mlrun-mydask-997e6385-a.default-tenant:8786 not ready, will try to restart Timed out trying to connect to 'tcp://mlrun-mydask-997e6385-a.default-tenant:8786' after 10 s: Timed out trying to connect to 'tcp://mlrun-mydask-997e6385-a.default-tenant:8786' after 10 s: [Errno -2] Name or service not known\n", "> 2021-01-24 20:15:54,442 [info] using remote dask scheduler (mlrun-mydask-b4eb4ec5-8) at: tcp://mlrun-mydask-b4eb4ec5-8.default-tenant:8786\n" ] }, { "data": { "text/html": [ "dashboard link: default-tenant.app.yh57.iguazio-cd0.com:30975" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Dask Client / Cluster summary: Workers: 0, Cores: 0, Memory: 0 B" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# init dask cluster\n", "dsf.client" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and run functions\n", "\n", "Load the function objects from a .py or .yaml file, or from the function hub (marketplace).
" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the functions from the marketplace (function hub)\n", "sk_dask_proj.set_function('hub://describe_dask', name='describe')\n", "sk_dask_proj.set_function('hub://sklearn_classifier_dask', name='dask_classifier')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# alternatively, load the describe function from a local file\n", "sk_dask_proj.set_function('/User/dask/04-describe.py', name='describe', kind='job', image='mlrun/ml-models')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a fully automated ML pipeline\n", "\n", "### Add more functions to the project to be used in the pipeline (from the function hub/marketplace)\n", "\n", "Describe the data, then train and evaluate the model with Dask." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Define and save a pipeline\n", "\n", "The following workflow definition is written into a file. It describes a Kubeflow execution graph (DAG)\n", "and how functions and data are connected to form an end-to-end pipeline:\n", "\n", "* Ingest data\n", "* Describe data\n", "* Train, test, and evaluate with Dask\n", "\n", "Check the code below to see how function objects are initialized and used (by name) inside the workflow.
\n", "The `workflow.py` file has two parts: initializing the function objects, and defining the pipeline DSL (connecting the function inputs and outputs).\n", "\n", "> Note: The pipeline can also include CI steps, such as building container images and deploying models, as illustrated in the following example.\n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting project/workflow.py\n" ] } ], "source": [ "%%writefile project/workflow.py\n", "from kfp import dsl\n", "from mlrun import mount_v3io\n", "\n", "# params\n", "funcs = {}\n", "LABELS = \"label\"\n", "DROP = 'congestion_surcharge'\n", "DATA_URL = \"/User/iris.csv\"\n", "DASK_CLIENT = \"db://sk-project-dask/mydask\"\n", "\n", "# init_functions is used to configure function resources and local settings\n", "def init_functions(functions: dict, project=None, secrets=None):\n", "    for f in functions.values():\n", "        f.apply(mount_v3io())\n", "\n", "@dsl.pipeline(\n", "    name=\"Demo training pipeline\",\n", "    description=\"Shows how to use mlrun\"\n", ")\n", "def kfpipeline():\n", "\n", "    # describe the data\n", "    describe = funcs['describe'].as_step(\n", "        params={\"dask_function\": DASK_CLIENT},\n", "        inputs={\"dataset\": DATA_URL}\n", "    )\n", "\n", "    # get data, train, test, and evaluate\n", "    train = funcs['dask_classifier'].as_step(\n", "        name=\"train\",\n", "        handler=\"train_model\",\n", "        params={\"label_column\": LABELS,\n", "                \"dask_function\": DASK_CLIENT,\n", "                \"test_size\": 0.10,\n", "                \"model_pkg_class\": \"sklearn.ensemble.RandomForestClassifier\",\n", "                \"drop_cols\": DROP},\n", "        inputs={\"dataset\": DATA_URL},\n", "        outputs=['model', 'test_set']\n", "    )\n", "\n", "    train.after(describe)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "# register the workflow file as \"main\" and embed the workflow code into the project YAML\n",
"sk_dask_proj.set_workflow('main', 'workflow.py', embed=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save the project definitions to a file (project.yaml). It is recommended to commit all changes to a Git repo." ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "sk_dask_proj.save()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "## Run a pipeline workflow\n", "Use the `run` method to execute a workflow. You can provide alternative arguments and specify the default target for workflow artifacts.
\n", "The workflow ID is returned and can be used to track progress, or you can follow the hyperlinks.\n", "\n", "> Note: The same command can be issued through the CLI:\n", "    `mlrun project my-proj/ -r main -p \"v3io:///users/admin/mlrun/kfp/{{workflow.uid}}/\"`\n", "\n", "The `dirty` flag lets you run a project with uncommitted changes (when the notebook is in the same Git directory, the project is always dirty).\n", "The `watch` flag waits for the pipeline to complete and prints the results." ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "> 2021-01-24 21:30:12,077 [info] Pipeline run id=c1b351fc-073b-4cdd-a6c3-fc167afbce8e, check UI or DB for progress\n", "> 2021-01-24 21:30:12,079 [info] waiting for pipeline run completion\n" ] }, { "data": { "text/html": [ "Run Results: Workflow c1b351fc-073b-4cdd-a6c3-fc167afbce8e finished, status=Succeeded\n", "train (Jan 24 21:30:37, completed) results: micro=0.9979224376731302, macro=1.0, precision-1=1.0, precision-0=1.0, precision-2=0.8571428571428571, recall-1=1.0, recall-0=0.8461538461538461, recall-2=1.0, f1-1=1.0, f1-0=0.9166666666666666, f1-2=0.923076923076923; artifacts: ROCAUC, ClassificationReport, ConfusionMatrix, FeatureImportances, model, standard_scaler, label_encoder, test_set\n", "describe-dask (Jan 24 21:30:20, completed) results: scale_pos_weight=1.00; artifacts: histograms, imbalance, correlation" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "artifact_path = os.path.abspath('./pipe/{{workflow.uid}}')\n", "run_id = sk_dask_proj.run(\n", "    'main',\n", "    arguments={},\n", "    artifact_path=artifact_path,\n", "    dirty=False, watch=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**[back to top](#top)**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }