Deploying an LLM using MLRun#
This notebook illustrates deploying an LLM using MLRun: it shows how to grab a dataset, which is a list of articles, scrape those articles, chunk and index them into a vector store, and then deploy an open-source Hugging Face model to an endpoint where you can make requests and get responses with a RAG enrichment using the data that you just downloaded.
Since this tutorial is for illustrative purposes, it uses minimal resources — CPU and not GPU, and a small amount of data.
In this tutorial:
See also:
MLRun installation and configuration#
Before running this notebook, make sure the mlrun package is installed (pip install mlrun) and that you have configured access to the MLRun service.
# Install MLRun if not installed, run this only once. Restart the notebook after the install
# %pip install mlrun
import json
import mlrun
Get or create a new project
First create, load or use (get) an MLRun Project. The get_or_create_project method tries to load the project from the MLRun DB. If the project does not exist, it creates a new one.
# Load the "genai-tutorial" project from the MLRun DB, creating it when it
# does not exist yet; "./" is the local project context directory.
project = mlrun.get_or_create_project(
    "genai-tutorial",
    context="./",
    user_project=True,
    allow_cross_project=True,
)

# Use the current directory as the project source and pull it at runtime.
project.set_source(".", pull_at_runtime=True)
Set up the vector database in the cluster#
These two steps import a predefined dataset and load it into a vector database. The vector database is then stored in the data layer of the cluster.
If you're not using Iguazio's Jupyter, download fetch-vectordb-data.py.
# Hugging Face model served later in the notebook: the free open-source PHI 2.
MODEL_ID = "microsoft/phi-2"

# Sample CSV that feeds the VectorDB.
DATA_SET = mlrun.get_sample_path("data/genai-tutorial/labelled_newscatcher_dataset.csv")

# Directory holding the VectorDB files: the configured artifact path with the
# "v3io://" scheme rewritten to the "/v3io" prefix and the "{{run.project}}"
# placeholder filled in with this project's name, plus a "/cache" suffix.
CACHE_DIR = (
    mlrun.mlconf.artifact_path
    .replace("v3io://", "/v3io")
    .replace("{{run.project}}", project.name)
) + "/cache"

# An "s3://" cache location is used as the signal that auto_mount should be
# skipped in later cells (presumably an MLRun CE deployment — TODO confirm).
is_ce = CACHE_DIR.startswith("s3://")
is_ce
Build an image from mlrun/mlrun to include langchain and torch packages#
The image can be created with mlrun/mlrun as base:
# Extra pip commands to run on top of the base image: the pinned
# chromadb/langchain/transformers stack, torch from the CPU wheel index, and
# upgraded requests libraries.
commands = [
    "pip install chromadb==0.5.0 langchain==0.2.3 langchain-community==0.2.4 langchain-core==0.2.5 langchain-text-splitters==0.2.1 clean-text==0.6.0 transformers==4.41.2",
    "pip install torch --index-url https://download.pytorch.org/whl/cpu",
    "pip install --upgrade requests requests-toolbelt",
]

# protobuf has to match the Python version:
#   Python 3.9   -> protobuf 3.20.2 (pinned)
#   Python 3.11+ -> latest protobuf
# Any other minor version is only reported; no protobuf command is added.
import sys

# Keep the minor version as an int: it is compared against ints and printed
# as-is, so a float conversion would only turn "10" into "10.0" in the message.
minor_version = sys.version_info.minor
if minor_version >= 11:
    commands.append("pip install --upgrade --force-reinstall protobuf")
elif minor_version == 9:
    commands.append("pip install protobuf==3.20.2")
else:
    print(f"minor_version {minor_version} not supported")

# Notebook display of the final command list.
commands
Run the following command to build the image. Once it is successfully built, there is no need to run the cell again, since building an image takes quite some time.
# Build the ".llm-demo-data" image on top of mlrun/mlrun by running the pip
# commands prepared in `commands`; it is not set as the project default image.
project.build_image(
    base_image="mlrun/mlrun",
    commands=commands,
    image=".llm-demo-data",
    set_as_default=False,
)
Fetch the dataset for the Vector DB and save it in cluster:
# Register the data-fetching script as a job that runs on the demo image.
fetch = project.set_function(
    func="src/fetch-vectordb-data.py",
    name="fetch-vectordb-data",
    image=".llm-demo-data",
    kind="job",
)

# Run its "handler" entry point on the sample dataset.
fetch_params = {"data_set": DATA_SET}
ret = project.run_function(
    function="fetch-vectordb-data",
    name="fetch-vectordb-data-run",
    handler="handler",
    params=fetch_params,
)

# Notebook display of the run outputs (the next step reads "vector-db-dataset").
ret.outputs
Build the vector DB#
Build the vector DB in the data layer and load the data into it.
If you're not using Iguazio's Jupyter, download build-vector-db.py.
# Register the vector DB builder as a job that runs on the demo image.
build_vectordb = project.set_function(
    func="src/build-vector-db.py",
    name="build-vectordb",
    image=".llm-demo-data",
    kind="job",
)

# auto_mount is applied only when the cache is not on "s3://" (see is_ce).
if is_ce:
    print("Not applying mlrun.auto_mount!")
else:
    build_vectordb.apply(mlrun.auto_mount())
    print("Applying mlrun.auto_mount!")

# Feed the dataset produced by the fetch run into the Chroma handler and
# collect the resulting "vect_db" output.
vectordb_inputs = {"df": ret.outputs["vector-db-dataset"]}
build_vectordb_run = project.run_function(
    function="build-vectordb",
    handler="handler_chroma",
    inputs=vectordb_inputs,
    params={"cache_dir": CACHE_DIR},
    outputs=["vect_db"],
)

# The serving function receives this value through its environment.
VECTORDB_PATH = build_vectordb_run.outputs["vect_db"]
Serving the function#
If you're not using Iguazio's Jupyter, download serving.py.
Now you can deploy the Nuclio function that serves the LLM:
# Register the serving code as a Nuclio function on the demo image.
serve_func = project.set_function(
    func="src/serving.py",
    name="serve-llm",
    kind="nuclio",
    image=".llm-demo-data",
)

# Pass the model id, cache directory and VectorDB path to the serving
# function through environment variables.
serving_env = {
    "MODEL_ID": MODEL_ID,
    "CACHE_DIR": CACHE_DIR,
    "VECTORDB_PATH": VECTORDB_PATH,
}
serve_func.set_envs(env_vars=serving_env)

# The model is stored in memory, so use exactly one replica and one worker.
serve_func.spec.min_replicas = serve_func.spec.max_replicas = 1

# CPU-only inference might take ~1 minute, hence the increased timeouts.
serve_func.with_http(workers=1, worker_timeout=120, gateway_timeout=150)
serve_func.set_config("spec.readinessTimeoutSeconds", 1200)

# auto_mount is applied only when the cache is not on "s3://" (see is_ce).
if is_ce:
    print("Not applying mlrun.auto_mount!")
else:
    serve_func.apply(mlrun.auto_mount())
    print("Applying mlrun.auto_mount!")

# From here on, serve_func holds the value returned by deploy_function.
serve_func = project.deploy_function(function="serve-llm")
Test Serving Function#
Invoke the inference endpoint of the LLM that is hosted in the cluster:
# Sample request: a question plus the topic it belongs to.
body = {
    "question": "What are some new developments in space travel?",
    "topic": "science",
}

# Send the JSON-encoded request to the root path of the deployed function.
request_payload = json.dumps(body)
resp = serve_func.function.invoke("/", body=request_payload)

# Print the answer, then the sources and the prompt returned with it.
for field in ("response", "sources", "prompt"):
    print(resp[field])
# Point the project at the function definitions stored in the MLRun DB.
db_function_uris = [
    f"db://{project.name}/fetch-vectordb-data",
    f"db://{project.name}/build-vectordb",
    f"db://{project.name}/serve-llm",
]
for function_uri in db_function_uris:
    project.set_function(function_uri)

# Use the DB as the project source as well, then persist the project.
project.set_source(f"db://{project.name}")
project.save()
Run E2E Workflow#
%%writefile workflow.py
import mlrun
from kfp import dsl
@dsl.pipeline(name="GenAI demo")
def kfpipeline(data_set, cache_dir, model_id):
    # End-to-end flow: fetch the dataset, build the vector DB from it, then
    # deploy the "serve-llm" function configured with the resulting DB path.
    project = mlrun.get_current_project()

    # Step 1: fetch the dataset that feeds the vector DB.
    fetch_step = project.run_function(
        function="fetch-vectordb-data",
        name="fetch-vectordb-data-run",
        handler="handler",
        params={"data_set": data_set},
        outputs=["vector-db-dataset"],
    )

    # Step 2: build the vector DB from the fetched dataset.
    build_step = project.run_function(
        function="build-vectordb",
        handler="handler_chroma",
        inputs={"df": fetch_step.outputs["vector-db-dataset"]},
        params={"cache_dir": cache_dir},
        outputs=["vect_db"],
    )

    # Step 3: configure the serving function with the model id, cache
    # directory and the vector DB produced by step 2.
    serving_fn = project.get_function("serve-llm")
    serving_env = {
        "MODEL_ID": model_id,
        "CACHE_DIR": cache_dir,
        "VECTORDB_PATH": build_step.outputs["vect_db"],
    }
    serving_fn.set_envs(env_vars=serving_env)

    # One replica and one worker, with generous HTTP and readiness timeouts.
    serving_fn.spec.min_replicas = serving_fn.spec.max_replicas = 1
    serving_fn.with_http(workers=1, worker_timeout=120, gateway_timeout=150)
    serving_fn.set_config("spec.readinessTimeoutSeconds", 1200)

    # Deploy only after the vector DB build step.
    deploy_step = project.deploy_function("serve-llm", verbose=True).after(build_step)
# Register workflow.py as the project's "main" workflow (embedded in the
# project) and persist the project.
project.set_workflow(
    name="main",
    workflow_path="workflow.py",
    embed=True,
)
project.save()
Note that the workflow may take up to 20 minutes to complete.#
# Values for the kfpipeline parameters (data_set, cache_dir, model_id).
workflow_arguments = {
    "cache_dir": CACHE_DIR,
    "data_set": DATA_SET,
    "model_id": MODEL_ID,
}

# Run the "main" workflow with the remote engine and watch its progress.
run_id = project.run(
    name="main",
    arguments=workflow_arguments,
    engine="remote",
    watch=True,
)