Model monitoring using LLM#
This tutorial illustrates a model monitoring system that leverages LLMs to maintain high standards for deployed models.
This tutorial explains how an LLM can be monitored. To see it in action, run the Large Language Model Monitoring demo.
Prerequisites#
import os

import mlrun
from mlrun.features import Feature
Create the project
project = mlrun.get_or_create_project(name="llm-monitoring-intro", context="./")
> 2024-10-22 11:06:22,479 [info] Created and saved project: {"context":"./","from_template":null,"name":"llm-monitoring-intro","overwrite":false,"save":true}
> 2024-10-22 11:06:22,481 [info] Project created successfully: {"project_name":"llm-monitoring-intro","stored_in_db":true}
Set the credentials
project.set_model_monitoring_credentials(
os.environ["V3IO_ACCESS_KEY"],
"v3io",
"v3io",
"v3io",
)
Enable model monitoring for the project
project.enable_model_monitoring(
image="mlrun/mlrun",
base_period=2, # frequency (in minutes) at which the monitoring applications are triggered
)
Add the monitoring-function code#
The monitoring function code collects the traffic to the serving function, analyzes it, and generates results for the specified metric.
%%writefile monit-code.py
import re
from typing import Any, Union
import mlrun
import mlrun.common.schemas
from mlrun.model_monitoring.applications import (
ModelMonitoringApplicationBase,
ModelMonitoringApplicationResult,
)
STATUS_RESULT_MAPPING = {
0: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected,
1: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection,
}
class LLMAsAJudgeApplication(ModelMonitoringApplicationBase):
def do_tracking(
self,
monitoring_context,
) -> Union[
ModelMonitoringApplicationResult, list[ModelMonitoringApplicationResult]
]:
        # User monitoring sampling, in this case a float representing model performance.
        # It can be calculated from the traffic to the function using
        # monitoring_context.sample_df (see the sketch after this code block).
        result = 0.9

        # Tag the logged dataset with the inference end time
        # (non-alphanumeric characters are replaced to form a valid tag)
        tag = re.sub(r"[^a-zA-Z0-9]", "-", str(monitoring_context.end_infer_time))
        monitoring_context.log_dataset(
            key="llm-monitoring-df",
            df=monitoring_context.sample_df,
            tag=tag,
        )
# get status:
status = STATUS_RESULT_MAPPING[round(result)]
return ModelMonitoringApplicationResult(
name="llm-monitoring-df",
value=result,
kind=mlrun.common.schemas.model_monitoring.constants.ResultKindApp.model_performance,
status=status,
extra_data={},
)
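In a real application, the hard-coded `result` above would be computed from the sampled traffic rather than fixed. The following sketch is illustrative only: the `judge_responses` helper, the `question`/`answer` column names, and the scoring rule are assumptions, not part of the MLRun API. Inspect `monitoring_context.sample_df` in your environment to see its actual schema and plug in a real LLM-as-a-judge call.

# Hypothetical helper: score the sampled traffic instead of returning a constant.
# The column names below are assumptions; adapt them to your sample_df schema.
import pandas as pd


def judge_responses(sample_df: pd.DataFrame) -> float:
    """Return an average quality score in [0, 1] for the sampled question/answer pairs."""
    scores = []
    for _, row in sample_df.iterrows():
        answer = str(row.get("answer", ""))
        # Placeholder rule: non-empty answers score 1.0. Replace this with a call to an
        # LLM judge that grades the (question, answer) pair and returns a numeric score.
        scores.append(1.0 if answer.strip() else 0.0)
    return sum(scores) / len(scores) if scores else 0.0

Inside do_tracking you would then set result = judge_responses(monitoring_context.sample_df).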
Define the model monitoring custom function that scans the traffic and calculates the performance metrics
application = project.set_model_monitoring_function(
func="monit-code.py",
    application_class="LLMAsAJudgeApplication",
name="llm-monit",
image="mlrun/mlrun",
)
project.deploy_function(application)
Create a model serving class that loads the LLM and generates responses
%%writefile model-serving.py
from typing import Any

import mlrun
from mlrun.serving.v2_serving import V2ModelServer
from transformers import AutoModelForCausalLM, AutoTokenizer
class LLMModelServer(V2ModelServer):
def __init__(
self,
context: mlrun.MLClientCtx = None,
name: str = None,
model_path: str = None,
model_name: str = None,
**kwargs
):
super().__init__(name=name, context=context, model_path=model_path, **kwargs)
self.model_name = model_name
def load(
self,
):
# Load the model from Hugging Face
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
def predict(self, request: dict[str, Any]):
inputs = request.get("inputs", [])
input_ids, attention_mask = self.tokenizer(
inputs[0], return_tensors="pt"
).values()
outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask)
        # Decode the generated tokens, then strip the echoed prompt and the EOS token:
outputs = self.tokenizer.decode(outputs[0])
outputs = outputs.split(inputs[0])[-1].replace(self.tokenizer.eos_token, "")
return [{"generated_text": outputs}]
Create the serving function using the class you just defined
serving_fn = project.set_function(
func="model-serving.py",
name="llm-server",
kind="serving",
image="gcr.io/iguazio/llm-serving:1.7.0",
)
serving_fn.apply(mlrun.auto_mount())
# GPU is optional
serving_fn.with_limits(gpus=1)
Log the model, enable tracking, and deploy the function#
This tutorial uses the gemma-2b model by Google.
Log the model to the project
base_model = "google-gemma-2b"
project.log_model(
base_model,
    model_file="model-iris.pkl",  # placeholder artifact file; the server loads the actual weights from Hugging Face via model_name
inputs=[Feature(value_type="str", name="question")],
outputs=[Feature(value_type="str", name="answer")],
)
Add the model parameters to the endpoint. This allows the model server class to initialize.
serving_fn.add_model(
"gemma-2b",
class_name="LLMModelServer",
    model_path=f"store://models/{project.name}/{base_model}:latest",
model_name="google/gemma-2b",
)
Enable tracking for the function, then deploy it.
serving_fn.set_tracking()
deployment = serving_fn.deploy()
Now the traffic to the function is analyzed and the performance is calculated.
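To see results flowing through, send some traffic to the deployed endpoint. A minimal sketch, assuming the V2 serving protocol used by the model server above (the prompt text is just an example):

# Send a sample request so the monitoring application has traffic to analyze
response = serving_fn.invoke(
    path="/v2/models/gemma-2b/infer",
    body={"inputs": ["What is a mortgage?"]},
)
print(response)

After the next base_period window elapses, the llm-monit application picks up the sampled requests, logs the dataset, and writes the model-performance result.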