Model monitoring using LLM
This tutorial illustrates a model monitoring system that leverages LLMs to maintain high standards for deployed models.
In this tutorial
This tutorial explains how an LLM can be monitored. To see it in action, run the Large Language Model Monitoring demo.
Prerequisites
A GPU node with NVIDIA drivers is required for the serving function.
import mlrun
from mlrun.features import Feature
Create the project
# Load the "genai-tutorial" project, creating it if it does not exist yet.
# With user_project=True the resolved project name appears to carry a per-user
# suffix (see the project name in the logged output).
project = mlrun.get_or_create_project(
"genai-tutorial", user_project=True, allow_cross_project=True
)
# Use the current directory as the project source.
# NOTE(review): pull_at_runtime=True presumably makes functions fetch the source when they start rather than at build time - confirm.
project.set_source(".", pull_at_runtime=True)
> 2025-09-12 04:27:22,190 [info] Project loaded successfully: {"project_name":"genai-tutorial-xingsheng"}
Set the credentials
from src.model_monitoring_utils import enable_model_monitoring
# If model monitoring (MM) was enabled on this project before version 1.8.0,
# disable the old deployment first so that its configuration is updated the
# next time monitoring is enabled.
project.disable_model_monitoring(delete_stream_function=True)
Enable model monitoring for the project
# NOTE(review): base_period=2 is presumably the monitoring interval in minutes - confirm against enable_model_monitoring in src/model_monitoring_utils.
enable_model_monitoring(project=project, base_period=2)
Add the monitoring-function code
The monitoring function code collects the traffic to the serving function, analyzes it, and generates results for the specified metric.
Note
You can also import model monitoring applications from the MLRun hub. Each application has complete usage instructions.
%%writefile monit-code.py
import re
from typing import Any, Union
import mlrun
import mlrun.common.schemas
from mlrun.model_monitoring.applications import (
ModelMonitoringApplicationBase,
ModelMonitoringApplicationResult,
)
# Lookup from the rounded monitoring score to the status reported for it:
# a score that rounds to 0 is reported as detected, one that rounds to 1 as
# no_detection.
_ResultStatusApp = mlrun.common.schemas.model_monitoring.constants.ResultStatusApp
STATUS_RESULT_MAPPING = {
    0: _ResultStatusApp.detected,
    1: _ResultStatusApp.no_detection,
}
class LLMMonitoringFunction(ModelMonitoringApplicationBase):
    """Example monitoring application that reports a fixed performance score.

    Each invocation logs the sampled traffic as a dataset and returns a single
    model-performance result whose status is looked up from the rounded score.
    """

    def do_tracking(
        self,
        monitoring_context,
    ) -> Union[
        ModelMonitoringApplicationResult, list[ModelMonitoringApplicationResult]
    ]:
        """Log the sampled traffic and return one model-performance result.

        :param monitoring_context: Context object supplied by the monitoring
            framework; this method reads ``sample_df`` from it and calls its
            ``log_dataset`` method.
        :returns: A ``ModelMonitoringApplicationResult`` named
            ``llm_monitoring_df``.
        """
        # Placeholder performance score (a float, hard-coded for the tutorial).
        # A real application can calculate it from the traffic to the function
        # using monitoring_context.sample_df.
        performance_score = 0.9

        # Keep the sampled traffic as a dataset artifact.
        monitoring_context.log_dataset(
            key="llm-monitoring-df",
            df=monitoring_context.sample_df,
        )

        result_kind = (
            mlrun.common.schemas.model_monitoring.constants.ResultKindApp.model_performance
        )
        return ModelMonitoringApplicationResult(
            name="llm_monitoring_df",
            value=performance_score,
            kind=result_kind,
            # The score is rounded to 0 or 1 to pick the status.
            status=STATUS_RESULT_MAPPING[round(performance_score)],
            extra_data={},
        )
Define the model monitoring custom function that scans the traffic and calculates the performance metrics
# Register the monitoring application defined in monit-code.py with the project.
application = project.set_model_monitoring_function(
    func="monit-code.py",
    application_class="LLMMonitoringFunction",
    name="llm-monit",
    image="mlrun/mlrun",
)
# Readiness timeout of 20 minutes, expressed in seconds.
application.spec.readiness_timeout = 20 * 60
project.deploy_function(application)
Create a model serving class that loads the LLM and generates responses
%%writefile model-serving.py
import mlrun
from mlrun.serving.v2_serving import V2ModelServer
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Any
class LLMModelServer(V2ModelServer):
    """Model server that loads a Hugging Face causal LM and generates text."""

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        name: str = None,
        model_path: str = None,
        model_name: str = None,
        **kwargs
    ):
        """Store the Hugging Face model name and initialize the base server.

        :param context: MLRun context, forwarded to the base class.
        :param name: Name of the model on the serving function, forwarded to
            the base class.
        :param model_path: Model artifact path, forwarded to the base class.
        :param model_name: Hugging Face model identifier used by ``load``.
        :param kwargs: Additional arguments forwarded to the base class.
        """
        super().__init__(name=name, context=context, model_path=model_path, **kwargs)
        self.model_name = model_name

    def load(
        self,
    ):
        """Load the tokenizer and the model from Hugging Face by name."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)

    def predict(self, request: dict[str, Any]):
        """Generate a completion for every prompt in ``request["inputs"]``.

        :param request: Request body; prompts are read from its ``"inputs"``
            list. A missing or empty list yields an empty result.
        :returns: A list with one ``{"generated_text": ...}`` dict per prompt,
            in input order.
        """
        prompts = request.get("inputs", [])
        results = []
        for prompt in prompts:
            encoded = self.tokenizer(prompt, return_tensors="pt")
            # Read the tensors by key instead of unpacking .values(), so the
            # call does not depend on the order or number of fields the
            # tokenizer returns.
            generated = self.model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
            )
            text = self.tokenizer.decode(generated[0])
            # Remove the echoed prompt and the end-of-sequence marker.
            text = text.split(prompt)[-1].replace(self.tokenizer.eos_token, "")
            results.append({"generated_text": text})
        return results
Build an image
# Extra packages installed into the serving image.
commands = [
    "pip install pytorch-lightning",
    "pip install packaging==21.3",
    "pip install transformers adapters openai",
]

# The protobuf version must match the interpreter: Python 3.9 needs
# protobuf 3.20.2, Python 3.11 and later work with the latest release.
import sys

# Kept as an int (not a float) so comparisons and messages read naturally.
minor_version = sys.version_info.minor
if sys.version_info >= (3, 11):
    commands.append("pip install --upgrade --force-reinstall protobuf")
elif sys.version_info[:2] == (3, 9):
    commands.append("pip install protobuf==3.20.2")
else:
    print(f"Python {sys.version_info.major}.{minor_version} is not supported")
commands
Run the following command to build the image. Once it has been built successfully, there is no need to run the cell again, since building an image takes quite some time.
# Build the ".llm-serving-base" image from the mlrun/mlrun base image plus the
# pip commands collected above. With set_as_default=False it is not made the
# project default; the serving function references it by name.
project.build_image(
image=".llm-serving-base",
base_image="mlrun/mlrun",
set_as_default=False,
commands=commands,
)
Create the serving function using the class you just defined
# Serving function backed by model-serving.py and the image built for it.
serving_fn = project.set_function(
    func="model-serving.py",
    name="llm-server",
    kind="serving",
    image=".llm-serving-base",
)

# Deploy might take a while, so allow 20 minutes (in seconds) for readiness.
serving_fn.spec.readiness_timeout = 20 * 60

# Attach the fuse mount to the function unless running in CE mode.
if not mlrun.mlconf.is_ce_mode():
    serving_fn.apply(mlrun.auto_mount())
Deploy the model, enable tracking, and deploy the function
This tutorial uses the gpt2 model by OpenAI.
Log the model to the project
# Name under which the model is logged and which is used in the inference path.
base_model = "gpt2"
# Register a model artifact with its input/output schema (one string feature each).
# NOTE(review): model_file points at "src/model-iris.pkl", while the serving class loads the weights from Hugging Face by model name - presumably a placeholder file; confirm.
project.log_model(
base_model,
model_file="src/model-iris.pkl",
inputs=[Feature(value_type="str", name="question")],
outputs=[Feature(value_type="str", name="answer")],
)
Add the model parameters to the endpoint. This allows the model server class to initialize.
# Register the model on the serving function under base_model, the same name
# used when the model was logged and in the inference path, instead of
# repeating the literal. model_name is passed to LLMModelServer, which uses it
# to load the tokenizer and weights.
serving_fn.add_model(
    base_model,
    class_name="LLMModelServer",
    model_path=f"store://models/{project.name}/{base_model}:latest",
    model_name=base_model,
)
Enable tracking for the function, then deploy it.
# Turn on tracking for the serving function before deploying it.
serving_fn.set_tracking()
deployment = serving_fn.deploy()
# Smoke test: send a single prompt to the deployed model's inference path.
ret = serving_fn.invoke(
path=f"/v2/models/{base_model}/infer",
body={"inputs": ["What is a mortgage?"]},
)
ret
Test your model serving
Let's generate traffic against the model:
import time
def question_model(questions, serving_function, base_model, seconds=0.5):
    """Send each question to the served model and print the response.

    :param questions: Iterable of prompt strings; each is sent as a separate
        request.
    :param serving_function: Object exposing ``invoke(path=..., body=...)``;
        every request goes through it.
    :param base_model: Model name used to build the inference path.
    :param seconds: Pause after each request, in seconds.
    """
    infer_path = f"/v2/models/{base_model}/infer"
    for question in questions:
        # Use the function that was passed in, not a notebook-level global.
        ret = serving_function.invoke(
            path=infer_path,
            body={"inputs": [question]},
        )
        print(ret)
        time.sleep(seconds)
# Sample prompts sent to the serving function to generate traffic.
example_questions = [
"What is a mortgage?",
"How does a credit card work?",
"Who painted the Mona Lisa?",
"Please plan me a 4-days trip to north Italy",
"Write me a song",
"How much people are there in the world?",
"What is climate change?",
"How does the stock market work?",
"Who wrote 'To Kill a Mockingbird'?",
"Please plan me a 3-day trip to Paris",
"Write me a poem about the ocean",
"How many continents are there in the world?",
"What is artificial intelligence?",
"How does a hybrid car work?",
"Who invented the telephone?",
"Please plan me a week-long trip to New Zealand",
"What is inflation?",
"How do vaccines work?",
"Who discovered gravity?",
"Please plan me a weekend trip to Tokyo.",
"Write me a short story about a time traveler.",
"How many planets are in the solar system?",
"What is quantum physics?",
"How does a dishwasher work?",
"Who wrote '1984'?",
"Please plan me a 5-day road trip through California.",
"Write me a haiku about autumn.",
"What is the tallest mountain in the world?",
"How does cryptocurrency work?",
"Who invented the light bulb?",
"What is the meaning of photosynthesis?",
"How does an airplane fly?",
"Who painted 'The Starry Night'?",
"Please plan me a 10-day trip across South America.",
"Write me a letter to apologize to a friend.",
"How many countries are there in the world?",
"What is renewable energy?",
"How does Wi-Fi work?",
"Who directed the movie 'Inception'?",
"Please plan me a cultural tour of Egypt.",
]
# Send every example question to the deployed model to generate traffic.
question_model(
questions=example_questions,
serving_function=serving_fn,
base_model=base_model,
)
Now the traffic to the function is analyzed and the performance is calculated.