Model monitoring using LLM#

This tutorial illustrates a model monitoring system that leverages LLMs to maintain high standards for deployed models.

In this tutorial

This tutorial explains how an LLM can be monitored. To see it in action, run the Large Language Model Monitoring demo.

Prerequisites#

import os

import mlrun
from mlrun.features import Feature

Create the project

project = mlrun.get_or_create_project(name="llm-monitoring-intro", context="./")
> 2024-10-22 11:06:22,479 [info] Created and saved project: {"context":"./","from_template":null,"name":"llm-monitoring-intro","overwrite":false,"save":true}
> 2024-10-22 11:06:22,481 [info] Project created successfully: {"project_name":"llm-monitoring-intro","stored_in_db":true}

Set the credentials

project.set_model_monitoring_credentials(
    os.environ["V3IO_ACCESS_KEY"],
    "v3io",
    "v3io",
    "v3io",
)

Enable model monitoring for the project

project.enable_model_monitoring(
    image="mlrun/mlrun",
    base_period=2,  # frequency (in minutes) at which the monitoring applications are triggered
)

Add the monitoring-function code#

The monitoring function code collects the traffic to the serving function, analyzes it, and generates results for the specified metric.

%%writefile monit-code.py
import re
from typing import Any, Union

import mlrun
import mlrun.common.schemas
from mlrun.model_monitoring.applications import (
    ModelMonitoringApplicationBase,
    ModelMonitoringApplicationResult,
)

STATUS_RESULT_MAPPING = {
    0: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected,
    1: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection,
}


class LLMAsAJudgeApplication(ModelMonitoringApplicationBase):

    def do_tracking(
        self,
        monitoring_context,
    ) -> Union[
        ModelMonitoringApplicationResult, list[ModelMonitoringApplicationResult]
    ]:
        
        # User monitoring sample score; here a fixed float representing model performance.
        # It can be calculated from the traffic to the function using monitoring_context.sample_df
        # (see the sketch after this code block).
        result = 0.9

        # Build a filesystem-friendly tag from the end of the inference window
        tag = re.sub(r"[^a-zA-Z0-9]", "-", str(monitoring_context.end_infer_time))
        monitoring_context.log_dataset(
            key="llm-monitoring-df",
            df=monitoring_context.sample_df,
            tag=tag,
        )

        # get status:
        status = STATUS_RESULT_MAPPING[round(result)]

        return ModelMonitoringApplicationResult(
            name="llm-monitoring-df",
            value=result,
            kind=mlrun.common.schemas.model_monitoring.constants.ResultKindApp.model_performance,
            status=status,
            extra_data={},
        )
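
In this example the performance score is a fixed 0.9. The following is a minimal sketch of how it could instead be derived from the sampled traffic; the judge_score callable is hypothetical (for example, a prompt to a judge LLM returning a value in [0, 1]), and the question/answer column names assume the features logged with the model later in this tutorial.

# Hypothetical helper -- not part of the tutorial code
def average_judge_score(sample_df, judge_score) -> float:
    # judge_score is any callable that scores one question/answer pair in [0, 1],
    # e.g. by prompting a judge LLM
    scores = [
        judge_score(row["question"], row["answer"])
        for _, row in sample_df.iterrows()
    ]
    # Average the per-sample scores; an empty sampling window yields 0.0
    return sum(scores) / len(scores) if scores else 0.0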

Define the model monitoring custom function that scans the traffic and calculates the performance metrics

application = project.set_model_monitoring_function(
    func="monit-code.py",
    application_class="LLMMonitApplication",
    name="llm-monit",
    image="mlrun/mlrun",
)
project.deploy_function(application)

Create a model serving class that loads the LLM and generates responses

%%writefile model-serving.py
from typing import Any

import mlrun
from mlrun.serving.v2_serving import V2ModelServer
from transformers import AutoModelForCausalLM, AutoTokenizer


class LLMModelServer(V2ModelServer):

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        name: str = None,
        model_path: str = None,
        model_name: str = None,
        **kwargs
    ):
        super().__init__(name=name, context=context, model_path=model_path, **kwargs)
        self.model_name = model_name
    
    def load(
        self,
    ):
        # Load the model from Hugging Face
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)


    def predict(self, request: dict[str, Any]):
        inputs = request.get("inputs", [])

        # Tokenize the prompt
        input_ids, attention_mask = self.tokenizer(
            inputs[0], return_tensors="pt"
        ).values()

        # Generate a completion for the prompt
        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask)

        # Decode the output, then strip the echoed prompt and the EOS token
        outputs = self.tokenizer.decode(outputs[0])
        outputs = outputs.split(inputs[0])[-1].replace(self.tokenizer.eos_token, "")
        return [{"generated_text": outputs}]

Create the serving function using the class you just defined

serving_fn = project.set_function(
    func="model-serving.py",
    name="llm-server",
    kind="serving",
    image="gcr.io/iguazio/llm-serving:1.7.0",
)

serving_fn.apply(mlrun.auto_mount())

# Requesting a GPU is optional, but it speeds up generation considerably
serving_fn.with_limits(gpus=1)

Deploy the model, enable tracking, and deploy the function#

This tutorial uses the gemma-2b model by Google.

Log the model to the project. The model artifact is only a placeholder that carries the monitoring metadata (the actual weights are downloaded from Hugging Face in the serving class's load method), so any small file can be used as the model_file.

base_model = "google-gemma-2b"
project.log_model(
    base_model,
    model_file="model-iris.pkl",
    inputs=[Feature(value_type="str", name="question")],
    outputs=[Feature(value_type="str", name="answer")],
)

Add the model parameters to the endpoint. This allows the model server class to initialize.

serving_fn.add_model(
    "gemma-2b",
    class_name="LLMModelServer",
    model_path=f"store://models/{project.name}/gemma-2b:latest",
    model_name="google/gemma-2b",
)

Enable tracking for the function, then deploy it.

serving_fn.set_tracking()
deployment = serving_fn.deploy()

Traffic to the serving function is now collected, and every base_period the monitoring application analyzes it and records the performance result.
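
To generate traffic for the monitoring application to analyze, invoke the deployed endpoint, for example:

response = serving_fn.invoke(
    path="/v2/models/gemma-2b/infer",
    body={"inputs": ["What is model monitoring?"]},
)
print(response)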