Model monitoring using LLM

Model monitoring using LLM#

This tutorial illustrates a model monitoring system that leverages LLMs to maintain high standards for deployed models.

In this tutorial

Prerequisites
Add the monitoring-function code
Deploy the model, enable tracking, and deploy the function

This tutorial explains how an LLM can be monitored. To see it in action, run the Large Language Model Monitoring demo.

Prerequisites#

A GPU node with NVIDIA drivers is required for the serving function.

import mlrun
from mlrun.features import Feature

Create the project

# Get the "genai-tutorial" project, or create it if it does not exist yet (as the call name says).
# NOTE(review): user_project=True presumably scopes the project name to the current user -- confirm.
project = mlrun.get_or_create_project(
    "genai-tutorial", user_project=True, allow_cross_project=True
)

Set the credentials

from src.model_monitoring_utils import enable_model_monitoring

# If this project was running with model monitoring (MM) enabled on a pre-1.8.0 release,
# disable the old model monitoring first so that its configuration gets updated.
# NOTE(review): delete_stream_function=True presumably also removes the old stream function -- confirm.
project.disable_model_monitoring(delete_stream_function=True)

Enable model monitoring for the project

# Turn on model monitoring for the project through the tutorial helper imported from
# src.model_monitoring_utils.
enable_model_monitoring(
    project=project,
    # NOTE(review): presumably the interval (in minutes) between monitoring runs -- confirm
    # against the helper's definition.
    base_period=2,
    # Uncomment to enable lag detection (see running-applications.md):
    # lag_threshold=10,       # alert if writer is 10+ min behind
    # lag_event_cooldown=5,   # min interval between lag events per worker
)

Add the monitoring-function code#

The monitoring function code collects the traffic to the serving function, analyzes it, and generates results for the specified metric.

Note

You can also import model monitoring applications from the MLRun hub. Each application has complete usage instructions.

%%writefile monit-code.py
import re
from typing import Any, Union

import mlrun
import mlrun.common.schemas
from mlrun.model_monitoring.applications import (
    ModelMonitoringApplicationBase,
    ModelMonitoringApplicationResult,
)

# Lookup table from the rounded monitoring score to the status reported with the result:
# 0 -> detected, 1 -> no_detection.
STATUS_RESULT_MAPPING = {
    0: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected,
    1: mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection,
}


class LLMMonitoringFunction(ModelMonitoringApplicationBase):
    """Monitoring application that reports a model-performance result for the LLM endpoint."""

    def do_tracking(
        self,
        monitoring_context,
    ) -> Union[
        ModelMonitoringApplicationResult, list[ModelMonitoringApplicationResult]
    ]:
        """Log the sampled traffic and return a single model-performance result.

        :param monitoring_context: Context object supplied to the application; this
                                   method reads its ``sample_df`` and calls its
                                   ``log_dataset``.

        :returns: One ``ModelMonitoringApplicationResult`` named ``llm_monitoring_df``
                  whose status is looked up from the rounded score.
        """
        # Placeholder score (a float) representing model performance. A real application
        # can calculate it from the traffic to the function via monitoring_context.sample_df.
        score = 0.9

        # Keep the sampled traffic as a dataset artifact.
        monitoring_context.log_dataset(
            key="llm-monitoring-df", df=monitoring_context.sample_df
        )

        constants = mlrun.common.schemas.model_monitoring.constants
        return ModelMonitoringApplicationResult(
            name="llm_monitoring_df",
            value=score,
            kind=constants.ResultKindApp.model_performance,
            # The score is rounded to 0 or 1 and translated into a status.
            status=STATUS_RESULT_MAPPING[round(score)],
            extra_data={},
        )

Define the model monitoring custom function that scans the traffic and calculates the performance metrics

# Register monit-code.py on the project as the "llm-monit" model monitoring function,
# with LLMMonitoringFunction as the application class.
application = project.set_model_monitoring_function(
    func="monit-code.py",
    application_class="LLMMonitoringFunction",
    name="llm-monit",
    image="mlrun/mlrun",
)

# Readiness timeout of 1200 seconds (20 minutes).
application.spec.readiness_timeout = 1200

# Deploy the monitoring application function.
project.deploy_function(application)

Create a model serving class that loads the LLM and generates responses

%%writefile model-serving.py
import mlrun
from mlrun.serving.v2_serving import V2ModelServer
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Any

class LLMModelServer(V2ModelServer):
    """Model server that loads a Hugging Face causal LM and returns generated text."""

    def __init__(
        self,
        context: mlrun.MLClientCtx = None,
        name: str = None,
        model_path: str = None,
        model_name: str = None,
        **kwargs
    ):
        """Initialize the server and remember which Hugging Face model to load.

        :param context:    MLRun context, forwarded to the base class.
        :param name:       Model name, forwarded to the base class.
        :param model_path: Model path, forwarded to the base class.
        :param model_name: Identifier passed to ``from_pretrained`` in ``load``.
        :param kwargs:     Extra arguments forwarded to the base class.
        """
        super().__init__(name=name, context=context, model_path=model_path, **kwargs)
        self.model_name = model_name

    def load(
        self,
    ):
        """Load the tokenizer and the model from Hugging Face using ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)

    def predict(self, request: dict[str, Any]):
        """Generate a continuation for the first prompt in ``request["inputs"]``.

        Only the first element of ``inputs`` is used.

        :param request: Request body; ``request["inputs"]`` is a list of prompts.

        :returns: A one-element list of the form ``[{"generated_text": <text>}]``
                  holding the generated continuation without the prompt.

        :raises ValueError: If the request carries no inputs.
        """
        inputs = request.get("inputs", [])
        if not inputs:
            raise ValueError("The request must contain a non-empty 'inputs' list")
        prompt = inputs[0]

        # Read the tensors by key instead of unpacking `.values()`, so the code does not
        # depend on key order or break when the tokenizer returns additional fields.
        encoded = self.tokenizer(prompt, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        outputs = self.model.generate(input_ids=input_ids, attention_mask=attention_mask)

        # Remove input: the generated sequence starts with the prompt tokens, so slice
        # them off by length. Splitting the decoded text on the prompt would also drop
        # generated text whenever the model repeats the prompt.
        continuation = outputs[0][input_ids.shape[-1]:]
        # skip_special_tokens strips the EOS marker (and works when eos_token is None).
        generated_text = self.tokenizer.decode(continuation, skip_special_tokens=True)
        return [{"generated_text": generated_text}]

Build an image

# Extra pip commands to run on top of the base image when building the serving image.
commands = [
    "pip install torch --index-url https://download.pytorch.org/whl/cpu",
    "pip install packaging==21.3",
    "pip install transformers adapters openai",
]

import sys

# Keep the minor version as an int (as sys.version_info reports it) so the message
# below prints e.g. "9" rather than "9.0".
minor_version = sys.version_info.minor
# Compare the (major, minor) tuple so the major version is taken into account too.
if sys.version_info >= (3, 11):
    commands.append("pip install --upgrade --force-reinstall protobuf")
else:
    print(f"minor_version {minor_version} not supported")
commands

Run the following command to build the image. Building an image takes quite some time, so once it has been built successfully there is no need to run this cell again.

# Build the ".llm-serving-base" image on top of mlrun/mlrun, running the pip commands
# collected in `commands`.
# NOTE(review): set_as_default=False presumably leaves the project's default image unchanged -- confirm.
project.build_image(
    image=".llm-serving-base",
    base_image="mlrun/mlrun",
    set_as_default=False,
    commands=commands,
)

Create the serving function using the class you just defined

# Register model-serving.py as the "llm-server" serving function, using the
# ".llm-serving-base" image.
serving_fn = project.set_function(
    func="model-serving.py",
    name="llm-server",
    kind="serving",
    image=".llm-serving-base",
)

# Set readiness timeout to 20 minutes (1200 seconds), deploy might take a while.
serving_fn.spec.readiness_timeout = 1200

# Attach the fuse mount only when the MLRun configuration reports that v3io is in use
# (i.e. not in ce / ig4).
if mlrun.mlconf.is_using_v3io():
    serving_fn.apply(mlrun.auto_mount())

Deploy the model, enable tracking, and deploy the function#

This tutorial uses the gpt2 model by OpenAI.

Log the model to the project

# Name under which the model is logged; also used to build the infer path when invoking.
base_model = "gpt2"
# Log the model to the project with one string input ("question") and one string
# output ("answer").
# NOTE(review): model_file points at src/model-iris.pkl, which looks unrelated to gpt2 --
# presumably a placeholder artifact, since the serving class loads the weights from
# Hugging Face by model name; confirm.
project.log_model(
    base_model,
    model_file="src/model-iris.pkl",
    inputs=[Feature(value_type="str", name="question")],
    outputs=[Feature(value_type="str", name="answer")],
)

Add the model parameters to the endpoint. This allows the model server class to initialize.

# Attach the logged model to the serving function. `base_model` is reused for the
# endpoint key, the store path, and the Hugging Face model name so they cannot drift
# apart from the name used when logging the model and when invoking the endpoint.
serving_fn.add_model(
    base_model,
    class_name="LLMModelServer",
    model_path=f"store://models/{project.name}/{base_model}:latest",
    # Forwarded to LLMModelServer.__init__, which uses it to load the model.
    model_name=base_model,
)

Enable tracking for the function, then deploy it.

# Enable tracking on the serving function before it is deployed.
serving_fn.set_tracking()

# Deploy the serving function.
deployment = serving_fn.deploy()

# Send a single question to the model's infer path to check that the endpoint responds.
ret = serving_fn.invoke(
    path=f"/v2/models/{base_model}/infer",
    body={"inputs": ["What is a mortgage?"]},
)
ret

Test your model serving

Let's generate traffic against the model:

import time


def question_model(questions, serving_function, base_model, seconds=0.5):
    """Send each question to the served model and print the response.

    :param questions:        Iterable of prompt strings, sent one request at a time.
    :param serving_function: Object exposing ``invoke(path=..., body=...)``; every
                             request goes through it (not through a global).
    :param base_model:       Model name used to build the ``/v2/models/<name>/infer`` path.
    :param seconds:          Pause between consecutive requests, in seconds.
    """
    # The path is the same for every request, so build it once.
    path = f"/v2/models/{base_model}/infer"
    for question in questions:
        # Invoking the pretrained model:
        ret = serving_function.invoke(
            path=path,
            body={"inputs": [question]},
        )
        print(ret)
        time.sleep(seconds)

# Sample prompts used to generate traffic against the deployed model.
example_questions = [
    "What is a mortgage?",
    "How does a credit card work?",
    "Who painted the Mona Lisa?",
    "Please plan me a 4-days trip to north Italy",
    "Write me a song",
    "How much people are there in the world?",
    "What is climate change?",
    "How does the stock market work?",
    "Who wrote 'To Kill a Mockingbird'?",
    "Please plan me a 3-day trip to Paris",
    "Write me a poem about the ocean",
    "How many continents are there in the world?",
    "What is artificial intelligence?",
    "How does a hybrid car work?",
    "Who invented the telephone?",
    "Please plan me a week-long trip to New Zealand",
    "What is inflation?",
    "How do vaccines work?",
    "Who discovered gravity?",
    "Please plan me a weekend trip to Tokyo.",
    "Write me a short story about a time traveler.",
    "How many planets are in the solar system?",
    "What is quantum physics?",
    "How does a dishwasher work?",
    "Who wrote '1984'?",
    "Please plan me a 5-day road trip through California.",
    "Write me a haiku about autumn.",
    "What is the tallest mountain in the world?",
    "How does cryptocurrency work?",
    "Who invented the light bulb?",
    "What is the meaning of photosynthesis?",
    "How does an airplane fly?",
    "Who painted 'The Starry Night'?",
    "Please plan me a 10-day trip across South America.",
    "Write me a letter to apologize to a friend.",
    "How many countries are there in the world?",
    "What is renewable energy?",
    "How does Wi-Fi work?",
    "Who directed the movie 'Inception'?",
    "Please plan me a cultural tour of Egypt.",
]

# Send every sample prompt to the deployed serving function, one request at a time.
question_model(
    questions=example_questions,
    serving_function=serving_fn,
    base_model=base_model,
)

Now the traffic to the function is analyzed and the performance is calculated.