Serving graph with batching using a Hugging Face model#

This notebook demonstrates how to set up a serving graph with batch processing with a Hugging Face model, including the Hugging Face profile configuration, creating model artifacts, deploying a serving function with the dedicated_process execution mechanism (see add_model()), and testing the model inference.

In this section

Import the dependencies#

The MLRun imports include:

  • ModelRunnerStep: to run multiple models on each event.

  • LLModel(): to wrap a model for handling an LLM (Large Language Model) prompt-based inference.

  • HuggingFaceProfile(): to create a datastore profile that holds the Hugging Face credentials and the settings used to load the model.

import json
import os
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import mlrun
import mlrun.artifacts
import mlrun.serving
from dotenv import load_dotenv
from mlrun.datastore.datastore_profile import HuggingFaceProfile
from mlrun.runtimes.nuclio.function import AsyncSpec
from mlrun.serving import ModelRunnerStep

# Load environment variables, including HF_TOKEN and other secrets needed for the Hugging Face SDK.
# The other variables this notebook reads are HF_DEVICE, HF_DEVICE_MAP and HF_TRUST_REMOTE_CODE.
load_dotenv("secrets.env")
True

Configure the project#

The MLRun project is a container for all your work on this gen AI application. Read more about Projects and automation.

First you configure the project, then initialize it a few steps further on.

# Project configuration
project_name = "hf-batch-step"
image = "mlrun/mlrun"  # base image of the serving function
profile_name = "huggingface_batch_step"  # name of the Hugging Face datastore profile
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # appended to the ds:// profile prefix to form the model URL
execution_mechanism = "dedicated_process"  # execution mechanism for the model (see add_model())
mlrun_model_name = "invoke_model"  # key of the logged model artifact; also used in the invocation path
# Input data: one event per question. The keys of each event match the
# placeholders of PROMPT_TEMPLATE.
BATCH_INPUT_DATA = [
    {
        "question": "What is the capital of France? Answer with one word first, then provide a historical overview."
        " Answer in detail with at least 200 words.",
        "depth_level": "detailed",
        "persona": "teacher",
        "tone": "casual",
    },
    {
        "question": "What is the largest planet in our solar system? First give a one-word answer, "
        "then provide a detailed explanation in at least 200 words.",
        "depth_level": "basic",
        "persona": "astronomy teacher",
        "tone": "simple",
    },
    {
        "question": "Who wrote Hamlet? Answer shortly and then explain with details.  "
        "Answer in detail with at least 200 words.",
        "depth_level": "basic",
        "persona": "literature professor",
        "tone": "formal",
    },
    {
        # NOTE(review): unlike the sibling prompts, this one reads "... and then
        # Answer in detail ..." -- the "explain with details." part looks like it
        # was dropped. Confirm before changing, since it alters the request text.
        "question": "What color is the sky on a clear day? Answer shortly and then "
        "Answer in detail with at least 200 words.",
        "depth_level": "basic",
        "persona": "child",
        "tone": "fun",
    },
    {
        "question": "What planet do we live on? Answer shortly and then explain with details. "
        "Answer in detail with at least 200 words.",
        "depth_level": "basic",
        "persona": "astronaut",
        "tone": "educational",
    },
]

# One legend entry per prompt placeholder. Both values are left as None here;
# in the logged prompt artifact each "field" shows up as the placeholder's own
# name, so it appears to default to the key.
PROMPT_LEGEND = {
    "question": {"field": None, "description": None},
    "depth_level": {"field": None, "description": None},
    "persona": {"field": None, "description": None},
    "tone": {"field": None, "description": None},
}
# Lowercase keywords, in the same order as BATCH_INPUT_DATA, that the test
# looks for inside each lower-cased answer.
EXPECTED_RESULTS = ["paris", "jupiter", "shakespeare", "blue", "earth"]

# Chat-style prompt: a single user message whose {placeholders} are the keys
# of each input event.
PROMPT_TEMPLATE = [
    {
        "role": "user",
        "content": "{question}. Explain {depth_level} as a {persona} in {tone} style.",
    }
]

Create the project and the Hugging Face profile#

The HuggingFaceProfile is a datastore profile for credentials management. Read more about Data store profiles.

Note: Downloading HuggingFace models requires stable network connectivity. Downloading can fail or get stuck with unreliable connections. Ensure adequate network bandwidth.

# Fetch the MLRun project, creating it if it does not exist yet
project = mlrun.get_or_create_project(project_name)

# Build the Hugging Face datastore profile; the token and the device-related
# settings are read from environment variables
profile = HuggingFaceProfile(
    name=profile_name,
    task="text-generation",
    token=os.getenv("HF_TOKEN"),
    device=os.getenv("HF_DEVICE"),
    device_map=os.getenv("HF_DEVICE_MAP"),
    trust_remote_code=os.getenv("HF_TRUST_REMOTE_CODE"),
)

# Make the profile available to the project
project.register_datastore_profile(profile)

# The model URL is the profile's ds:// prefix followed by the model ID
url_prefix = f"ds://{profile_name}/"
model_url = f"{url_prefix}{model_id}"

# Summarize the configuration, one setting per line
print(
    f"Project: {project_name}",
    f"Profile: {profile_name}",
    f"Model URL: {model_url}",
    f"Execution Mechanism: {execution_mechanism}",
    sep="\n",
)
> 2026-03-05 16:29:49,616 [info] Created and saved project: {"context":"./","from_template":null,"name":"hf-batch-step","overwrite":false,"save":true}
> 2026-03-05 16:29:49,619 [info] Project created successfully: {"project_name":"hf-batch-step","stored_in_db":true}
Project: hf-batch-step
Profile: huggingface_batch_step
Model URL: ds://huggingface_batch_step/TinyLlama/TinyLlama-1.1B-Chat-v1.0
Execution Mechanism: dedicated_process

Create the model artifact#

# Register the Hugging Face model as an MLRun model artifact
model_artifact = project.log_model(
    mlrun_model_name,
    default_config={"max_new_tokens": 50},
    model_url=model_url,
)
print(f"Model artifact created: {model_artifact}")

# Log the prompt template as an LLM prompt artifact attached to the model artifact
llm_prompt_artifact = project.log_llm_prompt(
    "llm_artifact",
    model_artifact=model_artifact,
    prompt_template=PROMPT_TEMPLATE,
    prompt_legend=PROMPT_LEGEND,
    description="remote_model_huggingface-llm-prompt",
)
print(f"LLM Prompt artifact created: {llm_prompt_artifact}")
Model artifact created: {'spec': {'model_file': '', 'framework': '', 'db_key': 'invoke_model', 'producer': {'kind': 'project', 'name': 'hf-batch-step', 'tag': 'f81aaeca-68c0-4e92-9771-75cc4a5618cb', 'owner': 'admin'}, 'model_url': 'ds://huggingface_batch_step/TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'has_children': False, 'license': '', 'parameters': {'default_config': {'max_new_tokens': 50}}}, 'kind': 'model', 'status': {'state': 'created'}, 'metadata': {'key': 'invoke_model', 'tree': 'f81aaeca-68c0-4e92-9771-75cc4a5618cb', 'uid': '53b38fa55da1fb00014003a239cff7945b396a97', 'iter': 0, 'project': 'hf-batch-step'}}
LLM Prompt artifact created: {'spec': {'prompt_legend': {'question': {'field': 'question', 'description': None}, 'depth_level': {'field': 'depth_level', 'description': None}, 'persona': {'field': 'persona', 'description': None}, 'tone': {'field': 'tone', 'description': None}}, 'description': 'remote_model_huggingface-llm-prompt', 'db_key': 'llm_artifact', 'producer': {'kind': 'project', 'name': 'hf-batch-step', 'tag': 'd4c3aede-f993-457f-af73-585e15ade8bb', 'owner': 'admin'}, 'prompt_template': [{'role': 'user', 'content': '{question}. Explain {depth_level} as a {persona} in {tone} style.'}], 'parent_uri': 'store://models/hf-batch-step/invoke_model#0@f81aaeca-68c0-4e92-9771-75cc4a5618cb^53b38fa55da1fb00014003a239cff7945b396a97', 'format': 'json', 'has_children': False, 'license': '', 'target_path': 'v3io:///projects/hf-batch-step/artifacts/llm_artifact.json', 'size': 98}, 'kind': <ArtifactCategories.llm_prompt: 'llm-prompt'>, 'status': {'state': 'created'}, 'metadata': {'key': 'llm_artifact', 'tree': 'd4c3aede-f993-457f-af73-585e15ade8bb', 'uid': '8b8c1a3f22076cde971bfe21a518648514a001f5', 'hash': '24312969d4fde40522a147a1728bfe0fb5fb7755', 'iter': 0, 'project': 'hf-batch-step'}}

Create the serving function#

Now create the serving function. Read more about set_function.

# Define the serving function.
# The requirements pin the CPU build of PyTorch, which keeps deployment fast;
# consider the GPU build instead for better performance.
function = project.set_function(
    kind="serving",
    name="hugging-face-batch-step",
    image=image,
    requirements=[
        "--extra-index-url",
        "https://download.pytorch.org/whl/cpu",
        "torch==2.8.0+cpu",
        "transformers==4.56.2",
    ],
)

Set up the serving graph#

The flow topology is a full graph/DAG. This example uses the async engine, which is based on storey.transformations and an asynchronous event loop. The graph batches incoming events with a storey.Batch step and uses a ModelRunnerStep to run the model as a step in the graph.

# Build an asynchronous flow graph: batch -> model runner -> flatten -> respond
graph = function.set_topology("flow", engine="async")

# Batch the incoming events: up to 2 events per batch, flushed after 4 seconds
step = graph.to(
    "storey.Batch",
    "my_batching",
    max_events=2,
    flush_after_seconds=4,
    full_event=True,
)

# Run the LLM through a ModelRunnerStep. The execution mechanism comes from the
# project configuration, so changing it there changes the deployed graph and
# keeps the printed messages accurate.
model_runner_step = ModelRunnerStep(name="my_model_runner")
model_runner_step.add_model(
    model_class="mlrun.serving.states.LLModel",
    endpoint_name="my_endpoint",
    execution_mechanism=execution_mechanism,
    model_artifact=llm_prompt_artifact,
    result_path="output",  # the test reads the model result from response["output"]
)
step = step.to(model_runner_step)

# Pass event.body through FlatMap and mark this step as the graph's response
step.to("storey.FlatMap", _fn="(event.body)", full_event=True).respond()

print(f"Serving graph configured with {execution_mechanism} execution mechanism")
Serving graph configured with dedicated_process execution mechanism

Deploy the function#

# Larger Hugging Face models may require extended resources.
# Keeping a single replica prevents allocating those extended resources to
# multiple replicas.
function.spec.replicas = 1
function.spec.resources = dict(
    limits={"cpu": "6", "memory": "20Gi"},
    requests={"cpu": "25m", "memory": "1Mi"},
)

# The batch step requires AsyncSpec to be enabled on the HTTP trigger
function.with_http(
    async_spec=AsyncSpec(),
    workers=None,
    worker_timeout=500,
    gateway_timeout=600,
)
function.spec.readiness_timeout = 600
> 2026-03-05 16:29:54,751 [warning] Adding HTTP trigger despite the default HTTP trigger creation being disabled
# Deploy the function
# NOTE: in the run recorded below, the deploy took about three minutes, most of
# it spent building the processor image.
print("Deploying function...")
function.deploy()
print("Function deployed successfully!")
Deploying function...
> 2026-03-05 16:29:54,774 [info] Starting remote function deploy
2026-03-05 16:29:55  (info) Deploying function
2026-03-05 16:29:55  (info) Building
2026-03-05 16:29:55  (info) Staging files and preparing base images
2026-03-05 16:29:55  (warn) Using user provided base image, runtime interpreter version is provided by the base image
2026-03-05 16:29:55  (info) Building processor image
2026-03-05 16:32:10  (info) Build complete
2026-03-05 16:33:00  (info) Function deploy complete
> 2026-03-05 16:33:06,560 [info] Model endpoint creation task completed with state succeeded
> 2026-03-05 16:33:06,560 [info] Successfully deployed function: {"external_invocation_urls":["hf-batch-step-hugging-face-batch-step.default-tenant.app.vmdev25.lab.iguazeng.com/"],"internal_invocation_urls":["nuclio-hf-batch-step-hugging-face-batch-step.default-tenant.svc.cluster.local:8080"]}
Function deployed successfully!

Test the model inference#

# Test the model with the input data
def send_event(event: dict, delay: float):
    """Send one event to the deployed function after waiting ``delay`` seconds.

    Args:
        event: An input dictionary (one item of BATCH_INPUT_DATA); it is
            serialized to a JSON string before being sent.
        delay: Seconds to sleep before invoking, so requests are staggered.

    Returns:
        The value returned by ``function.invoke`` for this event.
    """
    sleep(delay)
    return function.invoke(
        f"v2/models/{mlrun_model_name}/infer",
        json.dumps(event),
    )


# One worker per event, so every request is in flight at the same time; the
# requests are staggered 0.2 seconds apart.
with ThreadPoolExecutor(max_workers=len(BATCH_INPUT_DATA)) as executor:
    futures = [
        executor.submit(send_event, event, i * 0.2)
        for i, event in enumerate(BATCH_INPUT_DATA)
    ]
    # Collect in submission order so the responses line up with EXPECTED_RESULTS
    batch_response = [future.result() for future in futures]

print("Responses received:")
for response, expected_result in zip(batch_response, EXPECTED_RESULTS):
    answer = response["output"]["answer"]
    # Case-insensitive check: the expected keywords are lowercase
    if expected_result in answer.lower():
        print(
            f'"{expected_result}" in the response of the question: "{response["question"]}"'
        )
    else:
        print(
            f'The response to the question "{response["question"]}" did not match the expected result.'
        )
Responses received:
"paris" in the response of the question: "What is the capital of France? Answer with one word first, then provide a historical overview. Answer in detail with at least 200 words."
"jupiter" in the response of the question: "What is the largest planet in our solar system? First give a one-word answer, then provide a detailed explanation in at least 200 words."
"shakespeare" in the response of the question: "Who wrote Hamlet? Answer shortly and then explain with details.  Answer in detail with at least 200 words."
"blue" in the response of the question: "What color is the sky on a clear day? Answer shortly and then Answer in detail with at least 200 words."
"earth" in the response of the question: "What planet do we live on? Answer shortly and then explain with details. Answer in detail with at least 200 words."