Serving graph with batching using a Hugging Face model#
This notebook demonstrates how to set up a serving graph with batch processing with a Hugging Face model, including the Hugging Face profile configuration, creating model artifacts, deploying a serving function with the dedicated_process execution mechanism (see add_model()), and testing the model inference.
In this section
Import the dependencies#
The MLRun imports include:
ModelRunnerStep: to run multiple models on each event. LLModel(): to wrap a model for handling an LLM (Large Language Model) prompt-based inference. HuggingFaceProfile(): to create a new model by parsing and validating input data from keyword arguments.
import json
import os
from concurrent.futures import ThreadPoolExecutor
from time import sleep

import mlrun
import mlrun.artifacts
import mlrun.serving
from dotenv import load_dotenv
from mlrun.datastore.datastore_profile import HuggingFaceProfile
from mlrun.runtimes.nuclio.function import AsyncSpec
from mlrun.serving import ModelRunnerStep
# Load environment variables, including HF_TOKEN and other secrets needed for the Hugging Face SDK
# The profile cell later reads HF_TOKEN, HF_DEVICE, HF_DEVICE_MAP and HF_TRUST_REMOTE_CODE
# from os.environ, so secrets.env is the place to define them.
load_dotenv("secrets.env")
True
Configure the project#
The MLRun project is a container for all your work on this gen AI application. Read more about Projects and automation.
First you configure the project, then initialize it a few steps further on.
# Project configuration
# project_name: MLRun project that is created or loaded further on
# image: base image for the serving function
# profile_name: name of the Hugging Face datastore profile (also used in the ds:// model URL)
# model_id: Hugging Face model identifier appended to the ds:// URL
# execution_mechanism: how the model runner executes the model
# mlrun_model_name: key of the logged model artifact; also used in the invoke path
project_name = "hf-batch-step"
image = "mlrun/mlrun"
profile_name = "huggingface_batch_step"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
execution_mechanism = "dedicated_process"
mlrun_model_name = "invoke_model"
# Input data
# Each event carries the four fields referenced by the placeholders in PROMPT_TEMPLATE.
BATCH_INPUT_DATA = [
{
"question": "What is the capital of France? Answer with one word first, then provide a historical overview."
" Answer in detail with at least 200 words.",
"depth_level": "detailed",
"persona": "teacher",
"tone": "casual",
},
{
"question": "What is the largest planet in our solar system? First give a one-word answer, "
"then provide a detailed explanation in at least 200 words.",
"depth_level": "basic",
"persona": "astronomy teacher",
"tone": "simple",
},
{
"question": "Who wrote Hamlet? Answer shortly and then explain with details. "
"Answer in detail with at least 200 words.",
"depth_level": "basic",
"persona": "literature professor",
"tone": "formal",
},
{
"question": "What color is the sky on a clear day? Answer shortly and then "
"Answer in detail with at least 200 words.",
"depth_level": "basic",
"persona": "child",
"tone": "fun",
},
{
"question": "What planet do we live on? Answer shortly and then explain with details. "
"Answer in detail with at least 200 words.",
"depth_level": "basic",
"persona": "astronaut",
"tone": "educational",
},
]
# Legend for the prompt placeholders. "field" is left as None here; in the logged
# artifact printout each field shows up as the placeholder's own name.
PROMPT_LEGEND = {
"question": {"field": None, "description": None},
"depth_level": {"field": None, "description": None},
"persona": {"field": None, "description": None},
"tone": {"field": None, "description": None},
}
# Keyword expected in each answer, in the same order as BATCH_INPUT_DATA.
# The test cell lowercases the answer before checking, so keep these lowercase.
EXPECTED_RESULTS = ["paris", "jupiter", "shakespeare", "blue", "earth"]
# Chat-style prompt; the {placeholders} match the keys of each input event.
PROMPT_TEMPLATE = [
{
"role": "user",
"content": "{question}. Explain {depth_level} as a {persona} in {tone} style.",
}
]
Create the project and the Hugging Face profile#
The HuggingFaceProfile is a datastore profile for credentials management. Read more about Data store profiles.
Note: Downloading HuggingFace models requires stable network connectivity. Downloading can fail or get stuck with unreliable connections. Ensure adequate network bandwidth.
# Load the MLRun project, creating it on first use
project = mlrun.get_or_create_project(project_name)

# Describe the Hugging Face connection as a datastore profile.
# Every HF_* setting comes from the environment; a variable that is not set is passed as None.
profile = HuggingFaceProfile(
    name=profile_name,
    task="text-generation",
    token=os.getenv("HF_TOKEN"),
    device=os.getenv("HF_DEVICE"),
    device_map=os.getenv("HF_DEVICE_MAP"),
    trust_remote_code=os.getenv("HF_TRUST_REMOTE_CODE"),
)

# Make the profile available to the project so ds://<profile_name>/ URLs can be used
project.register_datastore_profile(profile)

# The model URL is the profile's ds:// prefix followed by the Hugging Face model id
url_prefix = f"ds://{profile_name}/"
model_url = f"{url_prefix}{model_id}"

# Summarize the effective configuration, one setting per line
print(
    f"Project: {project_name}",
    f"Profile: {profile_name}",
    f"Model URL: {model_url}",
    f"Execution Mechanism: {execution_mechanism}",
    sep="\n",
)
> 2026-03-05 16:29:49,616 [info] Created and saved project: {"context":"./","from_template":null,"name":"hf-batch-step","overwrite":false,"save":true}
> 2026-03-05 16:29:49,619 [info] Project created successfully: {"project_name":"hf-batch-step","stored_in_db":true}
Project: hf-batch-step
Profile: huggingface_batch_step
Model URL: ds://huggingface_batch_step/TinyLlama/TinyLlama-1.1B-Chat-v1.0
Execution Mechanism: dedicated_process
Create the model artifact#
# Register the Hugging Face model as an MLRun model artifact.
# default_config limits each generation to 50 new tokens.
model_artifact = project.log_model(
    mlrun_model_name,
    default_config={"max_new_tokens": 50},
    model_url=model_url,
)
print(f"Model artifact created: {model_artifact}")

# Attach the prompt template and its legend to the model as an LLM prompt artifact.
# This prompt artifact (not the bare model) is what the serving graph uses later.
llm_prompt_artifact = project.log_llm_prompt(
    "llm_artifact",
    model_artifact=model_artifact,
    prompt_template=PROMPT_TEMPLATE,
    prompt_legend=PROMPT_LEGEND,
    description="remote_model_huggingface-llm-prompt",
)
print(f"LLM Prompt artifact created: {llm_prompt_artifact}")
Model artifact created: {'spec': {'model_file': '', 'framework': '', 'db_key': 'invoke_model', 'producer': {'kind': 'project', 'name': 'hf-batch-step', 'tag': 'f81aaeca-68c0-4e92-9771-75cc4a5618cb', 'owner': 'admin'}, 'model_url': 'ds://huggingface_batch_step/TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'has_children': False, 'license': '', 'parameters': {'default_config': {'max_new_tokens': 50}}}, 'kind': 'model', 'status': {'state': 'created'}, 'metadata': {'key': 'invoke_model', 'tree': 'f81aaeca-68c0-4e92-9771-75cc4a5618cb', 'uid': '53b38fa55da1fb00014003a239cff7945b396a97', 'iter': 0, 'project': 'hf-batch-step'}}
LLM Prompt artifact created: {'spec': {'prompt_legend': {'question': {'field': 'question', 'description': None}, 'depth_level': {'field': 'depth_level', 'description': None}, 'persona': {'field': 'persona', 'description': None}, 'tone': {'field': 'tone', 'description': None}}, 'description': 'remote_model_huggingface-llm-prompt', 'db_key': 'llm_artifact', 'producer': {'kind': 'project', 'name': 'hf-batch-step', 'tag': 'd4c3aede-f993-457f-af73-585e15ade8bb', 'owner': 'admin'}, 'prompt_template': [{'role': 'user', 'content': '{question}. Explain {depth_level} as a {persona} in {tone} style.'}], 'parent_uri': 'store://models/hf-batch-step/invoke_model#0@f81aaeca-68c0-4e92-9771-75cc4a5618cb^53b38fa55da1fb00014003a239cff7945b396a97', 'format': 'json', 'has_children': False, 'license': '', 'target_path': 'v3io:///projects/hf-batch-step/artifacts/llm_artifact.json', 'size': 98}, 'kind': <ArtifactCategories.llm_prompt: 'llm-prompt'>, 'status': {'state': 'created'}, 'metadata': {'key': 'llm_artifact', 'tree': 'd4c3aede-f993-457f-af73-585e15ade8bb', 'uid': '8b8c1a3f22076cde971bfe21a518648514a001f5', 'hash': '24312969d4fde40522a147a1728bfe0fb5fb7755', 'iter': 0, 'project': 'hf-batch-step'}}
Create the serving function#
Now create the serving function. Read more about set_function.
# Define the serving function on top of the base image.
# The CPU-only PyTorch build is installed here to keep deployment fast;
# consider the GPU build instead when inference performance matters.
function = project.set_function(
    kind="serving",
    name="hugging-face-batch-step",
    image=image,
    requirements=[
        "--extra-index-url",
        "https://download.pytorch.org/whl/cpu",
        "torch==2.8.0+cpu",
        "transformers==4.56.2",
    ],
)
Set up the serving graph#
The flow topology is a full graph/DAG. In this example it uses the async engine, which is based on storey.transformations and an asynchronous event loop.
This notebook uses the ModelRunnerStep to run the model as a graph.
# Build the serving graph: batch -> model runner -> flatten -> respond.
graph = function.set_topology("flow", engine="async")

# Group incoming events into batches of at most 2 events. flush_after_seconds
# bounds how long a partial batch waits before it is emitted, so an odd
# trailing event is not held back indefinitely.
step = graph.to(
    "storey.Batch",
    "my_batching",
    max_events=2,
    flush_after_seconds=4,
    full_event=True,
)

# Run the LLM on each batch. The execution mechanism comes from the project
# configuration cell, so changing it there actually changes the deployed graph
# (it was previously hard-coded here and the setting was only printed).
model_runner_step = ModelRunnerStep(name="my_model_runner")
model_runner_step.add_model(
    model_class="mlrun.serving.states.LLModel",
    endpoint_name="my_endpoint",
    execution_mechanism=execution_mechanism,
    model_artifact=llm_prompt_artifact,
    result_path="output",
)
step = step.to(model_runner_step)

# Flatten the batch body back into individual events before responding
# (presumably one response per original request; confirm against storey.FlatMap).
step.to("storey.FlatMap", _fn="(event.body)", full_event=True).respond()
print(f"Serving graph configured with {execution_mechanism} execution mechanism")
Serving graph configured with dedicated_process execution mechanism
Deploy the function#
# Hugging Face models, larger ones in particular, may require extended resources.
# Run a single replica so the extended resources are not allocated to multiple replicas.
function.spec.replicas = 1
function.spec.resources = {
    "limits": {"cpu": "6", "memory": "20Gi"},
    "requests": {"cpu": "25m", "memory": "1Mi"},
}
function.spec.readiness_timeout = 600

# A graph with a batch step must have AsyncSpec enabled on its HTTP trigger
function.with_http(
    workers=None,
    worker_timeout=500,
    gateway_timeout=600,
    async_spec=AsyncSpec(),
)
> 2026-03-05 16:29:54,751 [warning] Adding HTTP trigger despite the default HTTP trigger creation being disabled
# Deploy the function
# The deploy log shows a processor image being built before the function
# comes up, so this call can take several minutes.
print("Deploying function...")
function.deploy()
print("Function deployed successfully!")
Deploying function...
> 2026-03-05 16:29:54,774 [info] Starting remote function deploy
2026-03-05 16:29:55 (info) Deploying function
2026-03-05 16:29:55 (info) Building
2026-03-05 16:29:55 (info) Staging files and preparing base images
2026-03-05 16:29:55 (warn) Using user provided base image, runtime interpreter version is provided by the base image
2026-03-05 16:29:55 (info) Building processor image
2026-03-05 16:32:10 (info) Build complete
2026-03-05 16:33:00 (info) Function deploy complete
> 2026-03-05 16:33:06,560 [info] Model endpoint creation task completed with state succeeded
> 2026-03-05 16:33:06,560 [info] Successfully deployed function: {"external_invocation_urls":["hf-batch-step-hugging-face-batch-step.default-tenant.app.vmdev25.lab.iguazeng.com/"],"internal_invocation_urls":["nuclio-hf-batch-step-hugging-face-batch-step.default-tenant.svc.cluster.local:8080"]}
Function deployed successfully!
Test the model inference#
# Test the model with the input data
def send_event(event, delay):
    """Invoke the deployed serving function with a single event.

    Relies on the notebook-level ``json`` import to serialize the event.

    :param event: one input record (a dict from BATCH_INPUT_DATA)
    :param delay: seconds to wait before sending, used to stagger the requests
    :return: the response returned by ``function.invoke``
    """
    sleep(delay)
    return function.invoke(
        f"v2/models/{mlrun_model_name}/infer",
        json.dumps(event),
    )


# Send all events concurrently (one worker per event) so that the batch step
# receives several requests to group; each request starts 0.2s after the previous one.
with ThreadPoolExecutor(max_workers=len(BATCH_INPUT_DATA)) as executor:
    futures = [
        executor.submit(send_event, event, i * 0.2)
        for i, event in enumerate(BATCH_INPUT_DATA)
    ]
    # Collect results in submission order so they line up with EXPECTED_RESULTS
    batch_response = [future.result() for future in futures]

print("Responses received:")
for response, expected_result in zip(batch_response, EXPECTED_RESULTS):
    # The model output is stored under "output" (the result_path set on the model runner)
    answer = response["output"]["answer"]
    # Case-insensitive check that the expected keyword appears in the answer
    if expected_result in answer.lower():
        print(
            f'"{expected_result}" in the response of the question: "{response["question"]}"'
        )
    else:
        print(
            f'The response to the question "{response["question"]}" did not match the expected result.'
        )
Responses received:
"paris" in the response of the question: "What is the capital of France? Answer with one word first, then provide a historical overview. Answer in detail with at least 200 words."
"jupiter" in the response of the question: "What is the largest planet in our solar system? First give a one-word answer, then provide a detailed explanation in at least 200 words."
"shakespeare" in the response of the question: "Who wrote Hamlet? Answer shortly and then explain with details. Answer in detail with at least 200 words."
"blue" in the response of the question: "What color is the sky on a clear day? Answer shortly and then Answer in detail with at least 200 words."
"earth" in the response of the question: "What planet do we live on? Answer shortly and then explain with details. Answer in detail with at least 200 words."