# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
from typing import Any, Dict, List, Union
import lightgbm as lgb
import mlrun
from .._ml_common import MLArtifactsLibrary, MLPlan
from ..sklearn import Metric, MetricsLibrary
from .mlrun_interfaces import (
LGBMBoosterMLRunInterface,
LGBMMLRunInterface,
LGBMModelMLRunInterface,
)
from .model_handler import LGBMModelHandler
from .model_server import LGBMModelServer
from .utils import LGBMTypes, LGBMUtils
# Placeholder alias: as the SciKit-Learn API is commonly used among all ML frameworks, the common ML artifacts library
# (`MLArtifactsLibrary` from `.._ml_common`) is exposed here under a LightGBM specific name:
LGBMArtifactsLibrary = MLArtifactsLibrary
def _apply_mlrun_on_module(
    model_name: str = "model",
    tag: str = "",
    modules_map: Union[Dict[str, Union[None, str, List[str]]], str] = None,
    custom_objects_map: Union[Dict[str, Union[str, List[str]]], str] = None,
    custom_objects_directory: str = None,
    context: mlrun.MLClientCtx = None,
    model_format: str = LGBMModelHandler.ModelFormats.PKL,
    sample_set: Union[LGBMTypes.DatasetType, mlrun.DataItem, str] = None,
    y_columns: Union[List[str], List[int]] = None,
    feature_vector: str = None,
    feature_weights: List[float] = None,
    labels: Dict[str, Union[str, int, float]] = None,
    parameters: Dict[str, Union[str, int, float]] = None,
    extra_data: Dict[str, LGBMTypes.ExtraDataType] = None,
    auto_log: bool = True,
    mlrun_logging_callback_kwargs: Dict[str, Any] = None,
):
    """
    Apply MLRun's interface on the LightGBM module itself (no specific model is involved).

    The interface is always added to the imported `lightgbm` module. Logging is configured on the module only when
    `auto_log` is enabled.

    :param model_name:                    Forwarded in the model handler keyword arguments as "model_name".
    :param tag:                           Forwarded in the log model keyword arguments as "tag".
    :param modules_map:                   Forwarded in the model handler keyword arguments as "modules_map".
    :param custom_objects_map:            Forwarded in the model handler keyword arguments as "custom_objects_map".
    :param custom_objects_directory:      Forwarded in the model handler keyword arguments as
                                          "custom_objects_directory".
    :param context:                       MLRun context, passed as is to the logging configuration.
    :param model_format:                  Forwarded in the model handler keyword arguments as "model_format".
    :param sample_set:                    Forwarded in the log model keyword arguments as "sample_set".
    :param y_columns:                     Forwarded in the log model keyword arguments as "target_columns".
    :param feature_vector:                Forwarded in the log model keyword arguments as "feature_vector".
    :param feature_weights:               Forwarded in the log model keyword arguments as "feature_weights".
    :param labels:                        Forwarded in the log model keyword arguments as "labels".
    :param parameters:                    Forwarded in the log model keyword arguments as "parameters".
    :param extra_data:                    Forwarded in the log model keyword arguments as "extra_data".
    :param auto_log:                      Whether to configure the logging on the module. When False, only the
                                          interface is added.
    :param mlrun_logging_callback_kwargs: Passed as is to the logging configuration.
    """
    # The module always gets MLRun's interface, regardless of the logging settings:
    LGBMMLRunInterface.add_interface(obj=lgb)

    # Without automatic logging there is nothing further to configure:
    if not auto_log:
        return

    # Arguments intended for the model handler:
    handler_settings = {
        "model_name": model_name,
        "modules_map": modules_map,
        "custom_objects_map": custom_objects_map,
        "custom_objects_directory": custom_objects_directory,
        "model_format": model_format,
    }

    # Arguments intended for logging the model (notice 'y_columns' is passed under the name "target_columns"):
    logging_settings = {
        "tag": tag,
        "sample_set": sample_set,
        "target_columns": y_columns,
        "feature_vector": feature_vector,
        "feature_weights": feature_weights,
        "labels": labels,
        "parameters": parameters,
        "extra_data": extra_data,
    }

    # `configure_logging` is not part of LightGBM's own API - presumably it is attached by the interface added above:
    lgb.configure_logging(
        context=context,
        model_handler_kwargs=handler_settings,
        log_model_kwargs=logging_settings,
        mlrun_logging_callback_kwargs=mlrun_logging_callback_kwargs,
    )
def _apply_mlrun_on_model(
    model: LGBMTypes.ModelType = None,
    model_name: str = "model",
    tag: str = "",
    model_path: str = None,
    modules_map: Union[Dict[str, Union[None, str, List[str]]], str] = None,
    custom_objects_map: Union[Dict[str, Union[str, List[str]]], str] = None,
    custom_objects_directory: str = None,
    context: mlrun.MLClientCtx = None,
    model_format: str = LGBMModelHandler.ModelFormats.PKL,
    artifacts: Union[List[MLPlan], List[str], Dict[str, dict]] = None,
    metrics: Union[
        List[Metric],
        List[LGBMTypes.MetricEntryType],
        Dict[str, LGBMTypes.MetricEntryType],
    ] = None,
    x_test: LGBMTypes.DatasetType = None,
    y_test: LGBMTypes.DatasetType = None,
    sample_set: Union[LGBMTypes.DatasetType, mlrun.DataItem, str] = None,
    y_columns: Union[List[str], List[int]] = None,
    feature_vector: str = None,
    feature_weights: List[float] = None,
    labels: Dict[str, Union[str, int, float]] = None,
    parameters: Dict[str, Union[str, int, float]] = None,
    extra_data: Dict[str, LGBMTypes.ExtraDataType] = None,
    auto_log: bool = True,
    **kwargs
):
    """
    Apply MLRun's interface on a given (or loaded) LightGBM model and return its model handler.

    A `LGBMModelHandler` is created for the model and set with the given logging attributes. If the handler holds no
    model it is asked to load one. A `lgb.LGBMModel` gets the `LGBMModelMLRunInterface` and its logging is configured;
    any other model is treated as a `lgb.Booster` and gets the `LGBMBoosterMLRunInterface` with the handler attached.

    :param model:                    The model to wrap, passed to the model handler.
    :param model_name:               The model name, passed to the model handler.
    :param tag:                      The tag to set on the model handler.
    :param model_path:               The model path, passed to the model handler.
    :param modules_map:              The modules map, passed to the model handler.
    :param custom_objects_map:       The custom objects map, passed to the model handler.
    :param custom_objects_directory: The custom objects directory, passed to the model handler.
    :param context:                  MLRun context, passed to the model handler and (for a `lgb.LGBMModel`) to the
                                     plans, the metrics and the logging configuration.
    :param model_format:             The model format, passed to the model handler.
    :param artifacts:                The artifacts to turn into plans. Used only for a `lgb.LGBMModel`.
    :param metrics:                  The metrics to calculate. Used only for a `lgb.LGBMModel`.
    :param x_test:                   Passed to the model's logging configuration. Used only for a `lgb.LGBMModel`.
    :param y_test:                   Passed to the model's logging configuration and, as 'y', to the plans and metrics
                                     getters. Used only for a `lgb.LGBMModel`.
    :param sample_set:               If not None, set on the model handler.
    :param y_columns:                If not None, set on the model handler as its target columns.
    :param feature_vector:           If not None, set on the model handler.
    :param feature_weights:          If not None, set on the model handler.
    :param labels:                   If not None, added to the model handler's labels.
    :param parameters:               If not None, added to the model handler's parameters.
    :param extra_data:               If not None, added to the model handler's extra data.
    :param auto_log:                 Passed as 'include_default' to the plans and metrics getters.
    :param kwargs:                   Only the "model_handler_kwargs" key is read: a dictionary of additional keyword
                                     arguments for initializing the model handler. Other keys are not used here.

    :return: The model handler that was created for the model.
    """
    # Additional handler initialization arguments may be tunneled through `kwargs`:
    extra_handler_kwargs = kwargs.pop("model_handler_kwargs", {})
    handler = LGBMModelHandler(
        model_name=model_name,
        model_path=model_path,
        model=model,
        context=context,
        model_format=model_format,
        modules_map=modules_map,
        custom_objects_map=custom_objects_map,
        custom_objects_directory=custom_objects_directory,
        **extra_handler_kwargs,
    )

    # The tag is always set; every other logging attribute is set only if it was given. Each entry is
    # (handler setter name, setter keyword, value):
    handler.set_tag(tag=tag)
    optional_attributes = (
        ("set_sample_set", "sample_set", sample_set),
        ("set_target_columns", "target_columns", y_columns),
        ("set_feature_vector", "feature_vector", feature_vector),
        ("set_feature_weights", "feature_weights", feature_weights),
        ("set_labels", "to_add", labels),
        ("set_parameters", "to_add", parameters),
        ("set_extra_data", "to_add", extra_data),
    )
    for setter_name, keyword, value in optional_attributes:
        if value is not None:
            getattr(handler, setter_name)(**{keyword: value})

    # Make sure there is a model to wrap - load it when the handler holds none - and work with the handler's model:
    if handler.model is None:
        handler.load()
    model = handler.model

    # Anything that is not a `lgb.LGBMModel` is treated as a `lgb.Booster`:
    if not isinstance(model, lgb.LGBMModel):
        LGBMBoosterMLRunInterface.add_interface(obj=model)
        model.model_handler = handler
        return handler

    # A `lgb.LGBMModel` gets its own interface and a configured logger:
    LGBMModelMLRunInterface.add_interface(obj=model)
    plans = LGBMArtifactsLibrary.get_plans(
        artifacts=artifacts,
        context=context,
        include_default=auto_log,
        model=model,
        y=y_test,
    )
    metrics_to_calculate = MetricsLibrary.get_metrics(
        metrics=metrics,
        context=context,
        include_default=auto_log,
        model=model,
        y=y_test,
    )
    model.configure_logging(
        context=context,
        plans=plans,
        metrics=metrics_to_calculate,
        x_test=x_test,
        y_test=y_test,
        model_handler=handler,
    )
    return handler
def apply_mlrun(
    model: LGBMTypes.ModelType = None,
    model_name: str = "model",
    tag: str = "",
    model_path: str = None,
    modules_map: Union[Dict[str, Union[None, str, List[str]]], str] = None,
    custom_objects_map: Union[Dict[str, Union[str, List[str]]], str] = None,
    custom_objects_directory: str = None,
    context: mlrun.MLClientCtx = None,
    model_format: str = LGBMModelHandler.ModelFormats.PKL,
    artifacts: Union[List[MLPlan], List[str], Dict[str, dict]] = None,
    metrics: Union[
        List[Metric],
        List[LGBMTypes.MetricEntryType],
        Dict[str, LGBMTypes.MetricEntryType],
    ] = None,
    x_test: LGBMTypes.DatasetType = None,
    y_test: LGBMTypes.DatasetType = None,
    sample_set: Union[LGBMTypes.DatasetType, mlrun.DataItem, str] = None,
    y_columns: Union[List[str], List[int]] = None,
    feature_vector: str = None,
    feature_weights: List[float] = None,
    labels: Dict[str, Union[str, int, float]] = None,
    parameters: Dict[str, Union[str, int, float]] = None,
    extra_data: Dict[str, LGBMTypes.ExtraDataType] = None,
    auto_log: bool = True,
    mlrun_logging_callback_kwargs: Dict[str, Any] = None,
    **kwargs
) -> Union[LGBMModelHandler, None]:
    """
    Apply MLRun's interface on top of LightGBM by wrapping the module itself or the given model, providing both with
    MLRun's quality of life features.

    If neither 'model' nor 'model_path' is given, the interface is applied on the LightGBM module. In that case
    'model_path', 'artifacts', 'metrics', 'x_test', 'y_test' and the additional key word arguments are not used.
    Otherwise, the interface is applied on the model and 'mlrun_logging_callback_kwargs' is not used.

    :param model:                         The model to wrap. Can be loaded from the model path given as well.
    :param model_name:                    The model name to use for storing the model artifact. Default: "model".
    :param tag:                           The model's tag to log with.
    :param model_path:                    The model's store object path. Mandatory for evaluation (to know which model
                                          to update). If model is not provided, it will be loaded from this path.
    :param modules_map:                   A dictionary of all the modules required for loading the model. Each key is
                                          a path to a module and its value is the object name to import from it. All
                                          the modules will be imported globally. If multiple objects needed to be
                                          imported from the same module a list can be given. The map can be passed as
                                          a path to a json file as well. For example:

                                          .. code-block:: python

                                              {
                                                  "module1": None,  # import module1
                                                  "module2": ["func1", "func2"],  # from module2 import func1, func2
                                                  "module3.sub_module": "func3",  # from module3.sub_module import func3
                                              }

                                          If the model path given is of a store object, the modules map will be read
                                          from the logged modules map artifact of the model.
    :param custom_objects_map:            A dictionary of all the custom objects required for loading the model. Each
                                          key is a path to a python file and its value is the custom object name to
                                          import from it. If multiple objects needed to be imported from the same py
                                          file a list can be given. The map can be passed as a path to a json file as
                                          well. For example:

                                          .. code-block:: python

                                              {
                                                  "/.../custom_model.py": "MyModel",
                                                  "/.../custom_objects.py": ["object1", "object2"]
                                              }

                                          All the paths will be accessed from the given 'custom_objects_directory',
                                          meaning each py file will be read from 'custom_objects_directory/<MAP KEY>'.
                                          If the model path given is of a store object, the custom objects map will be
                                          read from the logged custom object map artifact of the model.
                                          Notice: The custom objects will be imported in the order they came in this
                                          dictionary (or json). If a custom object depends on another, make sure to
                                          put it below the one it relies on.
    :param custom_objects_directory:      Path to the directory with all the python files required for the custom
                                          objects. Can be passed as a zip file as well (will be extracted during the
                                          run before loading the model). If the model path given is of a store object,
                                          the custom objects files will be read from the logged custom object artifact
                                          of the model.
    :param context:                       MLRun context to work with. If no context is given it will be retrieved via
                                          'mlrun.get_or_create_ctx(LGBMMLRunInterface.DEFAULT_CONTEXT_NAME)'.
    :param artifacts:                     A list of artifacts plans to produce during the run.
    :param metrics:                       A list of metrics to calculate during the run.
    :param x_test:                        The validation data for producing and calculating artifacts and metrics post
                                          training. Without this, validation will not be performed.
    :param y_test:                        The test data ground truth for producing and calculating artifacts and
                                          metrics post training or post predict / predict_proba.
    :param sample_set:                    A sample set of inputs for the model for logging its stats along the model
                                          in favour of model monitoring.
    :param y_columns:                     List of names of all the columns in the ground truth labels in case its a
                                          pd.DataFrame or a list of integers in case the dataset is a np.ndarray. If
                                          not given but 'y_train' / 'y_test' is given then the labels / indices in it
                                          will be used by default.
    :param feature_vector:                Feature store feature vector uri
                                          (store://feature-vectors/<project>/<name>[:tag])
    :param feature_weights:               List of feature weights, one per input column.
    :param labels:                        Labels to log with the model.
    :param parameters:                    Parameters to log with the model.
    :param extra_data:                    Extra data to log with the model.
    :param auto_log:                      Whether to apply MLRun's auto logging on the model. Auto logging will add
                                          the default artifacts and metrics to the lists of artifacts and metrics.
                                          Default: True.
    :param mlrun_logging_callback_kwargs: Key word arguments for the MLRun callback. For further information see the
                                          documentation of the class 'MLRunLoggingCallback'. Note that 'context' is
                                          already given here.
    :param kwargs:                        Additional key word arguments, forwarded only when applying on a model. A
                                          "model_handler_kwargs" dictionary, if given, is passed on to the
                                          initialization of the model handler.

    :return: If a model was provided via `model` or `model_path` the model handler initialized with the provided model
             will be returned. Otherwise, None.
    """
    # Get the default context:
    if context is None:
        context = mlrun.get_or_create_ctx(LGBMMLRunInterface.DEFAULT_CONTEXT_NAME)

    # Without a model or a model path there is nothing to wrap but the LightGBM module itself:
    if model is None and model_path is None:
        _apply_mlrun_on_module(
            model_name=model_name,
            tag=tag,
            modules_map=modules_map,
            custom_objects_map=custom_objects_map,
            custom_objects_directory=custom_objects_directory,
            context=context,
            model_format=model_format,
            sample_set=sample_set,
            y_columns=y_columns,
            feature_vector=feature_vector,
            feature_weights=feature_weights,
            labels=labels,
            parameters=parameters,
            extra_data=extra_data,
            auto_log=auto_log,
            mlrun_logging_callback_kwargs=mlrun_logging_callback_kwargs,
        )
        # No model handler exists in this mode (explicit to match the declared return type):
        return None

    # A model or a model path was provided, so apply on the model and return its handler:
    return _apply_mlrun_on_model(
        model=model,
        model_name=model_name,
        tag=tag,
        model_path=model_path,
        modules_map=modules_map,
        custom_objects_map=custom_objects_map,
        custom_objects_directory=custom_objects_directory,
        context=context,
        model_format=model_format,
        artifacts=artifacts,
        metrics=metrics,
        x_test=x_test,
        y_test=y_test,
        sample_set=sample_set,
        y_columns=y_columns,
        feature_vector=feature_vector,
        feature_weights=feature_weights,
        labels=labels,
        parameters=parameters,
        extra_data=extra_data,
        auto_log=auto_log,
        **kwargs,
    )