Source code for mlrun.frameworks.sklearn

# Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# flake8: noqa  - this is until we take care of the F401 violations with respect to __all__ & sphinx
import warnings
from typing import Dict, List, Union

import mlrun
from mlrun.frameworks.sklearn.metric import Metric

from .._ml_common import MLArtifactsLibrary, MLPlan, PickleModelServer
from .metrics_library import MetricsLibrary
from .mlrun_interface import SKLearnMLRunInterface
from .model_handler import SKLearnModelHandler
from .utils import SKLearnTypes, SKLearnUtils

# Placeholders as the SciKit-Learn API is commonly used among all ML frameworks:
SKLearnModelServer = PickleModelServer
# The deprecated 'SklearnModelServer' alias is served by the module-level __getattr__ below,
# which emits a FutureWarning (defining the alias here would bypass that warning):
SKLearnArtifactsLibrary = MLArtifactsLibrary


def apply_mlrun(
    model: SKLearnTypes.ModelType = None,
    model_name: str = "model",
    tag: str = "",
    model_path: str = None,
    modules_map: Union[Dict[str, Union[None, str, List[str]]], str] = None,
    custom_objects_map: Union[Dict[str, Union[str, List[str]]], str] = None,
    custom_objects_directory: str = None,
    context: mlrun.MLClientCtx = None,
    artifacts: Union[List[MLPlan], List[str], Dict[str, dict]] = None,
    metrics: Union[
        List[Metric],
        List[SKLearnTypes.MetricEntryType],
        Dict[str, SKLearnTypes.MetricEntryType],
    ] = None,
    x_test: SKLearnTypes.DatasetType = None,
    y_test: SKLearnTypes.DatasetType = None,
    sample_set: Union[SKLearnTypes.DatasetType, mlrun.DataItem, str] = None,
    y_columns: Union[List[str], List[int]] = None,
    feature_vector: str = None,
    feature_weights: List[float] = None,
    labels: Dict[str, Union[str, int, float]] = None,
    parameters: Dict[str, Union[str, int, float]] = None,
    extra_data: Dict[str, SKLearnTypes.ExtraDataType] = None,
    auto_log: bool = True,
    **kwargs,
) -> SKLearnModelHandler:
    """
    Wrap the given model with MLRun's interface, providing it with MLRun's additional features.

    :param model:                    The model to wrap. Can be loaded from the model path given as well.
    :param model_name:               The model name to use for storing the model artifact. Default: "model".
    :param tag:                      The model's tag to log with.
    :param model_path:               The model's store object path. Mandatory for evaluation (to know which model to
                                     update). If model is not provided, it will be loaded from this path.
    :param modules_map:              A dictionary of all the modules required for loading the model. Each key is a
                                     path to a module and its value is the object name to import from it. All the
                                     modules will be imported globally. If multiple objects need to be imported from
                                     the same module, a list can be given. The map can be passed as a path to a json
                                     file as well. For example:

                                     .. code-block:: python

                                         {
                                             "module1": None,  # import module1
                                             "module2": ["func1", "func2"],  # from module2 import func1, func2
                                             "module3.sub_module": "func3",  # from module3.sub_module import func3
                                         }

                                     If the model path given is of a store object, the modules map will be read from
                                     the logged modules map artifact of the model.
    :param custom_objects_map:       A dictionary of all the custom objects required for loading the model. Each key
                                     is a path to a python file and its value is the custom object name to import
                                     from it. If multiple objects need to be imported from the same py file, a list
                                     can be given. The map can be passed as a path to a json file as well. For
                                     example:

                                     .. code-block:: python

                                         {
                                             "/.../custom_model.py": "MyModel",
                                             "/.../custom_objects.py": ["object1", "object2"],
                                         }

                                     All the paths will be accessed from the given 'custom_objects_directory', meaning
                                     each py file will be read from 'custom_objects_directory/<MAP VALUE>'. If the
                                     model path given is of a store object, the custom objects map will be read from
                                     the logged custom object map artifact of the model.

                                     Notice: The custom objects will be imported in the order they came in this
                                     dictionary (or json). If a custom object depends on another, make sure to put it
                                     below the one it relies on.
    :param custom_objects_directory: Path to the directory with all the python files required for the custom objects.
                                     Can be passed as a zip file as well (will be extracted during the run before
                                     loading the model). If the model path given is of a store object, the custom
                                     objects files will be read from the logged custom object artifact of the model.
    :param context:                  MLRun context to work with. If no context is given it will be retrieved via
                                     'mlrun.get_or_create_ctx(None)'.
    :param artifacts:                A list of artifact plans to produce during the run.
    :param metrics:                  A list of metrics to calculate during the run.
    :param x_test:                   The validation data for producing and calculating artifacts and metrics post
                                     training. Without this, validation will not be performed.
    :param y_test:                   The test data ground truth for producing and calculating artifacts and metrics
                                     post training or post predict / predict_proba.
    :param sample_set:               A sample set of inputs for the model for logging its stats along the model in
                                     favour of model monitoring. If not given, the 'x_train' will be used by default.
    :param y_columns:                List of names of all the columns in the ground truth labels in case it is a
                                     pd.DataFrame, or a list of integers in case the dataset is a np.ndarray. If not
                                     given but 'y_train' is given, the labels / indices in it will be used by default.
    :param feature_vector:           Feature store feature vector uri (store://feature-vectors/<project>/<name>[:tag])
    :param feature_weights:          List of feature weights, one per input column.
    :param labels:                   Labels to log with the model.
    :param parameters:               Parameters to log with the model.
    :param extra_data:               Extra data to log with the model.
    :param auto_log:                 Whether to apply MLRun's auto logging on the model. Auto logging will add the
                                     default artifacts and metrics to the lists of artifacts and metrics.
                                     Default: True.

    :return: The model handler initialized with the provided model.
    """
    # Get the default context:
    if context is None:
        context = mlrun.get_or_create_ctx(SKLearnMLRunInterface.DEFAULT_CONTEXT_NAME)

    # Create a model handler:
    model_handler_kwargs = (
        kwargs.pop("model_handler_kwargs") if "model_handler_kwargs" in kwargs else {}
    )
    handler = SKLearnModelHandler(
        model_name=model_name,
        model_path=model_path,
        model=model,
        context=context,
        modules_map=modules_map,
        custom_objects_map=custom_objects_map,
        custom_objects_directory=custom_objects_directory,
        **model_handler_kwargs,
    )

    # Load the model if it was not provided:
    if model is None:
        handler.load()
        model = handler.model

    # Set the handler's logging attributes:
    handler.set_tag(tag=tag)
    if sample_set is not None:
        handler.set_sample_set(sample_set=sample_set)
    if y_columns is not None:
        handler.set_target_columns(target_columns=y_columns)
    if feature_vector is not None:
        handler.set_feature_vector(feature_vector=feature_vector)
    if feature_weights is not None:
        handler.set_feature_weights(feature_weights=feature_weights)
    if labels is not None:
        handler.set_labels(to_add=labels)
    if parameters is not None:
        handler.set_parameters(to_add=parameters)
    if extra_data is not None:
        handler.set_extra_data(to_add=extra_data)

    # Add MLRun's interface to the model:
    SKLearnMLRunInterface.add_interface(obj=model)

    # Configure the logger:
    model.configure_logging(
        context=context,
        plans=SKLearnArtifactsLibrary.get_plans(
            artifacts=artifacts,
            context=context,
            include_default=auto_log,
            model=model,
            y=y_test,
        ),
        metrics=MetricsLibrary.get_metrics(
            metrics=metrics,
            context=context,
            include_default=auto_log,
            model=model,
            y=y_test,
        ),
        x_test=x_test,
        y_test=y_test,
        model_handler=handler,
    )

    return handler
def __getattr__(name):
    if name == "SklearnModelServer":
        warnings.warn(
            "'SklearnModelServer' was renamed to 'SKLearnModelServer'. "
            "'SklearnModelServer' is deprecated in 1.3.0 and will be removed in 1.5.0.",
            # TODO: remove in 1.5.0
            FutureWarning,
        )
        return SKLearnModelServer
    raise ImportError(f"cannot import name '{name}' from '{__name__}' ({__file__})")
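
For orientation, below is a minimal, hypothetical usage sketch of apply_mlrun inside an MLRun handler function. It is not part of the module above; the dataset layout, the "label" target column, the model choice, and the handler name are placeholder assumptions.

import mlrun
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from mlrun.frameworks.sklearn import apply_mlrun


def train(context: mlrun.MLClientCtx, dataset: mlrun.DataItem):
    # Placeholder data preparation - "label" is an assumed target column name:
    df = dataset.as_df()
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=["label"]), df["label"], test_size=0.2
    )

    model = RandomForestClassifier()

    # Wrap the model so fitting and predicting are auto-logged to the MLRun context:
    apply_mlrun(
        model=model,
        model_name="my_model",
        context=context,
        x_test=x_test,
        y_test=y_test,
    )

    # Training now produces the default artifacts and metrics on the context:
    model.fit(x_train, y_train)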