from sklearn import ensemble
from sklearn.model_selection import train_test_split
import mlrun
from mlrun.frameworks.sklearn import apply_mlrun
def train(
dataset: mlrun.DataItem,
label_column: str = "label",
n_estimators: int = 100,
learning_rate: float = 0.1,
max_depth: int = 3,
model_name: str = "cancer_classifier",
):
# Initialize our dataframes
df = dataset.as_df()
X = df.drop(label_column, axis=1)
y = df[label_column]
# Train/Test split Iris data-set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Pick an ideal ML model
model = ensemble.GradientBoostingClassifier(
n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth
)
# Wrap our model with Mlrun features, specify the test dataset for analysis and accuracy measurements
apply_mlrun(model=model, model_name=model_name, x_test=X_test, y_test=y_test)
# Train our model
model.fit(X_train, y_train)