# Source code for stoneforge.machine_learning.classification.fit

import numpy as np
import numpy.typing as npt
import pickle
import json
import os
from typing import Annotated

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Registry of the supported classifiers.
# Each key is the public ``method`` name; each value is a pair
# ``(estimator class, parameter grid)``. The grid is handed to GridSearchCV
# when grid search is requested; an empty grid leaves the estimator at its
# default hyperparameters.
# Grid values by Jose Augusto Victorino Dias empirical expertise
MODELS = {
    "gaussian_naive_bayes": (GaussianNB, {}),
    "decision_tree_classifier": (DecisionTreeClassifier, {
        "criterion": ['gini', 'entropy'],
        "max_depth": [5, 10, 15, 30, 50, 70, 100],
    }),
    "support_vector_machine": (SVC, {}),
    "logistic_regression": (LogisticRegression, {
        # Single value: pins the seed instead of tuning it.
        "random_state": [99]
    }),
    "k_neighbors_classifier": (KNeighborsClassifier, {
        # Odd neighbor counts 3, 5, ..., 59.
        "n_neighbors": np.arange(3, 61, 2),
        "weights": ['uniform', 'distance'],
        # Integer values 1..5 for the estimator's ``p`` parameter.
        "p": np.arange(1, 6)
    }),
    "random_forest_classifier": (RandomForestClassifier, {
        "criterion": ['gini', 'entropy'],
        "max_depth": [5, 10, 15, 30, 50, 70, 100],
        # Single value: pins the seed instead of tuning it.
        "random_state": [99]
    })
}

def _saves(
    info: Annotated[object, "Picklable object: model, preprocessor or settings"],
    filepath: Annotated[str, "Path to the directory"],
    name: Annotated[str, "File name"],
    suffix: Annotated[str, "Suffix of the file to be saved"] = "_fit_property.pkl",
    sz: Annotated[bool, "Serialized option"] = False) -> "bytes | None":
    """Pickle an object to ``<filepath>/<name><suffix>`` or return it as bytes.

    Parameters
    ----------
    info : object
        The object to pickle (a fitted model, a preprocessor, a settings
        dictionary, ...).
    filepath : str
        Directory in which the file is written. It is created when it does
        not exist yet. An empty string writes relative to the current
        working directory.
    name : str
        The name of the file, without suffix/extension.
    suffix : str, optional
        The suffix appended to ``name``. Defaults to "_fit_property.pkl".
    sz : bool, optional
        If True, nothing is written to disk and the pickled bytes are
        returned instead. Defaults to False, which saves the file.

    Returns
    -------
    bytes or None
        The pickled object when ``sz`` is True, otherwise None.

    Warnings
    --------
    Don't change the suffix parameter unless you know what you are doing. The other functions recognize this suffix.
    """
    # Serialization-only mode never touches the filesystem, so the path
    # arguments are not even inspected.
    if sz:
        return pickle.dumps(info)

    # os.makedirs("") raises, and an empty filepath already means
    # "current working directory", so only create non-empty paths.
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    full_path = os.path.join(filepath, name + suffix)
    with open(full_path, "wb") as f:
        pickle.dump(info, f)
    return None

def _load_settings(
    filepath: Annotated[str, "Path to the directory"],
    method: Annotated[str, "machine learning method"] = "gaussian_naive_bayes") -> dict:
    """Load the JSON settings file of a machine learning method.

    The file read is ``<filepath>/<method>_settings.json``.

    Parameters
    ----------
    filepath : str
        The path to the directory where the settings file is located.
    method : str, optional
        The name of the machine learning method for which settings are to be
        loaded. Defaults to "gaussian_naive_bayes".

    Returns
    -------
    dict
        The decoded content of the settings file.

    Raises
    ------
    FileNotFoundError
        If the settings file does not exist in ``filepath``.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    settings_path = os.path.join(filepath, f"{method}_settings.json")
    # Read as UTF-8 explicitly so decoding does not depend on the
    # platform's locale encoding.
    with open(settings_path, encoding="utf-8") as f:
        return json.load(f)

def _train_model(
    X: Annotated[np.ndarray, "X feature data"],
    y: Annotated[np.ndarray, "y target data"],
    method: Annotated[str, "Machine learning method"],
    filepath: Annotated[str, "Directory used for the settings file and the saved model"],
    gs: Annotated[bool, "Grid search for hyperparameter tuning"] = False,
    settings: Annotated[object, "Pickled bytes or dict of model settings; falsy loads them from file"] = False,
    **kwargs):
    """Internal function that trains a model of the given method and saves or returns it.

    The hyperparameters come from exactly one source, checked in this order:

    1. ``gs`` is True: the best parameters found by a grid search over the
       method's parameter grid in ``MODELS`` (``settings`` is ignored).
    2. ``settings`` is falsy: ``<filepath>/<method>_settings.json`` when
       ``filepath`` is given, otherwise the estimator defaults.
    3. ``settings`` is given: the pickled bytes are deserialized, or the
       mapping is used as-is.

    Parameters
    ----------
    X : np.ndarray
        Feature data for training the model.
    y : np.ndarray
        Target data for training the model.
    method : str
        The machine learning method to be used for training. Must be one of
        the keys of ``MODELS``.
    filepath : str
        Directory where the settings file is read from and where the trained
        model is saved. If falsy, the model is returned as a serialized
        object instead of being saved.
    gs : bool, optional
        If True, performs grid search for hyperparameter tuning. Defaults to False.
    settings : bytes or dict, optional
        Hyperparameters for the model, either pickled (bytes) or as a plain
        dictionary. If falsy, settings are loaded from file when ``filepath``
        is given. Defaults to False.
    **kwargs : dict
        Additional keyword arguments to be passed to the model's fit method.
        They are not forwarded to the grid search.

    Returns
    -------
    bytes or None
        If `filepath` is falsy, returns the pickled model as a byte string.
        Otherwise, saves the model to the specified directory and returns None.

    Raises
    ------
    ValueError
        If `method` is not supported.
    FileNotFoundError
        If settings have to be loaded from file and the settings file is missing.

    Warnings
    --------
    If `gs` is True, the model will be trained using grid search for hyperparameter.
    Filenames are standardized to include the method name and a suffix "_fit_property.pkl" for consistency.
    """
    if method not in MODELS:
        raise ValueError(f"Method '{method}' is not supported. Available methods: {list(MODELS.keys())}")

    ModelClass, param_grid = MODELS[method]

    if gs:
        # Hyperparameter tuning: only the winning parameters are kept; the
        # final model is refit below so that **kwargs reach its fit call.
        search = GridSearchCV(ModelClass(), param_grid, scoring='accuracy')
        search.fit(X, y)
        params = search.best_params_
    elif not settings:
        # No explicit settings: fall back to the settings file, or to the
        # estimator defaults when there is no directory to read from.
        params = _load_settings(filepath, method) if filepath else {}
    elif isinstance(settings, (bytes, bytearray, memoryview)):
        # NOTE(security): pickle.loads can execute arbitrary code; only pass
        # serialized settings that come from a trusted source.
        params = pickle.loads(settings)
    else:
        # Already-deserialized settings (e.g. a plain dict) are used directly.
        params = dict(settings)

    model = ModelClass(**params)
    model.fit(X, y, **kwargs)

    if not filepath:
        return pickle.dumps(model)

    _saves(model, filepath, method)
    return None


# [docs]
def fit(
    X: Annotated[np.ndarray, "X feature data"],
    y: Annotated[np.ndarray, "y target data"],
    method: Annotated[str, "Machine learning method"] = 'gaussian_naive_bayes',
    filepath: Annotated[str, "Directory where the preprocessors and the model will be saved"] = ".",
    gs: Annotated[bool, "Grid search for hyperparameter tuning"] = False,
    settings: Annotated[object, "Serialized settings for the model, if not provided will load from file"] = False,
    **kwargs):
    """Fits a machine learning model to the provided data and saves the model to a file :footcite:t:`scikit-learn, dias2023`.

    The features are passed through a ``MinMaxScaler`` followed by a
    ``StandardScaler`` and the targets through a ``LabelEncoder`` before the
    model is trained. When ``filepath`` is given, the three fitted
    preprocessors are pickled next to the model as ``<method>_scaler.pkl``,
    ``<method>_scalerp.pkl`` and ``<method>_y_encoded.pkl``.

    Parameters
    ----------
    X : np.ndarray
        Feature data for training the model.
    y : np.ndarray
        Target data for training the model.
    method : str
        The machine learning method to be used for training. Should be one of the following:
            - 'gaussian_naive_bayes'
            - 'decision_tree_classifier'
            - 'support_vector_machine'
            - 'logistic_regression'
            - 'k_neighbors_classifier'
            - 'random_forest_classifier'

        The special value 'scalers' trains no model and only returns the
        fitted preprocessors (see Returns).
    filepath : str
        The directory where the preprocessors and the trained model will be
        saved. Defaults to the current directory. If empty, nothing is
        written and the model is returned as a serialized object.
    gs : bool, optional
        If True, performs grid search for hyperparameter tuning. Defaults to False.
    settings : bytes, optional
        Serialized (pickled) hyperparameters for the model. If falsy and
        `gs` is False, they are loaded from ``<method>_settings.json`` in
        `filepath` when `filepath` is given. Defaults to False.
    **kwargs : dict
        Additional keyword arguments to be passed to the model's fit method.

    Returns
    -------
    bytes, tuple of bytes or None
        If `method` is 'scalers', a tuple with the pickled min-max scaler,
        standard scaler and label encoder. Otherwise, if `filepath` is empty,
        the pickled model as a byte string. Otherwise the model is saved to
        `filepath` and None is returned.

    Raises
    ------
    ValueError
        If `method` is not supported. Raised before anything is written to disk.
    FileNotFoundError
        If `gs` is False, no `settings` are given and the settings file is
        missing from `filepath`.

    Examples
    --------
    >>> X_fit = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y_fit = np.array([0, 1, 0, 1])
    >>> serialized_model = fit(X_fit, y_fit, method="gaussian_naive_bayes", filepath="")
    >>> isinstance(serialized_model, bytes)
    True

    Passing a directory, e.g. ``filepath="./nb_project"``, saves the
    preprocessors and the model there instead. Grid search (``gs=True``)
    cross-validates and therefore needs more samples than this toy data set.

    Warnings
    --------
    If `gs` is True, the model will be trained using grid search for hyperparameter.
    Filenames are standardized to include the method name and a suffix "_fit_property.pkl" for consistency.
    """
    # Fail fast: reject an unknown method before any preprocessor is fitted
    # or written to disk, so a typo does not leave stray files behind.
    if method != "scalers" and method not in MODELS:
        raise ValueError(f"Method '{method}' is not supported. Available methods: {list(MODELS.keys())}")

    # Normalize features: min-max scaling first, then standardization.
    # Both fitted scalers are kept because both are saved/returned below.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    scalerp = StandardScaler()
    X_norm = scalerp.fit_transform(X_scaled)

    # Encode the class labels as integers.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Optionally return just the preprocessors
    if method == "scalers":
        return (pickle.dumps(scaler),
                pickle.dumps(scalerp),
                pickle.dumps(le))

    # Save preprocessors
    if filepath:
        _saves(scaler, filepath, f"{method}_scaler", suffix=".pkl")
        _saves(scalerp, filepath, f"{method}_scalerp", suffix=".pkl")
        _saves(le, filepath, f"{method}_y_encoded", suffix=".pkl")

    # Train model
    return _train_model(X_norm, y_encoded, method, filepath, gs, settings, **kwargs)



#XGBOOST
#def xgboost(X: npt.ArrayLike, y: npt.ArrayLike, path, gs, settings, **kwargs) -> np.ndarray:
#
#    method = "x_g_boost_classifier"
#
#    if gs:
#        parameters =  {'n_estimators': [100],
#        'learning_rate': [0.5],
#        'max_depth':[5,10,15,30,50,70,100],
#        'random_state':[99]}
#
#        xgb = XGBClassifier()
#
#        bestxgb = GridSearchCV(xgb,parameters,scoring='accuracy')
#        bestxgb.fit(X,y)
#        settings = bestxgb.best_params_
#
#    if not gs:
#        if path:
#            settings = load_settings(path, method)
#        else:
#            settings = pickle.loads(settings)
#
#    settings['random_state'] = 99
#    xg = XGBClassifier(**settings)
#    xg.fit(X, y, **kwargs)
#
#    if not path:
#        serialized_model = pickle.dumps(xg)
#        return serialized_model
#    else:
#        saves(xg, path, method)