# Source code for stoneforge.machine_learning.classification.fit

import numpy as np
import numpy.typing as npt
import pickle
import json
import os
from typing import Annotated

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Registry of supported classifiers.
# Each method name maps to a pair: (estimator class, GridSearchCV parameter grid).
# An empty grid means grid search only evaluates the estimator's defaults.
# Grid values by Jose Augusto Victorino Dias empirical expertise.
MODELS = dict(
    gaussian_naive_bayes=(GaussianNB, {}),
    decision_tree_classifier=(
        DecisionTreeClassifier,
        dict(
            criterion=['gini', 'entropy'],
            max_depth=[5, 10, 15, 30, 50, 70, 100],
        ),
    ),
    support_vector_machine=(SVC, {}),
    logistic_regression=(
        LogisticRegression,
        dict(random_state=[99]),
    ),
    k_neighbors_classifier=(
        KNeighborsClassifier,
        dict(
            n_neighbors=np.arange(3, 61, 2),  # odd neighbour counts 3..59
            weights=['uniform', 'distance'],
            p=np.arange(1, 6),  # Minkowski power parameter 1..5
        ),
    ),
    random_forest_classifier=(
        RandomForestClassifier,
        dict(
            criterion=['gini', 'entropy'],
            max_depth=[5, 10, 15, 30, 50, 70, 100],
            random_state=[99],
        ),
    ),
)

def _saves(
    info : Annotated[object, "Object to pickle (model, preprocessor or settings)"],
    filepath : Annotated[str, "Directory where the file is written"],
    name : Annotated[str, "File name"],
    suffix : Annotated[str, "Suffix of the file to be saved"] = "_fit_property.pkl",
    sz : Annotated[bool, "Serialized option"] = False) -> "bytes | None":
    """Pickle ``info`` to ``<filepath>/<name><suffix>`` or return it as bytes.

    Parameters
    ----------
    info : object
        The picklable object to store (fitted model, scaler, encoder, settings dict...).
    filepath : str
        Directory where the file will be written. It is created when missing.
        Ignored when ``sz`` is True.
    name : str
        Base name of the file, without suffix.
    suffix : str, optional
        Text appended to ``name`` to build the file name. Defaults to "_fit_property.pkl".
    sz : bool, optional
        If True, nothing is written and the pickled bytes are returned instead.
        Defaults to False, which writes the file.

    Returns
    -------
    bytes or None
        The pickled representation of ``info`` when ``sz`` is True, otherwise None.

    Warnings
    --------
    Don't change the suffix parameter unless you know what you are doing. The other functions recognize this suffix.
    """
    # Serialize-only mode never touches the filesystem, so no path is needed.
    if sz:
        return pickle.dumps(info)

    # Create the target directory on demand; an empty path means "current directory".
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    full_path = os.path.join(filepath, name + suffix)
    with open(full_path, "wb") as f:
        pickle.dump(info, f)
    return None

def _load_settings(
    filepath : Annotated [str, "Path to the file"],
    method : Annotated [str, "machine learning method"] = "gaussian_naive_bayes") -> dict:
    """Read the JSON settings file of a machine learning method.

    The file is expected at ``<filepath>/<method>_settings.json``.

    Parameters
    ----------
    filepath : str
        Directory that contains the settings file.
    method : str, optional
        Method name used as the file name prefix. Defaults to "gaussian_naive_bayes".

    Returns
    -------
    dict
        The decoded content of the JSON settings file.
    """
    settings_file = os.path.join(filepath, method + "_settings.json")
    with open(settings_file) as handle:
        loaded = json.load(handle)
    return loaded

def _train_model(
    X : Annotated[np.array, "X feature data"],
    y : Annotated[np.array, "y target data"],
    method : Annotated[str, "Machine learning method"],
    filepath : Annotated[str, "Directory where the model will be saved"],
    gs : Annotated[bool, "Grid search for hyperparameter tuning"] = False,
    settings : Annotated[object, "Settings dict, pickled settings dict, or False to load from file"] = False,
    **kwargs):
    """Internal function that trains a classifier with the specified method and saves or returns it.

    Parameters
    ----------
    X : np.array
        Feature data for training the model.
    y : np.array
        Target data for training the model.
    method : str
        The machine learning method to be used for training. Must be a key of ``MODELS``:

            - 'gaussian_naive_bayes'
            - 'decision_tree_classifier'
            - 'support_vector_machine'
            - 'logistic_regression'
            - 'k_neighbors_classifier'
            - 'random_forest_classifier'
    filepath : str
        Directory where the trained model is saved. If empty/falsy, the model is
        returned as a serialized object instead.
    gs : bool, optional
        If True, hyperparameters are selected with ``GridSearchCV`` (accuracy scoring)
        over the grid registered in ``MODELS`` and ``settings`` is ignored. Defaults to False.
    settings : dict, bytes or False, optional
        Constructor arguments for the estimator, given either as a dictionary or as
        a pickled dictionary. If falsy, settings are read from
        ``<filepath>/<method>_settings.json``; when that file is missing (or
        ``filepath`` is empty) the estimator defaults are used.
    **kwargs : dict
        Additional keyword arguments to be passed to the model's fit method.

    Returns
    -------
    bytes or None
        If `filepath` is not provided, returns the pickled model as a byte string.
        Otherwise, saves the model to the specified directory and returns None.

    Raises
    ------
    ValueError
        If `method` is not supported.

    Warnings
    --------
    If `gs` is True, the model will be trained using grid search for hyperparameter.
    Filenames are standardized to include the method name and a suffix "_fit_property.pkl" for consistency.
    """
    import warnings

    if method not in MODELS:
        raise ValueError(f"Method '{method}' is not supported. Available methods: {list(MODELS.keys())}")

    ModelClass, param_grid = MODELS[method]

    if gs:
        # Hyperparameter tuning: keep only the winning parameters; the final
        # model is refitted below so that **kwargs reach its fit call.
        search = GridSearchCV(ModelClass(), param_grid, scoring='accuracy')
        search.fit(X, y)
        settings = search.best_params_
    elif not settings:
        # No explicit settings: use the settings file if available, otherwise defaults.
        settings = {}
        if filepath:
            try:
                settings = _load_settings(filepath, method)
            except FileNotFoundError:
                warnings.warn(
                    f"No settings file found for '{method}' in '{filepath}'; "
                    "using the estimator default parameters.",
                    stacklevel=2,
                )
    elif isinstance(settings, dict):
        # Already a plain mapping of constructor arguments; copy to avoid aliasing.
        settings = dict(settings)
    else:
        # NOTE(security): pickle.loads can execute arbitrary code; only pass
        # settings bytes that come from a trusted source.
        settings = pickle.loads(settings)

    model = ModelClass(**settings)
    model.fit(X, y, **kwargs)

    if not filepath:
        return pickle.dumps(model)

    _saves(model, filepath, method)
    return None

def fit(
    X : Annotated[np.array, "X feature data"],
    y : Annotated[np.array, "y target data"],
    method : Annotated[str, "Machine learning method"] = 'gaussian_naive_bayes',
    filepath : Annotated[str, "Path to the file where the model will be saved"] = ".",
    gs : Annotated[bool, "Grid search for hyperparameter tuning"] = False,
    settings : Annotated[object, "Serialized settings for the model, if not provided will load from file"] = False,
    **kwargs):
    """Fits a machine learning model to the provided data and saves the model to a file :footcite:t:`scikit-learn, dias2023`.

    The features are rescaled with a ``MinMaxScaler`` followed by a
    ``StandardScaler`` and the targets are encoded with a ``LabelEncoder``
    before the classifier is trained.

    Parameters
    ----------
    X : np.array
        Feature data for training the model.
    y : np.array
        Target data for training the model.
    method : str
        The machine learning method to be used for training. Should be one of the following:

            - 'gaussian_naive_bayes'
            - 'decision_tree_classifier'
            - 'support_vector_machine'
            - 'logistic_regression'
            - 'k_neighbors_classifier'
            - 'random_forest_classifier'

        The special value 'scalers' trains no classifier and only returns the
        fitted preprocessors (see Returns).
    filepath : str
        The path where the preprocessors and the trained model will be saved.
        If empty, nothing is written and the model is returned as a serialized object.
    gs : bool, optional
        If True, performs grid search for hyperparameter tuning. Defaults to False.
    settings : bytes or False, optional
        Serialized (pickled) settings for the model. If False, settings are loaded
        from the settings file found in `filepath`.
    **kwargs : dict
        Additional keyword arguments to be passed to the model's fit method.

    Returns
    -------
    bytes, tuple of bytes or None
        If `method` is 'scalers', a tuple with the pickled ``MinMaxScaler``,
        ``StandardScaler`` and ``LabelEncoder``. If `filepath` is not provided,
        the serialized model as a byte string. Otherwise the model is saved and
        None is returned.

    Raises
    ------
    ValueError
        If the `method` is not supported.

    Examples
    --------
    Return the trained model as bytes without touching the disk:

    >>> X_fit = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y_fit = np.array([0, 1, 0, 1])
    >>> model_bytes = fit(X_fit, y_fit, method="gaussian_naive_bayes", filepath="")
    >>> isinstance(model_bytes, bytes)
    True

    Tune with grid search (needs enough samples per class for the default
    5-fold cross-validation) and save into a project directory:

    >>> X_gs = np.arange(20).reshape(10, 2)
    >>> y_gs = np.array([0, 1] * 5)
    >>> fit(X_gs, y_gs, method="gaussian_naive_bayes", filepath="./nb_project", gs=True)  # doctest: +SKIP

    Warnings
    --------
    If `gs` is True, the model will be trained using grid search for hyperparameter.
    Filenames are standardized to include the method name and a suffix "_fit_property.pkl" for consistency.
    """
    # Fail fast so that no preprocessor file is written for an unsupported method.
    if method != "scalers" and method not in MODELS:
        raise ValueError(f"Method '{method}' is not supported. Available methods: {list(MODELS.keys())}")

    # Normalize features: min-max scaling first, then standardization of the scaled data.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    scalerp = StandardScaler()
    X_norm = scalerp.fit_transform(X_scaled)

    # Encode the class labels as integers.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Optionally return just the preprocessors
    if method == "scalers":
        return (pickle.dumps(scaler), pickle.dumps(scalerp), pickle.dumps(le))

    # Save preprocessors next to the model under method-specific file names.
    if filepath:
        _saves(scaler, filepath, f"{method}_scaler", suffix=".pkl")
        _saves(scalerp, filepath, f"{method}_scalerp", suffix=".pkl")
        _saves(le, filepath, f"{method}_y_encoded", suffix=".pkl")

    # Train model
    return _train_model(X_norm, y_encoded, method, filepath, gs, settings, **kwargs)
# XGBOOST
# def xgboost(X: npt.ArrayLike, y: npt.ArrayLike, path, gs, settings, **kwargs) -> np.ndarray:
#
#     method = "x_g_boost_classifier"
#
#     if gs:
#         parameters = {'n_estimators': [100],
#                       'learning_rate': [0.5],
#                       'max_depth':[5,10,15,30,50,70,100],
#                       'random_state':[99]}
#
#         xgb = XGBClassifier()
#
#         bestxgb = GridSearchCV(xgb,parameters,scoring='accuracy')
#         bestxgb.fit(X,y)
#         settings = bestxgb.best_params_
#
#     if not gs:
#         if path:
#             settings = load_settings(path, method)
#         else:
#             settings = pickle.loads(settings)
#
#     settings['random_state'] = 99
#     xg = XGBClassifier(**settings)
#     xg.fit(X, y, **kwargs)
#
#     if not path:
#         serialized_model = pickle.dumps(xg)
#         return serialized_model
#     else:
#         saves(xg, path, method)