# Source code for AutoMxL.Modelisation.Bagging

""" Bagging algorithm class. Methods :

- Bagging (class) : generate new training more balanced and train model for each
- Bagging_sample (func) : generate bagging sample

"""
from sklearn.ensemble import RandomForestClassifier
from AutoMxL.Modelisation.Utils import *
import pandas as pd

"""
Default bagging parameters
"""
default_bagging_param = {'n_sample': 5,
                         'pos_sample_size': 1.0,
                         'replace': False}


class Bagging(object):
    """Bagging meta-algorithm for binary classification on imbalanced targets.

    ``fit`` draws ``n_sample`` training samples with ``create_sample`` (a chosen
    number of target=1 rows completed with up to 3 times as many target=0 rows),
    trains an independent copy of ``clf`` on each of them, and ``predict``
    averages the class-1 probabilities of those models.

    Parameters
    ----------
    clf : estimator (Default : None)
        Model fitted on each sample. It must expose ``fit`` and
        ``predict_proba`` (and ``feature_importances_`` once fitted, for
        ``bag_feature_importance``). When None, a
        ``RandomForestClassifier(n_estimators=100, max_leaf_nodes=100)``
        is created for this instance.
    n_sample : int (Default : 5)
        Number of samples, i.e. number of fitted models.
    pos_sample_size : int/float (Default : 1.0)
        Number/rate of target=1 observations in each sample
        - if int : number of target=1
        - otherwise : rate of the total number of target=1 in the training set
    replace : Boolean (Default : True)
        Enable sampling with replacement. Note that the module-level
        ``default_bagging_param`` dict uses False instead.

    Attributes
    ----------
    list_model : list
        Fitted models (filled by the ``fit`` method).
    is_fitted : Boolean
        True once ``fit`` has completed.
    """

    def __init__(self, clf=None, n_sample=5, pos_sample_size=1.0, replace=True):
        # Build the default estimator per instance instead of in the signature,
        # where a single object would be shared by every Bagging instance.
        if clf is None:
            clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=100)
        self.classifier = clf
        self.niter = n_sample
        self.pos_sample_size = pos_sample_size
        self.replace = replace
        self.list_model = list()
        self.is_fitted = False

    def _new_model(self):
        """Return an independent deep copy of the base classifier."""
        from copy import deepcopy

        return deepcopy(self.classifier)

    def _check_is_fitted(self):
        """Raise AssertionError when ``fit`` has not been called yet."""
        # Explicit raise rather than ``assert`` so the guard survives ``python -O``.
        if not self.is_fitted:
            raise AssertionError("Fit first !")

    def get_params(self):
        """Get bagging object parameters

        Returns
        -------
        dict
            {param : value}
        """
        return {'classifier': self.classifier,
                'niter': self.niter,
                'pos_sample_size': self.pos_sample_size,
                'replace': self.replace,
                'list_model': self.list_model}

    def fit(self, df_train, target):
        """Create bagging samples from a DataFrame and fit one model per sample

        Parameters
        ----------
        df_train : DataFrame
            Training dataset (features and target column)
        target : String
            Target name

        Returns
        -------
        Bagging
            self, with ``list_model`` holding the fitted models
        """
        # number of target=1 observations in each bagging sample
        if isinstance(self.pos_sample_size, int):
            n_pos = self.pos_sample_size
        else:
            n_pos = int(self.pos_sample_size * (df_train[target] == 1).sum())

        models = []
        for _ in range(self.niter):
            # sample creation
            df_train_bag = create_sample(df_train, target, n_pos, replace=self.replace)

            # X_train / y_train
            y_train_bag = df_train_bag[target]
            X_train_bag = df_train_bag.drop(columns=target)

            # Each sample gets its OWN estimator: storing self.classifier itself
            # would make every entry the same object, refitted at each iteration.
            model = self._new_model()
            model.fit(X_train_bag, y_train_bag)
            models.append(model)

        self.list_model = models
        self.is_fitted = True

        return self

    def predict(self, df):
        """Apply the models fitted on the samples to a dataset and average them

        Parameters
        ----------
        df : DataFrame
            Dataset to apply the models on

        Returns
        -------
        numpy.ndarray (float)
            Class-1 probabilities averaged over the fitted models
        list (float)
            Averaged probabilities rounded to 0.0 / 1.0 for each observation
            (an exact 0.5 rounds to 0.0)

        Raises
        ------
        AssertionError
            If the object has not been fitted
        """
        self._check_is_fitted()

        n_models = len(self.list_model)

        # one row of class-1 probabilities per fitted model
        mat_prob = np.zeros((n_models, df.shape[0]))
        for j, model in enumerate(self.list_model):
            mat_prob[j] = model.predict_proba(df)[:, 1]

        # probas averaging
        list_prob_pred = mat_prob.sum(axis=0) / n_models

        # voting
        list_pred = list(np.round(list_prob_pred))

        return list_prob_pred, list_pred

    def bag_feature_importance(self, X):
        """Get features importance by averaging importances of the fitted models

        Parameters
        ----------
        X : DataFrame
            Input dataset (only its columns are used, as feature names)

        Returns
        -------
        dict
            {feature : importance}

        Raises
        ------
        AssertionError
            If the object has not been fitted
        """
        self._check_is_fitted()

        n_models = len(self.list_model)

        # one row of importances per fitted model
        mat_feat_imp = np.zeros((n_models, len(X.columns)))
        for i, model in enumerate(self.list_model):
            mat_feat_imp[i] = model.feature_importances_

        # averaging importances
        list_feat_imp_moy = mat_feat_imp.sum(axis=0) / n_models

        return dict(zip(X.columns, list_feat_imp_moy))
def create_sample(df, target, pos_target_nb, replace=False, neg_ratio=3):
    """Generate a DataFrame sample with a selected number of target=1

    Parameters
    ----------
    df : DataFrame
        Input dataset
    target : String
        Target name
    pos_target_nb : int
        Number of target=1 observations in the sample
    replace : Boolean (Default : False)
        If True, create samples with replacement
    neg_ratio : int/float (Default : 3)
        Number of target=0 observations drawn per target=1 observation

    Returns
    -------
    DataFrame
        sample dataset : the target=1 rows followed by the target=0 rows
    """
    # split target = 1 / 0
    df_pos = df.loc[df[target] == 1]
    df_neg = df.loc[df[target] == 0]

    # never ask for more target=0 rows than the dataset holds
    # (the cap applies with or without replacement)
    n_size = min(int(neg_ratio * pos_target_nb), df_neg.shape[0])

    # sample creation
    df_bag = pd.concat(
        (df_pos.sample(n=pos_target_nb, replace=replace),
         df_neg.sample(n=n_size, replace=replace)),
        axis=0)

    return df_bag