# Source code for AutoMxL.main

from AutoMxL.Utils.Display import print_title1
from AutoMxL.Utils.Decorators import timer
from AutoMxL.Explore.Explore import explore
from AutoMxL.Preprocessing.Date import DateEncoder
from AutoMxL.Preprocessing.Missing_Values import NAEncoder
from AutoMxL.Preprocessing.Outliers import OutliersEncoder
from AutoMxL.Preprocessing.Categorical import CategoricalEncoder
from AutoMxL.Modelisation.HyperOpt import *
from AutoMxL.Select_Features.Select_Features import FeatSelector
from time import time


class AML(pd.DataFrame):
    """Covers the complete pipeline of a classification project from a raw dataset to a deployable model.

    AML is built as a class inherited from pandas DataFrame. Each Machine Learning step corresponds to a method that
    can be called with default or user-supplied parameters.

    - explore: explore dataset and identify features types
    - preprocess: clean and prepare data (optional : outliers processing).
    - select_features: features selection (optional)
    - model_train_test : splits AML into train/test sets to fit/apply models with random search.
      Returns the list of the valid models (without overfitting) and the best one.

    deployment methods:

    - preprocess_apply : apply fitted preprocessing transformation to a new dataset
    - select_features_apply : idem
    - model_predict : apply fitted models to a new dataset


    Notes :

    - A method requires that the former one has been applied (the current step is given by the "step" attribute)
    - Target has to be binary and encoded as int (1/0) (see MLGB59.Start.Encode_Target module if you need help)
    - don't call your target "target" please :>

    Parameters
    ----------
    _obj : DataFrame
        Source Dataset
    target : string
        target name
  """

    def __init__(self, *args, target=None, **kwargs):
        super(AML, self).__init__(*args, **kwargs)
        assert target != 'target', 'target name cannot be "target"'
        # parameters
        self.target = target
        # attributes
        self.step = 'None'
        self.d_features = None
        self.d_preprocess = None
        self.features_selector = None
        self.d_hyperopt = None
        self.is_fitted_preprocessing = False
        self.is_fitted_selector = False
        self.is_fitted_model = False

    """
    --------------------------------------------------------------------------------------------------------------------
    """

    def __repr__(self):
        return 'AutoMxL instance'

    """
    --------------------------------------------------------------------------------------------------------------------
    """

    def duplicate(self):
        res = AML(self)
        res.__dict__.update(self.__dict__)
        return res

    """
    --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def explore(self, verbose=False):
        """data exploration and features type identification

        Note :  if you disagree with automated identification, you can directly modify d_features attribute

        Create self.d_features : dict {x : list of variables names}
            - date: date features
            - identifier: identifier features
            - verbatim: verbatim features
            - boolean: boolean features
            - categorical: categorical features
            - numerical: numerical features
            - NA: features which contains NA values
            - low_variance: list of the features with low variance and unique values

        Parameters
        ----------
        verbose : boolean (Default False)
            Get logging information
        """
        if verbose:
            start_time = time()
            print_title1('Explore')

        df_local = self.copy()
        if self.target is not None:
            df_local = df_local.drop(self.target, axis=1)

        # call std_audit_dataset function
        self.d_features = explore(
            df_local, verbose=verbose)

        self.step = 'explore'

        # created attributes display
        if verbose:
            color_print("\nCreated attributes :  d_features (dict) ")
            print("Keys :")
            print("  -> date")
            print("  -> identifier")
            print("  -> verbatim")
            print("  -> boolean")
            print("  -> categorical")
            print("  -> numerical")
            print("  -> date")
            print("  -> NA")
            print("  -> low_variance")
            print('\n\t\t>>>', 'explore execution time:', round(time() - start_time, 4), 'secs. <<<')

    """
    --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def preprocess(self, date_ref=None, process_outliers=False,
                   cat_method='deep_encoder', verbose=False):
        """Prepare the data before feeding it to the model :

            - remove low variance features
            - remove identifiers and verbatims features
            - transform date features to timedelta
            - fill missing values
            - process categorical and boolean data (one-hot-encoding or Pytorch NN encoder)
            - replace outliers (optional)

            create self.d_preprocess : dict {step : transformation}
                - remove: list of the features to remove
                - date: fitted DateEncoder object
                - NA: fitted NAEncoder object
                - categorical: fitted CategoricalEncoder object
                - outlier: fitted OutlierEncoder object

        Parameters
        ----------
        date_ref : string '%d/%m/%y' (Default : None)
            ref date to compute date features timedelta.
            If None, today date
        process_outliers : boolean (Default : False)
            Enable outliers replacement
        cat_method : string (Default : 'deep_encoder')
            Categorical features encoding method
        verbose : boolean (Default False)
            Get logging information

        """
        # check pipe step
        assert self.step in ['explore'], 'apply explore method first'
        assert not self.is_fitted_preprocessing, 'preprocessing encoders already fitted'

        ###############################
        # Fit and apply preprocessing #
        ###############################
        if verbose:
            start_time = time()
            print_title1('Fit and apply preprocessing')

        target = self.target
        df_local = self.copy()

        # Features Removing 'zero variance / verbatims / identifiers)
        if verbose:
            color_print("Features removing (zero variance / verbatims / identifiers)")

        l_remove = self.d_features['low_variance'] + self.d_features['verbatim'] + self.d_features['identifier']
        if len(l_remove) > 0:
            df_local = df_local.drop(l_remove, axis=1)

        if verbose:
            print("  >", len(l_remove), "features to remove")
            if len(l_remove) > 0:
                print(" ", l_remove)

        # Transform date -> time between date and date_ref
        if verbose:
            color_print("Transform date")

        date_encoder = DateEncoder(method='timedelta', date_ref=date_ref)
        date_encoder.fit(self, l_var=self.d_features['date'], verbose=False)
        df_local = date_encoder.transform(df_local, verbose=verbose)

        # Missing Values
        if verbose:
            color_print('Missing values')

        NA_encoder = NAEncoder()
        NA_encoder.fit(df_local, l_var=None, verbose=False)
        df_local = NA_encoder.transform(df_local, verbose=verbose)

        # replace outliers
        if process_outliers:
            if verbose:
                color_print('Outliers')
            out_encoder = OutliersEncoder()
            out_encoder.fit(df_local, l_var=None, verbose=False)
            df_local = out_encoder.transform(df_local, verbose=verbose)
        else:
            out_encoder = None

        # categorical processing
        if verbose:
            color_print('Encode Categorical and boolean')

        cat_col = self.d_features['categorical'] + self.d_features['boolean']
        # apply one-hot encoding if target not filled in class parameters
        if self.target is None:
            cat_method = 'one_hot'
            color_print('No target -> one_hot encoding !', 31)

        # get embedding
        cat_encoder = CategoricalEncoder(method=cat_method)
        cat_encoder.fit(self, l_var=cat_col, target=self.target, verbose=verbose)
        df_local = cat_encoder.transform(df_local, verbose=verbose)

        # store preprocessing params
        self.d_preprocess = {'remove': l_remove, 'date': date_encoder, 'NA': NA_encoder, 'categorical': cat_encoder}
        if out_encoder is not None:
            self.d_preprocess['outlier'] = out_encoder

        if verbose:
            color_print("\nCreated attributes :  d_preprocess (dict) ")
            print("Keys :")
            print("  -> remove")
            print("  -> date")
            print("  -> NA")
            print("  -> categorical")
            print("  -> outlier (optional)")

        # is_fitted
        self.is_fitted_preprocessing = True

        # update self
        self.__dict__.update(df_local.__dict__)
        self.target = target
        self.step = 'preprocess'

        if verbose:
            color_print("New DataFrame size ")
            print("  > row number : ", self.shape[0], "\n  > col number : ", self.shape[1])
            print('\n\t\t>>>', 'proprocess execution time:', round(time() - start_time, 4), 'secs. <<<')

    """
    --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def preprocess_apply(self, df, verbose=False):
        """Apply preprocessing.

        Requires preprocess method to have been applied (so that all encoder are fitted).

        Parameters
        ----------
        df : DataFrame
            dataset to apply preprocessing on
        verbose : boolean (Default False)
            Get logging information

        Returns
        -------
        DataFrame : Preprocessed dataset
        """
        if verbose:
            start_time = time()
            print_title1('Apply Preprocessing')

        # check pipe step and is_fitted
        assert self.is_fitted_preprocessing, "fit first (please)"

        #
        df_local = df.copy()

        # Remove features with zero variance / verbatims and identifiers
        if verbose:
            color_print("Remove features (zero variance, verbatims and identifiers")

        if len(self.d_preprocess['remove']) > 0:
            df_local = df_local.drop(self.d_preprocess['remove'], axis=1)
            if verbose:
                print("  >", len(self.d_preprocess['remove']), 'removed features')
        else:
            if verbose:
                print("  > No features to remove")

        # Transform date -> time between date and date_ref
        if verbose:
            color_print("Transform date")
        df_local = self.d_preprocess['date'].transform(df_local, verbose=verbose)

        # Missing Values
        if verbose:
            color_print('Missing values')
        df_local = self.d_preprocess['NA'].transform(df_local, verbose=verbose)

        # replace outliers
        if 'outlier' in list(self.d_preprocess.keys()):
            if verbose:
                color_print('Outliers')
            df_local = self.d_preprocess['outlier'].transform(df_local, verbose=verbose)

        # categorical processing
        if verbose:
            color_print('Encode categorical and boolean')
            print('\n\t\t>>>', 'preprocess_apply execution time:', round(time() - start_time, 4), 'secs. <<<')
        df_local = self.d_preprocess['categorical'].transform(df_local, verbose=verbose)

        return df_local

    """
    --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def select_features(self, method='pca', verbose=False):
        """ fit and apply features selection (optional)

        Parameters
        ----------
        method : string (Default pca)
            method use to select features
        verbose : boolean (Default False)
            Get logging information

        """
        assert self.step in ['preprocess'], 'apply preprocess method'

        target = self.target

        if verbose:
            start_time = time()
            print_title1('Features Selection')

        df_local = self.copy()

        l_select_var = [col for col in df_local.columns.tolist() if col != self.target]

        # df_local = select_features(df=df_local, target=self.target, method=method, verbose=verbose)

        features_selector = FeatSelector(method=method)
        features_selector.fit(df_local, l_var=l_select_var, verbose=verbose)
        df_local = features_selector.transform(df_local, verbose=verbose)

        self.__dict__.update(df_local.__dict__)
        self.target = target
        self.features_selector = features_selector
        self.is_fitted_selector = True
        self.step = 'features_selection'

        if verbose :
            print('\n\t\t>>>', 'select_features execution time:', round(time() - start_time, 4), 'secs. <<<')

    """
        --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def select_features_apply(self, df, verbose=False):
        """Apply features selection.

         Requires Select_Features method to have been applied

         Parameters
         ----------
         df : DataFrame
             dataset to apply selection on
         verbose : boolean (Default False)
             Get logging information

         Returns
         -------
         DataFrame : reduced dataset
         """
        # check pipe step and is_fitted
        assert self.is_fitted_selector, "fit first (please)"

        if verbose:
            start_time = time()
            print_title1('Apply select_features')

        df_local = df.copy()

        df_local = self.features_selector.transform(df_local, verbose=verbose)

        if verbose:
            print('\n\t\t>>>', 'select_features_apply execution time:', round(time() - start_time, 4), 'secs. <<<')

        return df_local

    """
        --------------------------------------------------------------------------------------------------------------------
    """

[docs]    def model_train_test(self, clf='XGBOOST', grid_param=None, metric='F1', delta_auc=0.03, top_bagging=False, n_comb=10,
                         comb_seed=None,
                         verbose=False):
        """train and test models with random search

        - creates models with random hyper-parameters combinations from HP grid
        - splits (random 80/20) train/test sets to fit/apply models
        - identifies valid models (auc(train)-auc(test)<0.03
        - gets the best model in respect of a selected metric among valid model


        Notes :

        - Available classifiers : Random Forest, XGBOOST
        - can enable bagging algo with top_bagging parameter

        Parameters
        ----------
        clf : string (Default : 'XGBOOST')
            classifier used for modelisation
        grid_param : dict
            random search grid {Hyperparameter name : values list}
        metric : string (Default : 'F1')
            objective metric
        top_bagging : boolean (Default : False)
            enable Bagging
        n_comb : int (Default : 10)
            HP combination number
        comb_seed : int (Default : None)
            random combination seed
        verbose : boolean (Default False)
            Get logging information

        Returns
        -------
        dict
            {model_index : {'HP', 'probas', 'model', 'features_importance', 'train_metrics', 'metrics', 'output'}
        list
            valid models indexes
        int
            best model index
        DataFrame
            models summary
        """
        assert self.step in ['preprocess', 'features_selection'], 'apply preprocess method'

        if verbose:
            start_time = time()
            print_title1('Train predict')

        # Train/Test split
        df_train, df_test = train_test(self, 0.2)

        # Create Hyperopt object
        hyperopt = HyperOpt(classifier=clf, grid_param=grid_param, n_param_comb=n_comb,
                            bagging=top_bagging, comb_seed=comb_seed)

        # fit model on train set
        if verbose:
            color_print('training models')

        hyperopt.fit(df_train, self.target, verbose=verbose)

        # Apply model on test set
        if verbose:
            color_print('\napplying models')

        d_fitted_models = hyperopt.predict(df_test, self.target, delta_auc=delta_auc, verbose=verbose)

        # model selection
        if verbose:
            color_print('\nbest model selection')
        best_model_idx, l_valid_models = hyperopt.get_best_model(d_fitted_models, metric=metric, delta_auc_th=delta_auc,
                                                                 verbose=False)

        df_model_res = hyperopt.model_res_to_df(d_fitted_models, sort_metric=metric)

        if best_model_idx is not None:
            print_title1('best model : ' + str(best_model_idx))
            print(metric + ' : ' + str(round(d_fitted_models[best_model_idx]['metrics'][metric], 4)))
            print('AUC : ' + str(round(d_fitted_models[best_model_idx]['metrics']['Roc_auc'], 4)))
            if round(d_fitted_models[best_model_idx]['metrics'][metric], 4) == 1.0:
                color_print("C'était pas qu'un physique finalement hein ?", 32)
            print('\n\t\t>>>', 'model_train_test execution time:', round(time() - start_time, 4), 'secs. <<<')

        self.d_hyperopt = hyperopt
        self.is_fitted_model = True

        return d_fitted_models, l_valid_models, best_model_idx, df_model_res

    """
    ------------------------------------------------------------------------------------------------------------------------
    """

[docs]    def model_train(self, clf='XGBOOST', grid_param=None, top_bagging=False, n_comb=10, comb_seed=None, verbose=False):
        """train models with random search

        - creates models with random hyper-parameters combinations from HP grid
        - fits models on self

        Notes :

        - Available classifiers : Random Forest, XGBOOST
        - can enable bagging algo with top_bagging parameter

        Parameters
        ----------
        clf : string (Default : 'XGBOOST')
            classifier used for modelisation
        grid_param : dict
            random search grid {Hyperparameter name : values list}
        top_bagging : boolean (Default : False)
            enable Bagging
        n_comb : int (Default : 10)
            HP combination number
        comb_seed : int (Default : None)
            random combination seed
        verbose : boolean (Default False)
            Get logging information

        """
        assert self.step in ['preprocess', 'features_selection'], 'apply preprocess method'

        df_train = self.copy()
        target = self.target


        if verbose:
            start_time = time()
            print_title1('Train Models')

        # instantiate Hyperopt object
        hyperopt = HyperOpt(classifier=clf, grid_param=grid_param, n_param_comb=n_comb,
                            bagging=top_bagging, comb_seed=comb_seed)

        # fit model on train set
        if verbose:
            color_print('training models')

        # fit hyperopt on self
        hyperopt.fit(df_train, self.target, verbose=verbose)

        self.d_hyperopt = hyperopt
        self.is_fitted_model = True
        self.target = target
        self.step = 'train_model'

        if verbose:
            print('\n\t\t>>>', 'model_train execution time:', round(time() - start_time, 4), 'secs. <<<')

    """
    ------------------------------------------------------------------------------------------------------------------------
    """

[docs]    def model_predict(self, df, metric='F1', delta_auc=0.03, verbose=False):
        """apply fitted models on a dataset

        - identifies valid models (auc(train)-auc(test)<0.03
        - gets the best model in respect of a selected metric among valid model

        Parameters
        ----------
        metric : string (Default : 'F1')
            objective metric
        verbose : boolean (Default False)
            Get logging information

        Returns
        -------
        dict
            {model_index : {'HP', 'probas', 'model', 'features_importance', 'train_metrics', 'metrics', 'output'}
        list
            valid models indexes
        int
            best model index
        DataFrame
            models summary
        """
        assert self.is_fitted_model, "model is not fitted yet, apply model_train_predict or model_train methods"

        if verbose:
            start_time = time()
            color_print('\napplying models')

        # apply models on dataset
        d_fitted_models = self.d_hyperopt.predict(df, self.target, delta_auc=delta_auc, verbose=verbose)

        # model selection
        if verbose:
            color_print('\nbest model selection')
        best_model_idx, l_valid_models = self.d_hyperopt.get_best_model(d_fitted_models, metric=metric,
                                                                        delta_auc_th=delta_auc,
                                                                        verbose=False)
        # store model results
        df_model_res = self.d_hyperopt.model_res_to_df(d_fitted_models, sort_metric=metric)

        if best_model_idx is not None:
            print_title1('best model : ' + str(best_model_idx))
            print(metric + ' : ' + str(round(d_fitted_models[best_model_idx]['metrics'][metric], 4)))
            print('AUC : ' + str(round(d_fitted_models[best_model_idx]['metrics']['Roc_auc'], 4)))
            if round(d_fitted_models[best_model_idx]['metrics'][metric], 4) == 1.0:
                color_print("C'était pas qu'un physique finalement hein ?", 32)
            print('\n\t\t>>>', 'model_predict execution time:', round(time() - start_time, 4), 'secs. <<<')

        return d_fitted_models, l_valid_models, best_model_idx, df_model_res
# Source code for AutoMxL.__main__

# Source code for AutoMxL.main