Source code for AutoMxL.Preprocessing.Outliers

""" Outliers handling functions

 - OutliersEncoder (class): identify and replace outliers
 - get_cat_outliers (func): identify categorical features containing outliers
 - get_num_outliers (func): identify numerical features containing outliers
 - replace_category (func): replace categories of a categorical variable
 - replace_extreme_values (func): replace extreme values with threshold values
"""
import pandas as pd
import numpy as np
from AutoMxL.Utils.Display import *


class OutliersEncoder(object):
    """Identify and replace outliers in categorical and numerical features.

    - num : x is an outlier <=> abs(x - mean) > num_xstd * std
    - cat : a category is an outlier <=> its frequency is < cat_threshold

    Parameters
    ----------
    cat_threshold : float (Default : 0.02)
        Minimum modality frequency
    num_xstd : int (Default : 4)
        Standard-deviation gap coefficient
    """

    def __init__(self, cat_threshold=0.02, num_xstd=4):
        # No trailing comma here: `x = value,` would store a 1-tuple and
        # break the frequency comparison done at fit time.
        self.cat_threshold = cat_threshold
        self.num_xstd = num_xstd
        self.is_fitted = False
        self.l_var_num = []
        self.l_var_cat = []
        self.d_num_outliers = {}
        self.d_cat_outliers = {}

    @staticmethod
    def _is_categorical(series):
        """Return True if the column holds text (object or pandas string dtype)."""
        dtype = series.dtype
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def fit(self, df, l_var=None, verbose=False):
        """Fit encoder

        Parameters
        ----------
        df : DataFrame
            input dataset
        l_var : list (Default : None)
            features to encode.
            If None, all features
        verbose : boolean (Default False)
            Get logging information
        """
        # split columns into categorical (text) and everything else
        columns = df.columns.tolist()
        s_str = {col for col in columns if self._is_categorical(df[col])}
        s_num = {col for col in columns if col not in s_str}

        # names absent from df belong to neither set and are silently skipped
        candidates = columns if l_var is None else list(l_var)

        # keep only non-boolean-like features (more than 2 distinct values)
        self.l_var_cat = []
        self.l_var_num = []
        for col in candidates:
            if col in s_str:
                target = self.l_var_cat
            elif col in s_num:
                target = self.l_var_num
            else:
                continue
            if df[col].nunique() > 2:
                target.append(col)

        # always reassign both dicts so that a refit never keeps stale results
        # cat outliers
        if self.l_var_cat:
            self.d_cat_outliers = get_cat_outliers(df, l_var=self.l_var_cat,
                                                   threshold=self.cat_threshold, verbose=False)
        else:
            self.d_cat_outliers = {}

        # num outliers
        if self.l_var_num:
            self.d_num_outliers = get_num_outliers(df, l_var=self.l_var_num,
                                                   xstd=self.num_xstd, verbose=False)
        else:
            self.d_num_outliers = {}

        # Fitted !
        self.is_fitted = True

        # verbose
        if verbose:
            print(" **method cat: frequency<" + str(self.cat_threshold) + " / num:( x: |x - mean| > " + str(self.num_xstd) + "* var)")
            print(" >", len(self.d_cat_outliers) + len(self.d_num_outliers), "features with outliers")
            if self.d_cat_outliers:
                print(" - cat", list(self.d_cat_outliers.keys()))
            if self.d_num_outliers:
                print(" - num", list(self.d_num_outliers.keys()))

    def transform(self, df, verbose=False):
        """Transform dataset features using the encoder.

        Can be done only if encoder has been fitted

        Parameters
        ----------
        df : DataFrame
            dataset to transform
        verbose : boolean (Default False)
            Get logging information

        Returns
        -------
        DataFrame
            Copy of df with outliers replaced

        Raises
        ------
        AssertionError
            If the encoder has not been fitted
        """
        # explicit raise (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O
        if not self.is_fitted:
            raise AssertionError('fit the encoding first using .fit method')

        df_local = df.copy()

        # cat features : rare categories are aggregated into 'outliers'
        if self.d_cat_outliers:
            if verbose:
                print(" - cat aggregated values:")
            for col, categories in self.d_cat_outliers.items():
                df_local = replace_category(df_local, col, categories,
                                            replace_with='outliers', verbose=verbose)

        # num features : values beyond [lower, upper] are set to the limit
        if self.d_num_outliers:
            if verbose:
                print(" - num values replaces:")
            for col, limits in self.d_num_outliers.items():
                df_local = replace_extreme_values(df_local, col, limits[0], limits[1],
                                                  verbose=verbose)

        # if no features with outliers (printed whatever the verbose flag, as before)
        if not self.d_cat_outliers and not self.d_num_outliers:
            print(" > no outlier to replace")

        return df_local

    def fit_transform(self, df, l_var=None, verbose=False):
        """Fit and transform dataset with encoder

        Parameters
        ----------
        df : DataFrame
            input dataset
        l_var : list (Default : None)
            features to encode.
            If None, all features
        verbose : boolean (Default False)
            Get logging information (transform step only)

        Returns
        -------
        DataFrame
            Copy of df with outliers replaced
        """
        # fit reads df without modifying it and transform works on its own
        # copy, so no extra defensive copy is needed here
        self.fit(df, l_var=l_var, verbose=False)
        return self.transform(df, verbose=verbose)
""" ---------------------------------------------------------------------------------------------- """
def get_cat_outliers(df, l_var=None, threshold=0.05, verbose=False):
    """Outliers detection for selected/all categorical features.

    Method : modalities with frequency < threshold (Default 5%).
    Missing values are counted as a modality of their own.

    Parameters
    ----------
    df : DataFrame
        Input dataset
    l_var : list (Default : None)
        Names of the features
        If None, all the categorical features
    threshold : float (Default : 0.05)
        Minimum modality frequency
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    dict
        {variable : list of categories considered as outliers}
    """
    def _is_categorical(col):
        # text columns: object dtype or pandas string extension dtype
        dtype = df[col].dtype
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    # if l_var is None, get all categorical features
    # else, remove features from l_var whose type is not categorical
    l_cat = [col for col in df.columns.tolist() if _is_categorical(col)]
    if l_var is None:
        l_var = l_cat
    else:
        l_var = [col for col in l_var if col in l_cat]

    d_outliers = {}
    for col in l_var:
        # relative frequency of every modality (NaN included)
        freq = df[col].value_counts(dropna=False, normalize=True)
        rare = freq[freq < threshold].index.tolist()
        # NOTE(review): a feature is kept only when it has MORE than one rare
        # category (the former comment said "at least 1"); presumably because
        # aggregating a single category would merely rename it - confirm.
        if len(rare) > 1:
            d_outliers[col] = rare

    if verbose:
        color_print('cat features outliers identification (frequency<' + str(threshold) + ')')
        print(' > features : ', df[l_var].columns, )
        print(" > containing outliers", list(d_outliers.keys()))

    return d_outliers
""" ------------------------------------------------------------------------------------------------------------------------- """
def get_num_outliers(df, l_var=None, xstd=3, verbose=False):
    """Outliers detection for selected/all numerical features.

    Method : x outlier <=> abs(x - mean) > xstd * std
    (population standard deviation, ddof=0)

    Parameters
    ----------
    df : DataFrame
        Input dataset
    l_var : list (Default : None)
        Names of the features
        If None, all the num features
    xstd : int (Default : 3)
        Standard-deviation gap coefficient
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    dict
        {variable : [lower_limit, upper_limit]}
    """
    # if l_var is None, get all num features
    # else, remove features from l_var whose type is not num
    l_num = df._get_numeric_data().columns.tolist()
    if l_var is None:
        l_var = l_num
    else:
        l_var = [col for col in l_var if col in l_num]

    df_local = df[l_var]

    # Per-column statistics. The DataFrame methods are called with an explicit
    # axis because np.mean / np.min / np.max on a DataFrame forward axis=None,
    # which recent pandas reduces over both axes into a single scalar.
    # ddof=0 matches the np.std default used previously.
    data_mean = df_local.mean(axis=0)
    anomaly_cut_off = df_local.std(axis=0, ddof=0) * xstd
    lower_limit = data_mean - anomaly_cut_off
    upper_limit = data_mean + anomaly_cut_off
    data_min = df_local.min(axis=0)
    data_max = df_local.max(axis=0)

    # store variables having at least one value outside [lower, upper]
    d_outliers = {col: [lower_limit[col], upper_limit[col]]
                  for col in df_local.columns.tolist()
                  if (data_min[col] < lower_limit[col] or data_max[col] > upper_limit[col])}

    if verbose:
        color_print('num features outliers identification ( x: |x - mean| > ' + str(xstd) + ' * var)')
        print(' > features : ', l_var)
        print(" > containing outliers", list(d_outliers.keys()))

    return d_outliers
""" ------------------------------------------------------------------------------------------------------------------------- """
def replace_category(df, var, categories, replace_with='outliers', verbose=False):
    """Replace a set of categories of a categorical variable with a single label.

    The input DataFrame is left untouched; the replacement is done on a copy.

    Parameters
    ----------
    df : DataFrame
        Input dataset
    var : string
        variable to modify
    categories : list(string)
        categories to replace
    replace_with : string (Default : 'outliers')
        word to replace categories with
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    DataFrame
        Modified dataset
    """
    result = df.copy()

    # rows whose value belongs to the categories to aggregate
    to_replace = result[var].isin(categories)
    result.loc[to_replace, var] = replace_with

    if verbose:
        print(' > ' + var + ' ', categories)

    return result
""" ------------------------------------------------------------------------------------------------------------------------- """
def replace_extreme_values(df, var, lower_th=None, upper_th=None, verbose=False):
    """Replace extreme values : > upper threshold or < lower threshold

    Values above upper_th are set to upper_th and values below lower_th are
    set to lower_th. The input DataFrame is left untouched.

    Parameters
    ----------
    df : DataFrame
        Input dataset
    var : string
        variable to modify
    lower_th : int/float (Default=None)
        lower threshold (None : no lower bound)
    upper_th : int/float (Default=None)
        upper threshold (None : no upper bound)
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    DataFrame
        Modified dataset

    Raises
    ------
    AssertionError
        If both lower_th and upper_th are None
    """
    # explicit raise (same exception type as the former assert) so the check
    # is not stripped when Python runs with -O
    if lower_th is None and upper_th is None:
        raise AssertionError('specify at least one limit value')

    df_local = df.copy()

    # clip returns a new Series and upcasts when needed (e.g. a float limit on
    # an integer column), unlike an in-place .loc assignment of a float value
    df_local[var] = df_local[var].clip(lower=lower_th, upper=upper_th)

    if verbose:
        # only report the bounds actually provided (round(None) would fail)
        bounds = []
        if lower_th is not None:
            bounds.append(' < ' + str(round(lower_th, 4)))
        if upper_th is not None:
            bounds.append(' > ' + str(round(upper_th, 4)))
        print(' > ' + var + ' or'.join(bounds))

    return df_local