Source code for AutoMxL.Explore.Explore

""" Global dataset information functions :

 - explore (func): Identify variables types and gives global information about the dataset (NA, low variance features)
 - low variance features (func): identify features with low variance
 - get_features_type (func): get all features per type
"""
from sklearn.preprocessing import MinMaxScaler
from AutoMxL.Explore.Features_Type import *
from AutoMxL.Utils.Display import *


[docs]def explore(df, verbose=False): """Identify variables types and gives global information about the dataset - Variables type : - date - identifier - verbatim - boolean - categorical - numerical - variables containing NA values - low variance and unique values variables See get_features_type function doc for type identification heuristics Parameters ---------- df : DataFrame input dataset verbose : boolean (Default False) Get logging information Returns ------- dict {x : variables names list } - date : date features - identifier : identifier features - verbatim : verbatim features - boolean : boolean features - categorical : categorical features - numerical : numerical features - categorical : categorical features - date : date features - NA : features which contains NA values - low_variance : list of the features with low variance """ # dataset dimensions if verbose: color_print("Dimensions :") print(" > row number :", df.shape[0], "\n > col number :", df.shape[1]) ######################### # Low variance features ######################### if verbose: color_print('Low variance features') l_low_var = \ low_variance_features(df, var_list=df._get_numeric_data().columns.tolist(), threshold=0, rescale=True, verbose=verbose).index.tolist() # categorical features with unique values l_unique = [col for col in df.columns.tolist() if df[col].dtype == 'object' and df[col].nunique(dropna=True) == 1] l_low_var = l_low_var + l_unique df_valid = df.drop(l_low_var, axis=1).copy() ################# # features type # ################# d_features = get_features_type(df_valid, l_var=None, th=0.95) if verbose: color_print("Features type identification : ") list(map(lambda typ: print(" > " + typ + " : " + str(len(d_features[typ])) + ' (' + str( round(len(d_features[typ]) / df_valid.shape[1] * 100)) + '%)'), d_features.keys())) ###################### # NA values analysis ###################### df_col = pd.DataFrame(df_valid.columns.values, columns=['variables']) df_col['Nbr NA'] = df_valid.isna().sum().tolist() df_col['Taux NA'] = df_col['Nbr NA'] / df_valid.shape[0] # features containing NA values NA_columns = df_col.loc[df_col['Nbr NA'] > 0].sort_values('Nbr NA', ascending=False).variables.tolist() col_des = df_col['Taux NA'].describe() if verbose: color_print(str(len(NA_columns)) + " features containing NA") print(' > Taux NA moyen : ' + str(round(col_des['mean'] * 100, 2)) + '%', '\n > min : ' + str(round(col_des['min'] * 100, 2)) + '%', '\n > max : ' + str(round(col_des['max'] * 100, 2)) + '%') # store into DataFrame d_features['NA'] = NA_columns d_features['low_variance'] = l_low_var return d_features
""" ------------------------------------------------------------------------------------------------------------------------- """
[docs]def get_features_type(df, l_var=None, th=0.95): """ Get all features per type : - date : try to apply to_datetime - identifier : - #(unique values)/#(total values) > threshold (default 0.95) - AND length is the same for all values (for non NA) - verbatim : - #(unique values)/#(total values) >= threshold (default 0.95) - AND length is NOT the same for all values (for non NA) - boolean : #(distinct values) = 2 - categorical : - not a date - #(unique values)/#(total values) < threshold (default 0.95) - AND #(uniques values)>2 - AND for num values #(unique values)<30 - numerical : others Parameters ---------- df : DataFrame input dataset l_var : list (Default : None) variable names th : float (Default : 0.95) threshold used to identify identifiers/verbatims variables Returns ------- dict { type : variables name list} """ d_output = {} if l_var is None: df_local = df.copy() else: df_local = df[l_var].copy() l_col = df_local.columns.tolist() for typ in ['date', 'identifier', 'verbatim', 'boolean', 'categorical']: d_output[typ] = features_from_type(df_local, typ, l_var=l_col, th=th) l_col = [x for x in l_col if (x not in d_output[typ])] d_output['numerical'] = l_col return d_output
""" ------------------------------------------------------------------------------------------------------------------------- """
[docs]def low_variance_features(df, var_list=None, threshold=0, rescale=True, verbose=False): """Identify numerical features with low variance : (< threshold). Possible to rescale feature before computing. Parameters ---------- df : DataFrame input DataFrame var_list : list (default : None) names of the variables to check variance if None : all the numerical features threshold : float (default : 0) variance threshold rescale : bool (default : true) enable MinMaxScaler before computing variance verbose : boolean (Default False) Get logging information Returns ------- list Names of the variables with low variance """ # if var_list = None, get all num features # else, remove features from var_list whose type is not num l_num = df._get_numeric_data().columns.tolist() if var_list is None: var_list = l_num else: var_list = [col for col in var_list if col in l_num] df_bis = df.copy() if rescale: scler = MinMaxScaler() df_bis[var_list] = scler.fit_transform(df_bis[var_list].astype('float64')) selected_var = df_bis[var_list].var().loc[df_bis.var() <= threshold] if verbose: # print('features : ',list(var_list)) if rescale: print(' **MinMaxScaler [0,1]') print(' ', str(len(selected_var)) + ' feature(s) with variance <= threshold (' + str(threshold) + ')') return selected_var.sort_values(ascending=True)