""" Global dataset information functions :
- explore (func): Identify variables types and gives global information about the dataset (NA, low variance features)
- low variance features (func): identify features with low variance
- get_features_type (func): get all features per type
"""
from sklearn.preprocessing import MinMaxScaler
from AutoMxL.Explore.Features_Type import *
from AutoMxL.Utils.Display import *
def explore(df, verbose=False):
    """Identify variable types and give global information about the dataset.

    Steps:
    1. flag low variance features : numerical features with zero variance
       (after min-max rescaling) and object features with a single distinct non-NA value
    2. identify the type of the remaining features (date, identifier, verbatim,
       boolean, categorical, numerical)
    3. list the remaining features containing NA values

    See get_features_type function doc for type identification heuristics

    Parameters
    ----------
    df : DataFrame
        input dataset
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    dict
        {x : variables names list }
        - date : date features
        - identifier : identifier features
        - verbatim : verbatim features
        - boolean : boolean features
        - categorical : categorical features
        - numerical : numerical features
        - NA : features which contain NA values (most NA first),
          low variance features excluded
        - low_variance : features with low variance (excluded from the type keys above)
    """
    # dataset dimensions
    if verbose:
        color_print("Dimensions :")
        print(" > row number :", df.shape[0], "\n > col number :", df.shape[1])

    #########################
    # Low variance features
    #########################
    if verbose:
        color_print('Low variance features')

    # numerical features whose variance is 0 once rescaled to [0, 1]
    l_low_var = low_variance_features(
        df,
        var_list=df._get_numeric_data().columns.tolist(),
        threshold=0,
        rescale=True,
        verbose=verbose,
    ).index.tolist()

    # categorical features with unique values
    l_unique = [col for col in df.columns.tolist()
                if df[col].dtype == 'object' and df[col].nunique(dropna=True) == 1]
    l_low_var = l_low_var + l_unique

    # every following analysis ignores the low variance features
    df_valid = df.drop(l_low_var, axis=1).copy()

    #################
    # features type #
    #################
    d_features = get_features_type(df_valid, l_var=None, th=0.95)

    if verbose:
        color_print("Features type identification : ")
        n_valid = df_valid.shape[1]
        for typ in d_features.keys():
            n_typ = len(d_features[typ])
            # no remaining column : report 0% instead of dividing by zero
            pct = round(n_typ / n_valid * 100) if n_valid else 0
            print(" > " + typ + " : " + str(n_typ) + ' (' + str(pct) + '%)')

    ######################
    # NA values analysis
    ######################
    df_col = pd.DataFrame(df_valid.columns.values, columns=['variables'])
    df_col['Nbr NA'] = df_valid.isna().sum().tolist()
    df_col['Taux NA'] = df_col['Nbr NA'] / df_valid.shape[0]

    # features containing NA values, most NA first
    NA_columns = df_col.loc[df_col['Nbr NA'] > 0].sort_values('Nbr NA', ascending=False).variables.tolist()

    if verbose:
        # summary statistics are only needed for the log output
        col_des = df_col['Taux NA'].describe()
        color_print(str(len(NA_columns)) + " features containing NA")
        print(' > Taux NA moyen : ' + str(round(col_des['mean'] * 100, 2)) + '%',
              '\n > min : ' + str(round(col_des['min'] * 100, 2)) + '%',
              '\n > max : ' + str(round(col_des['max'] * 100, 2)) + '%')

    # store into the output dict
    d_features['NA'] = NA_columns
    d_features['low_variance'] = l_low_var

    return d_features
"""
-------------------------------------------------------------------------------------------------------------------------
"""
def get_features_type(df, l_var=None, th=0.95):
    """Split the features of a dataset by type.

    Types are tested in a fixed order (date, identifier, verbatim, boolean,
    categorical); a feature assigned to a type is not tested for the next ones.
    Whatever is left at the end is reported as numerical.

    Heuristics (applied by features_from_type):
    - date : try to apply to_datetime
    - identifier :
        - #(unique values)/#(total values) > threshold (default 0.95)
        - AND length is the same for all values (for non NA)
    - verbatim :
        - #(unique values)/#(total values) >= threshold (default 0.95)
        - AND length is NOT the same for all values (for non NA)
    - boolean : #(distinct values) = 2
    - categorical :
        - not a date
        - #(unique values)/#(total values) < threshold (default 0.95)
        - AND #(uniques values)>2
        - AND for num values #(unique values)<30
    - numerical : others

    Parameters
    ----------
    df : DataFrame
        input dataset
    l_var : list (Default : None)
        variable names
        if None : all the columns of df
    th : float (Default : 0.95)
        threshold used to identify identifiers/verbatims variables

    Returns
    -------
    dict
        { type : variables name list}
    """
    # work on a copy restricted to the requested variables
    working = (df if l_var is None else df[l_var]).copy()

    # features not yet assigned to a type
    remaining = working.columns.tolist()

    result = {}
    for kind in ('date', 'identifier', 'verbatim', 'boolean', 'categorical'):
        matched = features_from_type(working, kind, l_var=remaining, th=th)
        result[kind] = matched
        # a feature gets a single type : drop it from the candidates
        remaining = [name for name in remaining if name not in matched]

    # everything left over is considered numerical
    result['numerical'] = remaining

    return result
"""
-------------------------------------------------------------------------------------------------------------------------
"""
def low_variance_features(df, var_list=None, threshold=0, rescale=True, verbose=False):
    """Identify numerical features with low variance : (<= threshold).

    Possible to rescale features before computing the variance.
    The input DataFrame is not modified.

    Parameters
    ----------
    df : DataFrame
        input DataFrame
    var_list : list (default : None)
        names of the variables to check variance
        if None : all the numerical features
        non-numerical names are ignored
    threshold : float (default : 0)
        variance threshold
    rescale : bool (default : True)
        enable MinMaxScaler before computing variance
    verbose : boolean (Default False)
        Get logging information

    Returns
    -------
    Series
        variance of the features with low variance, indexed by feature name
        and sorted in ascending order
    """
    # if var_list = None, get all num features
    # else, remove features from var_list whose type is not num
    l_num = df._get_numeric_data().columns.tolist()
    if var_list is None:
        var_list = l_num
    else:
        var_list = [col for col in var_list if col in l_num]

    # copy only the candidate columns : the caller's frame stays untouched
    df_bis = df[var_list].copy()

    # nothing to rescale when no candidate is left
    # (the scaler is expected to reject an empty feature matrix)
    if rescale and var_list:
        scaler = MinMaxScaler()
        df_bis[var_list] = scaler.fit_transform(df_bis[var_list].astype('float64'))

    # compute the variance once, on the candidate columns only, and build the mask
    # from that same Series so non-numerical columns of df are never involved
    variances = df_bis.var()
    selected_var = variances.loc[variances <= threshold]

    if verbose:
        if rescale:
            print(' **MinMaxScaler [0,1]')
        print(' ', str(len(selected_var)) + ' feature(s) with variance <= threshold (' + str(threshold) + ')')

    return selected_var.sort_values(ascending=True)