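""" Features selection

 - FeatSelector (class) : fit a PCA-based selector on the numerical features of a dataset
 - select_features (func) : reduce the numerical features of a dataset with the chosen method
"""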
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
class FeatSelector(object):
    """Fit a PCA-based selector on the numerical features of a dataset.

    Available methods:

    - pca : standardize the features, then fit a PCA
    - no_rescale_pca : fit a PCA on the raw (unscaled) features

    Parameters
    ----------
    method : string (Default pca)
        method used to select features

    Attributes
    ----------
    is_fitted : True once ``fit`` has successfully fitted a PCA
    l_select_var : names of the features retained by the last ``fit`` call
    selector : fitted PCA object (None while not fitted)
    scaler : fitted StandardScaler (only set by the 'pca' method)
    """

    def __init__(self,
                 method='pca'
                 ):
        # Kept as an assert so that callers still get an AssertionError
        # on an unknown method.
        assert method in ['pca', 'no_rescale_pca'], 'invalid method : select pca / no_rescale_pca'
        self.method = method
        self.is_fitted = False
        self.l_select_var = []
        self.selector = None
        self.scaler = None

    # ------------------------------------------------------------------

    def fit(self, df, l_var=None, verbose=False):
        """Fit the selector on the numerical features of ``df``.

        Nothing is fitted (and a message is printed) when fewer than two
        numerical features are available.

        Parameters
        ----------
        df : DataFrame
            input dataset
        l_var : list
            candidate features. Only the numerical ones are kept.
            If None, all numerical features of ``df`` are used
        verbose : boolean (Default False)
            Get logging information
        """
        # Forget any previous fit: otherwise a failed re-fit would leave an
        # old selector attached to a freshly overwritten feature list.
        self.is_fitted = False
        self.selector = None
        self.scaler = None

        # Numerical features only. A plain "dtype != 'object'" test would also
        # let datetime / pandas string columns through and break scaler and PCA.
        l_num = [col for col in df.columns.tolist()
                 if pd.api.types.is_numeric_dtype(df[col])]

        if l_var is None:
            self.l_select_var = l_num
        else:
            self.l_select_var = [col for col in l_var if col in l_num]

        # The selector is only fitted when at least two features are available
        if len(self.l_select_var) <= 1:
            print('not enough features !')
            return

        if self.method == 'pca':
            # standardize before PCA and keep the scaler on the instance
            scaler = StandardScaler()
            df_local = scaler.fit_transform(df[self.l_select_var])
            self.scaler = scaler
        else:
            # 'no_rescale_pca' : PCA on the raw values
            df_local = df[self.l_select_var].copy()

        pca = PCA()
        pca.fit(df_local)
        self.selector = pca
        self.is_fitted = True

        if verbose:
            print(" **method : " + self.method)
            print(" >", len(self.l_select_var), "features to encode")
"""
----------------------------------------------------------------------------------------------
"""
"""
----------------------------------------------------------------------------------------------
"""
"""
----------------------------------------------------------------------------------------------
"""
def select_features(df, target, method='pca', verbose=False, var_threshold=0.95):
    """Reduce the numerical features of a dataset with a PCA.

    Available methods:

    - pca : standardize the numerical features, then apply a PCA
    - no_rescale_pca : apply a PCA on the raw (unscaled) numerical features

    The numerical features (target excluded) are replaced by the first
    principal components ('Dim0', 'Dim1', ...), keeping the smallest number
    of components whose cumulated explained variance ratio exceeds
    ``var_threshold``. Other features (including the target) are kept
    unchanged and the index of the returned dataset is reset.

    Parameters
    ----------
    df : DataFrame
        input dataset containing features
    target : string
        target name
    method : string (Default pca)
        method used to select features
    verbose : boolean (Default False)
        Get logging information
    var_threshold : float (Default 0.95)
        share of explained variance the kept components must exceed

    Returns
    -------
    DataFrame
        modified dataset
    """
    # str() keeps the failure an AssertionError even for a non-string method
    assert method in ['pca', 'no_rescale_pca'], str(method) + " invalid method : select pca, no_rescale_pca"

    # numerical features (except target) and the remaining ones
    l_num = [col for col in df._get_numeric_data().columns.tolist() if col != target]
    l_other = [col for col in df.columns.tolist() if col not in l_num]

    # Nothing to reduce: return the other features as-is instead of letting
    # the scaler / PCA fail on an empty feature matrix.
    if not l_num:
        if verbose:
            print("Numerical Dimensions reduction : 0 - > 0")
        return df[l_other].reset_index(drop=True)

    if method == 'pca':
        X = StandardScaler().fit_transform(df[l_num])
    else:
        X = df[l_num].copy()

    # fit and transform with pca, components named Dim0, Dim1, ...
    pca = PCA()
    components = pca.fit_transform(X)
    X_transform = pd.DataFrame(
        components,
        columns=['Dim' + str(i) for i in range(components.shape[1])])

    # index of the first component whose cumulated explained variance exceeds
    # the threshold; keep every component when none does (e.g. NaN ratios
    # produced by constant features)
    cum_ratio = np.cumsum(pca.explained_variance_ratio_)
    above = np.flatnonzero(cum_ratio > var_threshold)
    n_dim = int(above[0]) if above.size else len(cum_ratio) - 1

    # concat with other dataset features
    kept = X_transform.iloc[:, :n_dim + 1]
    if len(l_other) > 0:
        df_pca = pd.concat((df[l_other].reset_index(drop=True), kept), axis=1)
    else:
        df_pca = kept

    if verbose:
        print("Numerical Dimensions reduction : " + str(len(l_num)) + " - > " + str(n_dim + 1))
        print("explained inertia : " + str(round(cum_ratio[n_dim], 4)))

    return df_pca