Feature Engineering 6 - Handling Imbalanced Dataset

4 minute read

Handling Imbalanced Datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv('Datasets/Creditcard/creditcard.csv')
df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

df.shape
(284807, 31)
df.isna().sum()
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
#To check the balance of the data

df['Class'].value_counts() # Class is the dependent variable
0    284315
1       492
Name: Class, dtype: int64
  • As we can see, the dataset is highly imbalanced: only 492 of the 284,807 transactions (about 0.17%) are fraudulent.
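  • A quick bar plot of the class counts makes the imbalance obvious; a minimal sketch using the seaborn/matplotlib imports above (not part of the original run):
#Sketch: visualize the class distribution
sns.countplot(x='Class', data=df)
plt.title('Class counts (0 = legitimate, 1 = fraud)')
plt.show()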
X=df.drop('Class',axis=1) #Independent Features
y=df.Class                #Dependent Feature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold,train_test_split,GridSearchCV
log_class=LogisticRegression()
#One solution we should try on an imbalanced dataset: tune hyperparameters with GridSearchCV and KFold cross-validation
grid_param={'C':10.0**np.arange(-2,3),
           'penalty':['l1','l2']} #note: the default lbfgs solver only supports 'l2', so the 'l1' fits fail (the NaN scores below)

cv=KFold(n_splits=5,shuffle=False,random_state=None)
grid_param
{'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'penalty': ['l1', 'l2']}
clf=GridSearchCV(log_class,param_grid=grid_param,n_jobs=-1,cv=cv,scoring='f1_macro')
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=0)
clf.fit(X_train,y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:921: UserWarning: One or more of the test scores are non-finite: [       nan 0.8150325         nan 0.83872168        nan 0.84009661
        nan 0.84109159        nan 0.83551738]
  category=UserWarning
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')
y_pred=clf.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[85262    34]
 [   53    94]]

classification_report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.73      0.64      0.68       147

    accuracy                           1.00     85443
   macro avg       0.87      0.82      0.84     85443
weighted avg       1.00      1.00      1.00     85443


accuracy_score
 0.9989817773252344
  • We should not rely on the accuracy score for an imbalanced dataset: predicting every transaction as non-fraudulent would already score about 99.8%.
  • Instead, focus on the precision, recall and F1-score of the minority class.
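  • The NaN scores in the grid search above come from penalty='l1', which the default lbfgs solver does not support; a minimal sketch (an assumption, not the original notebook's code) that switches to the liblinear solver and scales the features to address the convergence warning:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#liblinear supports both 'l1' and 'l2'; scaling Time/Amount helps convergence
pipe=Pipeline([('scale',StandardScaler()),
               ('log',LogisticRegression(solver='liblinear'))])
pipe_param={'log__C':10.0**np.arange(-2,3),
            'log__penalty':['l1','l2']}
clf2=GridSearchCV(pipe,param_grid=pipe_param,n_jobs=-1,cv=cv,scoring='f1_macro')
#clf2.fit(X_train,y_train)  # then predict and evaluate exactly as above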
#Trying to implement RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
  • Explore the class_weight parameter to give more weight to the minority class (see the sketch after the results below)
classifier.fit(X_train,y_train)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[85289     7]
 [   34   113]]

classification_report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.94      0.77      0.85       147

    accuracy                           1.00     85443
   macro avg       0.97      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443


accuracy_score
 0.9995201479348805
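  • A sketch of the class_weight idea mentioned above (an assumption, not part of the original run): the forest penalises mistakes on the rare class more heavily instead of resampling the data.
#class_weight='balanced' weights classes inversely to their frequency; a dict such as {0:1, 1:100} also works
weighted_clf=RandomForestClassifier(class_weight='balanced')
weighted_clf.fit(X_train,y_train)
#evaluate with confusion_matrix / classification_report exactly as above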

1. Under Sampling

  • Reduce the number of data points belonging to the majority class

  • Disadvantage

    • Loss of data
    • Prefer it only when the dataset is large enough that discarding majority-class samples is affordable
from collections import Counter
from imblearn.under_sampling import NearMiss
Counter(y_train)
Counter({0: 199019, 1: 345})
ns=NearMiss(0.8)  # 0.8 is the sampling_strategy: the desired minority/majority ratio after under-sampling
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_ns)))
/opt/anaconda3/lib/python3.7/site-packages/imblearn/utils/_validation.py:591: FutureWarning: Pass sampling_strategy=0.8 as keyword args. From version 0.9 passing these as positional arguments will result in an error
  FutureWarning,


The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 431, 1: 345})
# 345 minority samples = 0.8 of x, so x = 345/0.8

345/0.8  # approximate number of majority samples kept after under-sampling
431.25
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[62817 22479]
 [    8   139]]

classification_report
               precision    recall  f1-score   support

           0       1.00      0.74      0.85     85296
           1       0.01      0.95      0.01       147

    accuracy                           0.74     85443
   macro avg       0.50      0.84      0.43     85443
weighted avg       1.00      0.74      0.85     85443


accuracy_score
 0.7368186978453471
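  • NearMiss keeps the majority samples closest to the minority class, which here destroys precision (0.01); a minimal sketch (an assumption) of plain random under-sampling as a simpler alternative:
from imblearn.under_sampling import RandomUnderSampler

rus=RandomUnderSampler(sampling_strategy=0.8,random_state=0)
X_train_rus,y_train_rus=rus.fit_resample(X_train,y_train)
print(Counter(y_train_rus))  # roughly Counter({0: 431, 1: 345})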

2. Over Sampling

  • Add more points belonging to the minority class
  • Existing minority points are replicated (sampled with replacement) until the desired ratio is reached.
from imblearn.over_sampling import RandomOverSampler

os=RandomOverSampler(0.75)  # 0.75 is the sampling_strategy: the desired minority/majority ratio after over-sampling
X_train_os,y_train_os=os.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_os)))
/opt/anaconda3/lib/python3.7/site-packages/imblearn/utils/_validation.py:591: FutureWarning: Pass sampling_strategy=0.75 as keyword args. From version 0.9 passing these as positional arguments will result in an error
  FutureWarning,


The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 199019, 1: 149264})
#Number of minority samples after oversampling = 0.75 * number of majority samples
199019*0.75
149264.25
classifier.fit(X_train_os,y_train_os)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[85290     6]
 [   33   114]]

classification_report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.95      0.78      0.85       147

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443


accuracy_score
 0.9995435553526912

3. SMOTETomek

  • SMOTE creates additional synthetic points for the minority class by interpolating between a minority sample and its nearest neighbours.
  • The new points are not copies of existing ones; the Tomek-links step then removes overlapping samples near the class boundary.
from imblearn.combine import SMOTETomek
smote=SMOTETomek(0.5)  # 0.5 is the sampling_strategy: the desired minority/majority ratio before Tomek-link cleaning

X_train_sm,y_train_sm=smote.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_sm)))

The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 198085, 1: 98575})
#rough check: about 0.5 * majority count; the exact counts differ slightly because Tomek-link removal drops samples from both classes
198085*0.5
99042.5
classifier.fit(X_train_sm,y_train_sm)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[85283    13]
 [   27   120]]

classification_report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.90      0.82      0.86       147

    accuracy                           1.00     85443
   macro avg       0.95      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443


accuracy_score
 0.9995318516437859
  • Over-sampling/under-sampling should be applied only to the training data during model building, never to the test set, to avoid leakage (see the pipeline sketch below)
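  • A minimal sketch (an assumption, not part of the original run) using imblearn's own Pipeline, which fits the sampler only on the training folds of each CV split so the validation data stays untouched:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score

imb_pipe=ImbPipeline([('smote',SMOTETomek(sampling_strategy=0.5)),
                      ('rf',RandomForestClassifier())])
scores=cross_val_score(imb_pipe,X_train,y_train,cv=3,scoring='f1_macro')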

4. Ensemble Technique

  • EasyEnsembleClassifier builds an ensemble of AdaBoost learners, each trained on a balanced bootstrap sample obtained by randomly under-sampling the majority class.
from imblearn.ensemble import EasyEnsembleClassifier
easy=EasyEnsembleClassifier()
easy.fit(X_train,y_train)
EasyEnsembleClassifier()

y_pred=easy.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
 [[82902  2394]
 [   16   131]]

classification_report
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     85296
           1       0.05      0.89      0.10       147

    accuracy                           0.97     85443
   macro avg       0.53      0.93      0.54     85443
weighted avg       1.00      0.97      0.98     85443


accuracy_score
 0.9717940615381014
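  • EasyEnsembleClassifier recovers most frauds (recall 0.89) but at very low precision (0.05); BalancedRandomForestClassifier from the same imblearn.ensemble module is another option worth comparing (a sketch, assuming the same train/test split; not part of the original notebook):
from imblearn.ensemble import BalancedRandomForestClassifier

brf=BalancedRandomForestClassifier(n_estimators=100,random_state=0)
brf.fit(X_train,y_train)
y_pred=brf.predict(X_test)
print(classification_report(y_test,y_pred))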