Feature Engineering 6 - Handling Imbalanced Dataset
Handling Imbalanced Datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv('Datasets/Creditcard/creditcard.csv')
df.head()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
df.shape
(284807, 31)
df.isna().sum()
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
#To check balance of data
df['Class'].value_counts() # as Class is dependent variable
0 284315
1 492
Name: Class, dtype: int64
- As we can see, the dataset is highly imbalanced: only 492 of the 284,807 transactions (about 0.17%) are fraudulent.
X=df.drop('Class',axis=1) #Independent Features
y=df.Class #Dependent Feature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold,train_test_split,GridSearchCV
log_class=LogisticRegression()
#First approach on the raw imbalanced data: tune LogisticRegression with GridSearchCV and KFold cross-validation
grid_param={'C':10.0**np.arange(-2,3),
'penalty':['l1','l2']} # note: the default lbfgs solver supports only 'l2', so the 'l1' candidates score NaN during the search
cv=KFold(n_splits=5,shuffle=False,random_state=None)
grid_param
{'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'penalty': ['l1', 'l2']}
clf=GridSearchCV(log_class,param_grid=grid_param,n_jobs=-1,cv=cv,scoring='f1_macro')
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=0)
clf.fit(X_train,y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:921: UserWarning: One or more of the test scores are non-finite: [ nan 0.8150325 nan 0.83872168 nan 0.84009661 nan 0.84109159 nan 0.83551738]
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
'penalty': ['l1', 'l2']},
scoring='f1_macro')
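- The NaN scores in the warning above correspond to the penalty='l1' candidates, which the default lbfgs solver does not support. A minimal sketch (not run in this notebook) of a grid that avoids this by switching to the liblinear solver:

log_class_liblinear=LogisticRegression(solver='liblinear',max_iter=1000) # liblinear supports both 'l1' and 'l2' penalties
clf_liblinear=GridSearchCV(log_class_liblinear,param_grid=grid_param,n_jobs=-1,cv=cv,scoring='f1_macro')
#clf_liblinear.fit(X_train,y_train) # would score all ten parameter combinations without NaNs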
y_pred=clf.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85262 34]
[ 53 94]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.73 0.64 0.68 147
accuracy 1.00 85443
macro avg 0.87 0.82 0.84 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9989817773252344
- We should not rely on the accuracy score for an imbalanced dataset.
- We should focus on the precision, recall and f1-score of the minority class instead (see the quick check below).
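- A quick sanity check of the point above: a trivial "model" that predicts every transaction as non-fraud already reaches about 99.8% accuracy while catching zero frauds. A minimal sketch (this baseline is illustrative, not part of the original run):

from sklearn.metrics import recall_score
y_baseline=np.zeros_like(y_test) # predict class 0 for every test transaction
print(accuracy_score(y_test,y_baseline)) # ~0.998 (85296/85443)
print(recall_score(y_test,y_baseline)) # 0.0 recall for the fraud class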
#Trying a RandomForestClassifier on the same imbalanced training data
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
- Explore the class_weight parameter to give more weight to the minority class (a sketch follows the results below)
classifier.fit(X_train,y_train)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85289 7]
[ 34 113]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.94 0.77 0.85 147
accuracy 1.00 85443
macro avg 0.97 0.88 0.92 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995201479348805
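- The class_weight idea mentioned above can be tried without any resampling. A minimal sketch (assumed parameters, not run in this notebook; scores would differ from those above):

weighted_classifier=RandomForestClassifier(class_weight='balanced',random_state=0,n_jobs=-1) # weights classes inversely to their frequencies
weighted_classifier.fit(X_train,y_train)
print(classification_report(y_test,weighted_classifier.predict(X_test)))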
1. Under Sampling
- Reduce the number of data points of the majority class so the two classes become comparable in size.
- Disadvantage
- Loss of data, since most of the majority-class rows are discarded.
- Should be used only when the much smaller dataset that remains is still enough to train on (a simple sketch follows this list; the cells below use NearMiss).
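- The simplest form of under-sampling just drops random majority-class rows. A minimal sketch with imblearn's RandomUnderSampler (the cells below instead use NearMiss, which selects which majority points to keep based on their distance to minority points):

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
rus=RandomUnderSampler(sampling_strategy=0.8,random_state=0)
X_train_rus,y_train_rus=rus.fit_resample(X_train,y_train)
print(Counter(y_train_rus)) # majority reduced to ~431 (=345/0.8), minority stays at 345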
from collections import Counter
from imblearn.under_sampling import NearMiss
Counter(y_train)
Counter({0: 199019, 1: 345})
ns=NearMiss(sampling_strategy=0.8) # target ratio minority:majority = 0.8 after under-sampling
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_ns)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 431, 1: 345})
# sampling_strategy=0.8 means minority/majority = 0.8 after resampling
# so majority samples kept = 345/0.8
345/0.8 # approximate number of majority-class samples kept
431.25
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[62817 22479]
[ 8 139]]
classification_report
precision recall f1-score support
0 1.00 0.74 0.85 85296
1 0.01 0.95 0.01 147
accuracy 0.74 85443
macro avg 0.50 0.84 0.43 85443
weighted avg 1.00 0.74 0.85 85443
accuracy_score
0.7368186978453471
2. Over Sampling
- Add more points belonging to the minority category.
- Existing minority points are replicated (sampled with replacement) until the target ratio is reached.
from imblearn.over_sampling import RandomOverSampler
os=RandomOverSampler(sampling_strategy=0.75) # target ratio minority:majority = 0.75 after over-sampling
X_train_os,y_train_os=os.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_os)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 199019, 1: 149264})
#Number of minority samples after oversampling = 0.75 x number of majority samples
199019*0.75
149264.25
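- Because RandomOverSampler samples with replacement, the ~149k minority rows above are backed by only the 345 original fraud rows. A quick check (a sketch using the resampled X_train_os / y_train_os created above):

minority_rows=np.asarray(X_train_os)[np.asarray(y_train_os)==1]
print(len(minority_rows),len(np.unique(minority_rows,axis=0))) # ~149264 rows, but only ~345 unique ones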
classifier.fit(X_train_os,y_train_os)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85290 6]
[ 33 114]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.95 0.78 0.85 147
accuracy 1.00 85443
macro avg 0.97 0.89 0.93 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995435553526912
3. SMOTETomek
- Creates additional new (synthetic) points belonging to the minority category.
- Instead of replicating existing points, SMOTE interpolates between a minority sample and its nearest minority neighbours to create altogether new points (see the sketch below); the Tomek-links step then removes overlapping samples near the class boundary.
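- Conceptually, each synthetic SMOTE point lies on the line segment between a minority sample and one of its nearest minority neighbours. A tiny illustration with made-up 2-D points (not part of the notebook run):

x_i=np.array([1.0,2.0]) # a minority sample
x_nn=np.array([3.0,1.0]) # one of its nearest minority neighbours
lam=np.random.rand() # random interpolation factor in [0,1)
x_new=x_i+lam*(x_nn-x_i) # new synthetic minority point on the segment between them
print(x_new)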
from imblearn.combine import SMOTETomek
smote=SMOTETomek(sampling_strategy=0.5) # target ratio minority:majority = 0.5 after SMOTE
X_train_sm,y_train_sm=smote.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_sm)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 198085, 1: 98575})
# SMOTE first brings the minority class to 0.5 x the majority count;
# the Tomek-links cleaning then removes a few hundred samples from each class,
# which is why the final counts (198085 / 98575) are slightly lower.
199019*0.5
99509.5
classifier.fit(X_train_sm,y_train_sm)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85283 13]
[ 27 120]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.90 0.82 0.86 147
accuracy 1.00 85443
macro avg 0.95 0.91 0.93 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995318516437859
- Oversampling/undersampling should be applied only to the training data, i.e. inside the model-building step after the train-test split; the test set must stay untouched (see the pipeline sketch below).
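- In practice this means putting the sampler inside an imblearn Pipeline, so that cross-validation resamples only the training folds and the validation/test data stays untouched. A minimal sketch (assumed components, not run in this notebook):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
pipe=Pipeline([('smote',SMOTE(sampling_strategy=0.5,random_state=0)), # applied to the training folds only
               ('rf',RandomForestClassifier(random_state=0))])
print(cross_val_score(pipe,X_train,y_train,cv=5,scoring='f1_macro').mean())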
4. Ensemble Technique
- EasyEnsembleClassifier builds an ensemble in which each base learner is trained on a balanced bootstrap sample (all minority points plus a random under-sample of the majority class).
from imblearn.ensemble import EasyEnsembleClassifier
easy=EasyEnsembleClassifier()
easy.fit(X_train,y_train)
EasyEnsembleClassifier()
y_pred=easy.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[82902 2394]
[ 16 131]]
classification_report
precision recall f1-score support
0 1.00 0.97 0.99 85296
1 0.05 0.89 0.10 147
accuracy 0.97 85443
macro avg 0.53 0.93 0.54 85443
weighted avg 1.00 0.97 0.98 85443
accuracy_score
0.9717940615381014
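- EasyEnsemble pushes recall for the fraud class up to 0.89 here, but precision drops to 0.05. Another option from the same module is BalancedRandomForestClassifier, which under-samples the majority class inside each tree's bootstrap sample. A minimal sketch (not run in this notebook; results would differ from those above):

from imblearn.ensemble import BalancedRandomForestClassifier
brf=BalancedRandomForestClassifier(n_estimators=100,random_state=0)
brf.fit(X_train,y_train)
print(classification_report(y_test,brf.predict(X_test)))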