Feature Engineering 6 - Handling Imbalanced Dataset
Handling Imbalanced Datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv('Datasets/Creditcard/creditcard.csv')
df.head()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
df.shape
(284807, 31)
df.isna().sum()
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
#To check balance of data
df['Class'].value_counts() # as Class is dependent variable
0 284315
1 492
Name: Class, dtype: int64
- As we can see, the dataset is highly imbalanced: only 492 of the 284,807 transactions (about 0.17%) are fraudulent.
X=df.drop('Class',axis=1) #Independent Features
y=df.Class #Dependent Feature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold,train_test_split,GridSearchCV
log_class=LogisticRegression()
#First approach on the raw imbalanced data: tune LogisticRegression with GridSearchCV and KFold cross-validation
grid_param={'C':10.0**np.arange(-2,3),
'penalty':['l1','l2']} # note: the default lbfgs solver supports only 'l2', so the 'l1' candidates score NaN during the search
cv=KFold(n_splits=5,shuffle=False,random_state=None)
grid_param
{'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'penalty': ['l1', 'l2']}
clf=GridSearchCV(log_class,param_grid=grid_param,n_jobs=-1,cv=cv,scoring='f1_macro')
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=0)
clf.fit(X_train,y_train)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:921: UserWarning: One or more of the test scores are non-finite: [ nan 0.8150325 nan 0.83872168 nan 0.84009661 nan 0.84109159 nan 0.83551738]
/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
estimator=LogisticRegression(), n_jobs=-1,
param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
'penalty': ['l1', 'l2']},
scoring='f1_macro')
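- The NaN scores in the warning above correspond to the penalty='l1' candidates, which the default lbfgs solver does not support. A minimal sketch (not run in this notebook) of a grid that avoids this by switching to the liblinear solver:

log_class_liblinear=LogisticRegression(solver='liblinear',max_iter=1000) # liblinear supports both 'l1' and 'l2' penalties
clf_liblinear=GridSearchCV(log_class_liblinear,param_grid=grid_param,n_jobs=-1,cv=cv,scoring='f1_macro')
#clf_liblinear.fit(X_train,y_train) # would score all ten parameter combinations without NaNs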
y_pred=clf.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85262 34]
[ 53 94]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.73 0.64 0.68 147
accuracy 1.00 85443
macro avg 0.87 0.82 0.84 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9989817773252344
- We should not rely on the accuracy score for an imbalanced dataset.
- We should focus on the precision, recall and f1-score of the minority class instead (see the quick check below).
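- A quick sanity check of the point above: a trivial "model" that predicts every transaction as non-fraud already reaches about 99.8% accuracy while catching zero frauds. A minimal sketch (this baseline is illustrative, not part of the original run):

from sklearn.metrics import recall_score
y_baseline=np.zeros_like(y_test) # predict class 0 for every test transaction
print(accuracy_score(y_test,y_baseline)) # ~0.998 (85296/85443)
print(recall_score(y_test,y_baseline)) # 0.0 recall for the fraud class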
#Trying a RandomForestClassifier on the same imbalanced training data
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
- Explore the class_weight parameter to give more weight to the minority class (a sketch follows the results below)
classifier.fit(X_train,y_train)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85289 7]
[ 34 113]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.94 0.77 0.85 147
accuracy 1.00 85443
macro avg 0.97 0.88 0.92 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995201479348805
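- The class_weight idea mentioned above can be tried without any resampling. A minimal sketch (assumed parameters, not run in this notebook; scores would differ from those above):

weighted_classifier=RandomForestClassifier(class_weight='balanced',random_state=0,n_jobs=-1) # weights classes inversely to their frequencies
weighted_classifier.fit(X_train,y_train)
print(classification_report(y_test,weighted_classifier.predict(X_test)))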
1. Under Sampling
- Reduce the number of data points of the majority class so the two classes become comparable in size.
- Disadvantage
- Loss of data, since most of the majority-class rows are discarded.
- Should be used only when the much smaller dataset that remains is still enough to train on (a simple sketch follows this list; the cells below use NearMiss).
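- The simplest form of under-sampling just drops random majority-class rows. A minimal sketch with imblearn's RandomUnderSampler (the cells below instead use NearMiss, which selects which majority points to keep based on their distance to minority points):

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
rus=RandomUnderSampler(sampling_strategy=0.8,random_state=0)
X_train_rus,y_train_rus=rus.fit_resample(X_train,y_train)
print(Counter(y_train_rus)) # majority reduced to ~431 (=345/0.8), minority stays at 345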
from collections import Counter
from imblearn.under_sampling import NearMiss
Counter(y_train)
Counter({0: 199019, 1: 345})
ns=NearMiss(sampling_strategy=0.8) # target ratio minority:majority = 0.8 after under-sampling
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_ns)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 431, 1: 345})
# sampling_strategy=0.8 means minority/majority = 0.8 after resampling
# so majority samples kept = 345/0.8
345/0.8 # approximate number of majority-class samples kept
431.25
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)
RandomForestClassifier()
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[62817 22479]
[ 8 139]]
classification_report
precision recall f1-score support
0 1.00 0.74 0.85 85296
1 0.01 0.95 0.01 147
accuracy 0.74 85443
macro avg 0.50 0.84 0.43 85443
weighted avg 1.00 0.74 0.85 85443
accuracy_score
0.7368186978453471
2. Over Sampling
- Add more points belonging to the minority category.
- Existing minority points are replicated (sampled with replacement) until the target ratio is reached.
from imblearn.over_sampling import RandomOverSampler
os=RandomOverSampler(sampling_strategy=0.75) # target ratio minority:majority = 0.75 after over-sampling
X_train_os,y_train_os=os.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_os)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 199019, 1: 149264})
#Number of minority samples after oversampling = 0.75 x number of majority samples
199019*0.75
149264.25
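- Because RandomOverSampler samples with replacement, the ~149k minority rows above are backed by only the 345 original fraud rows. A quick check (a sketch using the resampled X_train_os / y_train_os created above):

minority_rows=np.asarray(X_train_os)[np.asarray(y_train_os)==1]
print(len(minority_rows),len(np.unique(minority_rows,axis=0))) # ~149264 rows, but only ~345 unique ones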
classifier.fit(X_train_os,y_train_os)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85290 6]
[ 33 114]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.95 0.78 0.85 147
accuracy 1.00 85443
macro avg 0.97 0.89 0.93 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995435553526912
3. SMOTETomek
- Creates additional new (synthetic) points belonging to the minority category.
- Instead of replicating existing points, SMOTE interpolates between a minority sample and its nearest minority neighbours to create altogether new points (see the sketch below); the Tomek-links step then removes overlapping samples near the class boundary.
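- Conceptually, each synthetic SMOTE point lies on the line segment between a minority sample and one of its nearest minority neighbours. A tiny illustration with made-up 2-D points (not part of the notebook run):

x_i=np.array([1.0,2.0]) # a minority sample
x_nn=np.array([3.0,1.0]) # one of its nearest minority neighbours
lam=np.random.rand() # random interpolation factor in [0,1)
x_new=x_i+lam*(x_nn-x_i) # new synthetic minority point on the segment between them
print(x_new)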
from imblearn.combine import SMOTETomek
smote=SMOTETomek(sampling_strategy=0.5) # target ratio minority:majority = 0.5 after SMOTE
X_train_sm,y_train_sm=smote.fit_resample(X_train,y_train)
print('The number of classes before fit {}'.format(Counter(y_train)))
print('The number of classes after fit {}'.format(Counter(y_train_sm)))
The number of classes before fit Counter({0: 199019, 1: 345})
The number of classes after fit Counter({0: 198085, 1: 98575})
# SMOTE first brings the minority class to 0.5 x the majority count;
# the Tomek-links cleaning then removes a few hundred samples from each class,
# which is why the final counts (198085 / 98575) are slightly lower.
199019*0.5
99509.5
classifier.fit(X_train_sm,y_train_sm)
y_pred=classifier.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[85283 13]
[ 27 120]]
classification_report
precision recall f1-score support
0 1.00 1.00 1.00 85296
1 0.90 0.82 0.86 147
accuracy 1.00 85443
macro avg 0.95 0.91 0.93 85443
weighted avg 1.00 1.00 1.00 85443
accuracy_score
0.9995318516437859
- Oversampling/undersampling should be applied only to the training data, i.e. inside the model-building step after the train-test split; the test set must stay untouched (see the pipeline sketch below).
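- In practice this means putting the sampler inside an imblearn Pipeline, so that cross-validation resamples only the training folds and the validation/test data stays untouched. A minimal sketch (assumed components, not run in this notebook):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
pipe=Pipeline([('smote',SMOTE(sampling_strategy=0.5,random_state=0)), # applied to the training folds only
               ('rf',RandomForestClassifier(random_state=0))])
print(cross_val_score(pipe,X_train,y_train,cv=5,scoring='f1_macro').mean())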
4. Ensemble Technique
- EasyEnsembleClassifier builds an ensemble in which each base learner is trained on a balanced bootstrap sample (all minority points plus a random under-sample of the majority class).
from imblearn.ensemble import EasyEnsembleClassifier
easy=EasyEnsembleClassifier()
easy.fit(X_train,y_train)
EasyEnsembleClassifier()
y_pred=easy.predict(X_test)
print('confusion_matrix\n',confusion_matrix(y_test,y_pred))
print('\nclassification_report\n',classification_report(y_test,y_pred))
print('\naccuracy_score\n',accuracy_score(y_test,y_pred))
confusion_matrix
[[82902 2394]
[ 16 131]]
classification_report
precision recall f1-score support
0 1.00 0.97 0.99 85296
1 0.05 0.89 0.10 147
accuracy 0.97 85443
macro avg 0.53 0.93 0.54 85443
weighted avg 1.00 0.97 0.98 85443
accuracy_score
0.9717940615381014
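- EasyEnsemble pushes recall for the fraud class up to 0.89 here, but precision drops to 0.05. Another option from the same module is BalancedRandomForestClassifier, which under-samples the majority class inside each tree's bootstrap sample. A minimal sketch (not run in this notebook; results would differ from those above):

from imblearn.ensemble import BalancedRandomForestClassifier
brf=BalancedRandomForestClassifier(n_estimators=100,random_state=0)
brf.fit(X_train,y_train)
print(classification_report(y_test,brf.predict(X_test)))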