Feature Engineering 3 - Handling Categorial Variable
Handle Categorial Features
1. One Hot Encoding
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df1=pd.read_csv('Datasets/Titanic/train.csv',usecols=['Embarked','Sex'])
df1
Sex | Embarked | |
---|---|---|
0 | male | S |
1 | female | C |
2 | female | S |
3 | female | S |
4 | male | S |
... | ... | ... |
886 | male | S |
887 | female | S |
888 | female | S |
889 | male | C |
890 | male | Q |
891 rows × 2 columns
pd.get_dummies(df1).head()
Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 1 |
1 | 1 | 0 | 1 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 1 |
3 | 1 | 0 | 0 | 0 | 1 |
4 | 0 | 1 | 0 | 0 | 1 |
- Dummy Variable Trap :- We can explain a variable having n category using n-1 column we actually need not to convert into n columns. Above situation where variable has been split into n columns we can drop a column and still be able to get the same information.
pd.get_dummies(df1,drop_first=True)
Sex_male | Embarked_Q | Embarked_S | |
---|---|---|---|
0 | 1 | 0 | 1 |
1 | 0 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 1 |
... | ... | ... | ... |
886 | 1 | 0 | 1 |
887 | 0 | 0 | 1 |
888 | 0 | 0 | 1 |
889 | 1 | 0 | 0 |
890 | 1 | 1 | 0 |
891 rows × 3 columns
Disadvantages
- If we have many categories in a variable, numbers of column required will be large (we should avoid this case)
One hot Encoding with many categories in a feature
- Take only top n features into account and perform one-hot encoding on the same only
df2=pd.read_csv('Datasets/Mercedes/train.csv',usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
df2.head()
X0 | X1 | X2 | X3 | X4 | X5 | X6 | |
---|---|---|---|---|---|---|---|
0 | k | v | at | a | d | u | j |
1 | k | t | av | e | d | y | l |
2 | az | w | n | c | d | x | j |
3 | az | t | n | f | d | x | l |
4 | az | v | n | f | d | h | d |
df2.nunique()
X0 47
X1 27
X2 44
X3 7
X4 4
X5 29
X6 12
dtype: int64
#We are just taking top 10 categories of a variable
df2.X1.value_counts()[:10]
aa 833
s 598
b 592
l 590
v 408
r 251
i 203
a 143
c 121
o 82
Name: X1, dtype: int64
lst_10=df2.X1.value_counts()[:10].index.tolist()
lst_10
['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']
for columns in df2.columns:
lst_10=df2[columns].value_counts()[:10].index.tolist()
for categories in lst_10:
df2[columns+"_"+categories]=np.where(df2[columns]==categories,1,0)
df2
X0 | X1 | X2 | X3 | X4 | X5 | X6 | X0_z | X0_ak | X0_y | ... | X6_g | X6_j | X6_d | X6_i | X6_l | X6_a | X6_h | X6_k | X6_c | X6_b | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | k | v | at | a | d | u | j | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | k | t | av | e | d | y | l | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
2 | az | w | n | c | d | x | j | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | az | t | n | f | d | x | l | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | az | v | n | f | d | h | d | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4204 | ak | s | as | c | d | aa | d | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4205 | j | o | t | d | d | aa | h | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4206 | ak | v | r | a | d | aa | g | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4207 | al | r | e | f | d | aa | l | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4208 | z | r | ae | c | d | aa | g | 1 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4209 rows × 68 columns
df2.columns
Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X0_z', 'X0_ak', 'X0_y',
'X0_ay', 'X0_t', 'X0_x', 'X0_o', 'X0_f', 'X0_n', 'X0_w', 'X1_aa',
'X1_s', 'X1_b', 'X1_l', 'X1_v', 'X1_r', 'X1_i', 'X1_a', 'X1_c', 'X1_o',
'X2_as', 'X2_ae', 'X2_ai', 'X2_m', 'X2_ak', 'X2_r', 'X2_n', 'X2_s',
'X2_f', 'X2_e', 'X3_c', 'X3_f', 'X3_a', 'X3_d', 'X3_g', 'X3_e', 'X3_b',
'X4_d', 'X4_a', 'X4_b', 'X4_c', 'X5_w', 'X5_v', 'X5_q', 'X5_r', 'X5_d',
'X5_s', 'X5_n', 'X5_m', 'X5_p', 'X5_i', 'X6_g', 'X6_j', 'X6_d', 'X6_i',
'X6_l', 'X6_a', 'X6_h', 'X6_k', 'X6_c', 'X6_b'],
dtype='object')
df2[['X1','X1_aa','X1_s', 'X1_b', 'X1_l', 'X1_v', 'X1_r', 'X1_i', 'X1_a', 'X1_c', 'X1_o']]
X1 | X1_aa | X1_s | X1_b | X1_l | X1_v | X1_r | X1_i | X1_a | X1_c | X1_o | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | v | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | t | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | w | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | t | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | v | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4204 | s | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4205 | o | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4206 | v | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4207 | r | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4208 | r | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4209 rows × 11 columns
2. Ordinal Number encoding
When the category present in the variable can be ordered based on their character/behaviour.
Example :-
- Grading system (A,B,C,D,E)
- User Experience (Excellent,good,average,below average,poor)
- Weekdays in a week (weekdays-0,weekend-1)
import datetime as dt
today_date=dt.datetime.today()
#way to get date of n prior days
today_date-dt.timedelta(15)
datetime.datetime(2021, 3, 19, 13, 40, 5, 705343)
#List comprehension
days=[today_date-dt.timedelta(x) for x in range(15)]
df3=pd.DataFrame(days)
df3.columns=['Day']
df3
Day | |
---|---|
0 | 2021-04-03 13:40:05.705343 |
1 | 2021-04-02 13:40:05.705343 |
2 | 2021-04-01 13:40:05.705343 |
3 | 2021-03-31 13:40:05.705343 |
4 | 2021-03-30 13:40:05.705343 |
5 | 2021-03-29 13:40:05.705343 |
6 | 2021-03-28 13:40:05.705343 |
7 | 2021-03-27 13:40:05.705343 |
8 | 2021-03-26 13:40:05.705343 |
9 | 2021-03-25 13:40:05.705343 |
10 | 2021-03-24 13:40:05.705343 |
11 | 2021-03-23 13:40:05.705343 |
12 | 2021-03-22 13:40:05.705343 |
13 | 2021-03-21 13:40:05.705343 |
14 | 2021-03-20 13:40:05.705343 |
df3['weekday']=df3['Day'].dt.weekday_name
dictionary={
'Monday':1,
'Tuesday':2,
'Wednesday':3,
'Thursday':4,
'Friday':5,
'Saturday':6,
'Sunday':7
}
df3['weekday_ordinal']=df3['weekday'].map(dictionary)
df3
Day | weekday | weekday_ordinal | |
---|---|---|---|
0 | 2021-04-03 13:40:05.705343 | Saturday | 6 |
1 | 2021-04-02 13:40:05.705343 | Friday | 5 |
2 | 2021-04-01 13:40:05.705343 | Thursday | 4 |
3 | 2021-03-31 13:40:05.705343 | Wednesday | 3 |
4 | 2021-03-30 13:40:05.705343 | Tuesday | 2 |
5 | 2021-03-29 13:40:05.705343 | Monday | 1 |
6 | 2021-03-28 13:40:05.705343 | Sunday | 7 |
7 | 2021-03-27 13:40:05.705343 | Saturday | 6 |
8 | 2021-03-26 13:40:05.705343 | Friday | 5 |
9 | 2021-03-25 13:40:05.705343 | Thursday | 4 |
10 | 2021-03-24 13:40:05.705343 | Wednesday | 3 |
11 | 2021-03-23 13:40:05.705343 | Tuesday | 2 |
12 | 2021-03-22 13:40:05.705343 | Monday | 1 |
13 | 2021-03-21 13:40:05.705343 | Sunday | 7 |
14 | 2021-03-20 13:40:05.705343 | Saturday | 6 |
3. Count or Frequency encoding
train_set=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None,index_col=None)
train_set
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
columns=[1,3,5,6,7,8,9,13]
train_set=train_set[columns]
train_set.head()
1 | 3 | 5 | 6 | 7 | 8 | 9 | 13 | |
---|---|---|---|---|---|---|---|---|
0 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | United-States |
1 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | United-States |
2 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | United-States |
3 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | United-States |
4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | Cuba |
#train_set[1].value_counts()
train_set.columns=['Employment','Degree','Status','Designation','Family_job','Race','Sex','Country']
train_set
Employment | Degree | Status | Designation | Family_job | Race | Sex | Country | |
---|---|---|---|---|---|---|---|---|
0 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | United-States |
1 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | United-States |
2 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | United-States |
3 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | United-States |
4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | Cuba |
... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | Private | Assoc-acdm | Married-civ-spouse | Tech-support | Wife | White | Female | United-States |
32557 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | United-States |
32558 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | Female | United-States |
32559 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | Male | United-States |
32560 | Self-emp-inc | HS-grad | Married-civ-spouse | Exec-managerial | Wife | White | Female | United-States |
32561 rows × 8 columns
train_set.nunique()
Employment 9
Degree 16
Status 7
Designation 15
Family_job 6
Race 5
Sex 2
Country 42
dtype: int64
train_set['Country'].value_counts().to_dict()
{' United-States': 29170,
' Mexico': 643,
' ?': 583,
' Philippines': 198,
' Germany': 137,
' Canada': 121,
' Puerto-Rico': 114,
' El-Salvador': 106,
' India': 100,
' Cuba': 95,
' England': 90,
' Jamaica': 81,
' South': 80,
' China': 75,
' Italy': 73,
' Dominican-Republic': 70,
' Vietnam': 67,
' Guatemala': 64,
' Japan': 62,
' Poland': 60,
' Columbia': 59,
' Taiwan': 51,
' Haiti': 44,
' Iran': 43,
' Portugal': 37,
' Nicaragua': 34,
' Peru': 31,
' France': 29,
' Greece': 29,
' Ecuador': 28,
' Ireland': 24,
' Hong': 20,
' Cambodia': 19,
' Trinadad&Tobago': 19,
' Thailand': 18,
' Laos': 18,
' Yugoslavia': 16,
' Outlying-US(Guam-USVI-etc)': 14,
' Honduras': 13,
' Hungary': 13,
' Scotland': 12,
' Holand-Netherlands': 1}
for columns in train_set:
dic=train_set[columns].value_counts().to_dict()
train_set[columns+"_encoded"]=train_set[columns].map(dic)
train_set
Employment | Degree | Status | Designation | Family_job | Race | Sex | Country | Employment_encoded | Degree_encoded | Status_encoded | Designation_encoded | Family_job_encoded | Race_encoded | Sex_encoded | Country_encoded | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | United-States | 1298 | 5355 | 10683 | 3770 | 8305 | 27816 | 21790 | 29170 |
1 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | United-States | 2541 | 5355 | 14976 | 4066 | 13193 | 27816 | 21790 | 29170 |
2 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | United-States | 22696 | 10501 | 4443 | 1370 | 8305 | 27816 | 21790 | 29170 |
3 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | United-States | 22696 | 1175 | 14976 | 1370 | 13193 | 3124 | 21790 | 29170 |
4 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | Cuba | 22696 | 5355 | 14976 | 4140 | 1568 | 3124 | 10771 | 95 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | Private | Assoc-acdm | Married-civ-spouse | Tech-support | Wife | White | Female | United-States | 22696 | 1067 | 14976 | 928 | 1568 | 27816 | 10771 | 29170 |
32557 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | United-States | 22696 | 10501 | 14976 | 2002 | 13193 | 27816 | 21790 | 29170 |
32558 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | Female | United-States | 22696 | 10501 | 993 | 3770 | 3446 | 27816 | 10771 | 29170 |
32559 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | Male | United-States | 22696 | 10501 | 10683 | 3770 | 5068 | 27816 | 21790 | 29170 |
32560 | Self-emp-inc | HS-grad | Married-civ-spouse | Exec-managerial | Wife | White | Female | United-States | 1116 | 10501 | 14976 | 4066 | 1568 | 27816 | 10771 | 29170 |
32561 rows × 16 columns
Advantages
- Easy to use
- Not increasing feature space
Disadvantages
- It will provide the same weight if frequency of two categories are same, model will not be able to distinguish between them after encoding.
4. Target Guided Ordinal Encoding
- Ordering the labels according to the target variable
- Replace the labels by the joint probability of being 1 or 0
df4=pd.read_csv('Datasets/Titanic/train.csv',usecols=['Cabin','Survived'])
df4
Survived | Cabin | |
---|---|---|
0 | 0 | NaN |
1 | 1 | C85 |
2 | 1 | NaN |
3 | 1 | C123 |
4 | 0 | NaN |
... | ... | ... |
886 | 0 | NaN |
887 | 1 | B42 |
888 | 0 | NaN |
889 | 1 | C148 |
890 | 0 | NaN |
891 rows × 2 columns
df4.Cabin.fillna('Missing',inplace=True)
df4.head()
Survived | Cabin | |
---|---|---|
0 | 0 | Missing |
1 | 1 | C85 |
2 | 1 | Missing |
3 | 1 | C123 |
4 | 0 | Missing |
#Just trying to get cabing group ignoring seat number of the corresponding cabin
df4['Cabin']=df4.Cabin.str[:1]
df4
Survived | Cabin | |
---|---|---|
0 | 0 | M |
1 | 1 | C |
2 | 1 | M |
3 | 1 | C |
4 | 0 | M |
... | ... | ... |
886 | 0 | M |
887 | 1 | B |
888 | 0 | M |
889 | 1 | C |
890 | 0 | M |
891 rows × 2 columns
df4.Cabin.unique()
array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
#percentage of people survived in each cabin group
df4.groupby(['Cabin']).Survived.mean().sort_values()
Cabin
T 0.000000
M 0.299854
A 0.466667
G 0.500000
C 0.593220
F 0.615385
B 0.744681
E 0.750000
D 0.757576
Name: Survived, dtype: float64
df4.groupby(['Cabin']).Survived.mean().sort_values().index
Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')
ordinal_labels=df4.groupby(['Cabin']).Survived.mean().sort_values().index
#Dictionary Comprehension
ordinal_labels2={k:i for i,k in enumerate(ordinal_labels,0)}
ordinal_labels2
{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}
df4['Cabin_Ordinal']=df4.Cabin.map(ordinal_labels2)
df4
Survived | Cabin | Cabin_Ordinal | |
---|---|---|---|
0 | 0 | M | 1 |
1 | 1 | C | 4 |
2 | 1 | M | 1 |
3 | 1 | C | 4 |
4 | 0 | M | 1 |
... | ... | ... | ... |
886 | 0 | M | 1 |
887 | 1 | B | 6 |
888 | 0 | M | 1 |
889 | 1 | C | 4 |
890 | 0 | M | 1 |
891 rows × 3 columns
5. Mean Encoding
- Replacing the category by the mean value corresponding to target variable
df5=df4.copy()
mean_ordinal=df5.groupby(['Cabin']).Survived.mean().sort_values().to_dict()
mean_ordinal
{'T': 0.0,
'M': 0.29985443959243085,
'A': 0.4666666666666667,
'G': 0.5,
'C': 0.5932203389830508,
'F': 0.6153846153846154,
'B': 0.7446808510638298,
'E': 0.75,
'D': 0.7575757575757576}
df5['Cabin_mean_ordinal']=df5.Cabin.map(mean_ordinal)
df5
Survived | Cabin | Cabin_Ordinal | Cabin_mean_ordinal | |
---|---|---|---|---|
0 | 0 | M | 1 | 0.299854 |
1 | 1 | C | 4 | 0.593220 |
2 | 1 | M | 1 | 0.299854 |
3 | 1 | C | 4 | 0.593220 |
4 | 0 | M | 1 | 0.299854 |
... | ... | ... | ... | ... |
886 | 0 | M | 1 | 0.299854 |
887 | 1 | B | 6 | 0.744681 |
888 | 0 | M | 1 | 0.299854 |
889 | 1 | C | 4 | 0.593220 |
890 | 0 | M | 1 | 0.299854 |
891 rows × 4 columns
Advantages
- It captures information within the label therefore rendering more predictive features
- It creates a monotonic relationship between variable and target
Disadvantages
- Sometimes it leads to overfitting
6. Probability Ratio Encoding
- replacing category by odds w.r.t target variable
- Odds = p/(1-p) where p is probability of target variable in a category
df6=pd.read_csv('Datasets/Titanic/train.csv',usecols=['Cabin','Survived'])
df6.head()
Survived | Cabin | |
---|---|---|
0 | 0 | NaN |
1 | 1 | C85 |
2 | 1 | NaN |
3 | 1 | C123 |
4 | 0 | NaN |
# Replacing NAN with new category - Missing
df6.Cabin.fillna('Missing',inplace=True)
df6.head()
Survived | Cabin | |
---|---|---|
0 | 0 | Missing |
1 | 1 | C85 |
2 | 1 | Missing |
3 | 1 | C123 |
4 | 0 | Missing |
df6.Cabin=df6.Cabin.str[:1]
df6
Survived | Cabin | |
---|---|---|
0 | 0 | M |
1 | 1 | C |
2 | 1 | M |
3 | 1 | C |
4 | 0 | M |
... | ... | ... |
886 | 0 | M |
887 | 1 | B |
888 | 0 | M |
889 | 1 | C |
890 | 0 | M |
891 rows × 2 columns
df6.Cabin.unique()
array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
# Probabily of surviving a person in the cabin
prob_df=pd.DataFrame(df6.groupby('Cabin')['Survived'].mean())
prob_df
Survived | |
---|---|
Cabin | |
A | 0.466667 |
B | 0.744681 |
C | 0.593220 |
D | 0.757576 |
E | 0.750000 |
F | 0.615385 |
G | 0.500000 |
M | 0.299854 |
T | 0.000000 |
# Probability of person died in the cabin
prob_df['Died']=1-prob_df.Survived
prob_df
Survived | Died | |
---|---|---|
Cabin | ||
A | 0.466667 | 0.533333 |
B | 0.744681 | 0.255319 |
C | 0.593220 | 0.406780 |
D | 0.757576 | 0.242424 |
E | 0.750000 | 0.250000 |
F | 0.615385 | 0.384615 |
G | 0.500000 | 0.500000 |
M | 0.299854 | 0.700146 |
T | 0.000000 | 1.000000 |
# Now probability ratio = prob(Survived) / prob(Died)
prob_df['Probability_ratio']=prob_df['Survived']/prob_df['Died']
prob_df
Survived | Died | Probability_ratio | |
---|---|---|---|
Cabin | |||
A | 0.466667 | 0.533333 | 0.875000 |
B | 0.744681 | 0.255319 | 2.916667 |
C | 0.593220 | 0.406780 | 1.458333 |
D | 0.757576 | 0.242424 | 3.125000 |
E | 0.750000 | 0.250000 | 3.000000 |
F | 0.615385 | 0.384615 | 1.600000 |
G | 0.500000 | 0.500000 | 1.000000 |
M | 0.299854 | 0.700146 | 0.428274 |
T | 0.000000 | 1.000000 | 0.000000 |
# We can use this probability_ratio to encode Cabin feature
df6['Cabin_prob_encoded']=df6.Cabin.map(prob_df['Probability_ratio'].to_dict())
df6
Survived | Cabin | Cabin_prob_encoded | |
---|---|---|---|
0 | 0 | M | 0.428274 |
1 | 1 | C | 1.458333 |
2 | 1 | M | 0.428274 |
3 | 1 | C | 1.458333 |
4 | 0 | M | 0.428274 |
... | ... | ... | ... |
886 | 0 | M | 0.428274 |
887 | 1 | B | 2.916667 |
888 | 0 | M | 0.428274 |
889 | 1 | C | 1.458333 |
890 | 0 | M | 0.428274 |
891 rows × 3 columns