
Class Weights for Handling Imbalanced Datasets

In scikit-learn, many classifiers come with a built-in mechanism for handling imbalanced classes. If the classes are highly imbalanced and we have not addressed that during preprocessing, we can use the class_weight parameter to weight the classes so that errors on the rare class cost the model proportionally more. Specifically, the 'balanced' argument automatically weights each class inversely proportionally to its frequency.
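As a quick illustration before the full workflow: the 'balanced' heuristic assigns each class the weight n_samples / (n_classes * n_samples_in_class). The toy label array below is made up purely for demonstration; the helper is the same compute_class_weight that this notebook calls later on the real data.

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy labels: 8 negatives, 2 positives (illustrative data only)
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

# 'balanced' weight for class c = n_samples / (n_classes * count(c))
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
print(dict(zip(np.unique(y), weights)))  # {0: 0.625, 1: 2.5}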

In [1]:
import matplotlib.pyplot as plt  # used by generate_auc_roc_curve below
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from pylab import rcParams
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
In [2]:
%matplotlib inline
rcParams['figure.figsize'] = 10, 6
warnings.filterwarnings('ignore')
sns.set(style="darkgrid")
In [3]:
def generate_model_report(y_actual, y_predicted):
    # Print the headline classification metrics for a set of predictions
    print("Accuracy = ", accuracy_score(y_actual, y_predicted))
    print("Precision = ", precision_score(y_actual, y_predicted))
    print("Recall = ", recall_score(y_actual, y_predicted))
    print("F1 Score = ", f1_score(y_actual, y_predicted))
In [4]:
def generate_auc_roc_curve(clf, X_test):
    # Plot the ROC curve for clf; relies on the Y_test split defined below
    # and on the matplotlib import at the top.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(Y_test, y_pred_proba)
    auc = roc_auc_score(Y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="ROC curve (AUC = " + str(auc) + ")")
    plt.legend(loc=4)
    plt.show()
In [5]:
df = pd.read_csv('creditcard.csv')
In [6]:
df.head()
Out[6]:
[First five rows of df: the columns are Time, the anonymized PCA features V1–V28, Amount, and the target Class.]

5 rows × 31 columns

In [7]:
target = 'Class'
In [8]:
X = df.loc[:, df.columns!=target]
In [9]:
Y = df.loc[:, df.columns==target]
In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                    test_size=0.33, 
                                                    random_state=42)
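An optional tweak, not used in the run below (so all of the numbers that follow come from the plain split above): with a positive rate of only ~0.17%, stratifying the split guarantees the same class ratio in train and test.

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42,
    stratify=Y)  # preserves the class ratio in both splits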
In [11]:
ax = sns.countplot(x=target, data=df)
print(df[target].value_counts())
0    284315
1       492
Name: Class, dtype: int64
In [12]:
100* (492/float(df.shape[0]))
Out[12]:
0.1727485630620034
In [13]:
100* (284315/float(df.shape[0]))
Out[13]:
99.82725143693798
In [14]:
Y_train[target].value_counts()
Out[14]:
0    190477
1       343
Name: Class, dtype: int64
In [15]:
clf = LogisticRegression().fit(X_train, Y_train)
In [16]:
Y_Test_Pred = clf.predict(X_test)
In [17]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'), 
            pd.Series(Y_test[target], name = 'Actual'))
Out[17]:
Actual         0    1
Predicted
0          93800   78
1             38   71
In [18]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.9987657867577431
Precision =  0.6513761467889908
Recall =  0.47651006711409394
F1 Score =  0.5503875968992248
In [19]:
generate_auc_roc_curve(clf, X_test)

class_weight='balanced'

In [20]:
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html
In [21]:
unique_classes = list(df[target].unique())
unique_classes
Out[21]:
[0, 1]
In [22]:
out_dict = {}
for classes in unique_classes:
    out_dict[classes] = df.shape[0]/((df.loc[df[target] == classes].shape[0])
                                     *len(unique_classes))
In [23]:
out_dict
Out[23]:
{0: 0.5008652375006595, 1: 289.4380081300813}
In [24]:
print (X_train.shape, Y_train.shape)
(190820, 30) (190820, 1)
In [25]:
clf = LogisticRegression(class_weight='balanced').fit(X_train, Y_train)
In [26]:
from sklearn.utils import class_weight
In [27]:
class_weight.compute_class_weight('balanced',
                                  classes=np.unique(Y_train[target]),
                                  y=Y_train[target])
Out[27]:
array([  0.50090037, 278.16326531])
In [28]:
Y_Test_Pred = clf.predict(X_test)
In [29]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'), 
            pd.Series(Y_test[target], name = 'Actual'))
Out[29]:
Actual         0    1
Predicted
0          91548   13
1           2290  136
In [30]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.9754966112334684
Precision =  0.05605935696619951
Recall =  0.912751677852349
F1 Score =  0.10563106796116506
In [31]:
generate_auc_roc_curve(clf, X_test)
In [32]:
weights = np.linspace(0.05, 0.95, 20)
gsc = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={
        'class_weight': [{0: x, 1: 1.0-x} for x in weights]
    },
    scoring='f1',
    cv=5
)

grid_result = gsc.fit(X_train, Y_train)
print("Best parameters : %s" % grid_result.best_params_)
Best parameters : {'class_weight': {0: 0.09736842105263158, 1: 0.9026315789473685}}
In [33]:
data_out = pd.DataFrame({'score': grid_result.cv_results_['mean_test_score'],
                       'weight': weights })
data_out.plot(x='weight')
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a570ac8>
In [34]:
data_out
Out[34]:
       score    weight
0   0.703298  0.050000
1   0.770741  0.097368
2   0.763401  0.144737
3   0.753784  0.192105
4   0.764725  0.239474
5   0.759793  0.286842
6   0.750428  0.334211
7   0.708962  0.381579
8   0.730055  0.428947
9   0.674311  0.476316
10  0.681596  0.523684
11  0.601097  0.571053
12  0.599889  0.618421
13  0.538431  0.665789
14  0.474159  0.713158
15  0.464639  0.760526
16  0.372763  0.807895
17  0.350569  0.855263
18  0.063391  0.902632
19  0.047618  0.950000
In [35]:
clf = LogisticRegression(**grid_result.best_params_).fit(X_train, Y_train)
In [36]:
Y_Test_Pred = clf.predict(X_test)
In [37]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'), 
            pd.Series(Y_test[target], name = 'Actual'))
Out[37]:
Actual         0    1
Predicted
0          93791   30
1             47  119
In [38]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.9991807377616053
Precision =  0.7168674698795181
Recall =  0.7986577181208053
F1 Score =  0.7555555555555555
In [39]:
generate_auc_roc_curve(clf, X_test)

SMOTE

SMOTE (Synthetic Minority Over-sampling Technique) balances the training data by generating synthetic minority-class samples: each new sample is an interpolation between an existing minority sample and one of its nearest minority-class neighbours.
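A minimal sketch of that interpolation step (illustrative only; it ignores the k-nearest-neighbour search that the real imblearn implementation used below performs):

import numpy as np

rng = np.random.RandomState(0)

def smote_like_sample(x_i, x_neighbor):
    # A synthetic point on the segment between a minority sample and
    # one of its minority-class neighbours.
    gap = rng.uniform(0, 1)
    return x_i + gap * (x_neighbor - x_i)

print(smote_like_sample(np.array([1.0, 2.0]), np.array([3.0, 2.5])))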

In [40]:
unique, count = np.unique(Y_train, return_counts=True)
Y_train_dict_value_count = { k:v for (k,v) in zip(unique, count)}
Y_train_dict_value_count
Out[40]:
{0: 190477, 1: 343}
In [41]:
# Note: newer imblearn releases rename `ratio` to `sampling_strategy`
# and `fit_sample` to `fit_resample`; this notebook uses the older API.
sm = SMOTE(random_state=12, ratio=1.0)
x_train_res, y_train_res = sm.fit_sample(X_train, Y_train)
In [42]:
unique, count = np.unique(y_train_res, return_counts=True)
y_train_smote_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_smote_value_count
Out[42]:
{0: 190477, 1: 190477}
In [43]:
clf = LogisticRegression().fit(x_train_res, y_train_res)
In [44]:
Y_Test_Pred = clf.predict(X_test)
In [45]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'), 
            pd.Series(Y_test[target], name = 'Actual'))
Out[45]:
Actual         0    1
Predicted
0          92303   14
1           1535  135
In [46]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.9835189973081384
Precision =  0.08083832335329341
Recall =  0.9060402684563759
F1 Score =  0.148433205057724
In [47]:
generate_auc_roc_curve(clf, X_test)
In [48]:
weights = np.linspace(0.005, 0.25, 10)
weights
Out[48]:
array([0.005     , 0.03222222, 0.05944444, 0.08666667, 0.11388889,
       0.14111111, 0.16833333, 0.19555556, 0.22277778, 0.25      ])
In [49]:
pipe = make_pipeline(
    SMOTE(),
    LogisticRegression()
)

weights = np.linspace(0.005, 0.25, 10)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__ratio': weights
    },
    scoring='f1',
    cv=3
)
grid_result = gsc.fit(X_train, Y_train)

print("Best parameters : %s" % grid_result.best_params_)
weight_f1_score_df = pd.DataFrame({ 'score': grid_result.cv_results_['mean_test_score'],
                                   'weight': weights })
weight_f1_score_df.plot(x='weight')
Best parameters : {'smote__ratio': 0.005}
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1a8499b0>
In [50]:
pipe = make_pipeline(
    SMOTE(ratio=0.005),
    LogisticRegression()
)

pipe.fit(X_train, Y_train)

Y_Test_Pred = pipe.predict(X_test)
In [51]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'), 
            pd.Series(Y_test[target], name = 'Actual'))
Out[51]:
Actual         0    1
Predicted
0          93794   38
1             44  111
In [52]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.9991275389149563
Precision =  0.7161290322580646
Recall =  0.7449664429530202
F1 Score =  0.7302631578947368
In [53]:
generate_auc_roc_curve(clf, X_test)

Undersampling

Here we balance the classes by randomly discarding majority-class rows instead of reweighting or oversampling. Note that the much higher scores below are not directly comparable to the earlier results: the model is re-trained and evaluated on splits of the balanced undersampled data, not on the original, heavily imbalanced test set.

In [54]:
minority_class_len = len(df[df[target] == 1])
print(minority_class_len)
492
In [55]:
majority_class_indices = df[df[target] == 0].index
print(majority_class_indices)
Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            284797, 284798, 284799, 284800, 284801, 284802, 284803, 284804,
            284805, 284806],
           dtype='int64', length=284315)
In [56]:
random_majority_indices = np.random.choice(majority_class_indices,
                                           minority_class_len, 
                                           replace=False)
print(len(random_majority_indices))
492
In [57]:
minority_class_indices = df[df[target] == 1].index
print(minority_class_indices)
Int64Index([   541,    623,   4920,   6108,   6329,   6331,   6334,   6336,
              6338,   6427,
            ...
            274382, 274475, 275992, 276071, 276864, 279863, 280143, 280149,
            281144, 281674],
           dtype='int64', length=492)
In [58]:
under_sample_indices = np.concatenate([minority_class_indices,random_majority_indices])
In [59]:
under_sample = df.loc[under_sample_indices]
In [60]:
sns.countplot(x=target, data=under_sample)
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1b4597f0>
In [61]:
X = under_sample.loc[:, df.columns!=target]
Y = under_sample.loc[:, df.columns==target]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
clf = LogisticRegression().fit(X_train, Y_train)
Y_Test_Pred = clf.predict(X_test)
In [62]:
generate_model_report(Y_test, Y_Test_Pred)
Accuracy =  0.916923076923077
Precision =  0.9419354838709677
Recall =  0.8902439024390244
F1 Score =  0.9153605015673981
In [63]:
generate_auc_roc_curve(clf, X_test)
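For reference, imblearn ships the same random undersampling as a reusable sampler, which avoids the manual index bookkeeping above (a sketch assuming the df and target already defined in this notebook; fit_resample is called fit_sample in older imblearn releases):

from imblearn.under_sampling import RandomUnderSampler

# Keep all 492 frauds and an equal-sized random subset of the majority class
rus = RandomUnderSampler(random_state=42)
X_res, Y_res = rus.fit_resample(df.loc[:, df.columns != target], df[target])
print(np.bincount(Y_res))  # [492 492]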
