In scikit-learn, many classifiers have built-in support for handling imbalanced classes. If the classes are highly imbalanced and this was not addressed during preprocessing, we can use the class_weight parameter to weight the classes so that each class contributes a balanced share during training. Specifically, class_weight='balanced' automatically weights classes inversely proportional to their frequency.
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from pylab import rcParams
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) applied to every plot below.
rcParams['figure.figsize'] = 10, 6
# NOTE(review): this silences *all* warnings, including library deprecation
# notices that would otherwise flag outdated API usage further down.
warnings.filterwarnings('ignore')
# Use seaborn's "darkgrid" theme for all plots.
sns.set(style="darkgrid")
In [3]:
def generate_model_report(y_actual, y_predicted):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    Args:
        y_actual: Ground-truth labels.
        y_predicted: Labels predicted by a model, in the same order as
            ``y_actual``.
    """
    # Label text and the metric function it introduces, in print order.
    labelled_metrics = (
        ("Accuracy = ", accuracy_score),
        ("Precision = ", precision_score),
        ("Recall = ", recall_score),
        ("F1 Score = ", f1_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_actual, y_predicted))
In [4]:
def generate_auc_roc_curve(clf, X_test, y_test=None):
    """Plot the ROC curve of ``clf`` on ``X_test`` with its AUC in the legend.

    Args:
        clf: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels aligned with ``X_test``. When omitted, the
            notebook-level ``Y_test`` is used, which keeps existing
            two-argument calls working.
    """
    # Imported here because nothing else in this file binds the name ``plt``
    # (only ``rcParams`` is imported from pylab).
    import matplotlib.pyplot as plt

    if y_test is None:
        y_test = Y_test  # fall back to the current notebook-level test labels

    # Second column of predict_proba: score of the class listed second in
    # clf.classes_ (class 1 for the 0/1 labels used in this notebook).
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="AUC ROC Curve with Area Under the curve =" + str(auc))
    plt.legend(loc=4)
    plt.show()
In [5]:
df = pd.read_csv('creditcard.csv')
In [6]:
df.head()
Out[6]:
In [7]:
target = 'Class'
In [8]:
X = df.loc[:, df.columns!=target]
In [9]:
Y = df.loc[:, df.columns==target]
In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,
random_state=42)
In [11]:
ax = sns.countplot(x=target, data=df)
print(df[target].value_counts())
In [12]:
100* (492/float(df.shape[0]))
Out[12]:
In [13]:
100* (284315/float(df.shape[0]))
Out[13]:
In [14]:
Y_train[target].value_counts()
Out[14]:
In [15]:
clf = LogisticRegression().fit(X_train, Y_train)
In [16]:
Y_Test_Pred = clf.predict(X_test)
In [17]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'),
pd.Series(Y_test[target], name = 'Actual'))
Out[17]:
In [18]:
generate_model_report(Y_test, Y_Test_Pred)
In [19]:
generate_auc_roc_curve(clf, X_test)
class_weight='balanced'
In [20]:
# https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html
In [21]:
unique_classes = list(df[target].unique())
unique_classes
Out[21]:
In [22]:
out_dict = {}
for classes in unique_classes:
out_dict[classes] = df.shape[0]/((df.loc[df[target] == classes].shape[0])
*len(unique_classes))
In [23]:
out_dict
Out[23]:
In [24]:
print (X_train.shape, Y_train.shape)
In [25]:
clf = LogisticRegression(class_weight='balanced').fit(X_train, Y_train)
In [26]:
from sklearn.utils import class_weight
In [27]:
class_weight.compute_class_weight('balanced', np.unique(Y_train), Y_train[target])
Out[27]:
In [28]:
Y_Test_Pred = clf.predict(X_test)
In [29]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'),
pd.Series(Y_test[target], name = 'Actual'))
Out[29]:
In [30]:
generate_model_report(Y_test, Y_Test_Pred)
In [31]:
generate_auc_roc_curve(clf, X_test)
In [32]:
# Candidate weights for class 0; class 1 gets the complement (1 - weight).
weights = np.linspace(0.05, 0.95, 20)
# 5-fold cross-validated search over those weight pairs, scored by F1.
gsc = GridSearchCV(
    LogisticRegression(),
    {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]},
    scoring='f1',
    cv=5,
)
grid_result = gsc.fit(X_train, Y_train)
print("Best parameters : %s" % (grid_result.best_params_,))
In [33]:
data_out = pd.DataFrame({'score': grid_result.cv_results_['mean_test_score'],
'weight': weights })
data_out.plot(x='weight')
Out[33]:
In [34]:
data_out
Out[34]:
In [35]:
clf = LogisticRegression(**grid_result.best_params_).fit(X_train, Y_train)
In [36]:
Y_Test_Pred = clf.predict(X_test)
In [37]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'),
pd.Series(Y_test[target], name = 'Actual'))
Out[37]:
In [38]:
generate_model_report(Y_test, Y_Test_Pred)
In [39]:
generate_auc_roc_curve(clf, X_test)
SMOTE
In [40]:
unique, count = np.unique(Y_train, return_counts=True)
Y_train_dict_value_count = { k:v for (k,v) in zip(unique, count)}
Y_train_dict_value_count
Out[40]:
In [41]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(X_train, Y_train)
In [42]:
unique, count = np.unique(y_train_res, return_counts=True)
y_train_smote_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_smote_value_count
Out[42]:
In [43]:
clf = LogisticRegression().fit(x_train_res, y_train_res)
In [44]:
Y_Test_Pred = clf.predict(X_test)
In [45]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'),
pd.Series(Y_test[target], name = 'Actual'))
Out[45]:
In [46]:
generate_model_report(Y_test, Y_Test_Pred)
In [47]:
generate_auc_roc_curve(clf, X_test)
In [48]:
weights = np.linspace(0.005, 0.25, 10)
weights
Out[48]:
In [49]:
pipe = make_pipeline(
SMOTE(),
LogisticRegression()
)
weights = np.linspace(0.005, 0.25, 10)
gsc = GridSearchCV(
estimator=pipe,
param_grid={
'smote__ratio': weights
},
scoring='f1',
cv=3
)
grid_result = gsc.fit(X_train, Y_train)
print("Best parameters : %s" % grid_result.best_params_)
weight_f1_score_df = pd.DataFrame({ 'score': grid_result.cv_results_['mean_test_score'],
'weight': weights })
weight_f1_score_df.plot(x='weight')
Out[49]:
In [50]:
pipe = make_pipeline(
SMOTE(ratio=0.005),
LogisticRegression()
)
pipe.fit(X_train, Y_train)
Y_Test_Pred = pipe.predict(X_test)
In [51]:
pd.crosstab(pd.Series(Y_Test_Pred, name = 'Predicted'),
pd.Series(Y_test[target], name = 'Actual'))
Out[51]:
In [52]:
generate_model_report(Y_test, Y_Test_Pred)
In [53]:
generate_auc_roc_curve(clf, X_test)
UNDERSAMPLING
In [54]:
minority_class_len = len(df[df[target] == 1])
print(minority_class_len)
In [55]:
majority_class_indices = df[df[target] == 0].index
print(majority_class_indices)
In [56]:
random_majority_indices = np.random.choice(majority_class_indices,
minority_class_len,
replace=False)
print(len(random_majority_indices))
In [57]:
minority_class_indices = df[df[target] == 1].index
print(minority_class_indices)
In [58]:
under_sample_indices = np.concatenate([minority_class_indices,random_majority_indices])
In [59]:
under_sample = df.loc[under_sample_indices]
In [60]:
sns.countplot(x=target, data=under_sample)
Out[60]:
In [61]:
X = under_sample.loc[:, df.columns!=target]
Y = under_sample.loc[:, df.columns==target]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
clf = LogisticRegression().fit(X_train, Y_train)
Y_Test_Pred = clf.predict(X_test)
In [62]:
generate_model_report(Y_test, Y_Test_Pred)
In [63]:
generate_auc_roc_curve(clf, X_test)
Comments
Post a Comment