Import Numpy As NP Import Pandas As PD
Import Numpy As NP Import Pandas As PD
import pandas as pd
In [2]:
#Only execute this cell if the directory in which your dataset is different
from the directory that you are running the
#Jupyter Notebook
#import os
#os.chdir('C:\\Shripad\\Personal\\DataScience\\DSBA\\Curricumulum\\4 Data
Mining\\3 Random Forest')
In [2]:
from sklearn.ensemble import RandomForestClassifier
In [3]:
bank_df = pd.read_csv("Banking Dataset.csv")
In [4]:
bank_df.head(10)
Out[4]:
Cust_I Targ Ag Gende Occupati No_OF_CR_TX AGE_BK SC Holding_Peri
Balance
D et e r on NS T R od
160378.
0 C1 0 30 M SAL 2 26-30 826 9
60
84370.5 SELF-
1 C10 1 41 M 14 41-45 843 9
9 EMP
60849.2
2 C100 0 49 F PROF 49 46-50 328 26
6
10558.8
3 C1000 0 49 M SAL 23 46-50 619 19
1
C1000 97100.4
4 0 43 M SENP 3 41-45 397 8
0 8
C1000 160378.
5 0 30 M SAL 2 26-30 781 11
1 60
C1000 26275.5
6 0 43 M PROF 23 41-45 354 12
2 5
3 7
C1000
8 0 45 M 1881.37 PROF 3 41-45 339 13
4
C1000
9 0 37 M 3274.37 PROF 33 36-40 535 9
5
In [5]:
bank_df.shape
Out[5]:
(20000, 10)
In [6]:
bank_df.info() # many columns are of type object i.e. strings. These need to
be converted to ordinal type
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Cust_ID 20000 non-null object
1 Target 20000 non-null int64
2 Age 20000 non-null int64
3 Gender 20000 non-null object
4 Balance 20000 non-null float64
5 Occupation 20000 non-null object
6 No_OF_CR_TXNS 20000 non-null int64
7 AGE_BKT 20000 non-null object
8 SCR 20000 non-null int64
9 Holding_Period 20000 non-null int64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.5+ MB
In [33]:
## For RandomForestClassifier, none of the data type need to be Object, but
everything should be integers
In [7]:
# Decision tree in Python can take only numerical / categorical colums. It
cannot take string / object types.
# The following code loops through each column and checks if the column type
is object then converts those columns
# into categorical with each distinct value becoming a category or code.
X = bank_df.drop(["Target","Cust_ID"] , axis=1)
y = bank_df.pop("Target")
In [10]:
# splitting data into training and test set for independent attributes
# X_train = independent variable for Train, X_test = independent variable for
Test,
# train_labels = dependent varliable for Train, test_labels = dependent
variable for Test
from sklearn.model_selection import train_test_split
param_grid = {
'max_depth': [7, 10],
'max_features': [4, 6],
'min_samples_leaf': [50, 100],
'min_samples_split': [150, 300],
'n_estimators': [301, 501]
}
In [27]:
rfcl = RandomForestClassifier()
In [28]:
# cv = cross validation, value of 3 i.e. Number of combination is 3.
# Random forest model will be created with first as 7, 4, 50, 150 and 301 and
split data into 3 (fold) parts
grid_search = GridSearchCV(estimator = rfcl, param_grid = param_grid, cv = 3)
In [29]:
grid_search.fit(X_train, train_labels)
Out[29]:
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
param_grid={'max_depth': [7, 10], 'max_features': [4, 6],
'min_samples_leaf': [50, 100],
'min_samples_split': [150, 300],
'n_estimators': [301, 501]})
In [30]:
grid_search.best_params_
Out[30]:
{'max_depth': 7,
'max_features': 6,
'min_samples_leaf': 50,
'min_samples_split': 150,
'n_estimators': 501}
In [ ]:
best_grid = grid_search.best_estimator_
In [ ]:
ytrain_predict = best_grid.predict_proba(X_train)
ytest_predict = best_grid.predict_proba(X_test)
In [ ]:
ytrain_predict = best_grid.predict(X_train)
ytest_predict = best_grid.predict(X_test)
In [29]:
from sklearn.metrics import confusion_matrix,classification_report
In [30]:
confusion_matrix(train_labels,ytrain_predict)
Out[30]:
array([[12754, 28],
[ 1152, 66]], dtype=int64)
In [31]:
confusion_matrix(test_labels,ytest_predict)
Out[31]:
array([[5475, 10],
[ 490, 25]], dtype=int64)
In [32]:
print(classification_report(train_labels,ytrain_predict))
precision recall f1-score support
In [33]:
print(classification_report(test_labels,ytest_predict))
precision recall f1-score support
In [34]:
import matplotlib.pyplot as plt
In [35]:
# AUC and ROC for the training data
# predict probabilities
probs = best_grid.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.844
In [36]:
# AUC and ROC for the test data
# predict probabilities
probs = best_grid.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.777