C:\Users\Soham Das\Data-analysis-projects\Finance Data Analysis/Bank_Personal_Loan_Modelling.xlsx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# for better interactive visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot #to see graphs in offline mode
#to ignore the filter warnings
import warnings
warnings.filterwarnings('ignore')
#as data is in the form of excel, use read_excel command
#we need to give the sheet index
df=pd.read_excel(r'C:\Users\Soham Das\Data-analysis-projects\Finance Data Analysis/Bank_Personal_Loan_Modelling.xlsx',1)
df.head()
ID | Age | Experience | Income | ZIP Code | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
df.shape
(5000, 14)
df.isnull().sum()
#good there are no non values
ID 0 Age 0 Experience 0 Income 0 ZIP Code 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal Loan 0 Securities Account 0 CD Account 0 Online 0 CreditCard 0 dtype: int64
# ID and Zipcode might be removed as they may not be useful for our analysis
df.drop(['ID','ZIP Code'],axis=1,inplace=True)
df.columns
Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard'], dtype='object')
fig=px.box(df,y=['Age', 'Experience', 'Income', 'Family', 'Education'])
fig.show()
#Five point summary suggest that Experience has negative value(This should be fixed).
# we can see the Min, Max, mean and std deviation for all key attributes of the dataset
# Income has too much noise and slightly skewed right, Age and exp are equally distributed.
#checking if there is skewness in data or not
df.skew()
#CCAvg
#Education
#Mortgage
#Personal Loan
#Securities Account
#CD Account
Age -0.029341 Experience -0.026325 Income 0.841339 Family 0.155221 CCAvg 1.598457 Education 0.227093 Mortgage 2.104002 Personal Loan 2.743607 Securities Account 2.588268 CD Account 3.691714 Online -0.394785 CreditCard 0.904589 dtype: float64
df.dtypes
Age int64 Experience int64 Income int64 Family int64 CCAvg float64 Education int64 Mortgage int64 Personal Loan int64 Securities Account int64 CD Account int64 Online int64 CreditCard int64 dtype: object
#now visualising Skewness by distribution
df.hist(figsize=(20,20))
#1.Age & Experience are to an extent equally distributed
#2.Income & Credit card spending are skewed to the left
#3.We have more Undergraduates than Graduate and Advanced & Professional
#4.50-60% of customers have enabled online banking and went digital
array([[<AxesSubplot:title={'center':'Age'}>, <AxesSubplot:title={'center':'Experience'}>, <AxesSubplot:title={'center':'Income'}>], [<AxesSubplot:title={'center':'Family'}>, <AxesSubplot:title={'center':'CCAvg'}>, <AxesSubplot:title={'center':'Education'}>], [<AxesSubplot:title={'center':'Mortgage'}>, <AxesSubplot:title={'center':'Personal Loan'}>, <AxesSubplot:title={'center':'Securities Account'}>], [<AxesSubplot:title={'center':'CD Account'}>, <AxesSubplot:title={'center':'Online'}>, <AxesSubplot:title={'center':'CreditCard'}>]], dtype=object)
sns.distplot(df['Experience'])
#contains some negative value
<AxesSubplot:xlabel='Experience', ylabel='Density'>
#we should find the mean
df['Experience'].mean()
20.1046
Negative_exp=df[df['Experience']<0]
Negative_exp.head()
#fitlered the negative experience
Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
89 | 25 | -1 | 113 | 4 | 2.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
226 | 24 | -1 | 39 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
315 | 24 | -2 | 51 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
451 | 28 | -2 | 48 | 2 | 1.75 | 3 | 89 | 0 | 0 | 0 | 1 | 0 |
524 | 24 | -1 | 75 | 4 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
sns.distplot(Negative_exp['Age'])
#age between 20-30 are having negative experience
#we need to deal with our data
<AxesSubplot:xlabel='Age', ylabel='Density'>
Negative_exp['Experience'].mean()
-1.4423076923076923
Negative_exp.size
#total negative entries
624
print('There are {} records which has negative values for experience, approx {} %'.format(Negative_exp.size , ((Negative_exp.size/df.size)*100)))
There are 624 records which has negative values for experience, approx 1.04 %
data=df.copy()
data.head()
Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 1 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 45 | 19 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 39 | 15 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 35 | 9 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 35 | 8 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
#using numpy to change the negative values to mean value derived from data with the same age group
data['Experience']=np.where(data['Experience']<0,data['Experience'].mean(),data['Experience'])
data[data['Experience']<0]
#we can see there are no negative values anymore
Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard |
---|
df.corr()
Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Age | 1.000000 | 0.994215 | -0.055269 | -0.046418 | -0.052030 | 0.041334 | -0.012539 | -0.007726 | -0.000436 | 0.008043 | 0.013702 | 0.007681 |
Experience | 0.994215 | 1.000000 | -0.046574 | -0.052563 | -0.050089 | 0.013152 | -0.010582 | -0.007413 | -0.001232 | 0.010353 | 0.013898 | 0.008967 |
Income | -0.055269 | -0.046574 | 1.000000 | -0.157501 | 0.645993 | -0.187524 | 0.206806 | 0.502462 | -0.002616 | 0.169738 | 0.014206 | -0.002385 |
Family | -0.046418 | -0.052563 | -0.157501 | 1.000000 | -0.109285 | 0.064929 | -0.020445 | 0.061367 | 0.019994 | 0.014110 | 0.010354 | 0.011588 |
CCAvg | -0.052030 | -0.050089 | 0.645993 | -0.109285 | 1.000000 | -0.136138 | 0.109909 | 0.366891 | 0.015087 | 0.136537 | -0.003620 | -0.006686 |
Education | 0.041334 | 0.013152 | -0.187524 | 0.064929 | -0.136138 | 1.000000 | -0.033327 | 0.136722 | -0.010812 | 0.013934 | -0.015004 | -0.011014 |
Mortgage | -0.012539 | -0.010582 | 0.206806 | -0.020445 | 0.109909 | -0.033327 | 1.000000 | 0.142095 | -0.005411 | 0.089311 | -0.005995 | -0.007231 |
Personal Loan | -0.007726 | -0.007413 | 0.502462 | 0.061367 | 0.366891 | 0.136722 | 0.142095 | 1.000000 | 0.021954 | 0.316355 | 0.006278 | 0.002802 |
Securities Account | -0.000436 | -0.001232 | -0.002616 | 0.019994 | 0.015087 | -0.010812 | -0.005411 | 0.021954 | 1.000000 | 0.317034 | 0.012627 | -0.015028 |
CD Account | 0.008043 | 0.010353 | 0.169738 | 0.014110 | 0.136537 | 0.013934 | 0.089311 | 0.316355 | 0.317034 | 1.000000 | 0.175880 | 0.278644 |
Online | 0.013702 | 0.013898 | 0.014206 | 0.010354 | -0.003620 | -0.015004 | -0.005995 | 0.006278 | 0.012627 | 0.175880 | 1.000000 | 0.004210 |
CreditCard | 0.007681 | 0.008967 | -0.002385 | 0.011588 | -0.006686 | -0.011014 | -0.007231 | 0.002802 | -0.015028 | 0.278644 | 0.004210 | 1.000000 |
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(),annot=True)
# We could see that Age & Experience are very strongly correlated,
# Hence it is fine for us to go with Age and drop Experience to avoid multi-colinearity issue.
<AxesSubplot:>
data=data.drop(['Experience'],axis=1)
data.head()
#so we dont have experience column in our data
Age | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
1 | 45 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
2 | 39 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 35 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 35 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
data['Education'].unique()
array([1, 2, 3], dtype=int64)
def mark(x):
if x==1:
return 'Undergrad'
elif x==2:
return 'Graduate'
else:
return 'Advanced/Professional'
data['Edu_mark']=data['Education'].apply(mark)
data.head()
#'Edu_mark column is added to the dataframe'
Age | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | Edu_mark | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Undergrad |
1 | 45 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Undergrad |
2 | 39 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Undergrad |
3 | 35 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Graduate |
4 | 35 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | Graduate |
EDU_dis=data.groupby('Edu_mark')['Age'].count()
EDU_dis
Edu_mark Advanced/Professional 1501 Graduate 1403 Undergrad 2096 Name: Age, dtype: int64
fig=px.pie(data,values=EDU_dis, names=EDU_dis.index,title='Education status of customers')
fig.show()
#We could see that We have more Undergraduates 41.92%
#graduates(28.06%)
#Advanced Professional(30.02%)
data.columns
Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard', 'Edu_mark'], dtype='object')
def Security_CD(row):
if (row['Securities Account']==1) & (row['CD Account']==1):
return 'Holds Securites & Deposit'
elif (row['Securities Account']==0) & (row['CD Account']==0):
return 'Does not Holds Securites or Deposit'
elif (row['Securities Account']==1) & (row['CD Account']==0):
return ' Holds only Securites '
elif (row['Securities Account']==0) & (row['CD Account']==1):
return ' Holds only Deposit'
data['Account_holder_category']=data.apply(Security_CD,axis=1)
data.head()
#Account_holder_category column is added to the dataframe
Age | Income | Family | CCAvg | Education | Mortgage | Personal Loan | Securities Account | CD Account | Online | CreditCard | Edu_mark | Account_holder_category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Undergrad | Holds only Securites |
1 | 45 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | Undergrad | Holds only Securites |
2 | 39 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Undergrad | Does not Holds Securites or Deposit |
3 | 35 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Graduate | Does not Holds Securites or Deposit |
4 | 35 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | Graduate | Does not Holds Securites or Deposit |
values=data['Account_holder_category'].value_counts()
values.index
Index(['Does not Holds Securites or Deposit', ' Holds only Securites ', ' Holds only Deposit', 'Holds Securites & Deposit'], dtype='object')
fig=px.pie(data,values=values, names=values.index, title='Account Holder distribution')
fig.show()
#We could see that alomst 87% of customers do not hold any securities or deposit
#3% hold both securities as well as deposit.
#It will be good if we encourage those 87% to open any of these account as it will improve the assests of the bank
data.columns
Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard', 'Edu_mark', 'Account_holder_category'], dtype='object')
#facet_col helps to split the box plot
px.box(data,x='Education',y='Income',facet_col='Personal Loan')
#From the above plot we could say that Income of customers who availed personal loan are alomst same irrescpective of their Education
plt.figure(figsize=(12,8))
sns.distplot(data[data['Personal Loan']==0]['Income'])
sns.distplot(data[data['Personal Loan']==1]['Income'])
plt.legend()
#we need to tackle the label error
#its very clumsy also
No handles with labels found to put in legend.
<matplotlib.legend.Legend at 0x1c1d0322c40>
plt.figure(figsize=(12,8))
sns.distplot(data[data['Personal Loan']==0]['Income'],hist=False,label='Income with no personal loan')
sns.distplot(data[data['Personal Loan']==1]['Income'],hist=False,label='Income with personal loan')
plt.legend()
#Customers Who have availed personal loan seem to have higher income than those who do not have personal loan
<matplotlib.legend.Legend at 0x1c1d05d1070>
def plot(col1,col2,label1,label2,title):
plt.figure(figsize=(12,8))
sns.distplot(data[data[col2]==0][col1],hist=False,label=label1)
sns.distplot(data[data[col2]==1][col1],hist=False,label=label2)
plt.legend()
plt.title(title)
plot('Income','Personal Loan','Income with no personal loan','Income with personal loan','Income Distribution')
#trying the credit card avg - vs - personal loan
plot('CCAvg','Personal Loan','Credit card avg with no personal loan','Credit card avg with personal loan','Credit card avg Distribution')
#Distribution in case of no personal loan have a skewness
plot('Mortgage','Personal Loan','Mortgage of customers with no personal loan','Mortgage of customers with personal loan','Mortgage of customers Distribution')
#People with high mortgage value, i.e more than 400K have availed personal Loan
data.columns
Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard', 'Edu_mark', 'Account_holder_category'], dtype='object')
#Atrributes on which I'll analyse
col_names=['Securities Account','Online','Account_holder_category','CreditCard']
for i in col_names:
plt.figure(figsize=(10,5))
sns.countplot(x=i,hue='Personal Loan',data=data)
#customers who hold deposit account & customers who do not hold either a securities account or deposit account have aviled personal loan
#we will scatter plots for hypothesis testing
sns.scatterplot(data['Age'],data['Personal Loan'],hue=data['Family'])
<AxesSubplot:xlabel='Age', ylabel='Personal Loan'>
#because we need to perform some scientic python
import scipy.stats as stats
#we will define two hypothesis
Ho='Age does not have impact on availing personal loan'#null hypothesis
Ha='Age does have impact on availing personal loan'#altenate hypothesis
#only one can be true
Age_no=np.array(data[data['Personal Loan']==0]['Age'])
Age_yes=np.array(data[data['Personal Loan']==1]['Age'])
#our data is ready to perform 2 sample t-test
#we will get an overview whether there's a difference between the mean of both the samples or not
t,p_value=stats.ttest_ind(Age_no,Age_yes,axis=0)
if p_value<0.05:#reject H0 and accept Ha
print(Ha,' as the p_value is less than 0.05 with a value of {}'.format(p_value))
else:#accept H0
print(Ho,' as the p_value is greater than 0.05 with a value of {}'.format(p_value))
Age does not have impact on availing personal loan as the p_value is greater than 0.05 with a value of 0.584959263705325
#automating above Hypothesis
def Hypothesis(col1,col2,HO,Ha):
arr1=np.array(data[data[col1]==0][col2])
arr2=np.array(data[data[col1]==1][col2])
t,p_value=stats.ttest_ind(arr1,arr2,axis=0)
if p_value<0.05:
print('{}, as the p_value is less than 0.05 with a value of {}'.format(Ha,p_value))
else:
print('{} as the p_value is greater than 0.05 with a value of {}'.format(HO,p_value))
Hypothesis('Personal Loan','Age',HO='Age does not have impact on availing personal loan',Ha='Age does have impact on availing personal loan')
Age does not have impact on availing personal loan as the p_value is greater than 0.05 with a value of 0.584959263705325
Hypothesis(col1='Personal Loan',col2='Income',HO='Income does not have impact on availing personal loan',Ha='Income does have impact on availing personal loan')
#Income have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :0.0
Income does have impact on availing personal loan, as the p_value is less than 0.05 with a value of 0.0
Hypothesis('Personal Loan','Family',HO='AgFamily does not have impact on availing personal loan',Ha='Family does have impact on availing personal loan')
#Family have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :1.4099040685673807e-05
Family does have impact on availing personal loan, as the p_value is less than 0.05 with a value of 1.4099040685673807e-05