import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# for better interactive visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot #to see graphs in offline mode

#to ignore the filter warnings
import warnings
warnings.filterwarnings('ignore')


#as data is in the form of excel, use read_excel command
#we need to give the sheet index
df=pd.read_excel(r'C:\Users\Soham Das\Data-analysis-projects\Finance Data Analysis/Bank_Personal_Loan_Modelling.xlsx',1)
df.head()


df.shape

(5000, 14)


df.isnull().sum()
#good there are no non values

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64


# ID and Zipcode might be removed as they may not be useful for our analysis

df.drop(['ID','ZIP Code'],axis=1,inplace=True)


df.columns

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')


fig=px.box(df,y=['Age', 'Experience', 'Income', 'Family', 'Education'])
fig.show()
#Five point summary suggest that Experience has negative value(This should be fixed).
# we can see the Min, Max, mean and std deviation for all key attributes of the dataset
# Income has too much noise and slightly skewed right, Age and exp are equally distributed.


#checking if there is skewness in data or not
df.skew()

#CCAvg                 
#Education             
#Mortgage              
#Personal Loan         
#Securities Account    
#CD Account

Age                  -0.029341
Experience           -0.026325
Income                0.841339
Family                0.155221
CCAvg                 1.598457
Education             0.227093
Mortgage              2.104002
Personal Loan         2.743607
Securities Account    2.588268
CD Account            3.691714
Online               -0.394785
CreditCard            0.904589
dtype: float64


df.dtypes

Age                     int64
Experience              int64
Income                  int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object


#now visualising Skewness by distribution
df.hist(figsize=(20,20))

#1.Age & Experience are to an extent equally distributed
#2.Income & Credit card spending are skewed to the left
#3.We have more Undergraduates than Graduate and Advanced & Professional
#4.50-60% of customers have enabled online banking and went digital

array([[<AxesSubplot:title={'center':'Age'}>,
        <AxesSubplot:title={'center':'Experience'}>,
        <AxesSubplot:title={'center':'Income'}>],
       [<AxesSubplot:title={'center':'Family'}>,
        <AxesSubplot:title={'center':'CCAvg'}>,
        <AxesSubplot:title={'center':'Education'}>],
       [<AxesSubplot:title={'center':'Mortgage'}>,
        <AxesSubplot:title={'center':'Personal Loan'}>,
        <AxesSubplot:title={'center':'Securities Account'}>],
       [<AxesSubplot:title={'center':'CD Account'}>,
        <AxesSubplot:title={'center':'Online'}>,
        <AxesSubplot:title={'center':'CreditCard'}>]], dtype=object)


sns.distplot(df['Experience'])
#contains some negative value

<AxesSubplot:xlabel='Experience', ylabel='Density'>


#we should find the mean
df['Experience'].mean()

20.1046


Negative_exp=df[df['Experience']<0]
Negative_exp.head()
#fitlered the negative experience


sns.distplot(Negative_exp['Age'])
#age between 20-30 are having negative experience
#we need to deal with our data

<AxesSubplot:xlabel='Age', ylabel='Density'>


Negative_exp['Experience'].mean()

-1.4423076923076923


Negative_exp.size
#total negative entries

624


print('There are {} records which has negative values for experience, approx {} %'.format(Negative_exp.size , ((Negative_exp.size/df.size)*100)))

There are 624 records which has negative values for experience, approx 1.04 %


data=df.copy()


data.head()


#using numpy to change the negative values to mean value derived from data with the same age group
data['Experience']=np.where(data['Experience']<0,data['Experience'].mean(),data['Experience'])


data[data['Experience']<0]
#we can see there are no negative values anymore


df.corr()


plt.figure(figsize=(10,6))
sns.heatmap(df.corr(),annot=True)
# We could see that Age & Experience are very strongly correlated,
# Hence it is fine for us to go with Age and drop Experience to avoid multi-colinearity issue.

<AxesSubplot:>


data=data.drop(['Experience'],axis=1)


data.head()
#so we dont have experience column in our data


data['Education'].unique()

array([1, 2, 3], dtype=int64)


def mark(x):
    if x==1:
        return 'Undergrad'
    elif x==2:
        return 'Graduate'
    else:
        return 'Advanced/Professional'


data['Edu_mark']=data['Education'].apply(mark)


data.head()
#'Edu_mark column is added to the dataframe'


EDU_dis=data.groupby('Edu_mark')['Age'].count()


EDU_dis

Edu_mark
Advanced/Professional    1501
Graduate                 1403
Undergrad                2096
Name: Age, dtype: int64


fig=px.pie(data,values=EDU_dis, names=EDU_dis.index,title='Education status of customers')
fig.show()
#We could see that We have more Undergraduates 41.92%
#graduates(28.06%)
#Advanced Professional(30.02%)


data.columns

Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage',
       'Personal Loan', 'Securities Account', 'CD Account', 'Online',
       'CreditCard', 'Edu_mark'],
      dtype='object')


def Security_CD(row):
    if (row['Securities Account']==1) & (row['CD Account']==1):
        return 'Holds Securites & Deposit'
    elif (row['Securities Account']==0) & (row['CD Account']==0):
        return 'Does not Holds Securites or Deposit'
    elif (row['Securities Account']==1) & (row['CD Account']==0):
        return ' Holds only Securites '
    elif (row['Securities Account']==0) & (row['CD Account']==1):
        return ' Holds only Deposit'


data['Account_holder_category']=data.apply(Security_CD,axis=1)


data.head()
#Account_holder_category column is added to the dataframe


values=data['Account_holder_category'].value_counts()
values.index

Index(['Does not Holds Securites or Deposit', ' Holds only Securites ',
       ' Holds only Deposit', 'Holds Securites & Deposit'],
      dtype='object')


fig=px.pie(data,values=values, names=values.index, title='Account Holder distribution')
fig.show()

#We could see that alomst 87% of customers do not hold any securities or deposit
#3% hold both securities as well as deposit.
#It will be good if we encourage those 87% to open any of these account as it will improve the assests of the bank


data.columns

Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage',
       'Personal Loan', 'Securities Account', 'CD Account', 'Online',
       'CreditCard', 'Edu_mark', 'Account_holder_category'],
      dtype='object')


#facet_col helps to split the box plot
px.box(data,x='Education',y='Income',facet_col='Personal Loan')
#From the above plot we could say that Income of customers who availed personal loan are alomst same irrescpective of their Education


plt.figure(figsize=(12,8))
sns.distplot(data[data['Personal Loan']==0]['Income'])
sns.distplot(data[data['Personal Loan']==1]['Income'])
plt.legend()
#we need to tackle the label error
#its very clumsy also

No handles with labels found to put in legend.

<matplotlib.legend.Legend at 0x1c1d0322c40>


plt.figure(figsize=(12,8))
sns.distplot(data[data['Personal Loan']==0]['Income'],hist=False,label='Income with no personal loan')
sns.distplot(data[data['Personal Loan']==1]['Income'],hist=False,label='Income with personal loan')
plt.legend()
#Customers Who have availed personal loan seem to have higher income than those who do not have personal loan

<matplotlib.legend.Legend at 0x1c1d05d1070>


def plot(col1,col2,label1,label2,title):
    plt.figure(figsize=(12,8))
    sns.distplot(data[data[col2]==0][col1],hist=False,label=label1)
    sns.distplot(data[data[col2]==1][col1],hist=False,label=label2)
    plt.legend()
    plt.title(title)


plot('Income','Personal Loan','Income with no personal loan','Income with personal loan','Income Distribution')


#trying the credit card avg - vs - personal loan
plot('CCAvg','Personal Loan','Credit card avg with no personal loan','Credit card avg with personal loan','Credit card avg Distribution')
#Distribution in case of no personal loan have a skewness


plot('Mortgage','Personal Loan','Mortgage of customers with no personal loan','Mortgage of customers  with personal loan','Mortgage of customers  Distribution')
#People with high mortgage value, i.e more than 400K have availed personal Loan


data.columns

Index(['Age', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage',
       'Personal Loan', 'Securities Account', 'CD Account', 'Online',
       'CreditCard', 'Edu_mark', 'Account_holder_category'],
      dtype='object')


#Atrributes on which I'll analyse
col_names=['Securities Account','Online','Account_holder_category','CreditCard']


for i in col_names:
    plt.figure(figsize=(10,5))
    sns.countplot(x=i,hue='Personal Loan',data=data)

#customers who hold deposit account & customers who do not hold either a securities account or deposit account have aviled personal loan


#we will scatter plots for hypothesis testing
sns.scatterplot(data['Age'],data['Personal Loan'],hue=data['Family'])

<AxesSubplot:xlabel='Age', ylabel='Personal Loan'>


#because we need to perform some scientic python
import scipy.stats as stats


#we will define two hypothesis
Ho='Age does not have impact on availing personal loan'#null hypothesis
Ha='Age does  have impact on availing personal loan'#altenate hypothesis
#only one can be true


Age_no=np.array(data[data['Personal Loan']==0]['Age'])
Age_yes=np.array(data[data['Personal Loan']==1]['Age'])


#our data is ready to perform 2 sample t-test
#we will get an overview whether there's a difference between the mean of both the samples or not

t,p_value=stats.ttest_ind(Age_no,Age_yes,axis=0)
if p_value<0.05:#reject H0 and accept Ha
    print(Ha,' as the p_value is less than 0.05 with a value of {}'.format(p_value))
else:#accept H0
    print(Ho,' as the p_value is greater than 0.05 with a value of {}'.format(p_value))

Age does not have impact on availing personal loan  as the p_value is greater than 0.05 with a value of 0.584959263705325


#automating above Hypothesis
def Hypothesis(col1,col2,HO,Ha):
    arr1=np.array(data[data[col1]==0][col2])
    arr2=np.array(data[data[col1]==1][col2])
    t,p_value=stats.ttest_ind(arr1,arr2,axis=0)
    if p_value<0.05:
        print('{}, as the p_value is less than 0.05 with a value of {}'.format(Ha,p_value))
    else:
        print('{} as the p_value is greater than 0.05 with a value of {}'.format(HO,p_value))


Hypothesis('Personal Loan','Age',HO='Age does not have impact on availing personal loan',Ha='Age does  have impact on availing personal loan')

Age does not have impact on availing personal loan as the p_value is greater than 0.05 with a value of 0.584959263705325


Hypothesis(col1='Personal Loan',col2='Income',HO='Income does not have impact on availing personal loan',Ha='Income does  have impact on availing personal loan')
#Income have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :0.0

Income does  have impact on availing personal loan, as the p_value is less than 0.05 with a value of 0.0


Hypothesis('Personal Loan','Family',HO='AgFamily does not have impact on availing personal loan',Ha='Family does  have impact on availing personal loan')
#Family have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :1.4099040685673807e-05

Family does  have impact on availing personal loan, as the p_value is less than 0.05 with a value of 1.4099040685673807e-05

	Age	Experience	Income	Family	CCAvg	Education	Mortgage	Personal Loan	Securities Account	CD Account	Online	CreditCard
Age	1.000000	0.994215	-0.055269	-0.046418	-0.052030	0.041334	-0.012539	-0.007726	-0.000436	0.008043	0.013702	0.007681
Experience	0.994215	1.000000	-0.046574	-0.052563	-0.050089	0.013152	-0.010582	-0.007413	-0.001232	0.010353	0.013898	0.008967
Income	-0.055269	-0.046574	1.000000	-0.157501	0.645993	-0.187524	0.206806	0.502462	-0.002616	0.169738	0.014206	-0.002385
Family	-0.046418	-0.052563	-0.157501	1.000000	-0.109285	0.064929	-0.020445	0.061367	0.019994	0.014110	0.010354	0.011588
CCAvg	-0.052030	-0.050089	0.645993	-0.109285	1.000000	-0.136138	0.109909	0.366891	0.015087	0.136537	-0.003620	-0.006686
Education	0.041334	0.013152	-0.187524	0.064929	-0.136138	1.000000	-0.033327	0.136722	-0.010812	0.013934	-0.015004	-0.011014
Mortgage	-0.012539	-0.010582	0.206806	-0.020445	0.109909	-0.033327	1.000000	0.142095	-0.005411	0.089311	-0.005995	-0.007231
Personal Loan	-0.007726	-0.007413	0.502462	0.061367	0.366891	0.136722	0.142095	1.000000	0.021954	0.316355	0.006278	0.002802
Securities Account	-0.000436	-0.001232	-0.002616	0.019994	0.015087	-0.010812	-0.005411	0.021954	1.000000	0.317034	0.012627	-0.015028
CD Account	0.008043	0.010353	0.169738	0.014110	0.136537	0.013934	0.089311	0.316355	0.317034	1.000000	0.175880	0.278644
Online	0.013702	0.013898	0.014206	0.010354	-0.003620	-0.015004	-0.005995	0.006278	0.012627	0.175880	1.000000	0.004210
CreditCard	0.007681	0.008967	-0.002385	0.011588	-0.006686	-0.011014	-0.007231	0.002802	-0.015028	0.278644	0.004210	1.000000

Soham Das¶

FINANCE DATA ANALYSIS¶

Importing all necessary libraries¶

1)Performing Exploratory Data Analysis¶

2)Visualising distribution of Data¶

3)Co-relation of Data¶

4)Analysing education status of Customers¶

5)Exploring the account holder distribution¶

6)Analysing customer on the basis of their education status, income and Personal Loan Status¶

7)automating above observations¶

8)Analysing categories of customer on the basis of different atrributes¶

9)Performing Hypothesis Testing¶

9.1)How Age of a person is going to be a factor in availing loan?¶

9.2)Does Income of a person have an impact on availing loan?¶

9.3)Does the family size makes them to avail loan?¶

	ID	Age	Experience	Income	ZIP Code	Family	CCAvg	Education	Securities Account	CreditCard
0	1	25	1	49	91107	4	1.6	1	1	0
1	2	45	19	34	90089	3	1.5	1	1	0
2	3	39	15	11	94720	1	1.0	1	0	0
3	4	35	9	100	94112	1	2.7	2	0	0
4	5	35	8	45	91330	4	1.0	2	0	1

	Age	Experience	Income	Family	CCAvg	Education	Mortgage	Online	CreditCard
89	25	-1	113	4	2.30	3	0	0	1
226	24	-1	39	2	1.70	2	0	0	0
315	24	-2	51	3	0.30	3	0	1	0
451	28	-2	48	2	1.75	3	89	1	0
524	24	-1	75	4	0.20	1	0	1	0

	Age	Experience	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	1	49	4	1.6	1	1	0
1	45	19	34	3	1.5	1	1	0
2	39	15	11	1	1.0	1	0	0
3	35	9	100	1	2.7	2	0	0
4	35	8	45	4	1.0	2	0	1

	Age	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	49	4	1.6	1	1	0
1	45	34	3	1.5	1	1	0
2	39	11	1	1.0	1	0	0
3	35	100	1	2.7	2	0	0
4	35	45	4	1.0	2	0	1

	Age	Income	Family	CCAvg	Education	Securities Account	CreditCard	Edu_mark
0	25	49	4	1.6	1	1	0	Undergrad
1	45	34	3	1.5	1	1	0	Undergrad
2	39	11	1	1.0	1	0	0	Undergrad
3	35	100	1	2.7	2	0	0	Graduate
4	35	45	4	1.0	2	0	1	Graduate

	Age	Income	Family	CCAvg	Education	Securities Account	CreditCard	Edu_mark	Account_holder_category
0	25	49	4	1.6	1	1	0	Undergrad	Holds only Securites
1	45	34	3	1.5	1	1	0	Undergrad	Holds only Securites
2	39	11	1	1.0	1	0	0	Undergrad	Does not Holds Securites or Deposit
3	35	100	1	2.7	2	0	0	Graduate	Does not Holds Securites or Deposit
4	35	45	4	1.0	2	0	1	Graduate	Does not Holds Securites or Deposit

	Age	Experience	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	1	49	4	1.6	1	1	0
1	45	19	34	3	1.5	1	1	0
2	39	15	11	1	1.0	1	0	0
3	35	9	100	1	2.7	2	0	0
4	35	8	45	4	1.0	2	0	1

	Age	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	49	4	1.6	1	1	0
1	45	34	3	1.5	1	1	0
2	39	11	1	1.0	1	0	0
3	35	100	1	2.7	2	0	0
4	35	45	4	1.0	2	0	1

	Age	Experience	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	1	49	4	1.6	1	1	0
1	45	19	34	3	1.5	1	1	0
2	39	15	11	1	1.0	1	0	0
3	35	9	100	1	2.7	2	0	0
4	35	8	45	4	1.0	2	0	1

	Age	Income	Family	CCAvg	Education	Securities Account	CreditCard
0	25	49	4	1.6	1	1	0
1	45	34	3	1.5	1	1	0
2	39	11	1	1.0	1	0	0
3	35	100	1	2.7	2	0	0
4	35	45	4	1.0	2	0	1