Machine Learning Project

Photo by Markus Winkler on Unsplash
# Read the case-study data set from CSV and preview the first rows.
# (The assignment was previously fused onto the comment line, so `data`
# was never actually created.)
data = pd.read_csv('case_study_data.csv')
data.head()
A snapshot of the data set
# Check for null values in the first 11 columns (positions 0-10).
# (The call was previously fused onto the comment line and never ran.)
# NOTE(review): `mi` is presumably the missingno package; its import is not
# visible in this part of the file.
print(mi.bar(data.iloc[:, 0:11], figsize=(15, 5)))
Number of null values in each of the first 11 columns
# Same null-value chart for the remaining columns (position 11 onward).
remaining_columns = data.iloc[:, 11:]
print(mi.bar(remaining_columns, figsize=(15, 5)))
Number of null values in each of the remaining columns
# Remove the `asset_class_cd` column.
# (The assignment was previously fused onto the comment line, so `mod_data`
# was never created.)
mod_data = data.drop(["asset_class_cd"], axis=1)
mod_data.head(10)
mi.matrix(mod_data)

# Keep only rows where both `inquiry_purpose_code` and `asset_code` are
# present; a row missing either value is dropped.
mod_data = mod_data.dropna(subset=['inquiry_purpose_code', 'asset_code'])

Exploratory Data Analysis

# Zip code: the last five characters of the address string.
mod_data['zipcode'] = mod_data['address'].str[-5:]

# E-mail domain: second capture group of the pattern below (the text after
# the "@" up to the final "."). Addresses that do not match yield NaN here
# instead of raising, and are folded into "other" in the next step.
mod_data['domain'] = mod_data['email'].str.extract(r"(.*)@(.*)\.")[1]

# If the domain name is not one of gmail, yahoo or hotmail, it is marked
# as "other".
mod_data['domain'] = mod_data['domain'].where(
    mod_data['domain'].isin(["gmail", "yahoo", "hotmail"]), "other"
)

# Convert the date-of-birth column to datetimes (day/month/two-digit year).
mod_data['date_of_birth'] = pd.to_datetime(mod_data['date_of_birth'], format="%d/%m/%y")

# Age in years (days / 365) relative to a fixed reference date.
current_date = date(2022, 1, 25)
mod_data['age'] = (pd.Timestamp(current_date) - mod_data['date_of_birth']).dt.days / 365

# With a two-digit year, years like 1956 are parsed as 2056, which gives a
# negative age. Taking care of that issue by shifting those ages by a century.
mod_data['age'] = mod_data['age'].where(mod_data['age'] >= 0, mod_data['age'] + 100)
# Description of the data frame.
# (The call was previously fused onto the comment line and never ran.)
mod_data.describe()

# Pairwise plots across the data frame's columns.
sns.pairplot(mod_data)
Pairwise scatter plots (pair plot) for the whole data frame
# Check that `user_id` is unique: dropping duplicate ids should leave the
# row count unchanged (the expression evaluates to True in that case).
uni = mod_data.drop_duplicates(['user_id'], keep="first")
len(uni) == len(mod_data)

# Correlation matrix over numeric/boolean columns only; the frame also holds
# string and datetime columns, which recent pandas refuses to correlate.
mod_data.select_dtypes(include=['number', 'bool']).corr()

# Class balance of the target.
mod_data.groupby(['approved'])['approved'].count()

sns.histplot(x="age", hue="approved", data=mod_data)

# NOTE: `sns.factorplot` was renamed to `sns.catplot` and later removed from
# seaborn. Every call passes `kind` explicitly, so the plots are the same.
# `catplot` returns a FacetGrid (not an Axes), despite the name `ax`.
ax = sns.catplot(x='education_level', y='age', hue='approved', data=mod_data, kind='bar')
ax.set_xticklabels(rotation=65, horizontalalignment='right')

# Too many distinct x values to label legibly, so tick labels are hidden.
ax = sns.catplot(x='hours_per_week', y='age', hue='approved', data=mod_data, kind='point')
ax.set(xticklabels=[])

ax = sns.catplot(x='marital_status', y='age', hue='approved', data=mod_data, kind='point')
ax.set_xticklabels(rotation=65, horizontalalignment='right')

# Tick labels hidden here as well.
ax = sns.catplot(x='capital_gain', y='capital_loss', hue='approved', data=mod_data, kind='point')
ax.set(xticklabels=[])

# Box plots of the quantitative columns.
sns.boxplot(x='age', data=mod_data)
sns.boxplot(x='capital_gain', data=mod_data)
sns.boxplot(x='capital_loss', data=mod_data)
sns.boxplot(x='hours_per_week', data=mod_data)

sns.histplot(x='domain', data=mod_data)
sns.histplot(x='education_num', data=mod_data, hue="approved")

# Rotate the existing tick labels rather than overwriting them with
# `unique()`, which would mislabel bars if the two orders ever differed.
ax = sns.histplot(x='education_level', data=mod_data, hue="approved")
ax.tick_params(axis='x', labelrotation=90)

ax = sns.catplot(y='capital_gain', x='education_level', hue='approved', data=mod_data, kind='point')
ax.set_xticklabels(rotation=65, horizontalalignment='right')
# Keep untouched copies before scaling/encoding; `dupl_mod_data` is what the
# formula-based models are fitted on.
mod_original = mod_data.copy(deep=True)
dupl_mod_data = mod_data.copy(deep=True)

# Standardize the quantitative columns (z-score: subtract mean, divide by std).
for column in ['age', 'capital_gain', 'capital_loss', 'hours_per_week']:
    mod_data[column] = (mod_data[column] - mod_data[column].mean()) / mod_data[column].std()

# Changing gender to 1 ("Male") and 0 (anything else).
mod_data['gender'] = (mod_data['gender'] == "Male").astype(int)

# Creating dummy variables for all the string variables so that they can be
# used in evaluating SHAP values. The first level of each is dropped.
workclass = pd.get_dummies(mod_data['workclass'], drop_first=True)
education_level = pd.get_dummies(mod_data['education_level'], drop_first=True)
marital_status = pd.get_dummies(mod_data['marital_status'], drop_first=True)
occupation = pd.get_dummies(mod_data['occupation'], drop_first=True)
relationship = pd.get_dummies(mod_data['relationship'], drop_first=True)
inquiry_purpose_code = pd.get_dummies(mod_data['inquiry_purpose_code'], drop_first=True)
institute_type = pd.get_dummies(mod_data['institute_type'], drop_first=True)
account_type = pd.get_dummies(mod_data['account_type'], drop_first=True)
asset_code = pd.get_dummies(mod_data['asset_code'], drop_first=True)
portfolio_type = pd.get_dummies(mod_data['portfolio_type'], drop_first=True)
domain = pd.get_dummies(mod_data['domain'], drop_first=True)

# Replace the raw string columns with their dummy encodings.
mod_data = mod_data.drop(
    ['workclass', 'education_level', 'marital_status', 'occupation', 'relationship',
     'inquiry_purpose_code', 'institute_type', 'account_type', 'asset_code',
     'portfolio_type', 'domain'],
    axis=1,
)
# NOTE: `marital_status` is included here. Previously its dummies were built
# and the raw column dropped, but the dummies were left out of the concat,
# so the feature silently vanished from the model input.
mod_data = pd.concat(
    [mod_data, workclass, education_level, marital_status, occupation,
     relationship, inquiry_purpose_code, institute_type,
     account_type, asset_code, portfolio_type, domain],
    axis=1,
)
part_data = mod_data

# Feature matrix: every column except the target, the identifier and the raw
# columns that were only used to derive other features.
# (This used to be fused into `part_data = mod_dataX = ...`, a chained
# assignment that left `X` undefined.)
X = mod_data.drop(
    columns=['approved', 'user_id', 'email', 'zipcode', 'date_of_birth', 'address'],
    errors='ignore',
)
Y = mod_data['approved']

# Hold out 20% of the rows for the SHAP analysis.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

# Fit a logistic regression on the training rows and explain its predictions
# on the held-out rows, using those same rows as the background data.
masker = shap.maskers.Independent(data=X_test)
model = LogisticRegression(random_state=1, max_iter=1000).fit(X_train, y_train.values.ravel())
explainer = shap.LinearExplainer(model, masker=masker)
shap_values = explainer(X_test)
shap.plots.beeswarm(shap_values)
SHAP values for different independent variables in our model
# Multinomial-logit fit of `approved` on the selected predictors, using the
# `dupl_mod_data` frame; prints the fit summary and the model's AIC.
formula = (
    'approved ~ age + education_num + capital_gain'
    ' + C(inquiry_purpose_code) + C(relationship)  + C(occupation)'
    ' + C(institute_type) + C(marital_status)'
)
model_spec = smf.mnlogit(formula=formula, data=dupl_mod_data)
fit = model_spec.fit()
print(fit.summary())
print("AIC =", fit.aic)
AIC = 25571.174263297573
# Calculating VIF for the quantitative variables.
# (Both the design-matrix call and the `vif` initialisation were previously
# fused onto comment lines and never ran.)
y, X = dmatrices(
    'approved ~ age + education_num + capital_gain + capital_loss',
    data=dupl_mod_data,
    return_type='dataframe',
)

# Calculate VIF for each column of the design matrix.
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['variable'] = X.columns
vif

Business Problem

# Dividing the data: keep the target plus the selected predictors, then split
# into train / validation / test sets.
# (The column list, `part_data` and `X` were previously fused onto the
# comment line and never assigned.)
z_columns = ['approved', 'age', 'education_num', 'capital_gain', 'relationship',
             'occupation', 'institute_type', 'inquiry_purpose_code', 'marital_status']
part_data = dupl_mod_data[z_columns]
X = part_data.loc[:, part_data.columns != 'approved']
Y = part_data['approved']

# 80/20 train/test split, then 80/20 train/validation split of the training part.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# The formula API needs predictors and target in one frame.
XY_train = pd.concat([X_train, y_train], axis=1, join='inner')
formula = 'approved ~ age + education_num + capital_gain + C(inquiry_purpose_code) + C(relationship) + C(occupation) + C(institute_type) + C(marital_status)'
fit = smf.mnlogit(formula=formula, data=XY_train).fit()
fit.summary()
### Using the validation set to find the optimized threshold value ###
# For 100 evenly spaced thresholds in [0, 1], classify each validation row as
# 1 when its predicted probability for class 1 exceeds the threshold, and
# record the resulting true-negative and true-positive rates.
true_negative = []
true_positive = []
threshold = []
expected = list(y_val)
predict = list(fit.predict(X_val)[1])
for i in np.linspace(0, 1, 100):
    predicted = [1 if j > i else 0 for j in predict]

    # scikit-learn puts true labels on rows and predicted labels on columns,
    # with labels in sorted order, so the flattened matrix reads
    # TN, FP, FN, TP. The cells were previously read as TP, FN, FP, TN, which
    # swapped the two rates. `labels` pins the 2x2 shape.
    cm = confusion_matrix(expected, predicted, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Specificity (true negative rate) and sensitivity (true positive rate).
    TNR = TN / (TN + FP)
    TPR = TP / (TP + FN)

    true_negative.append(TNR)
    true_positive.append(TPR)
    threshold.append(i)
# Evaluate on the test set at a fixed probability threshold of 0.2.
expected = list(y_test)
predict = list(fit.predict(X_test)[1])
predicted = [1 if j > 0.2 else 0 for j in predict]

# scikit-learn puts true labels on rows and predicted labels on columns, with
# labels in sorted order, so the flattened matrix reads TN, FP, FN, TP. The
# cells were previously read as TP, FN, FP, TN, which swapped the two
# printed rates. `labels` pins the 2x2 shape.
cm = confusion_matrix(expected, predicted, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
print(TP, FN, FP, TN)

# Specificity (true negative rate) and sensitivity (true positive rate).
TNR = TN / (TN + FP)
TPR = TP / (TP + FN)

print(TNR)
print(TPR)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store