Brain Stroke Prediction Using the Logistic Regression Classification Algorithm
#Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
Load the dataset
df = pd.read_csv("/content/Brain_stroke full_data.csv")
df.head()
df.shape
(4981, 11)
Data Cleaning
#Check data for missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   gender             4981 non-null   object
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64
 3   heart_disease      4981 non-null   int64
 4   ever_married       4981 non-null   object
 5   work_type          4981 non-null   object
 6   Residence_type     4981 non-null   object
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object
 10  stroke             4981 non-null   int64
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB
df.isnull().sum()
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
#Check the data types
df.dtypes
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
#Describe the data
df.describe()
Visualize the data
Categorical features:
Numerical features:
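The plots themselves are not reproduced in this export. The sketch below shows one way they could be drawn with the seaborn and matplotlib imports above; the column groupings are taken from df.info(), while the specific chart types (count plots and histograms) are an editorial assumption, not the original figures.
#Sketch: count plots for the categorical columns, histograms for the numerical ones
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(20, 4))
for ax, col in zip(axes, categorical_cols):
    sns.countplot(x=col, data=df, ax=ax)  # frequency of each category
plt.tight_layout()
fig, axes = plt.subplots(1, len(numerical_cols), figsize=(15, 4))
for ax, col in zip(axes, numerical_cols):
    sns.histplot(df[col], ax=ax)  # distribution of each numerical feature
plt.tight_layout()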
Defining the feature matrix x and response vector y
#define x and y
feature_cols = ['age','hypertension','heart_disease','avg_glucose_level','bmi']
x = df[feature_cols]
y = df.stroke
x, y
( age hypertension heart_disease avg_glucose_level bmi
0 67 0 1 228.69 36.6
1 80 0 1 105.92 32.5
2 49 0 0 171.23 34.4
3 79 1 0 174.12 24.0
4 81 0 0 186.21 29.0
... ... ... ... ... ...
4976 41 0 0 70.15 29.8
4977 40 0 0 191.15 31.1
4978 45 1 0 95.02 31.8
4979 40 0 0 83.94 30.0
4980 80 1 0 83.75 29.1
[4981 rows x 5 columns],
0 1
1 1
2 1
3 1
4 1
..
4976 0
4977 0
4978 0
4979 0
4980 0
Name: stroke, Length: 4981, dtype: int64)
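Note that feature_cols keeps only the numeric columns, so gender, ever_married, work_type, Residence_type and smoking_status are dropped entirely. As a sketch of an alternative (not used in the rest of this notebook), those categorical columns could be one-hot encoded with pandas and included; the name x_full is hypothetical.
#Sketch: one-hot encode the categorical columns so they can serve as features
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
x_full = pd.get_dummies(df.drop(columns='stroke'), columns=categorical_cols, drop_first=True)
x_full.head()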
#Split x and y into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
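Because strokes make up only about 5% of the records, a stratified split would keep that ratio in both partitions, and a fixed random_state would make the results repeatable. A sketch of that variant (random_state=0 is an arbitrary choice; the rest of this notebook continues with the unstratified split above):
#Sketch: stratified, reproducible split for an imbalanced target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y)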
#Import logistic regression from the scikit-learn module
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#Instantiate the model
logreg = LogisticRegression(solver='lbfgs')
#fit the model with data
logreg.fit(x_train, y_train)
Predict the target variable
#Predict the response values for the test set
predictions = logreg.predict(x_test)
Model performance and accuracy measurements
#Classification accuracy
print("Logistic Regression Model Accuracy:", accuracy_score(y_test, predictions))
Logistic Regression Model Accuracy: 0.9531772575250836
#Null accuracy: accuracy that could be achieved by always predicting the most frequent class
#examine the class distribution of the testing set (using a pandas Series method)
y_test.value_counts()
0    1425
1      70
Name: stroke, dtype: int64
#calculate the percentage of ones
y_test.mean()
0.046822742474916385
#calculate the percentage of zeros
1 - y_test.mean()
0.9531772575250836
#calculate the null accuracy (for a binary classification problem coded as 0/1)
max(y_test.mean(), 1 - y_test.mean())
0.9531772575250836
#Comments: The null accuracy of 95.3% is identical to the model accuracy of 95.3% above, so the model does no better than always predicting the most frequent class.
Compare the Actual and Predicted response values
#print the first 30 true and predicted responses
print('Actual:', y_test.values[0:30])
print('pred:', predictions[0:30])
Actual: [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# Classification accuracy appears to be the easiest metric to understand, but it does not reveal the underlying distribution of response values, and it does not tell what types of errors the classifier is making.
Confusion matrix
from sklearn import metrics
#The first argument is the true values; the second argument is the predicted values.
print(metrics.confusion_matrix(y_test, predictions))
[[1425    0]
 [  70    0]]
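As an optional visual check (not part of the original run), the same matrix can be drawn as a heatmap with the seaborn import above:
#Sketch: confusion matrix as an annotated heatmap
cm = metrics.confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')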
#Using the confusion matrix to compute the classification metrics
#First extract the four cells; the TP, TN, FP, FN values are used in the formulas below
TN, FP, FN, TP = metrics.confusion_matrix(y_test, predictions).ravel()
#CLASSIFICATION ACCURACY
#How often is the classifier correct?
print((TP + TN) / float(TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, predictions))
0.9531772575250836
0.9531772575250836
#Classification Error: Overall, how often is the classifier incorrect?
#Misclassification rate
print((FP + FN) / float(TP + TN + FP + FN))
print(1 - metrics.accuracy_score(y_test, predictions))
0.046822742474916385
0.04682274247491636
#Sensitivity: When the actual value is positive, how often is the prediction correct?
#This is also known as the True Positive Rate or "Recall"
print(TP / float(TP + FN))
print(metrics.recall_score(y_test, predictions))
0.0
0.0
#Specificity: When the actual value is negative, how often is the prediction correct?
print(TN / float(TN + FP))
1.0
#False Positive Rate: When the actual value is negative, how often is the prediction incorrect?
print(FP / float(TN + FP))
0.0
#Precision: How precise is the classifier when predicting positive instances?
print(metrics.precision_score(y_test, predictions))
0.0
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
#F1 score: the harmonic mean of precision and recall
print(metrics.f1_score(y_test, predictions))
0.0
#Combine the score metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1425
           1       0.00      0.00      0.00        70

    accuracy                           0.95      1495
   macro avg       0.48      0.50      0.49      1495
weighted avg       0.91      0.95      0.93      1495
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
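The zero recall for class 1 confirms that, at the default 0.5 threshold, the model never predicts a stroke. One standard remedy is to reweight the classes during training; class_weight='balanced' is a built-in scikit-learn option, though the scores it would produce here are left unstated since this sketch was not run in the original notebook.
#Sketch: penalize errors on the rare stroke class more heavily
logreg_balanced = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=1000)
logreg_balanced.fit(x_train, y_train)
print(classification_report(y_test, logreg_balanced.predict(x_test)))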
#print the first 10 predicted responses
logreg.predict(x_test)[0:10]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
#print the first 10 predicted probabilities of the data
logreg.predict_proba(x_test)[0:10, :]
array([[0.99474427, 0.00525573],
[0.96855397, 0.03144603],
[0.83339639, 0.16660361],
[0.99624725, 0.00375275],
[0.95978916, 0.04021084],
[0.94748635, 0.05251365],
[0.9931908 , 0.0068092 ],
[0.99556548, 0.00443452],
[0.97388737, 0.02611263],
       [0.9709777 , 0.0290223 ]])
logreg.predict_proba(x_test)[0:10, 1]
predictions_prob = logreg.predict_proba(x_test)[:, 1]
#Plot the histogram of the predicted probabilities
%matplotlib inline
plt.rcParams['font.size'] = 14
#histogram
plt.hist(predictions_prob, bins=8)
plt.xlim(0,1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('predicted probability of stroke')
plt.ylabel('Frequency')
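The histogram shows nearly all predicted probabilities sitting far below the default 0.5 cutoff, which is why no strokes are predicted. Sensitivity can be traded for specificity by lowering the decision threshold; a sketch, where the 0.1 cutoff is an illustrative assumption:
#Sketch: classify as stroke whenever the predicted probability exceeds 0.1 instead of 0.5
predictions_low = (predictions_prob > 0.1).astype(int)
print(metrics.confusion_matrix(y_test, predictions_low))
print('Recall at 0.1 threshold:', metrics.recall_score(y_test, predictions_low))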
Area Under the Curve (AUC)
#Store the predicted probabilities for the test set
y_prob = logreg.predict_proba(x_test)
#Keep only the probability of the positive class (stroke = 1)
y_prob_positive = y_prob[:, 1]
y_prob_positive[:10]
array([0.00525573, 0.03144603, 0.16660361, 0.00375275, 0.04021084,
       0.05251365, 0.0068092 , 0.00443452, 0.02611263, 0.0290223 ])
#Calculate fpr, tpr, thresholds
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_positive)
#check the false positive rates
fpr
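The ROC curve itself is not plotted in the original; a minimal sketch using the fpr and tpr arrays just computed:
#Sketch: plot the ROC curve for the stroke classifier
plt.plot(fpr, tpr, label='Logistic Regression')
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC curve')
plt.legend()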
#AUC is useful as a single-number summary of classifier performance, especially when there is high class imbalance.
from sklearn.metrics import roc_auc_score
print(metrics.roc_auc_score(y_test, y_prob_positive))
0.8552380952380952
Saving and loading the trained model
#pickle
#Saving the existing model
import pickle
with open("lr_logistic_Regression_model_stroke_pkl.sav", "wb") as f:
    pickle.dump(logreg, f)
#Load the saved model
with open("lr_logistic_Regression_model_stroke_pkl.sav", "rb") as f:
    loaded_pickle_model = pickle.load(f)
#Make some predictions
loaded_pickle_model.predict(x_test)
array([0, 0, 0, ..., 0, 0, 0])
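For scikit-learn estimators, the joblib library is a commonly recommended alternative to raw pickle. A sketch of the equivalent save and load, where the filename is an assumption:
#Sketch: persist the model with joblib instead of pickle
import joblib
joblib.dump(logreg, "logreg_stroke_model.joblib")
loaded_joblib_model = joblib.load("logreg_stroke_model.joblib")
loaded_joblib_model.predict(x_test)[0:10]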