Google Account
olumide peter
oluminepeter222@gmail.com
Code Text
Notebook
Code Text

Brain stroke prediction using the Logistic Regression classification Algorithm

Code Text

#Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
Code Text

Load the dataset

Code Text

# Load the stroke dataset from the Colab runtime.
# NOTE(review): the doubled leading slash works on POSIX but looks like a typo,
# and the filename contains a space — verify the path against the uploaded file.
df= pd.read_csv("//content/Brain_stroke full_data.csv")
Code Text

# Preview the first five rows to sanity-check column names and values.
df.head()
Code Text


# Dataset dimensions as (rows, columns) — 4981 rows x 11 columns per the output below.
df.shape
(4981, 11)
Code Text

Data Cleaning

Code Text

#Check data for missing values and column dtypes; the output shows every
#column is fully populated (4981 non-null) so no imputation is needed.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB
Code Text

# Count missing values per column (all zero for this dataset, confirming df.info()).
df.isnull().sum()
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64
Code Text

gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
Code Text

#Describe the data: count, mean, std, min/max and quartiles for the numeric columns.
df.describe()
Code Text

Visualize the data

Code Text

Categorical features:

  1. Gender
  2. Marital status (Ever married)
  3. Residence type
  4. Smoking status
  5. Work type
  6. Hypertension (binary 0/1)
  7. Heart disease (binary 0/1)

Numerical features:

  1. Age
  2. avg_glucose_level
  3. bmi
Code Text

Defining the feature matrix x and response vector y

Code Text

#Define the feature matrix x (numeric predictors only) and the response vector y.
feature_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
x = df[feature_cols]
y = df['stroke']
x, y



(      age  hypertension  heart_disease  avg_glucose_level   bmi
 0      67             0              1             228.69  36.6
 1      80             0              1             105.92  32.5
 2      49             0              0             171.23  34.4
 3      79             1              0             174.12  24.0
 4      81             0              0             186.21  29.0
 ...   ...           ...            ...                ...   ...
 4976   41             0              0              70.15  29.8
 4977   40             0              0             191.15  31.1
 4978   45             1              0              95.02  31.8
 4979   40             0              0              83.94  30.0
 4980   80             1              0              83.75  29.1
 
 [4981 rows x 5 columns], 0       1
 1       1
 2       1
 3       1
 4       1
        ..
 4976    0
 4977    0
 4978    0
 4979    0
 4980    0
 Name: stroke, Length: 4981, dtype: int64)
Code Text

#Split x and y into training and testing sets (70% / 30%).
#random_state makes the split reproducible run-to-run; stratify keeps the
#~4.7% stroke rate identical in train and test, which matters with this
#heavy class imbalance.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
Code Text

#Import the logistic regression classifier from the scikit-learn module
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Instantiate the model. max_iter is raised from the default 100 because lbfgs
#frequently fails to converge on unscaled features such as avg_glucose_level.
#NOTE(review): on this imbalanced target the fitted model predicts the majority
#class for every row (see the confusion matrix below) — consider
#class_weight='balanced' and feature scaling.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)

#Fit the model with the training data.
logreg.fit(x_train, y_train)
Code Text

Predict the target variable

Code Text

Code Text

Model performance and Accuracy measurements

Code Text

#Generate hard class predictions for the test set.
#(`predictions` is not defined anywhere in the visible notebook — the predict
#cell appears to be missing — so define it here to keep this cell runnable.)
predictions = logreg.predict(x_test)

#Classification accuracy: fraction of test rows predicted correctly.
print("Logistic Regression Model Accuracy:", accuracy_score(y_test, predictions))
Logistic Regression Model Accuracy: 0.9531772575250836
Code Text

#Null accuracy: accuracy that could be achieved by always predicting the most frequent class

#examine the class distribution of the testing set (using a pandas Series method);
#the output shows the test set is heavily imbalanced (~95% class 0)
y_test.value_counts()
0    1425
1      70
Name: stroke, dtype: int64
Code Text

#calculate the percentage of ones (mean of a 0/1 column equals the positive-class rate)
y_test.mean()
0.046822742474916385
Code Text

0.9531772575250836
Code Text

#Comments: the model accuracy (95.3%) is exactly equal to the null accuracy (95.3%). That means the classifier is simply predicting the majority class (no stroke) for every row and adds no value over always guessing "no stroke" — despite the high accuracy, this is a poor model for detecting strokes.
Code Text

#Null accuracy for a binary target coded as 0/1: the share of the majority class.
positive_rate = y_test.mean()
max(positive_rate, 1 - positive_rate)
0.9531772575250836
Code Text

Compare the Actual and Predicted response values

Code Text

#Show the first 30 true labels next to the model's predicted labels.
n_show = 30
print ('Actual:', y_test.values[:n_show])
print ('pred:', predictions[:n_show])

Actual: [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Code Text

# Classification accuracy appears to be the easiest metric to understand, but it does not reveal the underlying distribution of response values, nor what types of errors the classifier model is making.
Code Text

Confusion matrix


from sklearn import metrics

#Confusion matrix: rows are true labels, columns are predicted labels.
#metrics.confusion_matrix(y_true, y_pred) — true values first, predictions second.
conf_mat = metrics.confusion_matrix(y_test, predictions)
print (conf_mat)
[[1425    0]
 [  70    0]]

Code Text

Code Text

#Using the confusion matrix to compute the classification metrics.
#Derive the four cell counts; sklearn's 2x2 matrix ravels as TN, FP, FN, TP.
#(These names are not defined anywhere in the visible notebook — the
#extraction cell appears to be missing — so define them here.)
TN, FP, FN, TP = metrics.confusion_matrix(y_test, predictions).ravel()

#CLASSIFICATION ACCURACY
#How often is the classifier correct?
print ((TP + TN) / (float(TP + TN + FP + FN)))
print (metrics.accuracy_score(y_test, predictions))

0.9531772575250836
0.9531772575250836
Code Text

#Classification Error (misclassification rate): overall, how often is the
#classifier incorrect? Equivalent to 1 - accuracy.
total = float(TP + TN + FP + FN)
print ((FP + FN) / total)
print (1 - metrics.accuracy_score(y_test, predictions))
0.046822742474916385
0.04682274247491636
Code Text

#Sensitivity (True Positive Rate, a.k.a. "Recall"): how well the classifier
#detects the actual positive (stroke) cases.

print (TP / float(TP + FN))
print (metrics.recall_score(y_test, predictions))
0.0
0.0
Code Text

#Specificity: when the actual value is negative, how often is the prediction correct?
print (TN / float(TN + FP))
1.0

#False Positive Rate: when the actual value is negative, how often is the
#prediction incorrect? Equals 1 - specificity.
print (FP / float(TN + FP))
0.0

#Precision: how precise is the classifier when predicting positive instances?
#zero_division=0 returns an explicit 0.0 instead of raising the
#UndefinedMetricWarning seen below when the model predicts no positives at all.

print (metrics.precision_score(y_test, predictions, zero_division=0))
0.0
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

#F1 score: harmonic mean of precision and recall.
#zero_division=0 silences the ill-defined-metric warning when no positives are predicted.
print (metrics.f1_score(y_test, predictions, zero_division=0))

0.0

#Combine the score metrics into a single per-class report.
from sklearn.metrics import classification_report

#zero_division=0 keeps the report warning-free when a class receives no predictions.
print(classification_report(y_test, predictions, zero_division=0))
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1425
           1       0.00      0.00      0.00        70

    accuracy                           0.95      1495
   macro avg       0.48      0.50      0.49      1495
weighted avg       0.91      0.95      0.93      1495

/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Code Text

#print the first 10 predicted responses (hard 0/1 class labels)
logreg.predict(x_test)[0:10]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#print the first 10 predicted probabilities; columns are [P(class 0), P(class 1)] per row
logreg.predict_proba(x_test)[0:10, :]
array([[0.99474427, 0.00525573],
       [0.96855397, 0.03144603],
       [0.83339639, 0.16660361],
       [0.99624725, 0.00375275],
       [0.95978916, 0.04021084],
       [0.94748635, 0.05251365],
       [0.9931908 , 0.0068092 ],
       [0.99556548, 0.00443452],
       [0.97388737, 0.02611263],
       [0.9709777 , 0.0290223 ]])
Code Text

# Keep only the probability of the positive class (stroke = 1): column 1.
logreg.predict_proba(x_test)[0:10, 1]
predictions_prob = logreg.predict_proba(x_test)[:, 1]

#Plot the histogram of the predicted probabilities
%matplotlib inline
plt.rcParams['font.size'] = 14

#histogram — most probabilities cluster near 0, reflecting the class imbalance
plt.hist(predictions_prob, bins=8)
plt.xlim(0,1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('predicted probability of stroke')
plt.ylabel('Frequency')
Code Text

Area Under the Curve(AUC)

Code Text

(array([[0.99474427, 0.00525573],
        [0.96855397, 0.03144603],
        [0.83339639, 0.16660361],
        [0.99624725, 0.00375275],
        [0.95978916, 0.04021084],
        [0.94748635, 0.05251365],
        [0.9931908 , 0.0068092 ],
        [0.99556548, 0.00443452],
        [0.97388737, 0.02611263],
        [0.9709777 , 0.0290223 ]]), 1495)

#Full (n_samples, 2) probability matrix for the test set.
#(`y_prob` is not defined anywhere in the visible notebook — the cell that
#created it appears to be missing — so define it here to keep this runnable.)
y_prob = logreg.predict_proba(x_test)

#Probability of the positive class (stroke = 1) for every test row.
y_prob_positive = y_prob[:,1]
y_prob_positive[:10]
array([0.00525573, 0.03144603, 0.16660361, 0.00375275, 0.04021084,
       0.05251365, 0.0068092 , 0.00443452, 0.02611263, 0.0290223 ])
Code Text

from sklearn.metrics import roc_curve

#Calculate fpr, tpr, thresholds.
#BUG FIX: roc_curve(y_true, y_score) expects the TRUE labels first. The
#original passed the hard class predictions as y_true, which triggered the
#"No positive samples in y_true" warning below and produced a meaningless
#curve. Score the positive-class probabilities against y_test instead.
fpr, tpr, thresholds = roc_curve(y_test, y_prob_positive)

#check the false positive rates
fpr
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_ranking.py:1029: UndefinedMetricWarning: No positive samples in y_true, true positive value should be meaningless
  warnings.warn(
array([0.00000000e+00, 6.68896321e-04, 1.00000000e+00])
Code Text

#AUC is useful as a single number summary of classifier performance. It is useful when there is a high class imbalance.
from sklearn.metrics import roc_auc_score
    
print (metrics.roc_auc_score(y_test, y_prob_positive))
0.8552380952380952
Code Text

Saving and loading the trained model

Code Text

#pickle
#Save the fitted model to disk so it can be reloaded without retraining.
#Use a context manager so the file handle is closed (and the bytes flushed)
#even if pickle.dump raises — the original left the handle open.
import pickle

with open("lr_logistic_Regression_model_stroke_pkl.sav", "wb") as f:
    pickle.dump(logreg, f)
Code Text

#Load the saved model. The context manager closes the handle (the original
#left it open). NOTE: pickle.load is only safe on files you created/trust —
#never unpickle data from an untrusted source.
with open("lr_logistic_Regression_model_stroke_pkl.sav", "rb") as f:
    loaded_pickle_model = pickle.load(f)
Code Text

#Make some predictions with the reloaded model to confirm it round-tripped correctly
loaded_pickle_model.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0])
Code Text