Title: Classifying and predicting wine quality using the Support Vector Machine.

In [ ]:
#Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

Load the Dataset

In [ ]:
# Load the red-wine quality dataset (1599 samples, 11 physicochemical features + 'quality').
# NOTE(review): "/content/..." is a Colab-local absolute path — TODO: parameterize for portability.
df = pd.read_csv("/content/winequality-red.csv")
df.head()
Out[ ]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [ ]:
# Inspect dtypes and non-null counts: 1599 rows, 12 columns, no missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
In [ ]:
# Distinct values per column — 'quality' has only 6 distinct labels, confirming a classification target
df.nunique()
Out[ ]:
fixed acidity            96
volatile acidity        143
citric acid              80
residual sugar           91
chlorides               153
free sulfur dioxide      60
total sulfur dioxide    144
density                 436
pH                       89
sulphates                96
alcohol                  65
quality                   6
dtype: int64

Distribution of classes

In [ ]:
# Label wines by their quality class. The original cell filtered on
# 'residual sugar' (== 2.0 vs == 9.0) while labelling the points
# "good quality" / "bad quality" — the filters did not match the labels.
# Filter on the actual target column instead; cap each class at 200
# points to keep the scatter readable.
good_q = df[df['quality'] >= 7][:200]
bad_q = df[df['quality'] <= 4][:200]

# Scatter plot: good-quality wines in red, bad-quality in blue, sharing one Axes
axes = good_q.plot(kind='scatter', x="pH", y="total sulfur dioxide", color='red', label='good quality')
bad_q.plot(kind='scatter', x="pH", y="total sulfur dioxide", color='blue', label='bad quality', ax=axes)
/usr/local/lib/python3.9/dist-packages/pandas/plotting/_matplotlib/core.py:1114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  scatter = ax.scatter(
Out[ ]:
<Axes: xlabel='pH', ylabel='total sulfur dioxide'>

Identify unwanted columns

In [ ]:
df.dtypes

# All 11 predictors are float64 and the target 'quality' is int64;
# every column is a usable feature, so nothing needs to be dropped.
Out[ ]:
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

Split data into Train/Test

In [ ]:
# Separate predictors from the target: the 11 physicochemical columns
# become the feature matrix x; the integer quality score is the label vector y.
x = df.drop(columns="quality")
y = df["quality"]
In [ ]:
from sklearn.model_selection import train_test_split
# Hold out 20% for testing; random_state pinned so the split is reproducible
x_train,x_test,y_train,y_test= train_test_split(x, y, test_size =0.2, random_state=1)
In [ ]:
# Sanity check: 80% of 1599 rows -> 1279 training samples, 11 features
x_train.shape
Out[ ]:
(1279, 11)
In [ ]:
# Labels align with the training features: 1279 entries
y_train.shape
Out[ ]:
(1279,)
In [ ]:
# 20% hold-out: 320 test samples, 11 features
x_test.shape
Out[ ]:
(320, 11)
In [ ]:
# 320 test labels, matching x_test
y_test.shape
Out[ ]:
(320,)

Modeling (SVM using scikit-learn)

In [ ]:
from sklearn import svm

# Linear-kernel SVM classifier. `gamma` is only used by the 'rbf', 'poly'
# and 'sigmoid' kernels, so the original gamma='auto' was a no-op with
# kernel='linear' and has been removed. C=1.5 regularizes slightly less
# than the default C=1.0.
# NOTE(review): features are unscaled — SVMs are scale-sensitive, so a
# StandardScaler pipeline would likely improve accuracy; left unchanged
# here to preserve the reported results.
classifier = svm.SVC(kernel='linear', C=1.5)

# Fit on the training split, then predict the held-out test split
classifier.fit(x_train, y_train)
y_predict = classifier.predict(x_test)
In [ ]:
# Print the first 40 true and predicted quality labels for a quick eyeball comparison
print ('actual values:', y_test.values[:40])
print ('predicted values:', y_predict[:40])
actual values: [5 6 6 6 6 6 6 5 5 5 6 6 6 6 6 5 6 5 5 5 6 6 5 6 6 6 6 6 6 7 6 6 5 6 5 6 5
 7 6 5]
predicted values: [5 5 6 6 6 6 6 5 6 5 6 5 5 6 6 5 6 5 5 5 5 6 5 6 5 6 6 6 5 5 5 6 5 6 5 6 5
 5 6 5]
In [ ]:
# Put actual vs. predicted labels side by side for inspection.
# (The original cell was a botched paste: the same three statements were
# duplicated and a junk, unused variable `dfmsdfms` was created — deduplicated.)
dfms = pd.DataFrame(data={"actual values": y_test, "predicted values": y_predict})

# Signed error per sample: 0 = correct, -1/+1 = off by one quality grade, etc.
dfms["differences"] = dfms["predicted values"] - dfms["actual values"]
dfms
Out[ ]:
actual values predicted values differences
75 5 5 0
1283 6 5 -1
408 6 6 0
1281 6 6 0
1118 6 6 0
... ... ... ...
890 5 6 1
146 5 5 0
1551 5 5 0
1209 7 6 -1
1220 6 6 0

320 rows × 3 columns

Evaluation (Results)

In [ ]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1. Minority classes (3, 4, 7, 8) score 0.0 because
# the model never predicts them — that is what triggers the UndefinedMetricWarning
# below; pass `zero_division=0` to classification_report to silence it explicitly.
print(classification_report(y_test,y_predict))
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.62      0.78      0.69       140
           6       0.56      0.60      0.58       134
           7       0.00      0.00      0.00        30
           8       0.00      0.00      0.00         2

    accuracy                           0.59       320
   macro avg       0.20      0.23      0.21       320
weighted avg       0.51      0.59      0.55       320

/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

Saving and Loading the model

In [ ]:
#save the existing model to file
import pickle

filename='WINE_QUALITY_MODEL.pkl'
pickle.dump(classifier, open(filename,"wb"))
In [ ]:
#Load a saved model
# Reload the saved model. Reuse `filename` (defined at save time) instead of the
# hardcoded "/content/..." absolute path, so save and load stay consistent and
# the notebook is portable off Colab. Also close the handle via a context manager.
# SECURITY NOTE: pickle.load executes arbitrary code — only load trusted files.
with open(filename, "rb") as f:
    loaded_model = pickle.load(f)
In [ ]:
#Make predictions

# Predict the quality of a single new wine sample.
# Build a one-row DataFrame whose columns match the training features. The
# original wrapped the list in a column-vector DataFrame, converted it to a
# numpy array and reshaped it to (1, 11) — which worked, but produced the
# "X does not have valid feature names" warning because the names were lost.
feature_values = [5.6, 1.30, 0.006, 12.5, 21.8, 6, 5.9, 0.9, 1.5, 16.8, 5.9]
input_data = pd.DataFrame([feature_values], columns=x.columns)

prediction = loaded_model.predict(input_data)

# NOTE(review): treating quality >= 5 as "good" is a generous cutoff — many
# analyses of this dataset use quality >= 7; confirm the intended threshold.
if prediction[0] >= 5:
    print('The wine is of a good quality')
else:
    print('The wine is of a bad quality')
The wine is of a good quality
/usr/local/lib/python3.9/dist-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but SVC was fitted with feature names
  warnings.warn(