Title: Classifying and predicting wine quality using the Support Vector Machine.
#Import the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
Load the Dataset
# Read the red-wine quality CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/winequality-red.csv")
df.head(n=5)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# Check the data: print the row count, each column's non-null count and dtype,
# and the memory usage, to spot missing values or columns of the wrong type.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
# Count the distinct values in every column; the count for "quality" is the
# number of target classes the classifier has to separate.
df.nunique()
fixed acidity 96 volatile acidity 143 citric acid 80 residual sugar 91 chlorides 153 free sulfur dioxide 60 total sulfur dioxide 144 density 436 pH 89 sulphates 96 alcohol 65 quality 6 dtype: int64
Distribution of classes
# Split the wines into the two groups named in the plot labels using the
# "quality" score itself (the original filtered on "residual sugar", which is
# a feature, not the class). A score of 5 or more counts as good quality.
# At most the first 200 rows of each group are kept so the chart stays readable.
good_q = df[df['quality'] >= 5][0:200]
bad_q = df[df['quality'] < 5][0:200]
# plotting the scatter chart: both groups share one set of axes
axes = good_q.plot(kind='scatter', x="pH", y="total sulfur dioxide", color='red', label='good quality')
bad_q.plot(kind='scatter', x="pH", y="total sulfur dioxide", color='blue', label='bad quality', ax=axes)
# showing the plot
plt.show()
/usr/local/lib/python3.9/dist-packages/pandas/plotting/_matplotlib/core.py:1114: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored scatter = ax.scatter(
<Axes: xlabel='pH', ylabel='total sulfur dioxide'>
Identify unwanted columns
# List the dtype of every column to find non-numeric columns that would have
# to be dropped or converted before modeling.
df.dtypes
# There are no unwanted columns: every column is numeric.
fixed acidity float64 volatile acidity float64 citric acid float64 residual sugar float64 chlorides float64 free sulfur dioxide float64 total sulfur dioxide float64 density float64 pH float64 sulphates float64 alcohol float64 quality int64 dtype: object
Split data into Train/Test
from sklearn.model_selection import train_test_split

# Target is the "quality" score; the features are every remaining column.
y = df["quality"]
x = df.drop(columns="quality")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)
x_train.shape
(1279, 11)
# Training labels: one entry per training row.
y_train.shape
(1279,)
# Held-out feature matrix.
x_test.shape
(320, 11)
# Held-out labels: one entry per test row.
y_test.shape
(320,)
Modeling (SVM using scikit-learn)
from sklearn import svm
# Linear-kernel support vector classifier. gamma is not passed because SVC only
# uses it for the 'rbf', 'poly' and 'sigmoid' kernels, so it had no effect here.
# C is the regularization parameter (a larger C means weaker regularization).
classifier = svm.SVC(kernel='linear', C=1.5)
#fit the model
classifier.fit(x_train, y_train)
# predict a quality score for every row of the held-out test set
y_predict = classifier.predict(x_test)
#print the first 40 true and predicted responses
print('actual values:', y_test.values[:40])
print('predicted values:', y_predict[:40])
actual values: [5 6 6 6 6 6 6 5 5 5 6 6 6 6 6 5 6 5 5 5 6 6 5 6 6 6 6 6 6 7 6 6 5 6 5 6 5 7 6 5] predicted values: [5 5 6 6 6 6 6 5 6 5 6 5 5 6 6 5 6 5 5 5 5 6 5 6 5 6 6 6 5 5 5 6 5 6 5 6 5 5 6 5]
# Put the true and predicted quality scores side by side, indexed like y_test.
dfms = pd.DataFrame(data={"actual values": y_test, "predicted values": y_predict})
#Let's compute the differences
# (0 = correct, positive = predicted too high, negative = predicted too low)
dfms["differences"] = dfms["predicted values"] - dfms["actual values"]
dfms
| actual values | predicted values | differences | |
|---|---|---|---|
| 75 | 5 | 5 | 0 |
| 1283 | 6 | 5 | -1 |
| 408 | 6 | 6 | 0 |
| 1281 | 6 | 6 | 0 |
| 1118 | 6 | 6 | 0 |
| ... | ... | ... | ... |
| 890 | 5 | 6 | 1 |
| 146 | 5 | 5 | 0 |
| 1551 | 5 | 5 | 0 |
| 1209 | 7 | 6 | -1 |
| 1220 | 6 | 6 | 0 |
320 rows × 3 columns
Evaluation (Results)
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set. Some classes are never
# predicted, so their precision is undefined; zero_division=0 reports those
# scores as 0.0 (the value scikit-learn falls back to anyway) without emitting
# an UndefinedMetricWarning for each one.
print(classification_report(y_test, y_predict, zero_division=0))
precision recall f1-score support
3 0.00 0.00 0.00 1
4 0.00 0.00 0.00 13
5 0.62 0.78 0.69 140
6 0.56 0.60 0.58 134
7 0.00 0.00 0.00 30
8 0.00 0.00 0.00 2
accuracy 0.59 320
macro avg 0.20 0.23 0.21 320
weighted avg 0.51 0.59 0.55 320
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Saving and Loading the model
#save the existing model to file
import pickle
filename='WINE_QUALITY_MODEL.pkl'
# The context manager closes the file even if pickling fails.
with open(filename, "wb") as model_file:
    pickle.dump(classifier, model_file)
#Load a saved model
# Read back from the same path that was just written, so the round trip does
# not depend on the working directory being /content.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)
#Make predictions
# One wine sample, with values listed in the same column order the model was
# trained on.
sample_values = [5.6, 1.30, 0.006, 12.5, 21.8, 6, 5.9, 0.9, 1.5, 16.8, 5.9]
# Build a single-row frame carrying the training feature names, so the model
# receives a (1, n_features) input and does not warn about missing names.
input_data = pd.DataFrame([sample_values], columns=loaded_model.feature_names_in_)
prediction = loaded_model.predict(input_data)
# A predicted score of 5 or more is reported as good quality.
if prediction[0] >= 5:
    print('The wine is of a good quality')
else:
    print('The wine is of a bad quality')
The wine is of a good quality
/usr/local/lib/python3.9/dist-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but SVC was fitted with feature names warnings.warn(