Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
car_sales= pd.read_csv("car-sales-extended-missing-data.csv")
car_sales.head()
| Make | Colour | Odometer (KM) | Doors | Price | |
|---|---|---|---|---|---|
| 0 | Honda | White | 35431.0 | 4.0 | 15323.0 |
| 1 | BMW | Blue | 192714.0 | 5.0 | 19943.0 |
| 2 | Honda | White | 84714.0 | 4.0 | 28343.0 |
| 3 | Toyota | White | 154365.0 | 4.0 | 13434.0 |
| 4 | Nissan | Blue | 181577.0 | 3.0 | 14043.0 |
car_sales.tail()
| Make | Colour | Odometer (KM) | Doors | Price | |
|---|---|---|---|---|---|
| 995 | Toyota | Black | 35820.0 | 4.0 | 32042.0 |
| 996 | NaN | White | 155144.0 | 3.0 | 5716.0 |
| 997 | Nissan | Blue | 66604.0 | 4.0 | 31570.0 |
| 998 | Honda | White | 215883.0 | 4.0 | 4001.0 |
| 999 | Toyota | Blue | 248360.0 | 4.0 | 12732.0 |
#Check the type of features we have in the datasets
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())
['White' 'Blue' 'Red' 'Green' nan 'Black'] ['Honda' 'BMW' 'Toyota' 'Nissan' nan] [ 4. 5. 3. nan]
len(car_sales)
1000
#Check for missing values
car_sales.isna().sum()
Make 49 Colour 50 Odometer (KM) 50 Doors 50 Price 50 dtype: int64
# Check our dataframe again
car_sales=car_sales.dropna(axis=0)
#Let's deal with null values in price using sklearn
len(car_sales)
773
#Check the data types
car_sales.dtypes
Make object Colour object Odometer (KM) float64 Doors float64 Price float64 dtype: object
#Convert the datatype of the doors to int and then to object, since its a categorical variable
car_sales['Doors']= car_sales['Doors'].astype('int')
#from int to object
car_sales['Doors']= car_sales['Doors'].astype('object')
print(car_sales.dtypes)
Make object Colour object Odometer (KM) float64 Doors object Price float64 dtype: object
#Check the conversion
car_sales.head()
| Make | Colour | Odometer (KM) | Doors | Price | |
|---|---|---|---|---|---|
| 0 | Honda | White | 35431.0 | 4 | 15323.0 |
| 1 | BMW | Blue | 192714.0 | 5 | 19943.0 |
| 2 | Honda | White | 84714.0 | 4 | 28343.0 |
| 3 | Toyota | White | 154365.0 | 4 | 13434.0 |
| 4 | Nissan | Blue | 181577.0 | 3 | 14043.0 |
#Verify if there are still missing values by checking their unique features
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())
['White' 'Blue' 'Red' 'Green' 'Black'] ['Honda' 'BMW' 'Toyota' 'Nissan'] [4 5 3]
#Separate data into x and y
x= car_sales.drop("Price", axis=1)
y= car_sales["Price"]
x.head()
| Make | Colour | Odometer (KM) | Doors | |
|---|---|---|---|---|
| 0 | Honda | White | 35431.0 | 4 |
| 1 | BMW | Blue | 192714.0 | 5 |
| 2 | Honda | White | 84714.0 | 4 |
| 3 | Toyota | White | 154365.0 | 4 |
| 4 | Nissan | Blue | 181577.0 | 3 |
#Convert the Odometer column to list;this will be needed in deployment.
Odometer_list= list(car_sales["Odometer (KM)"])
odor_list = [int(item) for item in Odometer_list]
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features=["Make", "Colour", "Doors"]
one_hot= OneHotEncoder()
transformer= ColumnTransformer([("one_hot", one_hot, categorical_features)],
remainder= "passthrough")
transformeed_x = transformer.fit_transform(x)
transformeed_x
array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 3.54310e+04],
[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
1.00000e+00, 1.92714e+05],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 8.47140e+04],
...,
[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
0.00000e+00, 6.66040e+04],
[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 2.15883e+05],
[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
0.00000e+00, 2.48360e+05]])
#Put it back into a dataframe
x=transformeed_x
#Convert from sparse matrix to dense matrix
pd.DataFrame(transformeed_x)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 35431.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 192714.0 |
| 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 84714.0 |
| 3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 154365.0 |
| 4 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 181577.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 768 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 163322.0 |
| 769 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 35820.0 |
| 770 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 66604.0 |
| 771 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 215883.0 |
| 772 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 248360.0 |
773 rows × 13 columns
Alternatively,
dummies= pd.get_dummies(car_sales[["Make","Odometer (KM)","Colour","Doors"]])
dummies
C:\Users\Fresh\ANACONDA 2022\lib\site-packages\pandas\core\algorithms.py:798: FutureWarning: In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior) uniques = Index(uniques)
| Odometer (KM) | Make_BMW | Make_Honda | Make_Nissan | Make_Toyota | Colour_Black | Colour_Blue | Colour_Green | Colour_Red | Colour_White | Doors_3 | Doors_4 | Doors_5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 35431.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 192714.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 84714.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3 | 154365.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 4 | 181577.0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | 163322.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 995 | 35820.0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 997 | 66604.0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 998 | 215883.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 999 | 248360.0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
773 rows × 13 columns
Split data into training and testing
#split into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
5.1 Linear Regression model
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
model1.fit(x_train,y_train)
model1.score(x_test, y_test)
0.3367958266145302
5.3 RandomForestRegressor
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test2 = train_test_split(x,y, test_size = 0.2)
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
y= car_sales["Price"]
model2=RandomForestRegressor()
model2.fit(x_train, y_train);
model2.score(x_test,y_test2)
0.2658508636419823
6.1 Metric Functions
6.1.1 R^2
#Scoring metrics
from sklearn.metrics import r2_score
#Fill an array with y_test mean
y_test_mean=np.full(len(y_test),y_test.mean())
y_test.mean()
17696.348387096776
r2_score(y_test,y_test_mean) #let's test the mean of the target
0.0
r2_score(y_test,y_test)
1.0
Comments: This informs that the score function above indicating the coefficient of determination tells us how closely our model is close to a perfect for a perfect prediction
y_preds=model1.predict(x_test)
df= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})
df
| actual values | predicted values | |
|---|---|---|
| 775 | 26308.0 | 18257.168304 |
| 742 | 18557.0 | 21079.003364 |
| 776 | 28830.0 | 20180.793953 |
| 934 | 12216.0 | 9266.162201 |
| 971 | 24891.0 | 23449.645993 |
| ... | ... | ... |
| 839 | 10349.0 | 16802.529529 |
| 970 | 16416.0 | 13645.571180 |
| 329 | 17860.0 | 10113.406725 |
| 179 | 23287.0 | 25493.458207 |
| 22 | 22489.0 | 14768.127189 |
155 rows × 2 columns
y_test2_mean=np.full(len(y_test2),y_test2.mean())
y_test2.mean()
17229.929032258064
r2_score(y_test2,y_test2)
1.0
dfr= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})
dfr
| actual values | predicted values | |
|---|---|---|
| 775 | 26308.0 | 18257.168304 |
| 742 | 18557.0 | 21079.003364 |
| 776 | 28830.0 | 20180.793953 |
| 934 | 12216.0 | 9266.162201 |
| 971 | 24891.0 | 23449.645993 |
| ... | ... | ... |
| 839 | 10349.0 | 16802.529529 |
| 970 | 16416.0 | 13645.571180 |
| 329 | 17860.0 | 10113.406725 |
| 179 | 23287.0 | 25493.458207 |
| 22 | 22489.0 | 14768.127189 |
155 rows × 2 columns
4.1.2 Mean absolute error (MAE)
# mean absolute error
from sklearn.metrics import mean_absolute_error
y_preds=model1.predict(x_test)
mae=mean_absolute_error(y_test, y_preds)
mae
7494.50658157528
df= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})
df
#Let's display the differences
df["differences"]=df["predicted values"] - df["actual values"]
df
| actual values | predicted values | differences | |
|---|---|---|---|
| 552 | 7982.0 | 18257.168304 | 10275.168304 |
| 906 | 52458.0 | 21079.003364 | -31378.996636 |
| 483 | 12152.0 | 20180.793953 | 8028.793953 |
| 920 | 23439.0 | 9266.162201 | -14172.837799 |
| 516 | 8105.0 | 23449.645993 | 15344.645993 |
| ... | ... | ... | ... |
| 521 | 13707.0 | 16802.529529 | 3095.529529 |
| 853 | 7350.0 | 13645.571180 | 6295.571180 |
| 761 | 14432.0 | 10113.406725 | -4318.593275 |
| 601 | 11162.0 | 25493.458207 | 14331.458207 |
| 593 | 27530.0 | 14768.127189 | -12761.872811 |
155 rows × 3 columns
y_preds=model2.predict(x_test)
mae=mean_absolute_error(y_test2, y_preds)
mae
6071.929677419354
df2= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})
df2
| actual values | predicted values | |
|---|---|---|
| 775 | 26308.0 | 17165.70 |
| 742 | 18557.0 | 14897.61 |
| 776 | 28830.0 | 24081.48 |
| 934 | 12216.0 | 6471.71 |
| 971 | 24891.0 | 19346.92 |
| ... | ... | ... |
| 839 | 10349.0 | 9431.75 |
| 970 | 16416.0 | 12309.93 |
| 329 | 17860.0 | 11553.13 |
| 179 | 23287.0 | 18385.02 |
| 22 | 22489.0 | 20253.07 |
155 rows × 2 columns
#Let's compute the differences
df2["differences"]=df["predicted values"] - df["actual values"]
df2.head(40)
| actual values | predicted values | differences | |
|---|---|---|---|
| 775 | 26308.0 | 17165.70 | -8297.267116 |
| 742 | 18557.0 | 14897.61 | NaN |
| 776 | 28830.0 | 24081.48 | NaN |
| 934 | 12216.0 | 6471.71 | NaN |
| 971 | 24891.0 | 19346.92 | NaN |
| 162 | 22616.0 | 9706.77 | -5926.609770 |
| 795 | 20503.0 | 19810.98 | 11279.540697 |
| 745 | 20845.0 | 18344.57 | NaN |
| 192 | 13106.0 | 14503.97 | NaN |
| 180 | 12398.0 | 18457.56 | 4397.260733 |
| 92 | 6481.0 | 19394.26 | NaN |
| 472 | 10196.0 | 10313.29 | NaN |
| 820 | 5145.0 | 8816.48 | NaN |
| 35 | 8738.0 | 14568.46 | NaN |
| 571 | 21236.0 | 14670.81 | NaN |
| 909 | 26940.0 | 14865.85 | NaN |
| 46 | 25196.0 | 11045.38 | NaN |
| 923 | 13546.0 | 8480.18 | NaN |
| 880 | 4606.0 | 13966.15 | NaN |
| 79 | 18310.0 | 10713.81 | NaN |
| 321 | 17035.0 | 9539.76 | NaN |
| 506 | 21326.0 | 20563.77 | NaN |
| 373 | 4753.0 | 17438.27 | NaN |
| 783 | 15967.0 | 22999.59 | 2309.393244 |
| 16 | 34465.0 | 19247.70 | NaN |
| 78 | 6502.0 | 8051.10 | NaN |
| 33 | 9780.0 | 12257.64 | NaN |
| 636 | 28276.0 | 20952.74 | NaN |
| 295 | 10872.0 | 10696.81 | NaN |
| 788 | 30323.0 | 24677.38 | NaN |
| 531 | 14442.0 | 9201.76 | NaN |
| 723 | 12901.0 | 11229.04 | NaN |
| 612 | 16774.0 | 15356.29 | NaN |
| 623 | 8870.0 | 12709.26 | NaN |
| 921 | 8321.0 | 13585.46 | NaN |
| 711 | 20219.0 | 26576.41 | NaN |
| 695 | 16285.0 | 17782.24 | NaN |
| 979 | 17940.0 | 22303.10 | NaN |
| 654 | 16867.0 | 10557.90 | NaN |
| 5 | 23883.0 | 19490.10 | -7969.179811 |
Comments:From the dataframe (df2) above, viewing the differences, we can see that 90% of our predictions are right with the evaluated model.
4.1.3 Mean Squared error
from sklearn.metrics import mean_squared_error
y_preds=model1.predict(x_test)
mse= mean_squared_error(y_test, y_preds)
mse
92401727.98063374
dfm= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})
dfm
#Let's compute the differences
dfm["differences"]=dfm["predicted values"] - dfm["actual values"]
dfm
| actual values | predicted values | differences | |
|---|---|---|---|
| 552 | 7982.0 | 18257.168304 | 10275.168304 |
| 906 | 52458.0 | 21079.003364 | -31378.996636 |
| 483 | 12152.0 | 20180.793953 | 8028.793953 |
| 920 | 23439.0 | 9266.162201 | -14172.837799 |
| 516 | 8105.0 | 23449.645993 | 15344.645993 |
| ... | ... | ... | ... |
| 521 | 13707.0 | 16802.529529 | 3095.529529 |
| 853 | 7350.0 | 13645.571180 | 6295.571180 |
| 761 | 14432.0 | 10113.406725 | -4318.593275 |
| 601 | 11162.0 | 25493.458207 | 14331.458207 |
| 593 | 27530.0 | 14768.127189 | -12761.872811 |
155 rows × 3 columns
y_preds=model2.predict(x_test)
mse= mean_squared_error(y_test2, y_preds)
mse
54592436.95330194
dfms= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})
dfms
#Let's compute the differences
dfms["differences"]=dfms["predicted values"] - dfms["actual values"]
dfms
| actual values | predicted values | differences | |
|---|---|---|---|
| 775 | 26308.0 | 17165.70 | -9142.30 |
| 742 | 18557.0 | 14897.61 | -3659.39 |
| 776 | 28830.0 | 24081.48 | -4748.52 |
| 934 | 12216.0 | 6471.71 | -5744.29 |
| 971 | 24891.0 | 19346.92 | -5544.08 |
| ... | ... | ... | ... |
| 839 | 10349.0 | 9431.75 | -917.25 |
| 970 | 16416.0 | 12309.93 | -4106.07 |
| 329 | 17860.0 | 11553.13 | -6306.87 |
| 179 | 23287.0 | 18385.02 | -4901.98 |
| 22 | 22489.0 | 20253.07 | -2235.93 |
155 rows × 3 columns
4.2 Scoring parameter
Using some metric function
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
model2=RandomForestRegressor()
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring=None)
np.mean(cv_acc)
0.2402165472512876
i.r2
#i r2
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring="r2")
r2=np.mean(cv_acc)
r2
0.2402165472512876
ii. neg_mean_absolute_error
#i neg_mean_absolute_error
np.random.seed(42)
cv_msa= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_absolute_error")
neg_mean_absolute_error=np.mean(cv_msa)
neg_mean_absolute_error
-5842.919065437787
iii. neg_mean_squared_error
#iv: mean_squared_error
np.random.seed(42)
cv_mse= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_squared_error")
neg_mean_squared_error=np.mean(cv_mse)
neg_mean_squared_error
-54045290.54544711
Evaluate
print(f"R^2: {r2*100:.2f}%")
print(f"MAE: {neg_mean_absolute_error:.2f}")
print(f"MSE: {neg_mean_squared_error:.2f}")
R^2: 24.02% MAE: -5842.92 MSE: -54045290.55
7. Saving and Loading the model
model2=RandomForestRegressor()
model2.fit(x_test, y_preds)
import pickle
#save an existing model to file
filename='CAR_SALES_MODEL.pkl'
pickle.dump(model2, open(filename,"wb"))
#Load a saved model
loaded_model = pickle.load(open("C:/Users/Fresh/3D Objects/Deployment/models/CAR_SALES_MODEL.pkl", "rb"))