import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn


car_sales= pd.read_csv("car-sales-extended-missing-data.csv")
car_sales.head()


car_sales.tail()


#Check the type of features we have in the datasets
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())

['White' 'Blue' 'Red' 'Green' nan 'Black']
['Honda' 'BMW' 'Toyota' 'Nissan' nan]
[ 4.  5.  3. nan]


len(car_sales)

1000


#Check for missing values
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64


# Check our dataframe again
car_sales=car_sales.dropna(axis=0)


#Let's deal with null values in price using sklearn

len(car_sales)

773


#Check the data types
car_sales.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object


#Convert the datatype of the doors to int and then to object, since its a categorical variable
car_sales['Doors']= car_sales['Doors'].astype('int')


#from int to object 
car_sales['Doors']= car_sales['Doors'].astype('object')
print(car_sales.dtypes)

Make              object
Colour            object
Odometer (KM)    float64
Doors             object
Price            float64
dtype: object


#Check the conversion
car_sales.head()


#Verify if there are still missing values by checking their unique features
print (car_sales['Colour'].unique())
print(car_sales['Make'].unique())
print(car_sales['Doors'].unique())

['White' 'Blue' 'Red' 'Green' 'Black']
['Honda' 'BMW' 'Toyota' 'Nissan']
[4 5 3]


#Separate data into x and y


x= car_sales.drop("Price", axis=1)
y= car_sales["Price"]


x.head()


#Convert the Odometer column to list;this will be needed in deployment.

Odometer_list= list(car_sales["Odometer (KM)"])

odor_list = [int(item) for item in Odometer_list]


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make", "Colour", "Doors"]
one_hot= OneHotEncoder()
transformer= ColumnTransformer([("one_hot", one_hot, categorical_features)],
                                remainder= "passthrough")

transformeed_x = transformer.fit_transform(x)
transformeed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])


#Put it back into a dataframe


x=transformeed_x


#Convert from sparse matrix to dense matrix 
pd.DataFrame(transformeed_x)


dummies= pd.get_dummies(car_sales[["Make","Odometer (KM)","Colour","Doors"]])
dummies

C:\Users\Fresh\ANACONDA 2022\lib\site-packages\pandas\core\algorithms.py:798: FutureWarning: In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)
  uniques = Index(uniques)


#split into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)


from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression


model1 = LinearRegression()
model1.fit(x_train,y_train)
model1.score(x_test, y_test)

0.3367958266145302


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test2 = train_test_split(x,y, test_size = 0.2)


from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

y= car_sales["Price"]
model2=RandomForestRegressor()
model2.fit(x_train, y_train);
model2.score(x_test,y_test2)

0.2658508636419823


#Scoring metrics
from sklearn.metrics import r2_score

#Fill an array with y_test mean
y_test_mean=np.full(len(y_test),y_test.mean())
y_test.mean()

17696.348387096776


r2_score(y_test,y_test_mean) #let's test the mean of the target

0.0


r2_score(y_test,y_test)

1.0


y_preds=model1.predict(x_test)
df= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

df


y_test2_mean=np.full(len(y_test2),y_test2.mean())
y_test2.mean()

17229.929032258064


r2_score(y_test2,y_test2)

1.0


dfr= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

dfr


# mean absolute error
from sklearn.metrics import mean_absolute_error

y_preds=model1.predict(x_test)
mae=mean_absolute_error(y_test, y_preds)
mae

7494.50658157528


df= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})

df
#Let's display the differences
df["differences"]=df["predicted values"] - df["actual values"]
df


y_preds=model2.predict(x_test)
mae=mean_absolute_error(y_test2, y_preds)
mae

6071.929677419354


df2= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

df2


#Let's compute the differences
df2["differences"]=df["predicted values"] - df["actual values"]
df2.head(40)


from sklearn.metrics import mean_squared_error
y_preds=model1.predict(x_test)
mse= mean_squared_error(y_test, y_preds)
mse

92401727.98063374


dfm= pd.DataFrame(data={"actual values": y_test, "predicted values":y_preds})

dfm
#Let's compute the differences
dfm["differences"]=dfm["predicted values"] - dfm["actual values"]
dfm


y_preds=model2.predict(x_test)
mse= mean_squared_error(y_test2, y_preds)
mse

54592436.95330194


dfms= pd.DataFrame(data={"actual values": y_test2, "predicted values":y_preds})

dfms
#Let's compute the differences
dfms["differences"]=dfms["predicted values"] - dfms["actual values"]
dfms


from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
model2=RandomForestRegressor()
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring=None)
np.mean(cv_acc)

0.2402165472512876


#i r2
np.random.seed(42)
cv_acc= cross_val_score(model2, x, y, cv=5, scoring="r2")
r2=np.mean(cv_acc)

r2

0.2402165472512876


#i neg_mean_absolute_error
np.random.seed(42)
cv_msa= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_absolute_error")
neg_mean_absolute_error=np.mean(cv_msa)
neg_mean_absolute_error

-5842.919065437787


#iv: mean_squared_error
np.random.seed(42)
cv_mse= cross_val_score(model2, x, y, cv=5, scoring="neg_mean_squared_error")
neg_mean_squared_error=np.mean(cv_mse)
neg_mean_squared_error

-54045290.54544711


    print(f"R^2: {r2*100:.2f}%")
    print(f"MAE: {neg_mean_absolute_error:.2f}")
    print(f"MSE: {neg_mean_squared_error:.2f}")

R^2: 24.02%
MAE: -5842.92
MSE: -54045290.55


model2=RandomForestRegressor()
model2.fit(x_test, y_preds)
import pickle



#save an existing model to file
filename='CAR_SALES_MODEL.pkl'
pickle.dump(model2, open(filename,"wb"))


#Load a saved model
loaded_model = pickle.load(open("C:/Users/Fresh/3D Objects/Deployment/models/CAR_SALES_MODEL.pkl", "rb"))

	Odometer (KM)	Make_BMW	Make_Honda	Make_Nissan	Make_Toyota	Colour_Black	Colour_Blue	Colour_Green	Colour_Red	Colour_White	Doors_3	Doors_4	Doors_5
0	35431.0	0	1	0	0	0	0	0	0	1	0	1	0
1	192714.0	1	0	0	0	0	1	0	0	0	0	0	1
2	84714.0	0	1	0	0	0	0	0	0	1	0	1	0
3	154365.0	0	0	0	1	0	0	0	0	1	0	1	0
4	181577.0	0	0	1	0	0	1	0	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
994	163322.0	1	0	0	0	0	1	0	0	0	1	0	0
995	35820.0	0	0	0	1	1	0	0	0	0	0	1	0
997	66604.0	0	0	1	0	0	1	0	0	0	0	1	0
998	215883.0	0	1	0	0	0	0	0	0	1	0	1	0
999	248360.0	0	0	0	1	0	1	0	0	0	0	1	0

	actual values	predicted values
775	26308.0	18257.168304
742	18557.0	21079.003364
776	28830.0	20180.793953
934	12216.0	9266.162201
971	24891.0	23449.645993
...	...	...
839	10349.0	16802.529529
970	16416.0	13645.571180
329	17860.0	10113.406725
179	23287.0	25493.458207
22	22489.0	14768.127189

	actual values	predicted values
775	26308.0	18257.168304
742	18557.0	21079.003364
776	28830.0	20180.793953
934	12216.0	9266.162201
971	24891.0	23449.645993
...	...	...
839	10349.0	16802.529529
970	16416.0	13645.571180
329	17860.0	10113.406725
179	23287.0	25493.458207
22	22489.0	14768.127189

	actual values	predicted values	differences
552	7982.0	18257.168304	10275.168304
906	52458.0	21079.003364	-31378.996636
483	12152.0	20180.793953	8028.793953
920	23439.0	9266.162201	-14172.837799
516	8105.0	23449.645993	15344.645993
...	...	...	...
521	13707.0	16802.529529	3095.529529
853	7350.0	13645.571180	6295.571180
761	14432.0	10113.406725	-4318.593275
601	11162.0	25493.458207	14331.458207
593	27530.0	14768.127189	-12761.872811

	actual values	predicted values
775	26308.0	17165.70
742	18557.0	14897.61
776	28830.0	24081.48
934	12216.0	6471.71
971	24891.0	19346.92
...	...	...
839	10349.0	9431.75
970	16416.0	12309.93
329	17860.0	11553.13
179	23287.0	18385.02
22	22489.0	20253.07

Machine Learning model selection to Predict Car price using the Linear Regression model with the R-squared, MAE, MSE evaluators.¶

Python Libraries: Scikit-learn, Numpy, Pandas¶

2. Load dataset¶

3. Data Cleaning¶

4. Feature Engineering¶

Let's convert the strings (categorical features) to numerical data type¶

Feature Engineering¶

5. Picking a machine learning model for a regresion problem¶

5.2 Let's fit the model¶

6. Regression model evaluation¶

Evaluating the Linear Regression Model using the R^2¶

Testing the Linear Regression Model¶

Evaluating the RandomForestRegression model using the R^2¶

Testing the RandomForestRegression Model¶

Evaluating the Linear Regression model using the MAE¶

Evaluating the RandomForesRegression model using the MAE¶

Evaluating the RandomForestRegression model¶

For the Linear Regression model¶

For the RandomForestRegression model¶

	Make	Colour	Odometer (KM)	Doors	Price
0	Honda	White	35431.0	4.0	15323.0
1	BMW	Blue	192714.0	5.0	19943.0
2	Honda	White	84714.0	4.0	28343.0
3	Toyota	White	154365.0	4.0	13434.0
4	Nissan	Blue	181577.0	3.0	14043.0

	Make	Colour	Odometer (KM)	Doors	Price
995	Toyota	Black	35820.0	4.0	32042.0
996	NaN	White	155144.0	3.0	5716.0
997	Nissan	Blue	66604.0	4.0	31570.0
998	Honda	White	215883.0	4.0	4001.0
999	Toyota	Blue	248360.0	4.0	12732.0

	actual values	predicted values	differences
775	26308.0	17165.70	-8297.267116
742	18557.0	14897.61	NaN
776	28830.0	24081.48	NaN
934	12216.0	6471.71	NaN
971	24891.0	19346.92	NaN
162	22616.0	9706.77	-5926.609770
795	20503.0	19810.98	11279.540697
745	20845.0	18344.57	NaN
192	13106.0	14503.97	NaN
180	12398.0	18457.56	4397.260733
92	6481.0	19394.26	NaN
472	10196.0	10313.29	NaN
820	5145.0	8816.48	NaN
35	8738.0	14568.46	NaN
571	21236.0	14670.81	NaN
909	26940.0	14865.85	NaN
46	25196.0	11045.38	NaN
923	13546.0	8480.18	NaN
880	4606.0	13966.15	NaN
79	18310.0	10713.81	NaN
321	17035.0	9539.76	NaN
506	21326.0	20563.77	NaN
373	4753.0	17438.27	NaN
783	15967.0	22999.59	2309.393244
16	34465.0	19247.70	NaN
78	6502.0	8051.10	NaN
33	9780.0	12257.64	NaN
636	28276.0	20952.74	NaN
295	10872.0	10696.81	NaN
788	30323.0	24677.38	NaN
531	14442.0	9201.76	NaN
723	12901.0	11229.04	NaN
612	16774.0	15356.29	NaN
623	8870.0	12709.26	NaN
921	8321.0	13585.46	NaN
711	20219.0	26576.41	NaN
695	16285.0	17782.24	NaN
979	17940.0	22303.10	NaN
654	16867.0	10557.90	NaN
5	23883.0	19490.10	-7969.179811

	actual values	predicted values	differences
775	26308.0	17165.70	-9142.30
742	18557.0	14897.61	-3659.39
776	28830.0	24081.48	-4748.52
934	12216.0	6471.71	-5744.29
971	24891.0	19346.92	-5544.08
...	...	...	...
839	10349.0	9431.75	-917.25
970	16416.0	12309.93	-4106.07
329	17860.0	11553.13	-6306.87
179	23287.0	18385.02	-4901.98
22	22489.0	20253.07	-2235.93