Introduction


Employed data, scripts and a brief description can be found at the original repository. Further references can be found at the page of Prof. Ian Watson.

library(FNN)
library(here)
library(magrittr)
library(tidyverse)

source(here::here("code/calc_KNN_error.R"))

theme_set(theme_bw())




Data Overview


Data has the following attributes:

  • Make: Make of the car;
  • Model: Model of the car;
  • Year: Manufacturing Date;
  • Engine.Fuel.Type: Kind of fuel the engine runs on;
  • Engine.HP: Engine HorsePower;
  • Engine.Cylinders: Number of cylinders in the engine;
  • Transmission.Type: Type of car transmission;
  • Driven_Wheels: Wheels added;
  • Number.of.Doors: Number of doors;
  • Vehicle.Size: Vehycle size;
  • Vehicle.Style: Vehycle style;
  • highway.MPG: Miles per gallon on road;
  • city.mpg: Miles per gallon on city;
  • Popularity: Car popularity;
  • MSRP: Manufacturer’s Suggested Retail Price and target variable.


Loading Data

read_csv(here::here("evidences/msrp.csv"),
         progress = FALSE, 
         col_types =
           cols(
            Make = col_character(),
            Model = col_character(),
            Year = col_integer(),
            `Engine Fuel Type` = col_character(),
            `Engine HP` = col_integer(),
            `Engine Cylinders` = col_integer(),
            `Transmission Type` = col_character(),
            Driven_Wheels = col_character(),
            `Number of Doors` = col_integer(),
            `Market Category` = col_character(),
            `Vehicle Size` = col_character(),
            `Vehicle Style` = col_character(),
            `highway MPG` = col_integer(),
            `city mpg` = col_integer(),
            Popularity = col_integer(),
            MSRP = col_integer()
            )) %>% 
  drop_na() -> car_data 

Dummify Categorical Variables

car_data %>%
  mutate(
    Make = as.numeric(factor(Make)),
    Model = as.numeric(factor(Model)),
    `Engine Fuel Type` = as.numeric(factor(`Engine Fuel Type`)),
    `Transmission Type` = as.numeric(factor(`Transmission Type`)),
    Driven_Wheels = as.numeric(factor(Driven_Wheels)),
    `Market Category` = as.numeric(factor(`Market Category`)),
    `Vehicle Size` = as.numeric(factor(`Vehicle Size`)),
    `Vehicle Style` = as.numeric(factor(`Vehicle Style`)))-> car_data 

car_data %>% 
    glimpse()
## Observations: 11,812
## Variables: 16
## $ Make                <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ Model               <dbl> 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ Year                <int> 2011, 2011, 2011, 2011, 2011, 2012, 2012, 20…
## $ `Engine Fuel Type`  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,…
## $ `Engine HP`         <int> 335, 300, 300, 230, 230, 230, 300, 300, 230,…
## $ `Engine Cylinders`  <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ `Transmission Type` <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ Driven_Wheels       <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ `Number of Doors`   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ `Market Category`   <dbl> 38, 67, 64, 67, 63, 67, 67, 64, 63, 63, 64, …
## $ `Vehicle Size`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `Vehicle Style`     <dbl> 9, 7, 9, 9, 7, 9, 7, 9, 7, 7, 9, 9, 7, 7, 9,…
## $ `highway MPG`       <int> 26, 28, 28, 28, 28, 28, 26, 28, 28, 27, 28, …
## $ `city mpg`          <int> 19, 19, 20, 18, 18, 18, 17, 20, 18, 18, 20, …
## $ Popularity          <int> 3916, 3916, 3916, 3916, 3916, 3916, 3916, 39…
## $ MSRP                <int> 46135, 40650, 36350, 29450, 34500, 31200, 44…


Checking for missing values

row.has.na <- apply(car_data, 
                    1,
                    function(x){any(is.na(x))})
noquote(paste('Number of rows with misssing values: ',
            sum(row.has.na)))
## [1] Number of rows with misssing values:  0


Applying scale to predictor variables

num.vars <- sapply(car_data,
                   is.numeric, 
                   simplify=F)

num.vars$MSRP = FALSE
num.vars <- unlist(num.vars)

car_data[num.vars] <- lapply(car_data[num.vars],
                             scale)
car_data %>% 
  sample_n(10)
## # A tibble: 10 x 16
##    Make[,1] Model[,1] Year[,1] `Engine Fuel Ty… `Engine HP`[,1]
##       <dbl>     <dbl>    <dbl>            <dbl>           <dbl>
##  1    0.556    -0.959   0.347           -0.0820          3.40  
##  2    1.54      1.29    0.874           -0.755          -0.453 
##  3    1.54      1.31    0.742           -4.79           -0.0870
##  4    1.47      1.47    0.611            0.591          -0.627 
##  5    1.54      0.827  -0.0476           0.591           0.0137
##  6    0.906     0.354  -0.838            0.591          -0.591 
##  7   -0.984     1.35    0.874            0.591           0.353 
##  8    0.416    -0.760   0.742            0.591          -0.600 
##  9   -0.634    -0.445   0.742            0.591           0.691 
## 10   -0.984     1.05    0.874           -2.10            0.325 
## # … with 11 more variables: `Engine Cylinders`[,1] <dbl>, `Transmission
## #   Type`[,1] <dbl>, Driven_Wheels[,1] <dbl>, `Number of Doors`[,1] <dbl>,
## #   `Market Category`[,1] <dbl>, `Vehicle Size`[,1] <dbl>, `Vehicle
## #   Style`[,1] <dbl>, `highway MPG`[,1] <dbl>, `city mpg`[,1] <dbl>,
## #   Popularity[,1] <dbl>, MSRP <int>


Validation


Split data into training/testing sets

set.seed(101)

## Adding surrogate key to dataframe
car_data$id <- 1:nrow(car_data)

car_data %>% 
  dplyr::sample_frac(.8) -> train

dplyr::anti_join(car_data, 
                 train, 
                 by = 'id') -> test

Dissociate predictors from target variable/surrogate key

train %>% 
  select(-MSRP,-id) -> train.predictors

train %>% 
  select(MSRP, id) -> train.response

test %>% 
  select(-MSRP,-id) -> test.predictors

test %>% 
  select(MSRP, id) -> test.response


Apply K Nearest Neighbor


Calculate accumulated error

results <- data.frame(matrix(ncol = 0, nrow = 10))
results$k <- seq(1,10,1)

accum_err <- c()

for(num in results$k) {

  calc_KNN_error(num,
         train.predictors,
         test.predictors,
         train$id,
         train.response,
         test.response) -> err
  
  accum_err <-c(accum_err, err)
}

results$accum_err <- accum_err

results
##     k    accum_err
## 1   1 4.043619e+11
## 2   2 1.370266e+12
## 3   3 1.051143e+12
## 4   4 1.664537e+12
## 5   5 2.180589e+12
## 6   6 2.517307e+12
## 7   7 2.853242e+12
## 8   8 3.129612e+12
## 9   9 3.263991e+12
## 10 10 3.361216e+12
results %>%
  ggplot(aes(k,accum_err)) +
  geom_point(size = 3,
             alpha = .6) + 
  geom_line() +
  scale_x_continuous(breaks=seq(1,10,1)) +
  labs(y="Accumulated Error", x= "K Value") +
  ggtitle("Accumulated Error by K value")


Results


  • A smaller K seems to render less accumulated error.
  • At K = 2, we have an unusual spike in terms of accumulated error, which one could impute on a probable overfit.
  • K = 1 renders the smallest amount of Accumulated Error.