Introduction

Employed data, scripts and a brief description can be found at the original repository. Further references can be found at the page of Prof. Ian Watson.

library(FNN)
library(here)
library(magrittr)
library(tidyverse)

source(here::here("code/calc_KNN_error.R"))

theme_set(theme_bw())

Data Overview

Data has the following attributes:

Make: Make of the car;
Model: Model of the car;
Year: Manufacturing Date;
Engine.Fuel.Type: Kind of fuel the engine runs on;
Engine.HP: Engine HorsePower;
Engine.Cylinders: Number of cylinders in the engine;
Transmission.Type: Type of car transmission;
Driven_Wheels: Wheels added;
Number.of.Doors: Number of doors;
Vehicle.Size: Vehycle size;
Vehicle.Style: Vehycle style;
highway.MPG: Miles per gallon on road;
city.mpg: Miles per gallon on city;
Popularity: Car popularity;
MSRP: Manufacturer’s Suggested Retail Price and target variable.

Loading Data

read_csv(here::here("evidences/msrp.csv"),
         progress = FALSE, 
         col_types =
           cols(
            Make = col_character(),
            Model = col_character(),
            Year = col_integer(),
            `Engine Fuel Type` = col_character(),
            `Engine HP` = col_integer(),
            `Engine Cylinders` = col_integer(),
            `Transmission Type` = col_character(),
            Driven_Wheels = col_character(),
            `Number of Doors` = col_integer(),
            `Market Category` = col_character(),
            `Vehicle Size` = col_character(),
            `Vehicle Style` = col_character(),
            `highway MPG` = col_integer(),
            `city mpg` = col_integer(),
            Popularity = col_integer(),
            MSRP = col_integer()
            )) %>% 
  drop_na() -> car_data

Dummify Categorical Variables

car_data %>%
  mutate(
    Make = as.numeric(factor(Make)),
    Model = as.numeric(factor(Model)),
    `Engine Fuel Type` = as.numeric(factor(`Engine Fuel Type`)),
    `Transmission Type` = as.numeric(factor(`Transmission Type`)),
    Driven_Wheels = as.numeric(factor(Driven_Wheels)),
    `Market Category` = as.numeric(factor(`Market Category`)),
    `Vehicle Size` = as.numeric(factor(`Vehicle Size`)),
    `Vehicle Style` = as.numeric(factor(`Vehicle Style`)))-> car_data 

car_data %>% 
    glimpse()

## Observations: 11,812
## Variables: 16
## $ Make                <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ Model               <dbl> 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ Year                <int> 2011, 2011, 2011, 2011, 2011, 2012, 2012, 20…
## $ `Engine Fuel Type`  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,…
## $ `Engine HP`         <int> 335, 300, 300, 230, 230, 230, 300, 300, 230,…
## $ `Engine Cylinders`  <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ `Transmission Type` <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ Driven_Wheels       <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ `Number of Doors`   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ `Market Category`   <dbl> 38, 67, 64, 67, 63, 67, 67, 64, 63, 63, 64, …
## $ `Vehicle Size`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `Vehicle Style`     <dbl> 9, 7, 9, 9, 7, 9, 7, 9, 7, 7, 9, 9, 7, 7, 9,…
## $ `highway MPG`       <int> 26, 28, 28, 28, 28, 28, 26, 28, 28, 27, 28, …
## $ `city mpg`          <int> 19, 19, 20, 18, 18, 18, 17, 20, 18, 18, 20, …
## $ Popularity          <int> 3916, 3916, 3916, 3916, 3916, 3916, 3916, 39…
## $ MSRP                <int> 46135, 40650, 36350, 29450, 34500, 31200, 44…

Checking for missing values

row.has.na <- apply(car_data, 
                    1,
                    function(x){any(is.na(x))})
noquote(paste('Number of rows with misssing values: ',
            sum(row.has.na)))

## [1] Number of rows with misssing values:  0

Applying scale to predictor variables

num.vars <- sapply(car_data,
                   is.numeric, 
                   simplify=F)

num.vars$MSRP = FALSE
num.vars <- unlist(num.vars)

car_data[num.vars] <- lapply(car_data[num.vars],
                             scale)
car_data %>% 
  sample_n(10)

## # A tibble: 10 x 16
##    Make[,1] Model[,1] Year[,1] `Engine Fuel Ty… `Engine HP`[,1]
##       <dbl>     <dbl>    <dbl>            <dbl>           <dbl>
##  1    0.556    -0.959   0.347           -0.0820          3.40  
##  2    1.54      1.29    0.874           -0.755          -0.453 
##  3    1.54      1.31    0.742           -4.79           -0.0870
##  4    1.47      1.47    0.611            0.591          -0.627 
##  5    1.54      0.827  -0.0476           0.591           0.0137
##  6    0.906     0.354  -0.838            0.591          -0.591 
##  7   -0.984     1.35    0.874            0.591           0.353 
##  8    0.416    -0.760   0.742            0.591          -0.600 
##  9   -0.634    -0.445   0.742            0.591           0.691 
## 10   -0.984     1.05    0.874           -2.10            0.325 
## # … with 11 more variables: `Engine Cylinders`[,1] <dbl>, `Transmission
## #   Type`[,1] <dbl>, Driven_Wheels[,1] <dbl>, `Number of Doors`[,1] <dbl>,
## #   `Market Category`[,1] <dbl>, `Vehicle Size`[,1] <dbl>, `Vehicle
## #   Style`[,1] <dbl>, `highway MPG`[,1] <dbl>, `city mpg`[,1] <dbl>,
## #   Popularity[,1] <dbl>, MSRP <int>

Validation

Split data into training/testing sets

set.seed(101)

## Adding surrogate key to dataframe
car_data$id <- 1:nrow(car_data)

car_data %>% 
  dplyr::sample_frac(.8) -> train

dplyr::anti_join(car_data, 
                 train, 
                 by = 'id') -> test

Dissociate predictors from target variable/surrogate key

train %>% 
  select(-MSRP,-id) -> train.predictors

train %>% 
  select(MSRP, id) -> train.response

test %>% 
  select(-MSRP,-id) -> test.predictors

test %>% 
  select(MSRP, id) -> test.response

Apply K Nearest Neighbor

Calculate accumulated error

results <- data.frame(matrix(ncol = 0, nrow = 10))
results$k <- seq(1,10,1)

accum_err <- c()

for(num in results$k) {

  calc_KNN_error(num,
         train.predictors,
         test.predictors,
         train$id,
         train.response,
         test.response) -> err
  
  accum_err <-c(accum_err, err)
}

results$accum_err <- accum_err

results

##     k    accum_err
## 1   1 4.043619e+11
## 2   2 1.370266e+12
## 3   3 1.051143e+12
## 4   4 1.664537e+12
## 5   5 2.180589e+12
## 6   6 2.517307e+12
## 7   7 2.853242e+12
## 8   8 3.129612e+12
## 9   9 3.263991e+12
## 10 10 3.361216e+12

results %>%
  ggplot(aes(k,accum_err)) +
  geom_point(size = 3,
             alpha = .6) + 
  geom_line() +
  scale_x_continuous(breaks=seq(1,10,1)) +
  labs(y="Accumulated Error", x= "K Value") +
  ggtitle("Accumulated Error by K value")

Results

A smaller K seems to render less accumulated error.
At K = 2, we have an unusual spike in terms of accumulated error, which one could impute on a probable overfit.
K = 1 renders the smallest amount of Accumulated Error.

Case Based Reasoning System for MSRP estimation

Introduction

Data Overview

Loading Data

Dummify Categorical Variables

Checking for missing values

Applying scale to predictor variables

Validation

Split data into training/testing sets

Dissociate predictors from target variable/surrogate key

Apply K Nearest Neighbor

Calculate accumulated error

Results

Benardi Nunes

Case Based Reasoning System for MSRP estimation

Introduction

Data Overview

Loading Data

Dummify Categorical Variables

Checking for missing values

Applying scale to predictor variables

Validation

Split data into training/testing sets

Dissociate predictors from target variable/surrogate key

Apply K Nearest Neighbor

Calculate accumulated error

Results

Benardi Nunes

Classification of candidates in Brazilian elections

Analysis with Regularization on Brazilian elections

Analysis on Brazilian elections

C.E.A.P analysis (suppliers and weekend expenses)

C.E.A.P Analysis

Multivariate logistic regression on speed dating data

Multivariate linear regression on speed dating data

Analysis on MovieLens dataset with bootstrap

Analysis on Github commits (2016-2017)

Case Based Reasoning System for MSRP estimation