Create Missing Values in mtcars data

# Simulate missing data: pretend the mpg readings for Merc 280,
# Dodge Challenger and Ferrari Dino were recorded with the placeholder
# code -99 instead of a real value.

mtcars$mpg[rownames(mtcars) %in%
             c("Merc 280", "Dodge Challenger", "Ferrari Dino")] <- -99



Change Missing Value Code to NA

# Every cell holding the placeholder code -99 becomes a proper NA
is.na(mtcars) <- mtcars == -99



Identify Missing Values in Data Frame

# count the NA entries in each column, one column at a time
vapply(mtcars, function(column) sum(is.na(column)), numeric(1))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    3    0    0    0    0    0    0    0    0    0    0
# look up the row names (car models) whose mpg entry is NA
row.names(mtcars)[which(is.na(mtcars[["mpg"]]))]
## [1] "Merc 280"         "Dodge Challenger" "Ferrari Dino"



Calculate Mean MPG

mean(mtcars[["mpg"]])   # a single NA in the input makes the whole result NA
## [1] NA
mean(mtcars[["mpg"]], na.rm = TRUE)   # na.rm = TRUE drops the NAs before averaging
## [1] 20.29



Mean Imputation

# Mean imputation: work on a copy and overwrite each missing mpg with the
# mean of the observed (non-missing) mpg values
mtcars.imputed <- mtcars
mtcars.imputed$mpg[is.na(mtcars.imputed$mpg)] <- mean(mtcars$mpg, na.rm = TRUE)

colSums(is.na(mtcars.imputed)) # every column now reports zero NAs
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
summary(mtcars.imputed$mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.4    15.7    20.3    20.3    22.8    33.9



Mean Imputation over every Column

# Create mean imputation function
#
# Replaces every NA in each numeric column with the mean of that column's
# non-missing values.
#
# Args:
#   df:  a data frame. A matrix is also accepted and is processed
#        column-wise with apply(), returning a matrix.
#   ...: unused; kept so existing calls that pass extra arguments still work.
#
# Returns:
#   For a data frame: a data frame with the same columns, column types and
#   row names, where NAs in numeric columns are replaced by the column mean.
#   Non-numeric columns (character, factor, logical, ...) are returned
#   unchanged. A numeric column with no observed values has mean NaN, so its
#   entries become NaN.
#
# Why not apply() on the data frame: apply() first converts a data frame
# with as.matrix(), which returns a matrix instead of a data frame and, when
# any column is non-numeric, turns every column into character so that no
# mean can be computed.
mean.imputation <- function(df, ...) {
  impute_with_mean <- function(values) {
    if (is.numeric(values)) {
      values[is.na(values)] <- mean(values, na.rm = TRUE)
    }
    values
  }

  if (!is.data.frame(df)) {
    # A matrix holds a single type, so column-wise apply() is safe here
    return(apply(df, 2, impute_with_mean))
  }

  # Index-based loop: also works with zero columns or duplicated names
  for (j in which(vapply(df, is.numeric, logical(1)))) {
    df[[j]] <- impute_with_mean(df[[j]])
  }
  df
}

mtcars.imputed <- mean.imputation(mtcars)

colSums(is.na(mtcars.imputed)) # no more missing data
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0



Percentile Imputation

# Percentile imputation: work on a copy and fill each missing mpg with the
# 95th percentile of the observed mpg values
mtcars.imputed <- mtcars
mtcars.imputed$mpg <- replace(
  mtcars.imputed$mpg,
  is.na(mtcars.imputed$mpg),
  quantile(mtcars.imputed$mpg, probs = 0.95, na.rm = TRUE)
)

colSums(is.na(mtcars.imputed)) # every column now reports zero NAs
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
summary(mtcars.imputed$mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.4    15.7    21.0    21.4    26.3    33.9