I have a dataset similar to `iris` and need to write a function that deals with outliers in the following way: for each species (`setosa`, `versicolor`, and `virginica`), within each variable (`iris$Sepal.Length`, `iris$Sepal.Width`, `iris$Petal.Length`, and `iris$Petal.Width`), replace values that fall more than 1.5*IQR below the 25th percentile or above the 75th percentile with the nearest fence value, i.e. the 25th percentile - 1.5*IQR or the 75th percentile + 1.5*IQR (depending on whether the value falls below or above the interquartile range). I have been using the following code to achieve this, but it is very repetitive, time-consuming, and error-prone. Also, doing it this way changes the values in the original objects. It would be nice to incorporate arguments into a function that would not only achieve this, but also tell me which values were changed and save all the output into a new data frame instead of changing the values in the original dataset.
# dplyr supplies %>% and the data-frame filter() used below; without it `%>%`
# is undefined and filter() resolves to stats::filter().
library(dplyr)

data(iris)

# Create separate objects containing the data for each species so that the
# outlier fences can be computed within a species rather than across all rows.
setosa <- iris %>%
  filter(Species == "setosa")

versicolor <- iris %>%
  filter(Species == "versicolor")

virginica <- iris %>%
  filter(Species == "virginica")
# Cap outliers in every measurement column of `setosa`.
# For each variable:
#   - qnt holds the 25th and 75th percentiles
#   - H is 1.5 times the interquartile range (IQR)
#   - any value below (25th percentile - H) is replaced by that lower fence
#   - any value above (75th percentile + H) is replaced by that upper fence
# Looping over the column names keeps the column used to build the mask and
# the column being overwritten in sync (the hand-copied version wrote the
# Petal.Width fences into Sepal.Width).
# As before, qnt and H are left holding the values for the last column.
for (var_name in c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")) {
  qnt <- quantile(setosa[[var_name]], probs = c(0.25, 0.75), na.rm = TRUE)
  H <- 1.5 * IQR(setosa[[var_name]], na.rm = TRUE)
  setosa[[var_name]][setosa[[var_name]] < (qnt[1] - H)] <- qnt[1] - H
  setosa[[var_name]][setosa[[var_name]] > (qnt[2] + H)] <- qnt[2] + H
}
# Cap outliers in every measurement column of `versicolor`.
# For each variable: qnt holds the 25th and 75th percentiles, H is 1.5 times
# the IQR; values below (25th percentile - H) are raised to that fence and
# values above (75th percentile + H) are lowered to that fence.
# Looping over the column names guarantees the masked column and the
# overwritten column are the same (the hand-copied version wrote the
# Petal.Width fences into Sepal.Width).
# As before, qnt and H are left holding the values for the last column.
for (var_name in c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")) {
  qnt <- quantile(versicolor[[var_name]], probs = c(0.25, 0.75), na.rm = TRUE)
  H <- 1.5 * IQR(versicolor[[var_name]], na.rm = TRUE)
  versicolor[[var_name]][versicolor[[var_name]] < (qnt[1] - H)] <- qnt[1] - H
  versicolor[[var_name]][versicolor[[var_name]] > (qnt[2] + H)] <- qnt[2] + H
}
# Cap outliers in every measurement column of `virginica`.
# For each variable: qnt holds the 25th and 75th percentiles, H is 1.5 times
# the IQR; values below (25th percentile - H) are raised to that fence and
# values above (75th percentile + H) are lowered to that fence.
# Looping over the column names guarantees the masked column and the
# overwritten column are the same (the hand-copied version wrote the
# Petal.Width fences into Sepal.Width).
# As before, qnt and H are left holding the values for the last column.
for (var_name in c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")) {
  qnt <- quantile(virginica[[var_name]], probs = c(0.25, 0.75), na.rm = TRUE)
  H <- 1.5 * IQR(virginica[[var_name]], na.rm = TRUE)
  virginica[[var_name]][virginica[[var_name]] < (qnt[1] - H)] <- qnt[1] - H
  virginica[[var_name]][virginica[[var_name]] > (qnt[2] + H)] <- qnt[2] + H
}