A flexible way to convert selected values to NA after a dataset has already been loaded into the workspace. It uses type.convert() with its na.strings argument, so several different values can be converted to NA in a single call.

makemeNA(mydf, NAStrings, fixed = TRUE)

Arguments

mydf

The input data.frame.

NAStrings

A single value, or a vector of values, that should be treated as NA. Alternatively, this can be a regular expression (in which case set fixed = FALSE).

fixed

Logical. Defaults to TRUE. Set to FALSE when NAStrings is a regular expression.

Value

A data.frame.

References

http://stackoverflow.com/a/14898521/1270695

Author

Ananda Mahto

Examples

df1 <- structure(list(
  KY27PHY1 = c("4", "5", "5", "4", "-", "4", "2", "3", "5", "-", "4", "3", "3", "5", "5"),
  KY27PHY2 = c("4", "4", "4", "4", "-", "5", "2", "3", "5", "-", "5", "3", "3", "5", "5"),
  KY27PHY3 = c("5", "4", "4", "4", "-", "5", "1", "4", "5", "-", "4", "3", "3", "5", "5")),
  .Names = c("KY27PHY1", "KY27PHY2", "KY27PHY3"),
  row.names = 197:211, class = "data.frame")
df1
#>     KY27PHY1 KY27PHY2 KY27PHY3
#> 197        4        4        5
#> 198        5        4        4
#> 199        5        4        4
#> 200        4        4        4
#> 201        -        -        -
#> 202        4        5        5
#> 203        2        2        1
#> 204        3        3        4
#> 205        5        5        5
#> 206        -        -        -
#> 207        4        5        4
#> 208        3        3        3
#> 209        3        3        3
#> 210        5        5        5
#> 211        5        5        5
makemeNA(df1, "-")
#>     KY27PHY1 KY27PHY2 KY27PHY3
#> 197        4        4        5
#> 198        5        4        4
#> 199        5        4        4
#> 200        4        4        4
#> 201       NA       NA       NA
#> 202        4        5        5
#> 203        2        2        1
#> 204        3        3        4
#> 205        5        5        5
#> 206       NA       NA       NA
#> 207        4        5        4
#> 208        3        3        3
#> 209        3        3        3
#> 210        5        5        5
#> 211        5        5        5
df2 <- data.frame(A = c(1, 2, "-", "not applicable", 5),
                  B = c("not available", 1, 2, 3, 4),
                  C = c("-", letters[1:4]))
df2
#>                A             B C
#> 1              1 not available -
#> 2              2             1 a
#> 3              -             2 b
#> 4 not applicable             3 c
#> 5              5             4 d
makemeNA(df2, "not.*|-", fixed = FALSE)
#>    A  B    C
#> 1  1 NA <NA>
#> 2  2  1    a
#> 3 NA  2    b
#> 4 NA  3    c
#> 5  5  4    d
temp <- structure(
  list(age = c(64.3573, 69.9043, 65.6633, 50.3693, 57.0334, 81.4939, 56.954, 76.9298),
       CALCIUM = c(1.1, 8.1, 8.6, 8.1, 8.7, 1.1, 9.8, 9.1),
       CREATININE = c(NA, 1.1, 0.8, 1.3, 0.8, NA, 1, 0.8),
       GLUCOSE = structure(c(5L, 4L, 3L, 2L, 6L, 6L, 1L, 6L),
                           .Label = c("", "418", "461", "472", "488", "NEG"),
                           class = "factor")),
  .Names = c("age", "CALCIUM", "CREATININE", "GLUCOSE"),
  class = "data.frame", row.names = c(NA, -8L))
temp
#>       age CALCIUM CREATININE GLUCOSE
#> 1 64.3573     1.1         NA     488
#> 2 69.9043     8.1        1.1     472
#> 3 65.6633     8.6        0.8     461
#> 4 50.3693     8.1        1.3     418
#> 5 57.0334     8.7        0.8     NEG
#> 6 81.4939     1.1         NA     NEG
#> 7 56.9540     9.8        1.0
#> 8 76.9298     9.1        0.8     NEG
## Change any value that contains letters to NA
makemeNA(temp, "[A-Za-z]", fixed = FALSE)
#>       age CALCIUM CREATININE GLUCOSE
#> 1 64.3573     1.1         NA     488
#> 2 69.9043     8.1        1.1     472
#> 3 65.6633     8.6        0.8     461
#> 4 50.3693     8.1        1.3     418
#> 5 57.0334     8.7        0.8      NA
#> 6 81.4939     1.1         NA      NA
#> 7 56.9540     9.8        1.0      NA
#> 8 76.9298     9.1        0.8      NA
## Change any exact matches with "NEG" to NA
makemeNA(temp, "NEG")
#>       age CALCIUM CREATININE GLUCOSE
#> 1 64.3573     1.1         NA     488
#> 2 69.9043     8.1        1.1     472
#> 3 65.6633     8.6        0.8     461
#> 4 50.3693     8.1        1.3     418
#> 5 57.0334     8.7        0.8      NA
#> 6 81.4939     1.1         NA      NA
#> 7 56.9540     9.8        1.0      NA
#> 8 76.9298     9.1        0.8      NA
## Change any values that are 3-digit integers to NA
makemeNA(temp, "^[0-9]{3}$", fixed = FALSE)
#>       age CALCIUM CREATININE GLUCOSE
#> 1 64.3573     1.1         NA    <NA>
#> 2 69.9043     8.1        1.1    <NA>
#> 3 65.6633     8.6        0.8    <NA>
#> 4 50.3693     8.1        1.3    <NA>
#> 5 57.0334     8.7        0.8     NEG
#> 6 81.4939     1.1         NA     NEG
#> 7 56.9540     9.8        1.0    <NA>
#> 8 76.9298     9.1        0.8     NEG