R programming language resources › Forums › Data manipulation › Useful R helper functions
Tagged: helper functions
- This topic has 2 replies, 2 voices, and was last updated 11 years, 9 months ago by bryan.
- AuthorPosts
- December 2, 2012 at 2:59 am #801bryanParticipant
Post your most useful helper functions to this thread. By posting your function here, you agree to the terms below and in the original post.
Posts cannot be edited once they are submitted, please double check your code before you submit.
December 14, 2012 at 8:02 pm #819nutterbMemberPerhaps this is a bit long for a helper function, but it was something a colleague had wanted to try, and this seemed like a good reason to take a stab at it. The following function is our attempt to consolidate the merging of multiple data frames. There are some functions for this elsewhere, but we’ve not come across one that allows for using varying by arguments across the several merges. The code is posted below. Code is available here
Example codes is available here
multiMerge <- function(datasets, printMergeParam=FALSE, suffix=NULL, minSuffixLength=4, …){
#* multiMerge performs (n-1) merges where n is the number of data frames given
#* in datasets. Merges are performed cumulatively, so that merge1 combines
#* datasets[1] and datasets[2]; merge2 combines datasets[1:2] with datasets[3], etc.
#*
#* datasets: a list of data frames. These will be merged in the order listed.
#* printMergeParam: logical. If TRUE, prints a data frame summarizing the paramters
#* passed to merge()
#* suffix: a character vector the same length of datasets. If NULL (default),
#* the abbreviated data frame names are used
#* minSuffixLength: interger. minimum number of characters to use when
#* defining suffixes, but ignored if the user defines
#* suffixes. Default behavior is for the abbreviated
#* data frame name to be the suffix for all but the
#* first data frame. The first data frame receives no suffix.
#* …: additional arguments to merge.
#* These can be names lists or vectors. The names are the
#* number of the merge. For instance, by=list(‘merge1’=’PATID’)
#* specifies that the first merge is performed by matching
#* on ‘PATID’. Unnamed lists and vectors must be either
#* length 1 (which are recycled) or (length(datasets) – 1)
require(stringr)
#* Determine the names of the data frame objects
dn <- str_replace_all(deparse(substitute(datasets)), “list[(]”, “”)
dn <- str_replace_all(dn, “)”, “”)
dn <- unlist(str_split(dn, “, “))
#* Set error flag to 0 and message to NULL
#* The argLengthError is an indicator that the argsDisp object will need to be changed
#* to accurately display the user supplied arguments when too many are supplied.
error.flag <- 0
error.msg <- “”
argLengthError <- FALSE
#* Determine if any objects in ‘datasets’ are not data frames.
if (!all(sapply(datasets, is.data.frame))){
error.flag <- error.flag + 1
error.msg <- c(error.msg,
str_c(error.flag, “: The following objects in ‘datasets’ are not data frames: “,
str_c(dn[!sapply(datasets, is.data.frame)], collapse=”, “), sep=””))
}
#* Test that the arguments in … are of length 1, or of length(datasets) – 1.
#* arguments should only be recycled if they are unnamed
dotArgs <- list(…)
listArgs <- sapply(dotArgs, is.list)
dotArgs <- lapply(dotArgs, function(x) x <- if (length(x) == 1 && !is.list(x) && is.null(names(x))) rep(x, length(datasets)-1) else x)
dotLength <- sapply(dotArgs, length)
# return(dotArgs)
unNamedArgs <- sapply(lapply(dotArgs, names), is.null)
if (length(unNamedArgs) > 0){
if (any(dotLength[unNamedArgs] != (length(datasets)-1))){
error.flag <- error.flag + 1
argLengthError <- TRUE
error.msg <- c(error.msg,
str_c(error.flag, “: The following arguments in ‘…’ must be of length 1 or “, length(datasets), “: “,
str_c(names(dotArgs)[dotLength != (length(datasets)-1)], collapse=”, “), sep=””))
}
}
#* Check length of suffix argument
if (!is.null(suffix) & length(suffix) != length(datasets)){
error.flag <- error.flag + 1
error.msg <- c(error.msg,
str_c(error.flag, “: ‘suffix’ argument must be of length “, length(datasets), “.”, sep=””))
}
#* Set default suffixes.
# abbrev.names <- abbreviate(dn, minlength=minSuffixLength)
# abbrev.names <- lapply(abbrev.names, function(x) c(“”, str_c(“.”, x, sep=””)))[-1]
# names(abbrev.names) <- 1:length(abbrev.names)
#* Default merge() arguments.
argsMat <- list(by = lapply(datasets[-1], function(x) intersect(names(datasets[[1]]), names(x))),
by.x = rep(NA, length(datasets) – 1),
by.y = rep(NA, length(datasets) – 1),
all = rep(FALSE, length(datasets) – 1),
all.x = rep(NA, length(datasets) – 1),
all.y = rep(NA, length(datasets) – 1),
sort = rep(TRUE, length(datasets) – 1),
# suffixes = abbrev.names,
incomparables = lapply(1:(length(datasets)-1), function(x) NULL))
argsMat <- lapply(argsMat, function(x){ names(x) <- str_c(“merge”, 1:(length(datasets)-1), sep=””); return(x)})
#* Modify Defaults based on user input
changeDefaults <- function(x){
if (x %in% names(dotArgs)){
if (!is.null(names(dotArgs[[x]]))) argsMat[[x]][names(dotArgs[[x]])] <- dotArgs[[x]][names(dotArgs[[x]])]
else if (length(dotArgs[[x]]) == 1 & is.null(names(dotArgs[[x]]))) argsMat[[x]] <- rep(dotArgs[[x]], length(datasets) – 1)
else argsMat[[x]] <- dotArgs[[x]]
}
return(argsMat[[x]])
}
argsMat <- lapply(names(argsMat), changeDefaults)
names(argsMat) <- c(“by”, “by.x”, “by.y”, “all”, “all.x”, “all.y”, “sort”, “incomparables”)
#* When ‘all’ or ‘by’ are specified by the user, they need to override the defaults.
#* These two lines ensure that will happen.
if (“by” %in% names(dotArgs)) argsMat$by.x[names(dotArgs$by)] <- argsMat$by.y[names(dotArgs$by)] <- argsMat$by[names(dotArgs$by)]
if (“all” %in% names(dotArgs)) argsMat$all.x[names(dotArgs$all)] <- argsMat$all.y[names(dotArgs$all)] <- argsMat$all[names(dotArgs$all)]
#* Prepare the argsDisp matrix to display the merge parameters
#* The bindStrings function helps the transition of lists to the matrix
bindStrings <- function(x){
x[sapply(x, is.null)] <- “NULL”
apply(do.call(“rbind”, x), 1, str_c, collapse=”, “)
}
argsDisp <- c(list(x = c(dn[1], rep(“Cum. Merge”, length(datasets)-2)),
y = dn[-1]),
argsMat)
argsDisp[sapply(argsDisp, is.list)] <-
lapply(argsDisp[sapply(argsDisp, is.list)], bindStrings)
argsDisp <-as.data.frame(do.call(“cbind”, argsDisp), stringsAsFactors=FALSE)
argsDisp$by[sapply(argsMat$by, length) == 0] <- “”
argsDisp$by[sapply(argsMat$by.x, length) == 0] <- “”
argsDisp$by[sapply(argsMat$by.y, length) == 0] <- “”
#* Return the error message if there is an error flag.
if (error.flag > 0){
if (argLengthError){
argsDisp <- argsDisp[1:max(dotLength), ]
argsDisp[max(dotLength), !names(argsDisp) %in% names(dotLength)[dotLength != length(datasets)-1]] <- “”
}
if (printMergeParam) print(argsDisp)
stop(str_c(error.msg, collapse=”\n”))
}
if (is.null(suffix)) suffix <- abbreviate(dn, minlength=minSuffixLength)
var.names <- lapply(datasets, names)
var.list <- unlist(var.names)
if (printMergeParam) print(argsDisp)
#* Merge the data
mergedData <- datasets[[1]]
for (i in 1:(length(datasets)-1)){
addArgs <- lapply(argsMat, “[“, i)
addArgs <- addArgs[!sapply(addArgs, function(x) is.na(x) | length(unlist(x)) == 0)]
addArgs <- lapply(addArgs, function(x) x <- if (is.list(x)) unlist(x) else x)
mergedData <- do.call(merge, c(list(x=mergedData, y=datasets[i+1], suffixes=c(“”, str_c(“.”, suffix[i+1], sep=””))), addArgs))
}
return(mergedData)
}
December 23, 2012 at 2:41 am #837bryanParticipantThat’s definitely a task we run into often; anything to make that easier is helpful.
- AuthorPosts
- You must be logged in to reply to this topic.