Graph of data -ProgrammingR header image
Beginner to advanced resources for the R programming language

Useful R helper functions

R programming language resources Forums Data manipulation Useful R helper functions

This topic contains 2 replies, has 2 voices, and was last updated by  bryan 1 year, 7 months ago.

Viewing 3 posts - 1 through 3 (of 3 total)
  • Author
    Posts
  • #801

    bryan
    Member

    Post your most useful helper functions to this thread. By posting your function here, you agree to the terms below and in the original post.

    Posts cannot be edited once they are submitted, please double check your code before you submit.

    #819

    nutterb
    Member

    Perhaps this is a bit long for a helper function, but it was something a colleague had wanted to try, and this seemed like a good reason to take a stab at it.  The following function is our attempt to consolidate the merging of multiple data frames.  There are some functions for this elsewhere, but we’ve not come across one that allows for using varying by arguments across the several merges.  The code is posted below.  Code is available here

    Example codes is available here

    multiMerge <- function(datasets, printMergeParam=FALSE, suffix=NULL, minSuffixLength=4, …){

    #* multiMerge performs (n-1) merges where n is the number of data frames given

    #* in datasets.  Merges are performed cumulatively, so that merge1 combines

    #* datasets[1] and datasets[2];  merge2 combines datasets[1:2] with datasets[3], etc.

    #*

    #* datasets: a list of data frames.  These will be merged in the order listed.

    #* printMergeParam: logical.  If TRUE, prints a data frame summarizing the paramters

    #*                  passed to merge()

    #* suffix: a character vector the same length of datasets.  If NULL (default),

    #*         the abbreviated data frame names are used

    #* minSuffixLength: interger. minimum number of characters to use when

    #*                  defining suffixes, but ignored if the user defines

    #*                  suffixes.  Default behavior is for the abbreviated

    #*                  data frame name to be the suffix for all but the

    #*                  first data frame.  The first data frame receives no suffix.

    #* …: additional arguments to merge.

    #*      These can be names lists or vectors.  The names are the

    #*      number of the merge.  For instance, by=list(‘merge1′=’PATID’)

    #*      specifies that the first merge is performed by matching

    #*      on ‘PATID’.  Unnamed lists and vectors must be either

    #*      length 1 (which are recycled) or (length(datasets) – 1)

    require(stringr)

    #* Determine the names of the data frame objects

    dn <- str_replace_all(deparse(substitute(datasets)), “list[(]“, “”)

    dn <- str_replace_all(dn, “)”, “”)

    dn <- unlist(str_split(dn, “, “))

    #* Set error flag to 0 and message to NULL

    #* The argLengthError is an indicator that the argsDisp object will need to be changed

    #*     to accurately display the user supplied arguments when too many are supplied.

    error.flag <- 0

    error.msg <- “”

    argLengthError <- FALSE

    #* Determine if any objects in ‘datasets’ are not data frames.

    if (!all(sapply(datasets, is.data.frame))){

    error.flag <- error.flag + 1

    error.msg <- c(error.msg,

    str_c(error.flag, “: The following objects in ‘datasets’ are not data frames: “,

    str_c(dn[!sapply(datasets, is.data.frame)], collapse=”, “), sep=”"))

    }

    #* Test that the arguments in … are of length 1, or of length(datasets) – 1.

    #* arguments should only be recycled if they are unnamed

    dotArgs <- list(…)

    listArgs <- sapply(dotArgs, is.list)

    dotArgs <- lapply(dotArgs, function(x) x <- if (length(x) == 1 && !is.list(x)  && is.null(names(x))) rep(x, length(datasets)-1) else x)

    dotLength <- sapply(dotArgs, length)

    #   return(dotArgs)

    unNamedArgs <- sapply(lapply(dotArgs, names), is.null)

    if (length(unNamedArgs) > 0){

    if (any(dotLength[unNamedArgs] != (length(datasets)-1))){

    error.flag <- error.flag + 1

    argLengthError <- TRUE

    error.msg <- c(error.msg,

    str_c(error.flag, “: The following arguments in ‘…’ must be of length 1 or “, length(datasets), “: “,

    str_c(names(dotArgs)[dotLength != (length(datasets)-1)], collapse=”, “), sep=”"))

    }

    }

    #* Check length of suffix argument

    if (!is.null(suffix) & length(suffix) != length(datasets)){

    error.flag <- error.flag + 1

    error.msg <- c(error.msg,

    str_c(error.flag, “: ‘suffix’ argument must be of length “, length(datasets), “.”, sep=”"))

    }

    #* Set default suffixes.

    #   abbrev.names <- abbreviate(dn, minlength=minSuffixLength)

    #   abbrev.names <- lapply(abbrev.names, function(x) c(“”, str_c(“.”, x, sep=”")))[-1]

    #   names(abbrev.names) <- 1:length(abbrev.names)

    #* Default merge() arguments.

    argsMat <- list(by   = lapply(datasets[-1], function(x) intersect(names(datasets[[1]]), names(x))),

    by.x = rep(NA, length(datasets) – 1),

    by.y = rep(NA, length(datasets) – 1),

    all  = rep(FALSE, length(datasets) – 1),

    all.x = rep(NA, length(datasets) – 1),

    all.y = rep(NA, length(datasets) – 1),

    sort = rep(TRUE, length(datasets) – 1),

    #                   suffixes = abbrev.names,

    incomparables = lapply(1:(length(datasets)-1), function(x) NULL))

    argsMat <- lapply(argsMat, function(x){ names(x) <- str_c(“merge”, 1:(length(datasets)-1), sep=”"); return(x)})

    #* Modify Defaults based on user input

    changeDefaults <- function(x){

    if (x %in% names(dotArgs)){

    if (!is.null(names(dotArgs[[x]]))) argsMat[[x]][names(dotArgs[[x]])] <- dotArgs[[x]][names(dotArgs[[x]])]

    else if (length(dotArgs[[x]]) == 1 & is.null(names(dotArgs[[x]]))) argsMat[[x]] <- rep(dotArgs[[x]], length(datasets) – 1)

    else argsMat[[x]] <- dotArgs[[x]]

    }

    return(argsMat[[x]])

    }

    argsMat <- lapply(names(argsMat), changeDefaults)

    names(argsMat) <- c(“by”, “by.x”, “by.y”, “all”, “all.x”, “all.y”, “sort”, “incomparables”)

    #* When ‘all’ or ‘by’ are specified by the user, they need to override the defaults.

    #* These two lines ensure that will happen.

    if (“by” %in% names(dotArgs)) argsMat$by.x[names(dotArgs$by)] <- argsMat$by.y[names(dotArgs$by)] <- argsMat$by[names(dotArgs$by)]

    if (“all” %in% names(dotArgs)) argsMat$all.x[names(dotArgs$all)] <- argsMat$all.y[names(dotArgs$all)] <- argsMat$all[names(dotArgs$all)]

    #* Prepare the argsDisp matrix to display the merge parameters

    #* The bindStrings function helps the transition of lists to the matrix

    bindStrings <- function(x){

    x[sapply(x, is.null)] <- “NULL”

    apply(do.call(“rbind”, x), 1, str_c, collapse=”, “)

    }

    argsDisp <- c(list(x = c(dn[1], rep(“Cum. Merge”, length(datasets)-2)),

    y = dn[-1]),

    argsMat)

    argsDisp[sapply(argsDisp, is.list)] <-

    lapply(argsDisp[sapply(argsDisp, is.list)], bindStrings)

    argsDisp <-as.data.frame(do.call(“cbind”, argsDisp), stringsAsFactors=FALSE)

    argsDisp$by[sapply(argsMat$by, length) == 0] <- “”

    argsDisp$by[sapply(argsMat$by.x, length) == 0] <- “”

    argsDisp$by[sapply(argsMat$by.y, length) == 0] <- “”

    #* Return the error message if there is an error flag.

    if (error.flag > 0){

    if (argLengthError){

    argsDisp <- argsDisp[1:max(dotLength), ]

    argsDisp[max(dotLength), !names(argsDisp) %in% names(dotLength)[dotLength != length(datasets)-1]] <- “”

    }

    if (printMergeParam) print(argsDisp)

    stop(str_c(error.msg, collapse=”\n”))

    }

    if (is.null(suffix)) suffix <- abbreviate(dn, minlength=minSuffixLength)

    var.names <- lapply(datasets, names)

    var.list <- unlist(var.names)

    if (printMergeParam) print(argsDisp)

    #* Merge the data

    mergedData <- datasets[[1]]

    for (i in 1:(length(datasets)-1)){

    addArgs <- lapply(argsMat, “[", i)

    addArgs <- addArgs[!sapply(addArgs, function(x) is.na(x) | length(unlist(x)) == 0)]

    addArgs <- lapply(addArgs, function(x) x <- if (is.list(x)) unlist(x) else x)

    mergedData <- do.call(merge, c(list(x=mergedData, y=datasets[i+1], suffixes=c(“”, str_c(“.”, suffix[i+1], sep=”"))), addArgs))

    }

    return(mergedData)

    }

    #837

    bryan
    Member

    That’s definitely a task we run into often; anything to make that easier is helpful.

Viewing 3 posts - 1 through 3 (of 3 total)

You must be logged in to reply to this topic.