#20100812 Pooledstdev # A function for computation of pooled standard deviation per feature #Andrew Winterman #Thank you Heather for all your help and patience with this function. # This function computes the pooled standard deviation # for each feature of a microarray data set. # the pooled standard deviation is: # the square root of the weighted average of the within # group variances. # weighted average = each group is weighted proportional # to its sample size # The input is an LMFit/lmFit object that was made # without specifying contrasts, the name of the reference # individual, and the designation of groups. # Fit<- LMFit(object, design), where object is your MA # object and design is your design matrix. pooledstdev <- function(groups, Fit, ref) { # The Fit entry should be the output of LMFit with the # contrasts argument left unspecified. # groups must be of a list of character vectors, whose entries # are the names of the sources in that group (phenotypes, # experimental treatments, conditions etc.). # The groups should include the name of the reference individual # ref must be the name of the individual chosen as reference for # the make contrasts and design matrix steps. #### Check to make sure that it doesn't cause a problem to have a 0 when others are NA g <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) ) # g will hold within group variances, the i-th # column of g holds the variances of the features # in the i-th group. dgf <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) ) # df will hold the degrees of freedom for each group. v <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) ) # v will hold the within group sum of squares ( i.e. the product of # degrees of freedom and the within group variances). # # for ( i in 1:length(groups) ){ k <- pmatch( groups[[i]], colnames(Fit$coef) ) # which columns of coeficients correspond to # this group if(any( is.na(k) )) #Is anybody NA? { if(groups[[i]][is.na(k)] == ref) { # The reference will give NA, so if the NA is caused by # the reference source, this line will add a column of # zero's to the coefficient matrix. q <- cbind( Fit$coef[, k] , rep(0, nrow(Fit$coef)) ) } else { stop("an entry of the i'th vector in the list is not th name of a source in this experiment.") } } else{ q <- Fit$coef[, k] # the matrix q gets the coefficients from the columns # of the coeficients matrix for this group } g[,i] <- apply(q, 1, var, na.rm = TRUE) # The variances for the set of columns for this group dgf[,i] <- apply( !is.na(q), 1, sum) - 1 # degrees of freedom for each feature for this group v[,i] <- dgf[,i] * g[,i] # v get the within group sum of squares } dgf[dgf <= 0] <- 0 # If the degrees of freedom are zero, we can't # compute a variance, and we should get NA to # reflect this fact. If they are less than one, # we had no data to begin with, and once again we # should get NA. The reasoning behind sending # these to zero and not to NA (which turns out to # be effectively the same thing) is given here: # # We send these df's to 0 to allow the use of # na.rm = TRUE in the summation terms below. As # we continue, we will eventually attempt to # multiply by the degrees of freedom, and then # divide by their sum. If any variances were # somehow calculated for features with either # insufficient degrees of freedom or for features # with missing data - perhaps because of the use # of na.rm = TRUE above - multiplying by their # degrees of freedom, which has now been set to # zero, will eliminate them. If a feature has # missing data in all of the groups, # setting these entries to zero will send the # corresponding feature's pooled variance to # inf or NaN, allowing us an easy way to # keep track of these entries. Note that if any # feature has missing data in less than all of # the groups, a pooled variance will still be # calculated using those groups for which there # is data. totaldf <- apply(dgf, 1, sum, na.rm = TRUE) SS <- apply(v, 1, sum, na.rm = TRUE) # SS is the total sum of squares pooled <- sqrt( SS / totaldf ) pooled[is.nan(pooled)] <- NA # this vector is the pooled standard deviation for each feature! list(pergene.sd = pooled, median.sd = median(pooled, na.rm=TRUE)) } # Would be better to vectorize this?