#20100812 Pooledstdev
# A function for computation of pooled standard deviation per feature

#Andrew Winterman
#Thank you Heather for all your help and patience with this function.
 
# This function computes the pooled standard deviation 
# for each feature of a  microarray data set. 
# the pooled standard deviation is:
# the square root of the weighted average of the within 
# group variances.
# weighted average = each group is weighted proportional 
# to its sample size
# The input is an LMFit/lmFit object that was made 
# without specifying contrasts, the name of the reference 
# individual, and the designation of groups. 
# Fit<- LMFit(object, design), where object is your MA 
# object and design is your design matrix.

pooledstdev <- function(groups, Fit, ref)
{
#	The Fit entry should be the output of LMFit with the 
#	contrasts argument left unspecified.
#	groups must be of a list of character vectors, whose entries 
#	are the names of the sources in that group (phenotypes, 
#  experimental treatments, conditions etc.). 
# The groups should include the name of the reference individual
#  ref must be the name of the individual chosen as reference for 
#  the make contrasts and design matrix steps.
#### Check to make sure that it doesn't cause a problem to have a 0 when others are NA
	
	g <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) )
#			g will hold within group variances, the i-th 
#			column of g holds the variances of the features 
#			in the i-th group.
	dgf <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) )
#			df will hold the degrees of freedom for each group.
	v <- matrix(NA, nrow = nrow(Fit$coef), ncol = length(groups) )
#			v will hold the within group sum of squares ( i.e. the product of 
#			degrees of freedom and the within group variances).
#
#			
	for ( i in 1:length(groups) ){
		k <- pmatch( groups[[i]], colnames(Fit$coef) )
#				which columns of coeficients correspond to 
#				this group
		if(any( is.na(k) )) #Is anybody NA?
		{
			if(groups[[i]][is.na(k)] == ref)
			{
#			The reference will give NA, so if the NA is caused by 
#			the reference source, this line will add a column of 
#			zero's to the coefficient matrix.
			q <- cbind(  Fit$coef[, k] , rep(0, nrow(Fit$coef)) )
			}
			else
			{
				stop("an entry of the i'th vector in the list is not th name of a source in this experiment.")
			}
		}	
		else{		
		q <- Fit$coef[, k] 
#				the matrix q gets the coefficients from the columns 
#				of the coeficients matrix for this group
		}
		g[,i] <- apply(q, 1, var, na.rm = TRUE)
#				The variances for the set of columns for this group
		dgf[,i] <- apply( !is.na(q), 1, sum) - 1
#				degrees of freedom for each feature for this group
		v[,i] <- dgf[,i] * g[,i] 
#				v get the within group sum of squares
		
		}
		dgf[dgf <= 0] <- 0

#			If the degrees of freedom are zero, we can't 
#			compute a variance, and we should get NA to 
#			reflect this fact. If they are less than one, 
#			we had no data to begin with, and once again we 
#			should get NA. The reasoning behind sending 
#			these to zero and not to NA (which turns out to 
#			be effectively the same thing) is given here:
#			 
			
#			We send these df's to 0 to allow the use of 
#			na.rm = TRUE in the summation terms below. As 
#			we continue, we will eventually attempt to
#			multiply by the degrees of freedom, and then 
#			divide by their sum. If any variances were 
#			somehow calculated for features with either 
#			insufficient degrees of freedom or for features 
#			with missing data - perhaps because of the use 
#			of na.rm = TRUE above - multiplying by their 
#			degrees of freedom, which has now been set to
#			zero, will eliminate them. If a feature has 
#			missing  data in all of the groups, 
#			setting these  entries to zero will send the 
#			corresponding feature's pooled variance to 
#			inf or NaN, allowing us an easy way to 
#			keep track of these entries. Note that if any 
#			feature has missing data in less than all of 
#			the groups, a pooled variance will still be 
#			calculated using those groups for which there 
#			is data.


		totaldf <- apply(dgf, 1, sum, na.rm = TRUE)
		SS <- apply(v, 1, sum, na.rm = TRUE)
#			SS is the total sum of squares
		pooled <- sqrt( SS / totaldf )
		pooled[is.nan(pooled)] <- NA
#			this vector is the pooled standard deviation for each feature!
	list(pergene.sd = pooled, median.sd = median(pooled, na.rm=TRUE))
}
# Would be better to vectorize this?