###################################################################### # #get_scores.R is a script for prioritizing candidate #genes in genomic regions based on principal components #Copyright (C) 2007 Daniel Shriner #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. #The text of the GNU General Public License, version 2, is available #as http://www.gnu.org/copyleft or by writing to the Free Software #Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # ###################################################################### expdat <- read.table(file="gene2go.hs.09May06.txt",header=FALSE,sep="\t",quote="") expgene <- unique(expdat$V2) #totgene <- length(expgene) expterm <- unique(expdat$V3) #read in the global correlation matrix dat1 <- scan(file="correl.txt") dat1 <- matrix(dat1,nrow=length(expterm),ncol=length(expterm)) dat <- read.table(file="GOannotations.txt",header=FALSE,sep="\t",quote="") #number of unique genes in sample numgene <- unique(dat$V2) #length(numgene) #number of GO terms in sample numterm <- unique(dat$V3) #length(numterm) #occurrences of GO terms in sample count <- numeric() for (i in 1:length(numterm)) {obsb <- dat[dat$V3==numterm[i],] obsb <- unique(obsb$V2) count <- append(count,length(obsb))} #number of QTL in sample numqtl <- unique(dat$V1) #length(numqtl) #extract the subset of the global correlation matrix for the relevant GO terms test <- intersect(numterm,expterm) p <- vector(mode="numeric",length=0) for (i in 1:length(test)) {p <- append(p,which(test[i]==expterm))} dat2 <- dat1[p,p] #create a copy dat3 <- dat2 #extract the upper triangular matrix without the diagonal dat3[lower.tri(dat3,diag=TRUE)] <- NA #find all cells with a correlation of 1 b <- which(dat3==1,arr.ind=TRUE) if (length(b)!=0) {#extract the column indices w <- b[,2] #eliminate the duplicates w <- unique(w) #eliminate the rows and columns from the original correlation matrix dat2 <- dat2[-w,-w]} rm(dat3) #use Velicer's minimum average partial test to determine the number of principal components #from the correlation matrix a <- eigen(dat2) a$values[a$values<0] <- 0 b <- diag(a$values,nrow=length(a$values)) loadings <- a$vectors%*%sqrt(b) fm <- vector(mode="numeric",length=ncol(dat2)) fm[1] <- (sum(dat2^2)-ncol(dat2))/(ncol(dat2)*(ncol(dat2)-1)) for (m in 1:(ncol(dat2)-1)) {c <- loadings[,1:m] partcov <- dat2-(c%*%t(c)) d <- diag(partcov) if (is.element(NaN,d) | is.element(0,d) | length(d[d<0])!=0) {fm[m+1]<-1.0000000000} else {d <- 1/(sqrt(d)) e <- diag(d,nrow=length(d)) pr <- e%*%partcov%*%e fm[m+1] <- (sum(pr^2)-ncol(dat2))/(ncol(dat2)*(ncol(dat2)-1))}} nfactors <- which.min(fm)-1 #nfactors #for each term, do a one-tailed Fisher's exact test for enrichment, using the number of #principal components as Bonferroni correction numterm <- as.vector(numterm) expcounts <- read.table(file="expcounts.txt",sep="\t",header=FALSE) unadjp <- rep(0,length(numterm)) adjp <- rep(0,length(numterm)) for (i in 1:length(numterm)) {mat <- c(count[i],length(numgene),expcounts[expcounts$V1==numterm[i],2],length(expgene)) mat <- c(mat[1],mat[3]-mat[1],mat[2]-mat[1],mat[4]-mat[3]-mat[2]+mat[1]) mat <- matrix(mat,byrow=TRUE,nrow=2) test <- fisher.test(mat,alternative="greater") unadjp[i] <- test$p.value adjp[i] <- test$p.value*nfactors} #do the analysis for adjusted p-values numterms <- cbind(numterm,adjp) sigterms <- numterms[adjp<=0.05,] sigterms <- matrix(sigterms,ncol=2) write.table(sigterms,file="adj_weights.txt",sep="\t",col.names=FALSE,row.names=FALSE) #the incidence matrix is binary, rows are genes and columns are GO terms, 1 if term is significantly enriched incidence <- matrix(0,nrow=length(numgene),ncol=length(numterm)) for (i in 1:length(numgene)) {u <- dat[dat$V2==numgene[i],3] u <- unique(u) for (j in 1:length(u)) {if (is.element(u[j],sigterms[,1])) {q <- which(u[j]==numterm) incidence[i,q] <- 1}}} #for each term, weight is the number of QTL with any genes so annotated weight <- vector(mode="numeric",length=length(numterm)) for (i in 1:length(numterm)) {z <- dat[dat$V3==numterm[i],1] z <- unique(z) weight[i] <- length(z)} #score is the product of the incidence matrix (adjusted by eigenvectors derived from the correlation matrix) #and the weights for each gene's set of significantly enriched terms dat2 <- dat1[p,p] f <- eigen(dat2) score <- incidence%*%f$vectors%*%weight score <- abs(score) #size is the number of unique genes in each QTL size <- rep(0,length(numqtl)) for (i in 1:length(numqtl)) {z <- dat[dat$V1==numqtl[i],] z <- unique(z$V2) size[i] <- length(z)} #k is a vector that repeats the sizes from dat$V1 k <- numeric() for (i in 1:length(numqtl)) {l <- rep(numqtl[i],size[i]) k <- append(k,l)} numgenes <- cbind(k,numgene,score) numgenes <- matrix(numgenes,ncol=3) numgenes <- t(numgenes) write(numgenes,file="scores.txt",ncolumns=3,sep="\t") #id is the list of top scorers (ties allowed) for each QTL, if any numgenes <- t(numgenes) zz <- file("adj_genes.txt","w") for (i in 1:length(numqtl)) { id <- numeric() g <- numgenes[numgenes[,1]==numqtl[i],] if (max(g[,3])>0) { id <- g[g[,3]==max(g[,3]),2] for (j in 1:length(id)) { cat(numqtl[i],id[j],sep="\t",file=zz) cat("\n",file=zz) } } else cat(numqtl[i],"\tNA\n",file=zz) } close(zz)