""" This script contains functions for examining the correlation matrix ../data/correl.txt. Note that this matrix contains Pearson's r for counts data. Here r expresses the degree to which any given pair of GO terms annotated the same genes. Default file names assume we're in the analysis directory. This should really used the scipy library matrix functions, but I haven't upgraded to Tiger yet. See: http://www.scipy.org/ """ # standard python import re,sys,os,string # column headings for the correlation matrix col_heads_fn = '../data/expcounts.txt' # the correlation matrix file cor_mat_fn = '../data/correl.txt' # some useful constants sep1 = '\t' sep2 = ',' sep3 = ' ' # in cvs module common import go_utils as gu import ssg_utils as su class CorMat: """Represents a GO term correlation matrix.""" def __init__(self,cors=None,col_heads=None): """ Function: Makes a new CorMat object Returns : Args : cors - output from read_cor_smarter col_heads - output from read_col_heads """ self.set_cors(cors) self.set_col_heads(col_heads) def set_col_heads(self,col_heads): """ Function: Returns : Args : col_heads - output from read_col_heads """ self._col_heads = col_heads def get_col_heads(self): """ Function: Returns : list of column/row labels Args : """ return self._col_heads def set_cors(self,cors): """ Function: Returns : Args : cors - output from read_cor_smarter """ self._cors = cors def _get_cors(self): """ Function: PRIVATE function go away Returns : Args : """ return self._cors def get_cor(self,goid1,goid2): """ Function: Get the correlation between these two goids Returns : a float Args : goid1, goid2 - GO ids Note: if goid1 and goid2 information haven't been added to this CorMat, a KeyError exception will be thrown. If this happens, all bets are off, so don't try to catch this unless you have some good error-handling code! """ d = self._get_cors() if d.has_key(goid1): return d[goid1][goid2] elif d.has_key(goid2): return d[goid2][goid1] def get_cor_lst(self,threshold=None): """ Function: Get a list of lists, where each sublist contains a pair of GO ids and their correlation Returns : a list of lists, unsorted Args : threshold - only return pairs with correlation greater than or equal to threshold [optional] Each GO id list contains [goid1,goid2,correlation] Unless run with a threshold option, this tends to be slow for large matrices. How large is large? Good question! """ d = self._get_cors() biglst = [] for goid1 in d.keys(): subd = d[goid1] for goid2 in subd.keys(): if goid1 == goid2: continue val = subd[goid2] if threshold and val >= threshold: biglst.append([goid1,goid2,val]) else: biglst.append([goid1,goid2,val]) return biglst def write_cor_pairs(cormat=None,go_defs=None,fn='cor_pairs2.xls', threshold=.2,trees=None): """ Function: Write a list of term correlations Returns : Args : cormat - a CorMat object go_defs - output from get_go_defs threshold - only report correlations >= this [optional] fn - name of the file to write trees - output from go_utils.build_go_tree If we can't find the definition for a GO term, 'NA' will be written in the corresponding field. """ fh = open(fn,'w') heads = ['r','in_lineage','cross','goid1','tree1','godef1', 'goid2','tree2','godef2'] fh.write(sep1.join(heads)+'\n') if not go_defs: go_defs = get_go_defs() goids = cormat.get_col_heads() i = 0 for goid1 in goids[1:]: go_def1 = gu.get_def(go_defs,goid1) if not go_def1: go_def1 = 'NA' tree1 = 'NA' else: tree1 = gu.get_tree(go_defs,goid1) i = i + 1 for goid2 in goids[i:]: r = cormat.get_cor(goid1,goid2) if threshold and r < threshold: continue go_def2 = gu.get_def(go_defs,goid2) if not go_def2: go_def2 = 'NA' tree2 = 'NA' else: tree2 = gu.get_tree(go_defs,goid2) if tree1 == tree2: if tree1 == 'molecular_function': cross = '1' elif tree1 == 'biological_process': cross = '2' elif tree1 == 'cellular_component': cross = '3' else: cross = '0' if tree1 == tree2: graph = trees[tree1] res = gu.in_lineage(graph,goid1,goid2) if res == None: inlineage = 'NA' elif res: inlineage = 'T' elif not res: inlineage = 'F' else: inlineage = 'NA' # could add some codes for specific pairings vals = [repr(r),inlineage,cross,goid1,tree1,go_def1, goid2,tree2,go_def2] fh.write(sep1.join(vals)+'\n') fh.close() def get_go_defs(fn='../data/gene2go.hs.09May06.gz', taxid='9606'): """ Function: Retrieve GO term definitions from the given file. Returns : Args : fn - name of file to parse. Expects gene2go file from Entrez gene. Note: returns the exact same data structure as go_utils.get_go_defs, so that means we can use go_utils methods on it, such as get_tree and get_def. Note: the default fn only contains H.s. GO terms, but we check the tax id anyway. Some stats: # mf terms - 2308 # bp terms - 2291 # cc terms - 548 """ t = {'Component':'cellular_component', 'Function':'molecular_function', 'Process':'biological_process'} d = {} for tree in t.values(): d[tree]={} fh = su.readfile(fn) while 1: line = fh.readline() if not line: fh.close() break toks=line.rstrip().split(sep1) goid = toks[2] tree = toks[-1] defn = toks[5] if not line.startswith(taxid+'\t'): continue tree = t[tree] d[tree][goid]=defn return d def read_col_heads(fn=col_heads_fn,lines=None): """ Function: Get list of GO id column headings Returns : a list of GO ids Args : fn - two column list, column 1 is GO ids Line separator in the input file is '\n' and there is an extra new line at the end. """ subby = re.compile('"') go_ids = [] if not lines: fh = open(fn,'r') lines = fh.read().split('\n') fh.close() for line in lines: toks = line.rstrip().split(sep1) if len(toks)==1: continue # gets rid of quotes go_id = subby.sub('',toks[0]) go_ids.append(go_id) return go_ids def read_cor(fn=cor_mat_fn,lines=None): """ Function: read the correlation matrix into memory Returns : a list of rows, where each row is a list of float values Args : fn - name of file with correlation matrix. This has been tested (and works) with file candi_gene/correl.txt, which is what you get when you unpack candi_gene/correl.zip v. 1.1 This method implements a brute force method of reading and representing the correlation matrix rows. Notes: V .1.1 of cor_mat_fn 'correl.txt' has 5148 lines and uses \r as the newline character Each line contains 5147 values, so this is the full matrix. Also, each line terminates with a space character. The final line is a newline character, so we have to check for it. """ if not lines: sys.stderr.write("reading: " + fn + '\n') fh = open(fn,'r') lines = fh.read().split('\r') rows = [] i = 1 for line in lines: # watch out for the final newline character at # the end of the file # doh! if len(line)<10: break toks = line.rstrip().split(sep3) try: toks = map(lambda x:string.atof(x),toks) except ValueError: sys.stderr.write("Error on line: " + repr(i)+'\n') return line rows.append(toks) i = i + 1 return rows def read_cor_smarter(fn=cor_mat_fn,lines=None,col_heads=None): """ Function: Read GO term annotation correlation matrix. Returns : a CorMat object Args : fn - name of the correlation matrix lines - the output from reading the cor mat file into memory, and splitting on '\r' This has been tested (and works) with file candi_gene/correl.txt, which is what you get when you unpack candi_gene/correl.zip v. 1.1 Note: this will fail horribly if any GO id appears in col_heads more than once! """ d = {} i = 0 j = 0 if not col_heads: sys.stderr.write("Getting column heads.\n") col_heads = read_col_heads() sys.stderr.write("Got: " + repr(len(col_heads)) + \ " column heads.\n") for col_head1 in col_heads: j = i d[col_head1]={} for col_head2 in col_heads[j:]: d[col_head1][col_head2]=None i = i + 1 if not lines: sys.stderr.write("reading: " + fn + '\n') fh = open(fn,'r') lines = fh.read().split('\r') sys.stderr.write("done reading: " + fn + "\n") i = 0 # row indices j = 0 # column indices for line in lines: if len(line)<10: break toks = line.rstrip().split(sep3) row_label = col_heads[i] j = i for col_head in col_heads[i:]: d[row_label][col_head]=string.atof(toks[j]) j = j + 1 i = i + 1 cormat = CorMat(cors=d,col_heads=col_heads) return cormat def zdummy(): """ Function: Returns : Args : """ pass