#!/usr/bin/env python
#get_geneids.py is a script for many-to-one mapping of mRNA
#accessions to Entrez Gene ids
#Copyright (C) 2007 Daniel Shriner

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.

#The text of the GNU General Public License, version 2, is available
#as http://www.gnu.org/copyleft or by writing to the Free Software
#Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

"""Map mRNA accessions to gene ids using a gzipped gene2accession table.

Inputs (all tab-separated):

* the gene2accession file: the 2nd column is read as the gene id and the
  4th column as ``accession.version``; the part from the first '.' onwards
  is dropped before matching.
* the accessions file: the 1st column is a label that is copied to the
  output unchanged, the 2nd column is the accession to look up (compared
  as-is, so it must not carry a version suffix).

Output: one ``label<TAB>gene id`` line per unique pair, in order of first
appearance.

The code deliberately avoids Python-3-only syntax so that it runs under
both Python 2.7 and Python 3.
"""

import gzip
import sys

#Default file names used when the script is run without arguments to main()
GENE2ACCESSION_PATH = 'gene2accession.hs.09May06.gz'
ACCESSIONS_PATH = 'accessions.txt'
OUTPUT_PATH = 'geneids.txt'


def _open_or_exit(opener, path, mode, description):
    """Return opener(path, mode); on IOError report to stderr and exit(1).

    `description` is the word inserted into the 'Error opening ... file.'
    message.
    """
    try:
        return opener(path, mode)
    except IOError:
        #The trailing blank line is intentional: the message itself ends
        #in a newline and the original print statement added another one.
        sys.stderr.write('Error opening %s file.\n\n' % description)
        sys.exit(1)


def _to_text(line):
    """Return `line` as str, decoding it first if it is not already str.

    A gzip file opened in 'rb' mode yields bytes under Python 3 but str
    under Python 2; everything downstream works on str.
    """
    if isinstance(line, str):
        return line
    return line.decode('utf-8')


def _load_accession_index(lines):
    """Build {accession: [gene id, ...]} from gene2accession lines.

    Blank lines and lines starting with '#' are skipped. Gene ids are kept
    in order of first appearance and listed once per accession.

    Raises ValueError (with the line number) for a row with fewer than
    four tab-separated columns.
    """
    index = {}
    seen = set()
    for number, raw in enumerate(lines, 1):
        text = _to_text(raw).rstrip('\r\n')
        if not text or text.startswith('#'):
            continue
        fields = text.split('\t')
        if len(fields) < 4:
            raise ValueError(
                'gene2accession line %d has fewer than 4 columns' % number)
        gene_id = fields[1]
        #Drop the version suffix: 'NM_000014.4' -> 'NM_000014'
        accession = fields[3].split('.')[0]
        key = (accession, gene_id)
        if key not in seen:
            seen.add(key)
            index.setdefault(accession, []).append(gene_id)
    return index


def _match_gene_ids(lines, index):
    """Return the unique (label, gene id) pairs for the accession lines.

    Pairs are ordered by first appearance: input line order first, then the
    order in which the gene ids were first seen for that accession.

    Raises ValueError (with the line number) for a non-blank row with fewer
    than two tab-separated columns.
    """
    pairs = []
    seen = set()
    for number, raw in enumerate(lines, 1):
        #Strip only the line terminator, so a final line without a newline
        #keeps the last character of its accession.
        text = raw.rstrip('\r\n')
        if not text:
            continue
        fields = text.split('\t')
        if len(fields) < 2:
            raise ValueError(
                'accessions line %d has fewer than 2 columns' % number)
        label, accession = fields[0], fields[1]
        for gene_id in index.get(accession, ()):
            pair = (label, gene_id)
            if pair not in seen:
                seen.add(pair)
                pairs.append(pair)
    return pairs


def main(gene2accession_path=GENE2ACCESSION_PATH,
         accessions_path=ACCESSIONS_PATH,
         output_path=OUTPUT_PATH):
    """Write the gene ids matching each input accession to `output_path`.

    If any of the three files cannot be opened, an error message is written
    to stderr and the process exits with status 1.
    """
    #Load the gene2accession file
    gene_file = _open_or_exit(gzip.open, gene2accession_path, 'rb',
                              'gene2accession')
    with gene_file:
        index = _load_accession_index(gene_file)

    #Open the input file of accessions
    accession_file = _open_or_exit(open, accessions_path, 'r', 'accessions')
    with accession_file:
        #Open an output file for printing out gene ids
        output_file = _open_or_exit(open, output_path, 'w', 'output')
        with output_file:
            #For each mRNA accession, save out all matching unique gene ids
            for pair in _match_gene_ids(accession_file, index):
                output_file.write('%s\t%s\n' % pair)


if __name__ == '__main__':
    main()