"""A module containing functions useful for working with gene2pubmed, a file that maps Entrez Gene identifiers onto article identifiers from PubMed""" import gzip def read_gene2pubmed(fn): """ Function: Read the given gene2pubmed-style file and create a a dictionary object that maps Entrez Gene ids (keys) onto lists of PubMed identifiers, one list per gene id Returns : a dictionary Args : fn - name of the file to read File should conform the convention described in the README file in the Entrez Gene section of the NCBI ftp site. e.g., taxonomy id [tab] gen id [tab] pubmed id [newline] Note: if any two organisms listed in fn have the same gene id, then all pubmed identifiers associated with either gene will appear in the same list. """ if fn.endswith('.gz'): fh = gzip.gzipfile(fn) else: fh = open(fn,'r') d = {} while 1: line = fh.readline() if not line: fh.close() break vals = line.rstrip().split('\t') gene_id = vals[1] pubmed_id = vals[2] if d.has_key(gene_id): lst = d[gene_id] if pubmed_id not in lst: # this test may not be necessary lst.append(pubmed_id) else: d[gene_id] = [pubmed_id] return d