#!/usr/bin/env python #get_accessions.py is a script and functions for grabbing #accession for mRNAs that map to a given genomic region #Copyright (C) 2007 Ann E. Loraine and Daniel Shriner #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. #The text of the GNU General Public License, version 2, is available #as http://www.gnu.org/copyleft or by writing to the Free Software #Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # first, import useful modules (like libraries) and methods import sys,urllib from xml.sax import make_parser from xml.sax.handler import ContentHandler das_url = 'http://genome.cse.ucsc.edu:80/cgi-bin/das' # because DAS delivers data in XML format, we need to # parse XML # Fortunately, this is easy to do -- we just make use of code # that others have written and adapt it to our own uses. class FeatureHandler(ContentHandler): # we create a subclass of ContentHandler, over-writing # methods that are invoked whenever we encounter # "events" - such as the start of an element # during the parse, we'll build up this dictionary # (hashtable) of ids _ids = {} def empty_ids(self): self._ids={} def startElement(self,name,attrs): # whenever we encounter an element called "FEATURE", # we take a look at its attributes # if the "feature" has a "label" attribute, we assume it's # an accession for an mRNA if name == 'FEATURE': if attrs.has_key('label'): id_val = attrs['label'] self.get_ids()[id_val]=1 def get_ids(self): return self._ids # let's define a method that builds a URL for # accessing data from the UCSC Genome Informatics Distributed # annotation server def make_das_url(v=None,start=None,end=None,seq=None,typ='knownGene'): u = das_url + '/' + v + '/features?segment='+seq+':'+start+','+end+\ ';type='+typ return u # get all mRNA accessions from the 'knownGene' annotations # table that map to the given genomic region def get_knownGene(v=None,start=None,end=None,seq=None): u = make_das_url(v=v,start=start,end=end,seq=seq,typ='knownGene') fh = urllib.urlopen(u) parser = make_parser() handler = FeatureHandler() handler.empty_ids() parser.setContentHandler(handler) parser.parse(fh) ids = handler.get_ids() feats = [] for id in ids.keys(): feats.append(id) return feats # # The script's "main" method -- this is executed when the script # is run from the command line, e.g., # # $ get_accs.py hg17 chr16 10000 200000 # # where $ is the Unix prompt on a system where python is installed. # # To run the script interactively, just launch the python # interpreter: # # $ python # # and type your commands, for example: # # >>> import get_accs # >>> seq = 'chr16' # >>> version = 'hg17' # >>> strt = '10000' # >>> end = '200000' # >>> get_accs.main(v=version,seq=seq,start=strt,end=end) # #if __name__=="__main__": # args = sys.argv # if not len(args)==5: # sys.stderr.write("Usage: get_accessions.py version sequence start end"+\ # os.linesep) # # exit with an error code # sys.exit(1) # else: try: f1 = open('ranges.txt','r') except IOError: print >> sys.stderr, 'Error opening input file.\n' sys.exit(1) try: f2 = open('accessions.txt','w') except IOError: print >> sys.stderr, 'Error opening output file.\n' sys.exit(1) while 1: line = f1.readline() if not line: break line = line.split('\t') version = 'hg17' seq = line[1] start = line[2] end = line[3] end = end[:-1] ids = get_knownGene(v=version,start=start,end=end,seq=seq) if len(ids)>0: for i in ids: print >> f2, '%s\t%s' % (line[0],i) else: print >> sys.stderr, 'Warning: no ids retrieved for marker %s.' % line[0] f1.close() f2.close()