Package Bio :: Package UniGene
[hide private]
[frames | no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.10 2007/04/18 09:19:03 peterc Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """ 
 14  Parse Unigene flat file format files such as the Hs.data file. 
 15   
 16  Here is an overview of the flat file format that this parser deals with: 
 17     Line types/qualifiers: 
 18   
 19         ID           UniGene cluster ID 
 20         TITLE        Title for the cluster 
 21         GENE         Gene symbol 
 22         CYTOBAND     Cytological band 
 23         EXPRESS      Tissues of origin for ESTs in cluster 
 24         RESTR_EXPR   Single tissue or development stage contributes  
 25                      more than half the total EST frequency for this gene. 
 26         GNM_TERMINUS genomic confirmation of presence of a 3' terminus;  
 27                      T if a non-templated polyA tail is found among  
 28                        a cluster's sequences; else 
 29                      I if templated As are found in genomic sequence or 
 30                      S if a canonical polyA signal is found on  
 31                        the genomic sequence 
 32         GENE_ID      Entrez gene identifier associated with at least one sequence in this cluster;  
 33                      to be used instead of LocusLink.   
 34         LOCUSLINK    LocusLink identifier associated with at least one sequence in this cluster;   
 35                      deprecated in favor of GENE_ID 
 36         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping on the arabidopsis genome. 
 37         STS          STS 
 38              NAME=        Name of STS 
 39              ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field] 
 40              DSEG=        GDB Dsegment number [optional field] 
 41              UNISTS=      identifier in NCBI's UNISTS database 
 42         TXMAP        Transcript map interval 
 43              MARKER=      Marker found on at least one sequence in this cluster 
 44              RHPANEL=     Radiation Hybrid panel used to place marker 
 45         PROTSIM      Protein Similarity data for the sequence with highest-scoring protein similarity in this cluster 
 46              ORG=         Organism 
 47              PROTGI=      Sequence GI of protein 
 48              PROTID=      Sequence ID of protein 
 49              PCT=         Percent alignment 
 50              ALN=         length of aligned region (aa) 
 51         SCOUNT       Number of sequences in the cluster 
 52         SEQUENCE     Sequence 
 53              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 54              NID=         Unique nucleotide sequence identifier (gi) 
 55              PID=         Unique protein sequence identifier (used for non-ESTs) 
 56              CLONE=       Clone identifier (used for ESTs only) 
 57              END=         End (5'/3') of clone insert read (used for ESTs only)  
 58              LID=         Library ID; see Hs.lib.info for library name and tissue         
 59              MGC=         5' CDS-completeness indicator; if present,  
 60                           the clone associated with this sequence   
 61                           is believed CDS-complete. A value greater than 511 
 62                           is the gi of the CDS-complete mRNA matched by the EST, 
 63                           otherwise the value is an indicator of the reliability 
 64                           of the test indicating CDS completeness; 
 65                           higher values indicate more reliable CDS-completeness predictions.  
 66             SEQTYPE=      Description of the nucleotide sequence. Possible values are 
 67                           mRNA, EST and HTC. 
 68             TRACE=        The Trace ID of the EST sequence, as provided by NCBI Trace Archive 
 69             PERIPHERAL=   Indicator that the sequence is a suboptimal  
 70                           representative of the gene represented by this cluster. 
 71                           Peripheral sequences are those that are in a cluster 
 72                           which represents a spliced gene without sharing a 
 73                           splice junction with any other sequence.  In many 
 74                           cases, they are unspliced transcripts originating 
 75                           from the gene. 
 76   
 77         //           End of record 
 78  """ 
 79  from Bio.ParserSupport import * 
 80  import re 
 81   
 82  # 
 83  # CONSTANTS 
 84  # 
 85  UG_INDENT=12 
 86   
class UnigeneSequenceRecord:
    """Store the information for one SEQUENCE line from a Unigene file

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values are
                 mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive
    PERIPHERAL=  Indicator that the sequence is a suboptimal
                 representative of the gene represented by this cluster.
                 Peripheral sequences are those that are in a cluster
                 which represents a spliced gene without sharing a
                 splice junction with any other sequence.  In many
                 cases, they are unspliced transcripts originating
                 from the gene.

    Additional attributes:
    text         The raw text the record was built from ('' if none)
    is_image     True when the CLONE value starts with 'IMAGE'
    image        The part of an IMAGE clone value after the 'IMAGE:' prefix
    """

    def __init__(self, text=None):
        """Create an empty record, optionally filled from a SEQUENCE entry.

        text -- the value part of a SEQUENCE line ('KEY=value; KEY=value'),
                or None to leave every attribute at its empty default.
        """
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        # The parser stores every key lower-cased, so TRACE= lands in
        # 'trace'.  'Trace' is the historical (never populated) spelling and
        # is retained only so that existing readers of it do not break.
        self.trace = ''
        self.Trace = ''
        self.peripheral = ''
        # Always defined so that __repr__ works on an empty record.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in text.

        Fields are separated by '; '.  Blank fields are skipped.
        Raises ValueError for a non-blank field that is not 'KEY=value'.
        """
        for part in text.split('; '):
            if not part.strip():
                continue
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError(
                    "Malformed SEQUENCE field %r in %r" % (part, text))
            key, val = match.groups()
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Skip the six characters of the 'IMAGE:' prefix.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this record was created from."""
        return self.text
class UnigeneProtsimRecord:
    """Store the information for one PROTSIM line from a Unigene file

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    Additional attributes:
    text         The raw text the record was built from ('' if none)
    """

    def __init__(self, text=None):
        """Create an empty record, optionally filled from a PROTSIM entry.

        text -- the value part of a PROTSIM line ('KEY=value; KEY=value'),
                or None to leave every attribute at its empty default.
        """
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always defined so that __repr__ works on an empty record.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in text.

        Fields are separated by '; '.  Blank fields are skipped.
        Raises ValueError for a non-blank field that is not 'KEY=value'.
        """
        for part in text.split('; '):
            if not part.strip():
                continue
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError(
                    "Malformed PROTSIM field %r in %r" % (part, text))
            key, val = match.groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this record was created from."""
        return self.text
class UnigeneSTSRecord:
    """Store the information for one STS line from a Unigene file

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    NAME=        Name of STS
    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    DSEG=        GDB Dsegment number [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    Additional attributes:
    text         The raw text the record was built from ('' if none)
    """

    def __init__(self, text=None):
        """Create an empty record, optionally filled from an STS entry.

        text -- the value part of an STS line ('KEY=value KEY=value'),
                or None to leave every attribute at its empty default.
        """
        self.name = ''
        self.acc = ''
        self.dseg = ''
        self.unists = ''
        # Always defined so that __repr__ works on an empty record.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in text.

        Fields are separated by whitespace.  Splitting on any run of
        whitespace (rather than on a single space) means doubled or
        trailing spaces do not produce empty fields.
        Raises ValueError for a field that is not 'KEY=value'.
        """
        for part in text.split():
            match = re.match(r'(\w+)=(\S+)', part)
            if match is None:
                raise ValueError(
                    "Malformed STS field %r in %r" % (part, text))
            key, val = match.groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this record was created from."""
        return self.text
class UnigeneRecord:
    """Store a Unigene record

    Here is what is stored:

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on ';'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.chromosome   = ''  # CHROMOSOME
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type UnigeneProtsimRecord
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type UnigeneSequenceRecord
        self.sts          = []  # STS entries, array of STS entries
                                # Type UnigeneSTSRecord
        self.txmap        = []  # TXMAP entries, array of TXMap entries
    """

    def __init__(self):
        """Create a record with every field at its empty default."""
        self.ID = ''            # ID line
        self.species = ''       # Hs, Bt, etc.
        self.title = ''         # TITLE line
        self.symbol = ''        # GENE line
        self.cytoband = ''      # CYTOBAND line
        self.express = []       # EXPRESS line, parsed on ';'
        self.restr_expr = ''    # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id = ''       # GENE_ID line
        self.chromosome = ''    # CHROMOSOME
        self.protsim = []       # PROTSIM entries, array of Protsims
        self.sequence = []      # SEQUENCE entries, array of Sequence entries
        self.sts = []           # STS entries, array of STS entries
        self.txmap = []         # TXMAP entries, array of TXMap entries

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        return "<%s> %s %s\n%s" % (self.__class__.__name__,
                                   self.ID, self.symbol, self.title)
class _RecordConsumer(AbstractConsumer):
    """Build a UnigeneRecord from the tagged lines of one flat-file record.

    Each upper-case method is named after a line tag and receives the whole
    line (tag included); _Scanner.feed looks the method up by tag name.
    The record being filled is available as self.unigene_record.
    """

    def __init__(self):
        self.unigene_record = UnigeneRecord()

    def ID(self, line):
        self.unigene_record.ID = self._get_single_entry(line)
        # The species code is whatever precedes the first '.' of the ID.
        self.unigene_record.species = self.unigene_record.ID.split('.')[0]

    def TITLE(self, line):
        self.unigene_record.title = self._get_single_entry(line)

    def GENE(self, line):
        self.unigene_record.symbol = self._get_single_entry(line)

    def CYTOBAND(self, line):
        self.unigene_record.cytoband = self._get_single_entry(line)

    def EXPRESS(self, line):
        self.unigene_record.express = self._get_array_entry(line,
                                                            split_on='; ')

    def RESTR_EXPR(self, line):
        self.unigene_record.restr_expr = self._get_single_entry(line)

    def GNM_TERMINUS(self, line):
        self.unigene_record.gnm_terminus = self._get_single_entry(line)

    def GENE_ID(self, line):
        self.unigene_record.gene_id = self._get_single_entry(line)

    def CHROMOSOME(self, line):
        self.unigene_record.chromosome = self._get_single_entry(line)

    def SEQUENCE(self, line):
        ug_seqrecord = UnigeneSequenceRecord(self._get_single_entry(line))
        self.unigene_record.sequence.append(ug_seqrecord)

    def PROTSIM(self, line):
        ug_protsimrecord = UnigeneProtsimRecord(self._get_single_entry(line))
        self.unigene_record.protsim.append(ug_protsimrecord)

    def STS(self, line):
        ug_stsrecord = UnigeneSTSRecord(self._get_single_entry(line))
        self.unigene_record.sts.append(ug_stsrecord)

    def TXMAP(self, line):
        # No dedicated TXMAP record class exists in this module, so the raw
        # entry text is stored, one list item per TXMAP line.
        self.unigene_record.txmap.append(self._get_single_entry(line))

    def _get_single_entry(self, line):
        """Consume a single-value line

        Returns the line with its first UG_INDENT characters removed.
        """
        return line[UG_INDENT:]

    def _get_array_entry(self, line, split_on):
        """Consume a multi-value line by splitting on split_on

        Returns the list obtained by splitting the text after the first
        UG_INDENT characters.
        """
        return line[UG_INDENT:].split(split_on)
305 -class _Scanner:
306 """Scans a Unigene Flat File Format file 307 """ 308
309 - def feed(self, handle, consumer):
310 """feed(self, handle, consumer) 311 312 Feed events from parsing a Unigene file to a consumer. 313 handle is a file-like object, and consumer is a consumer object 314 that will receive events as the file is scanned 315 316 """ 317 consumer.start_record() 318 for line in handle: 319 tag = line.split(' ')[0] 320 line = line.rstrip() 321 if line=='//': 322 consumer.end_record() 323 break 324 try: 325 f = getattr(consumer, tag) 326 except AttributeError: 327 print 'no method called', tag 328 else: 329 if callable(f): 330 f(line)
331 332
class RecordParser(AbstractParser):
    """Parse a single Unigene flat-file record into a UnigeneRecord."""

    def __init__(self):
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Scan one record from handle and return it as a UnigeneRecord.

        handle is wrapped in a File.UndoHandle unless it already is one.
        """
        # Start from a fresh consumer on every call; reusing one would keep
        # appending to the lists of the previously returned record.
        self._consumer = _RecordConsumer()
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        # Scan regardless of which branch supplied the handle.
        self._scanner.feed(uhandle, self._consumer)
        return self._consumer.unigene_record
class Iterator:
    """Iterate over the records of a Unigene flat file, one per '//' block."""

    def __init__(self, handle, parser=None):
        """Wrap handle for record-by-record reading.

        handle -- file-like object positioned at the start of a record.
        parser -- accepted for interface compatibility; it is not used,
                  because next() always parses with a new RecordParser.
        """
        self._uhandle = File.UndoHandle(handle)

    def next(self):
        """Return the next record parsed by RecordParser, or None.

        Lines are collected up to (not including) the next line starting
        with '//' or the end of input.  None is returned when no lines were
        collected, which is what ends iteration in __iter__.
        """
        # A new parser per record keeps records independent of each other.
        self._parser = RecordParser()
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line or line[:2] == '//':
                break
            lines.append(line)
        if not lines:
            return None
        # Re-add the terminator that the loop above consumed.
        lines.append('//')
        data = ''.join(lines)
        return self._parser.parse(File.StringHandle(data))

    def __iter__(self):
        """Iterate by calling next() until it returns None."""
        return iter(self.next, None)