Package Bio :: Package CDD :: Module cdd_format
[hide private]
[frames | no frames]

Source Code for Module Bio.CDD.cdd_format

  1  # Copyright 2001 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Martel based parser to read CDD formatted files. 
  7   
  8  This is a huge regular expression for CDD, built using 
  9  the 'regular expressions on steroids' capabilities of Martel. 
 10   
 11  http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml 
 12  Notes: 
 13  Just so I remember -- the new end of line syntax is: 
 14    New regexp syntax - \R 
 15       \R    means "\n|\r\n?" 
 16       [\R]  means "[\n\r]" 
 17   
 18  This helps us have endlines be consistent across platforms. 
 19   
 20  # standard library 
 21  http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml 
 22  """ 
 23  import string 
 24   
 25  # Martel 
 26  import Martel 
 27  from Martel import RecordReader 
 28  from Martel import Str 
 29  from Martel import AnyEol 
 30  from Martel import ToEol 
 31  from Martel import Group 
 32  from Martel import Alt 
 33  from Martel import Opt 
 34  from Martel import Rep 
 35  from Martel import Rep1 
 36  from Martel import Any 
 37  from Martel import AnyBut 
 38  from Martel import Assert 
 39  from Martel import AssertNot 
 40   
 41   
 42   
 43   
 44   
 45  # --- first set up some helper constants and functions 
 46  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
 47  # This code is part of the Biopython distribution and governed by its 
 48  # license.  Please see the LICENSE file that should have been included 
 49  # as part of this package. 
 50   
 51  upper_alpha = Any( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ) 
 52  white_space = Any( "\t " ) 
 53  eols = chr( 13 ) + chr( 10 ) 
 54  white_spaces = Rep( white_space ) 
 55  summary_line = Str( "CD summary" ) + ToEol() 
 56   
 57  cd_tag = Group( "cd_tag", Str( "CD:" ) ) 
 58  description_tag = Group( "description_tag", Str( "Description:" ) ) 
 59  status_tag = Group( "status_tag", Str( "CD status:" ) ) 
 60  source_tag = Group( "source_tag", Str( "Source:" ) ) 
 61  date_tag = Group( "date_tag", Str( "Created:" ) ) 
 62  reference_tag = Group( "reference_tag", Str( "References:" ) ) 
 63  taxonomy_tag =  Group( "taxonomy_tag", Str( "Taxonomy spanned:" ) ) 
 64  aligned_tag = Group( "aligned_tag", Str( "Aligned sequences:" ) ) 
 65  representative_tag = Group( "representative_tag", Str( "Representative:" ) ) 
 66  range_tag = Group( "range_tag", Str( "Aligned range:" ) ) 
 67  sequence_tag = Group( "sequence_tag", Str( "Sequence:" ) ) 
 68  has_tag = Alt( cd_tag, description_tag, status_tag, source_tag, date_tag, \ 
 69      reference_tag, taxonomy_tag, aligned_tag, representative_tag, range_tag, sequence_tag ) 
 70   
 71  cd_key_line = cd_tag + white_spaces + AnyEol() 
 72  description_key_line = description_tag + white_spaces + AnyEol() 
 73  status_key_line = status_tag + white_spaces + AnyEol() 
 74  source_key_line = source_tag + white_spaces + AnyEol() 
 75  date_key_line = date_tag + white_spaces + AnyEol() 
 76  reference_key_line = reference_tag + white_spaces + AnyEol() 
 77  taxonomy_key_line = taxonomy_tag + white_spaces + AnyEol() 
 78  aligned_key_line = aligned_tag + white_spaces + AnyEol() 
 79  representative_key_line = representative_tag + white_spaces + AnyEol() 
 80  range_key_line = range_tag + white_spaces + AnyEol() 
 81  sequence_key_line = sequence_tag + white_spaces + AnyEol() 
 82   
 83  cd_contents_line = Group( "cd_contents_line", AssertNot( has_tag ) + ToEol() ) 
 84  description_contents_line = AssertNot( has_tag ) + ToEol() 
 85  status_contents_line = AssertNot( has_tag ) + ToEol() 
 86  source_contents_line = AssertNot( has_tag ) + ToEol() 
 87  date_contents_line = AssertNot( has_tag ) + ToEol() 
 88  reference_contents_line = AssertNot( has_tag ) + ToEol() 
 89  taxonomy_contents_line = AssertNot( has_tag ) + ToEol() 
 90  aligned_contents_line = AssertNot( has_tag ) + ToEol() 
 91  representative_contents_line = AssertNot( has_tag ) + ToEol() 
 92  range_contents_line = AssertNot( has_tag ) + ToEol() 
 93  sequence_contents_line = Group( "sequence_contents_line", \ 
 94                              white_spaces + Rep1( upper_alpha ) + white_spaces + AnyEol() ) 
 95  sentinel_line = white_spaces + Str( "Definition" ) + white_spaces + AnyEol() 
 96  boiler_plate = AssertNot( sentinel_line ) + ToEol() 
 97  definition_line = Group( "definition_line", \ 
 98      Rep( AnyBut( eols + '[' ) ) + Str( '[CD]' ) + white_spaces + AnyEol() ) 
 99  pdb_id_line = AssertNot( definition_line ) + ToEol() 
100  pdb_id_multiline = Group( "pdb_id_multiline", Rep1( pdb_id_line ) ) 
101  table_entry = Group( "table_entry", \ 
102      pdb_id_multiline + definition_line ) 
103  table = Group( "table", Rep1( table_entry ) ) 
104   
def _multiline(name, line):
    """Return ``Group(name, Rep(line))``: a named repetition of *line*."""
    return Group(name, Rep(line))


# The body of each section is a named repetition of that section's
# body-line pattern.
cd_contents_multiline = _multiline(
    "cd_contents_multiline", cd_contents_line)
description_contents_multiline = _multiline(
    "description_contents_multiline", description_contents_line)
status_contents_multiline = _multiline(
    "status_contents_multiline", status_contents_line)
source_contents_multiline = _multiline(
    "source_contents_multiline", source_contents_line)
date_contents_multiline = _multiline(
    "date_contents_multiline", date_contents_line)
reference_contents_multiline = _multiline(
    "reference_contents_multiline", reference_contents_line)
taxonomy_contents_multiline = _multiline(
    "taxonomy_contents_multiline", taxonomy_contents_line)
aligned_contents_multiline = _multiline(
    "aligned_contents_multiline", aligned_contents_line)
representative_contents_multiline = _multiline(
    "representative_contents_multiline", representative_contents_line)
range_contents_multiline = _multiline(
    "range_contents_multiline", range_contents_line)
sequence_contents_multiline = _multiline(
    "sequence_contents_multiline", sequence_contents_line)
127   
# A section block is its header line followed by its (possibly repeated)
# body lines.
cd_block = cd_key_line + cd_contents_multiline
description_block = description_key_line + description_contents_multiline
status_block = status_key_line + status_contents_multiline
source_block = source_key_line + source_contents_multiline
date_block = date_key_line + date_contents_multiline

# The references block additionally starts with an Assert on its tag.
reference_block = (
    Assert(reference_tag)
    + reference_key_line
    + reference_contents_multiline
)

taxonomy_block = taxonomy_key_line + taxonomy_contents_multiline
aligned_block = aligned_key_line + aligned_contents_multiline
representative_block = (
    representative_key_line + representative_contents_multiline
)
range_block = range_key_line + range_contents_multiline
sequence_block = sequence_key_line + sequence_contents_multiline

# A bare ToEol(); not referenced by cdd_record in this part of the file.
trailer_line = ToEol()
141   
# The complete CDD record: the summary line, the section blocks in fixed
# order (the references block is wrapped in Opt), then boilerplate lines,
# the "Definition" sentinel line and finally the table.
cdd_record = (
    summary_line
    + cd_block
    + description_block
    + status_block
    + source_block
    + date_block
    + Opt(reference_block)
    + taxonomy_block
    + aligned_block
    + representative_block
    + range_block
    + sequence_block
    + Rep(boiler_plate)
    + sentinel_line
    + table
)
146