1
2
3
4
5
6 """Martel based parser to read CDD formatted files.
7
This is a huge regular expression for CDD, built using
the 'regular expressions on steroids' capabilities of Martel.
10
11 http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml
12 Notes:
13 Just so I remember -- the new end of line syntax is:
14 New regexp syntax - \R
15 \R means "\n|\r\n?"
16 [\R] means "[\n\r]"
17
18 This helps us have endlines be consistent across platforms.
19
20 # standard library
21 http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml
22 """
23 import string
24
25
26 import Martel
27 from Martel import RecordReader
28 from Martel import Str
29 from Martel import AnyEol
30 from Martel import ToEol
31 from Martel import Group
32 from Martel import Alt
33 from Martel import Opt
34 from Martel import Rep
35 from Martel import Rep1
36 from Martel import Any
37 from Martel import AnyBut
38 from Martel import Assert
39 from Martel import AssertNot
40
41
42
43
44
45
46
47
48
49
50
# --- Primitive building blocks ---------------------------------------------

# A single upper-case ASCII letter; used for the residues on sequence lines.
upper_alpha = Any("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

# A single blank character: tab or space.
white_space = Any("\t ")

# Carriage return + line feed, kept as a plain string so it can be combined
# with other characters when building an excluded-character set.
eols = "\r\n"

# A run of blanks (built with Rep, so the run may be empty).
white_spaces = Rep(white_space)

# The header line that opens every record.
summary_line = Str("CD summary") + ToEol()
56
# --- Section tags -----------------------------------------------------------

def _make_tag(group_name, label):
    """Return a Martel group named *group_name* matching the literal *label*."""
    return Group(group_name, Str(label))


cd_tag = _make_tag("cd_tag", "CD:")
description_tag = _make_tag("description_tag", "Description:")
status_tag = _make_tag("status_tag", "CD status:")
source_tag = _make_tag("source_tag", "Source:")
date_tag = _make_tag("date_tag", "Created:")
reference_tag = _make_tag("reference_tag", "References:")
taxonomy_tag = _make_tag("taxonomy_tag", "Taxonomy spanned:")
aligned_tag = _make_tag("aligned_tag", "Aligned sequences:")
representative_tag = _make_tag("representative_tag", "Representative:")
range_tag = _make_tag("range_tag", "Aligned range:")
sequence_tag = _make_tag("sequence_tag", "Sequence:")

# Alternation over every section tag, in the same order the tags are declared.
has_tag = Alt(
    cd_tag,
    description_tag,
    status_tag,
    source_tag,
    date_tag,
    reference_tag,
    taxonomy_tag,
    aligned_tag,
    representative_tag,
    range_tag,
    sequence_tag
)
70
# --- Key lines: a section tag, optional trailing blanks, then end of line ---

def _key_line(tag):
    """Return the expression for a line holding only *tag* and trailing blanks."""
    return tag + white_spaces + AnyEol()


cd_key_line = _key_line(cd_tag)
description_key_line = _key_line(description_tag)
status_key_line = _key_line(status_tag)
source_key_line = _key_line(source_tag)
date_key_line = _key_line(date_tag)
reference_key_line = _key_line(reference_tag)
taxonomy_key_line = _key_line(taxonomy_tag)
aligned_key_line = _key_line(aligned_tag)
representative_key_line = _key_line(representative_tag)
range_key_line = _key_line(range_tag)
sequence_key_line = _key_line(sequence_tag)
82
# --- Content lines ----------------------------------------------------------

def _untagged_line():
    """Return a fresh expression for a full line guarded by AssertNot(has_tag)."""
    return AssertNot(has_tag) + ToEol()


# Only the CD content line is wrapped in a named group; the other sections use
# the bare expression.
cd_contents_line = Group("cd_contents_line", _untagged_line())
description_contents_line = _untagged_line()
status_contents_line = _untagged_line()
source_contents_line = _untagged_line()
date_contents_line = _untagged_line()
reference_contents_line = _untagged_line()
taxonomy_contents_line = _untagged_line()
aligned_contents_line = _untagged_line()
representative_contents_line = _untagged_line()
range_contents_line = _untagged_line()

# A sequence line: at least one upper-case letter, with blanks allowed on
# either side, terminated by an end of line.
sequence_contents_line = Group(
    "sequence_contents_line",
    white_spaces + Rep1(upper_alpha) + white_spaces + AnyEol()
)

# --- Trailing table ---------------------------------------------------------

# The "Definition" line that introduces the table.
sentinel_line = white_spaces + Str("Definition") + white_spaces + AnyEol()

# Any line ahead of the table that is not the sentinel line.
boiler_plate = AssertNot(sentinel_line) + ToEol()

# A definition line: characters other than CR, LF and '[', then the literal
# '[CD]' marker, optional blanks and an end of line.
definition_line = Group(
    "definition_line",
    Rep(AnyBut(eols + '[')) + Str('[CD]') + white_spaces + AnyEol()
)

# Lines in front of a definition line; one or more of them form the
# "pdb_id_multiline" group.
pdb_id_line = AssertNot(definition_line) + ToEol()
pdb_id_multiline = Group("pdb_id_multiline", Rep1(pdb_id_line))

# A table entry is the id lines followed by their definition line; the table
# is one or more such entries.
table_entry = Group("table_entry", pdb_id_multiline + definition_line)
table = Group("table", Rep1(table_entry))
104
# --- Multi-line section bodies ----------------------------------------------

def _multiline(group_name, line):
    """Return a group named *group_name* holding a repetition (Rep) of *line*."""
    return Group(group_name, Rep(line))


cd_contents_multiline = _multiline(
    "cd_contents_multiline", cd_contents_line)
description_contents_multiline = _multiline(
    "description_contents_multiline", description_contents_line)
status_contents_multiline = _multiline(
    "status_contents_multiline", status_contents_line)
source_contents_multiline = _multiline(
    "source_contents_multiline", source_contents_line)
date_contents_multiline = _multiline(
    "date_contents_multiline", date_contents_line)
reference_contents_multiline = _multiline(
    "reference_contents_multiline", reference_contents_line)
taxonomy_contents_multiline = _multiline(
    "taxonomy_contents_multiline", taxonomy_contents_line)
aligned_contents_multiline = _multiline(
    "aligned_contents_multiline", aligned_contents_line)
representative_contents_multiline = _multiline(
    "representative_contents_multiline", representative_contents_line)
range_contents_multiline = _multiline(
    "range_contents_multiline", range_contents_line)
sequence_contents_multiline = _multiline(
    "sequence_contents_multiline", sequence_contents_line)
127
# --- Sections: each one is its key line followed by its multi-line body -----

cd_block = (cd_key_line + cd_contents_multiline)
description_block = (description_key_line + description_contents_multiline)
status_block = (status_key_line + status_contents_multiline)
source_block = (source_key_line + source_contents_multiline)
date_block = (date_key_line + date_contents_multiline)

# The references section additionally starts with an Assert on its tag before
# the key line itself is matched.
reference_block = (
    Assert(reference_tag)
    + reference_key_line
    + reference_contents_multiline
)

taxonomy_block = (taxonomy_key_line + taxonomy_contents_multiline)
aligned_block = (aligned_key_line + aligned_contents_multiline)
representative_block = (
    representative_key_line + representative_contents_multiline
)
range_block = (range_key_line + range_contents_multiline)
sequence_block = (sequence_key_line + sequence_contents_multiline)

# A whole line, matched through its end of line.
trailer_line = ToEol()
141
# --- Complete record --------------------------------------------------------
# The sections appear in a fixed order; only the references section is
# optional.  After the sequence section come any number of boiler-plate
# lines, the sentinel line and finally the table.
cdd_record = (
    summary_line
    + cd_block
    + description_block
    + status_block
    + source_block
    + date_block
    + Opt(reference_block)
    + taxonomy_block
    + aligned_block
    + representative_block
    + range_block
    + sequence_block
    + Rep(boiler_plate)
    + sentinel_line
    + table
)
146