Package Bio :: Package GenBank :: Module LocationParser
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.LocationParser

# First pass at a parser for the location fields of a feature table.
# Everything is likely to change.

# Based on the DDBJ/EMBL/GenBank Feature Table Definition Version 2.2
# Dec 15 1999 available from EBI, but the documentation is not
# completely internally consistent, much less in agreement with real-life
# examples.  Conflicts are resolved to agree with real examples.
  8   
  9  # Uses John Aycock's SPARK for parsing 
 10  from Bio.Parsers.spark import GenericScanner, GenericParser 
 11   
class Token:
    """Generic token that carries nothing but its type name.

    The scanner in this module creates these for punctuation-like tokens
    such as ``"dot"``, ``"comma"`` or ``"open_paren"``.
    """

    def __init__(self, type):
        # ``type`` shadows the builtin, but it is the established
        # parameter name and callers may pass it by keyword.
        self.type = type

    def __cmp__(self, other):
        # Python 2 comparison hook: a token compares like its type name.
        # Presumably this lets SPARK match a token against a terminal
        # name in the grammar -- confirm against the spark module.
        return cmp(self.type, other)

    def __repr__(self):
        # Report the real class name (this used to read "Tokens"), in line
        # with the repr of every other class in this module.
        return "Token(%r)" % (self.type,)
# "38"
class Integer:
    """Token wrapping an integer value; its token type is ``"integer"``."""

    type = "integer"

    def __init__(self, val):
        self.val = val

    def __cmp__(self, other):
        # Python 2 comparison hook: compare by token type, not by value.
        kind = self.type
        return cmp(kind, other)

    def __str__(self):
        value = self.val
        return str(value)

    def __repr__(self):
        template = "Integer(%s)"
        return template % self.val
# From the BNF definition, this isn't needed.  Does that mean
# that bases can be referred to with negative numbers?
class UnsignedInteger(Integer):
    """Integer token variant whose token type is ``"unsigned_integer"``."""

    type = "unsigned_integer"

    def __repr__(self):
        template = "UnsignedInteger(%s)"
        return template % self.val
38
class Symbol:
    """Token wrapping a name; its token type is ``"symbol"``."""

    type = "symbol"

    def __init__(self, name):
        self.name = name

    def __cmp__(self, other):
        # Python 2 comparison hook: compare by token type, not by name.
        kind = self.type
        return cmp(kind, other)

    def __str__(self):
        label = self.name
        return str(label)

    def __repr__(self):
        quoted = repr(self.name)
        return "Symbol(%s)" % quoted
# ">38" -- The BNF says ">" is for the lower bound.. seems wrong to me
class LowBound:
    """Position written with a leading ``>``, e.g. ``>38``.

    ``base`` is whatever object followed the ``>`` sign.
    """

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        template = "LowBound(%r)"
        return template % self.base
# "<38"
class HighBound:
    """Position written with a leading ``<``, e.g. ``<38``.

    ``base`` is whatever object followed the ``<`` sign.
    """

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        template = "HighBound(%r)"
        return template % self.base
# 12.34
class TwoBound:
    """Pair of positions written with a single dot, e.g. ``12.34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        fields = (self.low, self.high)
        return "TwoBound(%r, %r)" % fields
# 12^34
class Between:
    """Pair of positions written with a caret, e.g. ``12^34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        fields = (self.low, self.high)
        return "Between(%r, %r)" % fields
# 12..34
class Range:
    """Pair of endpoints written with a double dot, e.g. ``12..34``."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        fields = (self.low, self.high)
        return "Range(%r, %r)" % fields
87
class Function:
    """An operator name applied to a list of arguments.

    ``name`` is the operator's name and ``args`` the argument list.
    """

    def __init__(self, name, args):
        self.name, self.args = name, args

    def __repr__(self):
        fields = (self.name, self.args)
        return "Function(%r, %r)" % fields
94
class AbsoluteLocation:
    """A local location together with an optional path.

    ``path`` is ``None`` when the location was given without a path.
    """

    def __init__(self, path, local_location):
        self.path, self.local_location = path, local_location

    def __repr__(self):
        fields = (self.path, self.local_location)
        return "AbsoluteLocation(%r, %r)" % fields
101
class Path:
    """An accession with an optional database name.

    ``database`` is ``None`` when only the accession was given.
    """

    def __init__(self, database, accession):
        self.database, self.accession = database, accession

    def __repr__(self):
        fields = (self.database, self.accession)
        return "Path(%r, %r)" % fields
108
class FeatureName:
    """A feature label with an optional path.

    ``path`` is ``None`` when the label was given without a path.
    """

    def __init__(self, path, label):
        self.path, self.label = path, label

    def __repr__(self):
        fields = (self.path, self.label)
        return "FeatureName(%r, %r)" % fields
115
class LocationScanner(GenericScanner):
    """Scanner that splits a feature-table location string into tokens.

    Every ``t_*`` method's docstring is the regular expression for one kind
    of token (SPARK's ``GenericScanner`` builds its tokenizer from these
    docstrings), so those docstrings must contain the pattern and nothing
    else.  Each handler appends the token it recognised to ``self.rv``.
    """

    def __init__(self):
        # The base initializer has to run: an empty __init__ is not even
        # valid Python, and skipping it leaves the scanner uninitialised.
        GenericScanner.__init__(self)

    def tokenize(self, input):
        """Scan ``input`` and return the list of tokens that were found."""
        # Start each run with a fresh result list; the t_* handlers fill it.
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    def t_double_colon(self, input):
        r" :: "
        self.rv.append(Token("double_colon"))

    def t_double_dot(self, input):
        r" \.\. "
        self.rv.append(Token("double_dot"))

    # A single dot only: the lookahead rejects the first dot of "..".
    def t_dot(self, input):
        r" \.(?!\.) "
        self.rv.append(Token("dot"))

    def t_caret(self, input):
        r" \^ "
        self.rv.append(Token("caret"))

    def t_comma(self, input):
        r" \, "
        self.rv.append(Token("comma"))

    # NOTE(review): this pattern also matches everything that
    # t_unsigned_integer matches (and digit strings also match t_symbol);
    # which handler fires depends on how GenericScanner orders the
    # alternatives -- confirm against the spark module.
    def t_integer(self, input):
        r" -?[0-9]+ "
        self.rv.append(Integer(int(input)))

    def t_unsigned_integer(self, input):
        r" [0-9]+ "
        self.rv.append(UnsignedInteger(int(input)))

    # A single colon only: the lookahead rejects the first colon of "::".
    def t_colon(self, input):
        r" :(?!:) "
        self.rv.append(Token("colon"))

    def t_open_paren(self, input):
        r" \( "
        self.rv.append(Token("open_paren"))

    def t_close_paren(self, input):
        r" \) "
        self.rv.append(Token("close_paren"))

    def t_symbol(self, input):
        r" [A-Za-z0-9_'*-][A-Za-z0-9_'*.-]* "
        # Needed an extra '.'
        self.rv.append(Symbol(input))

    def t_less_than(self, input):
        r" < "
        self.rv.append(Token("less_than"))

    def t_greater_than(self, input):
        r" > "
        self.rv.append(Token("greater_than"))
# punctuation .. hmm, isn't needed for location
#    r''' [ !#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~] '''
class LocationParser(GenericParser):
    """SPARK grammar that turns location tokens into location objects.

    Every ``p_*`` method's docstring holds the grammar rule(s) that the
    method handles (SPARK's ``GenericParser`` reads the rules from these
    docstrings), so those docstrings must contain only rules; explanatory
    notes are written as ``#`` comments instead.  Each method receives the
    values of the rule's right-hand side in ``args`` and returns the value
    for the left-hand side.
    """

    def __init__(self, start='location'):
        GenericParser.__init__(self, start)
        # Not read anywhere in this class; kept in case other code uses it.
        self.begin_pos = 0

    def p_location(self, args):
        """
        location ::= absolute_location
        location ::= feature_name
        location ::= function
        """
        return args[0]

    def p_function(self, args):
        """
        function ::= functional_operator open_paren location_list close_paren
        """
        # args: operator symbol, "(", list of locations, ")"
        return Function(args[0].name, args[2])

    def p_absolute_location(self, args):
        """
        absolute_location ::= local_location
        absolute_location ::= path colon local_location
        """
        # A single value means the location was given without a path.
        if len(args) == 1:
            return AbsoluteLocation(None, args[-1])
        return AbsoluteLocation(args[0], args[-1])

    def p_path(self, args):
        """
        path ::= database double_colon primary_accession
        path ::= primary_accession
        """
        # Three values means "database :: accession"; otherwise no database.
        if len(args) == 3:
            return Path(args[0], args[2])
        return Path(None, args[0])

    def p_feature_name(self, args):
        """
        feature_name ::= path colon feature_label
        feature_name ::= feature_label
        """
        # Three values means "path : label"; otherwise a bare label.
        if len(args) == 3:
            return FeatureName(args[0], args[2])
        return FeatureName(None, args[0])

    def p_feature_label(self, args):
        """
        feature_label ::= symbol
        """
        # The left-hand side used to be spelled "label", a nonterminal that
        # no other rule references, which left the "feature_label" used by
        # feature_name undefined and those productions unmatchable.
        return args[0].name

    def p_local_location(self, args):
        """
        local_location ::= base_position
        local_location ::= between_position
        local_location ::= base_range
        """
        return args[0]

    def p_location_list(self, args):
        """
        location_list ::= location
        location_list ::= location_list comma location
        """
        # Base case: the one-element args list already is the result.
        if len(args) == 1:
            return args
        # Recursive case: extend the list built so far with the new location.
        return args[0] + [args[2]]

    def p_functional_operator(self, args):
        """
        functional_operator ::= symbol
        """
        return args[0]

    def p_base_position(self, args):
        """
        base_position ::= integer
        base_position ::= low_base_bound
        base_position ::= high_base_bound
        base_position ::= two_base_bound
        """
        return args[0]

    def p_low_base_bound(self, args):
        """
        low_base_bound ::= greater_than integer
        """
        return LowBound(args[1])

    def p_high_base_bound(self, args):
        """
        high_base_bound ::= less_than integer
        """
        return HighBound(args[1])

    def p_two_base_bound_1(self, args):
        """
        two_base_bound ::= open_paren base_position dot base_position close_paren
        """
        # main example doesn't have parens but others do.. (?)
        return TwoBound(args[1], args[3])

    def p_two_base_bound_2(self, args):
        """
        two_base_bound ::= base_position dot base_position
        """
        # two_base_bound with no parentheses like 1.6
        return TwoBound(args[0], args[2])

    def p_between_position(self, args):
        """
        between_position ::= base_position caret base_position
        """
        return Between(args[0], args[2])

    def p_base_range(self, args):
        """
        base_range ::= base_position double_dot base_position
        base_range ::= function double_dot base_position
        base_range ::= base_position double_dot function
        base_range ::= function double_dot function
        """
        return Range(args[0], args[2])

    def p_database(self, args):
        """
        database ::= symbol
        """
        return args[0].name

    def p_primary_accession(self, args):
        """
        primary_accession ::= symbol
        """
        return args[0].name
# One module-level scanner, created at import time and reused by every
# scan() call instead of constructing a new LocationScanner each time.
_cached_scanner = LocationScanner()


def scan(input):
    """Tokenize a location string and return the resulting token list."""
    tokens = _cached_scanner.tokenize(input)
    return tokens
# One module-level parser, created at import time and reused by every
# parse() call instead of constructing a new LocationParser each time.
_cached_parser = LocationParser()


def parse(tokens):
    """Build the object representation for a sequence of location tokens."""
    result = _cached_parser.parse(tokens)
    return result
320