1 import string
2 import copy
3 import operator
4 import urllib
5 import sgmllib
6 import Bio.File
7 import Martel
8 from mx import TextTools
9
10 """
The LocusLink site is:
http://www.ncbi.nlm.nih.gov/LocusLink/

This module parses a LocusLink web page.
14 """
15
# NOTE(review): the `def` line of this function is not shown in this listing.
# Returns 1 when `item` is accepted by is_container() and has length 0,
# otherwise 0.
17 response = 0
18 if is_container( item ):
19 if len( item ) == 0:
20 response = 1
21 return response
22
def is_container(item):
    """Return 1 if *item* is exactly a list or a dict, otherwise 0.

    The comparison is on the exact type, so instances of list or dict
    subclasses are not reported as containers.
    """
    if type(item) in (list, dict):
        return 1
    return 0
28
# NOTE(review): the `def` line is not shown in this listing.
# Returns 1 when `b` occurs somewhere inside `a` (a.find( b ) >= 0), else 0.
30 if( a.find( b ) < 0 ):
31 return 0
32 else:
33 return 1
34
# NOTE(review): the `def` line is not shown in this listing.
# Debugging aid: prints a 'PARAMS' banner, the str() of the entries of
# `params` and a dashed separator line.
36 print '%s!!!!!\n' % 'PARAMS'
37 for item in params:
38
39 print 'param ' + str( item )
40 print '-----------------'
41
56
57 -def put( dict, key, val ):
63
64
97
101
# NOTE(review): the `def` lines of the three methods below are not shown.
# Equality test: 1 only when `other` is an instance of this object's class
# and carries the same `token` value, otherwise 0.
103 if not isinstance( other, self.__class__ ):
104 return 0
105 if self.token == other.token:
106 return 1
107 return 0
108
# Inequality test: 1 when `other` is not a Token or its `token` differs,
# otherwise 0.
110 if not isinstance( other, Token ):
111 return 1
112 if self.token != other.token:
113 return 1
114 return 0
115
# String form: 'token_' followed by the token value and a newline.
117 output = 'token_%s\n' % self.token
118 return output
119
120
# Marker tokens: the parser pushes these onto its structure_stack to mark
# where the items of a list or of a dict start and end.
121 open_list = Token( 'open_list' )
122 close_list = Token( 'close_list' )
123 open_dict = Token( 'open_dict' )
124 close_dict = Token( 'close_dict' )
125
133
141
144
146
147 - def __init__( self, url, label = '', description = '' ):
151
# NOTE(review): the `def` line is not shown in this listing.
# Builds and returns the text form of the object: the label, a 'url = ...'
# line and the description, each terminated by a newline.
153 output = '%s\n' % self.label
154 output = output + 'url = %s\n' % self.url
155 output = output + '%s\n' % self.description
156 return output
157
158
160
163
# NOTE(review): the `def` line is not shown in this listing.
# Builds and returns a printable dump: keys are visited in sorted order,
# each written upper-cased with a trailing ':' and followed by the
# print_item() rendering of its value.
165 queue_keys = self.keys()
166 queue_keys.sort()
167 out = ''
168 for key in queue_keys:
169 out = out + '%s:\n' % key.upper()
170 out = out + self.print_item( self[ key ] )
171 out = out + '\n'
172
173 return out
174
# NOTE(review): the `def` line is not shown; the body uses `item` and `level`.
# Renders `item` as indented text and returns it. The indent grows with
# `level`; non-empty strings and other values become one line each, while
# lists and dicts are rendered recursively with level + 1.
176 indent = ' '
177 out = ''
178 for j in range( 0, level ):
179 indent = indent + ' '
180 if( type( item ) == type( '' ) ):
181 if( item != '' ):
182 out = out + '%s%s\n' % ( indent, item )
183 elif( type( item ) == type([])):
184 for subitem in item:
185 out = out + self.print_item( subitem, level + 1 )
186 out = out + '----------------------------------------------\n'
187 elif( type( item ) == type ( {} ) ):
188 keys = item.keys()
189 keys.sort()
190 for subitem in keys:
191 out = out + '%skey is %s\n' % ( indent, subitem )
192 out = out + self.print_item( item[ subitem ], level + 1 )
# Same rendering as the plain-dict branch above, for objects that are dict
# instances without being exactly of type dict.
193 elif( isinstance( item, dict ) ):
194 keys = item.keys()
195 keys.sort()
196 for subitem in keys:
197 out = out + '%skey is %s\n' % ( indent, subitem )
198 out = out + self.print_item( item[ subitem ], level + 1 )
199 else:
200 out = out + '%s%s\n' % ( indent, str( item ) )
201 return out
202
203
205
# NOTE(review): the `def` line is not shown in this listing.
# Resets the base SGMLParser and then this parser's own state: the text
# buffer, a fresh Record, the open-tag and structure stacks and the state
# names.
# NOTE(review): detail_state is read by the tag handlers but is not
# initialised in this fragment.
207 sgmllib.SGMLParser.reset( self )
208 self.text = ''
209 self.record = Record()
210 self.open_tag_stack = []
211 self.open_tag = 'open_html'
212 self.outer_state = 'undefined'
213 self.section_state = 'undefined'
214 self.local_title = ''
215 self.structure_stack = []
216 self.category = ''
217 self.context_chain = []
# presumably maps a section keyword to the outer_state name used for it;
# the lookup itself is not visible in this listing -- TODO confirm
218 self.outer_state_dict = { 'nomenclature' : 'nomenclature', 'overview' : 'overview', \
219 'function' : 'function', \
220 'relationships' : 'relationships', \
221 'locus' : 'locus', \
222 'map' : 'map', \
223 'refseq' : 'refseq', \
224 'genbank' : 'genbank', \
225 'external' : 'external_annotation', \
226 'additional' : 'additional_links' \
227 }
228
229
230 - def parse( self, handle ):
234
235
236
237
238 - def feed( self, handle ):
251
def get_text(self):
    """Return the text collected so far and reset the buffer to ''."""
    text, self.text = self.text, ''
    return text
256
298
299
# NOTE(review): the `def` line is not shown in this listing.
# Strips surrounding whitespace from `newtext` and appends the result to the
# text accumulated in self.text.
301 newtext = string.strip( newtext )
302 self.text = self.text + newtext
303
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for <a> (it reads `attrs` and marks 'open_a').
# Remembers the previously open tag, collects the attributes into a dict and,
# while values are being collected in a section's contents, clears the text
# buffer and pushes a Url built from the href onto structure_stack.
305 self.open_tag_stack.append( self.open_tag )
306 self.open_tag = 'open_a'
307 attr_dict = {}
308 for key, val in attrs:
309 attr_dict[ key ] = val
310 outer_state = self.outer_state
311 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'refseq', 'additional', 'external' ] ):
312 if self.section_state == 'local_contents':
313 if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
314 if attr_dict.has_key( 'href' ):
315 href = attr_dict[ 'href' ]
316 self.text = ''
317 self.structure_stack.append( Url( href, '' ) )
# The 'function' section additionally accepts the 'may_be_val' state.
318 elif outer_state == 'function':
319 if self.section_state == 'local_contents':
320 if self.detail_state in [ 'scan_val', 'unpaired_key', 'may_be_val' ]:
321 if attr_dict.has_key( 'href' ):
322 href = attr_dict[ 'href' ]
323 self.text = ''
324 self.structure_stack.append( Url( href, '' ) )
325
326
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_a' start handler.
# Restores the previously open tag ('open_html' when the stack is empty) and,
# in the same section/detail states as the start handler, takes the collected
# text, pops the top of structure_stack, stores the text as its label when it
# is a Url, and appends the item again.
328 try:
329 self.open_tag = self.open_tag_stack.pop()
330 except:
331 self.open_tag = 'open_html'
332 outer_state = self.outer_state
333 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
334 if self.section_state == 'local_contents':
335 if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
336 text = self.get_text()
337 url = self.structure_stack.pop()
338 if isinstance( url, Url ):
339 url.label = text
340 self.structure_stack.append( url )
341
342 elif outer_state == 'function':
343 if self.section_state == 'local_contents':
344 if self.detail_state in [ 'scan_val', 'unpaired_key',
345 'may_be_val' ]:
346 text = self.get_text()
347 url = self.structure_stack.pop()
348 if isinstance( url, Url ):
349 url.label = text
350 self.structure_stack.append( url )
351
353
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for <b> (it marks 'open_b').
# Remembers the previously open tag and, for the listed sections, clears the
# text buffer.
354 self.open_tag_stack.append( self.open_tag )
355 self.open_tag = 'open_b'
356 outer_state = self.outer_state
357 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
358 self.text = ''
359
360
361
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_b' start handler.
# Restores the previously open tag. Inside a section's contents it takes the
# collected text, splits it at the first ':' into a key and an optional
# value, and pushes section-specific marker tokens, the key and any non-blank
# value onto structure_stack, updating detail_state along the way.
363 try:
364 self.open_tag = self.open_tag_stack.pop()
365 except:
366 self.open_tag = 'open_html'
367 outer_state = self.outer_state
368 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
369 if self.section_state == 'local_contents':
370 text = self.get_text()
371 cols = text.split( ':', 1 )
372 key = cols[ 0 ]
373 if( outer_state == 'refseq' ):
# NOTE(review): cols[ 1 ] is read without a guard here; text without a ':'
# would raise IndexError -- confirm that refseq headings always contain one.
374 self.structure_stack.append( cols[ 1 ] )
375 self.structure_stack.append( open_dict )
376 self.detail_state = 'waiting_key'
377 elif outer_state == 'relationships':
378 self.structure_stack.append( key )
379 self.structure_stack.append( open_list )
380 self.detail_state = 'skip'
381 elif outer_state == 'additional':
382 self.structure_stack.append( open_dict )
383 self.structure_stack.append( key )
384 self.structure_stack.append( open_list )
385 self.detail_state = 'unpaired_key'
386 elif outer_state == 'function':
387 if self.detail_state != 'waiting_key':
388 self.structure_stack.append( close_list )
389 self.structure_stack.append( key )
390 self.detail_state = 'unpaired_key'
391 self.structure_stack.append( open_list )
392 self.structure_stack.append( open_list )
# The value part after ':' is optional; a missing one is ignored.
393 try:
394 val = cols[ 1 ]
395 if val.strip() != '':
396 self.structure_stack.append( val )
397 self.detail_state = 'unpaired_key'
398
399 except IndexError:
400 pass
401 else:
402 if self.detail_state != 'waiting_key':
403 self.structure_stack.append( close_list )
404 self.detail_state = 'scan_val'
405 self.structure_stack.append( key )
406 self.structure_stack.append( open_list )
407 self.structure_stack.append( open_list )
408 try:
409 val = cols[ 1 ]
410 if val.strip() != '':
411 self.structure_stack.append( val )
412 except IndexError:
413 pass
414
415
417
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for <th> (it marks 'open_th').
# Remembers the previously open tag, clears the text buffer and, for the
# listed sections while in 'local_contents', sets detail_state to
# 'scan_headings'.
418 self.open_tag_stack.append( self.open_tag )
419 self.open_tag = 'open_th'
420 outer_state = self.outer_state
421 self.text = ''
422 if outer_state in [ 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
423 if self.section_state == 'local_contents':
424 self.detail_state = 'scan_headings'
425
426
427
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_th' start handler.
# Restores the previously open tag. Only the 'refseq' section is handled:
# a heading starting with 'category' opens a new dict/list structure on
# structure_stack; other non-blank headings are pushed as keys, followed by
# their value when the heading text contains a ':'.
429 try:
430 self.open_tag = self.open_tag_stack.pop()
431 except:
432 self.open_tag = 'open_html'
433 outer_state = self.outer_state
434 if outer_state == 'refseq':
435 if self.section_state == 'local_contents':
436 text = self.get_text()
437 cols = text.strip().split( ':', 1 )
438 if text.strip().lower().startswith( 'category' ):
439 self.structure_stack.append( open_dict )
# NOTE(review): cols[ 1 ] is read without a guard; a 'category' heading
# without ':' would raise IndexError.
440 self.structure_stack.append( cols[ 1 ] )
441 self.structure_stack.append( open_list )
442 self.structure_stack.append( open_dict )
443 self.detail_state = 'found_category'
444
445 elif self.detail_state in [ 'found_category', 'may_be_val' ]:
446 if text.strip() != '':
447 if self.detail_state != 'found_category':
448 self.structure_stack.append( close_list )
449 cols = text.split( ':' )
450 self.structure_stack.append( cols[ 0 ] )
451 self.structure_stack.append( open_list )
# No ':' in the heading means the value may arrive later ('may_be_val').
452 try:
453 val = cols[ 1 ]
454 self.structure_stack.append( open_list )
455 self.structure_stack.append( val )
456 self.detail_state = 'scan_val'
457 except IndexError:
458 self.detail_state = 'may_be_val'
459
460
461
462
463
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for <table> (it marks 'open_table').
# Remembers the previously open tag, clears the text buffer and, while in
# 'local_contents', sets detail_state to 'skip' for the genbank section or
# to 'waiting_key' for the other listed sections.
465 self.open_tag_stack.append( self.open_tag )
466 self.open_tag = 'open_table'
467 self.text = ''
468 if self.outer_state == 'genbank':
469 if self.section_state == 'local_contents':
470 self.detail_state = 'skip'
# NOTE(review): 'genbank' in the list below looks unreachable if this elif
# pairs with the genbank test above -- nesting is not visible in this listing.
471 elif( self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'additional', 'external' ] ):
472
473 if self.section_state == 'local_contents':
474 self.detail_state = 'waiting_key'
475
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_table' start handler.
# Restores the previously open tag. When section_state is 'local_title' it
# switches to 'local_contents' (also setting detail_state for sections other
# than refseq); when already in 'local_contents' it pushes the closing
# tokens appropriate for the section onto structure_stack.
477 try:
478 self.open_tag = self.open_tag_stack.pop()
479 except:
480 self.open_tag = 'open_html'
481 if( self.section_state == 'local_title' ):
482 if self.outer_state == 'refseq':
483 self.section_state = 'local_contents'
484 elif self.outer_state == 'additional':
485 self.section_state = 'local_contents'
486 self.detail_state = 'scan_val'
487 else:
488 self.section_state = 'local_contents'
489 self.detail_state = 'waiting_key'
490 elif self.section_state == 'local_contents':
491 if( self.outer_state in [ 'nomenclature', 'relationships', 'locus', 'map', 'external' ] ):
492 self.structure_stack.append( close_list )
493 elif ( self.outer_state in [ 'genbank', 'additional' ] ):
494 if self.detail_state == 'scan_val':
495 self.structure_stack.append( close_list )
496
497 elif self.outer_state == 'refseq':
498 if self.detail_state in ['may_be_val', 'scan_val' ]:
499 self.structure_stack.append( close_list )
500 self.structure_stack.append( close_dict )
501 self.structure_stack.append( close_list )
502 self.structure_stack.append( close_dict )
503 self.detail_state = 'scan_category'
504
505
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for a table row (it marks 'open_table_row').
# If a row was still open (the previous open tag was 'open_table_row') it
# handles the refseq scan_val case by pushing close_list and drops the entry
# it just pushed on open_tag_stack. It then clears the text buffer and, per
# section, pushes open_list when a row of values starts.
507 top = self.open_tag
508 self.open_tag_stack.append( self.open_tag )
509 if top == 'open_table_row':
510 if self.outer_state == 'refseq':
511 if self.section_state == 'local_contents':
512 if self.detail_state in [ 'scan_val', ]:
513 self.structure_stack.append( close_list )
514 self.detail_state = 'may_be_val'
515 self.open_tag_stack.pop()
516 self.open_tag = 'open_table_row'
517 self.text = ''
518 outer_state = self.outer_state
519 if( outer_state in [ 'relationships', 'locus', 'function', 'genbank', 'external'
520 ] ):
521 if self.section_state == 'local_contents':
522 if self.detail_state == 'scan_val':
523 self.structure_stack.append( open_list )
524 elif outer_state == 'map':
525 if self.section_state == 'local_contents':
526 if self.detail_state == 'scan_val':
527 self.structure_stack.append( open_list )
528
# The 'additional' section switches to 'scan_val' itself before opening the
# list.
529 elif outer_state == 'additional':
530 if self.section_state == 'local_contents':
531 self.detail_state = 'scan_val'
532 self.structure_stack.append( open_list )
533
534
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_table_row' start handler.
# Restores the previously open tag. While in 'local_contents' it pushes the
# closing tokens for the row onto structure_stack and advances detail_state,
# with separate rules for each group of sections.
536 try:
537 self.open_tag = self.open_tag_stack.pop()
538 except:
539 self.open_tag = 'open_html'
540 if self.section_state == 'local_contents':
541 if( self.outer_state in [ 'overview', 'nomenclature', 'relationships',
542 'locus', 'genbank', 'external' ] ):
543 if self.detail_state == 'scan_val':
544 self.structure_stack.append( close_list )
545 elif self.detail_state == 'unpaired_key':
546 self.structure_stack.append( close_list )
547 elif self.detail_state == 'skip':
548 self.detail_state = 'scan_val'
549 elif self.detail_state == 'scan_headings':
550 self.detail_state = 'scan_val'
551 elif self.outer_state in [ 'additional', ]:
552 if self.detail_state == 'unpaired_key':
553 self.structure_stack.append( close_list )
554 self.structure_stack.append( close_dict )
555 self.structure_stack.append( close_list )
556 elif self.detail_state == 'scan_val':
557 self.structure_stack.append( close_list )
558 elif self.outer_state in [ 'function', ]:
559 if self.detail_state == 'scan_headings':
560 self.detail_state = 'scan_val'
561 elif self.detail_state == 'unpaired_key':
562 self.detail_state = 'may_be_val'
563 self.structure_stack.append( close_list )
564 elif self.detail_state == 'scan_val':
565 self.detail_state = 'may_be_val'
566 self.structure_stack.append( close_list )
567 elif self.outer_state in [ 'refseq', ]:
568 if self.section_state == 'local_contents':
569 if self.detail_state == 'scan_val':
570 self.structure_stack.append( close_list )
571 self.detail_state = 'may_be_val'
572 elif self.outer_state == 'map':
573 if self.section_state == 'local_contents':
574 if self.detail_state == 'scan_val':
575 self.structure_stack.append( close_list )
576 self.detail_state = 'may_be_val'
577
578
# NOTE(review): the `def` line is not shown; presumably the sgmllib start
# handler for a table cell (it marks 'open_table_data').
# Remembers the previously open tag and clears the text buffer while in
# 'local_contents'. For refseq, a pending 'may_be_val' state opens a list and
# switches to 'scan_val'.
580 self.open_tag_stack.append( self.open_tag )
581 self.open_tag = 'open_table_data'
582 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
583 if( self.section_state == 'local_contents' ):
584 self.text = ''
585 elif self.outer_state == 'refseq':
586 if self.section_state == 'local_contents':
587 self.text = ''
588 if self.detail_state == 'may_be_val':
589 self.structure_stack.append( open_list )
590 self.detail_state = 'scan_val'
591
# NOTE(review): the `def` line is not shown; presumably the end handler
# matching the 'open_table_data' start handler.
# Restores the previously open tag. While in 'local_contents' it pushes the
# non-empty cell text onto structure_stack; the function and map sections
# first open a list when the state was 'may_be_val', and refseq hands the
# text to add_text_to_object().
593 try:
594 self.open_tag = self.open_tag_stack.pop()
595 except:
596 self.open_tag = 'open_html'
597
598
599
600 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'genbank', 'additional', 'external' ]:
601 if( self.section_state == 'local_contents' ):
602 if self.detail_state == 'scan_val':
603 text = self.get_text()
604 if( text != '' ):
605 self.structure_stack.append( text )
606 elif self.outer_state == 'function':
607 if self.section_state == 'local_contents':
608 text = self.get_text()
609 if( text != '' ):
610 if self.detail_state == 'may_be_val':
611 if text.strip() != '':
612 self.structure_stack.append( open_list )
613 self.detail_state = 'scan_val'
614 if self.detail_state in [ 'unpaired_key', 'scan_val' ]:
615 self.structure_stack.append( text )
616 elif self.outer_state == 'map':
617 if self.section_state == 'local_contents':
618 text = self.get_text()
619 if( text != '' ):
620 if self.detail_state == 'may_be_val':
621 if text.strip() != '':
622 self.structure_stack.append( open_list )
623 self.detail_state = 'scan_val'
624 if self.detail_state == 'scan_val':
625 self.structure_stack.append( text )
626 elif self.outer_state == 'refseq':
627 if self.section_state == 'local_contents':
628 if self.detail_state == 'scan_val':
629 text = self.get_text()
630 if( text != '' ):
631 self.add_text_to_object( text )
632
def do_br(self, attrs):
    """Handle a <br> tag by flushing pending text onto the structure stack.

    The text collected so far is pushed as its own value only when all of
    the following hold: the parser is in one of the listed sections, the
    section state is 'local_contents', values are being scanned
    ('scan_val') and an 'open_table_data' tag is on the open-tag stack.
    Empty text is never pushed.

    attrs -- attribute list supplied by the SGML parser; not used.
    """
    sections = ('nomenclature', 'overview', 'function', 'relationships',
                'map', 'locus', 'genbank', 'additional', 'external')
    # Guard clauses keep the same evaluation order as a nested test, so
    # detail_state is only read once the section checks have passed.
    if self.outer_state not in sections:
        return
    if self.section_state != 'local_contents':
        return
    if self.detail_state != 'scan_val':
        return
    if not self.is_contained_by('open_table_data'):
        return
    text = self.get_text()
    if text != '':
        self.structure_stack.append(text)
641
642
def add_text_to_object(self, text):
    """Attach *text* to the item on top of the structure stack.

    If the top item is a Url, *text* becomes its description when the
    description is still empty; a Url that already has a description is
    left unchanged and *text* is discarded.  For any other top item *text*
    is pushed as a new stack entry.  An empty stack is treated like the
    non-Url case instead of raising IndexError.
    """
    stack = self.structure_stack
    # Peeking at the top is equivalent to popping the item and pushing the
    # same object back.
    if stack and isinstance(stack[-1], Url):
        top = stack[-1]
        if top.description == '':
            top.description = text
        return
    stack.append(text)
652
653
654
def is_contained_by(self, tag):
    """Return True if *tag* is present in self.open_tag_stack."""
    return tag in self.open_tag_stack
657
# NOTE(review): the `def` line is not shown in this listing.
# Collapses structure_stack for the listed sections: while more than one
# entry remains, items are popped into `params` until an open token is met,
# converted with process_list() (for open_list) or process_dict(), and the
# collected items are appended to the stack again once the stack is empty or
# its top is an open token.
659 params = []
660 outer_state = self.outer_state
661 if outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'refseq', 'locus', 'map', 'genbank', 'additional', 'external' ]:
662 while len( self.structure_stack ) > 1:
663 len_stack = len( self.structure_stack )
664
665 for i in range ( 0, len_stack ):
666 item = self.structure_stack.pop()
667 if not is_open_token( item ):
668 params.append( item )
669 else: break
670 if( open_list.__eq__( item ) ):
671 container = process_list( params )
672 params.append( container )
673 else:
674 container = process_dict( params )
675 if len( container ) > 0:
676 params.append( container )
677 if ( len( self.structure_stack ) == 0 ) or is_open_token(
678 self.structure_stack[ -1 ] ):
679 for j in range( 0, len( params ) ):
680 item = params.pop()
681 self.structure_stack.append( item )
682 params = []
683
684
# NOTE(review): the `def` line is not shown in this listing.
# Debugging aid: prints the upper-cased outer_state as a banner, the str()
# of the entries of structure_stack and a dashed separator line.
686 print '%s!!!!!\n' % self.outer_state.upper()
687 for stack_item in self.structure_stack:
688 print 'stack has ' + str( stack_item )
689 print '-----------------'
690
691
692
693
if __name__ == '__main__':
    # Manual smoke test: parse a saved LocusLink page and print the record.
    handle = open('Hs13225.htm')
    try:
        # NOTE(review): the UndoHandle wrapper is created but the raw handle
        # is what gets parsed -- confirm which one parse() is meant to take.
        undo_handle = Bio.File.UndoHandle(handle)
        locuslink_parser = LocusLinkParser()
        record = locuslink_parser.parse(handle)
    finally:
        # Close the input file even when parsing fails.
        handle.close()
    print(record)
700