Package Bio :: Package LocusLink :: Module web_parse
[hide private]
[frames | no frames]

Source Code for Module Bio.LocusLink.web_parse

  1  import string 
  2  import copy 
  3  import operator 
  4  import urllib 
  5  import sgmllib 
  6  import Bio.File 
  7  import Martel 
  8  from mx import TextTools 
  9   
 10  """ 
 11  The LocusLink site is: 
 12  http://www.ncbi.nlm.nih.gov/LocusLink/ 
 13  Parses a LocusLink web page into a Record. 
 14  """ 
 15   
def is_empty_container(item):
    """Return 1 if *item* is a plain list or dict holding nothing, else 0.

    Only objects whose type is exactly ``list`` or ``dict`` are considered
    containers; subclasses and all other types give 0.
    """
    if type(item) not in (list, dict):
        return 0
    if len(item) == 0:
        return 1
    return 0
22
def is_container(item):
    """Return 1 when the type of *item* is exactly ``list`` or ``dict``, else 0.

    Subclasses of list/dict are not treated as containers because the check
    compares the type itself rather than using ``isinstance``.
    """
    kind = type(item)
    if kind in (list, dict):
        return 1
    return 0
28
def is_substring(a, b):
    """Return 1 if ``a.find(b)`` locates *b* somewhere in *a*, otherwise 0."""
    position = a.find(b)
    return int(position >= 0)
34 41
def process_list(params):
    """Pop items off the end of *params* into a new list until a token is met.

    *params* is consumed in place.  Items are taken from the end, so the
    returned list holds them in reverse of their order in *params*.  Popping
    stops when *params* is exhausted or when an open/close token is popped;
    that token is discarded and is not part of the result.

    Returns the list of collected (non-token) items.
    """
    container = []
    while params:
        element = params.pop()
        # Any structural token terminates the current list.
        if is_close_token(element) or is_open_token(element):
            break
        container.append(element)
    return container
56
def put(dict, key, val):
    """Store *val* under *key* in *dict*, pairing it with any existing value.

    If *key* is new, *val* is stored directly.  If *key* is already present,
    the entry is replaced by the two-element list ``[old_value, val]``.

    The first parameter keeps its original name (which shadows the builtin)
    so that existing callers are unaffected.

    NOTE(review): a third value for the same key yields ``[[a, b], c]``
    rather than a flat list -- confirm whether that nesting is intended.
    """
    # ``in`` replaces the Python-2-only ``has_key`` with identical semantics.
    if key in dict:
        dict[key] = [dict[key], val]
    else:
        dict[key] = val
63 64
def process_dict( params ):
    """Fold items popped from the end of *params* into a dict.

    *params* is consumed in place.  Each popped item is handled as follows:
    a dict is merged into the result; a close token stops processing; any
    other item is taken as a key and the next popped item as its value.
    Entries are stored with put(), so a repeated key is paired with its
    earlier value.

    Returns the dict that was built.
    """
    container = {}
    while len( params ) > 0:
        element = params.pop()
        if type( element ) == type( {} ):
            # An already-built dict: merge its entries into the result.
            for key, val in element.items():
                put( container, key, val )
        elif is_close_token( element ): break
        elif is_open_token( element ):
            # NOTE(review): the element is pushed back and will be popped
            # again on the next iteration, so this branch appears to loop
            # forever if it is ever reached -- confirm it is unreachable.
            params.append( element )
        else:
            # element is a key; its value is the next item on the stack.
            # NOTE(review): raises IndexError if the key was the last item.
            val = params.pop()
            if type( val ) == type( [] ):
                # A one-element list is unwrapped to the bare element.
                if len( val ) == 1:
                    val = val[ 0 ]
                try:
                    put( container, element, val )
                except:
                    # NOTE(review): bare except; on any failure the key is
                    # reported and pushed back onto params.
                    print 'Element'
                    print element
                    params.append( element )

            elif( not is_close_token( val ) ):
                try:
                    put( container, element, val )
                except:
                    print 'Element'
                    print element
                    params.append( element )
            else:
                # A close token where a value was expected ends the dict.
                break
    return container
97
class Token:
    """Named marker object compared by its ``token`` string.

    Two tokens are equal when both are ``Token`` instances carrying equal
    ``token`` values; a token never equals a non-Token object.  Equality,
    inequality and hashing are kept mutually consistent.
    """

    def __init__(self, token):
        """Remember *token*, the name that identifies this marker."""
        self.token = token

    def __eq__(self, other):
        """Return True if *other* is a Token with the same name."""
        if not isinstance(other, Token):
            return False
        return bool(self.token == other.token)

    def __ne__(self, other):
        """Return the exact negation of ``__eq__``."""
        return not self.__eq__(other)

    def __hash__(self):
        """Hash on the name so that equal tokens hash alike."""
        return hash(self.token)

    def __repr__(self):
        return 'Token(%r)' % (self.token,)

    def __str__(self):
        return 'token_%s\n' % self.token
# Module-level marker tokens: one open/close pair for list groups and one
# for dict groups.
open_list, close_list = Token('open_list'), Token('close_list')
open_dict, close_dict = Token('open_dict'), Token('close_dict')
def is_open_token(target):
    """Return 1 if *target* is a Token equal to open_list or open_dict, else 0."""
    if not isinstance(target, Token):
        return 0
    for opener in (open_list, open_dict):
        if opener.__eq__(target):
            return 1
    return 0
133
def is_close_token(target):
    """Return 1 if *target* is a Token equal to close_list or close_dict, else 0."""
    if not isinstance(target, Token):
        return 0
    for closer in (close_list, close_dict):
        if closer.__eq__(target):
            return 1
    return 0
141
def is_token(target):
    """Return a true value if *target* is any of the open or close tokens."""
    opened = is_open_token(target)
    if opened:
        return opened
    return is_close_token(target)
144
class Url:
    """A link: its address plus an optional label and description."""

    def __init__(self, url, label='', description=''):
        """Store the address *url* with its *label* and *description* text."""
        self.url = url
        self.label = label
        self.description = description

    def __str__(self):
        """Return three lines: the label, ``url = <address>``, the description."""
        lines = (
            '%s\n' % self.label,
            'url = %s\n' % self.url,
            '%s\n' % self.description,
        )
        return ''.join(lines)
157 158
class Record(dict):
    """Dictionary of parsed sections with a readable text rendering.

    ``str(record)`` lists the keys in sorted order; each key is upper-cased
    and followed by an indented dump of its value.
    """

    def __init__(self):
        dict.__init__(self)

    def __str__(self):
        """Return every entry rendered as ``KEY:`` plus its formatted value."""
        pieces = []
        # sorted() works whether keys() returns a list or a view.
        for key in sorted(self.keys()):
            pieces.append('%s:\n' % key.upper())
            pieces.append(self.print_item(self[key]))
            pieces.append('\n')
        return ''.join(pieces)

    def print_item(self, item, level=1):
        """Return *item* formatted as indented text.

        item  -- the value to render.
        level -- nesting depth; each level adds one unit of indentation and
                 nested values are rendered one level deeper.

        A plain ``str`` becomes one indented line (an empty string renders
        as nothing).  A plain ``list`` renders each element followed by a
        dashed rule.  Any dict (including subclasses) renders a
        ``key is <key>`` line per sorted key followed by that key's value.
        Everything else is rendered through ``str()``.
        """
        indent = ' ' + ' ' * level
        if type(item) == type(''):
            if item == '':
                return ''
            return '%s%s\n' % (indent, item)
        if type(item) == type([]):
            parts = [self.print_item(subitem, level + 1) for subitem in item]
            parts.append('----------------------------------------------\n')
            return ''.join(parts)
        if isinstance(item, dict):
            parts = []
            for subitem in sorted(item.keys()):
                parts.append('%skey is %s\n' % (indent, subitem))
                parts.append(self.print_item(item[subitem], level + 1))
            return ''.join(parts)
        return '%s%s\n' % (indent, str(item))
202 203
class LocusLinkParser(sgmllib.SGMLParser):
    """SGML parser that turns a LocusLink HTML page into a ``Record``.

    The page is read as a series of sections.  A section starts at an HTML
    comment whose text begins with one of the keys of ``outer_state_dict``
    and ends at a comment that begins with ``end`` and contains that key.
    While a section is open, the tag handlers push keys, values, ``Url``
    objects and open/close tokens onto ``structure_stack``; when the section
    ends the stack is folded into nested lists and dicts and the result is
    stored in the record under the section's name.

    State attributes:
      outer_state   -- key of the section being read, or 'undefined'.
      section_state -- 'local_title' from the section comment until the
                       first table closes, then 'local_contents'.
      detail_state  -- fine-grained state used while reading the contents.
      open_tag / open_tag_stack -- innermost open tag and its ancestors.
    """

    def reset(self):
        """Reset the underlying SGML parser and clear all parsing state."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        self.record = Record()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.outer_state = 'undefined'
        self.section_state = 'undefined'
        # Defined up front so the attribute exists before any section comment
        # has been seen.
        self.detail_state = 'undefined'
        self.local_title = ''
        self.structure_stack = []
        self.category = ''
        self.context_chain = []
        # Maps the leading word of a section comment to the record key.
        self.outer_state_dict = {
            'nomenclature': 'nomenclature',
            'overview': 'overview',
            'function': 'function',
            'relationships': 'relationships',
            'locus': 'locus',
            'map': 'map',
            'refseq': 'refseq',
            'genbank': 'genbank',
            'external': 'external_annotation',
            'additional': 'additional_links',
        }

    def parse(self, handle):
        """Reset the parser, parse everything in *handle*, return the Record."""
        self.reset()
        self.feed(handle)
        return self.record

    def feed(self, handle):
        """Read every line from *handle* and hand the whole text to sgmllib.

        *handle* is wrapped in a ``Bio.File.UndoHandle`` unless it already is
        one.  Reading stops only when ``readline()`` returns an empty value.
        Each line is prefixed with a space before the lines are joined.
        """
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        pieces = []
        while 1:
            line = uhandle.readline()
            if not line:
                break
            pieces.append(' ' + line)
        sgmllib.SGMLParser.feed(self, ''.join(pieces))

    def get_text(self):
        """Return the text accumulated so far and clear the buffer."""
        text = self.text
        self.text = ''
        return text

    def _pop_open_tag(self):
        """Restore ``open_tag`` to the enclosing tag, or 'open_html' if none."""
        try:
            self.open_tag = self.open_tag_stack.pop()
        except IndexError:
            # Unbalanced closing tag: fall back to the document level.
            self.open_tag = 'open_html'

    def handle_comment(self, comment):
        """Open or close a section according to an HTML comment.

        Leading dashes and surrounding whitespace are removed and the text is
        lower-cased.  A comment starting with a section key opens that
        section; a comment starting with 'end' that mentions the current
        section folds the structure stack and stores the result in the
        record.
        """
        while comment.startswith('-'):
            comment = comment[1:]
        comment = comment.strip()
        comment = comment.lower()

        for key in self.outer_state_dict:
            if comment.startswith(key):
                # The outermost container is a dict for most sections and a
                # list for genbank, additional and refseq.
                if key in ['nomenclature', 'overview', 'function',
                           'relationships', 'map', 'locus', 'external']:
                    self.structure_stack.append(open_dict)
                elif key in ['genbank', 'additional']:
                    self.structure_stack.append(open_list)
                elif key in ['refseq']:
                    self.structure_stack.append(open_list)
                self.outer_state = key
                self.section_state = 'local_title'
                self.detail_state = 'undefined'
                if key == 'refseq':
                    self.detail_state = 'waiting_category'
                else:
                    self.detail_state = 'waiting_key'
                break
        if comment.startswith('end'):
            if is_substring(comment.lower(), self.outer_state):
                if self.outer_state == 'refseq':
                    self.structure_stack.append(close_list)
                elif self.outer_state == 'function':
                    self.structure_stack.append(close_list)
                    self.structure_stack.append(close_dict)
                self.process_structure_stack()
                # Skip leftover tokens; the first non-token item is the
                # section's value.
                while 1:
                    try:
                        item = self.structure_stack.pop()
                    except IndexError:
                        item = 'Not Available'
                    if not is_token(item):
                        break
                key = self.outer_state
                self.record[self.outer_state_dict[key]] = item
                self.outer_state = 'undefined'

    def handle_data(self, newtext):
        """Append *newtext*, stripped of surrounding whitespace, to the buffer."""
        newtext = newtext.strip()
        self.text = self.text + newtext

    def start_a(self, attrs):
        """On <a>: push a ``Url`` for the href when a value is being read."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_a'
        attr_dict = {}
        for key, val in attrs:
            attr_dict[key] = val
        outer_state = self.outer_state
        if outer_state in ['nomenclature', 'overview', 'relationships',
                           'locus', 'map', 'genbank', 'refseq', 'additional',
                           'external']:
            if self.section_state == 'local_contents':
                if self.detail_state in ['scan_val', 'unpaired_key']:
                    if 'href' in attr_dict:
                        href = attr_dict['href']
                        self.text = ''
                        self.structure_stack.append(Url(href, ''))
        elif outer_state == 'function':
            if self.section_state == 'local_contents':
                if self.detail_state in ['scan_val', 'unpaired_key',
                                         'may_be_val']:
                    if 'href' in attr_dict:
                        href = attr_dict['href']
                        self.text = ''
                        self.structure_stack.append(Url(href, ''))

    def end_a(self):
        """On </a>: use the buffered text as the label of the pending Url."""
        self._pop_open_tag()
        outer_state = self.outer_state
        if outer_state in ['nomenclature', 'overview', 'relationships',
                           'locus', 'map', 'refseq', 'genbank', 'additional',
                           'external']:
            if self.section_state == 'local_contents':
                if self.detail_state in ['scan_val', 'unpaired_key']:
                    text = self.get_text()
                    url = self.structure_stack.pop()
                    if isinstance(url, Url):
                        url.label = text
                    self.structure_stack.append(url)

        elif outer_state == 'function':
            if self.section_state == 'local_contents':
                if self.detail_state in ['scan_val', 'unpaired_key',
                                         'may_be_val']:
                    text = self.get_text()
                    url = self.structure_stack.pop()
                    if isinstance(url, Url):
                        url.label = text
                    self.structure_stack.append(url)

    def start_b(self, attrs):
        """On <b>: start collecting the text of a bold key."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_b'
        outer_state = self.outer_state
        if outer_state in ['nomenclature', 'overview', 'function',
                           'relationships', 'locus', 'map', 'refseq',
                           'genbank', 'additional', 'external']:
            self.text = ''

    def end_b(self):
        """On </b>: treat the bold text as ``key[: value]`` and push it.

        The text is split once on ':'; what is pushed and the next
        ``detail_state`` depend on the current section.
        """
        self._pop_open_tag()
        outer_state = self.outer_state
        if outer_state in ['nomenclature', 'overview', 'function',
                           'relationships', 'locus', 'map', 'refseq',
                           'genbank', 'additional', 'external']:
            if self.section_state == 'local_contents':
                text = self.get_text()
                cols = text.split(':', 1)
                key = cols[0]
                if outer_state == 'refseq':
                    # NOTE(review): raises IndexError when the bold text has
                    # no ':' -- confirm refseq headings always contain one.
                    self.structure_stack.append(cols[1])
                    self.structure_stack.append(open_dict)
                    self.detail_state = 'waiting_key'
                elif outer_state == 'relationships':
                    self.structure_stack.append(key)
                    self.structure_stack.append(open_list)
                    self.detail_state = 'skip'
                elif outer_state == 'additional':
                    self.structure_stack.append(open_dict)
                    self.structure_stack.append(key)
                    self.structure_stack.append(open_list)
                    self.detail_state = 'unpaired_key'
                elif outer_state == 'function':
                    # Close the previous key's list before starting a new key.
                    if self.detail_state != 'waiting_key':
                        self.structure_stack.append(close_list)
                    self.structure_stack.append(key)
                    self.detail_state = 'unpaired_key'
                    self.structure_stack.append(open_list)
                    self.structure_stack.append(open_list)
                    try:
                        val = cols[1]
                        if val.strip() != '':
                            self.structure_stack.append(val)
                            self.detail_state = 'unpaired_key'
                    except IndexError:
                        pass
                else:
                    if self.detail_state != 'waiting_key':
                        self.structure_stack.append(close_list)
                    self.detail_state = 'scan_val'
                    self.structure_stack.append(key)
                    self.structure_stack.append(open_list)
                    self.structure_stack.append(open_list)
                    try:
                        val = cols[1]
                        if val.strip() != '':
                            self.structure_stack.append(val)
                    except IndexError:
                        pass

    def start_th(self, attrs):
        """On <th>: clear the buffer and note that headings are being read."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_th'
        outer_state = self.outer_state
        self.text = ''
        if outer_state in ['function', 'relationships', 'map', 'locus',
                           'genbank', 'additional', 'external']:
            if self.section_state == 'local_contents':
                self.detail_state = 'scan_headings'

    def end_th(self):
        """On </th> in the refseq section: push a category or a key heading."""
        self._pop_open_tag()
        outer_state = self.outer_state
        if outer_state == 'refseq':
            if self.section_state == 'local_contents':
                text = self.get_text()
                cols = text.strip().split(':', 1)
                if text.strip().lower().startswith('category'):
                    self.structure_stack.append(open_dict)
                    self.structure_stack.append(cols[1])
                    self.structure_stack.append(open_list)
                    self.structure_stack.append(open_dict)
                    self.detail_state = 'found_category'

                elif self.detail_state in ['found_category', 'may_be_val']:
                    if text.strip() != '':
                        if self.detail_state != 'found_category':
                            self.structure_stack.append(close_list)
                        cols = text.split(':')
                        self.structure_stack.append(cols[0])
                        self.structure_stack.append(open_list)
                        try:
                            val = cols[1]
                            self.structure_stack.append(open_list)
                            self.structure_stack.append(val)
                            self.detail_state = 'scan_val'
                        except IndexError:
                            # Heading without an inline value.
                            self.detail_state = 'may_be_val'

    def start_table(self, attrs):
        """On <table>: clear the buffer and set the state for the new table."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table'
        self.text = ''
        if self.outer_state == 'genbank':
            if self.section_state == 'local_contents':
                self.detail_state = 'skip'
        elif self.outer_state in ['nomenclature', 'overview', 'relationships',
                                  'locus', 'map', 'genbank', 'additional',
                                  'external']:
            if self.section_state == 'local_contents':
                self.detail_state = 'waiting_key'

    def end_table(self):
        """On </table>: leave the title phase or close the open containers."""
        self._pop_open_tag()
        if self.section_state == 'local_title':
            # The first table of a section is its title; contents follow.
            if self.outer_state == 'refseq':
                self.section_state = 'local_contents'
            elif self.outer_state == 'additional':
                self.section_state = 'local_contents'
                self.detail_state = 'scan_val'
            else:
                self.section_state = 'local_contents'
                self.detail_state = 'waiting_key'
        elif self.section_state == 'local_contents':
            if self.outer_state in ['nomenclature', 'relationships', 'locus',
                                    'map', 'external']:
                self.structure_stack.append(close_list)
            elif self.outer_state in ['genbank', 'additional']:
                if self.detail_state == 'scan_val':
                    self.structure_stack.append(close_list)

            elif self.outer_state == 'refseq':
                if self.detail_state in ['may_be_val', 'scan_val']:
                    self.structure_stack.append(close_list)
                self.structure_stack.append(close_dict)
                self.structure_stack.append(close_list)
                self.structure_stack.append(close_dict)
                self.detail_state = 'scan_category'

    def start_tr(self, attrs):
        """On <tr>: open a row, treating an unclosed previous row as ended."""
        top = self.open_tag
        self.open_tag_stack.append(self.open_tag)
        if top == 'open_table_row':
            if self.outer_state == 'refseq':
                if self.section_state == 'local_contents':
                    if self.detail_state in ['scan_val', ]:
                        self.structure_stack.append(close_list)
                        self.detail_state = 'may_be_val'
            # The previous row was never closed: undo the push made above.
            self.open_tag_stack.pop()
        self.open_tag = 'open_table_row'
        self.text = ''
        outer_state = self.outer_state
        if outer_state in ['relationships', 'locus', 'function', 'genbank',
                           'external']:
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    self.structure_stack.append(open_list)
        elif outer_state == 'map':
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    self.structure_stack.append(open_list)

        elif outer_state == 'additional':
            if self.section_state == 'local_contents':
                self.detail_state = 'scan_val'
                self.structure_stack.append(open_list)

    def end_tr(self):
        """On </tr>: close the row's list and advance ``detail_state``."""
        self._pop_open_tag()
        if self.section_state == 'local_contents':
            if self.outer_state in ['overview', 'nomenclature',
                                    'relationships', 'locus', 'genbank',
                                    'external']:
                if self.detail_state == 'scan_val':
                    self.structure_stack.append(close_list)
                elif self.detail_state == 'unpaired_key':
                    self.structure_stack.append(close_list)
                elif self.detail_state == 'skip':
                    self.detail_state = 'scan_val'
                elif self.detail_state == 'scan_headings':
                    self.detail_state = 'scan_val'
            elif self.outer_state in ['additional', ]:
                if self.detail_state == 'unpaired_key':
                    self.structure_stack.append(close_list)
                    self.structure_stack.append(close_dict)
                    self.structure_stack.append(close_list)
                elif self.detail_state == 'scan_val':
                    self.structure_stack.append(close_list)
            elif self.outer_state in ['function', ]:
                if self.detail_state == 'scan_headings':
                    self.detail_state = 'scan_val'
                elif self.detail_state == 'unpaired_key':
                    self.detail_state = 'may_be_val'
                    self.structure_stack.append(close_list)
                elif self.detail_state == 'scan_val':
                    self.detail_state = 'may_be_val'
                    self.structure_stack.append(close_list)
            elif self.outer_state in ['refseq', ]:
                if self.section_state == 'local_contents':
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append(close_list)
                        self.detail_state = 'may_be_val'
            elif self.outer_state == 'map':
                if self.section_state == 'local_contents':
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append(close_list)
                        self.detail_state = 'may_be_val'

    def start_td(self, attrs):
        """On <td>: clear the buffer; in refseq, start a value list if due."""
        self.open_tag_stack.append(self.open_tag)
        self.open_tag = 'open_table_data'
        if self.outer_state in ['nomenclature', 'overview', 'relationships',
                                'map', 'locus', 'genbank', 'additional',
                                'external']:
            if self.section_state == 'local_contents':
                self.text = ''
        elif self.outer_state == 'refseq':
            if self.section_state == 'local_contents':
                self.text = ''
                if self.detail_state == 'may_be_val':
                    self.structure_stack.append(open_list)
                    self.detail_state = 'scan_val'

    def end_td(self):
        """On </td>: push the non-empty cell text as a value."""
        self._pop_open_tag()

        if self.outer_state in ['nomenclature', 'overview', 'relationships',
                                'locus', 'genbank', 'additional', 'external']:
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    text = self.get_text()
                    if text != '':
                        self.structure_stack.append(text)
        elif self.outer_state == 'function':
            if self.section_state == 'local_contents':
                text = self.get_text()
                if text != '':
                    if self.detail_state == 'may_be_val':
                        if text.strip() != '':
                            self.structure_stack.append(open_list)
                            self.detail_state = 'scan_val'
                    if self.detail_state in ['unpaired_key', 'scan_val']:
                        self.structure_stack.append(text)
        elif self.outer_state == 'map':
            if self.section_state == 'local_contents':
                text = self.get_text()
                if text != '':
                    if self.detail_state == 'may_be_val':
                        if text.strip() != '':
                            self.structure_stack.append(open_list)
                            self.detail_state = 'scan_val'
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append(text)
        elif self.outer_state == 'refseq':
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    text = self.get_text()
                    if text != '':
                        self.add_text_to_object(text)

    def do_br(self, attrs):
        """On <br> inside a table cell: push the text gathered so far."""
        if self.outer_state in ['nomenclature', 'overview', 'function',
                                'relationships', 'map', 'locus', 'genbank',
                                'additional', 'external']:
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    if self.is_contained_by('open_table_data'):
                        text = self.get_text()
                        if text != '':
                            self.structure_stack.append(text)

    def add_text_to_object(self, text):
        """Attach *text* to the Url on top of the stack, or push it as a value.

        If the top item is a ``Url`` with an empty description, *text*
        becomes its description.  If the top item is not a ``Url``, *text* is
        pushed after it.
        """
        stack_item = self.structure_stack.pop()
        if isinstance(stack_item, Url):
            if stack_item.description == '':
                stack_item.description = text
            self.structure_stack.append(stack_item)
        else:
            self.structure_stack.append(stack_item)
            self.structure_stack.append(text)

    def is_contained_by(self, tag):
        """Return whether *tag* is among the ancestors on ``open_tag_stack``."""
        return tag in self.open_tag_stack

    def process_structure_stack(self):
        """Fold ``structure_stack`` into nested lists and dicts.

        Items are popped down to the nearest open token.  The popped group is
        turned into a list (after ``open_list``) or a dict (otherwise) and
        carried into the next round; results are pushed back once the stack
        is empty or an open token is on top.  Folding repeats while more than
        one item remains on the stack.
        """
        params = []
        outer_state = self.outer_state
        if outer_state in ['nomenclature', 'overview', 'function',
                           'relationships', 'refseq', 'locus', 'map',
                           'genbank', 'additional', 'external']:
            while len(self.structure_stack) > 1:
                len_stack = len(self.structure_stack)
                # self.print_stack()
                for i in range(0, len_stack):
                    item = self.structure_stack.pop()
                    if not is_open_token(item):
                        params.append(item)
                    else:
                        break
                if open_list.__eq__(item):
                    container = process_list(params)
                    params.append(container)
                else:
                    container = process_dict(params)
                    if len(container) > 0:
                        params.append(container)
                if (len(self.structure_stack) == 0) or is_open_token(
                        self.structure_stack[-1]):
                    for j in range(0, len(params)):
                        item = params.pop()
                        self.structure_stack.append(item)
                    params = []

    def print_stack(self):
        """Debugging aid: print the section name and every stack item."""
        print('%s!!!!!\n' % self.outer_state.upper())
        for stack_item in self.structure_stack:
            print('stack has ' + str(stack_item))
            print('-----------------')
if __name__ == '__main__':
    # Demo: parse a saved LocusLink page and print the resulting record.
    handle = open('Hs13225.htm')
    try:
        undo_handle = Bio.File.UndoHandle(handle)
        locuslink_parser = LocusLinkParser()
        record = locuslink_parser.parse(undo_handle)
    finally:
        # Close the file even if parsing fails.
        handle.close()
    print(record)