Package Bio :: Module SGMLExtractor
[hide private]
[frames | no frames]

Source Code for Module Bio.SGMLExtractor

  1  # Copyright 2002 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code for more fancy file handles. 
  7   
  8   
  9  Classes: 
 10  SGMLExtractorHandle     File object that strips tags and returns content from specified 
 11  tags blocks. 
 12   
 13  SGMLExtractor   Object that scans for specified SGML tag pairs, removes any inner tags 
 14  and returns the raw content. 
 15  For example the object SGMLExtractor( [ 'h1' ] ) on the following html file would return 
 16  'House that Jack Built' 
 17  SGMLExtractor( [ 'dt' ] ) would return 'ratcatdogcowmaiden' 
 18  SGMLExtractor( [ 'dt', 'dd' ] ) would return 'ratate the maltcatthat ate the rat' etc 
 19   
 20  <h1>House that Jack Built</h1> 
 21  <dl> 
 22    <dt><big>rat</big></dt> 
 23      <dd><big>ate the malt</big></dd> 
 24    <dt><big>cat</big></dt> 
 25      <dd><big>that ate the rat</big></dd> 
 26    <dt><big>dog</big></dt> 
 27      <dd><big>that worried the dats</big></dd> 
 28    <dt><big>cow</big></dt> 
 29      <dd><big>with crumpled horn</big></dd> 
 30    <dt><big>maiden</big></dt> 
 31      <dd><big>all forlorns</big></dd> 
 32  </dl> 
 33  """ 
 34  import os 
 35  import string 
 36  import StringIO 
 37  import sgmllib 
 38   
 39   
class SGMLExtractorHandle:
    """File-like wrapper that returns only the text inside selected SGML tags.

    Everything read through this object is first read from the wrapped
    handle and then passed through an SGMLExtractor built from
    tags_of_interest.  Attributes that are not defined here are looked up
    on the wrapped handle.

    """

    def __init__(self, handle, tags_of_interest=[]):
        """SGMLExtractorHandle(handle, tags_of_interest)

        handle is a file handle to SGML-formatted data.
        tags_of_interest is a list of root names for pairs of start and
        end tags whose content should be returned.

        """
        # The default list is only handed on to SGMLExtractor; it is never
        # modified here.
        self._handle = handle
        self._stripper = SGMLExtractor(tags_of_interest)

    def read(self, *args, **keywds):
        """Read from the wrapped handle and return the extracted text."""
        data = self._handle.read(*args, **keywds)
        return self._stripper.strip(data)

    def readline(self, *args, **keywds):
        """Read one line from the wrapped handle and return the extracted text."""
        line = self._handle.readline(*args, **keywds)
        return self._stripper.strip(line)

    def readlines(self, *args, **keywds):
        """Read lines from the wrapped handle and return each one extracted.

        The result has one entry per line read, in the original order.

        """
        lines = self._handle.readlines(*args, **keywds)
        # Each line must be stripped individually; the previous code passed
        # the builtin ``str`` type to the stripper instead of the line.
        return [self._stripper.strip(line) for line in lines]

    def __getattr__(self, attr):
        """Delegate any attribute not defined here to the wrapped handle."""
        return getattr(self._handle, attr)
71 72
def is_empty(items):
    """Return 1 if items contains no elements, otherwise 0."""
    return 0 if len(items) > 0 else 1
78
79 -class SGMLExtractor:
80 - class LocalParser(sgmllib.SGMLParser):
81 - def __init__(self, tags_of_interest = [] ):
82 sgmllib.SGMLParser.__init__(self) 83 self.data = '' 84 self._instack = [] 85 self._tags_of_interest = [] 86 for tag in tags_of_interest: 87 self._tags_of_interest.append( tag.lower() )
88
89 - def handle_data(self, data):
90 if( not is_empty( self._instack ) ): 91 self.data = self.data + data
92
93 - def unknown_starttag(self, tag, attrs):
94 lower_tag = tag.lower() 95 if( lower_tag in self._tags_of_interest ): 96 self._instack.append( lower_tag )
97
98 - def unknown_endtag(self, tag ):
99 if( not is_empty( self._instack ) ): 100 open_tag = self._instack.pop() 101 try: 102 if( open_tag != tag.lower() ): 103 self._instack.append( open_tag ) 104 except: 105 print tag
106 107
108 - def __init__(self, tags_of_interest = [] ):
109 self._parser = SGMLExtractor.LocalParser( tags_of_interest )
110
111 - def strip(self, str):
112 """S.strip(str) -> string 113 114 Strip the SGML tags from str. 115 116 """ 117 if not str: # empty string, don't do anything. 118 return '' 119 # I need to make sure that I don't return an empty string if 120 # the buffer is not empty. This can happen if there's a newline 121 # character embedded within a tag. Thus, I'll first check to 122 # see if the last character is a newline. If it is, and it's stripped 123 # away, I'll add it back. 124 is_newline = str[-1] in ['\n', '\r'] 125 126 self._parser.data = '' # clear the parser's data (don't reset) 127 self._parser.feed(str) 128 if self._parser.data: 129 str = self._parser.data 130 elif is_newline: 131 str = '\n' 132 return str
133