1
2
3
4
5
6 """Code for more fancy file handles.
7
8
9 Classes:
SGMLExtractorHandle File object that strips tags and returns content from specified
tag blocks.

SGMLExtractor Object that scans for specified SGML tag pairs, removes any inner tags
and returns the raw content.
For example, the object SGMLExtractor( [ 'h1' ] ) on the following html file would return
'House that Jack Built'
SGMLExtractor( [ 'dt' ] ) would return 'ratcatdogcowmaiden'
SGMLExtractor( [ 'dt', 'dd' ] ) would return 'ratate the maltcatthat ate the rat' etc
19
20 <h1>House that Jack Built</h1>
21 <dl>
22 <dt><big>rat</big></dt>
23 <dd><big>ate the malt</big></dd>
24 <dt><big>cat</big></dt>
25 <dd><big>that ate the rat</big></dd>
26 <dt><big>dog</big></dt>
<dd><big>that worried the cat</big></dd>
28 <dt><big>cow</big></dt>
29 <dd><big>with crumpled horn</big></dd>
30 <dt><big>maiden</big></dt>
<dd><big>all forlorn</big></dd>
32 </dl>
33 """
34 import os
35 import string
36 import StringIO
37 import sgmllib
38
39
41 """A Python handle that automatically strips SGML tags and returns data from
42 specified tag start and end pairs.
43
44 """
def __init__(self, handle, tags_of_interest=None):
    """Wrap a file handle and build the tag extractor used on its data.

    handle is a file handle to SGML-formatted data.
    tags_of_interest is a list of root names for pairs of start and end
    tags.  Leaving it out (or passing None) is equivalent to passing an
    empty list.

    """
    # Build a fresh list per call.  A literal [] default is created once
    # and would be the same object handed to every extractor.
    if tags_of_interest is None:
        tags_of_interest = []
    self._handle = handle
    self._stripper = SGMLExtractor(tags_of_interest)
54
55 - def read(self, *args, **keywds):
58
60 line = self._handle.readline( *args, **keywds)
61 return self._stripper.strip(line)
62
64 lines = self._handle.readlines( *args, **keywds)
65 for i in range(len(lines)):
66 lines[i] = self._stripper.strip(str)
67 return lines
68
70 return getattr(self._handle, attr)
71
72
74 if( len( items ) > 0 ):
75 return 0
76 else:
77 return 1
78
# Initialise the base sgmllib parser before adding our own state.
sgmllib.SGMLParser.__init__(self)
# Output buffer; starts out empty.
self.data = ''
# Stack of tag names; starts out empty.
self._instack = []
# Keep the requested tag names lower-cased.
self._tags_of_interest = [name.lower() for name in tags_of_interest]
88
92
# Normalise the tag name to lower case before looking it up among the
# tags of interest.
wanted = tag.lower()
if wanted in self._tags_of_interest:
    # Record the tag on the stack.
    self._instack.append(wanted)
97
# Nothing to close unless at least one tag is currently on the stack.
if self._instack:
    try:
        closing = tag.lower()
    except AttributeError:
        # Best effort, as before: report a tag that cannot be lower-cased
        # and carry on.  Only AttributeError is caught, so unrelated
        # errors and interrupts are no longer swallowed.  The stack is
        # left untouched; the previous code had already popped the top
        # entry at this point and lost it.
        print(tag)
    else:
        # Pop only when the closing tag matches the top of the stack;
        # any other closing tag leaves the stack as it is.
        if self._instack[-1] == closing:
            self._instack.pop()
106
107
110
112 """S.strip(str) -> string
113
114 Strip the SGML tags from str.
115
116 """
117 if not str:
118 return ''
119
120
121
122
123
124 is_newline = str[-1] in ['\n', '\r']
125
126 self._parser.data = ''
127 self._parser.feed(str)
128 if self._parser.data:
129 str = self._parser.data
130 elif is_newline:
131 str = '\n'
132 return str
133