1 """Wrap a file handle to allow seeks back to the beginning
2
3 Sometimes data coming from a socket or other input file handle isn't
4 what it was supposed to be. For example, suppose you are reading from
5 a buggy server which is supposed to return an XML stream but can also
6 return an unformatted error message. (This often happens because the
7 server doesn't handle incorrect input very well.)
8
9 A ReseekFile helps solve this problem. It is a wrapper to the
10 original input stream but provides a buffer. Read requests to the
11 ReseekFile get forwarded to the input stream, appended to a buffer,
12 then returned to the caller. The buffer contains all the data read so
13 far.
14
15 The ReseekFile can be told to reseek to the start position. The next
16 read request will come from the buffer, until the buffer has been
17 read, in which case it gets the data from the input stream. This
18 newly read data is also appended to the buffer.
19
20 When buffering is no longer needed, use the 'nobuffer()' method. This
21 tells the ReseekFile that once it has read from the buffer it should
22 throw the buffer away. After nobuffer is called, the behaviour of
23 'seek' is no longer defined.
24
25 For example, suppose you have the server as above which either
26 gives an error message of the form:
27
28 ERROR: cannot do that
29
30 or an XML data stream, starting with "<?xml".
31
32 infile = urllib2.urlopen("http://somewhere/")
33 infile = ReseekFile.ReseekFile(infile)
34 s = infile.readline()
35 if s.startswith("ERROR:"):
36 raise Exception(s[:-1])
37 infile.seek(0)
38 infile.nobuffer() # Don't buffer the data
39 ... process the XML from infile ...
40
41
42 This module also implements 'prepare_input_source(source)' modeled on
43 xml.sax.saxutils.prepare_input_source. This opens a URL and if the
44 input stream is not already seekable, wraps it in a ReseekFile.
45
46
47 NOTE:
48 Don't hold on to bound methods of a ReseekFile. Once nobuffer() has
49 been called and the buffer is exhausted, the ReseekFile reassigns the
50 input file's read/readlines/etc. methods as instance variables. This
51 gives slightly better performance at the cost of not allowing an infrequently used idiom.
52
53 Use tell() to get the beginning byte location. ReseekFile will
54 attempt to get the real position from the wrapped file and use that as
55 the beginning location. If the wrapped file does not support tell(),
56 ReseekFile.tell() will return 0.
57
58 readlines does not yet support a sizehint. Want to
59 write an implementation?
60
61 The latest version of this code can be found at
62 http://www.dalkescientific.com/Python/
63 """
64
65
66
67
68 from cStringIO import StringIO
69
class ReseekFile:
    """Wrap a file handle to allow seeks back to the beginning.

    Takes a file handle in the constructor.  Everything read through the
    wrapper is also appended to an in-memory buffer, so that seeking back
    to the initial position replays the buffered data before any further
    data is read from the wrapped file.

    See the module docstring for more documentation.
    """

    def __init__(self, file):
        """Wrap 'file', remembering its current position as the start.

        If the wrapped file has no tell() method, or tell() raises
        IOError, the start position is taken to be 0.
        """
        self.file = file
        self.buffer_file = StringIO()
        self.at_beginning = True
        try:
            self.beginning = file.tell()
        except (IOError, AttributeError):
            self.beginning = 0
        self._use_buffer = True

    def seek(self, offset, whence = 0):
        """offset, whence = 0

        Seek to a given byte position.  Only supports whence == 0
        and offset == the initial value of ReseekFile.tell() (which
        is usually 0, but not always.)

        Raises TypeError for any other whence or offset.
        """
        if whence != 0:
            raise TypeError("Unexpected whence value of %s; expecting 0" %
                            (whence,))
        if offset != self.beginning:
            raise TypeError("Unexpected offset value of %r; expecting '%s'" %
                            (offset, self.beginning))
        # Rewinding only the buffer is enough: later reads drain it first
        # and then continue from wherever the wrapped file currently is.
        self.buffer_file.seek(0)
        self.at_beginning = True

    def tell(self):
        """the current position of the file

        The initial position may not be 0 if the underlying input
        file supports tell and it is not at position 0.

        Raises TypeError when called anywhere but at the beginning.
        """
        if not self.at_beginning:
            raise TypeError("ReseekFile cannot tell except at the beginning of file")
        return self.beginning

    def _read(self, size):
        """Return up to 'size' characters, draining the buffer first.

        A negative size (or None) reads everything that is left.  Data
        newly read from the wrapped file is appended to the buffer while
        buffering is still enabled.
        """
        # None is handled explicitly so that "read everything" does not
        # depend on how None compares with 0.
        if size is None or size < 0:
            tail = self.file.read()
            data = self.buffer_file.read() + tail
            if self._use_buffer:
                # The buffer was just read to its end, so this appends.
                self.buffer_file.write(tail)
            return data
        if size == 0:
            return ""
        data = self.buffer_file.read(size)
        missing = size - len(data)
        if missing > 0:
            # The buffer ran dry; fetch the remainder from the wrapped file.
            tail = self.file.read(missing)
            if self._use_buffer:
                self.buffer_file.write(tail)
            data += tail
        return data

    def read(self, size = -1):
        """read up to 'size' bytes from the file

        Default is -1, which means to read to end of file.
        """
        data = self._read(size)
        if self.at_beginning and data:
            self.at_beginning = False
        self._check_no_buffer()
        return data

    def readline(self):
        """read a line from the file"""
        line = self.buffer_file.readline()
        if line[-1:] != "\n":
            # The buffer held no complete line; finish it from the input file.
            rest = self.file.readline()
            if self._use_buffer:
                self.buffer_file.write(rest)
            line += rest
            self._check_no_buffer()
        if line:
            # Consuming data moves us off the start position, exactly as
            # read() does, so tell() must stop reporting the start offset.
            self.at_beginning = False
        return line

    def readlines(self):
        """read all remaining lines from the file"""
        # Split on "\n" only; every piece but the last was newline-terminated.
        pieces = self.read().split("\n")
        lines = [piece + "\n" for piece in pieces[:-1]]
        if pieces[-1]:
            # Trailing data without a final newline is still a line.
            lines.append(pieces[-1])
        return lines

    def _check_no_buffer(self):
        """Drop the buffer once it is unwanted and fully consumed.

        After nobuffer() has been called and the buffer has been read to
        its end, the wrapped file's own methods are installed as instance
        attributes so later calls bypass this wrapper entirely.
        """
        if self._use_buffer:
            return
        if self.buffer_file.tell() != len(self.buffer_file.getvalue()):
            return
        # seek/tell may be missing on the wrapped file; they become None then.
        self.seek = getattr(self.file, "seek", None)
        self.tell = getattr(self.file, "tell", None)
        self.read = self.file.read
        self.readline = self.file.readline
        self.readlines = self.file.readlines
        del self.buffer_file

    def nobuffer(self):
        """tell the ReseekFile to stop using the buffer once it's exhausted"""
        self._use_buffer = False
190
225
def test_reads(test_s, file, seek0):
    """Check a fixed sequence of reads and re-seeks against 'file'.

    test_s -- the full text expected from position 'seek0' onwards; the
              hard-coded chunks below require it to be
              "This is a test.\\n1234567890\\n"
    file   -- file-like object positioned at 'seek0'
    seek0  -- the position to seek back to between passes

    Raises AssertionError on the first mismatch.  The file is left
    positioned at 'seek0' again on success.
    """
    leading_reads = [
        (2, "Th"),
        (3, "is "),
        (4, "is a"),
        (0, ""),
        (0, ""),
        (6, " test."),
    ]

    # First pass: read the start of the data, then rewind.
    for size, expected in leading_reads:
        assert file.read(size) == expected
    file.seek(seek0)

    # Second pass: the same prefix must come back, followed by the rest.
    for size, expected in leading_reads:
        assert file.read(size) == expected
    assert file.read(1) == "\n"
    assert file.read(5) == "12345"
    assert file.read() == "67890\n"
    file.seek(seek0)

    # Third pass: a single read returns everything.
    assert file.read() == test_s
    file.seek(seek0)


# This is a helper that needs arguments, not a standalone test case; keep
# name-based test collectors from trying to run it on its own.
test_reads.__test__ = False
246
247
def test():
    """Self-test for ReseekFile; raises AssertionError on any failure.

    Runs the wrapper against in-memory files (with and without a
    non-zero start offset, with and without nobuffer()), against a
    pipe from "echo", and finally checks readline()/readlines().
    """
    import os

    s = "This is a test.\n1234567890\n"
    source = StringIO(s)

    # Sanity-check the helper against a plain, really seekable file.
    x = source.tell()
    test_reads(s, source, x)
    test_reads(s, source, x)

    # The same reads must work through the wrapper.
    rf = ReseekFile(source)
    y = rf.tell()
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # The start offset is wherever the wrapped file was, here 1 rather than 0.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # Read everything into the buffer, rewind, then switch buffering off.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    assert y == 1
    rf.read(1000)
    rf.seek(y)
    rf.nobuffer()
    assert rf.tell() == y
    test_reads(s, rf, y)
    rf.seek(y)
    test_reads(s, rf, y)
    assert rf.read() == s
    assert rf.read() == ""

    # nobuffer() with a partly filled buffer, followed by a read-to-end.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read() == s

    # ... followed by a read of exactly the buffered amount.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    t = rf.read(5)
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(5) == t

    # ... followed by a read that spans the buffer and the wrapped file.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    t = rf.read(5)
    assert t == s[:5]
    rf.seek(y)
    rf.nobuffer()
    assert rf.read(8) == s[:8]

    # nobuffer() without rewinding: reading continues from the wrapped file.
    source = StringIO("X" + s)
    source.read(1)
    rf = ReseekFile(source)
    y = rf.tell()
    t = rf.read(5)
    assert t == s[:5]
    rf.nobuffer()
    assert rf.read(8) == s[5:5+8]

    # A pipe: once the buffer has been dropped, seek() and tell() are the
    # pipe's own methods and are expected to raise IOError.
    infile = os.popen("echo HELLO_THERE")
    try:
        infile.read(1)
        rf = ReseekFile(infile)
        y = rf.tell()
        assert rf.read(1) == "E"
        assert rf.read(2) == "LL"
        rf.seek(y)
        assert rf.read(4) == "ELLO"
        rf.seek(y)
        assert rf.read(1) == "E"
        rf.nobuffer()
        assert rf.read(1) == "L"
        assert rf.read(4) == "LO_T"
        assert rf.read(4) == "HERE"
        try:
            rf.seek(y)
            raise AssertionError("Cannot seek here!")
        except IOError:
            pass
        try:
            rf.tell()
            raise AssertionError("Cannot tell here!")
        except IOError:
            pass
    finally:
        # Do not leak the child process / pipe, even when a check fails.
        infile.close()

    # readline() and readlines(); the last line has no trailing newline.
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!"
    source = StringIO(s)
    rf = ReseekFile(source)
    rf.read(1)
    assert rf.readline() == "his is line 1.\n"
    rf.seek(0)
    assert rf.readline() == "This is line 1.\n"
    rf.read(2)
    assert rf.readline() == "d line 2.\n"
    rf.seek(0)
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!"]

    # Nothing is left after all of the data has been read.
    rf.seek(0)
    rf.read(len(s))
    assert rf.readlines() == []
    rf.seek(0)

    # readlines() after nobuffer(), with a trailing newline on the last line.
    s = "This is line 1.\nAnd line 2.\nAnd now, page 3!\n"
    rf = ReseekFile(StringIO(s))
    rf.read(1)
    rf.seek(0)
    rf.nobuffer()
    assert rf.readlines() == ["This is line 1.\n",
                              "And line 2.\n",
                              "And now, page 3!\n"]
386
387
# Run the self-test when this module is executed as a script.
if __name__ == "__main__":
    test()
390