]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | """Functions that read and write gzipped files.\r |
2 | \r | |
3 | The user of the file doesn't have to worry about the compression,\r | |
4 | but random access is not allowed."""\r | |
5 | \r | |
6 | # based on Andrew Kuchling's minigzip.py distributed with the zlib module\r | |
7 | \r | |
8 | import struct, sys, time, os\r | |
9 | import zlib\r | |
10 | import io\r | |
11 | import __builtin__\r | |
12 | \r | |
# Names exported by a star-import of this module.
__all__ = ["GzipFile","open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header / _write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal values stored in GzipFile.mode to record the direction the
# object was opened in.
READ, WRITE = 1, 2
18 | \r | |
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    Only the low 32 bits of *value* are written.  Masking first means a
    negative (signed) 32-bit quantity is emitted as its two's-complement
    bit pattern; without the mask struct.pack() rejects negative numbers
    for the unsigned "L" format.

    output -- object with a write() method accepting a byte string
    value  -- integer whose low 32 bits are written
    """
    output.write(struct.pack("<L", value & 0xffffffff))
23 | \r | |
def read32(input):
    """Read 4 bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted in little-endian order.
    """
    raw = input.read(4)
    (number,) = struct.unpack("<I", raw)
    return number
26 | \r | |
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file; shorthand for GzipFile(filename, mode, compresslevel).

    filename is mandatory.  mode falls back to 'rb' and compresslevel
    falls back to 9 when they are not supplied.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35 | \r | |
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    An instance is either a reader or a writer (self.mode is READ or WRITE).
    Compressed bytes are read from / written to self.fileobj; self.offset
    tracks the position in the *uncompressed* stream.

    """

    # File object opened by the constructor itself, which close() is then
    # responsible for closing.  Stays None when the caller passed fileobj.
    myfileobj = None
    # Upper bound for the chunk size read() requests from the file.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.  A 'U' in the mode is ignored.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # os.fdopen() gives its file objects the placeholder name
            # '<fdopen>'; do not record that as the original filename.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        Emits a DeprecationWarning.  In write mode a '.gz' suffix is
        appended to the returned value when the name lacks one.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        # Reuse the repr of the underlying file, minus its outer brackets.
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state (name, CRC, size) used when writing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object.

        The original filename (basename, without a trailing '.gz') is
        included only when it is non-empty and representable in Latin-1.
        """
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, str):
                fname = fname.encode('latin-1')
            if fname.endswith('.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = ''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        # Extra-flags byte and operating-system byte (same two positions
        # that _read_gzip_header skips).
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume a gzip member header from the underlying file object.

        Stores the header timestamp in self.mtime and discards every other
        field.  Raises IOError if the magic number or the compression
        method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # Skip the extra-flags byte and the OS byte; neither is used.
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it out; return the number of bytes consumed.

        Raises ValueError if the file is closed and IOError (EBADF) if the
        object was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError (EBADF) if the
        object was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        # Slice the requested bytes out of the buffer of decompressed data.
        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back so that the next read returns it again.

        Only the counters are adjusted: the bytes are still present in
        self.extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and buffer their decompressed form.

        Raises EOFError once there is no more data to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC/size trailer of the member that was just read.

        Raises IOError when either stored value disagrees with the data
        that was decompressed.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check the that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The reference to the file object is
        dropped, and a file opened by the constructor is closed, even if
        writing the trailer fails.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so a failure below cannot leave
        # it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file (write mode).

        Raises ValueError if the file is closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True when the object was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True when the object was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always True; see seek() for what is supported."""
        return True

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream; return the new offset.

        whence may be 0 (absolute) or 1 (relative to the current position);
        any other value raises ValueError.  In write mode only forward
        seeks are possible and the gap is filled with zero bytes.  In read
        mode a backward seek rewinds and reads forward again.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            zeros = 1024 * '\0'
            for i in range(count // 1024):
                self.write(zeros)
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most size bytes long if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line
468 | \r | |
469 | \r | |
def _test():
    """Command-line demo: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (also the default when no
    file is given) means standard input to standard output.  The input
    file is not deleted, nor are any other gzip options or features
    supported.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: " + repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even when the copy fails part-way through.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()
507 | \r | |
# Run the command-line demo only when this file is executed as a script.
if __name__ == '__main__':
    _test()