This repository has been archived by the owner on Mar 31, 2023. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasterzip.pyx
248 lines (208 loc) · 7.56 KB
/
fasterzip.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import contextlib
import io
import os
import zipfile

from cpython.buffer cimport PyBUF_READ
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdio cimport FILE, fclose

cimport cfasterzip as cfz
cdef class ZipFile:
    """
    Read-only access to a ZIP archive through the `cfasterzip` C bindings.

    The instance owns a heap-allocated `mz_zip_archive` struct for its whole
    lifetime and, when constructed from a file-like object, a private
    `FILE*` handle opened on a duplicate of that object's descriptor.
    """

    # Reader state; allocated in __cinit__, released in __dealloc__.
    cdef cfz.mz_zip_archive* zip_archive
    # Our own stdio handle; only set when opened from a file-like object.
    cdef FILE* zip_descriptor

    def __init__(self, file):
        """
        Open a ZIP file, where file can be a path to a file (a byte string) or
        a file-like object.

        :param file: Path to a file or a file-like object.
        :raises NotImplementedError: If a file-like object cannot provide a
            usable OS-level file descriptor.
        :raises zipfile.BadZipFile: If the archive reader cannot be
            initialised on the given file.
        """
        cdef int fd

        if isinstance(file, io.IOBase):
            try:
                # To use `init_cfile` we need to get a FILE* handle of our
                # own. Duplicate the descriptor first so that closing our
                # handle in __dealloc__ never closes the caller's file.
                fd = os.dup(file.fileno())
            except (io.UnsupportedOperation, IOError, AttributeError):
                # TODO: What can we do about this without killing performance?
                raise NotImplementedError(
                    'File-like object passed to ZipFile does not support'
                    ' fileno()'
                )

            self.zip_descriptor = cfz.fdopen(fd, 'rb')
            if self.zip_descriptor == NULL:
                # fdopen did not take ownership, so the duplicate is ours to
                # close. Callers have always seen NotImplementedError here.
                os.close(fd)
                raise NotImplementedError(
                    'File-like object passed to ZipFile does not support'
                    ' fileno()'
                )

            status = cfz.mz_zip_reader_init_cfile(
                self.zip_archive,
                self.zip_descriptor,
                0,
                0
            )
        else:
            status = cfz.mz_zip_reader_init_file(
                self.zip_archive,
                file,
                0
            )

        if not status:
            raise self._last_error()

    def __cinit__(self):
        """Called on object allocation before all other methods."""
        # Must, must, must make sure either this allocation was a success
        # or we abort otherwise CPython will die with vague unfriendly
        # segfaults.
        self.zip_archive = <cfz.mz_zip_archive*> PyMem_Malloc(
            sizeof(cfz.mz_zip_archive)
        )
        # Allocation failed (can only occur due to out-of-memory)
        if not self.zip_archive:
            raise MemoryError()

        cfz.mz_zip_zero_struct(self.zip_archive)

    def __dealloc__(self):
        """Called on object deallocation."""
        # __dealloc__ also runs when __cinit__ raised, so the pointer may
        # still be NULL here.
        if self.zip_archive != NULL:
            cfz.mz_zip_end(self.zip_archive)
            PyMem_Free(self.zip_archive)
            self.zip_archive = NULL

        # Close our private handle only after the reader has been shut down.
        # It wraps a duplicated descriptor, so the caller's file stays open.
        if self.zip_descriptor != NULL:
            fclose(self.zip_descriptor)
            self.zip_descriptor = NULL

    cdef _last_error(self):
        """Build a BadZipFile carrying the reader's most recent error text."""
        return zipfile.BadZipFile(
            cfz.mz_zip_get_error_string(
                cfz.mz_zip_get_last_error(self.zip_archive)
            )
        )

    cdef _locate_file(self, path):
        """
        Given an archive entry path return its index.

        :raises KeyError: If no entry with that path exists.
        """
        idx = cfz.mz_zip_reader_locate_file(
            self.zip_archive,
            path,
            NULL,
            0
        )

        if idx == -1:
            # We raise KeyError here to mimic the behaviour of the stdlib
            # zipfile.
            raise KeyError(
                'There is no item named {0!r} in the archive'.format(
                    path
                )
            )

        return idx

    cpdef getinfo(self, path):
        """
        Return a dict with information about the archive member name.

        :param path: The archive entry path
        :return: dict
        :raises KeyError: If no entry with that path exists.
        :raises zipfile.BadZipFile: If the entry's metadata cannot be read.
        """
        cdef cfz.mz_zip_archive_file_stat stat

        idx = self._locate_file(path)
        status = cfz.mz_zip_reader_file_stat(
            self.zip_archive,
            idx,
            &stat
        )

        if not status:
            raise self._last_error()

        return stat

    def infolist(self):
        """
        Yield detailed archive member information as dicts.

        :rtype: Iterator[dict]
        :raises zipfile.BadZipFile: If an entry's metadata cannot be read.
        """
        cdef cfz.mz_zip_archive_file_stat stat
        cdef cfz.mz_uint i

        for i in range(len(self)):
            status = cfz.mz_zip_reader_file_stat(
                self.zip_archive,
                i,
                &stat
            )

            if not status:
                raise self._last_error()

            yield stat

    def namelist(self):
        """
        Yield archive members by name.
        """
        for s in self.infolist():
            yield s['m_filename']

    @contextlib.contextmanager
    def read(self, path):
        """
        Reads the contents of `path`, returning a memory view over
        it.

        .. note::

            This is the simplest way of extracting a file from an archive,
            however it will extract the entire file at once into memory. If
            not enough heap remains out-of-memory errors may occur.

        .. note::

            You should always use this as a context manager, as
            the buffer will be freed from the heap immediately after.
            The view is released on exit, so using it afterwards raises
            ValueError.

        :param path: The archive entry path.
        :return: memory view of decompressed contents.
        :raises zipfile.BadZipFile: If the entry cannot be extracted.
        """
        cdef char *p
        cdef size_t uncompressed_size

        p = <char*>cfz.mz_zip_reader_extract_file_to_heap(
            self.zip_archive,
            path,
            &uncompressed_size,
            0
        )

        if not p:
            raise self._last_error()

        # The `with` body may raise; that exception is re-raised at the
        # `yield`, so the buffer must be freed from a `finally`.
        try:
            # Using a vanilla C API view here instead of the more convenient
            # Cython typed memory view is slightly faster.
            view = cfz.PyMemoryView_FromMemory(
                p,
                uncompressed_size,
                PyBUF_READ
            )
            try:
                yield view
            finally:
                try:
                    # Invalidate the view before its memory goes away.
                    view.release()
                except BufferError:
                    # Something still holds an export of the view. The buffer
                    # is freed regardless, exactly as it always has been.
                    pass
        finally:
            cfz.mz_free(p)

    def read_iter(self, path, max_chunk_size=0x100000):
        """Read the contents of `path`, yielding chunks of at most
        `max_chunk_size`.

        This method may be used to efficiently decompress files that would
        not fit in memory by decompressing only `max_chunk_size` at a time.

        .. note::

            The "real" memory overhead of this method at any given iteration
            is `MAX_CHUNK_SIZE + size_of(current_chunk)`, since a Python
            byte string is created for the chunk being returned.

        :param path: The archive entry path.
        :param max_chunk_size: The maximum size of the buffer (in bytes)
        :raises ValueError: If `max_chunk_size` is not positive.
        :raises KeyError: If no entry with that path exists.
        :raises zipfile.BadZipFile: If the entry cannot be opened, or if
            extraction stops producing data before the whole entry was read.
        """
        # 64-bit counters: `unsigned long` is only 32 bits on some platforms.
        cdef unsigned long long copied_bytes = 0
        cdef unsigned long long total_bytes
        cdef size_t chunk_capacity
        cdef size_t size_read
        cdef cfz.mz_zip_reader_extract_iter_state* state
        cdef char* buff

        # A zero-sized buffer could never make progress.
        if max_chunk_size <= 0:
            raise ValueError('max_chunk_size must be a positive integer')

        chunk_capacity = max_chunk_size
        buff = <char*>PyMem_Malloc(chunk_capacity)
        if not buff:
            raise MemoryError()

        try:
            # We need to know the uncompressed size to know when to stop
            # iterating.
            total_bytes = self.getinfo(path)['m_uncomp_size']

            state = cfz.mz_zip_reader_extract_file_iter_new(
                self.zip_archive,
                path,
                0
            )

            if not state:
                raise self._last_error()

            try:
                while copied_bytes < total_bytes:
                    size_read = cfz.mz_zip_reader_extract_iter_read(
                        state,
                        buff,
                        chunk_capacity
                    )
                    if size_read == 0:
                        # No progress before the expected size was reached:
                        # fail instead of yielding empty chunks forever.
                        raise self._last_error()
                    yield <bytes>buff[:size_read]
                    copied_bytes += size_read
            finally:
                cfz.mz_zip_reader_extract_iter_free(state)
        finally:
            PyMem_Free(buff)

    def __len__(self):
        """Returns the number of archive entries in the zipfile."""
        return cfz.mz_zip_reader_get_num_files(self.zip_archive)