From 637aa7df1e0990de5a03857a2c685c2df6ea6683 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Wed, 9 Aug 2023 10:17:46 -0500 Subject: [PATCH 1/7] use less random input data for test and add more modify_index tests --- tests/conftest.py | 8 ++++++-- tests/test_zran.py | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8d4eb0b..b9d2f61 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import os +import random import zlib import pytest @@ -31,7 +32,7 @@ def input_data(): def create_compressed_data(uncompressed_data, wbits, start=None, stop=None): - compress_obj = zlib.compressobj(wbits=wbits) + compress_obj = zlib.compressobj(wbits=wbits, level=9) compressed = compress_obj.compress(uncompressed_data) compressed += compress_obj.flush() @@ -57,7 +58,10 @@ def gz_points(): @pytest.fixture(scope='module') def data(): - out = os.urandom(2**22) + # Can't use os.urandom directly because there needs to be some + # repetition in order for compression to be effective + words = [os.urandom(8) for _ in range(1000)] + out = b''.join([random.choice(words) for _ in range(524288)]) return out diff --git a/tests/test_zran.py b/tests/test_zran.py index 4e3ca23..5b59f20 100644 --- a/tests/test_zran.py +++ b/tests/test_zran.py @@ -32,13 +32,13 @@ def test_create_index(compressed_gz_data): assert len(points[0].window) == 32768 -@pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties') +# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties') def test_create_index_fail_head(data, compressed_gz_data_no_head): with pytest.raises(zran.ZranError, match='zran: compressed data error in input file'): zran.Index.create_index(compressed_gz_data_no_head) -@pytest.mark.skip(reason='Currently unstable. 
Will sometimes not fail if data has certain (unknown) properties') +# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties') def test_create_index_fail_tail(data, compressed_gz_data_no_tail): with pytest.raises(zran.ZranError, match='zran: input file ended prematurely'): zran.Index.create_index(compressed_gz_data_no_tail) @@ -68,7 +68,7 @@ def test_decompress(data, compressed_file): assert data[start : start + length] == test_data -@pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties') +# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties') def test_decompress_fail(data, compressed_gz_data, compressed_gz_data_no_head): start = 100 length = 1000 @@ -86,8 +86,23 @@ def test_get_closest_point(): assert r2.outloc == 4 +def test_modify_index_and_beginning_decompress(data, compressed_dfl_data): + index = zran.Index.create_index(compressed_dfl_data, span=2**18) + start = 0 + stop = 100 + + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + test_data = zran.decompress( + compressed_dfl_data[compressed_range[0] : compressed_range[1]], + new_index, + start - uncompressed_range[0], + stop - start, + ) + assert data[start:stop] == test_data + + @pytest.mark.parametrize('start_index,stop_index', ((0, 5), (4, 10), (9, -1))) -def test_modify_index_and_decompress(start_index, stop_index, data, compressed_dfl_data): +def test_modify_index_and_interior_decompress(start_index, stop_index, data, compressed_dfl_data): index = zran.Index.create_index(compressed_dfl_data, span=2**18) start = index.points[start_index].outloc + 100 stop = index.points[stop_index].outloc + 100 @@ -102,6 +117,21 @@ def test_modify_index_and_decompress(start_index, stop_index, data, compressed_d assert data[start:stop] == test_data +def test_modify_index_and_end_decompress(data, 
compressed_dfl_data): + index = zran.Index.create_index(compressed_dfl_data, span=2**18) + start = index.points[-1].outloc + 100 + stop = len(data) + + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + test_data = zran.decompress( + compressed_dfl_data[compressed_range[0] : compressed_range[1]], + new_index, + start - uncompressed_range[0], + stop - start, + ) + assert data[start:stop] == test_data + + @pytest.mark.skip(reason='Integration test. Only run if testing Sentinel-1 SLC burst compatibility') @pytest.mark.parametrize('burst', offset_list) def test_safe(burst, input_data): From 67a8af24f7e6147aea916184eadf88a41c33a11f Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Wed, 9 Aug 2023 10:37:44 -0500 Subject: [PATCH 2/7] modified_index: last window is zero to save space --- src/zran/zranlib.pyx | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/zran/zranlib.pyx b/src/zran/zranlib.pyx index e38aca6..311d309 100644 --- a/src/zran/zranlib.pyx +++ b/src/zran/zranlib.pyx @@ -270,16 +270,24 @@ class Index: inloc_offset = desired_points[0].inloc - compressed_offsets[0] outloc_offset = desired_points[0].outloc - desired_points = [ - Point(x.outloc - outloc_offset, x.inloc - inloc_offset, x.bits, x.window) for x in desired_points - ] + + output_points = [] + start_point_is_last_in_origional = start_index == len(compressed_offsets) - 1 + for i, point in enumerate(desired_points): + last_point_in_new_index = i == len(desired_points) - 1 + if last_point_in_new_index and not start_point_is_last_in_origional: + window = bytearray(WINDOW_LENGTH) + else: + window = point.window + new_point = Point(point.outloc - outloc_offset, point.inloc - inloc_offset, point.bits, window) + output_points.append(new_point) modified_index = Index( self.have, compressed_range[1] - compressed_range[0], uncompressed_range[1] - uncompressed_range[0], - len(desired_points), - desired_points, + 
len(output_points), + output_points, ) return compressed_range, uncompressed_range, modified_index From e95f4637c861d9bed421d8c2d6b8f54e556edc98 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Wed, 9 Aug 2023 10:46:37 -0500 Subject: [PATCH 3/7] modified_index: first window is zero to save space --- src/zran/zranlib.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zran/zranlib.pyx b/src/zran/zranlib.pyx index 311d309..e8af646 100644 --- a/src/zran/zranlib.pyx +++ b/src/zran/zranlib.pyx @@ -275,7 +275,9 @@ class Index: start_point_is_last_in_origional = start_index == len(compressed_offsets) - 1 for i, point in enumerate(desired_points): last_point_in_new_index = i == len(desired_points) - 1 - if last_point_in_new_index and not start_point_is_last_in_origional: + if i == 0: + window = bytearray(WINDOW_LENGTH) + elif last_point_in_new_index and not start_point_is_last_in_origional: window = bytearray(WINDOW_LENGTH) else: window = point.window From 2089fd22f12501e4e2de5cdb21e7663ab8023747 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Wed, 9 Aug 2023 16:33:19 -0500 Subject: [PATCH 4/7] working on prevent out of bounds decompression --- src/zran/zranlib.pyx | 9 ++++++++- tests/test_zran.py | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/zran/zranlib.pyx b/src/zran/zranlib.pyx index e8af646..de90dde 100644 --- a/src/zran/zranlib.pyx +++ b/src/zran/zranlib.pyx @@ -146,7 +146,10 @@ def build_deflate_index(input_bytes: bytes, span: off_t = 2**20) -> WrapperDefla return index -def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa +def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa + if offset + length > index.uncompressed_size: + raise ValueError('Offset and length specified would result in reading past the file bounds') + compressed_data = cython.declare(cython.p_char, PyBytes_AsString(input_bytes)) 
compressed_data_length = cython.declare(off_t, PyBytes_Size(input_bytes)) infile = fmemopen(compressed_data, compressed_data_length, b"r") @@ -179,6 +182,10 @@ class Index: self.mode = mode self.have = have self.points = points + if self.points[0].outloc + 1 == self.uncompressed_size: + self.modified = True + else: + self.modified = False @staticmethod def create_index(input_bytes: bytes, span: int = 2**20): diff --git a/tests/test_zran.py b/tests/test_zran.py index 5b59f20..b2be799 100644 --- a/tests/test_zran.py +++ b/tests/test_zran.py @@ -108,6 +108,7 @@ def test_modify_index_and_interior_decompress(start_index, stop_index, data, com stop = index.points[stop_index].outloc + 100 compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + breakpoint() test_data = zran.decompress( compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, @@ -132,6 +133,26 @@ def test_modify_index_and_end_decompress(data, compressed_dfl_data): assert data[start:stop] == test_data +def test_index_and_read_late(data, compressed_dfl_data): + index = zran.Index.create_index(compressed_dfl_data, span=2**18) + with pytest.raises(ValueError, match='Offset and length specified would result in reading past the file bounds'): + zran.decompress(compressed_dfl_data, index, 0, len(data)) + + +def test_modify_index_and_read_late(data, compressed_dfl_data): + index = zran.Index.create_index(compressed_dfl_data, span=2**18) + start = index.points[5].outloc + stop = index.points[10].outloc + + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + offset = start - uncompressed_range[0] + length = stop - start + test_data = zran.decompress( + compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, offset, length + ) + assert data[start:stop] == test_data + + @pytest.mark.skip(reason='Integration test. 
Only run if testing Sentinel-1 SLC burst compatibility') @pytest.mark.parametrize('burst', offset_list) def test_safe(burst, input_data): From 6f426e0d2c249fb0d73e896be993681af00ea0d7 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Thu, 10 Aug 2023 08:20:21 -0500 Subject: [PATCH 5/7] handle out of bound decompress logic in decompress func --- src/zran/zranlib.pyx | 14 +++++++++----- tests/test_zran.py | 35 +++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/zran/zranlib.pyx b/src/zran/zranlib.pyx index de90dde..5c5ed23 100644 --- a/src/zran/zranlib.pyx +++ b/src/zran/zranlib.pyx @@ -146,7 +146,15 @@ def build_deflate_index(input_bytes: bytes, span: off_t = 2**20) -> WrapperDefla return index -def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa +def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa + first_bit_zero = index.points[0].bits == 0 + offset_before_second_point = offset < index.points[1].outloc + if not first_bit_zero and offset_before_second_point: + raise ValueError( + 'When first index bit != 0, offset must be at or after second index point' + f' ({index.points[1].outloc} for this index)' + ) + if offset + length > index.uncompressed_size: raise ValueError('Offset and length specified would result in reading past the file bounds') @@ -182,10 +190,6 @@ class Index: self.mode = mode self.have = have self.points = points - if self.points[0].outloc + 1 == self.uncompressed_size: - self.modified = True - else: - self.modified = False @staticmethod def create_index(input_bytes: bytes, span: int = 2**20): diff --git a/tests/test_zran.py b/tests/test_zran.py index b2be799..bb0283d 100644 --- a/tests/test_zran.py +++ b/tests/test_zran.py @@ -86,7 +86,7 @@ def test_get_closest_point(): assert r2.outloc == 4 -def test_modify_index_and_beginning_decompress(data, compressed_dfl_data): +def 
test_modify_index_and_head_decompress(data, compressed_dfl_data): index = zran.Index.create_index(compressed_dfl_data, span=2**18) start = 0 stop = 100 @@ -108,7 +108,6 @@ def test_modify_index_and_interior_decompress(start_index, stop_index, data, com stop = index.points[stop_index].outloc + 100 compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) - breakpoint() test_data = zran.decompress( compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, @@ -118,7 +117,7 @@ def test_modify_index_and_interior_decompress(start_index, stop_index, data, com assert data[start:stop] == test_data -def test_modify_index_and_end_decompress(data, compressed_dfl_data): +def test_modify_index_and_tail_decompress(data, compressed_dfl_data): index = zran.Index.create_index(compressed_dfl_data, span=2**18) start = index.points[-1].outloc + 100 stop = len(data) @@ -133,25 +132,37 @@ def test_modify_index_and_end_decompress(data, compressed_dfl_data): assert data[start:stop] == test_data -def test_index_and_read_late(data, compressed_dfl_data): +def test_index_after_end_decompress(data, compressed_dfl_data): index = zran.Index.create_index(compressed_dfl_data, span=2**18) with pytest.raises(ValueError, match='Offset and length specified would result in reading past the file bounds'): - zran.decompress(compressed_dfl_data, index, 0, len(data)) + zran.decompress(compressed_dfl_data, index, 0, len(data) + 1) -def test_modify_index_and_read_late(data, compressed_dfl_data): +def test_modified_index_before_start_decompress(data, compressed_dfl_data): index = zran.Index.create_index(compressed_dfl_data, span=2**18) start = index.points[5].outloc stop = index.points[10].outloc compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) - offset = start - uncompressed_range[0] - length = stop - start - test_data = zran.decompress( - compressed_dfl_data[compressed_range[0] : compressed_range[1]], 
new_index, offset, length - ) - assert data[start:stop] == test_data + if new_index.points[0].bits != 0: + msg = 'When first index bit != 0, offset must be at or after second index point *' + with pytest.raises(ValueError, match=msg): + zran.decompress(compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, 0, 10) + + +def test_modified_after_end_decompress(data, compressed_dfl_data): + index = zran.Index.create_index(compressed_dfl_data, span=2**18) + start = index.points[5].outloc + stop = index.points[10].outloc + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + with pytest.raises(ValueError, match='Offset and length specified would result in reading past the file bounds'): + zran.decompress( + compressed_dfl_data[compressed_range[0] : compressed_range[1]], + new_index, + new_index.points[1].outloc + 10, + new_index.uncompressed_size, + ) @pytest.mark.skip(reason='Integration test. Only run if testing Sentinel-1 SLC burst compatibility') @pytest.mark.parametrize('burst', offset_list) From fac449d26ff657e03ef3d89f8f8ff4ed029e4077 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Thu, 10 Aug 2023 09:00:07 -0500 Subject: [PATCH 6/7] add default functionality to remove last stop point of modified index --- src/zran/zranlib.pyx | 19 +++++++++++++------ tests/test_zran.py | 28 +++++++++++++--------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/zran/zranlib.pyx b/src/zran/zranlib.pyx index 5c5ed23..1cdbf46 100644 --- a/src/zran/zranlib.pyx +++ b/src/zran/zranlib.pyx @@ -1,6 +1,7 @@ # vim: filetype=python import struct as py_struct import zlib +import warnings from collections import namedtuple from operator import attrgetter from typing import Iterable, List @@ -148,7 +149,11 @@ def build_deflate_index(input_bytes: bytes, span: off_t = 2**20) -> WrapperDefla def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa first_bit_zero = 
index.points[0].bits == 0 - offset_before_second_point = offset < index.points[1].outloc + if index.have > 1: + offset_before_second_point = offset < index.points[1].outloc + else: + offset_before_second_point = False + if not first_bit_zero and offset_before_second_point: raise ValueError( 'When first index bit != 0, offset must be at or after second index point' @@ -246,7 +251,7 @@ class Index: def to_c_index(self): return WrapperDeflateIndex.from_python_index(self.mode, self.uncompressed_size, self.have, self.points) - def create_modified_index(self, starts=[], stops=[]): + def create_modified_index(self, starts=[], stops=[], remove_last_stop=True): """Modifies a set of access Points so that they only contain the needed data Args: starts: uncompressed locations to provide indexes before. @@ -283,18 +288,20 @@ class Index: outloc_offset = desired_points[0].outloc output_points = [] - start_point_is_last_in_origional = start_index == len(compressed_offsets) - 1 for i, point in enumerate(desired_points): - last_point_in_new_index = i == len(desired_points) - 1 if i == 0: window = bytearray(WINDOW_LENGTH) - elif last_point_in_new_index and not start_point_is_last_in_origional: - window = bytearray(WINDOW_LENGTH) else: window = point.window new_point = Point(point.outloc - outloc_offset, point.inloc - inloc_offset, point.bits, window) output_points.append(new_point) + if stops and remove_last_stop: + if len(output_points) <= 2: + warnings.warn(UserWarning('Indexes must have at least two points, not removing last stop')) + else: + output_points = output_points[:-1] + modified_index = Index( self.have, compressed_range[1] - compressed_range[0], diff --git a/tests/test_zran.py b/tests/test_zran.py index bb0283d..3cbe499 100644 --- a/tests/test_zran.py +++ b/tests/test_zran.py @@ -91,12 +91,11 @@ def test_modify_index_and_head_decompress(data, compressed_dfl_data): start = 0 stop = 100 - compressed_range, uncompressed_range, new_index = 
index.create_modified_index([start], [stop]) + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop], False) + length = start - uncompressed_range[0] + offset = stop - start test_data = zran.decompress( - compressed_dfl_data[compressed_range[0] : compressed_range[1]], - new_index, - start - uncompressed_range[0], - stop - start, + compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset ) assert data[start:stop] == test_data @@ -108,11 +107,10 @@ def test_modify_index_and_interior_decompress(start_index, stop_index, data, com stop = index.points[stop_index].outloc + 100 compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + length = start - uncompressed_range[0] + offset = stop - start test_data = zran.decompress( - compressed_dfl_data[compressed_range[0] : compressed_range[1]], - new_index, - start - uncompressed_range[0], - stop - start, + compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset ) assert data[start:stop] == test_data @@ -122,12 +120,11 @@ def test_modify_index_and_tail_decompress(data, compressed_dfl_data): start = index.points[-1].outloc + 100 stop = len(data) - compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop]) + compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop], False) + length = start - uncompressed_range[0] + offset = stop - start test_data = zran.decompress( - compressed_dfl_data[compressed_range[0] : compressed_range[1]], - new_index, - start - uncompressed_range[0], - stop - start, + compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset ) assert data[start:stop] == test_data @@ -164,9 +161,10 @@ def test_modified_after_end_decompress(data, compressed_dfl_data): new_index.uncompressed_size, ) + @pytest.mark.skip(reason='Integration test. 
Only run if testing Sentinel-1 SLC burst compatibility') @pytest.mark.parametrize('burst', offset_list) -def test_safe(burst, input_data): +def test_burst_extraction(burst, input_data): swath, golden, index = input_data compressed_range, uncompressed_range, new_index = index.create_modified_index([burst.start], [burst.stop]) data_subset = swath[compressed_range[0] : compressed_range[1]] From 542ef78c55df44c59a5a2a19567f0b2a465d17d2 Mon Sep 17 00:00:00 2001 From: Forrest Williams Date: Thu, 10 Aug 2023 09:31:35 -0500 Subject: [PATCH 7/7] update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87dac76..93eaae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/) and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.5] +### Added +* Set the window info to all zeros for the first point when the first point's `bits != 0`. This decreases the compressed index size +* New default for `create_modified_index` is to remove the last stop point, since the final point represents the end of the data +* Update testing to increase coverage of `create_modified_index` corner cases + ## [0.0.4] ### Added * New information to the README.md concerning contributions and similar projects