Merge pull request #45 from forrestfwilliams/develop
Release v0.5.0
forrestfwilliams authored Aug 10, 2023
2 parents ffedad5 + 1e4cf07 commit 4b5210c
Showing 13 changed files with 126 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-and-deploy-test.yml
@@ -18,7 +18,7 @@ jobs:
fetch-depth: 0

- name: Build wheels
uses: pypa/cibuildwheel@v2.12.1
uses: pypa/cibuildwheel@v2.14.1

- uses: actions/upload-artifact@v3
with:
@@ -55,7 +55,7 @@ jobs:
name: artifact
path: dist

- uses: pypa/gh-action-pypi-publish@v1.8.1
- uses: pypa/gh-action-pypi-publish@v1.8.8
with:
user: __token__
password: ${{ secrets.PYPI_TEST_PAK }}
4 changes: 2 additions & 2 deletions .github/workflows/build-and-deploy.yml
@@ -18,7 +18,7 @@ jobs:
fetch-depth: 0

- name: Build wheels
uses: pypa/cibuildwheel@v2.12.1
uses: pypa/cibuildwheel@v2.14.1

- uses: actions/upload-artifact@v3
with:
@@ -55,7 +55,7 @@ jobs:
name: artifact
path: dist

- uses: pypa/gh-action-pypi-publish@v1.8.1
- uses: pypa/gh-action-pypi-publish@v1.8.8
with:
user: __token__
password: ${{ secrets.PYPI_PAK }}
2 changes: 1 addition & 1 deletion .github/workflows/bump-version.yml
@@ -7,7 +7,7 @@ on:

jobs:
call-bump-version-workflow:
uses: ASFHyP3/actions/.github/workflows/reusable-bump-version.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-bump-version.yml@v0.8.1
with:
user: zran-bot
email: ffwilliams2@alaska.edu
2 changes: 1 addition & 1 deletion .github/workflows/changelog-check.yml
@@ -13,6 +13,6 @@ on:

jobs:
call-changelog-check-workflow:
uses: ASFHyP3/actions/.github/workflows/reusable-changelog-check.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-changelog-check.yml@v0.8.1
secrets:
USER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/labeled-pr-check.yml
@@ -12,4 +12,4 @@ on:

jobs:
call-labeled-pr-check-workflow:
uses: ASFHyP3/actions/.github/workflows/reusable-labeled-pr-check.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-labeled-pr-check.yml@v0.8.1
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
@@ -9,6 +9,6 @@ on:
jobs:
call-pytest-workflow:
# Docs: https://github.com/ASFHyP3/actions
uses: ASFHyP3/actions/.github/workflows/reusable-pytest.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-pytest.yml@v0.8.1
with:
local_package_name: zran
2 changes: 1 addition & 1 deletion .github/workflows/release-checklist-comment.yml
@@ -9,7 +9,7 @@ on:

jobs:
call-release-workflow:
uses: ASFHyP3/actions/.github/workflows/reusable-release-checklist-comment.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-release-checklist-comment.yml@v0.8.1
permissions:
pull-requests: write
secrets:
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -7,7 +7,7 @@ on:

jobs:
call-release-workflow:
uses: ASFHyP3/actions/.github/workflows/reusable-release.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-release.yml@v0.8.1
with:
release_prefix: ZRAN
release_branch: main # Optional; default shown
2 changes: 1 addition & 1 deletion .github/workflows/static-analysis.yml
@@ -5,7 +5,7 @@ on: [pull_request]
jobs:
call-secrets-analysis-workflow:
# Docs: https://github.com/ASFHyP3/actions
uses: ASFHyP3/actions/.github/workflows/reusable-secrets-analysis.yml@v0.7.1
uses: ASFHyP3/actions/.github/workflows/reusable-secrets-analysis.yml@v0.8.1

check-with-black:
runs-on: ubuntu-latest
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.5]
### Added
* Set the window info to all zeros for the first point in the case where the first point's `bits != 0`, which decreases the compressed index size
* New default for `create_modified_index` is to remove the last stop point, since the final point represents the end of the data
* Update testing to increase coverage of `create_modified_index` corner cases

## [0.0.4]
### Added
* New information to the README.md concerning contributions and similar projects
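A minimal sketch of the `create_modified_index` behavior described in the 0.0.5 entry, pieced together from the API exercised in the tests later in this diff (`zran.Index.create_index`, `Index.create_modified_index`). The data setup mirrors the updated conftest fixture; the raw-DEFLATE `wbits=-15` choice and all variable names are illustrative assumptions, not part of this commit.

```python
import os
import random
import zlib

import zran

# Compressible test data, built the same way as the updated conftest fixture:
# a small vocabulary of random 8-byte words repeated many times (~4 MiB).
words = [os.urandom(8) for _ in range(1000)]
data = b''.join(random.choice(words) for _ in range(2**19))

# Raw DEFLATE stream (wbits=-15), chosen here for illustration.
cobj = zlib.compressobj(wbits=-15)
compressed = cobj.compress(data) + cobj.flush()

index = zran.Index.create_index(compressed, span=2**18)

# Default (remove_last_stop=True): the final stop point is dropped, since it
# only marks the end of the requested data; this keeps the modified index small.
start = index.points[4].outloc + 100
stop = index.points[9].outloc + 100
compressed_range, uncompressed_range, small_index = index.create_modified_index([start], [stop])

# Pass remove_last_stop=False (third argument) when the stop lies at or past the
# last access point, as the new tail-decompress test does. If removing the last
# stop would leave fewer than two points, a UserWarning is issued and the point
# is kept instead.
_, _, tail_index = index.create_modified_index([index.points[-1].outloc + 100], [len(data)], False)
```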
40 changes: 34 additions & 6 deletions src/zran/zranlib.pyx
@@ -1,6 +1,7 @@
# vim: filetype=python
import struct as py_struct
import zlib
import warnings
from collections import namedtuple
from operator import attrgetter
from typing import Iterable, List
@@ -147,6 +148,21 @@ def build_deflate_index(input_bytes: bytes, span: off_t = 2**20) -> WrapperDeflateIndex:


def decompress(input_bytes: bytes, index: Index, offset: off_t, length: int) -> bytes: # noqa
first_bit_zero = index.points[0].bits == 0
if index.have > 1:
offset_before_second_point = offset < index.points[1].outloc
else:
offset_before_second_point = False

if not first_bit_zero and offset_before_second_point:
raise ValueError(
'When first index bit != 0, offset must be at or after second index point'
f' ({index.points[1].outloc} for this index)'
)

if offset + length > index.uncompressed_size:
raise ValueError('Offset and length specified would result in reading past the file bounds')

compressed_data = cython.declare(cython.p_char, PyBytes_AsString(input_bytes))
compressed_data_length = cython.declare(off_t, PyBytes_Size(input_bytes))
infile = fmemopen(compressed_data, compressed_data_length, b"r")
@@ -235,7 +251,7 @@ class Index:
def to_c_index(self):
return WrapperDeflateIndex.from_python_index(self.mode, self.uncompressed_size, self.have, self.points)

def create_modified_index(self, starts=[], stops=[]):
def create_modified_index(self, starts=[], stops=[], remove_last_stop=True):
"""Modifies a set of access Points so that they only contain the needed data
Args:
starts: uncompressed locations to provide indexes before.
@@ -270,16 +286,28 @@

inloc_offset = desired_points[0].inloc - compressed_offsets[0]
outloc_offset = desired_points[0].outloc
desired_points = [
Point(x.outloc - outloc_offset, x.inloc - inloc_offset, x.bits, x.window) for x in desired_points
]

output_points = []
for i, point in enumerate(desired_points):
if i == 0:
window = bytearray(WINDOW_LENGTH)
else:
window = point.window
new_point = Point(point.outloc - outloc_offset, point.inloc - inloc_offset, point.bits, window)
output_points.append(new_point)

if stops and remove_last_stop:
if len(output_points) <= 2:
warnings.warn(UserWarning('Indexes must have at least two points, not removing last stop'))
else:
output_points = output_points[:-1]

modified_index = Index(
self.have,
compressed_range[1] - compressed_range[0],
uncompressed_range[1] - uncompressed_range[0],
len(desired_points),
desired_points,
len(output_points),
output_points,
)
return compressed_range, uncompressed_range, modified_index

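The new up-front checks in `decompress` can be seen with a short sketch, reusing the `data`, `compressed`, and `index` names from the sketch after the CHANGELOG above (illustrative only, not part of this commit):

```python
import zran

# 1. Requests that would read past the end of the uncompressed stream are
#    rejected before any inflation work is done.
try:
    zran.decompress(compressed, index, 0, index.uncompressed_size + 1)
except ValueError as err:
    print(err)  # Offset and length specified would result in reading past the file bounds

# 2. A modified index zeroes the first point's window, so when that point has
#    bits != 0 the offset must land at or after the second access point
#    (mirrors test_modified_index_before_start_decompress below).
comp_range, _, new_index = index.create_modified_index([index.points[5].outloc], [index.points[10].outloc])
subset = compressed[comp_range[0]:comp_range[1]]
if new_index.points[0].bits != 0:
    try:
        zran.decompress(subset, new_index, 0, 10)
    except ValueError as err:
        print(err)  # When first index bit != 0, offset must be at or after second index point
```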
8 changes: 6 additions & 2 deletions tests/conftest.py
@@ -1,4 +1,5 @@
import os
import random
import zlib

import pytest
@@ -31,7 +32,7 @@ def input_data():


def create_compressed_data(uncompressed_data, wbits, start=None, stop=None):
compress_obj = zlib.compressobj(wbits=wbits)
compress_obj = zlib.compressobj(wbits=wbits, level=9)
compressed = compress_obj.compress(uncompressed_data)
compressed += compress_obj.flush()

@@ -57,7 +58,10 @@ def gz_points():

@pytest.fixture(scope='module')
def data():
out = os.urandom(2**22)
# Can't use os.urandom directly because there needs to be some
# repetition in order for compression to be effective
words = [os.urandom(8) for _ in range(1000)]
out = b''.join([random.choice(words) for _ in range(524288)])
return out


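For context on that fixture change: DEFLATE can only shrink data that contains repetition, and purely random bytes are essentially incompressible, so the fixture now repeats a small random vocabulary instead of using raw noise. A self-contained comparison (a sketch, not part of the test suite):

```python
import os
import random
import zlib

incompressible = os.urandom(2**20)                                   # 1 MiB of pure noise
words = [os.urandom(8) for _ in range(1000)]                         # small random vocabulary
compressible = b''.join(random.choice(words) for _ in range(2**17))  # 1 MiB built from that vocabulary

print(len(zlib.compress(incompressible, 9)) / 2**20)  # ~1.0: no savings on noise
print(len(zlib.compress(compressible, 9)) / 2**20)    # well below 1.0: repetition compresses
```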
78 changes: 69 additions & 9 deletions tests/test_zran.py
@@ -32,13 +32,13 @@ def test_create_index(compressed_gz_data):
assert len(points[0].window) == 32768


@pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
def test_create_index_fail_head(data, compressed_gz_data_no_head):
with pytest.raises(zran.ZranError, match='zran: compressed data error in input file'):
zran.Index.create_index(compressed_gz_data_no_head)


@pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
def test_create_index_fail_tail(data, compressed_gz_data_no_tail):
with pytest.raises(zran.ZranError, match='zran: input file ended prematurely'):
zran.Index.create_index(compressed_gz_data_no_tail)
@@ -68,7 +68,7 @@ def test_decompress(data, compressed_file):
assert data[start : start + length] == test_data


@pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
# @pytest.mark.skip(reason='Currently unstable. Will sometimes not fail if data has certain (unknown) properties')
def test_decompress_fail(data, compressed_gz_data, compressed_gz_data_no_head):
start = 100
length = 1000
@@ -86,25 +86,85 @@ def test_get_closest_point():
assert r2.outloc == 4


def test_modify_index_and_head_decompress(data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
start = 0
stop = 100

compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop], False)
length = start - uncompressed_range[0]
offset = stop - start
test_data = zran.decompress(
compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset
)
assert data[start:stop] == test_data


@pytest.mark.parametrize('start_index,stop_index', ((0, 5), (4, 10), (9, -1)))
def test_modify_index_and_decompress(start_index, stop_index, data, compressed_dfl_data):
def test_modify_index_and_interior_decompress(start_index, stop_index, data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
start = index.points[start_index].outloc + 100
stop = index.points[stop_index].outloc + 100

compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop])
length = start - uncompressed_range[0]
offset = stop - start
test_data = zran.decompress(
compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset
)
assert data[start:stop] == test_data


def test_modify_index_and_tail_decompress(data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
start = index.points[-1].outloc + 100
stop = len(data)

compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop], False)
length = start - uncompressed_range[0]
offset = stop - start
test_data = zran.decompress(
compressed_dfl_data[compressed_range[0] : compressed_range[1]],
new_index,
start - uncompressed_range[0],
stop - start,
compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, length, offset
)
assert data[start:stop] == test_data


def test_index_after_end_decompress(data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
with pytest.raises(ValueError, match='Offset and length specified would result in reading past the file bounds'):
zran.decompress(compressed_dfl_data, index, 0, len(data) + 1)


def test_modified_index_before_start_decompress(data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
start = index.points[5].outloc
stop = index.points[10].outloc

compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop])
if new_index.points[0].bits != 0:
msg = 'When first index bit != 0, offset must be at or after second index point *'
with pytest.raises(ValueError, match=msg):
zran.decompress(compressed_dfl_data[compressed_range[0] : compressed_range[1]], new_index, 0, 10)


def test_modified_after_end_decompress(data, compressed_dfl_data):
index = zran.Index.create_index(compressed_dfl_data, span=2**18)
start = index.points[5].outloc
stop = index.points[10].outloc

compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop])
with pytest.raises(ValueError, match='Offset and length specified would result in reading past the file bounds'):
zran.decompress(
compressed_dfl_data[compressed_range[0] : compressed_range[1]],
new_index,
new_index.points[1].outloc + 10,
new_index.uncompressed_size,
)


@pytest.mark.skip(reason='Integration test. Only run if testing Sentinel-1 SLC burst compatibility')
@pytest.mark.parametrize('burst', offset_list)
def test_safe(burst, input_data):
def test_burst_extraction(burst, input_data):
swath, golden, index = input_data
compressed_range, uncompressed_range, new_index = index.create_modified_index([burst.start], [burst.stop])
data_subset = swath[compressed_range[0] : compressed_range[1]]
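The interior-decompress test above is the library's core random-access workflow; a condensed sketch of the same steps with the `decompress` parameters spelled out, reusing `data`, `compressed`, and `index` from the earlier sketches (assumed names, not part of this commit):

```python
import zran

# Uncompressed byte range to extract, sitting between two access points.
start = index.points[4].outloc + 100
stop = index.points[10].outloc + 100

# Shrink the index to just that range. The returned ranges describe which slice
# of the compressed stream and of the uncompressed stream the new index covers.
compressed_range, uncompressed_range, new_index = index.create_modified_index([start], [stop])

# offset: where the requested start sits inside the covered uncompressed range.
# length: how many uncompressed bytes to produce.
offset = start - uncompressed_range[0]
length = stop - start

subset = zran.decompress(compressed[compressed_range[0]:compressed_range[1]], new_index, offset, length)
assert subset == data[start:stop]
```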
