Merge pull request #411 from loris-imageserver/jp2-siz-segment

Move the SIZ marker segment parsing into a separate method
loris-imageserver · Mar 7, 2018 · 72847e0 · 72847e0
2 parents 47a3f68 + c239a9f
commit 72847e0
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 25 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -8,3 +8,4 @@ show_missing = True
 exclude_lines =
     pragma: no cover
     assert False, "Should not be reachable"
+    else:  # unreachable
diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py
@@ -15,11 +15,19 @@
 import os
 import struct
 
+import attr
+
 from loris.loris_exception import LorisException
 
 logger = logging.getLogger(__name__)
 
 
+@attr.s(slots=True)
+class Dimensions(object):
+    height = attr.ib()
+    width = attr.ib()
+
+
 class JP2ExtractionError(LorisException):
     """Raised for errors when extracting data from a JP2 image."""
     pass
@@ -168,7 +176,7 @@ def _get_dimensions_from_image_header_box(self, jp2):
         # height and width.  Consume the rest of the box before returning.
         jp2.read(22 - 16)
 
-        return (height, width)
+        return Dimensions(width=width, height=height)
 
     def _parse_colour_specification_box(self, jp2):
         """
@@ -276,9 +284,54 @@ def _parse_colour_specification_box(self, jp2):
             return (['gray', 'color'], profile_bytes)
 
         # This should be unreachable; we include it for completeness.
-        else:
+        else:  # unreachable
             assert False, meth
 
+    def _parse_siz_marker_segment(self, jp2):
+        """
+        The SIZ marker segment provides information about the uncompressed
+        image, including (for our purposes) the width/height of the image.
+
+        The layout of the component is as follows:
+
+            SIZ     Marker code, 2 bytes.  Should have value 0xFF51.
+            Lsiz    Length of the marker segment, 2 bytes.
+            Rsiz    2 bytes, irrelevant to us.
+            Xsiz    4 bytes, irrelevant to us.
+            Ysiz    4 bytes, irrelevant to us.
+            XOsiz   4 bytes, irrelevant to us.
+            YOsiz   4 bytes, irrelevant to us.
+            XTsiz:  Width of one reference tile wrt the ref grid.  4 bytes.
+            YTsiz:  Height of one reference tile wrt the ref grid.  4 bytes.
+
+        We don't care about the rest of the fields, and can skip them.
+
+        See § A.5.1 for details.
+        """
+        marker_code = jp2.read(2)
+        if marker_code != b'\xFF\x51':
+            raise JP2ExtractionError(
+                "Bad marker code in the SIZ marker segment: %r" % marker_code
+            )
+
+        # Now we read through the irrelevant fields:
+        #
+        #   Lsiz     2
+        #   Rsiz     2
+        #   Xsiz     4
+        #   Ysiz     4
+        #   XOsiz    4
+        #   YOsiz    4
+        #   =       20
+        #
+        jp2.read(20)
+
+        # Now we're on the XTsiz and YTsiz components, so read those.
+        xt_siz = struct.unpack('>I', jp2.read(4))[0]
+        yt_siz = struct.unpack('>I', jp2.read(4))[0]
+
+        return Dimensions(width=xt_siz, height=yt_siz)
+
     def extract_jp2(self, jp2):
         """
         Given a file-like object that contains a JP2 image, attempt
@@ -305,14 +358,15 @@ def extract_jp2(self, jp2):
         # box in the JP2 Header box (see § I.5.3).  In particular, it gives
         # us the height and the width.
         dimensions = self._get_dimensions_from_image_header_box(jp2)
-        self.height, self.width = dimensions
+        self.height = dimensions.height
+        self.width = dimensions.width
         logger.debug("width:  %d", self.width)
         logger.debug("height: %d", self.height)
 
         # After the Image Header box, there are a number of other boxes inside
         # the JP2 Header box, which can potentially appear in any order.
         # We're only interested in a Colour Specification box, which has
-        # type 'colr', so skip forward until we find that.  ()
+        # type 'colr', so skip forward until we find that.
         #
         # Note: a JP2 Header box may contain more than one colr box; for now
         # we only use the first and ignore the rest.
@@ -327,23 +381,31 @@ def extract_jp2(self, jp2):
         self.color_profile_bytes = profile_bytes
         logger.debug('qualities: %s', self.profile.description['qualities'])
 
-        scaleFactors = []
+        # This is all the information we need from the JP2 Header box.
 
-        window = deque(jp2.read(2), 2)
-        # start of codestream
-        while ((window[0] != b'\xFF') or (window[1] != b'\x4F')): # (SOC - required, see pg 14)
-            window.append(jp2.read(1))
-        while ((window[0] != b'\xFF') or (window[1] != b'\x51')):  # (SIZ  - required, see pg 14)
-            window.append(jp2.read(1))
-        jp2.read(20) # through Lsiz (16), Rsiz (16), Xsiz (32), Ysiz (32), XOsiz (32), YOsiz (32)
-        tile_width = int(struct.unpack(">I", jp2.read(4))[0]) # XTsiz (32)
-        tile_height = int(struct.unpack(">I", jp2.read(4))[0]) # YTsiz (32)
-        logger.debug("tile width: %s", tile_width)
-        logger.debug("tile height: %s", tile_height)
-        self.tiles.append( { 'width' : tile_width } )
-        if tile_width != tile_height:
-            self.tiles[0]['height'] = tile_height
-        jp2.read(10) # XTOsiz (32), YTOsiz (32), Csiz (16)
+        # Now we want to get tile and size data from the Contiguous Codestream
+        # box, which contains the complete JPEG 2000 codestream (see § I.5.4).
+        #
+        # Specifically, we're interested in the Image and Tile Size (SIZ),
+        # which includes the width and height of the reference grid and tiles.
+        # This starts with a marker code 'SIZ = 0xFF51'.
+        #
+        # There is only one SIZ per codestream, so it suffices to find the
+        # first instance (see § A.5).
+        _read_jp2_until_match(jp2, b'\xFF\x51')
+
+        tile_dimensions = self._parse_siz_marker_segment(jp2)
+        if tile_dimensions.height == tile_dimensions.width:
+            self.tiles.append({
+                'width': tile_dimensions.width
+            })
+        else:
+            self.tiles.append({
+                'width': tile_dimensions.width,
+                'height': tile_dimensions.height
+            })
+
+        scaleFactors = []
 
         window = deque(jp2.read(2), 2)
         while ((window[0] != b'\xFF') or (window[1] != b'\x52')):  # (COD - required, see pg 14)

diff --git a/tests/jp2_extractor_t.py b/tests/jp2_extractor_t.py
@@ -9,7 +9,7 @@
 from hypothesis.strategies import binary
 import pytest
 
-from loris.jp2_extractor import JP2Extractor, JP2ExtractionError
+from loris.jp2_extractor import Dimensions, JP2Extractor, JP2ExtractionError
 
 
 @pytest.fixture
@@ -105,10 +105,22 @@ def test_file_type_box_is_ok_or_error(self, extractor, file_type_box):
             assert 'File Type box' in str(err)
 
     @pytest.mark.parametrize('header_box_bytes, expected_dimensions', [
-        (b'\x00\x00\x00\x01\x00\x00\x00\x01', (1, 1)),
-        (b'\x00\x00\x00\x11\x00\x00\x00\x00', (17, 0)),
-        (b'\x00\x00\x00\x00\x00\x00\x00\x11', (0, 17)),
-        (b'\x01\x01\x01\x01\x02\x02\x02\x02', (16843009, 33686018)),
+        (
+            b'\x00\x00\x00\x01\x00\x00\x00\x01',
+            Dimensions(height=1, width=1)
+        ),
+        (
+            b'\x00\x00\x00\x11\x00\x00\x00\x00',
+            Dimensions(height=17, width=0)
+        ),
+        (
+            b'\x00\x00\x00\x00\x00\x00\x00\x11',
+            Dimensions(height=0, width=17)
+        ),
+        (
+            b'\x01\x01\x01\x01\x02\x02\x02\x02',
+            Dimensions(height=16843009, width=33686018)
+        ),
     ])
     def test_reading_dimensions_from_headr_box(
         self, extractor, header_box_bytes, expected_dimensions
@@ -254,3 +266,35 @@ def test_parse_colour_specification_box_is_okay_or_error(
             qualities, profile_bytes = result
             assert isinstance(qualities, list)
             assert isinstance(profile_bytes, bytes)
+
+    @pytest.mark.parametrize('marker_code', [b'\xFF\x52', b'\xFE\x52', b'00'])
+    def test_bad_siz_marker_code_is_error(self, extractor, marker_code):
+        jp2 = BytesIO(marker_code)
+        with pytest.raises(JP2ExtractionError) as err:
+            extractor._parse_siz_marker_segment(jp2)
+        assert 'Bad marker code in the SIZ marker segment' in str(err.value)
+
+    @pytest.mark.parametrize('xtsiz_ytsiz, expected_dimensions', [
+        (
+            b'\x00\x00\x00\x01\x00\x00\x00\x01',
+            Dimensions(width=1, height=1)
+        ),
+        (
+            b'\x00\x00\x00\x11\x00\x00\x00\x00',
+            Dimensions(width=17, height=0)
+        ),
+        (
+            b'\x00\x00\x00\x00\x00\x00\x00\x11',
+            Dimensions(width=0, height=17)
+        ),
+        (
+            b'\x01\x01\x01\x01\x02\x02\x02\x02',
+            Dimensions(width=16843009, height=33686018)
+        ),
+    ])
+    def test_get_dimensions_from_siz_marker_segment(
+        self, extractor, xtsiz_ytsiz, expected_dimensions
+    ):
+        jp2 = BytesIO(b'\xFF\x51' + b'\x00' * 20 + xtsiz_ytsiz)
+        dimensions = extractor._parse_siz_marker_segment(jp2)
+        assert dimensions == expected_dimensions