diff --git a/.coveragerc b/.coveragerc index cbe62bef..7be34ee4 100644 --- a/.coveragerc +++ b/.coveragerc @@ -8,3 +8,4 @@ show_missing = True exclude_lines = pragma: no cover assert False, "Should not be reachable" + else: # unreachable diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index 69477a28..a8429df9 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -15,11 +15,19 @@ import os import struct +import attr + from loris.loris_exception import LorisException logger = logging.getLogger(__name__) +@attr.s(slots=True) +class Dimensions(object): + height = attr.ib() + width = attr.ib() + + class JP2ExtractionError(LorisException): """Raised for errors when extracting data from a JP2 image.""" pass @@ -168,7 +176,7 @@ def _get_dimensions_from_image_header_box(self, jp2): # height and width. Consume the rest of the box before returning. jp2.read(22 - 16) - return (height, width) + return Dimensions(width=width, height=height) def _parse_colour_specification_box(self, jp2): """ @@ -276,9 +284,54 @@ def _parse_colour_specification_box(self, jp2): return (['gray', 'color'], profile_bytes) # This should be unreachable; we include it for completeness. - else: + else: # unreachable assert False, meth + def _parse_siz_marker_segment(self, jp2): + """ + The SIZ marker segment provides information about the uncompressed + image, including (for our purposes) the width/height of the image. + + The layout of the component is as follows: + + SIZ Marker code, 2 bytes. Should have value 0xFF51. + Lsiz Length of the marker segment, 2 bytes. + Rsiz 2 bytes, irrelevant to us. + Xsiz 4 bytes, irrelevant to us. + Ysiz 4 bytes, irrelevant to us. + XOsiz 4 bytes, irrelevant to us. + YOsiz 4 bytes, irrelevant to us. + XTsiz: Width of one reference tile wrt the ref grid. 4 bytes. + YTsiz: Height of one reference tile wrt the ref grid. 4 bytes. + + We don't care about the rest of the fields, and can skip them. + + See § A.5.1 for details. 
+ """ + marker_code = jp2.read(2) + if marker_code != b'\xFF\x51': + raise JP2ExtractionError( + "Bad marker code in the SIZ marker segment: %r" % marker_code + ) + + # Now we read through the irrelevant fields: + # + # Lsiz 2 + # Rsiz 2 + # Xsiz 4 + # Ysiz 4 + # XOsiz 4 + # YOsiz 4 + # = 20 + # + jp2.read(20) + + # Now we're on the XTsiz and YTsiz components, so read those. + xt_siz = struct.unpack('>I', jp2.read(4))[0] + yt_siz = struct.unpack('>I', jp2.read(4))[0] + + return Dimensions(width=xt_siz, height=yt_siz) + def extract_jp2(self, jp2): """ Given a file-like object that contains a JP2 image, attempt @@ -305,14 +358,15 @@ def extract_jp2(self, jp2): # box in the JP2 Header box (see § I.5.3). In particular, it gives # us the height and the width. dimensions = self._get_dimensions_from_image_header_box(jp2) - self.height, self.width = dimensions + self.height = dimensions.height + self.width = dimensions.width logger.debug("width: %d", self.width) logger.debug("height: %d", self.height) # After the Image Header box, there are a number of other boxes inside # the JP2 Header box, which can potentially appear in any order. # We're only interested in a Colour Specification box, which has - # type 'colr', so skip forward until we find that. () + # type 'colr', so skip forward until we find that. # # Note: a JP2 Header box may contain more than one colr box; for now # we only use the first and ignore the rest. @@ -327,23 +381,31 @@ def extract_jp2(self, jp2): self.color_profile_bytes = profile_bytes logger.debug('qualities: %s', self.profile.description['qualities']) - scaleFactors = [] + # This is all the information we need from the JP2 Header box. 
- window = deque(jp2.read(2), 2) - # start of codestream - while ((window[0] != b'\xFF') or (window[1] != b'\x4F')): # (SOC - required, see pg 14) - window.append(jp2.read(1)) - while ((window[0] != b'\xFF') or (window[1] != b'\x51')): # (SIZ - required, see pg 14) - window.append(jp2.read(1)) - jp2.read(20) # through Lsiz (16), Rsiz (16), Xsiz (32), Ysiz (32), XOsiz (32), YOsiz (32) - tile_width = int(struct.unpack(">I", jp2.read(4))[0]) # XTsiz (32) - tile_height = int(struct.unpack(">I", jp2.read(4))[0]) # YTsiz (32) - logger.debug("tile width: %s", tile_width) - logger.debug("tile height: %s", tile_height) - self.tiles.append( { 'width' : tile_width } ) - if tile_width != tile_height: - self.tiles[0]['height'] = tile_height - jp2.read(10) # XTOsiz (32), YTOsiz (32), Csiz (16) + # Now we want to get tile and size data from the Contiguous Codestream + # box, which contains the complete JPEG 2000 codestream (see § I.5.4). + # + # Specifically, we're interested in the Image and Tile Size (SIZ), + # which includes the width and height of the reference grid and tiles. + # This starts with a marker code 'SIZ = 0xFF51'. + # + # There is only one SIZ per codestream, so it suffices to find the + # first instance (see § A.5). 
+ _read_jp2_until_match(jp2, b'\xFF\x51') + + tile_dimensions = self._parse_siz_marker_segment(jp2) + if tile_dimensions.height == tile_dimensions.width: + self.tiles.append({ + 'width': tile_dimensions.width + }) + else: + self.tiles.append({ + 'width': tile_dimensions.width, + 'height': tile_dimensions.height + }) + + scaleFactors = [] window = deque(jp2.read(2), 2) while ((window[0] != b'\xFF') or (window[1] != b'\x52')): # (COD - required, see pg 14) diff --git a/tests/jp2_extractor_t.py b/tests/jp2_extractor_t.py index 123e63c7..adf0221e 100644 --- a/tests/jp2_extractor_t.py +++ b/tests/jp2_extractor_t.py @@ -9,7 +9,7 @@ from hypothesis.strategies import binary import pytest -from loris.jp2_extractor import JP2Extractor, JP2ExtractionError +from loris.jp2_extractor import Dimensions, JP2Extractor, JP2ExtractionError @pytest.fixture @@ -105,10 +105,22 @@ def test_file_type_box_is_ok_or_error(self, extractor, file_type_box): assert 'File Type box' in str(err) @pytest.mark.parametrize('header_box_bytes, expected_dimensions', [ - (b'\x00\x00\x00\x01\x00\x00\x00\x01', (1, 1)), - (b'\x00\x00\x00\x11\x00\x00\x00\x00', (17, 0)), - (b'\x00\x00\x00\x00\x00\x00\x00\x11', (0, 17)), - (b'\x01\x01\x01\x01\x02\x02\x02\x02', (16843009, 33686018)), + ( + b'\x00\x00\x00\x01\x00\x00\x00\x01', + Dimensions(height=1, width=1) + ), + ( + b'\x00\x00\x00\x11\x00\x00\x00\x00', + Dimensions(height=17, width=0) + ), + ( + b'\x00\x00\x00\x00\x00\x00\x00\x11', + Dimensions(height=0, width=17) + ), + ( + b'\x01\x01\x01\x01\x02\x02\x02\x02', + Dimensions(height=16843009, width=33686018) + ), ]) def test_reading_dimensions_from_headr_box( self, extractor, header_box_bytes, expected_dimensions @@ -254,3 +266,35 @@ def test_parse_colour_specification_box_is_okay_or_error( qualities, profile_bytes = result assert isinstance(qualities, list) assert isinstance(profile_bytes, bytes) + + @pytest.mark.parametrize('marker_code', [b'\xFF\x52', b'\xFE\x52', b'00']) + def 
test_bad_siz_marker_code_is_error(self, extractor, marker_code): + jp2 = BytesIO(marker_code) + with pytest.raises(JP2ExtractionError) as err: + extractor._parse_siz_marker_segment(jp2) + assert 'Bad marker code in the SIZ marker segment' in str(err.value) + + @pytest.mark.parametrize('xtsiz_ytsiz, expected_dimensions', [ + ( + b'\x00\x00\x00\x01\x00\x00\x00\x01', + Dimensions(width=1, height=1) + ), + ( + b'\x00\x00\x00\x11\x00\x00\x00\x00', + Dimensions(width=17, height=0) + ), + ( + b'\x00\x00\x00\x00\x00\x00\x00\x11', + Dimensions(width=0, height=17) + ), + ( + b'\x01\x01\x01\x01\x02\x02\x02\x02', + Dimensions(width=16843009, height=33686018) + ), + ]) + def test_get_dimensions_from_siz_marker_segment( + self, extractor, xtsiz_ytsiz, expected_dimensions + ): + jp2 = BytesIO(b'\xFF\x51' + b'\x00' * 20 + xtsiz_ytsiz) + dimensions = extractor._parse_siz_marker_segment(jp2) + assert dimensions == expected_dimensions