From fcf0a055b94e92a12e47caaeb269203a2700997c Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 09:19:58 +0000 Subject: [PATCH 1/6] jp2_extractor.py: start refactoring the SIZ method --- loris/jp2_extractor.py | 41 ++++++++++++++++++++++++++++++++++++++++ tests/jp2_extractor_t.py | 7 +++++++ 2 files changed, 48 insertions(+) diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index 69477a28..bb937d7b 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -279,6 +279,33 @@ def _parse_colour_specification_box(self, jp2): else: assert False, meth + def _parse_siz_marker_segment(self, jp2): + """ + The SIZ marker segment provides information about the uncompressed + image, including (for our purposes) the width/height of the image. + + The layout of the component is as follows: + + SIZ Marker code, 2 bytes. Should have value 0xFF51. + Lsiz Length of the marker segment, 2 bytes. + Rsiz 2 bytes, irrelevant to us. + Xsiz 4 bytes, irrelevant to us. + Ysiz 4 bytes, irrelevant to us. + XOsiz 4 bytes, irrelevant to us. + YOsiz 4 bytes, irrelevant to us. + XTsiz: Width of one reference tile wrt the ref grid. 4 bytes. + YTsiz: Height of one reference tile wrt the ref grid. 4 bytes. + + We don't care about the rest of the fields, and can skip them. + + See § A.5.1 for details. + """ + marker_code = jp2.read(2) + if marker_code != b'\xFF\x51': + raise JP2ExtractionError( + "Bad marker code in the SIZ marker segment: %r" % marker_code + ) + def extract_jp2(self, jp2): """ Given a file-like object that contains a JP2 image, attempt @@ -327,6 +354,20 @@ def extract_jp2(self, jp2): self.color_profile_bytes = profile_bytes logger.debug('qualities: %s', self.profile.description['qualities']) + # This is all the information we need from the JP2 Header box. + + # Now we want to get tile and size data from the + # Continuguous Codestream box, which contains the complete JPEG 2000 + # codestream (see § I.5.4). 
+ # + # Specifically, we're interested in the Image and Tile Size (SIZ), + # which includes the width and height of the reference grid and tiles. + # This starts with a marker code 'SIZ = 0xFF51'. + # + # There is only one SIZ per codestream, so it suffices to find the + # first instance (see § A.5). + _read_jp2_until_match(jp2, b'\xFF\x51') + scaleFactors = [] window = deque(jp2.read(2), 2) diff --git a/tests/jp2_extractor_t.py b/tests/jp2_extractor_t.py index 123e63c7..ed81f0e2 100644 --- a/tests/jp2_extractor_t.py +++ b/tests/jp2_extractor_t.py @@ -254,3 +254,10 @@ def test_parse_colour_specification_box_is_okay_or_error( qualities, profile_bytes = result assert isinstance(qualities, list) assert isinstance(profile_bytes, bytes) + + @pytest.mark.parametrize('marker_code', [b'\xFF\x52', b'\xFE\x52', b'00']) + def test_bad_siz_marker_code_is_error(self, extractor, marker_code): + jp2 = BytesIO(marker_code) + with pytest.raises(JP2ExtractionError) as err: + extractor._parse_siz_marker_segment(jp2) + assert 'Bad marker code in the SIZ marker segment' in str(err.value) From 9a2432b4a24e48308d8eebfe3949554690d9ba15 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 09:26:00 +0000 Subject: [PATCH 2/6] jp2_extractor.py: store 2D dimensions in a structure, not a tuple Because it turns out the JP2 standard is inconsistent in whether height or width comes first! 
--- loris/jp2_extractor.py | 13 +++++++++++-- tests/jp2_extractor_t.py | 22 +++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index bb937d7b..5dadc98f 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -15,11 +15,19 @@ import os import struct +import attr + from loris.loris_exception import LorisException logger = logging.getLogger(__name__) +@attr.s(slots=True) +class Dimensions(object): + height = attr.ib() + width = attr.ib() + + class JP2ExtractionError(LorisException): """Raised for errors when extracting data from a JP2 image.""" pass @@ -168,7 +176,7 @@ def _get_dimensions_from_image_header_box(self, jp2): # height and width. Consume the rest of the box before returning. jp2.read(22 - 16) - return (height, width) + return Dimensions(width=width, height=height) def _parse_colour_specification_box(self, jp2): """ @@ -332,7 +340,8 @@ def extract_jp2(self, jp2): # box in the JP2 Header box (see § I.5.3). In particular, it gives # us the height and the width. 
dimensions = self._get_dimensions_from_image_header_box(jp2) - self.height, self.width = dimensions + self.height = dimensions.height + self.width = dimensions.width logger.debug("width: %d", self.width) logger.debug("height: %d", self.height) diff --git a/tests/jp2_extractor_t.py b/tests/jp2_extractor_t.py index ed81f0e2..b4fea562 100644 --- a/tests/jp2_extractor_t.py +++ b/tests/jp2_extractor_t.py @@ -9,7 +9,7 @@ from hypothesis.strategies import binary import pytest -from loris.jp2_extractor import JP2Extractor, JP2ExtractionError +from loris.jp2_extractor import Dimensions, JP2Extractor, JP2ExtractionError @pytest.fixture @@ -105,10 +105,22 @@ def test_file_type_box_is_ok_or_error(self, extractor, file_type_box): assert 'File Type box' in str(err) @pytest.mark.parametrize('header_box_bytes, expected_dimensions', [ - (b'\x00\x00\x00\x01\x00\x00\x00\x01', (1, 1)), - (b'\x00\x00\x00\x11\x00\x00\x00\x00', (17, 0)), - (b'\x00\x00\x00\x00\x00\x00\x00\x11', (0, 17)), - (b'\x01\x01\x01\x01\x02\x02\x02\x02', (16843009, 33686018)), + ( + b'\x00\x00\x00\x01\x00\x00\x00\x01', + Dimensions(height=1, width=1) + ), + ( + b'\x00\x00\x00\x11\x00\x00\x00\x00', + Dimensions(height=17, width=0) + ), + ( + b'\x00\x00\x00\x00\x00\x00\x00\x11', + Dimensions(height=0, width=17) + ), + ( + b'\x01\x01\x01\x01\x02\x02\x02\x02', + Dimensions(height=16843009, width=33686018) + ), ]) def test_reading_dimensions_from_headr_box( self, extractor, header_box_bytes, expected_dimensions From 129add213cb60a2a9c3982593e6346a6a45d669e Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 09:27:59 +0000 Subject: [PATCH 3/6] jp2_extractor.py: finish breaking up the SIZ marker --- loris/jp2_extractor.py | 47 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index 5dadc98f..1c919fa5 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -314,6 +314,24 @@ def 
_parse_siz_marker_segment(self, jp2): "Bad marker code in the SIZ marker segment: %r" % marker_code ) + # Now we read through the irrelevant fields: + # + # Lsiz 2 + # Rsiz 2 + # Xsiz 4 + # Ysiz 4 + # XOsiz 4 + # YOsiz 4 + # = 20 + # + jp2.read(20) + + # Now we're on the XTsiz and YTsiz components, so read those. + xt_siz = struct.unpack('>I', jp2.read(4))[0] + yt_siz = struct.unpack('>I', jp2.read(4))[0] + + return Dimensions(width=xt_siz, height=yt_siz) + def extract_jp2(self, jp2): """ Given a file-like object that contains a JP2 image, attempt @@ -348,7 +366,7 @@ def extract_jp2(self, jp2): # After the Image Header box, there are a number of other boxes inside # the JP2 Header box, which can potentially appear in any order. # We're only interested in a Colour Specification box, which has - # type 'colr', so skip forward until we find that. () + # type 'colr', so skip forward until we find that. # # Note: a JP2 Header box may contain more than one colr box; for now # we only use the first and ignore the rest. @@ -377,23 +395,18 @@ def extract_jp2(self, jp2): # first instance (see § A.5). 
_read_jp2_until_match(jp2, b'\xFF\x51') - scaleFactors = [] + tile_dimensions = self._parse_siz_marker_segment(jp2) + if tile_dimensions.height == tile_dimensions.width: + self.tiles.append({ + 'width': tile_dimensions.width + }) + else: + self.tiles.append({ + 'width': tile_dimensions.width, + 'height': tile_dimensions.height + }) - window = deque(jp2.read(2), 2) - # start of codestream - while ((window[0] != b'\xFF') or (window[1] != b'\x4F')): # (SOC - required, see pg 14) - window.append(jp2.read(1)) - while ((window[0] != b'\xFF') or (window[1] != b'\x51')): # (SIZ - required, see pg 14) - window.append(jp2.read(1)) - jp2.read(20) # through Lsiz (16), Rsiz (16), Xsiz (32), Ysiz (32), XOsiz (32), YOsiz (32) - tile_width = int(struct.unpack(">I", jp2.read(4))[0]) # XTsiz (32) - tile_height = int(struct.unpack(">I", jp2.read(4))[0]) # YTsiz (32) - logger.debug("tile width: %s", tile_width) - logger.debug("tile height: %s", tile_height) - self.tiles.append( { 'width' : tile_width } ) - if tile_width != tile_height: - self.tiles[0]['height'] = tile_height - jp2.read(10) # XTOsiz (32), YTOsiz (32), Csiz (16) + scaleFactors = [] window = deque(jp2.read(2), 2) while ((window[0] != b'\xFF') or (window[1] != b'\x52')): # (COD - required, see pg 14) From 9660b438538d20fa422f4051368f883375c746fa Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 09:30:33 +0000 Subject: [PATCH 4/6] jp2_extractor_t.py: add some tests around tile size extraction --- tests/jp2_extractor_t.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/jp2_extractor_t.py b/tests/jp2_extractor_t.py index b4fea562..adf0221e 100644 --- a/tests/jp2_extractor_t.py +++ b/tests/jp2_extractor_t.py @@ -273,3 +273,28 @@ def test_bad_siz_marker_code_is_error(self, extractor, marker_code): with pytest.raises(JP2ExtractionError) as err: extractor._parse_siz_marker_segment(jp2) assert 'Bad marker code in the SIZ marker segment' in str(err.value) + + 
@pytest.mark.parametrize('xtsiz_ytsiz, expected_dimensions', [ + ( + b'\x00\x00\x00\x01\x00\x00\x00\x01', + Dimensions(width=1, height=1) + ), + ( + b'\x00\x00\x00\x11\x00\x00\x00\x00', + Dimensions(width=17, height=0) + ), + ( + b'\x00\x00\x00\x00\x00\x00\x00\x11', + Dimensions(width=0, height=17) + ), + ( + b'\x01\x01\x01\x01\x02\x02\x02\x02', + Dimensions(width=16843009, height=33686018) + ), + ]) + def test_get_dimensions_from_siz_marker_segment( + self, extractor, xtsiz_ytsiz, expected_dimensions + ): + jp2 = BytesIO(b'\xFF\x51' + b'\x00' * 20 + xtsiz_ytsiz) + dimensions = extractor._parse_siz_marker_segment(jp2) + assert dimensions == expected_dimensions From 0ed1b0f90fe1cf4431b9e366686522faf695ab40 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 09:30:47 +0000 Subject: [PATCH 5/6] jp2_extractor.py: a quick coverage fix --- .coveragerc | 1 + loris/jp2_extractor.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.coveragerc b/.coveragerc index cbe62bef..7be34ee4 100644 --- a/.coveragerc +++ b/.coveragerc @@ -8,3 +8,4 @@ show_missing = True exclude_lines = pragma: no cover assert False, "Should not be reachable" + else: # unreachable diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index 1c919fa5..c8bc65ce 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -284,7 +284,7 @@ def _parse_colour_specification_box(self, jp2): return (['gray', 'color'], profile_bytes) # This should be unreachable; we include it for completeness. 
- else: + else: # unreachable assert False, meth def _parse_siz_marker_segment(self, jp2): From c239a9fda46d8a37f20e8225f893a799129f8b58 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 7 Mar 2018 14:40:09 +0000 Subject: [PATCH 6/6] jp2_extractor.py: fix a tyop --- loris/jp2_extractor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/loris/jp2_extractor.py b/loris/jp2_extractor.py index c8bc65ce..a8429df9 100644 --- a/loris/jp2_extractor.py +++ b/loris/jp2_extractor.py @@ -383,9 +383,8 @@ def extract_jp2(self, jp2): # This is all the information we need from the JP2 Header box. - # Now we want to get tile and size data from the - # Continuguous Codestream box, which contains the complete JPEG 2000 - # codestream (see § I.5.4). + # Now we want to get tile and size data from the Contiguous Codestream + # box, which contains the complete JPEG 2000 codestream (see § I.5.4). # # Specifically, we're interested in the Image and Tile Size (SIZ), # which includes the width and height of the reference grid and tiles.