From 8a9430e634711742b2e7ba32b6e2608e9e47e3a0 Mon Sep 17 00:00:00 2001
From: Alexander R Craven <12934018+alexcraven@users.noreply.github.com>
Date: Mon, 11 Nov 2024 16:27:34 +0100
Subject: [PATCH] GE P-file reader: adaptive character encoding (#156)

* GE P-file reader: adaptive character encoding

`ge_read_pfile` and `ge_pfile` assumed utf-8 encoding in character strings within the p-file; this does not appear to be standard across systems. Suggested patch attempts a few likely encoding candidates, before falling back on a permissive ascii encoding.

* Fix lint errors in updated GE reader

* Added non-English character tests to test_ge_pfile

Corresponding test data (https://github.com/user-attachments/files/17702724/GE_character_encoding_test_data.zip) is expected under spec2nii_test_data/ge/pFiles/PRESS/MR30.1

* Update submodule for new test data.

* Fix directory structure.

---------

Co-authored-by: wtclarke <william.clarke@ndcn.ox.ac.uk>
---
 spec2nii/GE/ge_pfile.py      | 36 +++++++++++-------
 spec2nii/GE/ge_read_pfile.py | 36 ++++++++++++++----
 tests/spec2nii_test_data     |  2 +-
 tests/test_ge_pfile.py       | 72 ++++++++++++++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 21 deletions(-)

diff --git a/spec2nii/GE/ge_pfile.py b/spec2nii/GE/ge_pfile.py
index 098976f..4af6bd4 100644
--- a/spec2nii/GE/ge_pfile.py
+++ b/spec2nii/GE/ge_pfile.py
@@ -89,12 +89,15 @@ def _process_svs_pfile(pfile):
     :return: List of NIFTI MRS data objects
     :return: List of file name suffixes
     """
-    psd = pfile.hdr.rhi_psdname.decode('utf-8').lower()
-    proto = pfile.hdr.rhs_se_desc.decode('utf-8').lower()
+
+    assert pfile.encoding is not None  # encoding should have been set in ge_read_pfile get_mapper
+
+    psd = pfile.hdr.rhi_psdname.decode(pfile.encoding, errors='replace').lower()
+    proto = pfile.hdr.rhs_se_desc.decode(pfile.encoding, errors='replace').lower()
     if psd == 'hbcd' and "press" in proto:
         print('\nPSD was: ', psd)
         print('Proto is: ', proto)
-        psd = pfile.hdr.rhs_se_desc.decode('utf-8').lower()
+        psd = pfile.hdr.rhs_se_desc.decode(pfile.encoding, errors='replace').lower()
         print('PSD updated to: ', psd)
 
     # MM: Some 'gaba' psd strings contain full path names, so truncate to the end of the path
@@ -429,7 +432,10 @@ def _process_mrsi_pfile(pfile):
     :return: List of NIFTI MRS data objects
     :return: List of file name suffixes
     """
-    psd = pfile.hdr.rhi_psdname.decode('utf-8').lower()
+
+    assert pfile.encoding is not None  # encoding should have been set in ge_read_pfile get_mapper
+
+    psd = pfile.hdr.rhi_psdname.decode(pfile.encoding, errors='replace').lower()
 
     known_formats = ('probe-p', 'probe-sl', 'slaser_cni', 'presscsi')
     if psd not in known_formats:
@@ -573,37 +579,41 @@ def _populate_metadata(pfile, water_suppressed=True, data_dimensions=None):
     # 'Manufacturer'
     meta.set_standard_def('Manufacturer', 'GE')
     # 'ManufacturersModelName'
-    meta.set_standard_def('ManufacturersModelName', hdr.rhe_ex_sysid.decode('utf-8'))
+    meta.set_standard_def('ManufacturersModelName', hdr.rhe_ex_sysid.decode(pfile.encoding, errors='replace'))
     # 'DeviceSerialNumber'
-    meta.set_standard_def('DeviceSerialNumber', hdr.rhe_uniq_sys_id.decode('utf-8'))
+    meta.set_standard_def('DeviceSerialNumber', hdr.rhe_uniq_sys_id.decode(pfile.encoding, errors='replace'))
     # 'SoftwareVersions'
-    meta.set_standard_def('SoftwareVersions', hdr.rhe_ex_verscre.decode('utf-8'))
+    meta.set_standard_def('SoftwareVersions', hdr.rhe_ex_verscre.decode(pfile.encoding, errors='replace'))
     # 'InstitutionName'
-    meta.set_standard_def('InstitutionName', hdr.rhe_hospname.decode('utf-8'))
+    meta.set_standard_def('InstitutionName', hdr.rhe_hospname.decode(pfile.encoding, errors='replace'))
     # 'InstitutionAddress'
     # Not known
     # 'TxCoil'
     # Not Known
     # 'RxCoil'
-    meta.set_user_def(key='ReceiveCoilName', value=hdr.rhi_cname.decode('utf-8'), doc='Rx coil name.')
+    meta.set_user_def(
+        key="ReceiveCoilName",
+        value=hdr.rhi_cname.decode(pfile.encoding, errors="replace"),
+        doc="Rx coil name.",
+    )
 
     # # 5.3 Sequence information
     # 'SequenceName'
-    meta.set_standard_def('SequenceName', hdr.rhi_psdname.decode('utf-8'))
+    meta.set_standard_def('SequenceName', hdr.rhi_psdname.decode(pfile.encoding, errors='replace'))
     # 'ProtocolName'
-    meta.set_standard_def('ProtocolName', hdr.rhs_se_desc.decode('utf-8'))
+    meta.set_standard_def('ProtocolName', hdr.rhs_se_desc.decode(pfile.encoding, errors='replace'))
 
     # # 5.4 Sequence information
     # 'PatientPosition'
     # Not known
     # 'PatientName'
-    meta.set_standard_def('PatientName', hdr.rhe_patname.decode('utf-8'))
+    meta.set_standard_def('PatientName', hdr.rhe_patname.decode(pfile.encoding, errors='replace'))
     # 'PatientID'
     # Not known
     # 'PatientWeight'
     # Not known
     # 'PatientDoB'
-    meta.set_standard_def('PatientDoB', hdr.rhe_dateofbirth.decode('utf-8'))
+    meta.set_standard_def('PatientDoB', hdr.rhe_dateofbirth.decode(pfile.encoding, errors='replace'))
     # 'PatientSex'
     if hdr.rhe_patsex == 1:
         sex_str = 'M'
diff --git a/spec2nii/GE/ge_read_pfile.py b/spec2nii/GE/ge_read_pfile.py
index d48a084..b45b94a 100644
--- a/spec2nii/GE/ge_read_pfile.py
+++ b/spec2nii/GE/ge_read_pfile.py
@@ -124,6 +124,7 @@ def __init__(self, fname):
         self.hdr        = None
         self.map        = None
         self.endian     = 'little'  # def for version >= 11
+        self.encoding   = None
 
         self.read_header()
 
@@ -176,10 +177,31 @@ def get_mapper(self):
         if self.hdr is None:
             return None
 
-        psd = self.hdr.rhi_psdname.decode('utf-8').lower()
-        proto = self.hdr.rhs_se_desc.decode('utf-8').lower()
-        if psd == 'hbcd' and "press" in proto:
-            psd = self.hdr.rhs_se_desc.decode('utf-8').lower()
+        # ARC 20241105 : utf-8 codec is not standard across systems; here, we try a
+        # couple of likely candidates, falling back on permissive ascii
+
+        for encoding, errors in [
+            ("utf-8", "strict"),
+            ("ISO-8859-1", "strict"),
+            ("ascii", "replace"),
+        ]:
+            try:
+                psd = self.hdr.rhi_psdname.decode(encoding, errors).lower()
+                proto = self.hdr.rhs_se_desc.decode(encoding, errors).lower()
+
+                # the following is unused in this context, but can inform codec selection
+                _ = self.hdr.rhe_patname.decode(encoding, errors)
+
+                if psd == "hbcd" and "press" in proto:
+                    psd = self.hdr.rhs_se_desc.decode(encoding, errors).lower()
+            except UnicodeDecodeError:
+                psd = ""
+                proto = ""
+                continue
+            self.encoding = encoding
+            break
+
+        assert self.encoding is not None  # final codec should have succeeded
 
         # MM: Some 'gaba' psd strings contain full path names, so truncate to the end of the path
         if psd.endswith('gaba'):
@@ -645,7 +667,7 @@ def get_dcos(self):
 
         dcos[0][0] = (self.hdr.rhi_trhc_R - self.hdr.rhi_tlhc_R)
         dcos[0][1] = (self.hdr.rhi_trhc_A - self.hdr.rhi_tlhc_A)
-        dcos[0][2] =  (self.hdr.rhi_trhc_S - self.hdr.rhi_tlhc_S)
+        dcos[0][2] = (self.hdr.rhi_trhc_S - self.hdr.rhi_tlhc_S)
 
         dcosLengthX = np.sqrt(dcos[0][0] * dcos[0][0]
                               + dcos[0][1] * dcos[0][1]
@@ -657,7 +679,7 @@ def get_dcos(self):
 
         dcos[1][0] = (self.hdr.rhi_brhc_R - self.hdr.rhi_trhc_R)
         dcos[1][1] = (self.hdr.rhi_brhc_A - self.hdr.rhi_trhc_A)
-        dcos[1][2] =  (self.hdr.rhi_brhc_S - self.hdr.rhi_trhc_S)
+        dcos[1][2] = (self.hdr.rhi_brhc_S - self.hdr.rhi_trhc_S)
 
         dcosLengthY = np.sqrt(dcos[1][0] * dcos[1][0]
                               + dcos[1][1] * dcos[1][1]
@@ -986,7 +1008,7 @@ def read_data(self):
         numTimePts      = self.get_num_time_points
         numSpecPts      = self.hdr.rhr_rh_frame_size
         numFreqPts      = numSpecPts
-        numComponents   =  2
+        numComponents   = 2
         dataWordSize    = self.hdr.rhr_rh_point_size
 
         numBytesInVol   = self.get_num_kspace_points * numSpecPts * numComponents * dataWordSize
diff --git a/tests/spec2nii_test_data b/tests/spec2nii_test_data
index 1594c26..088e8f1 160000
--- a/tests/spec2nii_test_data
+++ b/tests/spec2nii_test_data
@@ -1 +1 @@
-Subproject commit 1594c2625a53a877670f9dd0492c0e1b6f3471d5
+Subproject commit 088e8f1646f839ad2c75eacfdd2a8d6ae7707ade
diff --git a/tests/test_ge_pfile.py b/tests/test_ge_pfile.py
index 4489409..c0c22bd 100644
--- a/tests/test_ge_pfile.py
+++ b/tests/test_ge_pfile.py
@@ -35,6 +35,10 @@
 # HBCD / ISTHMUS datasets
 hbcd2_path = ge_path / 'pFiles' / 'hbcd' / 'P31744.7'
 
+# Test set from Bergen (MR30.1, non-English characters in header text)
+bergen_press_301 = ge_path / 'pFiles' / 'PRESS' / 'MR30.1' / 'P30101.7'
+bergen_press_301_non_english = ge_path / 'pFiles' / 'PRESS' / 'MR30.1' / 'P30104.7'
+
 
 def test_svs(tmp_path):
 
@@ -355,3 +359,71 @@ def test_hbcd_isthmus(tmp_path):
     img = NIFTI_MRS(tmp_path / 'hbcd_short_te.nii.gz')
     assert img.shape == (1, 1, 1, 2048, 32, 8)
     assert img.dim_tags == ['DIM_DYN', 'DIM_COIL', None]
+
+
+def test_svs_bergen_301(tmp_path):
+
+    subprocess.check_call(['spec2nii', 'ge',
+                           '-f', 'svs',
+                           '-o', tmp_path,
+                           '-j',
+                           str(bergen_press_301)])
+
+    img, hdr_ext = read_nifti_mrs_with_hdr(tmp_path / 'svs.nii.gz')
+    img_ref, hdr_ext_ref  = read_nifti_mrs_with_hdr(tmp_path / 'svs_ref.nii.gz')
+
+    assert img.shape == (1, 1, 1, 4096, 48, 2)
+    assert np.iscomplexobj(img.dataobj)
+    assert 1 / img.header['pixdim'][4] == 5000.0
+    assert hdr_ext['WaterSuppressed']
+
+    assert img_ref.shape == (1, 1, 1, 4096, 48, 2)
+    assert np.iscomplexobj(img_ref.dataobj)
+    assert 1 / img_ref.header['pixdim'][4] == 5000.0
+    assert not hdr_ext_ref['WaterSuppressed']
+
+    assert hdr_ext['dim_5'] == 'DIM_COIL'
+    assert hdr_ext['dim_6'] == 'DIM_DYN'
+    assert np.isclose(127.7, hdr_ext['SpectrometerFrequency'][0], atol=1E-1)
+    assert hdr_ext['ResonantNucleus'][0] == '1H'
+
+    assert np.isclose(hdr_ext['EchoTime'], 0.03)
+    assert np.isclose(hdr_ext['RepetitionTime'], 2.0)
+
+    assert hdr_ext['PatientName'] == 'fantom'
+    assert hdr_ext['SequenceName'] == 'PROBE-P'
+    assert hdr_ext['ProtocolName'] == 'PROBE-P'
+
+
+def test_svs_bergen_301_non_english(tmp_path):
+
+    subprocess.check_call(['spec2nii', 'ge',
+                           '-f', 'svs',
+                           '-o', tmp_path,
+                           '-j',
+                           str(bergen_press_301_non_english)])
+
+    img, hdr_ext = read_nifti_mrs_with_hdr(tmp_path / 'svs.nii.gz')
+    img_ref, hdr_ext_ref  = read_nifti_mrs_with_hdr(tmp_path / 'svs_ref.nii.gz')
+
+    assert img.shape == (1, 1, 1, 4096, 48, 2)
+    assert np.iscomplexobj(img.dataobj)
+    assert 1 / img.header['pixdim'][4] == 5000.0
+    assert hdr_ext['WaterSuppressed']
+
+    assert img_ref.shape == (1, 1, 1, 4096, 48, 2)
+    assert np.iscomplexobj(img_ref.dataobj)
+    assert 1 / img_ref.header['pixdim'][4] == 5000.0
+    assert not hdr_ext_ref['WaterSuppressed']
+
+    assert hdr_ext['dim_5'] == 'DIM_COIL'
+    assert hdr_ext['dim_6'] == 'DIM_DYN'
+    assert np.isclose(127.7, hdr_ext['SpectrometerFrequency'][0], atol=1E-1)
+    assert hdr_ext['ResonantNucleus'][0] == '1H'
+
+    assert np.isclose(hdr_ext['EchoTime'], 0.03)
+    assert np.isclose(hdr_ext['RepetitionTime'], 2.0)
+
+    assert hdr_ext['PatientName'] == 'fantom^prøve'
+    assert hdr_ext['SequenceName'] == 'PROBE-P'
+    assert hdr_ext['ProtocolName'] == 'PROBE-P åøæäöÅØÆÄÖ'