From b4c06d2192fe70adf30ba209a464022c5e79311c Mon Sep 17 00:00:00 2001 From: "Mr. Pennyworth" Date: Fri, 21 Jun 2024 12:45:03 -0500 Subject: [PATCH] Document the get_word_defs_map func --- pyapp/BetterDict.py | 62 ++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/pyapp/BetterDict.py b/pyapp/BetterDict.py index 3f6ff39..4652013 100644 --- a/pyapp/BetterDict.py +++ b/pyapp/BetterDict.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import glob +import io import json import os import plistlib @@ -8,6 +9,7 @@ import shutil import sys import time +import typing from base64 import b16encode from collections import defaultdict from struct import unpack @@ -28,6 +30,10 @@ WORKFLOW_DIR = alfred.get_workflow_dir() +def read_int(f: typing.BinaryIO) -> int: + return unpack("i", f.read(4))[0] + + # original source for parsing the '.dictionary' format: # https://gist.github.com/josephg/5e134adf70760ee7e49d def get_word_defs_map(dict_data_path): @@ -35,28 +41,50 @@ def get_word_defs_map(dict_data_path): and value is a list of its definitions.""" word_to_defs_map = defaultdict(list) + # dict_data_path is path to a Body.data file in an Apple dictionary with open(dict_data_path, "rb") as f: + # first 64 bytes of a Body.data file are always all-zeroes, + # skip them f.seek(0x40) - limit = 0x40 + unpack("i", f.read(4))[0] - f.seek(0x60) - while f.tell() < limit: - (sz,) = unpack("i", f.read(4)) - buf = decompress(f.read(sz)[8:]) - pos = 0 - while pos < len(buf): - (chunksize,) = unpack("i", buf[pos : pos + 4]) - pos += 4 + # The next four bytes represent an integer denoting remaining + # number of bytes in the Body.data file + limit = 0x40 + read_int(f) - defn = buf[pos : pos + chunksize] - word = ( - re.search(b'd:title="(.*?)"', defn).group(1).decode("utf-8") - ) - - word_to_defs_map[word].append(defn.decode("utf-8")) - - pos += chunksize + # TODO: for 'HeapDataCompressionType': 2 (in Info.plist), + # we need to skip to byte 
number 96, but not if it is 1. + f.seek(0x60) + while f.tell() < limit: + # a Body.data file can contain multiple sections with the format: + # [section_size (4 bytes (not including itself)), + # ??? (4 bytes), (no idea what these are!) + # decompressed_size (4 bytes), + # compressed_data (section_size-8 bytes)] + compressed_size = read_int(f) - 8 + _ = f.read(4) # no idea about these 4 bytes + decompressed_size = read_int(f) + decompressed = io.BytesIO(decompress(f.read(compressed_size))) + + while decompressed.tell() < decompressed_size: + # each decompressed chunk contains multiple definitions + # each definition is of the format: + # [defn_size (4 bytes (not including itself)), + # XML defn (defn_size bytes)] + defn_size = read_int(decompressed) + defn = decompressed.read(defn_size).decode("utf-8") + + # Example XML defn opening tag: + # <d:entry ... d:title="word" ...> + # The principled way to find the word being defined + # would be to parse the XML and then get 'd:title' from it, + # however, that's too slow, so we resort to regex matching. + word = re.search('d:title="(.*?)"', defn).group(1) + word_to_defs_map[word].append(defn) return word_to_defs_map