Skip to content

Commit

Permalink
Document the get_word_defs_map func
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-pennyworth committed Jun 21, 2024
1 parent eb5b2f7 commit b4c06d2
Showing 1 changed file with 45 additions and 17 deletions.
62 changes: 45 additions & 17 deletions pyapp/BetterDict.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# -*- coding: utf-8 -*-

import glob
import io
import json
import os
import plistlib
import re
import shutil
import sys
import time
import typing
from base64 import b16encode
from collections import defaultdict
from struct import unpack
Expand All @@ -28,35 +30,61 @@
WORKFLOW_DIR = alfred.get_workflow_dir()


def read_int(f: typing.BinaryIO) -> int:
return unpack("i", f.read(4))[0]


# original source for parsing the '.dictionary' format:
# https://gist.github.com/josephg/5e134adf70760ee7e49d
def get_word_defs_map(dict_data_path):
"""returns a dict where key is the word
and value is a list of its definitions."""
word_to_defs_map = defaultdict(list)

# dict_data_path is path to a Body.data file in an Apple dictionary
with open(dict_data_path, "rb") as f:
# first 64 bytes of a Body.data file are always all-zeroes,
# skip them
f.seek(0x40)
limit = 0x40 + unpack("i", f.read(4))[0]
f.seek(0x60)
while f.tell() < limit:
(sz,) = unpack("i", f.read(4))
buf = decompress(f.read(sz)[8:])

pos = 0
while pos < len(buf):
(chunksize,) = unpack("i", buf[pos : pos + 4])
pos += 4
# The next four bytes represent an integer denoting remaining
# number of bytes in the Body.data file
limit = 0x40 + read_int(f)

defn = buf[pos : pos + chunksize]
word = (
re.search(b'd:title="(.*?)"', defn).group(1).decode("utf-8")
)

word_to_defs_map[word].append(defn.decode("utf-8"))

pos += chunksize
# TODO: for 'HeapDataCompressionType': 2 (in Info.plist),
# we need to skip to byte number 96, but not if it is 1.
f.seek(0x60)

while f.tell() < limit:
# a Body.data file can contain multiple sections with the format:
# [section_size (4 bytes (not including itself)),
# ??? (4 bytes), (no idea what these are!)
# decompressed_size (4 bytes),
# compressed_data (section_size-8 bytes)]
compressed_size = read_int(f) - 8
_ = f.read(4) # no idea about these 4 bytes
decompressed_size = read_int(f)
decompressed = io.BytesIO(decompress(f.read(compressed_size)))

while decompressed.tell() < decompressed_size:
# each decompressed chunk contains multiple definitions
# each definition is of the format:
# [defn_size (4 bytes (not including itself)),
# XML defn (defn_size bytes)]
defn_size = read_int(decompressed)
defn = decompressed.read(defn_size).decode("utf-8")

# Example XML defn opening tag:
# <d:entry
# xmlns:d=".apple.com/DTDs/DictionaryService-1.0.rng"
# id="m_en_gbus0134270"
# d:title="apple"
# class="entry">
# The principled way to find the word being defined
# would be to parse the XML and then get 'd:title' from it,
# however, that's too slow, so we resort to regex matching.
word = re.search('d:title="(.*?)"', defn).group(1)
word_to_defs_map[word].append(defn)
return word_to_defs_map


Expand Down

0 comments on commit b4c06d2

Please sign in to comment.