Skip to content

Commit

Permalink
Merge pull request #65 from m4rc1e/txt-lay
Browse files Browse the repository at this point in the history
shape: Segment text strings
  • Loading branch information
m4rc1e authored Jun 6, 2023
2 parents 4707a95 + dcad459 commit 13ce4f6
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 41 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ blackrenderer = {extras = ["skia"], version = "^0.6.0"}
unicodedata2 = "^15.0.0"
tqdm = "^4.64.1"
youseedee = "^0.3.0"
python-bidi = "*"

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
Expand Down
161 changes: 161 additions & 0 deletions src/diffenator2/segmenting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Taken from
# https://github.com/justvanrossum/fontgoggles/blob/master/Lib/fontgoggles/misc/segmenting.py
# TODO: write python bindings for libraqm instead
import itertools
from fontTools.unicodedata import script
from unicodedata2 import category

# Monkeypatch bidi to use unicodedata2
import unicodedata2
import bidi.algorithm
bidi.algorithm.bidirectional = unicodedata2.bidirectional
bidi.algorithm.category = unicodedata2.category
bidi.algorithm.mirrored = unicodedata2.mirrored
from bidi.algorithm import ( # noqa: ignore E402
get_empty_storage, get_base_level, get_embedding_levels,
explicit_embed_and_overrides, resolve_weak_types,
resolve_neutral_types, resolve_implicit_levels,
reorder_resolved_levels, PARAGRAPH_LEVELS,
)
from bidi.mirror import MIRRORED # noqa: ignore E402
from fontTools.unicodedata.OTTags import SCRIPT_EXCEPTIONS


UNKNOWN_SCRIPT = {"Zinh", "Zyyy", "Zxxx"}


def textSegments(txt):
    """Split *txt* into runs of uniform script and bidi embedding level.

    Returns ``(segments, baseLevel)`` where each segment is a tuple
    ``(runChars, script, bidiLevel, startIndex)`` and *baseLevel* is the
    paragraph's base embedding level (0 = LTR, 1 = RTL).
    """
    scripts = detectScript(txt)
    storage = getBiDiInfo(txt)

    # The bidi info may not cover every index; collect the known levels
    # first, then fill gaps from the preceding level (or the base level).
    levels = [None] * len(txt)
    for ch in storage['chars']:
        levels[ch['index']] = ch['level']

    prevLevel = storage['base_level']
    for i, level in enumerate(levels):
        if level is None:
            levels[i] = prevLevel
        else:
            prevLevel = level

    chars = list(zip(txt, scripts, levels))

    # Group consecutive characters sharing (OpenType script tag, bidi level).
    # (Fixed typo: was "runLenghts".)
    runLengths = []
    for _key, run in itertools.groupby(
        chars,
        key=lambda item: (SCRIPT_EXCEPTIONS.get(item[1], item[1].lower()), item[2]),
    ):
        runLengths.append(len(list(run)))

    segments = []
    index = 0
    for runLength in runLengths:
        nextIndex = index + runLength
        segment = chars[index:nextIndex]
        # Renamed locals so we don't shadow the imported `script` function.
        runChars = "".join(ch for ch, _script, _level in segment)
        _, runScript, bidiLevel = segment[0]
        segments.append((runChars, runScript, bidiLevel, index))
        index = nextIndex
    return segments, storage['base_level']


def reorderedSegments(segments, baseLevel):
    """Reorder *segments* (from ``textSegments``) into visual display order.

    Runs whose bidi level parity matches the paragraph direction keep
    their logical order; opposite-direction runs are reversed. For an RTL
    paragraph the whole sequence is finally reversed.

    Fixed: the accumulator no longer shadows the function's own name.
    """
    ordered = []
    isRTL = baseLevel % 2
    # item[2] is the run's bidi level; its parity gives the run direction.
    for direction, run in itertools.groupby(segments, key=lambda item: item[2] % 2):
        if direction == isRTL:
            ordered.extend(run)
        else:
            ordered.extend(reversed(list(run)))
    if isRTL:
        ordered.reverse()
    assert len(ordered) == len(segments)
    return ordered


def detectScript(txt):
    """Return one script tag per character of *txt*.

    Characters with an inherited/common/unknown script (and non-spacing
    marks) take the script of a neighbouring character; closing mirrored
    punctuation is resolved from the following character. Falls back to
    "Zxxx" when nothing else is known.
    """
    result = [script(ch) for ch in txt]

    # Pass 1: clear out inherited/common/unknown scripts, non-spacing
    # marks, and mirrored closing punctuation.
    for index, ch in enumerate(txt):
        tag = result[index]
        cat = category(ch)
        # Non-spacing mark (Mn) should always inherit script
        if tag in UNKNOWN_SCRIPT or cat == "Mn":
            tag = result[index - 1] if index else None
        if cat == "Pe" and ch in MIRRORED:
            tag = None
        result[index] = tag

    # Pass 2 (right to left): unresolved entries take the script of the
    # *next* resolved character.
    following = None
    for index in reversed(range(len(txt))):
        if result[index] is None:
            result[index] = following
        else:
            following = result[index]

    # Pass 3 (left to right): trailing unknowns fall back to the
    # preceding script, with "Zxxx" as a last resort.
    preceding = "Zxxx"
    for index, tag in enumerate(result):
        if tag is None:
            result[index] = preceding
        else:
            preceding = tag

    assert None not in result

    return result


# copied from bidi/algorithm.py and modified to be more useful for us.

def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).
    Set `base_dir` to 'L' or 'R' to override the calculated base_level.
    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.
    Returns an info dict object and the display layout.
    """
    storage = get_empty_storage()

    # Either honour the caller's explicit direction or detect it.
    if base_dir is not None:
        base_level = PARAGRAPH_LEVELS[base_dir]
    else:
        base_level = get_base_level(text, upper_is_rtl)

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)
    fix_bidi_type_for_unknown_chars(storage)
    assert len(storage["chars"]) == len(text)

    # Record each char's original index so callers can map levels back
    # onto the input string.
    for index, (expected, chInfo) in enumerate(zip(text, storage["chars"])):
        assert chInfo["ch"] == expected
        chInfo["index"] = index

    # Run the remaining UBA steps in their required order.
    for step in (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
    ):
        step(storage, debug)

    return storage


def fix_bidi_type_for_unknown_chars(storage):
    """Set any bidi type of '' (symptom of a character not known by unicode)
    to 'L', to prevent the other bidi code to fail (issue 313).
    """
    for char_info in storage['chars']:
        # An empty string is the only falsy type value that can occur here.
        if not char_info['type']:
            char_info['type'] = 'L'
88 changes: 47 additions & 41 deletions src/diffenator2/shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from diffenator2.template_elements import WordDiff, Glyph, GlyphDiff
from pkg_resources import resource_filename
import tqdm
from diffenator2.segmenting import textSegments


# Hashing strategies for elements of a Harfbuzz buffer
Expand Down Expand Up @@ -132,63 +133,68 @@ def test_words(

differ = PixelDiffer(font_a, font_b)
with open(word_file, encoding="utf8") as doc:
words = doc.read().split("\n")
print(f"testing {len(words)} words")
word_total = len(words)
for i, line in tqdm.tqdm(enumerate(words), total=word_total):
sentences = doc.read().split("\n")
print(f"testing {len(sentences)} words")
word_total = len(sentences)
for i, line in tqdm.tqdm(enumerate(sentences), total=word_total):
items = line.split(",")
try:
word, script, lang, features = items[0], items[1], items[2], items[3:]
# for wordlists which just contain words
sentence, script, lang, features = items[0], items[1], items[2], items[3:]
# for wordlists which just contain sentences
except IndexError:
word, script, lang, features = items[0], None, None, []
sentence, script, lang, features = items[0], "dflt", None, []
features = {k: True for k in features}
if any(c.string in word for c in skip_glyphs):
continue

differ.set_script(script)
differ.set_lang(lang)
differ.set_features(features)

if not word:
continue
            # split sentences into individual script segments. This mimics the
            # same behaviour as dtp apps, web browsers etc
for segment, script, _, _, in textSegments(sentence)[0]:

buf_b = differ.renderer_b.shape(word)
word_b = Word.from_buffer(word, buf_b)

gid_hashes = [hash_func(i, j) for i, j in zip(buf_b.glyph_infos, buf_b.glyph_positions)]
# I'm not entirely convinced this is a valid test; but it seems to
# work and speeds things up a lot...
if all(gid_hash in seen_gids for gid_hash in gid_hashes):
continue
if any(c.string in segment for c in skip_glyphs):
continue

buf_a = differ.renderer_a.shape(word)
word_a = Word.from_buffer(word, buf_a)
if not segment:
continue

# skip any words which cannot be shaped correctly
if any([g.codepoint == 0 for g in buf_a.glyph_infos+buf_b.glyph_infos]):
continue
buf_b = differ.renderer_b.shape(segment)
word_b = Word.from_buffer(segment, buf_b)

pc, diff_map = differ.diff(word)
gid_hashes = [hash_func(i, j) for i, j in zip(buf_b.glyph_infos, buf_b.glyph_positions)]
# I'm not entirely convinced this is a valid test; but it seems to
# work and speeds things up a lot...
if all(gid_hash in seen_gids for gid_hash in gid_hashes):
continue

for gid_hash in gid_hashes:
seen_gids[gid_hash] = True
buf_a = differ.renderer_a.shape(segment)
word_a = Word.from_buffer(segment, buf_a)

if pc < threshold:
# skip any words which cannot be shaped correctly
if any([g.codepoint == 0 for g in buf_a.glyph_infos+buf_b.glyph_infos]):
continue
res.add(
(
pc,
WordDiff(
word,
word_a.hb,
word_b.hb,
tuple(features.keys()),
ot_to_html_lang.get((script, lang)),
ot_to_dir.get(script, None),
"%.2f" % pc,
),

pc, diff_map = differ.diff(segment)

for gid_hash in gid_hashes:
seen_gids[gid_hash] = True

if pc < threshold:
continue
res.add(
(
pc,
WordDiff(
sentence,
word_a.hb,
word_b.hb,
tuple(features.keys()),
ot_to_html_lang.get((script, lang)),
ot_to_dir.get(script, None),
"%.2f" % pc,
),
)
)
)
return [w[1] for w in sorted(res, key=lambda k: k[0], reverse=True)]

0 comments on commit 13ce4f6

Please sign in to comment.