Skip to content

Commit

Permalink
Merge pull request #65 from m4rc1e/txt-lay
Browse files Browse the repository at this point in the history
shape: Segment text strings
  • Loading branch information
m4rc1e authored Jun 6, 2023
2 parents 4707a95 + dcad459 commit 13ce4f6
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 41 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ blackrenderer = {extras = ["skia"], version = "^0.6.0"}
unicodedata2 = "^15.0.0"
tqdm = "^4.64.1"
youseedee = "^0.3.0"
python-bidi = "*"

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
Expand Down
161 changes: 161 additions & 0 deletions src/diffenator2/segmenting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Taken from
# https://github.com/justvanrossum/fontgoggles/blob/master/Lib/fontgoggles/misc/segmenting.py
# TODO: write python bindings for libraqm instead
import itertools
from fontTools.unicodedata import script
from unicodedata2 import category

# Monkeypatch bidi to use unicodedata2
import unicodedata2
import bidi.algorithm
bidi.algorithm.bidirectional = unicodedata2.bidirectional
bidi.algorithm.category = unicodedata2.category
bidi.algorithm.mirrored = unicodedata2.mirrored
from bidi.algorithm import ( # noqa: ignore E402
get_empty_storage, get_base_level, get_embedding_levels,
explicit_embed_and_overrides, resolve_weak_types,
resolve_neutral_types, resolve_implicit_levels,
reorder_resolved_levels, PARAGRAPH_LEVELS,
)
from bidi.mirror import MIRRORED # noqa: ignore E402
from fontTools.unicodedata.OTTags import SCRIPT_EXCEPTIONS


UNKNOWN_SCRIPT = {"Zinh", "Zyyy", "Zxxx"}


def textSegments(txt):
    """Split *txt* into runs of uniform script and bidi embedding level.

    Returns ``(segments, baseLevel)`` where each segment is a tuple
    ``(runChars, script, bidiLevel, startIndex)`` and *baseLevel* is the
    paragraph's base embedding level (0 = LTR, 1 = RTL).
    """
    scripts = detectScript(txt)
    storage = getBiDiInfo(txt)

    # The bidi info may not cover every index; collect the known levels
    # first, then fill gaps from the preceding level (or the base level).
    levels = [None] * len(txt)
    for ch in storage['chars']:
        levels[ch['index']] = ch['level']

    prevLevel = storage['base_level']
    for i, level in enumerate(levels):
        if level is None:
            levels[i] = prevLevel
        else:
            prevLevel = level

    chars = list(zip(txt, scripts, levels))

    # Group consecutive characters sharing (OpenType script tag, bidi level).
    # (Fixed typo: was "runLenghts".)
    runLengths = []
    for _key, run in itertools.groupby(
        chars,
        key=lambda item: (SCRIPT_EXCEPTIONS.get(item[1], item[1].lower()), item[2]),
    ):
        runLengths.append(len(list(run)))

    segments = []
    index = 0
    for runLength in runLengths:
        nextIndex = index + runLength
        segment = chars[index:nextIndex]
        # Renamed locals so we don't shadow the imported `script` function.
        runChars = "".join(ch for ch, _script, _level in segment)
        _, runScript, bidiLevel = segment[0]
        segments.append((runChars, runScript, bidiLevel, index))
        index = nextIndex
    return segments, storage['base_level']


def reorderedSegments(segments, baseLevel):
    """Reorder *segments* (from ``textSegments``) into visual display order.

    Runs whose bidi level parity matches the paragraph direction keep
    their logical order; opposite-direction runs are reversed. For an RTL
    paragraph the whole sequence is finally reversed.

    Fixed: the accumulator no longer shadows the function's own name.
    """
    ordered = []
    isRTL = baseLevel % 2
    # item[2] is the run's bidi level; its parity gives the run direction.
    for direction, run in itertools.groupby(segments, key=lambda item: item[2] % 2):
        if direction == isRTL:
            ordered.extend(run)
        else:
            ordered.extend(reversed(list(run)))
    if isRTL:
        ordered.reverse()
    assert len(ordered) == len(segments)
    return ordered


def detectScript(txt):
    """Return one script tag per character of *txt*.

    Characters with an inherited/common/unknown script (and non-spacing
    marks) take the script of a neighbouring character; closing mirrored
    punctuation is resolved from the following character. Falls back to
    "Zxxx" when nothing else is known.
    """
    result = [script(ch) for ch in txt]

    # Pass 1: clear out inherited/common/unknown scripts, non-spacing
    # marks, and mirrored closing punctuation.
    for index, ch in enumerate(txt):
        tag = result[index]
        cat = category(ch)
        # Non-spacing mark (Mn) should always inherit script
        if tag in UNKNOWN_SCRIPT or cat == "Mn":
            tag = result[index - 1] if index else None
        if cat == "Pe" and ch in MIRRORED:
            tag = None
        result[index] = tag

    # Pass 2 (right to left): unresolved entries take the script of the
    # *next* resolved character.
    following = None
    for index in reversed(range(len(txt))):
        if result[index] is None:
            result[index] = following
        else:
            following = result[index]

    # Pass 3 (left to right): trailing unknowns fall back to the
    # preceding script, with "Zxxx" as a last resort.
    preceding = "Zxxx"
    for index, tag in enumerate(result):
        if tag is None:
            result[index] = preceding
        else:
            preceding = tag

    assert None not in result

    return result


# copied from bidi/algorithm.py and modified to be more useful for us.

def getBiDiInfo(text, *, upper_is_rtl=False, base_dir=None, debug=False):
    """
    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).
    Set `base_dir` to 'L' or 'R' to override the calculated base_level.
    Set `debug` to True to display (using sys.stderr) the steps taken with the
    algorithm.
    Returns an info dict object and the display layout.
    """
    storage = get_empty_storage()

    # Either honour the caller's explicit direction or detect it.
    if base_dir is not None:
        base_level = PARAGRAPH_LEVELS[base_dir]
    else:
        base_level = get_base_level(text, upper_is_rtl)

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    get_embedding_levels(text, storage, upper_is_rtl, debug)
    fix_bidi_type_for_unknown_chars(storage)
    assert len(storage["chars"]) == len(text)

    # Record each char's original index so callers can map levels back
    # onto the input string.
    for index, (expected, chInfo) in enumerate(zip(text, storage["chars"])):
        assert chInfo["ch"] == expected
        chInfo["index"] = index

    # Run the remaining UBA steps in their required order.
    for step in (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
    ):
        step(storage, debug)

    return storage


def fix_bidi_type_for_unknown_chars(storage):
    """Set any bidi type of '' (symptom of a character not known by unicode)
    to 'L', to prevent the other bidi code to fail (issue 313).
    """
    for char_info in storage['chars']:
        # An empty string is the only falsy type value that can occur here.
        if not char_info['type']:
            char_info['type'] = 'L'
88 changes: 47 additions & 41 deletions src/diffenator2/shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from diffenator2.template_elements import WordDiff, Glyph, GlyphDiff
from pkg_resources import resource_filename
import tqdm
from diffenator2.segmenting import textSegments


# Hashing strategies for elements of a Harfbuzz buffer
Expand Down Expand Up @@ -132,63 +133,68 @@ def test_words(

differ = PixelDiffer(font_a, font_b)
with open(word_file, encoding="utf8") as doc:
words = doc.read().split("\n")
print(f"testing {len(words)} words")
word_total = len(words)
for i, line in tqdm.tqdm(enumerate(words), total=word_total):
sentences = doc.read().split("\n")
print(f"testing {len(sentences)} words")
word_total = len(sentences)
for i, line in tqdm.tqdm(enumerate(sentences), total=word_total):
items = line.split(",")
try:
word, script, lang, features = items[0], items[1], items[2], items[3:]
# for wordlists which just contain words
sentence, script, lang, features = items[0], items[1], items[2], items[3:]
# for wordlists which just contain sentences
except IndexError:
word, script, lang, features = items[0], None, None, []
sentence, script, lang, features = items[0], "dflt", None, []
features = {k: True for k in features}
if any(c.string in word for c in skip_glyphs):
continue

differ.set_script(script)
differ.set_lang(lang)
differ.set_features(features)

if not word:
continue
            # split sentences into individual script segments. This mimics the
            # same behaviour as dtp apps, web browsers etc
for segment, script, _, _, in textSegments(sentence)[0]:

buf_b = differ.renderer_b.shape(word)
word_b = Word.from_buffer(word, buf_b)

gid_hashes = [hash_func(i, j) for i, j in zip(buf_b.glyph_infos, buf_b.glyph_positions)]
# I'm not entirely convinced this is a valid test; but it seems to
# work and speeds things up a lot...
if all(gid_hash in seen_gids for gid_hash in gid_hashes):
continue
if any(c.string in segment for c in skip_glyphs):
continue

buf_a = differ.renderer_a.shape(word)
word_a = Word.from_buffer(word, buf_a)
if not segment:
continue

# skip any words which cannot be shaped correctly
if any([g.codepoint == 0 for g in buf_a.glyph_infos+buf_b.glyph_infos]):
continue
buf_b = differ.renderer_b.shape(segment)
word_b = Word.from_buffer(segment, buf_b)

pc, diff_map = differ.diff(word)
gid_hashes = [hash_func(i, j) for i, j in zip(buf_b.glyph_infos, buf_b.glyph_positions)]
# I'm not entirely convinced this is a valid test; but it seems to
# work and speeds things up a lot...
if all(gid_hash in seen_gids for gid_hash in gid_hashes):
continue

for gid_hash in gid_hashes:
seen_gids[gid_hash] = True
buf_a = differ.renderer_a.shape(segment)
word_a = Word.from_buffer(segment, buf_a)

if pc < threshold:
# skip any words which cannot be shaped correctly
if any([g.codepoint == 0 for g in buf_a.glyph_infos+buf_b.glyph_infos]):
continue
res.add(
(
pc,
WordDiff(
word,
word_a.hb,
word_b.hb,
tuple(features.keys()),
ot_to_html_lang.get((script, lang)),
ot_to_dir.get(script, None),
"%.2f" % pc,
),

pc, diff_map = differ.diff(segment)

for gid_hash in gid_hashes:
seen_gids[gid_hash] = True

if pc < threshold:
continue
res.add(
(
pc,
WordDiff(
sentence,
word_a.hb,
word_b.hb,
tuple(features.keys()),
ot_to_html_lang.get((script, lang)),
ot_to_dir.get(script, None),
"%.2f" % pc,
),
)
)
)
return [w[1] for w in sorted(res, key=lambda k: k[0], reverse=True)]

0 comments on commit 13ce4f6

Please sign in to comment.