Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

font_sample_text: make more concise #130

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/diffenator2/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,9 @@ def diffenator_font_style(dfont, suffix=""):


def filtered_font_sample_text(ttFont, characters):
sample_text = font_sample_text(ttFont)
sample_text = [w for w in sample_text if characters_in_string(w, characters)]
font_characters = set(chr(c) for c in ttFont.getBestCmap())
characters = set(characters) & font_characters
sample_text = font_sample_text(tuple(sorted(characters)))
return " ".join(sample_text)


Expand Down
66 changes: 40 additions & 26 deletions src/diffenator2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,39 +132,53 @@ def gen_gif(img_a_path: str, img_b_path: str, dst: str):
img_a.save(dst, save_all=True, append_images=[img_b], loop=10000, duration=1000)


def greedy_set_cover(universe, sets):
    """Pick a small collection of ``sets`` that together cover ``universe``.

    Uses the classic greedy heuristic: repeatedly take the candidate that
    covers the most still-uncovered elements. The loop stops as soon as
    everything is covered or no remaining candidate contributes anything,
    so the result never contains a candidate that covers nothing new.

    Args:
        universe: Iterable of hashable elements to cover (e.g. characters).
        sets: Iterable of candidates. Each candidate must be hashable,
            iterable over elements, and (for the final ordering) a
            non-empty sequence whose last item is a single character,
            e.g. a word.

    Returns:
        The chosen candidates as a list, ordered by the code point of each
        candidate's last character. Empty when nothing can be covered.
        Ties between equally good candidates are resolved by the iteration
        order of ``sets``.
    """
    uncovered_elements = set(universe)
    best_sets = set()
    # Materialise once: the candidates are scanned on every round, which
    # would silently exhaust a generator after the first pass.
    candidates = list(sets)

    # `and candidates` guards max() against an empty sequence (ValueError).
    while uncovered_elements and candidates:
        best_set = max(
            candidates, key=lambda s: len(uncovered_elements.intersection(s))
        )
        # Even the best candidate adds nothing: the remainder of the
        # universe is uncoverable, so stop before recording a useless pick.
        if uncovered_elements.isdisjoint(best_set):
            break
        uncovered_elements.difference_update(best_set)
        best_sets.add(best_set)
    return sorted(best_sets, key=lambda k: ord(k[-1]))


@lru_cache()
def font_sample_text(ttFont: TTFont) -> str:
def font_sample_text(font_characters):
"""Collect words which exist in the Universal Declaration of Human Rights
that can be formed using the ttFont instance.
UDHR has been chosen due to the many languages it covers"""
with open(
resource_filename("diffenator2", "data/udhr_all.txt"), encoding="utf8"
) as doc:
uhdr = doc.read()

cmap = set(ttFont.getBestCmap())
words = []
seen_chars = set()

def _add_words(words, text, seen_chars):
for word in text.split():
chars = set(ord(l) for l in word)
if not chars.issubset(cmap):
continue
if chars & seen_chars == chars:
continue
seen_chars |= chars
words.append(word)

_add_words(words, uhdr, seen_chars)

if len(seen_chars) < len(cmap):
languages = LoadLanguages()
for file, proto in languages.items():
if hasattr(proto, "sample_text"):
for _, text in proto.sample_text.ListFields():
_add_words(words, text, seen_chars)
return words
words = re.split(r"[\b\W\b]+", doc.read())

# GF languages sample text
languages = LoadLanguages()
for file, proto in languages.items():
if hasattr(proto, "sample_text"):
for _, text in proto.sample_text.ListFields():
words += re.split(r"[\b\W\b]+", text)

# remove all anagram words and words that are not in font cmap
seen = set()
new_words = set()
for word in sorted(words, key=lambda k: len(k), reverse=True):
word_set = set(word)
if len(word_set) != len(word):
continue
if word_set.issubset(seen):
continue
if word_set.issubset(font_characters):
seen |= word_set
new_words.add(word)

unique_words = greedy_set_cover(font_characters, new_words)
return unique_words


def font_family_name(ttFont, suffix=""):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def test_diffenator_threshold(fp_before, fp_after, threshold, has, missing):
@pytest.mark.parametrize(
"fp, cmd, pattern, has, missing",
[
(mavenpro_vf, "proof", ".*", ['>an tan</div>'], []),
(mavenpro_vf, "proof", ".*", ['>tan</div>'], []),
(mavenpro_vf, "proof", "[an]{1,2}", ['>an</div>'], []),
(mavenpro_vf, "diff", ".*", ['>an tan</div>'], []),
(mavenpro_vf, "diff", ".*", ['>tan</div>'], []),
(mavenpro_vf, "diff", "[an]{1,2}", ['>an</div>'], []),
]
)
Expand Down
Loading