Builder subset operation: exclude glyphs by name/codepoint/file (#1020)
* Add utility to parse codepoints

Based on logic from ufo_merge

* builder: subset: support excluding codepoints inline or from file

* add-ds-subsets: add --exclude-codepoints & --exclude-codepoints-file

* builder: subset: support excluding glyphs by name inline or from file

Leverages and expands on the (formerly named) unicodes_by_donor to handle
glyph names as well. Some type soup involved

* add-ds-subsets: add --exclude-glyphs & --exclude-glyphs-file

* Document subset operation options

* Add explicit ufomerge dependency, pinning to avoid bug

See feedback on #1020

---------

Co-authored-by: Harry Dalton <harry.dalton@daltonmaag.com>
RickyDaMa and Hoolean authored Sep 11, 2024
1 parent 5542d82 commit cf3e553
Showing 5 changed files with 121 additions and 5 deletions.
20 changes: 20 additions & 0 deletions Lib/gftools/scripts/add_ds_subsets.py
@@ -90,6 +90,22 @@ def main(args=None):
parser.add_argument(
"--json", "-j", action="store_true", help="Use JSON structured UFOs"
)
parser.add_argument(
"--exclude-codepoints", help="Space-delimited unicodes to exclude"
)
parser.add_argument(
"--exclude-codepoints-file",
help="Newline delimited file with unicodes to exclude. "
"Allows for comments with either # or //",
)
parser.add_argument(
"--exclude-glyphs", help="Space-delimited glyph names to exclude"
)
parser.add_argument(
"--exclude-glyphs-file",
help="Newline delimited file with glyph names to exclude. "
"Allows for comments with either # or //",
)

parser.add_argument("--output", "-o", help="Output designspace file")

@@ -117,6 +133,10 @@ def main(args=None):
"from": {
"repo": args.repo,
"path": args.file,
"exclude_codepoints": args.exclude_codepoints,
"exclude_codepoints_file": args.exclude_codepoints_file,
"exclude_glyphs": args.exclude_glyphs,
"exclude_glyphs_file": args.exclude_glyphs_file,
}
}
]
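For orientation, here is a sketch (with invented repo and path values) of the subset configuration the hunks above assemble, where the four new CLI options land inside the `from` block:

```python
# Invented example values; each new flag maps directly onto a key of the
# "from" dict built in the hunk above. Unspecified flags simply stay None.
subset_config = {
    "from": {
        "repo": "notofonts/example",                    # assumed example slug
        "path": "sources/NotoExample.glyphspackage",    # assumed example path
        "exclude_codepoints": "U+0951 0x1CD0 2386",     # hex and decimal forms mix freely
        "exclude_glyphs": "udatta-deva anudatta-deva",  # assumed glyph names
    }
}
```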
79 changes: 74 additions & 5 deletions Lib/gftools/subsetmerger.py
@@ -21,7 +21,7 @@
from ufomerge import merge_ufos

from gftools.util.styles import STYLE_NAMES
from gftools.utils import download_file, open_ufo
from gftools.utils import download_file, open_ufo, parse_codepoint

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -53,6 +53,10 @@
),
Optional("layoutHandling"): Str(),
Optional("force"): Str(),
Optional("exclude_glyphs"): Str(),
Optional("exclude_codepoints"): Str(),
Optional("exclude_glyphs_file"): Str(),
Optional("exclude_codepoints_file"): Str(),
}
)
)
@@ -63,7 +67,15 @@ def prepare_minimal_subsets(subsets):
# codepoints with the same "donor" font and options. This allows the
# user to specify multiple subsets from the same font, and they will
# be merged into a single merge operation.
unicodes_by_donor = defaultdict(set)
incl_excl_by_donor: dict[
tuple[str, str, str],
tuple[
# Unicodes to include
set[int],
# Glyph names to exclude
set[str],
],
] = defaultdict(lambda: (set(), set()))
for subset in subsets:
# Resolve named subsets to a set of Unicodes using glyphsets data
if "name" in subset:
@@ -75,18 +87,74 @@
for r in subset["ranges"]:
for cp in range(r["start"], r["end"] + 1):
unicodes.append(cp)

# Parse in manual exclusions
excluded_codepoints = set()
if exclude_inline := subset.get("exclude_codepoints"):
for raw_value in exclude_inline.split():
raw_value = raw_value.strip()
if raw_value == "":
continue
excluded_codepoints.add(parse_codepoint(raw_value))
if exclude_file := subset.get("exclude_codepoints_file"):
for line in Path(exclude_file).read_text().splitlines():
line = line.strip()
if line != "" and not line.startswith(("#", "//")):
continue
# Remove in-line comments
line = line.split("#", 1)[0]
line = line.split("//", 1)[0]
line = line.rstrip()
excluded_codepoints.add(parse_codepoint(line))

# Filter unicodes by excluded_codepoints
unicodes = [
unicode for unicode in unicodes if unicode not in excluded_codepoints
]
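As a sanity check of the exclusion-file format this loop accepts, the following self-contained sketch parses an equivalent in-memory listing (it assumes `parse_codepoint` from this commit's `gftools.utils` is importable):

```python
from gftools.utils import parse_codepoint

# Illustrative exclusion-file contents: blank lines, full-line comments, and
# inline comments are ignored; hex (U+… / 0x…) and decimal values can be mixed.
EXAMPLE = """\
# full-line comment
U+0951        // inline comment
0x1CD0

2386
"""

excluded = set()
for line in EXAMPLE.splitlines():
    line = line.strip()
    if line == "" or line.startswith(("#", "//")):
        continue
    line = line.split("#", 1)[0].split("//", 1)[0].rstrip()
    excluded.add(parse_codepoint(line))

assert excluded == {0x0951, 0x1CD0, 2386}
```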

# Load excluded glyphs by name
exclude_glyphs = set()
if exclude_inline := subset.get("exclude_glyphs"):
for glyph_name in exclude_inline.split():
glyph_name = glyph_name.strip()
if glyph_name == "":
continue
exclude_glyphs.add(glyph_name)
if exclude_file := subset.get("exclude_glyphs_file"):
for line in Path(exclude_file).read_text().splitlines():
line = line.strip()
if line != "" and not line.startswith(("#", "//")):
continue
# Remove in-line comments
line = line.split("#", 1)[0]
line = line.split("//", 1)[0]
line = line.rstrip()
exclude_glyphs.add(line)

# Update incl_excl_by_donor
key = (
yaml.dump(subset["from"]),
subset.get("layoutHandling"),
subset.get("force"),
)
unicodes_by_donor[key] |= set(unicodes)
unicodes_incl, glyph_names_excl = incl_excl_by_donor[key]
unicodes_incl |= set(unicodes)
glyph_names_excl |= exclude_glyphs
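The grouping above is easier to see in isolation; the following standalone sketch (with made-up donor data) shows why the `from` dict is serialised with `yaml.dump`, so it can act as a hashable grouping key:

```python
from collections import defaultdict

import yaml

# Two subset requests that share the same donor and options collapse into one
# (codepoints-to-include, glyph-names-to-exclude) pair, so the donor UFO only
# needs to be merged once. Donor, codepoint, and glyph values are made up.
incl_excl = defaultdict(lambda: (set(), set()))
requests = [
    ({"repo": "org/font", "path": "src/Font.ufo"}, {0x41, 0x42}, set()),
    ({"repo": "org/font", "path": "src/Font.ufo"}, {0x42, 0x43}, {"B.alt"}),
]
for donor, codepoints, excluded_glyphs in requests:
    key = (yaml.dump(donor), None, None)  # None: no layoutHandling/force given
    unicodes_incl, glyph_names_excl = incl_excl[key]
    unicodes_incl |= codepoints           # |= mutates the stored set in place
    glyph_names_excl |= excluded_glyphs

assert list(incl_excl.values()) == [({0x41, 0x42, 0x43}, {"B.alt"})]
```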

# Now rebuild the subset dictionary, but this time with the codepoints
# amalgamated into minimal sets.
newsubsets = []
for (donor, layouthandling, force), unicodes in unicodes_by_donor.items():
newsubsets.append({"from": yaml.safe_load(donor), "unicodes": list(unicodes)})
for (donor, layouthandling, force), (
unicodes_incl,
glyph_names_excl,
) in incl_excl_by_donor.items():
newsubsets.append(
{
"from": yaml.safe_load(donor),
"unicodes": list(unicodes_incl),
"exclude_glyphs": list(glyph_names_excl),
}
)
if layouthandling:
newsubsets[-1]["layoutHandling"] = layouthandling
if force:
@@ -171,6 +239,7 @@ def add_subset(self, target_ufo, ds, ds_source, subset) -> bool:
merge_ufos(
target_ufo,
source_ufo,
exclude_glyphs=subset["exclude_glyphs"],
codepoints=subset["unicodes"],
existing_handling=existing_handling,
layout_handling=layout_handling,
8 changes: 8 additions & 0 deletions Lib/gftools/utils.py
@@ -685,3 +685,11 @@ def has_gh_token():
if "GH_TOKEN" in os.environ:
return True
return False


def parse_codepoint(codepoint: str) -> int:
# https://github.com/googlefonts/ufomerge/blob/2257a1d3807a4eec9b515aa98e059383f7814d9a/Lib/ufomerge/cli.py#L118-L126
if codepoint.startswith(("U+", "u+", "0x", "0X")):
return int(codepoint[2:], 16)
else:
return int(codepoint)
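A few illustrative calls, once this version of `gftools.utils` is importable:

```python
from gftools.utils import parse_codepoint

assert parse_codepoint("U+1F600") == 0x1F600  # "U+" / "u+" prefix: hexadecimal
assert parse_codepoint("0x41") == 65          # "0x" / "0X" prefix: hexadecimal
assert parse_codepoint("65") == 65            # no prefix: decimal
```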
15 changes: 15 additions & 0 deletions docs/gftools-builder/README.md
@@ -331,6 +331,21 @@ build process by leaving a `graph.png` file in the `sources` directory:
- *subspace*: Runs `fonttools varLib.instancer` to subspace a variable font according to the values in `axes`. `args` are added to the command line.
- *hbsubset*: Uses `hb-subset` to slim down a font binary.
- *addSubset*: Adds a subset from another font using `gftools-add-ds-subsets`
- `directory`: the intermediate folder used to store the source(s) the subset(s) are taken from
- `subsets`: a list of subset configurations to merge in
- `from` (required): can be a pre-configured Noto source ("Noto Sans", "Noto Serif", "Noto Sans Devanagari", "Noto Sans Linear B"), or:
- `repo`: the GitHub slug for the repository, e.g. `googlefonts/gftools`. You can specify a git revision by suffixing this with `@v1.0.0`, or use `@latest` for the latest *published* release
- `path`: the path within the repo that has the source file
- `name`: a named Google Fonts subset, e.g. `GF_Latin_Core`
- `ranges`: a list of unicode codepoint ranges to include
- `start`: the start of the range (as hex or decimal)
- `end`: the end of the range (as hex or decimal)
- `layoutHandling`: "subset", "closure" or "ignore" ([further reading](https://github.com/googlefonts/ufomerge/blob/bb9a82ff3039b8aa0cba58372158bd3c0e5cb770/Lib/ufomerge/__init__.py#L512-L521))
- `force`: replace existing glyphs in your sources, instead of skipping them
- `exclude_glyphs`: whitespace-delimited glyph names to exclude from merging
- `exclude_glyphs_file`: path to a file with glyph names to exclude from merging, one per line (comments using `#` or `//` allowed)
- `exclude_codepoints`: whitespace-delimited unicode codepoints to exclude from merging
- `exclude_codepoints_file`: path to a file with unicode codepoints to exclude from merging, one per line (comments using `#` or `//` allowed)
- *buildVTT*: Uses `gftools-build-vtt` with the configuration file provided in `vttfile` to add VTT hinting to a font binary.
- *remap*: Uses `gftools-remap-font` to alter a font binary's `cmap` table.
- *paintcompiler*: Runs paintcompiler on a font to add a COLRv1 table.
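Putting the documented options together, here is a sketch of a single `subsets` entry, written as the Python data the builder's YAML configuration would parse into (repo, path, codepoints, and file name are invented; the exclusion keys sit at the subset level, alongside `layoutHandling`, as in `SUBSET_SCHEMA` above):

```python
# Invented example: pull GF_Latin_Core from a donor repo, but leave out the
# dotless i/j codepoints and any glyphs listed in a local exclusion file.
subsets = [
    {
        "from": {
            "repo": "notofonts/example@latest",           # assumed example slug
            "path": "sources/NotoExample.glyphspackage",  # assumed example path
        },
        "name": "GF_Latin_Core",
        "exclude_codepoints": "U+0131 0x0237",         # dotless i, dotless j
        "exclude_glyphs_file": "excluded-glyphs.txt",  # one name per line; # or // comments
        "layoutHandling": "closure",
    }
]
```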
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -66,6 +66,10 @@ dependencies = [
'ninja',
'networkx',
'ruamel.yaml',
# Used for subset merging, and preferred over the home-grown UFO merge script,
# which is deprecated.
# Pin avoids bug googlefonts/ufomerge#28.
'ufomerge>=1.8.1'
]

[project.optional-dependencies]
