Builder subset operation: exclude glyphs by name/codepoint/file (#1020)
* Add utility to parse codepoints

Based on logic from ufo_merge

* builder: subset: support excluding codepoints inline or from file

* add-ds-subsets: add --exclude-codepoints & --exclude-codepoints-file

* builder: subset: support excluding glyphs by name inline or from file

Leverages and expands on the (formerly named) unicodes_by_donor to handle
glyph names as well. Some type soup involved

* add-ds-subsets: add --exclude-glyphs & --exclude-glyphs-file

* Document subset operation options

* Add explicit ufomerge dependency, pinning to avoid bug

See feedback on #1020

---------

Co-authored-by: Harry Dalton <harry.dalton@daltonmaag.com>
RickyDaMa and Hoolean authored Sep 11, 2024
1 parent 5542d82 commit cf3e553
Showing 5 changed files with 121 additions and 5 deletions.
20 changes: 20 additions & 0 deletions Lib/gftools/scripts/add_ds_subsets.py
@@ -90,6 +90,22 @@ def main(args=None):
parser.add_argument(
"--json", "-j", action="store_true", help="Use JSON structured UFOs"
)
parser.add_argument(
"--exclude-codepoints", help="Space-delimited unicodes to exclude"
)
parser.add_argument(
"--exclude-codepoints-file",
help="Newline delimited file with unicodes to exclude. "
"Allows for comments with either # or //",
)
parser.add_argument(
"--exclude-glyphs", help="Space-delimited glyph names to exclude"
)
parser.add_argument(
"--exclude-glyphs-file",
help="Newline delimited file with glyph names to exclude. "
"Allows for comments with either # or //",
)

parser.add_argument("--output", "-o", help="Output designspace file")

@@ -117,6 +133,10 @@ def main(args=None):
"from": {
"repo": args.repo,
"path": args.file,
"exclude_codepoints": args.exclude_codepoints,
"exclude_codepoints_file": args.exclude_codepoints_file,
"exclude_glyphs": args.exclude_glyphs,
"exclude_glyphs_file": args.exclude_glyphs_file,
}
}
]
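For orientation, here is a sketch (with invented repo and path values) of the subset configuration the hunks above assemble, where the four new CLI options land inside the `from` block:

```python
# Invented example values; each new flag maps directly onto a key of the
# "from" dict built in the hunk above. Unspecified flags simply stay None.
subset_config = {
    "from": {
        "repo": "notofonts/example",                    # assumed example slug
        "path": "sources/NotoExample.glyphspackage",    # assumed example path
        "exclude_codepoints": "U+0951 0x1CD0 2386",     # hex and decimal forms mix freely
        "exclude_glyphs": "udatta-deva anudatta-deva",  # assumed glyph names
    }
}
```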
79 changes: 74 additions & 5 deletions Lib/gftools/subsetmerger.py
@@ -21,7 +21,7 @@
from ufomerge import merge_ufos

from gftools.util.styles import STYLE_NAMES
from gftools.utils import download_file, open_ufo
from gftools.utils import download_file, open_ufo, parse_codepoint

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -53,6 +53,10 @@
),
Optional("layoutHandling"): Str(),
Optional("force"): Str(),
Optional("exclude_glyphs"): Str(),
Optional("exclude_codepoints"): Str(),
Optional("exclude_glyphs_file"): Str(),
Optional("exclude_codepoints_file"): Str(),
}
)
)
@@ -63,7 +67,15 @@ def prepare_minimal_subsets(subsets):
# codepoints with the same "donor" font and options. This allows the
# user to specify multiple subsets from the same font, and they will
# be merged into a single merge operation.
unicodes_by_donor = defaultdict(set)
incl_excl_by_donor: dict[
tuple[str, str, str],
tuple[
# Unicodes to include
set[int],
# Glyph names to exclude
set[str],
],
] = defaultdict(lambda: (set(), set()))
for subset in subsets:
# Resolve named subsets to a set of Unicodes using glyphsets data
if "name" in subset:
@@ -75,18 +87,74 @@
for r in subset["ranges"]:
for cp in range(r["start"], r["end"] + 1):
unicodes.append(cp)

# Parse in manual exclusions
excluded_codepoints = set()
if exclude_inline := subset.get("exclude_codepoints"):
for raw_value in exclude_inline.split():
raw_value = raw_value.strip()
if raw_value == "":
continue
excluded_codepoints.add(parse_codepoint(raw_value))
if exclude_file := subset.get("exclude_codepoints_file"):
for line in Path(exclude_file).read_text().splitlines():
line = line.strip()
if line != "" and not line.startswith(("#", "//")):
continue
# Remove in-line comments
line = line.split("#", 1)[0]
line = line.split("//", 1)[0]
line = line.rstrip()
excluded_codepoints.add(parse_codepoint(line))

# Filter unicodes by excluded_codepoints
unicodes = [
unicode for unicode in unicodes if unicode not in excluded_codepoints
]
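As a sanity check of the exclusion-file format this loop accepts, the following self-contained sketch parses an equivalent in-memory listing (it assumes `parse_codepoint` from this commit's `gftools.utils` is importable):

```python
from gftools.utils import parse_codepoint

# Illustrative exclusion-file contents: blank lines, full-line comments, and
# inline comments are ignored; hex (U+… / 0x…) and decimal values can be mixed.
EXAMPLE = """\
# full-line comment
U+0951        // inline comment
0x1CD0

2386
"""

excluded = set()
for line in EXAMPLE.splitlines():
    line = line.strip()
    if line == "" or line.startswith(("#", "//")):
        continue
    line = line.split("#", 1)[0].split("//", 1)[0].rstrip()
    excluded.add(parse_codepoint(line))

assert excluded == {0x0951, 0x1CD0, 2386}
```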

# Load excluded glyphs by name
exclude_glyphs = set()
if exclude_inline := subset.get("exclude_glyphs"):
for glyph_name in exclude_inline.split():
glyph_name = glyph_name.strip()
if glyph_name == "":
continue
exclude_glyphs.add(glyph_name)
if exclude_file := subset.get("exclude_glyphs_file"):
for line in Path(exclude_file).read_text().splitlines():
line = line.strip()
if line != "" and not line.startswith(("#", "//")):
continue
# Remove in-line comments
line = line.split("#", 1)[0]
line = line.split("//", 1)[0]
line = line.rstrip()
exclude_glyphs.add(line)

# Update incl_excl_by_donor
key = (
yaml.dump(subset["from"]),
subset.get("layoutHandling"),
subset.get("force"),
)
unicodes_by_donor[key] |= set(unicodes)
unicodes_incl, glyph_names_excl = incl_excl_by_donor[key]
unicodes_incl |= set(unicodes)
glyph_names_excl |= exclude_glyphs
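The grouping above is easier to see in isolation; the following standalone sketch (with made-up donor data) shows why the `from` dict is serialised with `yaml.dump`, so it can act as a hashable grouping key:

```python
from collections import defaultdict

import yaml

# Two subset requests that share the same donor and options collapse into one
# (codepoints-to-include, glyph-names-to-exclude) pair, so the donor UFO only
# needs to be merged once. Donor, codepoint, and glyph values are made up.
incl_excl = defaultdict(lambda: (set(), set()))
requests = [
    ({"repo": "org/font", "path": "src/Font.ufo"}, {0x41, 0x42}, set()),
    ({"repo": "org/font", "path": "src/Font.ufo"}, {0x42, 0x43}, {"B.alt"}),
]
for donor, codepoints, excluded_glyphs in requests:
    key = (yaml.dump(donor), None, None)  # None: no layoutHandling/force given
    unicodes_incl, glyph_names_excl = incl_excl[key]
    unicodes_incl |= codepoints           # |= mutates the stored set in place
    glyph_names_excl |= excluded_glyphs

assert list(incl_excl.values()) == [({0x41, 0x42, 0x43}, {"B.alt"})]
```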

# Now rebuild the subset dictionary, but this time with the codepoints
# amalgamated into minimal sets.
newsubsets = []
for (donor, layouthandling, force), unicodes in unicodes_by_donor.items():
newsubsets.append({"from": yaml.safe_load(donor), "unicodes": list(unicodes)})
for (donor, layouthandling, force), (
unicodes_incl,
glyph_names_excl,
) in incl_excl_by_donor.items():
newsubsets.append(
{
"from": yaml.safe_load(donor),
"unicodes": list(unicodes_incl),
"exclude_glyphs": list(glyph_names_excl),
}
)
if layouthandling:
newsubsets[-1]["layoutHandling"] = layouthandling
if force:
@@ -171,6 +239,7 @@ def add_subset(self, target_ufo, ds, ds_source, subset) -> bool:
merge_ufos(
target_ufo,
source_ufo,
exclude_glyphs=subset["exclude_glyphs"],
codepoints=subset["unicodes"],
existing_handling=existing_handling,
layout_handling=layout_handling,
8 changes: 8 additions & 0 deletions Lib/gftools/utils.py
@@ -685,3 +685,11 @@ def has_gh_token():
if "GH_TOKEN" in os.environ:
return True
return False


def parse_codepoint(codepoint: str) -> int:
# https://github.com/googlefonts/ufomerge/blob/2257a1d3807a4eec9b515aa98e059383f7814d9a/Lib/ufomerge/cli.py#L118-L126
if codepoint.startswith(("U+", "u+", "0x", "0X")):
return int(codepoint[2:], 16)
else:
return int(codepoint)
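A few illustrative calls, once this version of `gftools.utils` is importable:

```python
from gftools.utils import parse_codepoint

assert parse_codepoint("U+1F600") == 0x1F600  # "U+" / "u+" prefix: hexadecimal
assert parse_codepoint("0x41") == 65          # "0x" / "0X" prefix: hexadecimal
assert parse_codepoint("65") == 65            # no prefix: decimal
```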
15 changes: 15 additions & 0 deletions docs/gftools-builder/README.md
@@ -331,6 +331,21 @@ build process by leaving a `graph.png` file in the `sources` directory:
- *subspace*: Runs `fonttools varLib.instancer` to subspace a variable font according to the values in `axes`. `args` are added to the command line.
- *hbsubset*: Uses `hb-subset` to slim down a font binary.
- *addSubset*: Adds a subset from another font using `gftools-add-ds-subsets`
- `directory`: the intermediate folder used to store the source(s) the subset(s) are taken from
- `subsets`: a list of subset configurations to merge in
- `from` (required): can be a pre-configured Noto source ("Noto Sans", "Noto Serif", "Noto Sans Devanagari", "Noto Sans Linear B"), or:
- `repo`: the GitHub slug for the repository, e.g. `googlefonts/gftools`. You can specify a git revision by suffixing this with `@v1.0.0`, or use `@latest` for the latest *published* release
- `path`: the path within the repo that has the source file
- `name`: a named Google Fonts subset, e.g. `GF_Latin_Core`
- `ranges`: a list of unicode codepoint ranges to include
- `start`: the start of the range (as hex or decimal)
- `end`: the end of the range (as hex or decimal)
- `layoutHandling`: "subset", "closure" or "ignore" ([further reading](https://github.com/googlefonts/ufomerge/blob/bb9a82ff3039b8aa0cba58372158bd3c0e5cb770/Lib/ufomerge/__init__.py#L512-L521))
- `force`: replace existing glyphs in your sources, instead of skipping them
- `exclude_glyphs`: whitespace-delimited glyph names to exclude from merging
- `exclude_glyphs_file`: path to a file with glyph names to exclude from merging, one per line (comments using `#` or `//` allowed)
- `exclude_codepoints`: whitespace-delimited unicode codepoints to exclude from merging
- `exclude_codepoints_file`: path to a file with unicode codepoints to exclude from merging, one per line (comments using `#` or `//` allowed)
- *buildVTT*: Uses `gftools-build-vtt` with the configuration file provided in `vttfile` to add VTT hinting to a font binary.
- *remap*: Uses `gftools-remap-font` to alter a font binary's `cmap` table.
- *paintcompiler*: Runs paintcompiler on a font to add a COLRv1 table.
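Putting the documented options together, here is a sketch of a single `subsets` entry, written as the Python data the builder's YAML configuration would parse into (repo, path, codepoints, and file name are invented; the exclusion keys sit at the subset level, alongside `layoutHandling`, as in `SUBSET_SCHEMA` above):

```python
# Invented example: pull GF_Latin_Core from a donor repo, but leave out the
# dotless i/j codepoints and any glyphs listed in a local exclusion file.
subsets = [
    {
        "from": {
            "repo": "notofonts/example@latest",           # assumed example slug
            "path": "sources/NotoExample.glyphspackage",  # assumed example path
        },
        "name": "GF_Latin_Core",
        "exclude_codepoints": "U+0131 0x0237",         # dotless i, dotless j
        "exclude_glyphs_file": "excluded-glyphs.txt",  # one name per line; # or // comments
        "layoutHandling": "closure",
    }
]
```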
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -66,6 +66,10 @@ dependencies = [
'ninja',
'networkx',
'ruamel.yaml',
# Used for subset merging, and preferred over the home-grown UFO merge script,
# which is deprecated.
# Pin avoids bug googlefonts/ufomerge#28.
'ufomerge>=1.8.1'
]

[project.optional-dependencies]
