From 528a37d7152b378d5e20118988876d77c318c9dd Mon Sep 17 00:00:00 2001
From: Alexander Mankuta <alex@pointless.one>
Date: Fri, 3 Nov 2023 15:14:20 +0200
Subject: [PATCH] Full font embedding

This adds an option to disable font subsetting. Original fonts can be
embedded in their full original form.

This feature can make documents substantially bigger. In addition to
the embedded fonts being bigger, PDF requires additional information in
order to properly render text. Specifically, it requires glyph widths.
Some fonts contain thousands of glyphs. A thousand glyph widths would,
on average, add about 4 KB to the size of the document.
Additionally, PDF requires another mapping to make the text intelligible
when copying. This additional size is much harder to estimate as it
greatly depends on the font coverage, but it is usually on the order of
~1-10 KB per font.

The intended use case is a workaround for when TTFunk breaks fonts
during subsetting. This might also be useful for documents that are
going to be edited: for example, templates to which more text will be
added later, or documents using the AcroForm feature that lets end users
fill in forms.
---
 CHANGELOG.md                             |   7 +
 lib/prawn/font.rb                        |  38 +++-
 lib/prawn/fonts/to_unicode_cmap.rb       | 141 ++++++++++++
 lib/prawn/fonts/ttf.rb                   | 262 +++++++++++++++++------
 spec/prawn/font_spec.rb                  |  93 +++++++-
 spec/prawn/fonts/to_unicode_cmap_spec.rb |  98 +++++++++
 spec/prawn_manual_spec.rb                |   4 +-
 7 files changed, 571 insertions(+), 72 deletions(-)
 create mode 100644 lib/prawn/fonts/to_unicode_cmap.rb
 create mode 100644 spec/prawn/fonts/to_unicode_cmap_spec.rb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af304b356..c88165257 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 ## Unreleased
 
+### Full font embedding
+
+Fonts can be embedded in their original form without subsetting or any other
+modification.
+
+(Alexander Mankuta, [#1322](https://github.com/prawnpdf/prawn/pull/1322))
+
 ## Fixed keyword arguments in Prawn::View
 
 (Kim Burgestrand, [1284](https://github.com/prawnpdf/prawn/pull/1284))
diff --git a/lib/prawn/font.rb b/lib/prawn/font.rb
index ab0cd0d45..63f1a327b 100644
--- a/lib/prawn/font.rb
+++ b/lib/prawn/font.rb
@@ -145,19 +145,23 @@ def width_of(string, options = {})
       end
     end
 
-    # Hash that maps font family names to their styled individual font names.
+    # Hash that maps font family names to their styled individual font
+    # definitions.
     #
     # To add support for another font family, append to this hash, e.g:
     #
     #   pdf.font_families.update(
-    #    "MyTrueTypeFamily" => { :bold        => "foo-bold.ttf",
-    #                            :italic      => "foo-italic.ttf",
-    #                            :bold_italic => "foo-bold-italic.ttf",
-    #                            :normal      => "foo.ttf" })
+    #     "MyTrueTypeFamily" => {
+    #       bold: "foo-bold.ttf",
+    #       italic: "foo-italic.ttf",
+    #       bold_italic: "foo-bold-italic.ttf",
+    #       normal: "foo.ttf"
+    #     }
+    #   )
     #
     # This will then allow you to use the fonts like so:
     #
-    #   pdf.font("MyTrueTypeFamily", :style => :bold)
+    #   pdf.font("MyTrueTypeFamily", style: :bold)
     #   pdf.text "Some bold text"
     #   pdf.font("MyTrueTypeFamily")
     #   pdf.text "Some normal text"
@@ -170,6 +174,17 @@ def width_of(string, options = {})
     # defining your own font families, you can map any or all of these
     # styles to whatever font files you'd like.
     #
+    # Font definition can be either a hash or just a string.
+    #
+    # A hash font definition can specify a number of options:
+    #
+    # - :file -- path to the font file (required)
+    # - :subset -- whether to subset the font (default true). Only
+    #   applicable to TrueType and OpenType fonts (including DFont and TTC).
+    #
+    # A string font definition is equivalent to hash definition with only
+    # :file being specified.
+    #
     def font_families
       @font_families ||= {}.merge!(
         'Courier' => {
@@ -339,6 +354,8 @@ def initialize(document, name, options = {}) # :nodoc:
 
       @references = {}
       @subset_name_cache = {}
+
+      @full_font_embedding = options.key?(:subset) && !options[:subset]
     end
 
     # The size of the font ascender in PDF points
@@ -401,7 +418,12 @@ def add_to_current_page(subset)
     end
 
     def identifier_for(subset) # :nodoc:
-      @subset_name_cache[subset] ||= "#{@identifier}.#{subset}".to_sym
+      @subset_name_cache[subset] ||=
+        if full_font_embedding
+          @identifier.to_sym
+        else
+          "#{@identifier}.#{subset}".to_sym
+        end
     end
 
     def inspect # :nodoc:
@@ -426,6 +448,8 @@ def eql?(other) # :nodoc:
 
     private
 
+    attr_reader :full_font_embedding
+
     # generate a font identifier that hasn't been used on the current page yet
     #
     def generate_unique_id
diff --git a/lib/prawn/fonts/to_unicode_cmap.rb b/lib/prawn/fonts/to_unicode_cmap.rb
new file mode 100644
index 000000000..a6d84da64
--- /dev/null
+++ b/lib/prawn/fonts/to_unicode_cmap.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+module Prawn
+  module Fonts
+    # @private
+    class ToUnicodeCMap
+      # mapping is expected to be a hash with keys being character codes (in
+      # the broad sense, as used in the showing operation strings) and values
+      # being Unicode code points
+      def initialize(mapping, code_space_size = nil)
+        @mapping = mapping
+        @code_space_size = code_space_size
+      end
+
+      def generate
+        chunks = []
+
+        # Header
+        chunks << <<~HEADER.chomp
+          /CIDInit /ProcSet findresource begin
+          12 dict begin
+          begincmap
+          /CIDSystemInfo 3 dict dup begin
+            /Registry (Adobe) def
+            /Ordering (UCS) def
+            /Supplement 0 def
+          end def
+          /CMapName /Adobe-Identity-UCS def
+          /CMapType 2 def
+        HEADER
+
+        max_glyph_index = mapping.keys.max || 0
+        # Bytes needed for the largest code; at least one, even for an empty map
+        code_space_size = [(max_glyph_index.bit_length / 8.0).ceil, 1].max
+
+        used_code_space_size = @code_space_size || code_space_size
+
+        # In CMap codespaces are not sequential, they're ranges in
+        # a multi-dimensional space. Each byte is considered separately. So we
+        # have to maximally extend the lower bytes in order to allow for
+        # continuous mapping.
+        # We only keep the highest byte because usually it's lower than
+        # maximally allowed and we don't want to cover that unused space.
+        code_space_max = max_glyph_index | ('ff' * (code_space_size - 1)).to_i(16)
+
+        chunks << '1 begincodespacerange'
+        chunks << format("<%0#{used_code_space_size * 2}X><%0#{used_code_space_size * 2}X>", 0, code_space_max)
+        chunks << 'endcodespacerange'
+
+        # Mapping; source codes must be as wide as the declared codespace
+        all_spans =
+          mapping_spans(
+            mapping.reject { |gid, cid| gid.zero? || (0xd800..0xdfff).cover?(cid) }
+          )
+
+        short_spans, long_spans = all_spans.partition { |span| span[0] == :short }
+
+        long_spans
+          .each_slice(100) do |spans|
+            chunks << "#{spans.length} beginbfrange"
+
+            spans.each do |type, span|
+              case type
+              when :fully_sorted
+                chunks << format(
+                  "<%0#{used_code_space_size * 2}X><%0#{used_code_space_size * 2}X><%s>",
+                  span.first[0],
+                  span.last[0],
+                  span.first[1].chr(::Encoding::UTF_16BE).unpack1('H*')
+                )
+              when :index_sorted
+                chunks << format(
+                  "<%0#{used_code_space_size * 2}X><%0#{used_code_space_size * 2}X>[%s]",
+                  span.first[0],
+                  span.last[0],
+                  span.map { |_, cid| "<#{cid.chr(::Encoding::UTF_16BE).unpack1('H*')}>" }.join('')
+                )
+              end
+            end
+
+            chunks << 'endbfrange'
+          end
+
+        short_spans
+          .map { |_type, slice| slice.flatten(1) }
+          .each_slice(100) do |mapping|
+            chunks << "#{mapping.length} beginbfchar"
+            chunks.concat(
+              mapping.map do |(gid, cid)|
+                format(
+                  "<%0#{used_code_space_size * 2}X><%s>",
+                  gid,
+                  cid.chr(::Encoding::UTF_16BE).unpack1('H*')
+                )
+              end
+            )
+            chunks << 'endbfchar'
+          end
+
+        # Footer
+        chunks << <<~FOOTER.chomp
+          endcmap
+          CMapName currentdict /CMap defineresource pop
+          end
+          end
+        FOOTER
+
+        chunks.join("\n")
+      end
+
+      private
+
+      attr_reader :mapping
+
+      attr_reader :cmap, :code_space_size, :code_space_max
+
+      def mapping_spans(mapping)
+        mapping
+          .sort
+          .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity
+          .flat_map { |slice|
+            if slice.length == 1
+              [[:short, slice]]
+            else
+              continuous_slices, discontinuous_slices =
+                slice
+                  .slice_when { |a, b| b[1] - a[1] != 1 } # Slice at value discontinuity
+                  .partition { |subslice| subslice.length > 1 }
+
+              discontinuous_slices
+                .flatten(1) # Join together
+                .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity, again
+                .map { |span| span.length > 1 ? [:index_sorted, span] : [:short, span] } +
+                continuous_slices.map { |span| [:fully_sorted, span] }
+            end
+          }
+          .sort_by { |span| span[1][0][0] } # Sort spans by start key
+      end
+    end
+  end
+end
diff --git a/lib/prawn/fonts/ttf.rb b/lib/prawn/fonts/ttf.rb
index 7a28a9ca5..5e55220f9 100644
--- a/lib/prawn/fonts/ttf.rb
+++ b/lib/prawn/fonts/ttf.rb
@@ -9,6 +9,7 @@
 
 require 'ttfunk'
 require 'ttfunk/subset_collection'
+require_relative 'to_unicode_cmap'
 
 module Prawn
   module Fonts
@@ -43,11 +44,70 @@ def unicode?
         true
       end
 
+      class FullFontSubsetsCollection
+        FULL_FONT = Object.new.tap do |obj|
+          obj.singleton_class.define_method(:inspect) do
+            super().insert(-2, ' FULL_FONT')
+          end
+        end.freeze
+
+        def initialize(original)
+          @original = original
+          @cmap = original.cmap.unicode.first ||
+            raise(NoUnicodeCMap.new(font: original.name.postscript_name))
+
+          @code_space_size =
+            case cmap.code_map.keys.max
+            when 0..0xff then 1
+            when 0x100..0xffff then 2
+            when 0x10000..0xffffff then 3
+            else
+              4
+            end
+
+          # Codespaces are not sequential, they're ranges in
+          # a multi-dimensional space. Each byte is considered separately. So we
+          # have to maximally extend the lower two bytes in order to allow for
+          # continuous Unicode mapping.
+          # We only keep the highest byte because Unicode only goes to 10FFFF
+          # and fonts usually cover even less of the space. We don't want to
+          # list all those unmapped character codes here.
+          @code_space_max = cmap.code_map.keys.max | ('ff' * (code_space_size - 1)).to_i(16)
+        end
+
+        def encode(characters)
+          [
+            [
+              FULL_FONT,
+              characters.map do |c|
+                check_bounds!(c)
+                [cmap[c]].pack('n')
+              end.join('')
+            ]
+          ]
+        end
+
+        private
+
+        attr_reader :cmap, :code_space_size, :code_space_max
+
+        def check_bounds!(num)
+          if num > code_space_max
+            raise Error, "CID (#{num}) exceeds code space size"
+          end
+        end
+      end
+
       def initialize(document, name, options = {})
         super
 
         @ttf = read_ttf_file
-        @subsets = TTFunk::SubsetCollection.new(@ttf)
+        @subsets =
+          if full_font_embedding
+            FullFontSubsetsCollection.new(@ttf)
+          else
+            TTFunk::SubsetCollection.new(@ttf)
+          end
         @italic_angle = nil
 
         @attributes = {}
@@ -199,8 +259,7 @@ def pdf_flags
 
       def normalize_encoding(text)
         text.encode(::Encoding::UTF_8)
-      rescue StandardError => e
-        puts e
+      rescue StandardError
         raise Prawn::Errors::IncompatibleStringEncoding,
           "Encoding #{text.encoding} can not be transparently converted to UTF-8. " \
           'Please ensure the encoding of the string you are attempting ' \
@@ -289,12 +348,26 @@ def register(subset)
       end
 
       def embed(reference, subset)
-        font_content = @subsets[subset].encode
+        if full_font_embedding
+          embed_full_font(reference)
+        else
+          embed_subset(reference, subset)
+        end
+      end
 
-        # FIXME: we need postscript_name and glyph widths from the font
-        # subset. Perhaps this could be done by querying the subset,
-        # rather than by parsing the font that the subset produces?
-        font = TTFunk::File.new(font_content)
+      def embed_subset(reference, subset)
+        font = TTFunk::File.new(@subsets[subset].encode)
+        unicode_mapping = @subsets[subset].to_unicode_map
+        embed_simple_font(reference, font, unicode_mapping)
+      end
+
+      def embed_simple_font(reference, font, unicode_mapping)
+        if font_type(font) == :unknown
+          raise Error, %(Simple font embedding is not uspported for font "#{font.name}.")
+        end
+
+        true_type = font_type(font) == :true_type
+        open_type = font_type(font) == :open_type
 
         # empirically, it looks like Adobe Reader will not display fonts
         # if their font name is more than 33 bytes long. Strange. But true.
@@ -302,14 +375,14 @@ def embed(reference, subset)
 
         raise NoPostscriptName.new(font: font) if basename.nil?
 
-        fontfile = @document.ref!(Length1: font_content.size)
-        fontfile.stream << font_content
-        fontfile.stream.compress!
+        fontfile = @document.ref!({})
+        fontfile.data[:Length1] = font.contents.size
+        fontfile.stream << font.contents.string
+        fontfile.stream.compress! if @document.compression_enabled?
 
         descriptor = @document.ref!(
           Type: :FontDescriptor,
           FontName: basename.to_sym,
-          FontFile2: fontfile,
           FontBBox: bbox,
           Flags: pdf_flags,
           StemV: stem_v,
@@ -320,10 +393,20 @@ def embed(reference, subset)
           XHeight: x_height
         )
 
+        first_char = font.cmap.tables.first.code_map.index { |gid| !gid.zero? }
+        last_char = font.cmap.tables.first.code_map.rindex { |gid| !gid.zero? }
         hmtx = font.horizontal_metrics
-        widths = font.cmap.tables.first.code_map.map do |gid|
-          Integer(hmtx.widths[gid] * scale_factor)
-        end[32..]
+        widths =
+          font.cmap.tables.first.code_map[first_char..last_char].map do |gid|
+            if gid.zero?
+              # These characters are not in the document so we don't ever use
+              # these values but we need to encode them so let's use as little
+              # space as possible.
+              0
+            else
+              Integer(hmtx.widths[gid] * scale_factor)
+            end
+          end
 
         # It would be nice to have Encoding set for the macroman subsets,
         # and only do a ToUnicode cmap for non-encoded unicode subsets.
@@ -335,65 +418,120 @@ def embed(reference, subset)
         # For now, it's simplest to just create a unicode cmap for every font.
         # It offends my inner purist, but it'll do.
 
-        map = @subsets[subset].to_unicode_map
+        to_unicode = @document.ref!({})
+        to_unicode << ToUnicodeCMap.new(unicode_mapping).generate
+        to_unicode.stream.compress! if @document.compression_enabled?
 
-        ranges = [[]]
-        map.keys.sort.reduce('') do |_s, code|
-          ranges << [] if ranges.last.length >= 100
-          unicode = map[code]
-          ranges.last << format(
-            '<%<code>02x><%<unicode>04x>',
-            code: code,
-            unicode: unicode
-          )
+        reference.data.update(
+          BaseFont: basename.to_sym,
+          FontDescriptor: descriptor,
+          FirstChar: first_char,
+          LastChar: last_char,
+          Widths: @document.ref!(widths),
+          ToUnicode: to_unicode
+        )
+
+        if true_type
+          reference.data.update(Subtype: :TrueType)
+          descriptor.data.update(FontFile2: fontfile)
+        elsif open_type
+          @document.renderer.min_version(1.6)
+          reference.data.update(Subtype: :Type1)
+          descriptor.data.update(FontFile3: fontfile)
+          fontfile.data.update(Subtype: :OpenType)
         end
+      end
 
-        range_blocks =
-          ranges.reduce(+'') do |s, list|
-            s << format(
-              "%<lenght>d beginbfchar\n%<list>s\nendbfchar\n",
-              lenght: list.length,
-              list: list.join("\n")
-            )
-          end
+      def embed_full_font(reference)
+        embed_composite_font(reference, @ttf)
+      end
 
-        to_unicode_cmap = UNICODE_CMAP_TEMPLATE % range_blocks.strip
+      def embed_composite_font(reference, font)
+        if font_type(font) == :unknown
+          raise Error, %(Composite font embedding is not uspported for font "#{font.name}.")
+        end
 
-        cmap = @document.ref!({})
-        cmap << to_unicode_cmap
-        cmap.stream.compress!
+        true_type = font_type(font) == :true_type
+        open_type = font_type(font) == :open_type
 
-        reference.data.update(
-          Subtype: :TrueType,
+        fontfile = @document.ref!({})
+        fontfile.data[:Length1] = font.contents.size if true_type
+        fontfile.data[:Subtype] = :CIDFontType0C if open_type
+        fontfile.stream << font.contents.string
+        fontfile.stream.compress! if @document.compression_enabled?
+
+        # empirically, it looks like Adobe Reader will not display fonts
+        # if their font name is more than 33 bytes long. Strange. But true.
+        basename = font.name.postscript_name[0, 33].delete("\0")
+
+        descriptor = @document.ref!(
+          Type: :FontDescriptor,
+          FontName: basename.to_sym,
+          FontBBox: bbox,
+          Flags: pdf_flags,
+          StemV: stem_v,
+          ItalicAngle: italic_angle,
+          Ascent: @ascender,
+          Descent: @descender,
+          CapHeight: cap_height,
+          XHeight: x_height
+        )
+        descriptor.data[:FontFile2] = fontfile if true_type
+        descriptor.data[:FontFile3] = fontfile if open_type
+
+        to_unicode = @document.ref!({})
+        to_unicode << ToUnicodeCMap.new(
+          font.cmap.unicode.first
+          .code_map
+          .reject { |cid, gid| gid.zero? || (0xd800..0xdfff).cover?(cid) }
+          .invert
+          .sort.to_h,
+          2 # Identity-H is a 2-byte encoding
+        ).generate
+        to_unicode.stream.compress! if @document.compression_enabled?
+
+        widths =
+          font.horizontal_metrics.widths.map { |w| (w * scale_factor).round }
+
+        child_font = @document.ref!(
+          Type: :Font,
           BaseFont: basename.to_sym,
+          CIDSystemInfo: {
+            Registry: 'Adobe',
+            Ordering: 'Identity',
+            Supplement: 0
+          },
           FontDescriptor: descriptor,
-          FirstChar: 32,
-          LastChar: 255,
-          Widths: @document.ref!(widths),
-          ToUnicode: cmap
+          W: [0, widths]
+        )
+        if true_type
+          child_font.data.update(
+            Subtype: :CIDFontType2,
+            CIDToGIDMap: :Identity
+          )
+        end
+        if open_type
+          child_font.data[:Subtype] = :CIDFontType0
+        end
+
+        reference.data.update(
+          Subtype: :Type0,
+          BaseFont: basename.to_sym,
+          Encoding: :'Identity-H',
+          DescendantFonts: [child_font],
+          ToUnicode: to_unicode
         )
       end
 
-      UNICODE_CMAP_TEMPLATE = <<-STR.strip.gsub(/^\s*/, '')
-        /CIDInit /ProcSet findresource begin
-        12 dict begin
-        begincmap
-        /CIDSystemInfo <<
-          /Registry (Adobe)
-          /Ordering (UCS)
-          /Supplement 0
-        >> def
-        /CMapName /Adobe-Identity-UCS def
-        /CMapType 2 def
-        1 begincodespacerange
-        <00><ff>
-        endcodespacerange
-        %s
-        endcmap
-        CMapName currentdict /CMap defineresource pop
-        end
+      def font_type(font)
+        if font.directory.tables.key?('glyf')
+          :true_type
+        elsif font.directory.tables.key?('CFF ')
+          :open_type
+        else
+          :unknown
         end
-      STR
+      end
 
       def read_ttf_file
         TTFunk::File.open(@name)
diff --git a/spec/prawn/font_spec.rb b/spec/prawn/font_spec.rb
index d36baf365..a890c907e 100644
--- a/spec/prawn/font_spec.rb
+++ b/spec/prawn/font_spec.rb
@@ -469,6 +469,52 @@ def page_should_not_include_font(font)
         expect(original.equal?(normalized)).to eq false
       end
     end
+
+    describe 'full font embedding' do
+      let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/DejaVuSans.ttf", subset: false }
+      let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } }
+
+      it 'is a composite font' do
+        font_obj = ref.data
+
+        expect(font_obj[:Subtype]).to eq(:Type0)
+        expect(font_obj[:DescendantFonts]).to be_an(Array)
+        expect(font_obj[:DescendantFonts].length).to eq(1)
+        desc_font = font_obj[:DescendantFonts].first.data
+        expect(desc_font[:Type]).to eq(:Font)
+        expect(desc_font[:Subtype]).to eq(:CIDFontType2)
+      end
+
+      it 'has proper metrics' do
+        descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data
+        expect(descriptor[:Ascent]).to eq(759)
+        expect(descriptor[:Descent]).to eq(-240)
+        expect(descriptor[:CapHeight]).to eq(759)
+      end
+
+      it 'has proper encoding' do
+        font_obj = ref.data
+        expect(font_obj[:Encoding]).to eq(:'Identity-H')
+        desc_font = font_obj[:DescendantFonts].first.data
+        expect(desc_font[:CIDToGIDMap]).to eq(:Identity)
+      end
+
+      it 'contains glyph widths' do
+        desc_font = ref.data[:DescendantFonts].first.data
+        expect(desc_font[:W]).to be_an(Array)
+        expect(desc_font[:W].length).to eq(2)
+        expect(desc_font[:W][0]).to eq(0)
+        expect(desc_font[:W][1]).to be_an(Array)
+        expect(desc_font[:W][1].length).to eq(6108) # All glyph metrics
+      end
+
+      it 'propely embeds font data' do
+        descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data
+        expect(descriptor).to have_key(:FontFile2)
+        expect(descriptor[:FontFile2].data[:Length1]).to eq(741_536)
+        expect(descriptor[:FontFile2].stream).to_not be_empty
+      end
+    end
   end
 
   describe 'OTF fonts' do
@@ -500,6 +546,51 @@ def page_should_not_include_font(font)
         expect(original).to_not be_equal(normalized)
       end
     end
+
+    describe 'full font embedding' do
+      let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/Bodoni-Book.otf", subset: false }
+      let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } }
+
+      it 'is a composite font' do
+        font_obj = ref.data
+
+        expect(font_obj[:Subtype]).to eq(:Type0)
+        expect(font_obj[:DescendantFonts]).to be_an(Array)
+        expect(font_obj[:DescendantFonts].length).to eq(1)
+        desc_font = font_obj[:DescendantFonts].first.data
+        expect(desc_font[:Type]).to eq(:Font)
+        expect(desc_font[:Subtype]).to eq(:CIDFontType0)
+      end
+
+      it 'has proper metrics' do
+        descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data
+        expect(descriptor[:Ascent]).to eq(1023)
+        expect(descriptor[:Descent]).to eq(-200)
+        expect(descriptor[:CapHeight]).to eq(3072)
+      end
+
+      it 'has proper encoding' do
+        font_obj = ref.data
+        expect(font_obj[:Encoding]).to eq(:'Identity-H')
+        desc_font = font_obj[:DescendantFonts].first.data
+        expect(desc_font).to_not have_key(:CIDToGIDMap)
+      end
+
+      it 'contains glyph widths' do
+        desc_font = ref.data[:DescendantFonts].first.data
+        expect(desc_font[:W]).to be_an(Array)
+        expect(desc_font[:W].length).to eq(2)
+        expect(desc_font[:W][0]).to eq(0)
+        expect(desc_font[:W][1]).to be_an(Array)
+        expect(desc_font[:W][1].length).to eq(353) # All glyph metrics
+      end
+
+      it 'propely embeds font data' do
+        descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data
+        expect(descriptor).to have_key(:FontFile3)
+        expect(descriptor[:FontFile3].stream).to_not be_empty
+      end
+    end
   end
 
   describe 'DFont fonts' do
@@ -566,7 +657,7 @@ def page_should_not_include_font(font)
         # This has to be the same font file as in the other family.
         normal: "#{Prawn::DATADIR}/fonts/DejaVuSans.ttf",
         bold: "#{Prawn::DATADIR}/fonts/Dustismo_Roman.ttf"
-      },
+      }
     )
 
     pdf.font 'DejaVu Sans'
diff --git a/spec/prawn/fonts/to_unicode_cmap_spec.rb b/spec/prawn/fonts/to_unicode_cmap_spec.rb
new file mode 100644
index 000000000..a68a1cf0b
--- /dev/null
+++ b/spec/prawn/fonts/to_unicode_cmap_spec.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'pathname'
+
+describe Prawn::Fonts::ToUnicodeCMap do
+  it 'generates a cmap' do
+    charmap = {
+      0x20 => 0x20,
+      0x21 => 0x21,
+      0x22 => 0x22,
+      0x30 => 0x30
+    }
+    to_unicode_cmap = described_class.new(charmap)
+
+    expect(to_unicode_cmap.generate).to eq(<<~CMAP.chomp)
+      /CIDInit /ProcSet findresource begin
+      12 dict begin
+      begincmap
+      /CIDSystemInfo 3 dict dup begin
+        /Registry (Adobe) def
+        /Ordering (UCS) def
+        /Supplement 0 def
+      end def
+      /CMapName /Adobe-Identity-UCS def
+      /CMapType 2 def
+      1 begincodespacerange
+      <00><30>
+      endcodespacerange
+      1 beginbfrange
+      <20><22><0020>
+      endbfrange
+      1 beginbfchar
+      <30><0030>
+      endbfchar
+      endcmap
+      CMapName currentdict /CMap defineresource pop
+      end
+      end
+    CMAP
+  end
+
+  it 'generates type 2 cmap' do
+    cmap = described_class.new(0x20 => 0x30).generate
+
+    expect(cmap).to match(%r{/CMapType 2\b})
+  end
+
+  it 'properly sets codespace range' do
+    cmap = described_class.new(0x20 => 0x30).generate
+
+    expect(cmap).to include("begincodespacerange\n<00><20>\n")
+  end
+
+  it 'properly sets large codespace range' do
+    cmap = described_class.new(0x2000 => 0x30).generate
+
+    expect(cmap).to include("begincodespacerange\n<0000><20FF>\n")
+  end
+
+  it 'uses codespace size override' do
+    cmap = described_class.new({ 0x20 => 0x30 }, 2).generate
+
+    expect(cmap).to include("begincodespacerange\n<0000><0020>\n")
+  end
+
+  it 'uses ranges for continuous mappings' do
+    cmap = described_class.new(0x20 => 0x30, 0x21 => 0x31, 0x22 => 0x32).generate
+
+    expect(cmap).to include("beginbfrange\n<20><22><0030>\n")
+  end
+
+  it 'uses ranges for continuous code rnages with non-continuous mappings' do
+    cmap = described_class.new(0x20 => 0x32, 0x21 => 0x31, 0x22 => 0x30).generate
+
+    expect(cmap).to include("beginbfrange\n<20><22>[<0032><0031><0030>]\n")
+  end
+
+  it 'uses individual mappings' do
+    cmap = described_class.new(0x20 => 0x30, 0x21 => 0x31, 0x22 => 0x32, 0x30 => 0x40).generate
+
+    expect(cmap).to include("beginbfchar\n<30><0040>\n")
+  end
+
+  it 'splits continuous mappings into groups of 100' do
+    mapping = (1..142).flat_map { |n| Array.new(3) { |i| [n * 10 + i, n * 10 + i] } }.to_h
+    cmap = described_class.new(mapping).generate
+
+    expect(cmap).to include("\n100 beginbfrange\n").and include("\n42 beginbfrange\n")
+  end
+
+  it 'splits individual mappings into groups of 100' do
+    mapping = (1..142).to_h { |n| [n * 2, n * 2] }
+    cmap = described_class.new(mapping).generate
+
+    expect(cmap).to include("\n100 beginbfchar\n").and include("\n42 beginbfchar\n")
+  end
+end
diff --git a/spec/prawn_manual_spec.rb b/spec/prawn_manual_spec.rb
index 91182a2d0..b92c55441 100644
--- a/spec/prawn_manual_spec.rb
+++ b/spec/prawn_manual_spec.rb
@@ -6,9 +6,9 @@
 MANUAL_HASH =
   case RUBY_ENGINE
   when 'ruby'
-    'a2a111c8b3ef808b734c506dc9184520888bc5904c2eb12ab299a634d63e9e0f9ede52e54a701f346a3b48364c9e48ebb82a6b2351becde08e2382fe349158a9'
+    '8ace5f35f945e5994647cefc2cf7bc369d131e0646d91eb8aeb94e58f72de18d8e7bf82f58fc45406110c4adad239dcbe834059580d29fec2b2a039db67db04c'
   when 'jruby'
-    'eb3742d593861f6ca35667198b796265a58ac63aecdb8415bdee2d191f83341bb466e3b3c136a8609acf9e7c589612fe8b19d97f99cfc183d78962c6e2aa3546'
+    'b77a740d3290192360c4c083018ca61ccc88d42e7c6a152c7bc394d751f5f08d85aec613549f6660713644b00518561cc0f9c947701f01e8c25632c9db81201a'
   end
 
 RSpec.describe Prawn do