Skip to content

Commit

Permalink
feat: parse WebVTT from fmp4 segments. (#445)
Browse files Browse the repository at this point in the history
  • Loading branch information
adrums86 authored Sep 23, 2024
1 parent f4b3162 commit 432b036
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 73 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ Set to `true` to remux audio and video into a single MP4 segment.

This module reads CEA-608 captions out of FMP4 segments.

#### WebVttParser

`muxjs.mp4.WebVttParser`

This module reads WebVTT text out of FMP4 segments.

#### Tools

`muxjs.mp4.tools`
Expand Down
74 changes: 2 additions & 72 deletions lib/mp4/caption-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ var discardEmulationPreventionBytes = require('../tools/caption-packet-parser').
var CaptionStream = require('../m2ts/caption-stream').CaptionStream;
var findBox = require('../mp4/find-box.js');
var parseTfdt = require('../tools/parse-tfdt.js');
var parseTrun = require('../tools/parse-trun.js');
var parseTfhd = require('../tools/parse-tfhd.js');
var window = require('global/window');
var { getMdatTrafPairs, parseSamples } = require('./samples.js');

/**
* Maps an offset in the mdat to a sample based on the size of the samples.
Expand Down Expand Up @@ -118,62 +117,6 @@ var findSeiNals = function(avcStream, samples, trackId) {
return result;
};

/**
* Parses sample information out of Track Run Boxes and calculates
* the absolute presentation and decode timestamps of each sample.
*
* @param {Array<Uint8Array>} truns - The Trun Run boxes to be parsed
* @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt
@see ISO-BMFF-12/2015, Section 8.8.12
* @param {Object} tfhd - The parsed Track Fragment Header
* @see inspect.parseTfhd
* @return {Object[]} the parsed samples
*
* @see ISO-BMFF-12/2015, Section 8.8.8
**/
var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
var currentDts = baseMediaDecodeTime;
var defaultSampleDuration = tfhd.defaultSampleDuration || 0;
var defaultSampleSize = tfhd.defaultSampleSize || 0;
var trackId = tfhd.trackId;
var allSamples = [];

truns.forEach(function(trun) {
// Note: We currently do not parse the sample table as well
// as the trun. It's possible some sources will require this.
// moov > trak > mdia > minf > stbl
var trackRun = parseTrun(trun);
var samples = trackRun.samples;

samples.forEach(function(sample) {
if (sample.duration === undefined) {
sample.duration = defaultSampleDuration;
}
if (sample.size === undefined) {
sample.size = defaultSampleSize;
}
sample.trackId = trackId;
sample.dts = currentDts;
if (sample.compositionTimeOffset === undefined) {
sample.compositionTimeOffset = 0;
}

if (typeof currentDts === 'bigint') {
sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset);
currentDts += window.BigInt(sample.duration);

} else {
sample.pts = currentDts + sample.compositionTimeOffset;
currentDts += sample.duration;
}
});

allSamples = allSamples.concat(samples);
});

return allSamples;
};

/**
* Parses out caption nals from an FMP4 segment's video tracks.
*
Expand All @@ -183,21 +126,8 @@ var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
* a list of seiNals found in that track
**/
var parseCaptionNals = function(segment, videoTrackId) {
// To get the samples
var trafs = findBox(segment, ['moof', 'traf']);
// To get SEI NAL units
var mdats = findBox(segment, ['mdat']);
var captionNals = {};
var mdatTrafPairs = [];

// Pair up each traf with a mdat as moofs and mdats are in pairs
mdats.forEach(function(mdat, index) {
var matchingTraf = trafs[index];
mdatTrafPairs.push({
mdat: mdat,
traf: matchingTraf
});
});
var mdatTrafPairs = getMdatTrafPairs(segment);

mdatTrafPairs.forEach(function(pair) {
var mdat = pair.mdat;
Expand Down
3 changes: 2 additions & 1 deletion lib/mp4/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ module.exports = {
Transmuxer: require('./transmuxer').Transmuxer,
AudioSegmentStream: require('./transmuxer').AudioSegmentStream,
VideoSegmentStream: require('./transmuxer').VideoSegmentStream,
CaptionParser: require('./caption-parser')
CaptionParser: require('./caption-parser'),
WebVttParser: require('./webvtt-parser')
};
87 changes: 87 additions & 0 deletions lib/mp4/samples.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
const { parseTrun } = require("../tools/mp4-inspector");
const { findBox } = require("./probe");
var window = require('global/window');

/**
 * Utility function for parsing data from mdat boxes.
 * @param {Array<Uint8Array>} segment the segment data to create mdat/traf pairs from.
 * @returns mdat and traf boxes paired up for easier parsing.
 */
var getMdatTrafPairs = function(segment) {
  var trafs = findBox(segment, ['moof', 'traf']);
  var mdats = findBox(segment, ['mdat']);

  // moofs and mdats come in pairs, so the traf at the same index
  // belongs to this mdat. (If the counts ever differ, traf is undefined
  // for the unmatched mdats, same as before.)
  return mdats.map(function(mdat, index) {
    return {
      mdat: mdat,
      traf: trafs[index]
    };
  });
};

/**
 * Parses sample information out of Track Run Boxes and calculates
 * the absolute presentation and decode timestamps of each sample.
 *
 * @param {Array<Uint8Array>} truns - The trun (Track Run) boxes to be parsed
 * @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt
 *   @see ISO-BMFF-12/2015, Section 8.8.12
 * @param {Object} tfhd - The parsed Track Fragment Header
 *   @see inspect.parseTfhd
 * @return {Object[]} the parsed samples
 *
 * @see ISO-BMFF-12/2015, Section 8.8.8
 **/
var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
  var defaultSampleDuration = tfhd.defaultSampleDuration || 0;
  var defaultSampleSize = tfhd.defaultSampleSize || 0;
  var trackId = tfhd.trackId;
  var currentDts = baseMediaDecodeTime;
  var allSamples = [];

  truns.forEach(function(trun) {
    // Note: the sample table (moov > trak > mdia > minf > stbl) is not
    // consulted here; only the trun is parsed. It's possible some sources
    // will require the sample table as well.
    var samples = parseTrun(trun).samples;

    samples.forEach(function(sample) {
      // Fall back to the tfhd defaults when the trun omits per-sample values.
      if (sample.duration === undefined) {
        sample.duration = defaultSampleDuration;
      }
      if (sample.size === undefined) {
        sample.size = defaultSampleSize;
      }
      if (sample.compositionTimeOffset === undefined) {
        sample.compositionTimeOffset = 0;
      }
      sample.trackId = trackId;
      sample.dts = currentDts;

      // BigInt and Number cannot be mixed in arithmetic, so match the
      // accumulator's type when advancing the decode timestamp.
      if (typeof currentDts === 'bigint') {
        sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset);
        currentDts += window.BigInt(sample.duration);
      } else {
        sample.pts = currentDts + sample.compositionTimeOffset;
        currentDts += sample.duration;
      }
    });

    allSamples = allSamples.concat(samples);
  });

  return allSamples;
};

module.exports = {
getMdatTrafPairs,
parseSamples
};
126 changes: 126 additions & 0 deletions lib/mp4/webvtt-parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
const { parseTfdt } = require("../tools/mp4-inspector");
const findBox = require("./find-box");
const { getTimescaleFromMediaHeader } = require("./probe");
const { parseSamples, getMdatTrafPairs } = require("./samples");

/**
 * Module for parsing WebVTT text and styles from FMP4 segments.
 * Based on the ISO/IEC 14496-30.
 */
const WebVttParser = function() {
  // default timescale to 90k
  let timescale = 90e3;

  /**
   * Parses the timescale from the init segment.
   * @param {Array<Uint8Array>} segment The initialization segment to parse the timescale from.
   */
  this.init = function(segment) {
    // We just need the timescale from the init segment.
    const mdhd = findBox(segment, ['moov', 'trak', 'mdia', 'mdhd'])[0];

    if (mdhd) {
      timescale = getTimescaleFromMediaHeader(mdhd);
    }
  };

  /**
   * Parses a WebVTT FMP4 segment.
   * @param {Array<Uint8Array>} segment The content segment to parse the WebVTT cues from.
   * @returns {Object[]} The WebVTT cue text, styling, and timing info as an array of cue objects.
   */
  this.parseSegment = function(segment) {
    const vttCues = [];
    const mdatTrafPairs = getMdatTrafPairs(segment);
    // Create the UTF-8 decoder once per segment instead of once per sample.
    const textDecoder = new TextDecoder('utf-8');
    let baseMediaDecodeTime = 0;

    mdatTrafPairs.forEach(function(pair) {
      const mdatBox = pair.mdat;
      const trafBox = pair.traf;
      // zero or one per traf.
      const tfdtBox = findBox(trafBox, ['tfdt'])[0];
      // zero or one per traf.
      const tfhdBox = findBox(trafBox, ['tfhd'])[0];
      // zero or more per traf.
      const trunBoxes = findBox(trafBox, ['trun']);

      if (tfdtBox) {
        baseMediaDecodeTime = parseTfdt(tfdtBox).baseMediaDecodeTime;
      }

      // Samples can only be located with both a trun and a tfhd.
      if (!trunBoxes.length || !tfhdBox) {
        return;
      }

      const samples = parseSamples(trunBoxes, baseMediaDecodeTime, tfhdBox);
      let mdatOffset = 0;

      samples.forEach(function(sample) {
        // Extract this sample's data from the mdat box.
        // WebVTT Sample format:
        // Exactly one VTTEmptyCueBox box
        // OR one or more VTTCueBox boxes.
        const sampleData = mdatBox.slice(mdatOffset, mdatOffset + sample.size);
        // Advance to the next sample regardless of how this one parses.
        mdatOffset += sample.size;

        // A single vtte box marks an empty (cue-less) sample.
        if (findBox(sampleData, ['vtte'])[0]) {
          return;
        }

        // parseSamples can produce BigInt timestamps for large
        // baseMediaDecodeTime values; BigInt cannot be mixed with Number
        // arithmetic, so normalize before computing cue times in seconds.
        const pts = typeof sample.pts === 'bigint' ? Number(sample.pts) : sample.pts;
        const start = pts / timescale;
        const end = (pts + sample.duration) / timescale;

        // TODO: Support 'vtta' boxes.
        // VTTAdditionalTextBoxes can be interleaved between VTTCueBoxes.

        findBox(sampleData, ['vttc']).forEach(function(vttcBox) {
          // mandatory payload box, contains the cue text.
          const paylBox = findBox(vttcBox, ['payl'])[0];
          // optional settings box, contains styling.
          const sttgBox = findBox(vttcBox, ['sttg'])[0];
          let cueText;
          let settings;

          if (paylBox) {
            try {
              cueText = textDecoder.decode(paylBox);
            } catch (e) {
              // Best effort: a cue whose payload fails to decode is
              // dropped by the duration/cueText check below.
              console.error(e);
            }
          }

          if (sttgBox) {
            try {
              settings = textDecoder.decode(sttgBox);
            } catch (e) {
              console.error(e);
            }
          }

          // Zero-duration or text-less cues are not emitted.
          if (sample.duration && cueText) {
            vttCues.push({
              cueText,
              start,
              end,
              settings
            });
          }
        });
      });
    });

    return vttCues;
  };
};

module.exports = WebVttParser;
Binary file added test/segments/test-webvtt-init.mp4
Binary file not shown.
Binary file added test/segments/test-webvtt.m4s
Binary file not shown.
39 changes: 39 additions & 0 deletions test/webvtt-parser.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
var segments = require('data-files!segments');
var vttContentSegment = segments['test-webvtt.m4s']();
var vttInitSegment = segments['test-webvtt-init.mp4']();
var WebVttParser = require('../lib/mp4').WebVttParser;
var window = require('global/window');
var webVttParser;

// Each test gets a fresh parser so state captured in the parser's closure
// (e.g. the timescale read from the init segment) does not leak between tests.
QUnit.module('MP4 WebVtt Segment Parser', {
  beforeEach: function() {
    webVttParser = new WebVttParser();
  }
});

// Integration-style test: feeds recorded init + content fixture segments
// through the parser and checks the exact cues produced.
QUnit.test('parse webvtt init and content segments', function(assert) {
  // Init segment sets the timescale.
  webVttParser.init(vttInitSegment);
  assert.ok(webVttParser, 'WebVtt parser created');
  // we need a TextDecoder to test the WebVTT segment parser.
  // NOTE(review): when window.TextDecoder is unavailable, every cue
  // assertion below is silently skipped and the test still passes on the
  // single assert.ok above — consider surfacing the skip explicitly.
  if (window.TextDecoder) {
    const parsedWebVttCues = webVttParser.parseSegment(vttContentSegment);
    // Expected start/end values are sample pts/duration divided by the
    // fixture's timescale; settings is undefined because the fixture cues
    // carry no sttg (styling) boxes.
    const expectedCueValues = [
      {
        cueText: "2024-09-19T20:13:06Z\nen # 863388393",
        start: 1726776786,
        end: 1726776786.9,
        settings: undefined
      },
      {
        cueText: "2024-09-19T20:13:07Z\nen # 863388393",
        start: 1726776787,
        end: 1726776787.9,
        settings: undefined
      }
    ];
    assert.ok(parsedWebVttCues, 'parsed WebVtt Cues are created');
    assert.equal(parsedWebVttCues.length, 2, '2 WebVtt Cues are created');
    assert.deepEqual(parsedWebVttCues, expectedCueValues, 'WebVtt cues are expected values');
  }
});

0 comments on commit 432b036

Please sign in to comment.