Skip to content

Commit

Permalink
feat: parse WebVTT from fmp4 segments. (#445)
Browse files Browse the repository at this point in the history
  • Loading branch information
adrums86 authored Sep 23, 2024
1 parent f4b3162 commit 432b036
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 73 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ Set to `true` to remux audio and video into a single MP4 segment.

This module reads CEA-608 captions out of FMP4 segments.

#### WebVttParser

`muxjs.mp4.WebVttParser`

This module reads WebVTT text out of FMP4 segments.

#### Tools

`muxjs.mp4.tools`
Expand Down
74 changes: 2 additions & 72 deletions lib/mp4/caption-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ var discardEmulationPreventionBytes = require('../tools/caption-packet-parser').
var CaptionStream = require('../m2ts/caption-stream').CaptionStream;
var findBox = require('../mp4/find-box.js');
var parseTfdt = require('../tools/parse-tfdt.js');
var parseTrun = require('../tools/parse-trun.js');
var parseTfhd = require('../tools/parse-tfhd.js');
var window = require('global/window');
var { getMdatTrafPairs, parseSamples } = require('./samples.js');

/**
* Maps an offset in the mdat to a sample based on the size of the samples.
Expand Down Expand Up @@ -118,62 +117,6 @@ var findSeiNals = function(avcStream, samples, trackId) {
return result;
};

/**
* Parses sample information out of Track Run Boxes and calculates
* the absolute presentation and decode timestamps of each sample.
*
* @param {Array<Uint8Array>} truns - The Trun Run boxes to be parsed
* @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt
@see ISO-BMFF-12/2015, Section 8.8.12
* @param {Object} tfhd - The parsed Track Fragment Header
* @see inspect.parseTfhd
* @return {Object[]} the parsed samples
*
* @see ISO-BMFF-12/2015, Section 8.8.8
**/
var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
var currentDts = baseMediaDecodeTime;
var defaultSampleDuration = tfhd.defaultSampleDuration || 0;
var defaultSampleSize = tfhd.defaultSampleSize || 0;
var trackId = tfhd.trackId;
var allSamples = [];

truns.forEach(function(trun) {
// Note: We currently do not parse the sample table as well
// as the trun. It's possible some sources will require this.
// moov > trak > mdia > minf > stbl
var trackRun = parseTrun(trun);
var samples = trackRun.samples;

samples.forEach(function(sample) {
if (sample.duration === undefined) {
sample.duration = defaultSampleDuration;
}
if (sample.size === undefined) {
sample.size = defaultSampleSize;
}
sample.trackId = trackId;
sample.dts = currentDts;
if (sample.compositionTimeOffset === undefined) {
sample.compositionTimeOffset = 0;
}

if (typeof currentDts === 'bigint') {
sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset);
currentDts += window.BigInt(sample.duration);

} else {
sample.pts = currentDts + sample.compositionTimeOffset;
currentDts += sample.duration;
}
});

allSamples = allSamples.concat(samples);
});

return allSamples;
};

/**
* Parses out caption nals from an FMP4 segment's video tracks.
*
Expand All @@ -183,21 +126,8 @@ var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
* a list of seiNals found in that track
**/
var parseCaptionNals = function(segment, videoTrackId) {
// To get the samples
var trafs = findBox(segment, ['moof', 'traf']);
// To get SEI NAL units
var mdats = findBox(segment, ['mdat']);
var captionNals = {};
var mdatTrafPairs = [];

// Pair up each traf with a mdat as moofs and mdats are in pairs
mdats.forEach(function(mdat, index) {
var matchingTraf = trafs[index];
mdatTrafPairs.push({
mdat: mdat,
traf: matchingTraf
});
});
var mdatTrafPairs = getMdatTrafPairs(segment);

mdatTrafPairs.forEach(function(pair) {
var mdat = pair.mdat;
Expand Down
3 changes: 2 additions & 1 deletion lib/mp4/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ module.exports = {
Transmuxer: require('./transmuxer').Transmuxer,
AudioSegmentStream: require('./transmuxer').AudioSegmentStream,
VideoSegmentStream: require('./transmuxer').VideoSegmentStream,
CaptionParser: require('./caption-parser')
CaptionParser: require('./caption-parser'),
WebVttParser: require('./webvtt-parser')
};
87 changes: 87 additions & 0 deletions lib/mp4/samples.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
const { parseTrun } = require("../tools/mp4-inspector");
const { findBox } = require("./probe");
var window = require('global/window');

/**
 * Utility function for parsing data from mdat boxes.
 * @param {Array<Uint8Array>} segment the segment data to create mdat/traf pairs from.
 * @returns mdat and traf boxes paired up for easier parsing.
 */
var getMdatTrafPairs = function(segment) {
  var trafs = findBox(segment, ['moof', 'traf']);
  var mdats = findBox(segment, ['mdat']);

  // moofs and mdats come in pairs, so the traf at the same index
  // belongs to this mdat. (If the counts ever differ, traf is undefined
  // for the unmatched mdats, same as before.)
  return mdats.map(function(mdat, index) {
    return {
      mdat: mdat,
      traf: trafs[index]
    };
  });
};

/**
 * Parses sample information out of Track Run Boxes and calculates
 * the absolute presentation and decode timestamps of each sample.
 *
 * @param {Array<Uint8Array>} truns - The trun (Track Run) boxes to be parsed
 * @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt
 *   @see ISO-BMFF-12/2015, Section 8.8.12
 * @param {Object} tfhd - The parsed Track Fragment Header
 *   @see inspect.parseTfhd
 * @return {Object[]} the parsed samples
 *
 * @see ISO-BMFF-12/2015, Section 8.8.8
 **/
var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
  var defaultSampleDuration = tfhd.defaultSampleDuration || 0;
  var defaultSampleSize = tfhd.defaultSampleSize || 0;
  var trackId = tfhd.trackId;
  var currentDts = baseMediaDecodeTime;
  var allSamples = [];

  truns.forEach(function(trun) {
    // Note: the sample table (moov > trak > mdia > minf > stbl) is not
    // consulted here; only the trun is parsed. It's possible some sources
    // will require the sample table as well.
    var samples = parseTrun(trun).samples;

    samples.forEach(function(sample) {
      // Fall back to the tfhd defaults when the trun omits per-sample values.
      if (sample.duration === undefined) {
        sample.duration = defaultSampleDuration;
      }
      if (sample.size === undefined) {
        sample.size = defaultSampleSize;
      }
      if (sample.compositionTimeOffset === undefined) {
        sample.compositionTimeOffset = 0;
      }
      sample.trackId = trackId;
      sample.dts = currentDts;

      // BigInt and Number cannot be mixed in arithmetic, so match the
      // accumulator's type when advancing the decode timestamp.
      if (typeof currentDts === 'bigint') {
        sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset);
        currentDts += window.BigInt(sample.duration);
      } else {
        sample.pts = currentDts + sample.compositionTimeOffset;
        currentDts += sample.duration;
      }
    });

    allSamples = allSamples.concat(samples);
  });

  return allSamples;
};

module.exports = {
getMdatTrafPairs,
parseSamples
};
126 changes: 126 additions & 0 deletions lib/mp4/webvtt-parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
const { parseTfdt } = require("../tools/mp4-inspector");
const findBox = require("./find-box");
const { getTimescaleFromMediaHeader } = require("./probe");
const { parseSamples, getMdatTrafPairs } = require("./samples");

/**
 * Module for parsing WebVTT text and styles from FMP4 segments.
 * Based on the ISO/IEC 14496-30.
 */
const WebVttParser = function() {
  // default timescale to 90k
  let timescale = 90e3;

  /**
   * Parses the timescale from the init segment.
   * @param {Array<Uint8Array>} segment The initialization segment to parse the timescale from.
   */
  this.init = function(segment) {
    // We just need the timescale from the init segment.
    const mdhd = findBox(segment, ['moov', 'trak', 'mdia', 'mdhd'])[0];

    if (mdhd) {
      timescale = getTimescaleFromMediaHeader(mdhd);
    }
  };

  /**
   * Parses a WebVTT FMP4 segment.
   * @param {Array<Uint8Array>} segment The content segment to parse the WebVTT cues from.
   * @returns {Object[]} The WebVTT cue text, styling, and timing info as an array of cue objects.
   */
  this.parseSegment = function(segment) {
    const vttCues = [];
    const mdatTrafPairs = getMdatTrafPairs(segment);
    // Create the UTF-8 decoder once per segment instead of once per sample.
    const textDecoder = new TextDecoder('utf-8');
    let baseMediaDecodeTime = 0;

    mdatTrafPairs.forEach(function(pair) {
      const mdatBox = pair.mdat;
      const trafBox = pair.traf;
      // zero or one per traf.
      const tfdtBox = findBox(trafBox, ['tfdt'])[0];
      // zero or one per traf.
      const tfhdBox = findBox(trafBox, ['tfhd'])[0];
      // zero or more per traf.
      const trunBoxes = findBox(trafBox, ['trun']);

      if (tfdtBox) {
        baseMediaDecodeTime = parseTfdt(tfdtBox).baseMediaDecodeTime;
      }

      // Samples can only be located with both a trun and a tfhd.
      if (!trunBoxes.length || !tfhdBox) {
        return;
      }

      const samples = parseSamples(trunBoxes, baseMediaDecodeTime, tfhdBox);
      let mdatOffset = 0;

      samples.forEach(function(sample) {
        // Extract this sample's data from the mdat box.
        // WebVTT Sample format:
        // Exactly one VTTEmptyCueBox box
        // OR one or more VTTCueBox boxes.
        const sampleData = mdatBox.slice(mdatOffset, mdatOffset + sample.size);
        // Advance to the next sample regardless of how this one parses.
        mdatOffset += sample.size;

        // A single vtte box marks an empty (cue-less) sample.
        if (findBox(sampleData, ['vtte'])[0]) {
          return;
        }

        // parseSamples can produce BigInt timestamps for large
        // baseMediaDecodeTime values; BigInt cannot be mixed with Number
        // arithmetic, so normalize before computing cue times in seconds.
        const pts = typeof sample.pts === 'bigint' ? Number(sample.pts) : sample.pts;
        const start = pts / timescale;
        const end = (pts + sample.duration) / timescale;

        // TODO: Support 'vtta' boxes.
        // VTTAdditionalTextBoxes can be interleaved between VTTCueBoxes.

        findBox(sampleData, ['vttc']).forEach(function(vttcBox) {
          // mandatory payload box, contains the cue text.
          const paylBox = findBox(vttcBox, ['payl'])[0];
          // optional settings box, contains styling.
          const sttgBox = findBox(vttcBox, ['sttg'])[0];
          let cueText;
          let settings;

          if (paylBox) {
            try {
              cueText = textDecoder.decode(paylBox);
            } catch (e) {
              // Best effort: a cue whose payload fails to decode is
              // dropped by the duration/cueText check below.
              console.error(e);
            }
          }

          if (sttgBox) {
            try {
              settings = textDecoder.decode(sttgBox);
            } catch (e) {
              console.error(e);
            }
          }

          // Zero-duration or text-less cues are not emitted.
          if (sample.duration && cueText) {
            vttCues.push({
              cueText,
              start,
              end,
              settings
            });
          }
        });
      });
    });

    return vttCues;
  };
};

module.exports = WebVttParser;
Binary file added test/segments/test-webvtt-init.mp4
Binary file not shown.
Binary file added test/segments/test-webvtt.m4s
Binary file not shown.
39 changes: 39 additions & 0 deletions test/webvtt-parser.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
var segments = require('data-files!segments');
var vttContentSegment = segments['test-webvtt.m4s']();
var vttInitSegment = segments['test-webvtt-init.mp4']();
var WebVttParser = require('../lib/mp4').WebVttParser;
var window = require('global/window');
var webVttParser;

// Each test gets a fresh parser so state captured in the parser's closure
// (e.g. the timescale read from the init segment) does not leak between tests.
QUnit.module('MP4 WebVtt Segment Parser', {
  beforeEach: function() {
    webVttParser = new WebVttParser();
  }
});

// Integration-style test: feeds recorded init + content fixture segments
// through the parser and checks the exact cues produced.
QUnit.test('parse webvtt init and content segments', function(assert) {
  // Init segment sets the timescale.
  webVttParser.init(vttInitSegment);
  assert.ok(webVttParser, 'WebVtt parser created');
  // we need a TextDecoder to test the WebVTT segment parser.
  // NOTE(review): when window.TextDecoder is unavailable, every cue
  // assertion below is silently skipped and the test still passes on the
  // single assert.ok above — consider surfacing the skip explicitly.
  if (window.TextDecoder) {
    const parsedWebVttCues = webVttParser.parseSegment(vttContentSegment);
    // Expected start/end values are sample pts/duration divided by the
    // fixture's timescale; settings is undefined because the fixture cues
    // carry no sttg (styling) boxes.
    const expectedCueValues = [
      {
        cueText: "2024-09-19T20:13:06Z\nen # 863388393",
        start: 1726776786,
        end: 1726776786.9,
        settings: undefined
      },
      {
        cueText: "2024-09-19T20:13:07Z\nen # 863388393",
        start: 1726776787,
        end: 1726776787.9,
        settings: undefined
      }
    ];
    assert.ok(parsedWebVttCues, 'parsed WebVtt Cues are created');
    assert.equal(parsedWebVttCues.length, 2, '2 WebVtt Cues are created');
    assert.deepEqual(parsedWebVttCues, expectedCueValues, 'WebVtt cues are expected values');
  }
});

0 comments on commit 432b036

Please sign in to comment.