-
Notifications
You must be signed in to change notification settings - Fork 211
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: parse WebVTT from fmp4 segments. (#445)
- Loading branch information
Showing
8 changed files
with
262 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
const { parseTrun } = require("../tools/mp4-inspector"); | ||
const { findBox } = require("./probe"); | ||
var window = require('global/window'); | ||
|
||
/**
 * Collects the mdat boxes of a segment and couples each one with the
 * moof > traf box found at the same position, so that sample data and
 * the fragment metadata describing it can be read side by side.
 *
 * @param {Array<Uint8Array>} segment the segment data to search for mdat and traf boxes.
 * @returns {Object[]} one `{ mdat, traf }` entry per mdat box, in order of
 *          appearance; `traf` is undefined when no traf exists at that index.
 */
var getMdatTrafPairs = function(segment) {
  var fragmentBoxes = findBox(segment, ['moof', 'traf']);

  // moofs and mdats come in pairs, so the n-th mdat is matched with the n-th traf
  return findBox(segment, ['mdat']).map(function(mediaData, position) {
    return {
      mdat: mediaData,
      traf: fragmentBoxes[position]
    };
  });
};
|
||
/**
 * Flattens a list of Track Run Boxes into a single list of samples.
 * Fields a trun leaves out (duration, size, composition time offset) are
 * filled in from the track fragment header defaults (or zero), and every
 * sample is stamped with its track id plus absolute decode (dts) and
 * presentation (pts) timestamps, accumulated from the base media decode time.
 *
 * @param {Array<Uint8Array>} truns - The Track Run boxes to be parsed
 * @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt
 * @see ISO-BMFF-12/2015, Section 8.8.12
 * @param {Object} tfhd - The parsed Track Fragment Header
 * @see inspect.parseTfhd
 * @return {Object[]} the parsed samples
 *
 * @see ISO-BMFF-12/2015, Section 8.8.8
 **/
var parseSamples = function(truns, baseMediaDecodeTime, tfhd) {
  var runningDts = baseMediaDecodeTime;
  var fallbackDuration = tfhd.defaultSampleDuration || 0;
  var fallbackSize = tfhd.defaultSampleSize || 0;
  var ownerTrackId = tfhd.trackId;

  return truns.reduce(function(collected, trun) {
    // Note: We currently do not parse the sample table as well
    // as the trun. It's possible some sources will require this.
    // moov > trak > mdia > minf > stbl
    var runSamples = parseTrun(trun).samples;
    var i, entry;

    for (i = 0; i < runSamples.length; i++) {
      entry = runSamples[i];

      if (entry.duration === undefined) {
        entry.duration = fallbackDuration;
      }
      if (entry.size === undefined) {
        entry.size = fallbackSize;
      }
      entry.trackId = ownerTrackId;
      entry.dts = runningDts;
      if (entry.compositionTimeOffset === undefined) {
        entry.compositionTimeOffset = 0;
      }

      // BigInt and Number cannot be mixed in arithmetic, so the offsets are
      // promoted to BigInt whenever the running decode time is one.
      if (typeof runningDts !== 'bigint') {
        entry.pts = runningDts + entry.compositionTimeOffset;
        runningDts += entry.duration;
      } else {
        entry.pts = runningDts + window.BigInt(entry.compositionTimeOffset);
        runningDts += window.BigInt(entry.duration);
      }
    }

    return collected.concat(runSamples);
  }, []);
};
|
||
module.exports = { | ||
getMdatTrafPairs, | ||
parseSamples | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
const { parseTfdt, parseTfhd } = require("../tools/mp4-inspector");
const findBox = require("./find-box");
const { getTimescaleFromMediaHeader } = require("./probe");
const { parseSamples, getMdatTrafPairs } = require("./samples");
|
||
/**
 * Module for parsing WebVTT text and styles from FMP4 segments.
 * Based on the ISO/IEC 14496-30.
 *
 * Usage: construct with `new`, optionally call `init` with the
 * initialization segment to pick up the track timescale, then call
 * `parseSegment` for each content segment.
 */
const WebVttParser = function() {
  // default timescale to 90k
  let timescale = 90e3;

  /**
   * Converts a tick count in the media timescale to seconds.
   * Sample timestamps are BigInt when the base media decode time is a
   * BigInt, and BigInt cannot be mixed with Number in arithmetic, so the
   * value is coerced to Number first (a no-op for Number input).
   *
   * @param {Number|BigInt} ticks a time value in timescale units.
   * @returns {Number} the time in seconds.
   */
  const toSeconds = function(ticks) {
    return Number(ticks) / timescale;
  };

  /**
   * Decodes the contents of a box as text.
   *
   * @param {TextDecoder} textDecoder the decoder to use.
   * @param {Uint8Array} [box] the box data to decode, if present.
   * @returns {String|undefined} the decoded text, or undefined when the box
   *          is absent or decoding fails (the error is logged).
   */
  const decodeBox = function(textDecoder, box) {
    if (!box) {
      return undefined;
    }

    try {
      return textDecoder.decode(box);
    } catch (e) {
      console.error(e);
      return undefined;
    }
  };

  /**
   * Parses the timescale from the init segment.
   * @param {Array<Uint8Array>} segment The initialization segment to parse the timescale from.
   */
  this.init = function(segment) {
    // We just need the timescale from the init segment.
    const mdhd = findBox(segment, ['moov', 'trak', 'mdia', 'mdhd'])[0];

    if (mdhd) {
      timescale = getTimescaleFromMediaHeader(mdhd);
    }
  };

  /**
   * Parses a WebVTT FMP4 segment.
   * @param {Array<Uint8Array>} segment The content segment to parse the WebVTT cues from.
   * @returns {Object[]} The WebVTT cue text, styling, and timing info as an array of
   *          `{ cueText, start, end, settings }` objects, with `start`/`end` in seconds.
   */
  this.parseSegment = function(segment) {
    const vttCues = [];
    const mdatTrafPairs = getMdatTrafPairs(segment);
    // Declared outside the loop: a fragment without a tfdt box reuses the
    // value from the previous fragment.
    let baseMediaDecodeTime = 0;

    mdatTrafPairs.forEach(function(pair) {
      const mdatBox = pair.mdat;
      const trafBox = pair.traf;

      // An mdat with no traf at the same index is paired with undefined,
      // which cannot be searched for boxes; there is no sample info for it.
      if (!trafBox) {
        return;
      }

      // zero or one.
      const tfdtBox = findBox(trafBox, ['tfdt'])[0];
      // zero or one.
      const tfhdBox = findBox(trafBox, ['tfhd'])[0];
      // zero or more.
      const trunBoxes = findBox(trafBox, ['trun']);

      if (tfdtBox) {
        const tfdt = parseTfdt(tfdtBox);

        baseMediaDecodeTime = tfdt.baseMediaDecodeTime;
      }

      if (!trunBoxes.length || !tfhdBox) {
        return;
      }

      // parseSamples reads defaultSampleDuration, defaultSampleSize and trackId
      // from the *parsed* header, so the raw box bytes must be parsed first.
      const tfhd = parseTfhd(tfhdBox);
      const samples = parseSamples(trunBoxes, baseMediaDecodeTime, tfhd);
      // decode utf8 payload; one decoder is enough for every sample in the fragment.
      const textDecoder = new TextDecoder('utf-8');
      let mdatOffset = 0;

      samples.forEach(function(sample) {
        // extract sample data from the mdat box.
        // WebVTT Sample format:
        // Exactly one VTTEmptyCueBox box
        // OR one or more VTTCueBox boxes.
        const sampleData = mdatBox.slice(mdatOffset, mdatOffset + sample.size);

        // Samples are laid out back to back, so always advance past this one,
        // whether or not it yields any cues.
        mdatOffset += sample.size;

        // single vtte box means an empty cue: nothing to emit.
        if (findBox(sampleData, ['vtte'])[0]) {
          return;
        }

        // TODO: Support 'vtta' boxes.
        // VTTAdditionalTextBoxes can be interleaved between VTTCueBoxes.

        const start = toSeconds(sample.pts);
        const end = (Number(sample.pts) + sample.duration) / timescale;

        findBox(sampleData, ['vttc']).forEach(function(vttcBox) {
          // mandatory payload box, contains cue text.
          const cueText = decodeBox(textDecoder, findBox(vttcBox, ['payl'])[0]);
          // optional settings box, contains styling.
          const settings = decodeBox(textDecoder, findBox(vttcBox, ['sttg'])[0]);

          // zero-length cues and cues without text are not emitted.
          if (sample.duration && cueText) {
            vttCues.push({
              cueText,
              start,
              end,
              settings
            });
          }
        });
      });
    });

    return vttCues;
  };
};
|
||
module.exports = WebVttParser; |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
var segments = require('data-files!segments'); | ||
var vttContentSegment = segments['test-webvtt.m4s'](); | ||
var vttInitSegment = segments['test-webvtt-init.mp4'](); | ||
var WebVttParser = require('../lib/mp4').WebVttParser; | ||
var window = require('global/window'); | ||
var webVttParser; | ||
|
||
// Each test gets a fresh parser so timescale state never leaks between tests.
QUnit.module('MP4 WebVtt Segment Parser', {
  beforeEach() {
    webVttParser = new WebVttParser();
  }
});

// Runs the fixture init + content segments through the parser and checks
// the text and timing of the two cues that come out.
QUnit.test('parse webvtt init and content segments', function(assert) {
  // Init segment sets the timescale.
  webVttParser.init(vttInitSegment);
  assert.ok(webVttParser, 'WebVtt parser created');

  // we need a TextDecoder to test the WebVTT segment parser.
  if (!window.TextDecoder) {
    return;
  }

  const firstCue = {
    cueText: "2024-09-19T20:13:06Z\nen # 863388393",
    start: 1726776786,
    end: 1726776786.9,
    settings: undefined
  };
  const secondCue = {
    cueText: "2024-09-19T20:13:07Z\nen # 863388393",
    start: 1726776787,
    end: 1726776787.9,
    settings: undefined
  };
  const actualCues = webVttParser.parseSegment(vttContentSegment);

  assert.ok(actualCues, 'parsed WebVtt Cues are created');
  assert.equal(actualCues.length, 2, '2 WebVtt Cues are created');
  assert.deepEqual(actualCues, [firstCue, secondCue], 'WebVtt cues are expected values');
});