From 1a713b7997138b48f7624d9b9683f67e88cb482b Mon Sep 17 00:00:00 2001 From: Borewit Date: Thu, 15 Aug 2024 15:20:33 +0200 Subject: [PATCH] Add `option.mkvUseIndex` to use index to skip Matroska cluster elements --- README.md | 6 +++++- lib/matroska/MatroskaParser.ts | 28 ++++++++++++++++++++++++++-- lib/type.ts | 10 ++++++++++ test/test-file-matroska.ts | 34 ++++++++++++++++++++++++++++------ 4 files changed, 69 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8aa6cff02..c68440c2a 100644 --- a/README.md +++ b/README.md @@ -338,7 +338,11 @@ import { parseFile, selectCover } from 'music-metadata'; - `observer: (update: MetadataEvent) => void;`: Will be called after each change to `common` (generic) tag, or `format` properties. - `skipCovers`: default: `false`, if set to `true`, it will not return embedded cover-art (images). - `skipPostHeaders? boolean` default: `false`, if set to `true`, it will not search all the entire track for additional headers. Only recommenced to use in combination with streams. -- `includeChapters` default: `false`, if set to `true`, it will parse chapters (currently only MP4 files). _experimental functionality_ +- `mkvUseIndex` default: `false`, if set to `true`, in Matroska based files, use the _SeekHead_ element index to skip _segment/cluster_ elements. + _experimental functionality_ + Can have a significant performance impact if enabled. + Possible side effect can be that certain metadata may be skipped, depending on the index. + If there is no _SeekHead_ element present in the Matroska file, this flag has no effect. Although in most cases duration is included, in some cases it requires `music-metadata` parsing the entire file. To enforce parsing the entire file if needed you should set `duration` to `true`. 
diff --git a/lib/matroska/MatroskaParser.ts b/lib/matroska/MatroskaParser.ts index 7a67e7274..bf6e18f73 100644 --- a/lib/matroska/MatroskaParser.ts +++ b/lib/matroska/MatroskaParser.ts @@ -4,7 +4,7 @@ import type { ITokenizer } from 'strtok3'; import type { INativeMetadataCollector } from '../common/MetadataCollector.js'; import { BasicParser } from '../common/BasicParser.js'; import { matroskaDtd } from './MatroskaDtd.js'; -import { type IAttachments, type ISegmentInformation, type ITags, type ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js'; +import { type IAttachments, type ISeekHead, type ISegmentInformation, type ITags, type ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js'; import type { AnyTagValue, IOptions, ITrackInfo } from '../type.js'; import type { ITokenParser } from '../ParserFactory.js'; @@ -21,6 +21,14 @@ const debug = initDebug('music-metadata:parser:matroska'); */ export class MatroskaParser extends BasicParser { + private seekHead: ISeekHead | undefined; + private seekHeadOffset = 0; + /** + * Use index to skip multiple segment/cluster elements at once. + * Significant performance impact + */ + private flagUseIndexToSkipClusters = false; + /** * Initialize parser with output (metadata), input (tokenizer) & parsing options (options). * @param {INativeMetadataCollector} metadata Output @@ -29,6 +37,7 @@ export class MatroskaParser extends BasicParser { */ public init(metadata: INativeMetadataCollector, tokenizer: ITokenizer, options: IOptions): ITokenParser { super.init(metadata, tokenizer, options); + this.flagUseIndexToSkipClusters = options.mkvUseIndex ?? 
false; return this; } @@ -46,18 +55,33 @@ export class MatroskaParser extends BasicParser { debug(`Skip element: name=${element.name}, id=0x${element.id.toString(16)}`); return ParseAction.IgnoreElement; case 0x1f43b675: // cluster + if (this.flagUseIndexToSkipClusters && this.seekHead) { + const index = this.seekHead.seek.find(index => index.position + this.seekHeadOffset > this.tokenizer.position); + if (index) { + // Go to next index position + const ignoreSize = index.position + this.seekHeadOffset - this.tokenizer.position; + debug(`Use index to go to next position, ignoring ${ignoreSize} bytes`); + this.tokenizer.ignore(ignoreSize); + return ParseAction.SkipElement; + } + } return ParseAction.IgnoreElement; default: return ParseAction.ReadNext; } }, - elementValue: async (element, value) => { + elementValue: async (element, value, offset) => { debug(`Received: name=${element.name}, value=${value}`); switch (element.id) { case 0x4282: // docType this.metadata.setFormat('container', `EBML/${value}`); break; + case 0x114d9b74: + this.seekHead = value as unknown as ISeekHead + this.seekHeadOffset = offset; + break; + case 0x1549a966: { // Info (Segment Information) const info = value as ISegmentInformation; const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000; diff --git a/lib/type.ts b/lib/type.ts index 8682a5364..789f7391a 100644 --- a/lib/type.ts +++ b/lib/type.ts @@ -625,6 +625,16 @@ export interface IOptions { * Set observer for async callbacks to common or format. */ observer?: Observer; + + /** + * In Matroska based files, use the _SeekHead_ element index to skip _segment/cluster_ elements. + * By default, disabled + * Can have a significant performance impact if enabled. + * Possible side effect can be that certain metadata may be skipped, depending on the index. 
+ * If there is no _SeekHead_ element present in the Matroska file, this flag has no effect + * Ref: https://www.matroska.org/technical/diagram.html + */ + mkvUseIndex?: boolean; } export interface IApeHeader extends IOptions { diff --git a/test/test-file-matroska.ts b/test/test-file-matroska.ts index e2dda7d40..41bd4188c 100644 --- a/test/test-file-matroska.ts +++ b/test/test-file-matroska.ts @@ -40,11 +40,11 @@ describe('Matroska formats', () => { assert.strictEqual(format.numberOfChannels, 2, 'format.numberOfChannels'); }); - it('parse: "02 - Poxfil - Solid Ground (5 sec).mka"', async () => { + async function parsePoxfile(options?: mm.IOptions) { const mkaPath = path.join(matroskaSamplePath, '02 - Poxfil - Solid Ground (5 sec).mka'); - const {format, common} = await mm.parseFile(mkaPath, {duration: false}); + const {format, common} = await mm.parseFile(mkaPath, options); // format chunk information assert.strictEqual(format.container, 'EBML/matroska', 'format.container'); @@ -54,6 +54,14 @@ describe('Matroska formats', () => { assert.strictEqual(format.numberOfChannels, 2, 'format.numberOfChannels'); verifyTrackSolidGround(common); + } + + it('parse: "02 - Poxfil - Solid Ground (5 sec).mka"', () => { + return parsePoxfile(); + }); + + it('parse: "02 - Poxfil - Solid Ground (5 sec).mka" with `mkvUseIndex` flag', () => { + return parsePoxfile({mkvUseIndex: true}); }); }); @@ -116,7 +124,7 @@ describe('Matroska formats', () => { assert.strictEqual(common.encodersettings, '--bitrate 96 --vbr', 'common.encodersettings'); }); - it('should ignore trailing null characters', async () => { + it('shoud ignore trailing null characters', async () => { const webmPath = path.join(matroskaSamplePath, 'fixture-null.webm'); const {format} = await mm.parseFile(webmPath, {duration: false}); assert.strictEqual(format.container, 'EBML/webm', 'format.container'); @@ -127,11 +135,10 @@ describe('Matroska formats', () => { // https://github.com/Borewit/music-metadata/issues/384 
describe('Multiple audio tracks', () => { - it('parse: "matroska-test-w1-test5-short.mkv"', async () => { - + async function parse(options?: mm.IOptions) { const mkvPath = path.join(matroskaSamplePath, 'matroska-test-w1-test5-short.mkv'); - const {format, common} = await mm.parseFile(mkvPath); + const {format, common} = await mm.parseFile(mkvPath, options); assert.deepEqual(format.container, 'EBML/matroska', 'format.container'); assert.deepEqual(format.tagTypes, [ 'matroska' ], 'format.tagTypes'); @@ -143,6 +150,14 @@ describe('Matroska formats', () => { assert.deepEqual(common.title, 'Elephant Dreams', 'common.title'); assert.deepEqual(common.album, 'Matroska Test Files - Wave 1', 'common.album'); + } + + it('parse: "matroska-test-w1-test5-short.mkv"', () => { + return parse(); + }); + + it('parse: "matroska-test-w1-test5-short.mkv `mkvUseIndex` flag', () => { + return parse({mkvUseIndex: true}); }); }); @@ -160,6 +175,13 @@ describe('Matroska formats', () => { assert.strictEqual(format.numberOfChannels, 1, 'format.numberOfChannels'); }); + it('Parse stream with `mkvUseIndex` flag', async () => { + const {format} = await mm.parseFile(mkvPath, {mkvUseIndex: true}); + assert.strictEqual(format.container, 'EBML/webm', 'format.container'); + assert.strictEqual(format.codec, 'OPUS', 'format.codec'); + assert.strictEqual(format.numberOfChannels, 1, 'format.numberOfChannels'); + }); + }); describe('Handle corrupt Matroska file', () => {