Skip to content

Commit

Permalink
Parse EBML (Matroska, webm) using async notification
Browse files Browse the repository at this point in the history
  • Loading branch information
Borewit committed Aug 15, 2024
1 parent b5b84ff commit 1fe4049
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 144 deletions.
96 changes: 62 additions & 34 deletions lib/ebml/EbmlIterator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,21 @@ export interface ILinkedElementType extends IElementType {
readonly container?: { [id: number]: ILinkedElementType; };
}

/**
* Instruction returned by `IElementListener.startNext` for an element whose header has just been read.
* It tells the iterator what to do with that element's payload.
* NOTE(review): the numeric value 1 is not assigned to any member; confirm whether that gap is intentional.
*/
export enum ParseAction {
ReadNext = 0, // Read this element (descending into it when it is a container), report its value to the listener, then continue with the next element
IgnoreElement = 2, // Do not parse this element: its payload (element.len bytes) is skipped via the tokenizer
SkipSiblings = 3, // Skip everything up to the end position of the enclosing container, i.e. this element and all remaining elements at the same level
TerminateParsing = 4, // Stop parsing: the container currently being parsed returns the tree collected so far
SkipElement = 5 // Consider the element as already read: nothing is consumed, the tokenizer position is assumed to be at the next element
}

/**
* @return true, to quit the parser
*/
export type ElementListener = (dtdElement: ILinkedElementType, value: ValueType) => Promise<boolean>;
/**
* Callbacks through which the EBML iterator notifies its caller while walking the element tree.
*/
export type IElementListener = {
// Called synchronously for each element known to the DTD, after its header has been read and before its payload is touched.
// The returned ParseAction decides whether the element is read, ignored, skipped, or whether parsing stops.
startNext: (dtdElement: ILinkedElementType) => ParseAction,
// Called (and awaited) after an element has been read under ParseAction.ReadNext.
// `value` is the parsed value, or the sub-tree when the element is a container.
// `offset` is the tokenizer position at which the element's header started.
elementValue: (dtdElement: ILinkedElementType, value: ValueType, offset: number) => Promise<void>
}

/**
* Extensible Binary Meta Language (EBML) iterator
Expand All @@ -34,7 +45,6 @@ export class EbmlIterator {

private ebmlMaxIDLength = 4;
private ebmlMaxSizeLength = 8;
private cancel = false;

/**
* @param {ITokenizer} tokenizer Input
Expand All @@ -49,17 +59,17 @@ export class EbmlIterator {
this.parserMap.set(DataType.float, e => this.readFloat(e));
}

public async iterate(dtdElement: IElementType, posDone: number, listener: ElementListener): Promise<ITree> {
this.cancel = false;
public async iterate(dtdElement: IElementType, posDone: number, listener: IElementListener): Promise<ITree> {
return this.parseContainer(linkParents(dtdElement), posDone, listener);
}

private async parseContainer(dtdElement: ILinkedElementType, posDone: number, listener: ElementListener): Promise<ITree> {
private async parseContainer(dtdElement: ILinkedElementType, posDone: number, listener: IElementListener): Promise<ITree> {
const tree: ITree = {};
while (this.tokenizer.position < posDone && !this.cancel) {
while (this.tokenizer.position < posDone) {
let element: IHeader;
const elementPosition= this.tokenizer.position;
try {
element = await this.readElement();
element = await this.readElement();
} catch (error) {
if (error instanceof EndOfStreamError) {
break;
Expand All @@ -68,35 +78,53 @@ export class EbmlIterator {
}
const child = (dtdElement.container as { [id: number]: ILinkedElementType; })[element.id];
if (child) {
if (child.ignore) {
debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container}`);
await this.tokenizer.ignore(element.len);
} else {
if (element.id === 0x1F43B675) {
// Hack to ignore remaining segment, when cluster element received
// await this.tokenizer.ignore(posDone - this.tokenizer.position);
// break;
}
debug(`Reading element: name=${getElementPath(child)}{id=0x${element.id}, container=${!!child.container}}`);
if (child.container) {
const res = await this.parseContainer(child, element.len >= 0 ? this.tokenizer.position + element.len : -1, listener);
if (child.multiple) {
if (!tree[child.name]) {
tree[child.name] = [];
const action = listener.startNext(child);
switch (action) {
case ParseAction.ReadNext: {
if (element.id === 0x1F43B675) {
// Hack to ignore remaining segment, when cluster element received
// await this.tokenizer.ignore(posDone - this.tokenizer.position);
// break;
}
debug(`Read element: name=${getElementPath(child)}{id=0x${element.id.toString(16)}, container=${!!child.container}} at position=${elementPosition}`);
if (child.container) {
const res = await this.parseContainer(child, element.len >= 0 ? this.tokenizer.position + element.len : -1, listener);
if (child.multiple) {
if (!tree[child.name]) {
tree[child.name] = [];
}
(tree[child.name] as ITree[]).push(res);
} else {
tree[child.name] = res;
}
(tree[child.name] as ITree[]).push(res);
await listener.elementValue(child, res, elementPosition);
} else {
tree[child.name] = res;
}
this.cancel = await listener(child, res);
} else {
const parser = this.parserMap.get(child.value as DataType);
if (typeof parser === 'function') {
const value = await parser(element);
tree[child.name] = value;
this.cancel = await listener(child, value);
const parser = this.parserMap.get(child.value as DataType);
if (typeof parser === 'function') {
const value = await parser(element);
tree[child.name] = value;
await listener.elementValue(child, value, elementPosition);
}
}
}
} break;

case ParseAction.SkipElement:
debug(`Go to next element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`);
break;

case ParseAction.IgnoreElement:
debug(`Ignore element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`);
await this.tokenizer.ignore(element.len);
break;

case ParseAction.SkipSiblings:
debug(`Ignore remaining container, at: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`);
await this.tokenizer.ignore(posDone - this.tokenizer.position);
break;

case ParseAction.TerminateParsing:
debug(`Terminate parsing at element: name=${getElementPath(child)}, element.id=0x${element.id}, container=${!!child.container} at position=${elementPosition}`);
return tree;
}
} else {
switch (element.id) {
Expand All @@ -105,7 +133,7 @@ export class EbmlIterator {
await this.tokenizer.ignore(element.len);
break;
default:
debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown child: id=${element.id.toString(16)}`);
debug(`parseEbml: parent=${getElementPath(dtdElement)}, unknown child: id=${element.id.toString(16)} at position=${elementPosition}`);
this.padding += element.len;
await this.tokenizer.ignore(element.len);
}
Expand Down
1 change: 0 additions & 1 deletion lib/ebml/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ export interface IElementType {
readonly value?: DataType;
readonly container?: { [id: number]: IElementType; };
readonly multiple?: boolean;
readonly ignore?: boolean;
}

export interface IEbmlDoc {
Expand Down
12 changes: 6 additions & 6 deletions lib/matroska/MatroskaDtd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ export const matroskaDtd: IElementType = {
name: 'segment',
container: {

// Meta Seek Information
// Meta Seek Information (also known as MetaSeek)
0x114d9b74: {
name: 'seekHead',
container: {
0x4dbb: {
name: 'seek',
multiple: true,
container: {
0x53ab: {name: 'seekId', value: DataType.binary},
0x53ac: {name: 'seekPosition', value: DataType.uint}
0x53ab: {name: 'id', value: DataType.binary},
0x53ac: {name: 'position', value: DataType.uint}
}
}
}
Expand Down Expand Up @@ -69,8 +70,8 @@ export const matroskaDtd: IElementType = {
0x58d7: {name: 'silentTracks ', multiple: true},
0xa7: {name: 'position', value: DataType.uid},
0xab: {name: 'prevSize', value: DataType.uid},
0xa0: {name: 'blockGroup', ignore: true},
0xa3: {name: 'simpleBlock', ignore: true}
0xa0: {name: 'blockGroup'},
0xa3: {name: 'simpleBlock'}
}
},

Expand Down Expand Up @@ -174,7 +175,6 @@ export const matroskaDtd: IElementType = {
// Cueing Data
0x1c53bb6b: {
name: 'cues',
ignore: true,
container: {
0xbb: {
name: 'cuePoint',
Expand Down
176 changes: 92 additions & 84 deletions lib/matroska/MatroskaParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import type { ITokenizer } from 'strtok3';
import type { INativeMetadataCollector } from '../common/MetadataCollector.js';
import { BasicParser } from '../common/BasicParser.js';
import { matroskaDtd } from './MatroskaDtd.js';
import { IAttachments, type IMatroskaDoc, IMatroskaSegment, ISegmentInformation, ITags, ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js';
import { type IAttachments, type ISegmentInformation, type ITags, type ITrackElement, type ITrackEntry, TargetType, TrackType } from './types.js';

import type { AnyTagValue, IOptions, ITrackInfo } from '../type.js';
import type { ITokenParser } from '../ParserFactory.js';
import { EbmlIterator } from '../ebml/EbmlIterator.js';
import { EbmlIterator, ParseAction } from '../ebml/EbmlIterator.js';

const debug = initDebug('music-metadata:parser:matroska');

Expand Down Expand Up @@ -38,95 +38,103 @@ export class MatroskaParser extends BasicParser {

const matroskaIterator = new EbmlIterator(this.tokenizer);
debug('Initializing DTD end MatroskaIterator');
const matroska = await matroskaIterator.iterate(matroskaDtd, containerSize, async (element, value) => {
debug(`Received: name=${element.name}, value=${value}`);
switch (element.id) {
case 0x4282: // docType
this.metadata.setFormat('container', `EBML/${value}`);
break;

case 0x1549a966: {// info
const info = value as ISegmentInformation;
const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000;
if (typeof info.duration === 'number') {
const duration = info.duration * timecodeScale / 1000000000;
await this.addTag('segment:title', info.title);
this.metadata.setFormat('duration', Number(duration));
}
await matroskaIterator.iterate(matroskaDtd, containerSize, {
startNext: (element) => {
switch (element.id) {
// case 0x1f43b675: // cluster
case 0x1c53bb6b: // Cueing Data
debug(`Skip element: name=${element.name}, id=0x${element.id.toString(16)}`);
return ParseAction.IgnoreElement;
case 0x1f43b675: // cluster
return ParseAction.IgnoreElement;
default:
return ParseAction.ReadNext;
}
break;

case 0x1654ae6b: { // tracks
const audioTracks = value as ITrackElement;
if (audioTracks?.entries) {
audioTracks.entries.forEach(entry => {
const stream: ITrackInfo = {
codecName: entry.codecID.replace('A_', '').replace('V_', ''),
codecSettings: entry.codecSettings,
flagDefault: entry.flagDefault,
flagLacing: entry.flagLacing,
flagEnabled: entry.flagEnabled,
language: entry.language,
name: entry.name,
type: entry.trackType,
audio: entry.audio,
video: entry.video
};
this.metadata.addStreamInfo(stream);
});

const audioTrack = audioTracks.entries
.filter(entry => entry.trackType === TrackType.audio)
.reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => {
if (!acc) return cur;
if (cur.flagDefault && !acc.flagDefault) return cur;
if (cur.trackNumber < acc.trackNumber) return cur;
return acc;
}, null);

if (audioTrack) {
this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', ''));
this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency);
this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels);
},
elementValue: async (element, value) => {
debug(`Received: name=${element.name}, value=${value}`);
switch (element.id) {
case 0x4282: // docType
this.metadata.setFormat('container', `EBML/${value}`);
break;

case 0x1549a966: { // Info (Segment Information)
const info = value as ISegmentInformation;
const timecodeScale = info.timecodeScale ? info.timecodeScale : 1000000;
if (typeof info.duration === 'number') {
const duration = info.duration * timecodeScale / 1000000000;
await this.addTag('segment:title', info.title);
this.metadata.setFormat('duration', Number(duration));
}
}
}
break;

case 0x1254c367: { // 'tags'
const tags = value as unknown as ITags;
await Promise.all(tags.tag.map(async tag => {
const target = tag.target;
const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? target.targetType : 'track');
await Promise.all(tag.simpleTags.map(async simpleTag => {
const value = simpleTag.string ? simpleTag.string : simpleTag.binary;
await this.addTag(`${targetType}:${simpleTag.name}`, value);
break;

case 0x1654ae6b: { // tracks
const audioTracks = value as ITrackElement;
if (audioTracks?.entries) {
audioTracks.entries.forEach(entry => {
const stream: ITrackInfo = {
codecName: entry.codecID.replace('A_', '').replace('V_', ''),
codecSettings: entry.codecSettings,
flagDefault: entry.flagDefault,
flagLacing: entry.flagLacing,
flagEnabled: entry.flagEnabled,
language: entry.language,
name: entry.name,
type: entry.trackType,
audio: entry.audio,
video: entry.video
};
this.metadata.addStreamInfo(stream);
});

const audioTrack = audioTracks.entries
.filter(entry => entry.trackType === TrackType.audio)
.reduce((acc: ITrackEntry | null, cur: ITrackEntry): ITrackEntry => {
if (!acc) return cur;
if (cur.flagDefault && !acc.flagDefault) return cur;
if (cur.trackNumber < acc.trackNumber) return cur;
return acc;
}, null);

if (audioTrack) {
this.metadata.setFormat('codec', audioTrack.codecID.replace('A_', ''));
this.metadata.setFormat('sampleRate', audioTrack.audio.samplingFrequency);
this.metadata.setFormat('numberOfChannels', audioTrack.audio.channels);
}
}
}
break;

case 0x1254c367: { // tags
const tags = value as unknown as ITags;
await Promise.all(tags.tag.map(async tag => {
const target = tag.target;
const targetType = target?.targetTypeValue ? TargetType[target.targetTypeValue] : (target?.targetType ? target.targetType : 'track');
await Promise.all(tag.simpleTags.map(async simpleTag => {
const value = simpleTag.string ? simpleTag.string : simpleTag.binary;
await this.addTag(`${targetType}:${simpleTag.name}`, value);
}));
}));
}));
}
break;

case 0x1941a469: { //attachments
const attachments = value as unknown as IAttachments;
await Promise.all(attachments.attachedFiles
.filter(file => file.mimeType.startsWith('image/'))
.map(file => this.addTag('picture', {
data: file.data,
format: file.mimeType,
description: file.description,
name: file.name
})));
}
break;

case 0x1941a469: { // attachments
const attachments = value as unknown as IAttachments;
await Promise.all(attachments.attachedFiles
.filter(file => file.mimeType.startsWith('image/'))
.map(file => this.addTag('picture', {
data: file.data,
format: file.mimeType,
description: file.description,
name: file.name
})));

}
break;
}
break;

//case 'cluster':
case 0x18538067: // segment
debug(`Cancel EBML parser after element ${element.name}`);
return true; // Quite EBML parser, we got the metadata we need
}
return false;
}) as unknown as IMatroskaDoc;
});
}

private async addTag(tagId: string, value: AnyTagValue): Promise<void> {
Expand Down
Loading

0 comments on commit 1fe4049

Please sign in to comment.