Skip to content

Commit

Permalink
Merge pull request #219 from VisLab/update-tokenizer
Browse files Browse the repository at this point in the history
First pass at the update of the hed tag parsing.  There are some temporary files that will be removed when transition is complete.
  • Loading branch information
VisLab authored Nov 10, 2024
2 parents fb23337 + a2ab776 commit 8129028
Show file tree
Hide file tree
Showing 16 changed files with 1,131 additions and 194 deletions.
2 changes: 1 addition & 1 deletion common/issues/data.js
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ export default {
invalidExtension: {
hedCode: 'TAG_EXTENSION_INVALID',
level: 'error',
message: stringTemplate`"${'tag'}" appears as an extension of "${'parentTag'}", which does not allow tag extensions.`,
message: stringTemplate`"${'tag'}" appears as an extension of "${'parentTag'}", which does not allow this tag extension.`,
},
emptyTagFound: {
hedCode: 'TAG_EMPTY',
Expand Down
146 changes: 121 additions & 25 deletions parser/parsedHedTag.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import { IssueError } from '../common/issues/issues'
import { getTagLevels } from '../utils/hedStrings'
import { getParentTag, getTagLevels, getTagName } from '../utils/hedStrings'
import ParsedHedSubstring from './parsedHedSubstring'
import { SchemaValueTag } from '../schema/entries'
import TagConverter from './tagConverter'
import { Schema } from '../schema/containers'
import { getRegExp } from './tempRegex'

import RegexClass from '../schema/regExps'

/**
* A parsed HED tag.
Expand Down Expand Up @@ -39,6 +41,30 @@ export default class ParsedHedTag extends ParsedHedSubstring {
*/
_remainder

/**
* The extension if any
*
* @type {string}
* @private
*/
_extension

/**
* The value if any
*
* @type {string}
* @private
*/
_value

/**
* The units if any
*
* @type {string}
* @private
*/
_units

/**
* Constructor.
*
Expand All @@ -48,15 +74,16 @@ export default class ParsedHedTag extends ParsedHedSubstring {
* @throws {IssueError} If tag conversion or parsing fails.
*/
constructor(tagSpec, hedSchemas, hedString) {
super(tagSpec.tag, tagSpec.bounds)

this._convertTag(hedSchemas, hedString, tagSpec)

this.formattedTag = this._formatTag()
super(tagSpec.tag, tagSpec.bounds) // Sets originalTag and originalBounds
this._convertTag(hedSchemas, hedString, tagSpec) // Sets various forms of the tag.
this._handleRemainder()
//this._checkTagAttributes() // Checks various aspects like requireChild or extensionAllowed.
//this.formattedTag = this._formatTag()
//this.formattedTag = this.canonicalTag.toLowerCase()
}

/**
* Convert this tag to long form.
* Convert this tag to its various forms
*
* @param {Schemas} hedSchemas The collection of HED schemas.
* @param {string} hedString The original HED string.
Expand All @@ -83,6 +110,38 @@ export default class ParsedHedTag extends ParsedHedSubstring {
this._schemaTag = schemaTag
this._remainder = remainder
this.canonicalTag = this._schemaTag.longExtend(remainder)
this.formattedTag = this.canonicalTag.toLowerCase()
}

/**
* Handle the remainder portion of the tag.
*
* Placeholder: the method returns immediately when the remainder is the empty string and
* otherwise falls through without doing anything, because the dispatch logic is still commented out.
*
* @throws {IssueError} Intended for failures while parsing the remainder; the current body never throws.
*/
_handleRemainder() {
if (this._remainder === '') {
return
}
// Planned dispatch, not yet enabled:
// if (this.allowsExtensions) {
// this._handleExtension()
// } else if (this.takesValue) { // It's a value tag
// return
// } else {
// //IssueError.generateAndThrow('invalidTag', {tag: this.originalTag})
// }
}

/**
* Handle potential extensions
*
* @throws {IssueError} If parsing the remainder section fails.
*/
_handleExtension() {
this._extension = this._remainder
const testReg = getRegExp('nameClass')
if (!testReg.test(this._extension)) {
IssueError.generateAndThrow('invalidExtension', { tag: this.originalTag })
}
}

/**
Expand Down Expand Up @@ -121,23 +180,6 @@ export default class ParsedHedTag extends ParsedHedSubstring {
}
}

/**
* Format this HED tag by removing newlines and double quotes.
*
* @returns {string} The formatted version of this tag.
*/
_formatTag() {
this.originalTag = this.originalTag.replace('\n', ' ')
let hedTagString = this.canonicalTag.trim()
if (hedTagString.startsWith('"')) {
hedTagString = hedTagString.slice(1)
}
if (hedTagString.endsWith('"')) {
hedTagString = hedTagString.slice(0, -1)
}
return hedTagString.toLowerCase()
}

/**
* Determine whether this tag has a given attribute.
*
Expand Down Expand Up @@ -440,4 +482,58 @@ export default class ParsedHedTag extends ParsedHedSubstring {
return units
})
}

/**
* Validate a unit and strip it from the value.
*
* Works entirely from the `tag` argument; `this` is not read.
*
* @param {ParsedHedTag} tag A HED tag.
* @returns {[boolean, boolean, string]} Whether a unit was found, whether it was valid, and the stripped value.
*   When no known unit matches, the third element is the unmodified value.
*/
validateUnits(tag) {
const originalTagUnitValue = tag.originalTagName
const tagUnitClassUnits = tag.validUnits
const validUnits = tag.schema.entries.allUnits
const unitStrings = Array.from(validUnits.keys())
// Order the unit names longest first so longer names are tried before shorter ones.
unitStrings.sort((first, second) => {
return second.length - first.length
})
// Candidate unit token: presumably the text after the last space — TODO confirm against getTagName.
let actualUnit = getTagName(originalTagUnitValue, ' ')
let noUnitFound = false
if (actualUnit === originalTagUnitValue) {
// getTagName returned the whole value, which is treated here as "no unit present".
actualUnit = ''
noUnitFound = true
}
let foundUnit, foundWrongCaseUnit, strippedValue
for (const unitName of unitStrings) {
const unit = validUnits.get(unitName)
const isPrefixUnit = unit.isPrefixUnit
const isUnitSymbol = unit.isUnitSymbol
for (const derivativeUnit of unit.derivativeUnits()) {
// Prefix units are matched at the start of the value; what follows the unit is the value.
if (isPrefixUnit && originalTagUnitValue.startsWith(derivativeUnit)) {
foundUnit = true
noUnitFound = false
strippedValue = originalTagUnitValue.substring(derivativeUnit.length).trim()
}
if (actualUnit === derivativeUnit) {
// Exact match on the candidate unit token.
foundUnit = true
strippedValue = getParentTag(originalTagUnitValue, ' ')
} else if (actualUnit.toLowerCase() === derivativeUnit.toLowerCase()) {
// Match that differs only by case: flagged as wrong case for unit symbols, accepted otherwise.
if (isUnitSymbol) {
foundWrongCaseUnit = true
} else {
foundUnit = true
}
strippedValue = getParentTag(originalTagUnitValue, ' ')
}
if (foundUnit) {
// A unit matched; it counts as valid only if it is among the tag's own valid units.
const unitIsValid = tagUnitClassUnits.has(unit)
return [true, unitIsValid, strippedValue]
}
}
// Checked only after all derivative forms of this unit, so a correctly cased form of the
// same unit can still match first.
if (foundWrongCaseUnit) {
return [true, false, strippedValue]
}
}
// No known unit matched: report whether a unit-like token was present and return the value unchanged.
return [!noUnitFound, false, originalTagUnitValue]
}
}
56 changes: 39 additions & 17 deletions parser/tagConverter.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { IssueError } from '../common/issues/issues'
import { getTagSlashIndices } from '../utils/hedStrings'
import { SchemaValueTag } from '../schema/entries'

import { getRegExp } from './tempRegex'
/**
* Converter from a tag specification to a schema-based tag object.
*/
Expand Down Expand Up @@ -56,6 +56,7 @@ export default class TagConverter {
constructor(tagSpec, hedSchemas) {
this.hedSchemas = hedSchemas
this.tagMapping = hedSchemas.getSchema(tagSpec.library).entries.tags

this.tagSpec = tagSpec
this.tagString = tagSpec.tag
this.tagLevels = this.tagString.split('/')
Expand All @@ -67,6 +68,7 @@ export default class TagConverter {
* Retrieve the {@link SchemaTag} object for a tag specification.
*
* @returns {[SchemaTag, string]} The schema's corresponding tag object and the remainder of the tag string.
* @throws {IssueError} If tag conversion fails.
*/
convert() {
let parentTag = undefined
Expand All @@ -86,45 +88,50 @@ export default class TagConverter {
}

_validateChildTag(parentTag, tagLevelIndex) {
if (this.schemaTag instanceof SchemaValueTag) {
IssueError.generateAndThrow('internalConsistencyError', {
message: 'Child tag is a value tag which should have been handled earlier.',
})
}

const childTag = this._getSchemaTag(tagLevelIndex)
if (childTag === undefined) {
// This is an extended tag
if (tagLevelIndex === 0) {
IssueError.generateAndThrow('invalidTag', { tag: this.tagString })
}
if (parentTag !== undefined && !parentTag.hasAttributeName('extensionAllowed')) {
IssueError.generateAndThrow('invalidExtension', {
tag: this.tagLevels[tagLevelIndex],
parentTag: parentTag.longName,
parentTag: this.tagLevels.slice(0, tagLevelIndex).join('/'),
})
}
this._checkExtensions(tagLevelIndex)
return childTag
}

if (tagLevelIndex > 0 && (childTag.parent === undefined || childTag.parent !== parentTag)) {
IssueError.generateAndThrow('invalidParentNode', {
tag: this.tagLevels[tagLevelIndex],
parentTag: childTag.longName,
parentTag: this.tagLevels.slice(0, tagLevelIndex).join('/'),
})
}

return childTag
}

_getSchemaTag(tagLevelIndex) {
let tagLevel = this.tagLevels[tagLevelIndex].toLowerCase()
// TODO: These two checks should probably be removed as the tokenizer handles this.
if (tagLevelIndex === 0) {
tagLevel = tagLevel.trimLeft()
}
if (tagLevel === '' || tagLevel !== tagLevel.trim()) {
IssueError.generateAndThrow('invalidTag', { tag: this.tagString })
_checkExtensions(tagLevelIndex) {
// A non-tag has been detected --- from here on must be non-tags.
this._checkNameClass(tagLevelIndex) // This is an extension
for (let index = tagLevelIndex + 1; index < this.tagLevels.length; index++) {
const child = this._getSchemaTag(index)
if (child !== undefined) {
// A schema tag showed up after a non-schema tag
IssueError.generateAndThrow('invalidParentNode', {
tag: this.tagLevels[index],
parentTag: this.tagLevels.slice(0, index).join('/'),
})
}
this._checkNameClass(index)
}
}

_getSchemaTag(tagLevelIndex) {
const tagLevel = this.tagLevels[tagLevelIndex].toLowerCase()
return this.tagMapping.getEntry(tagLevel)
}

Expand All @@ -138,4 +145,19 @@ export default class TagConverter {
IssueError.generateAndThrow('childRequired', { tag: this.tagString })
}
}

_checkNameClass(index) {
// Check whether the tagLevel is a valid name class
// TODO: this test should be in the schema and the RegExp only created once.
const valueClasses = this.hedSchemas.getSchema(this.tagSpec.library).entries.valueClasses
const myRex = valueClasses._definitions.get('nameClass')?._charClassRegex
const my = new RegExp(myRex)
if (!my.test(this.tagLevels[index])) {
// An extension is not name class
IssueError.generateAndThrow('invalidExtension', {
tag: this.tagLevels[index],
parentTag: this.tagLevels.slice(0, index).join('/'),
})
}
}
}
25 changes: 25 additions & 0 deletions parser/tempRegex.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import regexData from '../data/json/class_regex.json'

/**
 * Build the anchored regular expression for a named character class.
 *
 * The class name is resolved to a list of character names, each character name to its regex
 * fragment, and the fragments are combined as alternatives that must match one or more times
 * across the whole string.
 *
 * @param {string} name The character class name to look up.
 * @returns {RegExp} A new regular expression of the form ^(?:a|b|...)+$.
 * @throws {Error} If the class name is unknown, has no character names, or names an unknown character.
 */
export function getRegExp(name) {
  const charNames = regexData.class_chars[name]
  if (!charNames) {
    throw new Error(`Invalid class name: ${name}`)
  }
  if (charNames.length === 0) {
    throw new Error(`No character definitions for class: ${name}`)
  }

  // Collect the regex fragment for each character name, failing on the first unknown one.
  const alternatives = []
  for (const charName of charNames) {
    const fragment = regexData.char_regex[charName]
    if (!fragment) {
      throw new Error(`Invalid character name: ${charName}`)
    }
    alternatives.push(fragment)
  }

  return new RegExp(`^(?:${alternatives.join('|')})+$`)
}
4 changes: 2 additions & 2 deletions schema/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,9 @@ export default class SchemaParser {
for (const [name, valueAttributes] of valueAttributeDefinitions) {
const booleanAttributes = booleanAttributeDefinitions.get(name)
//valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes))
const charClassRegex = this._getValueClassChars(name)
const charRegex = this._getValueClassChars(name)
const wordRegex = new RegExp(classRegex.class_words[name] ?? '^.+$')
valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes, charClassRegex, wordRegex))
valueClasses.set(name, new SchemaValueClass(name, booleanAttributes, valueAttributes, charRegex, wordRegex))
}
this.valueClasses = new SchemaEntryManager(valueClasses)
}
Expand Down
21 changes: 21 additions & 0 deletions schema/regExps.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import classRegex from '../data/json/class_regex.json'

/**
 * Static helpers that build and apply character-class regular expressions from the
 * class_regex data (class_chars maps a class name to character-class names; char_regex maps
 * each character-class name to a regex fragment).
 */
export class RegexClass {
  /**
   * Build the regular expression for a value class.
   *
   * @param {string} name The value class name.
   * @returns {RegExp} ^(?:a|b|...)+$ built from the class's character fragments, or ^.+$
   *   (any non-empty line or string) when the class has no non-empty character list.
   */
  static getValueClassChars(name) {
    const charClassNames = classRegex.class_chars[name]
    let classChars
    if (Array.isArray(charClassNames) && charClassNames.length > 0) {
      const alternatives = charClassNames.map((charClass) => classRegex.char_regex[charClass]).join('|')
      classChars = '^(?:' + alternatives + ')+$'
    } else {
      classChars = '^.+$' // Any non-empty line or string.
    }
    return new RegExp(classChars)
  }

  /**
   * Test a value against the regular expression of a value class.
   *
   * @param {string} name The value class name.
   * @param {string} value The string to test.
   * @returns {boolean} Whether the value matches the class's regular expression.
   */
  static testRegex(name, value) {
    return RegexClass.getValueClassChars(name).test(value)
  }
}

// parser/parsedHedTag.js imports this module's default export, so expose the class as the
// default export too; the named export above is kept for existing named imports.
export default RegexClass
Loading

0 comments on commit 8129028

Please sign in to comment.