Skip to content

Commit

Permalink
Merge pull request #213 from VisLab/update-tokenizer
Browse files Browse the repository at this point in the history
Update tokenizer to handle (, x
  • Loading branch information
VisLab authored Nov 4, 2024
2 parents 9cc9bcc + 81e7835 commit 84fd711
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 56 deletions.
79 changes: 28 additions & 51 deletions parser/tokenizer.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { unicodeName } from 'unicode-name'

import { generateIssue } from '../common/issues/issues'
import { replaceTagNameWithPound } from '../utils/hedStrings'

const CHARACTERS = {
BLANK: ' ',
Expand Down Expand Up @@ -34,8 +33,6 @@ for (let i = 0x7f; i <= 0x9f; i++) {
invalidCharacters.add(String.fromCodePoint(i))
}

const invalidCharactersOutsideOfValues = new Set([':'])

/**
* A specification for a tokenized substring.
*/
Expand Down Expand Up @@ -221,18 +218,14 @@ export class HedStringTokenizer {
}

handleComma(i) {
if (this.state.lastDelimiter[0] === undefined && this.hedString.slice(0, i).length === 0) {
// Start of string empty
this.pushIssue('emptyTagFound', i)
return
}
const trimmed = this.hedString.slice(this.state.lastDelimiter[1] + 1, i).trim()
if (this.state.lastDelimiter[0] === CHARACTERS.COMMA && trimmed.length === 0) {
// empty token after a previous comma
this.pushIssue('emptyTagFound', this.state.lastDelimiter[1]) // Check for empty group between commas
if (
[CHARACTERS.OPENING_GROUP, CHARACTERS.COMMA, undefined].includes(this.state.lastDelimiter[0]) &&
trimmed.length === 0
) {
this.pushIssue('emptyTagFound', i) // Empty tag Ex: ",x" or "(, x" or "y, ,x"
} else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
// Unclosed curly brace
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1])
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // Unclosed curly brace Ex: "{ x,"
}
if (
[CHARACTERS.CLOSING_GROUP, CHARACTERS.CLOSING_COLUMN].includes(this.state.lastDelimiter[0]) &&
Expand All @@ -250,15 +243,15 @@ export class HedStringTokenizer {
handleSlash(i) {
if (this.state.currentToken.trim().length === 0) {
// Slash at beginning of tag.
this.pushIssue('extraSlash', i)
this.pushIssue('extraSlash', i) // Slash at beginning of tag.
} else if (this.state.lastSlash >= 0 && this.hedString.slice(this.state.lastSlash + 1, i).trim().length === 0) {
this.pushIssue('extraSlash', i) // Slashes with only blanks between
} else if (i > 0 && this.hedString.charAt(i - 1) === CHARACTERS.BLANK) {
this.pushIssue('extraBlank', i - 1) // Blank before slash such as slash in value
} else if (i < this.hedString.length - 1 && this.hedString.charAt(i + 1) === CHARACTERS.BLANK) {
this.pushIssue('extraBlank', i + 1) //Blank after
this.pushIssue('extraBlank', i + 1) //Blank after a slash
} else if (this.hedString.slice(i).trim().length === 0) {
this.pushIssue('extraSlash', this.state.startingIndex)
this.pushIssue('extraSlash', this.state.startingIndex) // Extra slash at the end
} else {
this.state.currentToken += CHARACTERS.SLASH
this.state.lastSlash = i
Expand All @@ -267,11 +260,13 @@ export class HedStringTokenizer {

handleOpeningGroup(i) {
if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1])
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // After open curly brace Ex: "{ ("
} else if (this.state.lastDelimiter[0] === CHARACTERS.CLOSING_COLUMN) {
this.pushIssue('commaMissing', this.state.lastDelimiter[1])
this.pushIssue('commaMissing', this.state.lastDelimiter[1]) // After close curly brace Ex: "} ("
} else if (this.state.lastDelimiter[0] === CHARACTERS.CLOSING_GROUP) {
this.pushIssue('commaMissing', this.state.lastDelimiter[1] + 1) // After close group Ex: ") ("
} else if (this.state.currentToken.trim().length > 0) {
this.pushInvalidTag('commaMissing', i, this.state.currentToken.trim())
this.pushInvalidTag('commaMissing', i, this.state.currentToken.trim()) // After tag Ex: "x ("
} else {
this.state.currentGroupStack.push([])
this.state.parenthesesStack.push(new GroupSpec(i, undefined, []))
Expand All @@ -282,40 +277,35 @@ export class HedStringTokenizer {
}

/**
 * Handle a closing-group delimiter found at index i of the HED string.
 *
 * Reports 'unopenedParenthesis' at i when the group depth is <= 0, reports 'unclosedCurlyBrace' at the
 * position of the last delimiter when that delimiter is an opening column brace, and otherwise closes
 * the current group and records [CHARACTERS.CLOSING_GROUP, i] as the last delimiter.
 *
 * NOTE(review): this listing comes from a rendered diff without +/- markers, so statements from the
 * previous and the updated revision appear together (the paired pushIssue calls, the two pushTag
 * checks, the two closeGroup calls). Confirm statement order against the committed file.
 *
 * @param {number} i Index of the closing-group character.
 */
handleClosingGroup(i) {
if ([CHARACTERS.OPENING_GROUP, CHARACTERS.COMMA].includes(this.state.lastDelimiter[0])) {
this.pushTag(i)
}
if (this.state.groupDepth <= 0) {
// If the group depth is <= 0, it means there's no corresponding opening group.
this.pushIssue('unopenedParenthesis', i)
this.pushIssue('unopenedParenthesis', i) // No corresponding opening group
} else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1])
this.pushIssue('unclosedCurlyBrace', this.state.lastDelimiter[1]) // After open curly brace Ex: "{ )"
} else {
// Close the group by updating its bounds and moving it to the parent group.
this.closeGroup(i)
if ([CHARACTERS.OPENING_GROUP, CHARACTERS.COMMA].includes(this.state.lastDelimiter[0])) {
// Should be a tag here
this.pushTag(i)
}
this.closeGroup(i) // Close the group by updating its bounds and moving it to the parent group.
this.state.lastDelimiter = [CHARACTERS.CLOSING_GROUP, i]
}
}

/**
 * Handle an opening column brace found at index i of the HED string.
 *
 * Reports an invalid-character issue when the current token already has non-blank content, reports
 * 'nestedCurlyBrace' when the last delimiter is itself an opening column brace, and otherwise records
 * [CHARACTERS.OPENING_COLUMN, i] as the last delimiter.
 *
 * NOTE(review): this listing comes from a rendered diff without +/- markers, so each pushIssue /
 * pushInvalidCharacterIssue call appears once per revision. Confirm against the committed file.
 *
 * @param {number} i Index of the opening column brace.
 */
handleOpeningColumn(i) {
if (this.state.currentToken.trim().length > 0) {
// In the middle of a token -- can't have an opening brace
this.pushInvalidCharacterIssue(CHARACTERS.OPENING_COLUMN, i)
this.pushInvalidCharacterIssue(CHARACTERS.OPENING_COLUMN, i) // Middle of a token Ex: "x {"
} else if (this.state.lastDelimiter[0] === CHARACTERS.OPENING_COLUMN) {
// The previous delimiter is still an open brace, so this brace would be nested inside it
this.pushIssue('nestedCurlyBrace', i)
this.pushIssue('nestedCurlyBrace', i) // After open curly brace Ex: "{x{"
} else {
this.state.lastDelimiter = [CHARACTERS.OPENING_COLUMN, i]
}
}

handleClosingColumn(i) {
if (this.state.lastDelimiter[0] !== CHARACTERS.OPENING_COLUMN) {
// Column splice not in progress
this.pushIssue('unopenedCurlyBrace', i)
this.pushIssue('unopenedCurlyBrace', i) // No matching open brace Ex: " x}"
} else if (!this.state.currentToken.trim()) {
// Column slice cannot be empty
this.pushIssue('emptyCurlyBrace', i)
this.pushIssue('emptyCurlyBrace', i) // Column slice cannot be empty Ex: "{ }"
} else {
// Close column by updating bounds and moving it to the parent group, push a column splice on the stack.
this.state.currentGroupStack[this.state.groupDepth].push(
Expand All @@ -328,10 +318,9 @@ export class HedStringTokenizer {

handleColon(i) {
if (this.state.librarySchema || this.state.currentToken.trim().includes(CHARACTERS.BLANK)) {
// If colon has not been seen, it is a library. Ignore other colons.
this.state.currentToken += CHARACTERS.COLON
this.state.currentToken += CHARACTERS.COLON // If colon has not been seen, it is a library. Ignore other colons.
} else if (/[^A-Za-z]/.test(this.state.currentToken.trim())) {
this.pushIssue('invalidTagPrefix', i)
this.pushIssue('invalidTagPrefix', i) // Prefix not alphabetic Ex: "1a:xxx"
} else {
const lib = this.state.currentToken.trimStart()
this.resetToken(i)
Expand Down Expand Up @@ -370,23 +359,11 @@ export class HedStringTokenizer {
const groupSpec = this.state.parenthesesStack.pop()
groupSpec.bounds[1] = i + 1
if (this.hedString.slice(groupSpec.bounds[0] + 1, i).trim().length === 0) {
//The group is empty
this.pushIssue('emptyTagFound', i)
this.pushIssue('emptyTagFound', i) //The group is empty
}
this.state.parenthesesStack[this.state.groupDepth - 1].children.push(groupSpec)
this.state.currentGroupStack[this.state.groupDepth - 1].push(this.state.currentGroupStack.pop())
this.state.groupDepth--
//this.resetToken(i)
}

checkValueTagForInvalidCharacters() {
const formToCheck = replaceTagNameWithPound(this.state.currentToken)
for (let i = 0; i < formToCheck.length; i++) {
const character = formToCheck.charAt(i)
if (invalidCharactersOutsideOfValues.has(character)) {
this.pushInvalidCharacterIssue(character, this.state.startingIndex + i)
}
}
}

pushIssue(issueCode, index) {
Expand Down
24 changes: 20 additions & 4 deletions tests/testData/tokenizerTests.data.js
Original file line number Diff line number Diff line change
Expand Up @@ -335,12 +335,12 @@ export const tokenizerTests = [
errors: [generateIssue('emptyTagFound', { index: '3', string: 'x,y,' })],
},
{
testname: 'double-in-comma',
testname: 'double-comma',
explanation: 'Cannot have double commas',
string: 'x,,y,',
tagSpecs: [],
groupSpec: null,
errors: [generateIssue('emptyTagFound', { index: '1', string: 'x,,y,' })],
errors: [generateIssue('emptyTagFound', { index: '2', string: 'x,,y,' })],
},
{
testname: 'leading-comma',
Expand Down Expand Up @@ -374,6 +374,14 @@ export const tokenizerTests = [
groupSpec: null,
errors: [generateIssue('commaMissing', { index: '6', string: 'x, (y)z' })],
},
{
testname: 'extra-comma-before-open-group',
explanation: 'Must have a comma before open brace',
string: '(, x, y), z',
tagSpecs: [],
groupSpec: null,
errors: [generateIssue('emptyTagFound', { index: '1', string: '(, x, y), z' })],
},
{
testname: 'missing-comma-before-open-column',
explanation: 'Must have a comma before open brace',
Expand All @@ -382,6 +390,14 @@ export const tokenizerTests = [
groupSpec: null,
errors: [generateIssue('unclosedCurlyBrace', { index: '6', string: 'x, y, {(z)' })],
},
{
testname: 'missing-comma-between groups',
explanation: 'Must have a comma before open brace',
string: '(x, y)(z, (w))',
tagSpecs: [],
groupSpec: null,
errors: [generateIssue('commaMissing', { index: '6', string: '(x, y)(z, (w))' })],
},
{
testname: 'missing-close-brace-before-parentheses',
explanation: 'Must have a closed-brace-after-open-brace',
Expand Down Expand Up @@ -428,15 +444,15 @@ export const tokenizerTests = [
string: 'x, (y), z,,',
tagSpecs: [],
groupSpec: null,
errors: [generateIssue('emptyTagFound', { index: '9', string: 'x, (y), z,,' })],
errors: [generateIssue('emptyTagFound', { index: '10', string: 'x, (y), z,,' })],
},
{
testname: 'multiple-internal-commas',
explanation: 'Multiple closing commas not allowed',
string: 'x, (y),, z',
tagSpecs: [],
groupSpec: null,
errors: [generateIssue('emptyTagFound', { index: '6', string: 'x, (y),, z' })],
errors: [generateIssue('emptyTagFound', { index: '7', string: 'x, (y),, z' })],
},
],
},
Expand Down
2 changes: 1 addition & 1 deletion tests/tokenizerTests.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { tokenizerTests } from './testData/tokenizerTests.data'
// Test selection switches: onlyRun is only populated when runAll is false.
// Each entry presumably maps a test-group name to the test names to run — confirm in the code that reads onlyRun.
// NOTE(review): the two assignments below are the old and new lines of a rendered diff shown together.
const runAll = true
let onlyRun = new Map()
if (!runAll) {
onlyRun = new Map([['invalid-comma-missing-or-extra', ['missing-comma-before-close']]])
onlyRun = new Map([['invalid-commas', ['extra-comma-before-open-group']]])
}

describe('Tokenizer validation using JSON tests', () => {
Expand Down

0 comments on commit 84fd711

Please sign in to comment.