From da014496db8215f0c619d808b3dfee88c1267917 Mon Sep 17 00:00:00 2001 From: James Prevett Date: Sun, 5 Jan 2025 10:37:32 -0600 Subject: [PATCH] Cleaned up BNF `ast_to_config` and renamed to `create_config` --- src/bnf.ts | 464 +++++++++++++++++++++++++++----------------------- src/config.ts | 9 +- src/parser.ts | 2 +- 3 files changed, 260 insertions(+), 215 deletions(-) diff --git a/src/bnf.ts b/src/bnf.ts index ef42bf8..4fbbb1b 100644 --- a/src/bnf.ts +++ b/src/bnf.ts @@ -1,27 +1,30 @@ import rawConfig from './bnf.json' with { type: 'json' }; -import * as config from './config.js'; -import type { DefinitionPart, Logger, Node, PureNodeDefinition } from './parser.js'; +import type { Config, Json, PureConfig } from './config.js'; +import { parse_json } from './config.js'; +import type { DefinitionPart, Logger, Node } from './parser.js'; import { logger, parse } from './parser.js'; -import type { Token, TokenDefinition } from './tokens.js'; +import type { Token } from './tokens.js'; import { tokenize } from './tokens.js'; -export const { literals, definitions, ignoreLiterals, rootNodes } = config.parse_json(rawConfig as config.Json); +const bnf_config = parse_json(rawConfig as Json); + +export { bnf_config as config }; /** * Shortcut for tokenize(source, bnf.literals); */ function tokenizeBnf(source: string): Token[] { - return tokenize(source, literals); + return tokenize(source, bnf_config.literals); } export { tokenizeBnf as tokenize }; export function parseSource(source: string, log?: Logger): Node[] { - return parse({ ignoreLiterals, definitions, rootNodes, log, source, literals }); + return parse({ ...bnf_config, log, source }); } function parseBnf(tokens: Token[], log?: Logger): Node[] { - return parse({ ignoreLiterals, definitions, rootNodes, log, tokens, literals: literals.map(t => t.name) }); + return parse({ ...bnf_config, log, tokens }); } export { parseBnf as parse }; @@ -32,261 +35,296 @@ const typeForGroup = { left_paren: 'required', } as const; -export function ast_to_config(ast: Node[], log: Logger = () => {}, include?: (name: string) => Node[]): config.Config { - const definitions: PureNodeDefinition[] = [], - literals: TokenDefinition[] = [], - rootNodes: string[] = [], - ignoreLiterals: string[] = []; - - let currentNode: string, - groups = 0; +export interface ASTConfigOptions { + log?: Logger; + include?: (name: string) => Node[]; +} - function processNode(node: Node, depth: number = 0) { - const _log = logger(log, { kind: node.kind, depth }); +interface ASTConfigContext extends ASTConfigOptions { + depth: number; + config: PureConfig; + currentNode?: string; + groups: number; +} - _log(3, `Processing ${node.kind} at ${node.line}:${node.column}`); +/** + * Creates a copy of a context for use with children. + * Right now this just increments the depth + */ +function child_context(context: ASTConfigContext): ASTConfigContext { + return { + ...context, + depth: context.depth + 1, + }; +} - if (node.kind == 'directive') { - const [, directive, contents] = node.text.match(/##(\w+) (.*)/i)!; +function config_process_directive(text: string, $: ASTConfigContext) { + const log = logger($.log, { kind: 'directive', depth: $.depth }); + const [, directive, contents] = text.match(/##(\w+) (.*)/i)!; + + switch (directive) { + case 'root': + $.config.rootNodes.push(...contents.split(/[ ,;]/)); + break; + case 'ignore': + $.config.ignoreLiterals.push(...contents.split(/[ ,;]/)); + break; + case 'include': + if (!$.include) { + log(0, 'Warning: Missing include()'); + break; + } + log(1, 'Including: ' + contents); + for (const node of $.include(contents)) { + config_process_node(node, child_context($)); + } + break; + // ##flags + case 'flags': { + const [, name, flags] = contents.match(/(\w+)\s+(\w+)/) || []; + const literal = $.config.literals.find(({ name: n }) => n == name); + if (!literal) { + log(0, 'Warning: ##flags references missing literal: ' + name); + break; + } - switch (directive) { - case 'root': - rootNodes.push(...contents.split(/[ ,;]/)); - break; - case 'ignore': - ignoreLiterals.push(...contents.split(/[ ,;]/)); - break; - case 'include': - if (!include) { - _log(0, 'Warning: Missing include()'); - break; - } - _log(1, 'Including: ' + contents); - for (const node of include(contents)) { - processNode(node, depth + 1); - } - break; - // ##flags - case 'flags': { - const [, name, flags] = contents.match(/(\w+)\s+(\w+)/) || []; - const literal = literals.find(({ name: n }) => n == name); - if (!literal) { - _log(0, 'Warning: ##flags references missing literal: ' + name); - break; - } + literal.pattern = new RegExp(literal.pattern.source, flags); - literal.pattern = new RegExp(literal.pattern.source, flags); + break; + } + // ##groups ... + case 'groups': { + const [, name, _names] = contents.match(/(\w+)\s+(.+)/) || []; + const groupNames = _names.split(/[\s,]+/); + const rule = $.config.definitions.find(d => d.name == name); + if (!rule) { + log(0, 'Warning: ##groups: missing rule ' + JSON.stringify(name)); + break; + } + for (let i = 0; i < groupNames.length; i++) { + const group = $.config.definitions.find(d => d.name == name + '#' + i); + if (!group) { + log(0, 'Warning: ##groups: missing group ' + i); break; } - // ##groups ... - case 'groups': { - const [, name, _names] = contents.match(/(\w+)\s+(.+)/) || []; - const groupNames = _names.split(/[\s,]+/); - const rule = definitions.find(d => d.name == name); - if (!rule) { - _log(0, 'Warning: ##groups: missing rule ' + JSON.stringify(name)); - break; - } - for (let i = 0; i < groupNames.length; i++) { - const group = definitions.find(d => d.name == name + '#' + i); - if (!group) { - _log(0, 'Warning: ##groups: missing group ' + i); - break; - } - - const new_name = groupNames[i].replaceAll('%', name); - - for (const part of definitions.flatMap(d => d.pattern)) { - if (part.kind == group.name) { - part.kind = new_name; - } - } + const new_name = groupNames[i].replaceAll('%', name); - _log(1, `Renaming group: ${group.name} -> ${new_name}`); - group.name = new_name; + for (const part of $.config.definitions.flatMap(d => d.pattern)) { + if (part.kind == group.name) { + part.kind = new_name; } - break; } - default: - _log(0, 'Warning: unsupported directive: ' + directive); - } - - return; - } - - if (node.kind != 'rule') { - // Recursively process child nodes - for (const child of node.children || []) { - processNode(child, depth + 1); - } - return; - } - - // Extract the rule name (identifier) and its expression - const name = node.children?.find(child => child.kind === 'identifier')?.text; - const expression = node.children?.find(child => child.kind === 'expression'); - - _log(2, `Found rule "${name}" at ${node.line}:${node.column}`); - if (!name || !expression) { - _log(1, 'Rule is missing name or expression'); - return; - } - currentNode = name; - groups = 0; - - const [pattern, isAlternation] = processExpression(expression, depth + 1); - - /* - Inline single-use literals - For example: - `ws = "[ \t]+";` - Gets converted to - "[ \\t]+": /[ \t]+/ (a literal) - ws: [ { kind: "[ \\t]+", required: true } ] (a definition) - This collapses it, so we have - ws: /[ \t]+/ (a literal) - */ - - const maybeLiteral = pattern[0].kind; - - const index = literals.findIndex(l => l.name == maybeLiteral); - if (index != -1 && pattern.length == 1 && pattern[0].type == 'required' && literals[index].pattern.source.slice(1) == pattern[0].kind) { - let regex; - try { - regex = new RegExp('^' + maybeLiteral); - } catch (e: any) { - throw `Invalid literal: ${name}: ${e}`; + log(1, `Renaming group: ${group.name} -> ${new_name}`); + group.name = new_name; } - literals.splice(index, 1, { - name, - pattern: regex, - }); - return; + break; } - - // Add the NodeDefinition for this rule - definitions.push({ - name, - type: isAlternation ? 'alternation' : 'sequence', - pattern: pattern.map(part => (typeof part === 'string' ? { kind: part, type: 'required' } : part)), - }); + default: + log(0, 'Warning: unsupported directive: ' + directive); } +} - function processExpression(expression: Node, depth: number = 0): [DefinitionPart[], boolean] { - let isAlternation = false; +function config_process_expression(expression: Node, $: ASTConfigContext): [DefinitionPart[], boolean] { + const _sub = child_context($); - const _log = logger(log, { kind: expression.kind, depth }); + let isAlternation = false; - const pattern: DefinitionPart[] = []; + const _log = logger($.log, { kind: expression.kind, depth: $.depth }); - for (const term of expression.children || []) { - if (term.kind == 'pipe') { - isAlternation = true; - _log(2, 'Found pipe in expression'); - continue; - } + const pattern: DefinitionPart[] = []; - if (term.kind == 'expression_continue' || term.kind == 'expression#0') { - _log(2, 'Found expression_continue'); - let next; - [next, isAlternation] = processExpression(term, depth + 1); - pattern.push(...next); - continue; - } + for (const term of expression.children || []) { + if (term.kind == 'pipe') { + isAlternation = true; + _log(2, 'Found pipe in expression'); + continue; + } - if (term.kind != 'sequence' && term.kind != 'sequence_continue' && term.kind != 'sequence#0') { - _log(2, 'Invalid expression child: ' + term.kind); - continue; - } + if (term.kind == 'expression_continue' || term.kind == 'expression#0') { + _log(2, 'Found expression_continue'); + let next; + [next, isAlternation] = config_process_expression(term, _sub); + pattern.push(...next); + continue; + } - _log(2, `Parsing sequence at ${term.line}:${term.column}`); - if (!term.children?.length) { - _log(2, 'Sequence has no children'); - continue; - } - for (const factor of term.children) { - const node = factor.children?.[0] ?? factor; + if (term.kind != 'sequence' && term.kind != 'sequence_continue' && term.kind != 'sequence#0') { + _log(2, 'Invalid expression child: ' + term.kind); + continue; + } - _log(2, `Parsing ${node.kind} "${node.text}" at ${node.line}:${node.column}`); - switch (node.kind) { - case 'string': { - const quote = node.text.charAt(0); // either ' or " + _log(2, `Parsing sequence at ${term.line}:${term.column}`); + if (!term.children?.length) { + _log(2, 'Sequence has no children'); + continue; + } + for (const factor of term.children) { + const node = factor.children?.[0] ?? factor; - // Remove the surrounding quotes - const text = node.text.slice(1, -1).replaceAll('\\' + quote, quote); + _log(2, `Parsing ${node.kind} "${node.text}" at ${node.line}:${node.column}`); + switch (node.kind) { + case 'string': { + const quote = node.text.charAt(0); // either ' or " - try { - const regex = new RegExp('^' + text); + // Remove the surrounding quotes + const text = node.text.slice(1, -1).replaceAll('\\' + quote, quote); - if (!literals.some(l => l.name == text)) { - literals.push({ name: text, pattern: regex }); - } - } catch (e: any) { - throw `Invalid literal: ${text}: ${e.message}`; - } + try { + const regex = new RegExp('^' + text); - pattern.push({ kind: text, type: 'required' }); - break; + if (!$.config.literals.some(l => l.name == text)) { + $.config.literals.push({ name: text, pattern: regex }); + } + } catch (e: any) { + throw `Invalid literal: ${text}: ${e.message}`; } - case 'identifier': { - const modifer = factor.children?.[1]?.kind; - pattern.push({ kind: node.text, type: modifer == '\\?' ? 'optional' : modifer == '\\*' ? 'repeated' : 'required' }); + + pattern.push({ kind: text, type: 'required' }); + break; + } + case 'identifier': { + const modifer = factor.children?.[1]?.kind; + pattern.push({ kind: node.text, type: modifer == '\\?' ? 'optional' : modifer == '\\*' ? 'repeated' : 'required' }); + break; + } + case 'left_bracket': + case 'left_brace': + case 'left_paren': { + const inner = factor.children?.find(({ kind }) => kind == 'expression'); + if (!inner) { + _log(1, 'Missing inner expression'); break; } - case 'left_bracket': - case 'left_brace': - case 'left_paren': { - const inner = factor.children?.find(({ kind }) => kind == 'expression'); - if (!inner) { - _log(1, 'Missing inner expression'); - break; - } - const type = typeForGroup[node.kind]; + const type = typeForGroup[node.kind]; - const [subPattern, isAlternation] = processExpression(inner, depth + 1); + const [subPattern, isAlternation] = config_process_expression(inner, _sub); - // Check if subPattern contains another rule name, if so, no need to create a new group - const existing = subPattern.length == 1 && subPattern[0].kind !== 'string' ? subPattern[0].kind : null; - if (existing) { - pattern.push({ kind: existing, type }); - break; - } + // Check if subPattern contains another rule name, if so, no need to create a new group + const existing = subPattern.length == 1 && subPattern[0].kind !== 'string' ? subPattern[0].kind : null; + if (existing) { + pattern.push({ kind: existing, type }); + break; + } - const subName = `${currentNode}#${groups++}`; + const subName = `${$.currentNode}#${$.groups++}`; - definitions.push({ - name: subName, - type: isAlternation ? 'alternation' : 'sequence', - pattern: subPattern, - }); + $.config.definitions.push({ + name: subName, + type: isAlternation ? 'alternation' : 'sequence', + pattern: subPattern, + }); - // Append the new rule name to the pattern, marked as optional or repeated - pattern.push({ kind: subName, type }); + // Append the new rule name to the pattern, marked as optional or repeated + pattern.push({ kind: subName, type }); - break; - } - default: - _log(1, `Unexpected kind "${node.kind}" of factor child`); - break; + break; } + default: + _log(1, `Unexpected kind "${node.kind}" of factor child`); + break; } } + } + + return [pattern, isAlternation]; +} + +function config_process_node(node: Node, $: ASTConfigContext) { + const _sub_context = child_context($); + + const _log = logger($.log, { kind: node.kind, depth: $.depth }); + + _log(3, `Processing ${node.kind} at ${node.line}:${node.column}`); + + switch (node.kind) { + case 'directive': + config_process_directive(node.text, _sub_context); + } + + if (node.kind != 'rule') { + // Recursively process child nodes + for (const child of node.children || []) { + config_process_node(child, _sub_context); + } + return; + } + + // Extract the rule name (identifier) and its expression + const name = node.children?.find(child => child.kind === 'identifier')?.text; + const expression = node.children?.find(child => child.kind === 'expression'); - return [pattern, isAlternation]; + _log(2, `Found rule "${name}" at ${node.line}:${node.column}`); + if (!name || !expression) { + _log(1, 'Rule is missing name or expression'); + return; } + $.currentNode = name; + $.groups = 0; + + const [pattern, isAlternation] = config_process_expression(expression, _sub_context); + + /* + Inline single-use literals + For example: + `ws = "[ \t]+";` + Gets converted to + "[ \\t]+": /[ \t]+/ (a literal) + ws: [ { kind: "[ \\t]+", required: true } ] (a definition) + This collapses it, so we have + ws: /[ \t]+/ (a literal) + */ + + const maybeLiteral = pattern[0].kind; + + const index = $.config.literals.findIndex(l => l.name == maybeLiteral); + if (index != -1 && pattern.length == 1 && pattern[0].type == 'required' && $.config.literals[index].pattern.source.slice(1) == pattern[0].kind) { + let regex; + try { + regex = new RegExp('^' + maybeLiteral); + } catch (e: any) { + throw `Invalid literal: ${name}: ${e}`; + } + $.config.literals.splice(index, 1, { + name, + pattern: regex, + }); + return; + } + + // Add the NodeDefinition for this rule + $.config.definitions.push({ + name, + type: isAlternation ? 'alternation' : 'sequence', + pattern: pattern.map(part => (typeof part === 'string' ? { kind: part, type: 'required' } : part)), + }); +} + +export function create_config(ast: Node[], options: ASTConfigOptions): Config { + const config: PureConfig = { + definitions: [], + literals: [], + rootNodes: [], + ignoreLiterals: [], + }; + // Start processing from the root node for (const node of ast) { - processNode(node); + config_process_node(node, { + ...options, + depth: 0, + config, + groups: 0, + }); } - if (!rootNodes) { + if (!config.rootNodes) { throw 'Missing root node'; } - return { definitions, literals, rootNodes, ignoreLiterals }; + return config; } diff --git a/src/config.ts b/src/config.ts index ceb69dd..7ee83fb 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,4 +1,4 @@ -import type { NodeDefinition } from './parser.js'; +import type { NodeDefinition, PureNodeDefinition } from './parser.js'; import type { TokenDefinition } from './tokens.js'; export interface TokenDefinitionJSON { @@ -21,6 +21,13 @@ export interface Config { ignoreLiterals: string[]; } +export interface PureConfig { + literals: TokenDefinition[]; + definitions: PureNodeDefinition[]; + rootNodes: string[]; + ignoreLiterals: string[]; +} + export function parse_json_literal(literal: TokenDefinitionJSON): TokenDefinition { const $ = literal.pattern.endsWith('$'); diff --git a/src/parser.ts b/src/parser.ts index a2e9f89..8464aa5 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -87,7 +87,7 @@ export interface ParseAndTokenize extends ParseOptionsShared { export interface ParseOnly extends ParseOptionsShared { tokens: Token[]; - literals: string[]; + literals: Iterable | string[]; } export type ParseOptions = ParseOnly | ParseAndTokenize;