diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/test.yml similarity index 72% rename from .github/workflows/c-cpp.yml rename to .github/workflows/test.yml index 8bfc46a..98e0f8c 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: C/C++ CI +name: Test on: push: @@ -8,11 +8,11 @@ on: jobs: build: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v3 + - name: "additional dependencies" + run: sudo apt update && sudo apt install -y jq - name: make run: make -f Makefile.gnu - name: make test diff --git a/.gitignore b/.gitignore index 7f7fdc3..b05fa72 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.o +min_lalr1/min_lalr1 src/parse_boot1.c src/parse_boot2.c diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fa6cd6..d32ebdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,13 @@ This file is used to document any relevant changes done to UniCC. ## [v1.8] -Current main branch. +Released on Oct 28, 2023 - Improved program module generator to support - and -settings -- Target `python` updated to spaces instead of tabs and further improvements -- Target `javascript` updated to more recent JS/ECMAScript6 features +- Updated Test suite +- Target `python` updated to spaces instead of tabs and further PEP-8 improvements +- Target `javascript` updated to more recent JS/ECMAScript6 standards +- Target `json` fixed and updated ## [v1.7] diff --git a/LICENSE b/LICENSE index 8e66dd9..365ea50 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2006-2020 by Phorward Software Technologies, Jan Max Meyer. +Copyright © 2006-2023 by Jan Max Meyer, Phorward Software Technologies. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile.gnu b/Makefile.gnu index 9a27b50..60a3cb7 100644 --- a/Makefile.gnu +++ b/Makefile.gnu @@ -197,7 +197,17 @@ test_js: $(TESTPREFIX)js_expr $(TESTPREFIX)js_ast @echo "--- $@ succeded ---" @rm $(TESTPREFIX)* +# JSON + +$(TESTPREFIX)json_ast: + ./unicc -wtl json examples/expr.ast.par >$@.json + jq . $@.json + +test_json: $(TESTPREFIX)js_expr $(TESTPREFIX)json_ast + @echo "--- $@ succeded ---" + @rm $(TESTPREFIX)* + # Test -test: test_c test_cpp test_py test_js +test: test_c test_cpp test_py test_js test_json @echo "=== $+ succeeded ===" diff --git a/README.md b/README.md index 761e2e8..4f8c8c6 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,28 @@ -# UniCC [![C/C++ CI](https://github.com/phorward/unicc/actions/workflows/c-cpp.yml/badge.svg)](https://github.com/phorward/unicc/actions/workflows/c-cpp.yml) [![MIT License badge](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) - -**unicc** is a universal LALR(1) parser generator with built-in scanner generator, targetting C, C++, Python, JavaScript, JSON and XML. +
+ UniCC Logo +

LALR(1) Parser Generator

+ + Badge displaying the test status + + + Badge displaying the license + +
+ The universal LALR(1) parser generator with built-in scanner generator, creating parsers in different target programming languages. +
## About -**unicc** compiles an augmented grammar definition into a program source code that parses the described grammar. Because UniCC is intended to be target-language independent, it can be configured via template definition files to emit parsers in almost any programming language. +**unicc** is a parser generator that compiles an extended grammar definition into program source code that parses the described grammar. Since UniCC is target language independent, it can be configured via template definition files to generate parsers in any programming language. -UniCC comes with out of the box support for the programming languages **C**, **C++**, **Python** and **JavaScript**. Parsers can also be generated into **JSON** and **XML**. +UniCC natively supports the programming languages **C**, **C++**, **Python** and **JavaScript**. Parse tables can also be generated in **JSON** and **XML**. -UniCC can generate both scanner-less and scanner-mode parsers. The more powerful scanner-less parsing is the default, and allows to break the barrier between the grammar and its tokens, so tokens are under full control of the context-free grammar. Scanner-less parsing requires that the provided grammar is internally rewritten according to whitespace and lexeme settings. +UniCC can generate both scannerless parsers and parsers with a separate scanner. The more powerful scannerless parsing is the default and allows the barrier between the grammar and its tokens to be broken, leaving the tokens under the full control of the context-free grammar. Scannerless parsing requires that the provided grammar is rewritten internally according to the whitespace and lexeme settings. ## Examples Below is the full definition of a simple, universal grammar example that can be compiled to any of UniCC's target languages. + This example uses the automatic abstract syntax tree construction syntax to define nodes and leafs of the resulting syntax tree. 
```unicc @@ -82,21 +92,31 @@ $ cc -o expr expr.c This [C](examples/expr.c.par)-example can also be found for [C++](examples/expr.cpp.par), [Python](examples/expr.py.par) and [JavaScript](examples/expr.js.par). -More real-world examples for parsers implemented with UniCC can be found in [xpl](https://github.com/phorward/xpl), [rapidbatch](https://github.com/phorward/rapidbatch) and [ViUR logics](https://github.com/viur-framework/logics). +More real-world examples for parsers implemented with UniCC can be found in [XPL](https://github.com/phorward/xpl), [RapidBATCH](https://github.com/phorward/rapidbatch) and [Logics](https://github.com/viur-framework/logics). ## Features UniCC provides the following features and tools: - Grammars are expressed in a powerful Backus-Naur-style meta language -- Generates parsers in C, C++, Python, JavaScript, JSON and XML -- Scanner-less and scanner-mode parser construction supported -- Build-in full Unicode processing -- Grammar prototyping features, virtual productions and anonymous nonterminals +- Generates standalone (dependency-less) parsers in + - C + - C++ + - Python 2 (deprecated) + - Python 3 + - JavaScript (ES2018) +- Provides facilities to generate parse tables as + - JSON + - XML (deprecated) +- Scannerless parser supported by default +- Full Unicode processing built-in +- Grammar prototyping features + - automatic grammar revision for scannerless parsers + - virtual productions + - anonymous nonterminals - Abstract syntax tree notation features - Semantically determined symbols - Standard LALR(1) conflict resolution -- Platform-independent (console-based) ## Documentation @@ -104,7 +124,7 @@ The [UniCC User's Manual](http://downloads.phorward-software.com/unicc/unicc.pdf ## Installation -On Linux and OS X, UniCC can be build and installed like any GNU-style program, with +UniCC can be built and installed like any GNU-style program, with ```sh ./configure @@ -112,11 +132,11 @@ make make install ``` -In the past, setup 
packages for Windows systems where also provided, but these are not maintained anymore since unicc v1.6. You can still find them [here](https://downloads.phorward-software.com/unicc/). - -## UniCC v2 +Alternatively, the dev-toolchain can be used on any recent Linux system by just calling -Between 2014 and 2020, a version 2 of UniCC was under development, but abandoned for now. This version currently exists in the [branch unicc2](https://github.com/phorward/unicc/tree/unicc2) inside of this repository, and is a complete rewrite, but with the intention to provide better tools for grammar prototyping and direct AST traversal. +```sh +make -f Makefile.gnu +``` ## License diff --git a/targets/javascript.tlt b/targets/javascript.tlt index 9f1794b..2f542b5 100644 --- a/targets/javascript.tlt +++ b/targets/javascript.tlt @@ -82,24 +82,24 @@ --> - @@production-number - , + @@production-number + , - { - } + { + } , - @@symbol: [ @@action,@@index ] - , + @@symbol: [@@action, @@index] + , - { - } + { + } , - @@symbol: [ @@action,@@index ] - , + @@symbol: [@@action, @@index] + , - @@machine - , + @@machine + , - @@from, @@to - , + @@from, @@to + , - @@goto - , + @@goto + , - [ - ] + [ + ] , - @@index - , + @@index + , - [ - ] + [ + ] , - @@accept - , + @@accept + , - { "symbol": "@@symbol-name", "emit": "@@emit", "is-terminal": @@type, "is-lexem": @@lexem, "is-whitespace": @@whitespace, "is-greedy": @@greedy } + { + "symbol": "@@symbol-name", + "emit": "@@emit", + "is-terminal": @@type, + "is-lexem": @@lexem, + "is-whitespace": @@whitespace, + "is-greedy": @@greedy + } , - { "production": "@@production", "emit": "@@emit", "length": @@length, "left-hand-side": @@lhs } + { + "production": "@@production", + "emit": "@@emit", + "length": @@length, + "left-hand-side": @@lhs + } , @@ -161,7 +173,7 @@ **************************************************************************** --> -// Parser module generated by unicc from @@filename. 
+// This parser module was generated by unicc from @@filename. // DO NOT EDIT THIS FILE MANUALLY, IT WILL GO AWAY! @@prologue @@ -289,7 +301,7 @@ export default class @@prefixParser { static #lexer = { "select": [ - @@dfa-select +@@dfa-select ], "index": [ @@dfa-index @@ -312,7 +324,7 @@ export default class @@prefixParser { @@goto-table ], "default-production": [ - @@default-productions +@@default-productions ] }; diff --git a/targets/json.tlt b/targets/json.tlt index 2ca6568..19ea4a2 100644 --- a/targets/json.tlt +++ b/targets/json.tlt @@ -1,189 +1,155 @@ - - true - false - - - - - - - - - - - any - any - @@value-type-id - @@attribute: null, - value: null - value: { - } - - - - case @@production-number: { - ; } break; - - - ( ( this.tos - @@offset ).value ) - - ( ( this.tos - @@offset ).value.@@attribute ) - this.ret - this.ret.@@attribute - - ( this.lhs = @@sym ) - - - - case @@symbol-number: { - ; } break; - - - this.lexem( pcb ) - this.len - this.tos.value - ( this.tos.value.@@attribute ) - - ( this.sym = @@sym ) - - - - - @@production-number - , - - - - [ - ] - , - @@symbol,@@action,@@index - , - - - - [ - ] - , - @@symbol,@@action,@@index - , - - - - - - @@machine - , - - - - @@from, @@to - , - - - - @@goto - , - - - - [ - ] - , - @@index - , - - - - [ - ] - , - @@accept - , - - - - - - { "symbol": "@@symbol-name", "emit": "@@emit", "is_terminal": @@type, "is_lexem": @@lexem, "is_whitespace": @@whitespace, "is_greedy": @@greedy } - , - - - - { "production": "@@production", "emit": "@@emit", "length": @@length, "left_hand_side": @@lhs } - , - - - + true + false + + + + + + + + + + + + @@production-number + , + + + + { + } + , + "@@symbol": [@@action, @@index] + , + + + + { + } + , + "@@symbol": [@@action, @@index] + , + + + + + + @@machine + , + + + + @@from, @@to + , + + + + @@goto + , + + + + [ + ] + , + @@index + , + + + + [ + ] + , + @@accept + , + + + + + + { + "symbol": "@@symbol-name", + "emit": "@@emit", + "is-terminal": @@type, + "is-lexem": @@lexem, 
+ "is-whitespace": @@whitespace, + "is-greedy": @@greedy + } + , + + + + { + "production": "@@production", + "emit": "@@emit", + "length": @@length, + "left-hand-side": @@lhs + } + , + + + { - "grammar": { - "symbols": [ + "grammar": { + "symbols": [ @@symbols - ], - "productions": [ + ], + "productions": [ @@productions - ], - "goal": @@goal - }, - lexer: { - "machine": [ @@dfa-select ], - "indexes": [ + ], + "goal": @@goal + }, + "lexer": { + "machine": [ +@@dfa-select + ], + "indexes": [ @@dfa-index - ], - "chars": [ + ], + "chars": [ @@dfa-char - ], - "transitions": [ + ], + "transitions": [ @@dfa-trans - ], - "accept": [ + ], + "accept": [ @@dfa-accept - ] - }, - "parser": { - "actions": [ + ] + }, + "parser": { + "actions": [ @@action-table - ], - "gotos": [ + ], + "gotos": [ @@goto-table - ], - "default_productions": [ @@default-productions ] - } + ], + "default_productions": [ +@@default-productions + ] + } } diff --git a/targets/python.tlt b/targets/python.tlt index d88e14f..3a916ae 100644 --- a/targets/python.tlt +++ b/targets/python.tlt @@ -94,7 +94,7 @@ class @@prefixNode: use of the AST node syntax. """ - def __init__(self, emit = None, match = None, children = None): + def __init__(self, emit=None, match=None, children=None): self.emit = emit self.match = match self.children = children or [] @@ -119,12 +119,12 @@ class @@prefixParseException(Exception): TODO: This might be replaced by SyntaxError builtin. 
""" - def __init__(self, row, col, txt = None): + def __init__(self, row, col, txt=None): if isinstance(txt, list): expecting = txt txt = ("Line %d, column %d: Parse error, expecting %s" % - (row, col, ", ".join([("%r" % symbol[0]) - for symbol in txt]))) + (row, col, ", ".join([("%r" % symbol[0]) + for symbol in txt]))) else: expecting = None @@ -246,7 +246,7 @@ class @@prefixParser: # Check action table first for (sym, pcb.act, pcb.idx) in self._act[pcb.tos.state]: if sym == pcb.sym: - return True if pcb.act else False #enforced parse error + return True if pcb.act else False # enforced parse error # Otherwise, apply default production pcb.idx = self._def_prod[pcb.tos.state] @@ -293,7 +293,7 @@ class @@prefixParser: pcb.buf += ch - #print("_get_input", pcb.buf, offset, pcb.buf[offset], ord(pcb.buf[offset])) + # print("_get_input", pcb.buf, offset, pcb.buf[offset], ord(pcb.buf[offset])) return ord(pcb.buf[offset]) @@ -332,7 +332,7 @@ class @@prefixParser: while self._dfa_chars[idx][0] > -1: if (next >= self._dfa_chars[idx][0] - and next <= self._dfa_chars[idx][1]): + and next <= self._dfa_chars[idx][1]): length += 1 state = self._dfa_trans[idx] @@ -358,7 +358,7 @@ class @@prefixParser: # TODO: Semantic Terminal Selection? 
- #print("_lex", pcb.sym, pcb.len) + # print("_lex", pcb.sym, pcb.len) def _get_sym(self, pcb): # Get lookahead symbol @@ -390,7 +390,7 @@ class @@prefixParser: return pcb.sym > -1 - def parse(self, s = None): + def parse(self, s=None): if s is None: try: s = raw_input(">") @@ -404,7 +404,7 @@ class @@prefixParser: pcb.stack.append(pcb.tos) while True: - #print("state = %d" % pcb.tos.state) + # print("state = %d" % pcb.tos.state) # Reduce while pcb.act & self._REDUCE: @@ -412,11 +412,11 @@ class @@prefixParser: # Set default left-hand side pcb.lhs = self._productions[pcb.idx][3] - #print("REDUCE", pcb.idx, self._productions[pcb.idx][0]) - #print("state", pcb.tos.state) + # print("REDUCE", pcb.idx, self._productions[pcb.idx][0]) + # print("state", pcb.tos.state) # Call reduce function - #print("CALL", "_reduce_action_%d" % pcb.idx) + # print("CALL", "_reduce_action_%d" % pcb.idx) reduce_fn = getattr(self, "_reduce_action_%d" % pcb.idx, None) if reduce_fn: reduce_fn(pcb) @@ -440,7 +440,7 @@ class @@prefixParser: # Handle AST nodes if self._productions[pcb.idx][1]: - #print("%s = %s" % (self._productions[pcb.idx][0], self._productions[pcb.idx][1])) + # print("%s = %s" % (self._productions[pcb.idx][0], self._productions[pcb.idx][1])) node = @@prefixNode(self._productions[pcb.idx][1], children=cnodes) else: node = None @@ -453,7 +453,7 @@ class @@prefixParser: if pcb.lhs == @@goal and len(pcb.stack) == 1: pcb.tos.node = node or cnodes self._clear_input(pcb) - pcb.act = self._SUCCESS; + pcb.act = self._SUCCESS break self._get_go(pcb) @@ -474,22 +474,22 @@ class @@prefixParser: # Get next input symbol self._get_sym(pcb) - #print("pcb.sym = %d (%s)" % (pcb.sym, self._symbols[pcb.sym][0])) - #print("pcb.len = %d" % pcb.len) + # print("pcb.sym = %d (%s)" % (pcb.sym, self._symbols[pcb.sym][0])) + # print("pcb.len = %d" % pcb.len) # Get action table entry if not self._get_act(pcb): # TODO: Error Recovery raise @@prefixParseException(pcb.line, pcb.column, - [self._symbols[sym] 
- for (sym, pcb.act, pcb.idx) - in self._act[pcb.tos.state]]) + [self._symbols[sym] + for (sym, pcb.act, pcb.idx) + in self._act[pcb.tos.state]]) - #print("pcb.act = %d" % pcb.act) + # print("pcb.act = %d" % pcb.act) # Shift if pcb.act & self._SHIFT: - #print("SHIFT", pcb.sym, self._symbols[pcb.sym]) + # print("SHIFT", pcb.sym, self._symbols[pcb.sym]) pcb.tos = _@@prefixToken() pcb.stack.append(pcb.tos) @@ -509,7 +509,9 @@ class @@prefixParser: @@top-value = pcb.buf[:pcb.len] if pcb.tos.symbol[1]: - pcb.tos.node = @@prefixNode(pcb.tos.symbol[1], @@top-value) + pcb.tos.node = @@prefixNode( + pcb.tos.symbol[1], @@top-value + ) if pcb.sym != @@eof and pcb.sym != @@error: self._clear_input(pcb) diff --git a/unicc.svg b/unicc.svg new file mode 100644 index 0000000..5ab8d37 --- /dev/null +++ b/unicc.svg @@ -0,0 +1,79 @@ + + + + + + + + + + + + + + UniCC + +