diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..11306c8 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,9 @@ +[run] +source = squel +branch = True + +[report] +ignore_errors = True +omit = + .venv/* + .pytest_cache/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..9f8e5ae --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,41 @@ +name: test +on: [push] + +jobs: + test: + runs-on: ubuntu-24.04 + name: Python ${{matrix.python-version}} + strategy: + matrix: + python-version: ["3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install poetry + run: sudo apt-get install python3-poetry + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry + key: cache-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} + + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + + - name: Run linting + run: poetry run flake8 --max-complexity=15 --exclude=./build,.venv,.tox,.eggs,dist,docs + + - name: Run tests + run: poetry run coverage run -m pytest + + - name: Collect coverage + run: poetry run coverage report diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e51d45b --- /dev/null +++ b/.gitignore @@ -0,0 +1,106 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos 
into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +poetry.lock \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..dd350ff --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 nomorepanic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc0c7f7 --- /dev/null +++ b/README.md @@ -0,0 +1,22 @@ +# Squel + +SQL parsing, done right. + +WIP, but you can use it as a base. Just add the missing definitions to +the grammar. + +## Example + +```python +from squel.app import Squel + + +tree = Squel.parse('hello.sql') +print(tree.pretty()) +``` + +### CLI + +```sh +squel parse hello.sql +``` diff --git a/grammar/postgres.ebnf b/grammar/postgres.ebnf new file mode 100644 index 0000000..977944a --- /dev/null +++ b/grammar/postgres.ebnf @@ -0,0 +1,75 @@ +start: _NL? block? + +argument_value: NAME | INT +argument: _OP (argument_value (_COMMA _WS? argument_value)*) _CP + +primary: _PRIMARY _WS _KEY +unique: _UNIQUE +references: _REFS _WS NAME _WS (_ON _WS _DELETE _WS _CASCADE)? +function_name: NAME _OP _CP + +timestamp: TIME_TYPE _WS "without time zone" +char: CHAR_TYPE _WS _VARYING argument +type: BOOLEAN_TYPE | INT_TYPE | char | TEXT_TYPE | timestamp | SERIAL_TYPE | UUID_TYPE +value: INT | FALSE | TRUE + +not_null: _NOT _WS _NULL +default: _DEFAULT _WS (value|function_name) +property: not_null | default + +constraint: (primary|unique|references) _COMMA? +bound_constraint: constraint _WS? argument _COMMA? +column_name: _QUOTE? NAME _QUOTE? +column: column_name _WS type (_WS property)* (_WS constraint)? _COMMA? + +table_name: NAME (_DOT NAME)* +table_head: _CREATE _WS _TABLE _WS table_name _WS? _OP +table_body: (column _NL)+ (bound_constraint _NL)* +table_block: table_head _NL _INDENT table_body _DEDENT _CP _SCOLON + +index: _CREATE _WS _INDEX _WS NAME _WS _ON _WS NAME _WS? 
argument _SCOLON + +comment: _COMMENT + +line: comment | index | block +block: line _NL | table_block + +_CREATE: "CREATE" +_TABLE: "TABLE"i +_NOT: "not"i +_NULL: "null"i +_DEFAULT: "default"i +_VARYING: "varying"i +_PRIMARY: "primary"i +_KEY: "KEY"i +_UNIQUE: "UNIQUE"i +_REFS: "references"i +_ON: "on"i +_DELETE: "delete"i +_CASCADE: "cascade"i +_INDEX: "index"i +_COMMENT: /--(.*)/ + +BOOLEAN_TYPE: "boolean" +INT_TYPE: "integer" +CHAR_TYPE: "character" +TEXT_TYPE: "text" +TIME_TYPE: "timestamp" +SERIAL_TYPE: "serial" +UUID_TYPE: "uuid" + +NAME.1: /[a-zA-Z_0-9]+/ +INT.2: "0".."9"+ +TRUE: "true" +FALSE: "false" + +_WS: (" ")+ +_NL: /(\r?\n[\t ]*)+/ +_INDENT: "" +_DEDENT: "" +_OP: "(" +_CP: ")" +_COMMA: "," +_SCOLON: ";" +_DOT: "." +_QUOTE: "\"" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b5d0d94 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[tool.poetry] +name = "squel" +version = "1.0.0" +description = "SQL parsing, done right" +authors = ["nomorepanic "] +readme = "README.md" +repository = "https://github.com/nomorepanic/squel" +license="MIT" +include = ["LICENSE", "README.md", "grammar/postgres.ebnf"] + +[tool.poetry.dependencies] +python = "^3.11" +click = ">=6.7" +lark-parser = ">=0.6.4" + + +[tool.poetry.group.dev.dependencies] +coverage = "^7.0" +flake8 = "^7.0" +flake8-quotes = "^3.4" +flake8-import-order = "^0.18" +pep8-naming = "^0.14" +pytest = "^8" +pytest-mock = "^3.14" +pytest-sugar = "^1.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +squel = "squel.cli:Cli.main" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..dc621a8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +python_files=tests/*/*.py diff --git a/squel/__init__.py b/squel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/squel/app.py b/squel/app.py new file mode 100644 index 0000000..e1b9bdc --- /dev/null +++ b/squel/app.py @@ 
-0,0 +1,13 @@ +# -*- coding: utf-8 -*- +import io + +from .parser import Parser + + +class Squel: + + @staticmethod + def parse(path): + with io.open(path, 'r') as file: + source = file.read() + return Parser().parse(source) diff --git a/squel/cli.py b/squel/cli.py new file mode 100644 index 0000000..e110621 --- /dev/null +++ b/squel/cli.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +import click + +from .app import Squel + + +class Cli: + + @click.group() + def main(): + pass + + @staticmethod + @main.command() + @click.argument('path') + def parse(path): + click.echo(Squel.parse(path)) diff --git a/squel/grammar.py b/squel/grammar.py new file mode 100644 index 0000000..3704db1 --- /dev/null +++ b/squel/grammar.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +import io + + +class Grammar: + + @staticmethod + def grammar(ebnf_file): + with io.open(ebnf_file) as file: + return file.read() diff --git a/squel/indenter.py b/squel/indenter.py new file mode 100644 index 0000000..f42f026 --- /dev/null +++ b/squel/indenter.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +from lark.indenter import Indenter + + +class CustomIndenter(Indenter): + NL_type = '_NL' + OPEN_PAREN_types = [] + CLOSE_PAREN_types = [] + INDENT_type = '_INDENT' + DEDENT_type = '_DEDENT' + tab_len = 8 diff --git a/squel/parser.py b/squel/parser.py new file mode 100644 index 0000000..9150e77 --- /dev/null +++ b/squel/parser.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import os + +from lark import Lark + +from .grammar import Grammar +from .indenter import CustomIndenter +from .transformer import Transformer + + +class Parser: + def __init__(self, algo='lalr', ebnf_file=None): + self.algo = algo + self.ebnf_file = ebnf_file + + @staticmethod + def indenter(): + """ + Initialize the indenter + """ + return CustomIndenter() + + @staticmethod + def transformer(): + """ + Initialize the transformer + """ + return Transformer() + + @staticmethod + def default_ebnf(): + folder = os.path.dirname(__file__) + path = 
os.path.join(folder, '..', 'grammar', 'postgres.ebnf') + return os.path.realpath(path) + + def lark(self): + if self.ebnf_file is None: + self.ebnf_file = self.default_ebnf() + grammar = Grammar.grammar(self.ebnf_file) + return Lark(grammar, parser=self.algo, postlex=self.indenter()) + + def parse(self, source): + source = '{}\n'.format(source) + lark = self.lark() + tree = lark.parse(source) + return self.transformer().transform(tree) diff --git a/squel/transformer.py b/squel/transformer.py new file mode 100644 index 0000000..0a6dfd2 --- /dev/null +++ b/squel/transformer.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +from lark import Transformer as LarkTransformer + +from .tree import Tree + + +class Transformer(LarkTransformer): + + def __getattr__(self, attribute, *args): + return lambda matches: Tree(attribute, matches) diff --git a/squel/tree.py b/squel/tree.py new file mode 100644 index 0000000..4346d7b --- /dev/null +++ b/squel/tree.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +from lark.lexer import Token +from lark.tree import Tree as LarkTree + + +class Tree(LarkTree): + + @staticmethod + def walk(tree, path): + for item in tree.children: + if isinstance(item, Tree): + if item.data == path: + return item + + def node(self, path): + """ + Finds a subtree or a nested subtree, using path + """ + shards = path.split('.') + current = None + for shard in shards: + if current is None: + current = self.walk(self, shard) + else: + current = self.walk(current, shard) + return current + + def child(self, index): + if len(self.children) > index: + return self.children[index] + + def line(self): + """ + Finds the line number of a tree, by finding the first token in the tree + and returning its line + """ + for child in self.children: + if isinstance(child, Token): + return str(child.line) + return child.line() + + def __getattr__(self, attribute): + return self.node(attribute) diff --git a/squel/version.py b/squel/version.py new file mode 100644 index 0000000..892084b 
--- /dev/null +++ b/squel/version.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +version = '1.0.0' diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/parser.py b/tests/integration/parser.py new file mode 100644 index 0000000..f658e54 --- /dev/null +++ b/tests/integration/parser.py @@ -0,0 +1,6 @@ +from squel.parser import Parser +from squel.tree import Tree + + +def test_parse_empty_string(): + assert Parser().parse('') == Tree('start', []) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/app.py b/tests/unit/app.py new file mode 100644 index 0000000..3c52bc3 --- /dev/null +++ b/tests/unit/app.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +import io + +from squel.app import Squel +from squel.parser import Parser + + +def test_app_squel_parse(patch): + patch.object(io, 'open') + patch.init(Parser) + patch.object(Parser, 'parse') + result = Squel.parse('path') + io.open.assert_called_with('path', 'r') + Parser.parse.assert_called_with(io.open().__enter__().read()) + assert result == Parser.parse() diff --git a/tests/unit/cli.py b/tests/unit/cli.py new file mode 100644 index 0000000..fc5b00b --- /dev/null +++ b/tests/unit/cli.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +import click +from click.testing import CliRunner + +from pytest import fixture + +from squel.app import Squel +from squel.cli import Cli + + +@fixture +def runner(): + return CliRunner() + + +def test_cli_parse(patch, runner): + patch.object(click, 'echo') + patch.object(Squel, 'parse') + result = runner.invoke(Cli.parse, ['path']) + Squel.parse.assert_called_with('path') + click.echo.assert_called_with(Squel.parse()) + assert result.exit_code == 0 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000..06ccbdd --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,49 @@ +# -*- coding: 
utf-8 -*- +from pytest import fixture + + +@fixture +def magic(mocker): + """ + Shorthand for mocker.MagicMock. It's magic! + """ + return mocker.MagicMock + + +@fixture +def patch_init(mocker): + """ + Makes patching a class' constructor slightly easier + """ + def patch_init(item): + mocker.patch.object(item, '__init__', return_value=None) + return patch_init + + +@fixture +def patch_many(mocker): + """ + Makes patching many attributes of the same object simpler + """ + def patch_many(item, attributes): + for attribute in attributes: + mocker.patch.object(item, attribute) + return patch_many + + +@fixture +def patch(mocker, patch_init, patch_many): + mocker.patch.init = patch_init + mocker.patch.many = patch_many + return mocker.patch + + +@fixture +def call_count(): + """ + Makes asserting a call count on the same module less repetitive. + """ + def call_count(module, methods, count=1): + for method in methods: + assert getattr(module, method).call_count == count + return call_count diff --git a/tests/unit/grammar.py b/tests/unit/grammar.py new file mode 100644 index 0000000..a1cc9b7 --- /dev/null +++ b/tests/unit/grammar.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +import io + +from squel.grammar import Grammar + + +def test_grammar_grammar(patch): + patch.object(io, 'open') + result = Grammar.grammar('grammar.ebnf') + io.open.assert_called_with('grammar.ebnf') + assert result == io.open().__enter__().read() diff --git a/tests/unit/indenter.py b/tests/unit/indenter.py new file mode 100644 index 0000000..4dd6a66 --- /dev/null +++ b/tests/unit/indenter.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +from lark.indenter import Indenter + +from squel.indenter import CustomIndenter + + +def test_indenter_customindenter(): + assert issubclass(CustomIndenter, Indenter) + assert CustomIndenter.NL_type == '_NL' + assert CustomIndenter.OPEN_PAREN_types == [] + assert CustomIndenter.CLOSE_PAREN_types == [] + assert CustomIndenter.INDENT_type == '_INDENT' + assert 
CustomIndenter.DEDENT_type == '_DEDENT' + assert CustomIndenter.tab_len == 8 diff --git a/tests/unit/parser.py b/tests/unit/parser.py new file mode 100644 index 0000000..b5667cf --- /dev/null +++ b/tests/unit/parser.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +import os + +from lark import Lark + +from pytest import fixture + +from squel.grammar import Grammar +from squel.indenter import CustomIndenter +from squel.parser import Parser +from squel.transformer import Transformer + + +@fixture +def parser(): + return Parser() + + +def test_parser_init(parser): + assert parser.algo == 'lalr' + assert parser.ebnf_file is None + + +def test_parser_init_algo(): + parser = Parser(algo='algo') + assert parser.algo == 'algo' + + +def test_parser_init_ebnf_file(): + parser = Parser(ebnf_file='alternative.ebnf') + assert parser.ebnf_file == 'alternative.ebnf' + + +def test_parser_indenter(patch, parser): + patch.init(CustomIndenter) + assert isinstance(parser.indenter(), CustomIndenter) + + +def test_parser_transfomer(patch, parser): + patch.init(Transformer) + assert isinstance(parser.transformer(), Transformer) + + +def test_parser_default_ebnf(patch): + patch.object(os, 'path') + result = Parser.default_ebnf() + args = (os.path.dirname(), '..', 'grammar', 'postgres.ebnf') + os.path.join.assert_called_with(*args) + os.path.realpath.assert_called_with(os.path.join()) + assert result == os.path.realpath() + + +def test_parser_lark(patch, parser): + patch.object(Grammar, 'grammar') + patch.many(Parser, ['indenter', 'default_ebnf']) + patch.init(Lark) + result = parser.lark() + Grammar.grammar.assert_called_with(Parser.default_ebnf()) + kwargs = {'parser': parser.algo, 'postlex': Parser.indenter()} + Lark.__init__.assert_called_with(Grammar.grammar(), **kwargs) + assert isinstance(result, Lark) + + +def test_parser_lark_ebnf(patch, parser): + patch.object(Grammar, 'grammar') + patch.object(Parser, 'indenter') + patch.init(Lark) + parser.ebnf_file = 'grammar.ebnf' + 
parser.lark() + Grammar.grammar.assert_called_with('grammar.ebnf') + + +def test_parser_parse(patch, parser): + patch.many(Parser, ['lark', 'transformer']) + result = parser.parse('source') + Parser.lark().parse.assert_called_with('source\n') + Parser.transformer().transform.assert_called_with(Parser.lark().parse()) + assert result == Parser.transformer().transform() diff --git a/tests/unit/transformer.py b/tests/unit/transformer.py new file mode 100644 index 0000000..af9d4f7 --- /dev/null +++ b/tests/unit/transformer.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +from lark import Transformer as LarkTransformer + +from pytest import mark + +from squel.transformer import Transformer +from squel.tree import Tree + + +def test_transformer(): + assert issubclass(Transformer, LarkTransformer) + + +@mark.parametrize('rule', ['start', 'line', 'block', 'column', 'property']) +def test_transformer_rules(rule): + transformer = Transformer() + result = getattr(transformer, rule)(['matches']) + assert isinstance(result, Tree) + assert result.data == rule + assert result.children == ['matches'] diff --git a/tests/unit/tree.py b/tests/unit/tree.py new file mode 100644 index 0000000..84ec310 --- /dev/null +++ b/tests/unit/tree.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +from lark.lexer import Token +from lark.tree import Tree as LarkTree + +from squel.tree import Tree + + +def test_tree(): + assert issubclass(Tree, LarkTree) + + +def test_tree_walk(): + inner_tree = Tree('inner', []) + tree = Tree('rule', [inner_tree]) + result = Tree.walk(tree, 'inner') + assert result == inner_tree + + +def test_tree_walk_token(): + """ + Ensures that encountered tokens are skipped + """ + inner_tree = Tree('inner', []) + tree = Tree('rule', [Token('test', 'test'), inner_tree]) + result = Tree.walk(tree, 'inner') + assert result == inner_tree + + +def test_tree_node(patch): + patch.object(Tree, 'walk') + tree = Tree('rule', []) + result = tree.node('inner') + Tree.walk.assert_called_with(tree, 
'inner') + assert result == Tree.walk() + + +def test_tree_child(): + tree = Tree('rule', ['child']) + assert tree.child(0) == 'child' + + +def test_tree_child_overflow(): + tree = Tree('rule', ['child']) + assert tree.child(1) is None + + +def test_tree_line(): + tree = Tree('outer', [Tree('path', [Token('WORD', 'word', line=1)])]) + assert tree.line() == '1' + + +def test_tree_attributes(patch): + patch.object(Tree, 'node') + tree = Tree('master', []) + result = tree.branch + Tree.node.assert_called_with('branch') + assert result == Tree.node() diff --git a/tests/unit/version.py b/tests/unit/version.py new file mode 100644 index 0000000..61ce61b --- /dev/null +++ b/tests/unit/version.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +from squel.version import version + + +def test_version_version(): + assert version == '1.0.0'