Merge pull request #3 from macbre/initial-version
Initial version
macbre authored Jan 11, 2018
2 parents 3bc5d0d + 8cab787 commit 177516b
Showing 8 changed files with 289 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -99,3 +99,5 @@ ENV/

# mypy
.mypy_cache/

.idea/
8 changes: 8 additions & 0 deletions .travis.yml
@@ -0,0 +1,8 @@
language: python
python:
- "2.7"
- "3.4"
- "3.5"
- "3.6"
install: make install
script: make coverage && make lint
25 changes: 25 additions & 0 deletions Makefile
@@ -0,0 +1,25 @@
coverage_options = --include='sql_metadata.py' --omit='test/*'

install:
pip install -e .

test:
py.test

coverage:
rm -f .coverage*
rm -rf htmlcov/*
coverage run -p -m py.test
coverage combine
coverage html -d htmlcov $(coverage_options)
coverage xml -i
coverage report $(coverage_options)

lint:
pylint sql_metadata.py

publish:
# run git tag -a v0.0.0 before running make publish
python setup.py sdist upload -r pypi

.PHONY: test
23 changes: 22 additions & 1 deletion README.md
@@ -1,2 +1,23 @@
# sql-metadata
Uses tokenized query returned by [`python-sqlparse`](https://github.com/andialbrecht/sqlparse) and generates query metadata

[![PyPI](https://img.shields.io/pypi/v/sql-metadata.svg)](https://pypi.python.org/pypi/sql-metadata)
[![Build Status](https://travis-ci.org/macbre/sql-metadata.svg?branch=master)](https://travis-ci.org/macbre/sql-metadata)

Uses the tokenized query returned by [`python-sqlparse`](https://github.com/andialbrecht/sqlparse) and generates query metadata. Extracts column names and tables used by the query.

### Usage

```python
>>> import sql_metadata

>>> sql_metadata.get_query_tokens("SELECT * FROM foo")
[<DML 'SELECT' at 0x7F14FFDEB808>, <Wildcard '*' at 0x7F14FFDEB940>, <Keyword 'FROM' at 0x7F14FFDEBBB0>, <Name 'foo' at 0x7F14FFDEB9A8>]

>>> sql_metadata.get_query_columns("SELECT test, id FROM foo, bar")
[u'test', u'id']

>>> sql_metadata.get_query_tables("SELECT test, id FROM foo, bar")
[u'foo', u'bar']
```
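
Multi-table `FROM` clauses, aliased `JOIN`s and `INSERT` statements are handled as well. The snippet below is a brief illustration; the queries and the expected results (shown as comments) are taken from the assertions in `test/test_query.py`:

```python
import sql_metadata

# several tables listed in the FROM clause
sql_metadata.get_query_tables("SELECT foo FROM test_table, second_table WHERE id = 1")
# ['test_table', 'second_table']

# aliased tables combined with INNER JOIN
sql_metadata.get_query_tables(
    "SELECT r.wiki_id AS id, pageviews_Nday AS pageviews "
    "FROM report_wiki_recent_pageviews AS r "
    "INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id "
    "WHERE d.public = X AND r.lang = X AND r.hub_name = X "
    "ORDER BY pageviews DESC LIMIT N")
# ['report_wiki_recent_pageviews', 'dimension_wikis']

# INSERT queries
sql_metadata.get_query_tables("INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');")
# ['0070_insert_ignore_table']
```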

> See the `test/test_query.py` file for more examples covering more complex queries.
46 changes: 46 additions & 0 deletions setup.py
@@ -0,0 +1,46 @@
from setuptools import setup

VERSION = '1.0'

# @see https://github.com/pypa/sampleproject/blob/master/setup.py
setup(
name='sqlmetadata',
version=VERSION,
author='Maciej Brencz',
author_email='maciej.brencz@gmail.com',
license='MIT',
description='Uses tokenized query returned by python-sqlparse and generates query metadata',
url='https://github.com/macbre/sql-metadata',
# https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifiers=[
# How mature is this project? Common values are
# 3 - Alpha
# 4 - Beta
# 5 - Production/Stable
'Development Status :: 5 - Production/Stable',

# Indicate who your project is intended for
'Intended Audience :: Developers',
'Intended Audience :: System Administrators',
'Topic :: Database',

# Pick your license as you wish
'License :: OSI Approved :: MIT License',

# Specify the Python versions you support here. In particular, ensure
# that you indicate whether you support Python 2, Python 3 or both.
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
py_modules=["sql_metadata"],
install_requires=[
'coverage==4.4.2',
'pylint==1.8.1',
'pytest==3.2.3',
'sqlparse==0.2.4',
]
)
106 changes: 106 additions & 0 deletions sql_metadata.py
@@ -0,0 +1,106 @@
"""
This module provides SQL query parsing functions
"""
import re

import sqlparse

from sqlparse.sql import TokenList
from sqlparse.tokens import Name, Whitespace, Wildcard


def preprocess_query(query):
"""
Perform initial query cleanup
    :type query: str
    :rtype: str
"""
# 1. remove aliases
# FROM `dimension_wikis` `dw`
# INNER JOIN `fact_wam_scores` `fwN`
query = re.sub(r'(\s(FROM|JOIN)\s`[^`]+`)\s`[^`]+`', r'\1', query, flags=re.IGNORECASE)

return query


def get_query_tokens(query):
"""
    :type query: str
:rtype: list[sqlparse.sql.Token]
"""
query = preprocess_query(query)

tokens = TokenList(sqlparse.parse(query)[0].tokens).flatten()
# print([(token.value, token.ttype) for token in tokens])

return [token for token in tokens if token.ttype is not Whitespace]


def get_query_columns(query):
"""
    :type query: str
:rtype: list[str]
"""
columns = []
last_keyword = None
last_token = None

for token in get_query_tokens(query):
if token.is_keyword and token.value.upper() not in ['AS', 'AND', 'OR']:
# keep the name of the last keyword, e.g. SELECT, FROM, WHERE, (ORDER) BY
last_keyword = token.value.upper()
# print('keyword', last_keyword)
elif token.ttype is Name:
# analyze the name tokens, column names and where condition values
if last_keyword in ['SELECT', 'WHERE', 'BY'] and last_token not in ['AS']:
# print(last_keyword, last_token, token.value)

if token.value not in columns:
columns.append(token.value)
elif token.ttype is Wildcard:
# handle wildcard in SELECT part, but ignore count(*)
# print(last_keyword, last_token, token.value)
if last_keyword == 'SELECT' and last_token != '(':
columns.append(token.value)

last_token = token.value.upper()

return columns


def get_query_tables(query):
"""
    :type query: str
:rtype: list[str]
"""
tables = []
last_keyword = None
last_token = None

table_syntax_keywords = [
# SELECT queries
'FROM', 'WHERE', 'JOIN', 'INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'ON',
# INSERT queries
'INTO', 'VALUES'
]

for token in get_query_tokens(query):
# print([token, token.ttype])
if token.is_keyword and token.value.upper() in table_syntax_keywords:
# keep the name of the last keyword
last_keyword = token.value.upper()
# print('keyword', last_keyword)
elif token.ttype is Name or token.is_keyword:
# print([last_keyword, last_token, token.value])
# analyze the name tokens, column names and where condition values
if last_keyword in ['FROM', 'JOIN', 'INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'INTO'] \
and last_token not in ['AS'] \
and token.value not in ['AS']:
table_name = token.value.strip('`')
if table_name not in tables:
tables.append(table_name)

last_token = token.value.upper()

return tables
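
A short sketch of how these helpers behave; the inputs and the expected values (shown as comments) are copied from the assertions in `test/test_query.py`:

```python
from sql_metadata import preprocess_query, get_query_columns

# preprocess_query() drops backtick-quoted table aliases that follow FROM / JOIN,
# e.g. "FROM `dimension_wikis` `dw`" becomes "FROM `dimension_wikis`"
print(preprocess_query(
    "SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` "
    "INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) "
    "WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC"))
# SELECT DISTINCT dw.lang FROM `dimension_wikis` INNER JOIN `fact_wam_scores`
#   ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC

# get_query_columns() collects names from the SELECT list and the WHERE / GROUP BY / ORDER BY
# clauses, skipping aliases introduced with AS and the wildcard inside count(*)
print(get_query_columns("SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3"))
# ['foo', 'count', 'id']
```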
Empty file added test/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions test/test_query.py
@@ -0,0 +1,80 @@
from unittest import TestCase

from sql_metadata import preprocess_query, get_query_columns, get_query_tables


class TestUtils(TestCase):

def test_preprocess_query(self):
        self.assertEqual(
preprocess_query('SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC'),
'SELECT DISTINCT dw.lang FROM `dimension_wikis` INNER JOIN `fact_wam_scores` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC'
)

        self.assertEqual(
preprocess_query("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"),
"SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` left join `fact_wam_scores` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"
)

def test_get_query_columns(self):
self.assertListEqual(['*'],
get_query_columns('SELECT * FROM `test_table`'))

self.assertListEqual(['foo'],
get_query_columns('SELECT foo FROM `test_table`'))

self.assertListEqual(['id', 'foo'],
get_query_columns('SELECT id, foo FROM test_table WHERE id = 3'))

self.assertListEqual(['foo', 'count', 'id'],
get_query_columns('SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3'))

self.assertListEqual(['foo', 'test'],
get_query_columns('SELECT foo, test as bar FROM `test_table`'))

self.assertListEqual(['bar'],
get_query_columns('SELECT /* a comment */ bar FROM test_table'))

# assert False

def test_get_query_tables(self):
self.assertListEqual(['test_table'],
get_query_tables('SELECT * FROM `test_table`'))

self.assertListEqual(['0001_test_table'],
get_query_tables('SELECT * FROM `0001_test_table`'))

self.assertListEqual(['test_table'],
get_query_tables('SELECT foo FROM `test_table`'))

self.assertListEqual(['test_table'],
get_query_tables('SELECT foo FROM test_table WHERE id = 1'))

self.assertListEqual(['test_table', 'second_table'],
get_query_tables('SELECT foo FROM test_table, second_table WHERE id = 1'))

self.assertListEqual(['revision', 'page', 'wikicities_user'],
get_query_tables('SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N'))

self.assertListEqual(['events'],
get_query_tables("SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP"))

# complex queries
# @see https://github.com/macbre/query-digest/issues/16
self.assertListEqual(['report_wiki_recent_pageviews', 'dimension_wikis'],
get_query_tables("SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N"))

self.assertListEqual(['dimension_wikis', 'fact_wam_scores'],
get_query_tables("SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC"))

self.assertListEqual(['fact_wam_scores', 'dimension_wikis'],
get_query_tables("SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))"))

# INSERT queries
self.assertListEqual(['0070_insert_ignore_table'],
get_query_tables("INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"))

self.assertListEqual(['0070_insert_ignore_table'],
get_query_tables("INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');"))

# assert False
