From 407e1067bc961b7bcddca3035ef14fea06e03c12 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 10:51:46 -0600 Subject: [PATCH 01/22] Attempting to deal with UTF-8 BOM-table stuff. --- pgsanity/pgsanity.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index c620863..850b9a0 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -2,6 +2,8 @@ from __future__ import print_function from __future__ import absolute_import +from chardet import detect +from codecs import BOM_UTF8 import argparse import sys @@ -13,6 +15,25 @@ def get_config(argv=sys.argv[1:]): parser.add_argument('files', nargs='*', default=None) return parser.parse_args(argv) +def check_for_bom(starting_bytes): + """ Check the first few bytes of a file to determine whether input + contains a BOM-table or not. + + Returns a boolean indicating whether a BOM-table appears to be present. + """ + minlen = len(BOM_UTF8) + if len(starting_bytes) < minlen: + raise ValueError("Starting bytes of file must be at least" + " {} bytes long to check for BOM.".format(minlen)) + encoding = detect(starting_bytes)["encoding"] + is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] + return is_utf8 and starting_bytes.startswith(BOM_UTF8) + # ^ The above is a tiny bit redundant given that 'UTF-8-SIG' simply means + # "UTF-8 file with a BOM-table". However, older versions of chardet don't + # support this, and will just detect 'UTF-8', leaving us to check for the + # BOM ourselves as we do above. The extra check is not harmful on + # systems that have a more recent chardet module. + def check_file(filename=None, show_filename=False): """ Check whether an input file is valid PostgreSQL. 
If no filename is @@ -22,8 +43,13 @@ def check_file(filename=None, show_filename=False): """ # either work with sys.stdin or open the file if filename is not None: - with open(filename, "r") as filelike: - sql_string = filelike.read() + with open(filename, "rb") as filelike: + # discard BOM if present then read remaining bytes + nose = filelike.read(len(BOM_UTF8)) + nose = '' if check_for_bom(nose) else nose + sql_string = nose + filelike.read() + sql_string = sql_string.decode("utf-8") + # ^ This is safe for both ASCII and UTF-8 files. else: with sys.stdin as filelike: sql_string = sys.stdin.read() From a4b6c984fe18bd9d2670aa25c8564efefab768a1 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:38:13 -0600 Subject: [PATCH 02/22] Attempting a modification which will strip BOM for piped input as well as regular files. --- pgsanity/pgsanity.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 850b9a0..2bdc5bd 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -44,17 +44,16 @@ def check_file(filename=None, show_filename=False): # either work with sys.stdin or open the file if filename is not None: with open(filename, "rb") as filelike: - # discard BOM if present then read remaining bytes - nose = filelike.read(len(BOM_UTF8)) - nose = '' if check_for_bom(nose) else nose - sql_string = nose + filelike.read() - sql_string = sql_string.decode("utf-8") - # ^ This is safe for both ASCII and UTF-8 files. + sql_string = filelike.read() else: with sys.stdin as filelike: sql_string = sys.stdin.read() - - success, msg = check_string(sql_string) + # check for BOM-table and discard if present + nose = sql_string[0:len(BOM_UTF8)] + bom_present = check_for_bom(nose) + sql_string = sql_string[len(nose):] if bom_present else sql_string + success, msg = check_string(sql_string.decode("utf-8")) + # ^ The above called to decode() is safe for both ASCII and UTF-8 data. 
# report results result = 0 From 7cca7a494f09bf7db1a9e6fe12f3a2ef21c74f25 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:43:59 -0600 Subject: [PATCH 03/22] Typo fix plus small experiment --- pgsanity/pgsanity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2bdc5bd..2316a21 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -43,7 +43,7 @@ def check_file(filename=None, show_filename=False): """ # either work with sys.stdin or open the file if filename is not None: - with open(filename, "rb") as filelike: + with open(filename, "r") as filelike: sql_string = filelike.read() else: with sys.stdin as filelike: @@ -53,7 +53,7 @@ def check_file(filename=None, show_filename=False): bom_present = check_for_bom(nose) sql_string = sql_string[len(nose):] if bom_present else sql_string success, msg = check_string(sql_string.decode("utf-8")) - # ^ The above called to decode() is safe for both ASCII and UTF-8 data. + # ^ The above call to decode() is safe for both ASCII and UTF-8 data. # report results result = 0 From 440942087a20e7bd2d5c026649afe80154a9da9a Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:54:14 -0600 Subject: [PATCH 04/22] Fixed some unescaped metacharacters in README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88d810b..038db9a 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,8 @@ errors of the SQL. - arch: sudo pacman -S postgresql-libs ###Getting PgSanity -PgSanity is available in the Python Package Index, so you can install it with either easy_install or pip. Here's [PgSanity's page on PyPI](http://pypi.python.org/pypi/pgsanity). -- sudo pip install pgsanity **or** sudo easy_install pgsanity +PgSanity is available in the Python Package Index, so you can install it with either easy\_install or pip. Here's [PgSanity's page on PyPI](http://pypi.python.org/pypi/pgsanity). 
+- sudo pip install pgsanity **or** sudo easy\_install pgsanity - If you don't have pip you can get it on ubuntu/debian by running: sudo apt-get install python-pip ##Usage From 69bf9e63696a1d57834616d3e7f3c34dddcb35b5 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:08:48 -0600 Subject: [PATCH 05/22] Experimenting with getting pgsanity to ignore psql-commands such as \timing on --- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 5c7d95b..5e92ad6 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start == "--" and not in_block_comment: + elif start in ["--","\n\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/") + bookends = ("\n", ";", "--", "/*", "*/", "\n\\") last_bookend_found = None start = 0 From 852604474b43ccf60b9f8097a24719864c1527b1 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:11:38 -0600 Subject: [PATCH 06/22] Tweaked change for ignoring psql-commands. 
--- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 5e92ad6..4150a78 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start in ["--","\n\\"] and not in_block_comment: + elif start in ["--","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/", "\n\\") + bookends = ("\n", ";", "--", "/*", "*/", "\\") last_bookend_found = None start = 0 From 7e1b7f99285be145dfe363aeab6d905f213bcf20 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:17:55 -0600 Subject: [PATCH 07/22] Tweaked change for ignoring psql-commands. Attempt 2. 
--- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 4150a78..6096d61 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start in ["--","\\"] and not in_block_comment: + elif start in ["--","\n\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/", "\\") + bookends = ("\n\\","\n", ";", "--", "/*", "*/") last_bookend_found = None start = 0 From 07ad49de0b3bef432e8e1d83a2e69b19148fa146 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 15 May 2016 12:22:27 -0600 Subject: [PATCH 08/22] Tweaked change for ignoring psql-commands. Attempt 3. --- pgsanity/sqlprep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 6096d61..d130471 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,7 +17,7 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start != "--" and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\n\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " From b42c2ec9cb79489371757a62037a9082b8d6a6f4 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:29:31 -0600 Subject: [PATCH 09/22] Psql-commands are now being ignored as desired. Cleanup is pending. 
--- pgsanity/sqlprep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index d130471..9e7d682 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,14 +17,14 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start not in ["--","\n\\"] and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\n\\","\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " if start == "/*": in_block_comment = True - elif start in ["--","\n\\"] and not in_block_comment: + elif start in ["--","\n\\","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n\\","\n", ";", "--", "/*", "*/") + bookends = ("\n\\","\n", ";", "--", "/*", "*/","\\") last_bookend_found = None start = 0 From 0a7762338282af21f8826d41ab76443a21f4a096 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:33:43 -0600 Subject: [PATCH 10/22] Minor cleanup of psql-command-stripping tweak. 
--- pgsanity/sqlprep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 9e7d682..26a4f76 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,14 +17,14 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start not in ["--","\n\\","\\"] and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " if start == "/*": in_block_comment = True - elif start in ["--","\n\\","\\"] and not in_block_comment: + elif start in ["--","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n\\","\n", ";", "--", "/*", "*/","\\") + bookends = ("\n", ";", "--", "/*", "*/","\\") last_bookend_found = None start = 0 From ca34532c792930a5abf9e47814c9ddfea20bbe20 Mon Sep 17 00:00:00 2001 From: Koyae Date: Mon, 16 May 2016 23:21:36 -0600 Subject: [PATCH 11/22] Minor adjustment to improve encapsulation of logic written to strip BOM-table information, per a comment by the original author. 
--- pgsanity/pgsanity.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2316a21..29a3595 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -15,24 +15,26 @@ def get_config(argv=sys.argv[1:]): parser.add_argument('files', nargs='*', default=None) return parser.parse_args(argv) -def check_for_bom(starting_bytes): - """ Check the first few bytes of a file to determine whether input - contains a BOM-table or not. +def remove_bom_if_exists(sql_string): + """ Take the entire SQL-payload of a file (or stream) and strip the BOM-table + if one was detected. - Returns a boolean indicating whether a BOM-table appears to be present. + sql_string -- string-representation of incoming character-data. Value + should be passed RAW, meaning BEFORE regular decoding take + place. Otherwise, BOM-detection may fail. + + Returns a BOM-free SQL-payload. """ - minlen = len(BOM_UTF8) - if len(starting_bytes) < minlen: - raise ValueError("Starting bytes of file must be at least" - " {} bytes long to check for BOM.".format(minlen)) - encoding = detect(starting_bytes)["encoding"] - is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] - return is_utf8 and starting_bytes.startswith(BOM_UTF8) - # ^ The above is a tiny bit redundant given that 'UTF-8-SIG' simply means - # "UTF-8 file with a BOM-table". However, older versions of chardet don't - # support this, and will just detect 'UTF-8', leaving us to check for the - # BOM ourselves as we do above. The extra check is not harmful on - # systems that have a more recent chardet module. 
+ encoding = detect(sql_string)["encoding"] + is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] # * + bom_present = is_utf8 and sql_string.startswith(BOM_UTF8) # * + sql_string = sql_string[len(BOM_UTF8):] if bom_present else sql_string + return sql_string + # * The marked lines above are a tiny bit redundant given that 'UTF-8-SIG' + # simply means "UTF-8 file with a BOM-table". However, older versions of + # chardet don't support this, and will just detect 'UTF-8', leaving us to + # check for the BOM ourselves as we do above. The extra check is not + # harmful on systems that have a more recent chardet module. def check_file(filename=None, show_filename=False): """ @@ -48,10 +50,7 @@ def check_file(filename=None, show_filename=False): else: with sys.stdin as filelike: sql_string = sys.stdin.read() - # check for BOM-table and discard if present - nose = sql_string[0:len(BOM_UTF8)] - bom_present = check_for_bom(nose) - sql_string = sql_string[len(nose):] if bom_present else sql_string + sql_string = remove_bom_if_exists(sql_string) success, msg = check_string(sql_string.decode("utf-8")) # ^ The above call to decode() is safe for both ASCII and UTF-8 data. From 92d4bfe549efa5c5eedbb8e82f157105e661426b Mon Sep 17 00:00:00 2001 From: Koyae Date: Tue, 17 May 2016 00:03:38 -0600 Subject: [PATCH 12/22] Added test for BOM-table stripping function. --- pgsanity/pgsanity.py | 2 +- test/test_pgsanity.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 29a3595..2add213 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -21,7 +21,7 @@ def remove_bom_if_exists(sql_string): sql_string -- string-representation of incoming character-data. Value should be passed RAW, meaning BEFORE regular decoding take - place. Otherwise, BOM-detection may fail. + place. Otherwise, BOM-detection may fail. Returns a BOM-free SQL-payload. 
""" diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index 8d06d43..4c5de1c 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -1,6 +1,7 @@ import unittest import tempfile import os +from codecs import BOM_UTF8 from pgsanity import pgsanity @@ -26,6 +27,15 @@ def test_check_invalid_string(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_bom_gets_stripped(self): + bomless = "SELECT 'pining for the fjords';".encode('utf-8') + bomful = BOM_UTF8 + bomless + self.assertEqual(pgsanity.remove_bom_if_exists(bomful), bomless) + + def test_bom_removal_idempotence(self): + bomless = "SELET current_setting('parrot.status);".encode('utf-8') + self.assertEqual(bomless, pgsanity.remove_bom_if_exists(bomless)) + class TestPgSanityFiles(unittest.TestCase): def setUp(self): From e7cf7e32efd672b01e80c194583702601cd966da Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 19:23:10 -0600 Subject: [PATCH 13/22] Added a test based off of issue #14. 
--- test/test_pgsanity.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index 4c5de1c..b4bb95b 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -27,6 +27,16 @@ def test_check_invalid_string(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_check_invalid_string_2(self): + text = "SELECT '\n" + text += "-- this is not really a comment' AS c;\n" + text += "SELECT '\n" + text += "-- neither is this' AS c spam;" + + (success,msg) = pgsanity.check_string(text) + self.assertFalse(success) + self.assertEqual('line 4: ERROR: syntax error at or near "spam"') + def test_bom_gets_stripped(self): bomless = "SELECT 'pining for the fjords';".encode('utf-8') bomful = BOM_UTF8 + bomless From b252c8fbe1d45f8405cc19305da5304a2dd9de09 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 23:24:19 -0600 Subject: [PATCH 14/22] Semi-stable commit before I break something trying to fix the last few errors here. 
--- pgsanity/sqlprep.py | 164 +++++++++++++++++++++++++++++------------- test/test_pgsanity.py | 2 +- test/test_sqlprep.py | 53 +++++++------- 3 files changed, 139 insertions(+), 80 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 26a4f76..161a216 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -7,63 +7,125 @@ def prepare_sql(sql): results = StringIO() - in_statement = False - in_line_comment = False - in_block_comment = False - for (start, end, contents) in split_sql(sql): - precontents = None - start_str = None - - # decide where we are - if not in_statement and not in_line_comment and not in_block_comment: - # not currently in any block - if start not in ["--","\\"] and start != "/*" and len(contents.strip()) > 0: - # not starting a comment and there is contents - in_statement = True - precontents = "EXEC SQL " - - if start == "/*": - in_block_comment = True - elif start in ["--","\\"] and not in_block_comment: - in_line_comment = True - if not in_statement: - start_str = "//" - - start_str = start_str or start or "" - precontents = precontents or "" - results.write(start_str + precontents + contents) - - if not in_line_comment and not in_block_comment and in_statement and end == ";": - in_statement = False - - if in_block_comment and end == "*/": - in_block_comment = False - - if in_line_comment and end == "\n": - in_line_comment = False + for current_sql_expression in split_sql(sql): + assert(current_sql_expression[-1] == ';') + results.write("EXEC SQL " + current_sql_expression) response = results.getvalue() results.close() return response +def get_processing_state(current_state, current_char): + """determine the current state of processing an SQL-string. + return: state symbol. + + States: + + _ -- the base state wherein SQL tokens, commands, and math and + other operators occur. This is the initial processesing state + in which the machine starts off + + /*p -- block-comment pre-entry state. 
Identical to the "_" state + except that '*' initiates entry of a block comment + + /* -- block-comment state. In block-comments, no SQL actually + occurs, meaning special characters like quotes and semicolons + have no effect + + /*2 -- block-comment pre-exit state. Identical to the "/*" state + except that '/' causes the current block-comment to be closed + + $$p -- extended-string pre-entry state. Identical to the base state + except that '$' initiates entry of an extended string + + $$ -- extended-string state. In extended strings, all characters + are interpreted as string-data, meaning SQL-commands, + operators, etc. have no effect + + $$2 -- extended-string pre-exit state. Identical to the extended- + string state except that '$' causes the current extended- + string to be closed + + --p -- line-comment pre-entry state. identical to the base state, + except that '-' initiates a line-comment + + -- -- line-comment state. All characters are ignored and not + treated as SQL except for '\n', which is the only character + that prompts a transition out of this state + + ; -- the final state which indicates a single, complete + SQL-statement has just been completed + + ' -- single-quote state. In this state, no characters are treated + as SQL. The only transition away is "'" followed by any + character other than "'" + + '2 -- single-quote pre-exit state. Identical to the single-quote + state except that encountering a character other than "'" + causes the current single-quoted string to be closed + + " -- double-quote state. Similar in nature to the single-quote + state, except that possible transition away is intiated + by '"' instead of "'". + + "2 -- double-quote pre-exit state. 
Similar in nature to the single- + quote pre-exit state except that '"' prompts a return back to + the stable double-quote state, rather than "'" + """ + transitions = { + '_': { + 0: '_', '/' : '/*p', '-': '--p', '$': '$$p', + "'": "'", '"': '"', ';': ';' + }, + "'": {0: "'", "'": "'2"}, + '"': {0: '"', '"': '"2'}, + '--p': {0: '_', '-': '--', ';': ';'}, + '/*p': {0: '_', '*': '/*', ';': ';'}, + '$$p': {0: '_', '$': '$$', ';': ';'}, + '--': {0: '--', '\n':'_'}, + '/*': {0: '/*', '*':'/*2'}, + '/*2': {0: '/*', '/':'_'}, + '$$': {0: '$$', '$': '$$2'}, + '$$2': {0: '$$', '$': '_'}, + "'2": {0: "_", "'": "'", ';': ';'}, + '"2': {0: '_', '"': '"', ';': ';'} + } + # ^ Above, transitions[current_state][0] represents the transition to take + # if no transition is explicitly defined for the passed-in symbol + if current_state not in transitions: + raise ValueError("Received an invalid state '{}'".format(current_state)) + if current_char in transitions[current_state]: + return transitions[current_state][current_char] + else: + return transitions[current_state][0] + def split_sql(sql): - """generate hunks of SQL that are between the bookends - return: tuple of beginning bookend, closing bookend, and contents - note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/","\\") - last_bookend_found = None - start = 0 - - while start <= len(sql): - results = get_next_occurence(sql, start, bookends) - if results is None: - yield (last_bookend_found, None, sql[start:]) - start = len(sql) + 1 - else: - (end, bookend) = results - yield (last_bookend_found, bookend, sql[start:end]) - start = end + len(bookend) - last_bookend_found = bookend + """isolate complete SQL-statements from the passed-in string + return: the SQL-statements from the passed-in string, + separated into individual statements """ + if len(sql) == 0: + raise ValueError("Input appears to be empty.") + previous_state = '_' + current_state = '_' + current_sql_expression = '' + for 
c in sql: + previous_state = current_state + current_state = get_processing_state(current_state,c) + # disard everything except for newlines if in line-comment state + current_sql_expression += c if ( current_state != '--' + or c == "\n" ) else '' +## print "Current char: {} new state: {}".format(repr(c),current_state) + if current_state == ';': + yield current_sql_expression + current_sql_expression = '' + current_state = '_' + elif ( previous_state == '--p' and current_state == '--' ): + # if previous character was the start of a line-comment token, discard + current_sql_expression = current_sql_expression[:-1] + if current_sql_expression and not re.match("[\s;]*",current_sql_expression): + # unless only whitespace and semicolons left, return remaining characters + # between last ; and EOF + yield current_sql_expression + ';' def get_next_occurence(haystack, offset, needles): """find next occurence of one of the needles in the haystack diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index b4bb95b..45a252c 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -35,7 +35,7 @@ def test_check_invalid_string_2(self): (success,msg) = pgsanity.check_string(text) self.assertFalse(success) - self.assertEqual('line 4: ERROR: syntax error at or near "spam"') + self.assertEqual('line 4: ERROR: syntax error at or near "spam"', msg) def test_bom_gets_stripped(self): bomless = "SELECT 'pining for the fjords';".encode('utf-8') diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index 0fe5092..caa1a08 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -5,12 +5,13 @@ class TestSqlPrep(unittest.TestCase): def test_split_sql_nothing_interesting(self): text = "abcd123" - expected = [(None, None, "abcd123")] + expected = ["abcd123;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) + # ^ Retuning the empty string [BOOKMARK] def test_split_sql_trailing_semicolon(self): text = "abcd123;" - expected = [(None, ";", "abcd123"), (";", None, 
'')] + expected = [text] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_split_sql_comment_between_statements(self): @@ -18,23 +19,14 @@ def test_split_sql_comment_between_statements(self): text += "--comment here\n" text += "select a from b;" - expected = [(None, ";", "select a from b"), - (";", "\n", ''), - ("\n", "--", ''), - ("--", "\n", 'comment here'), - ("\n", ";", 'select a from b'), - (";", None, '')] + expected = ["select a from b;","\n\nselect a from b;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_split_sql_inline_comment(self): text = "select a from b; --comment here\n" text += "select a from b;" - expected = [(None, ";", "select a from b"), - (";", "--", ' '), - ("--", "\n", 'comment here'), - ("\n", ";", 'select a from b'), - (";", None, '')] + expected = ["select a from b;", " \nselect a from b;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_handles_first_column_comment_between_statements(self): @@ -42,9 +34,8 @@ def test_handles_first_column_comment_between_statements(self): text += "--comment here\n" text += "blah blah;" - expected = "EXEC SQL blah blah;\n" - expected += "//comment here\n" - expected += "EXEC SQL blah blah;" + expected = "EXEC SQL blah blah;" + expected += "EXEC SQL \n\nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) @@ -52,16 +43,18 @@ def test_handles_inline_comment_between_statements(self): text = "blah blah; --comment here\n" text += "blah blah;" - expected = "EXEC SQL blah blah; //comment here\n" - expected += "EXEC SQL blah blah;" + expected = "EXEC SQL blah blah;" + expected += "EXEC SQL \nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) + # ^ Returning the empty string [BOOKMARK] def test_does_not_mangle_inline_comment_within_statement(self): text = "blah blah--comment here\n" text += "blah blah" - expected = "EXEC SQL " + text + expected = "EXEC SQL blah blah\n" + expected += "blah blah;" self.assertEqual(expected, 
sqlprep.prepare_sql(text)) @@ -70,9 +63,13 @@ def test_does_not_mangle_first_column_comment_within_statement(self): text += "--comment here\n" text += "where c=3" - expected = "EXEC SQL " + text + expected = "select a from b\n" + expected += "\n" + expected += "where c=3;" + expected = "EXEC SQL " + expected self.assertEqual(expected, sqlprep.prepare_sql(text)) + # ^ Returning the empty string [BOOKMARK] def test_prepend_exec_sql_to_simple_statements(self): text = "create table control.myfavoritetable (id bigint);" @@ -80,8 +77,8 @@ def test_prepend_exec_sql_to_simple_statements(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_prepend_exec_sql_multiple_lines(self): - text1 = "create table control.myfavoritetable (id bigint);\n" - text2 = "create table control.myfavoritetable (id bigint);" + text1 = "create table control.myfavoritetable (id bigint);" + text2 = "\ncreate table control.myfavoritetable (id bigint);" expected = "EXEC SQL " + text1 + "EXEC SQL " + text2 self.assertEqual(expected, sqlprep.prepare_sql(text1 + text2)) @@ -112,32 +109,32 @@ def test_prepend_exec_sql_wrapped_trailing_sql(self): def test_comment_start_found_within_comment_within_statement(self): text = "select a from b --comment in comment --here\nwhere c=1;" - expected = "EXEC SQL select a from b --comment in comment --here\nwhere c=1;" + expected = "EXEC SQL select a from b \nwhere c=1;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_comment_start_found_within_comment_between_statements(self): text = "select a from b; --comment in comment --here\nselect c from d;" - expected = "EXEC SQL select a from b; //comment in comment //here\nEXEC SQL select c from d;" + expected = "EXEC SQL select a from b; EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_double_semicolon(self): text = "select a from b;;" - expected = "EXEC SQL select a from b;;" + expected = "EXEC SQL select a from b;" self.assertEqual(expected, 
sqlprep.prepare_sql(text)) def test_semi_found_in_comment_at_end_of_line(self): text = "select a\nfrom b --semi in comment;\nwhere c=1;" - expected = "EXEC SQL select a\nfrom b --semi in comment;\nwhere c=1;" + expected = "EXEC SQL select a\nfrom b \nwhere c=1;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_handles_first_line_comment(self): text = "--comment on line 1\nselect a from b;" - expected = "//comment on line 1\nEXEC SQL select a from b;" + expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_handles_block_comment_on_last_line(self): text = "select a from b;\n/*\nselect c from d;\n*/" - expected = "EXEC SQL select a from b;\n/*\nselect c from d;\n*/" + expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_semi_found_in_block_comment(self): From d69063d0d1a378148cebd2bcb72b82d909161b17 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 23:31:52 -0600 Subject: [PATCH 15/22] Fixed one more simple mistake here. Now onto the serious part. 
--- test/test_sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index caa1a08..72f2169 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -114,10 +114,10 @@ def test_comment_start_found_within_comment_within_statement(self): def test_comment_start_found_within_comment_between_statements(self): text = "select a from b; --comment in comment --here\nselect c from d;" - expected = "EXEC SQL select a from b; EXEC SQL \nselect c from d;" + expected = "EXEC SQL select a from b;EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_double_semicolon(self): + def test_double_semicolon(self): # BOOKMARK text = "select a from b;;" expected = "EXEC SQL select a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) From 46573e8db2de6f4ca9fea16b6c88b24322459597 Mon Sep 17 00:00:00 2001 From: koyae Date: Mon, 30 May 2016 00:09:30 -0600 Subject: [PATCH 16/22] ALL GREEN! 
--- pgsanity/sqlprep.py | 20 ++------------------ test/test_ecpg.py | 5 +++++ test/test_sqlprep.py | 12 +++++------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 161a216..83e9a56 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -91,7 +91,7 @@ def get_processing_state(current_state, current_char): '"2': {0: '_', '"': '"', ';': ';'} } # ^ Above, transitions[current_state][0] represents the transition to take - # if no transition is explicitly defined for the passed-in symbol + # if no transition is explicitly defined for the passed-in character if current_state not in transitions: raise ValueError("Received an invalid state '{}'".format(current_state)) if current_char in transitions[current_state]: @@ -122,23 +122,7 @@ def split_sql(sql): elif ( previous_state == '--p' and current_state == '--' ): # if previous character was the start of a line-comment token, discard current_sql_expression = current_sql_expression[:-1] - if current_sql_expression and not re.match("[\s;]*",current_sql_expression): + if current_sql_expression: # unless only whitespace and semicolons left, return remaining characters # between last ; and EOF yield current_sql_expression + ';' - -def get_next_occurence(haystack, offset, needles): - """find next occurence of one of the needles in the haystack - return: tuple of (index, needle found) - or: None if no needle was found""" - # make map of first char to full needle (only works if all needles - # have different first characters) - firstcharmap = dict([(n[0], n) for n in needles]) - firstchars = firstcharmap.keys() - while offset < len(haystack): - if haystack[offset] in firstchars: - possible_needle = firstcharmap[haystack[offset]] - if haystack[offset:offset + len(possible_needle)] == possible_needle: - return (offset, possible_needle) - offset += 1 - return None diff --git a/test/test_ecpg.py b/test/test_ecpg.py index 6d9c2c8..4f1da1b 100644 --- 
a/test/test_ecpg.py +++ b/test/test_ecpg.py @@ -14,6 +14,11 @@ def test_simple_failure(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_empty_sql_okay(self): + text = u"EXEC SQL ;" + (success, msg) = ecpg.check_syntax(text) + self.assertTrue(success) + def test_parse_error_simple(self): error = '/tmp/tmpLBKZo5.pgc:1: ERROR: unrecognized data type name "garbage"' expected = 'line 1: ERROR: unrecognized data type name "garbage"' diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index 72f2169..a6ca02a 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -47,9 +47,8 @@ def test_handles_inline_comment_between_statements(self): expected += "EXEC SQL \nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - # ^ Returning the empty string [BOOKMARK] - def test_does_not_mangle_inline_comment_within_statement(self): + def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK text = "blah blah--comment here\n" text += "blah blah" @@ -58,7 +57,7 @@ def test_does_not_mangle_inline_comment_within_statement(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_first_column_comment_within_statement(self): + def test_does_not_mangle_first_column_comment_within_statement(self): # BOOKMARK text = "select a from b\n" text += "--comment here\n" text += "where c=3" @@ -69,7 +68,6 @@ def test_does_not_mangle_first_column_comment_within_statement(self): expected = "EXEC SQL " + expected self.assertEqual(expected, sqlprep.prepare_sql(text)) - # ^ Returning the empty string [BOOKMARK] def test_prepend_exec_sql_to_simple_statements(self): text = "create table control.myfavoritetable (id bigint);" @@ -117,9 +115,9 @@ def test_comment_start_found_within_comment_between_statements(self): expected = "EXEC SQL select a from b;EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_double_semicolon(self): # 
BOOKMARK + def test_double_semicolon(self): text = "select a from b;;" - expected = "EXEC SQL select a from b;" + expected = "EXEC SQL select a from b;EXEC SQL ;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_semi_found_in_comment_at_end_of_line(self): @@ -132,7 +130,7 @@ def test_handles_first_line_comment(self): expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_handles_block_comment_on_last_line(self): + def test_handles_block_comment_on_last_line(self): # [BOOKMARK] text = "select a from b;\n/*\nselect c from d;\n*/" expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) From 0dd9de2902eb9437c4c9dd68f36109eef5afe325 Mon Sep 17 00:00:00 2001 From: koyae Date: Mon, 30 May 2016 00:17:08 -0600 Subject: [PATCH 17/22] Created additional tests from the todo-list in test_sqlprep.py --- test/test_sqlprep.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index a6ca02a..edcb52f 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -7,7 +7,6 @@ def test_split_sql_nothing_interesting(self): text = "abcd123" expected = ["abcd123;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) - # ^ Retuning the empty string [BOOKMARK] def test_split_sql_trailing_semicolon(self): text = "abcd123;" @@ -48,7 +47,7 @@ def test_handles_inline_comment_between_statements(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK + def test_does_not_mangle_inline_comment_within_statement(self): text = "blah blah--comment here\n" text += "blah blah" @@ -57,7 +56,7 @@ def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_first_column_comment_within_statement(self): # 
BOOKMARK + def test_does_not_mangle_first_column_comment_within_statement(self): text = "select a from b\n" text += "--comment here\n" text += "where c=3" @@ -120,6 +119,11 @@ def test_double_semicolon(self): expected = "EXEC SQL select a from b;EXEC SQL ;" self.assertEqual(expected, sqlprep.prepare_sql(text)) + def test_triple_semicolon(self): + text = "select a from b;;;" + expected = "EXEC SQL select a from b;EXEC SQL ;EXEC SQL ;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) + def test_semi_found_in_comment_at_end_of_line(self): text = "select a\nfrom b --semi in comment;\nwhere c=1;" expected = "EXEC SQL select a\nfrom b \nwhere c=1;" @@ -130,7 +134,7 @@ def test_handles_first_line_comment(self): expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_handles_block_comment_on_last_line(self): # [BOOKMARK] + def test_handles_block_comment_on_last_line(self): text = "select a from b;\n/*\nselect c from d;\n*/" expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) @@ -150,7 +154,12 @@ def test_opening_two_block_comments_only_requries_one_close(self): expected = "EXEC SQL select a\n/*\n/*\ncomment\n*/from b;EXEC SQL select c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) -# TODO: -# semicolon followed by only whitespace / comments -# multiple semicolons in a row (legal?) 
-# line starts with semi and then has a statement + def test_trailing_whitespace_after_semicolon(self): + text = "select a from b; " + expected = "EXEC SQL select a from b;EXEC SQL ;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) + + def test_line_starts_with_semicolon(self): + text = ";select a from b;" + expected = "EXEC SQL ;EXEC SQL select a from b;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) From 2f158f078b14bf93c5819145f50138b9326cf472 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Jun 2016 01:23:46 -0600 Subject: [PATCH 18/22] Completed restructure in hopes of getting better speed. Tests passing. --- pgsanity/sqlprep.py | 81 +++++++++++++++++++++++--------------------- test/test_sqlprep.py | 2 +- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 83e9a56..8cbe206 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -1,4 +1,6 @@ import re +from collections import OrderedDict + try: from cStringIO import StringIO except ImportError: @@ -15,7 +17,7 @@ def prepare_sql(sql): results.close() return response -def get_processing_state(current_state, current_char): +def get_processing_state(current_state, current_token): """determine the current state of processing an SQL-string. return: state symbol. @@ -25,30 +27,14 @@ def get_processing_state(current_state, current_char): other operators occur. This is the initial processesing state in which the machine starts off - /*p -- block-comment pre-entry state. Identical to the "_" state - except that '*' initiates entry of a block comment - /* -- block-comment state. In block-comments, no SQL actually occurs, meaning special characters like quotes and semicolons have no effect - /*2 -- block-comment pre-exit state. Identical to the "/*" state - except that '/' causes the current block-comment to be closed - - $$p -- extended-string pre-entry state. 
Identical to the base state - except that '$' initiates entry of an extended string - $$ -- extended-string state. In extended strings, all characters are interpreted as string-data, meaning SQL-commands, operators, etc. have no effect - $$2 -- extended-string pre-exit state. Identical to the extended- - string state except that '$' causes the current extended- - string to be closed - - --p -- line-comment pre-entry state. identical to the base state, - except that '-' initiates a line-comment - -- -- line-comment state. All characters are ignored and not treated as SQL except for '\n', which is the only character that prompts a transition out of this state @@ -74,28 +60,23 @@ def get_processing_state(current_state, current_char): """ transitions = { '_': { - 0: '_', '/' : '/*p', '-': '--p', '$': '$$p', + 0: '_', '/*' : '/*', '--': '--', '$$': '$$', "'": "'", '"': '"', ';': ';' }, "'": {0: "'", "'": "'2"}, + "'2": {0: "_", "'": "'", ';': ';'}, '"': {0: '"', '"': '"2'}, - '--p': {0: '_', '-': '--', ';': ';'}, - '/*p': {0: '_', '*': '/*', ';': ';'}, - '$$p': {0: '_', '$': '$$', ';': ';'}, + '"2': {0: '_', '"': '"', ';': ';'}, '--': {0: '--', '\n':'_'}, - '/*': {0: '/*', '*':'/*2'}, - '/*2': {0: '/*', '/':'_'}, - '$$': {0: '$$', '$': '$$2'}, - '$$2': {0: '$$', '$': '_'}, - "'2": {0: "_", "'": "'", ';': ';'}, - '"2': {0: '_', '"': '"', ';': ';'} + '/*': {0: '/*', '*/':'_'}, + '$$': {0: '$$', '$$': '_'}, } # ^ Above, transitions[current_state][0] represents the transition to take # if no transition is explicitly defined for the passed-in character if current_state not in transitions: raise ValueError("Received an invalid state '{}'".format(current_state)) - if current_char in transitions[current_state]: - return transitions[current_state][current_char] + if current_token in transitions[current_state]: + return transitions[current_state][current_token] else: return transitions[current_state][0] @@ -105,24 +86,46 @@ def split_sql(sql): separated into individual statements 
""" if len(sql) == 0: raise ValueError("Input appears to be empty.") + + # first, find the locations of all potential tokens in the input + tokenmap = {}; + tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] + search_position = 0 + for token in tokens: + result = sql.find(token,search_position) + while result!=-1: + tokenmap[result] = token + result = sql.find(token,search_position) + search_position = result + len(token) + search_position = 0 + + tokenmap = OrderedDict(sorted(tokenmap.items(), key=lambda t: t[0])) + + # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' current_sql_expression = '' - for c in sql: - previous_state = current_state - current_state = get_processing_state(current_state,c) + previous_position = 0 + for position, token in tokenmap.items(): + current_state = get_processing_state(current_state,token) # disard everything except for newlines if in line-comment state - current_sql_expression += c if ( current_state != '--' - or c == "\n" ) else '' -## print "Current char: {} new state: {}".format(repr(c),current_state) + if current_state != '--' and previous_state != '--': + current_sql_expression += sql[previous_position:position+len(token)] + elif current_state == '--' and previous_state != '--': + # if line-comment just started, add everything before it: + current_sql_expression += sql[previous_position:position] + elif token=="\n": + current_sql_expression += token +## print "Current token: {} new state: {}".format(repr(token),current_state) if current_state == ';': yield current_sql_expression current_sql_expression = '' current_state = '_' - elif ( previous_state == '--p' and current_state == '--' ): - # if previous character was the start of a line-comment token, discard - current_sql_expression = current_sql_expression[:-1] - if current_sql_expression: + previous_state = '_' + previous_position = position + len(token) + previous_state = current_state + current_sql_expression 
+= sql[previous_position:].rstrip(';') + if current_sql_expression.strip(' ;'): # unless only whitespace and semicolons left, return remaining characters # between last ; and EOF yield current_sql_expression + ';' diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index edcb52f..efd6df1 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -156,7 +156,7 @@ def test_opening_two_block_comments_only_requries_one_close(self): def test_trailing_whitespace_after_semicolon(self): text = "select a from b; " - expected = "EXEC SQL select a from b;EXEC SQL ;" + expected = "EXEC SQL select a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_line_starts_with_semicolon(self): From c260e0b2142282bf2b4bea90615880b740e6d0a2 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Jun 2016 02:47:13 -0600 Subject: [PATCH 19/22] Unstable commit. Working on a generator to avoid creating a map of tokens on the entire input string. This should prevent memory from getting exhausted so easily. 
--- pgsanity/sqlprep.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 8cbe206..a66c171 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -80,6 +80,25 @@ def get_processing_state(current_state, current_token): else: return transitions[current_state][0] +def get_token_gen(sql,tokens): + """ return a generator that indicates each token in turn, and the identity + of that token + return: (token's integer position in string, token) + """ + positionDict = {} + search_position = 0 + for token in tokens: + positionDict[token] = sql.find(token,search_position) + while positionDict.values() != []: + rval = sorted(positionDict.items(), key=lambda t: t[1])[0] + if rval==-1: + for key in positionDict.keys(): + if positionDict[key] == -1: + del positionDict[key] + continue + yield rval + search_position = rval[1] + len(rval[0]) + def split_sql(sql): """isolate complete SQL-statements from the passed-in string return: the SQL-statements from the passed-in string, @@ -88,25 +107,14 @@ def split_sql(sql): raise ValueError("Input appears to be empty.") # first, find the locations of all potential tokens in the input - tokenmap = {}; tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] - search_position = 0 - for token in tokens: - result = sql.find(token,search_position) - while result!=-1: - tokenmap[result] = token - result = sql.find(token,search_position) - search_position = result + len(token) - search_position = 0 - - tokenmap = OrderedDict(sorted(tokenmap.items(), key=lambda t: t[0])) # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' current_sql_expression = '' previous_position = 0 - for position, token in tokenmap.items(): + for token, position in get_token_gen(sql,tokens): current_state = get_processing_state(current_state,token) # disard everything except for newlines if in line-comment state if 
current_state != '--' and previous_state != '--': From 6cf047553a8d29b63a931415b0d2e5b3a7cfc050 Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 01:20:49 -0600 Subject: [PATCH 20/22] Stable-ish commit with loads of comments spammed everywhere. --- pgsanity/sqlprep.py | 57 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index a66c171..f1c8c1d 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -19,6 +19,18 @@ def prepare_sql(sql): def get_processing_state(current_state, current_token): """determine the current state of processing an SQL-string. + + current_state -- see 'States' further down in this docstring + + current_token -- any character or character-pair which can prompt one + or more transitions in SQL-state (quote-marks, + comment-starting symbols, etc.) + NOTE: For both double-quote and single-quote + characters, the passed-in token should consist of the + initial quote character, plus the character which + immediately follows it, because it is not possible + to determine the next state without it. + return: state symbol. 
States: @@ -61,7 +73,7 @@ def get_processing_state(current_state, current_token): transitions = { '_': { 0: '_', '/*' : '/*', '--': '--', '$$': '$$', - "'": "'", '"': '"', ';': ';' + "'": "'", '"': '"', ';': ';', "''": "'2", '""': '"2' }, "'": {0: "'", "'": "'2"}, "'2": {0: "_", "'": "'", ';': ';'}, @@ -77,27 +89,41 @@ def get_processing_state(current_state, current_token): raise ValueError("Received an invalid state '{}'".format(current_state)) if current_token in transitions[current_state]: return transitions[current_state][current_token] + elif current_token[0] in transitions[current_state]: + # if we have a double-quote + peek character or a single-quote + char, + # transition using that + temp_state = transitions[current_state][current_token[0]] + return get_processing_state(temp_state,current_token[1]) # recurse else: return transitions[current_state][0] def get_token_gen(sql,tokens): - """ return a generator that indicates each token in turn, and the identity - of that token + """ return a generator that indicates the position of each token in turn, + and the identity of that token return: (token's integer position in string, token) """ + peek_tokens = ["'",'"'] positionDict = {} search_position = 0 for token in tokens: - positionDict[token] = sql.find(token,search_position) + positionDict[token] = sql.find(token,search_position) while positionDict.values() != []: - rval = sorted(positionDict.items(), key=lambda t: t[1])[0] - if rval==-1: - for key in positionDict.keys(): - if positionDict[key] == -1: - del positionDict[key] + si = sorted(positionDict.items(), key=lambda t: t[1]) +## print "Sorted tokens: {}".format(si) + rval = si[0] + find_next = rval[0] + if rval[1]==-1: +## print "Deleting... 
{}".format(rval[0]) + del positionDict[rval[0]] continue + elif rval[0] in peek_tokens and rval[1]+1 < len(sql): + find_next = rval[0] + rval = (rval[0]+sql[rval[1]+1],rval[1]) yield rval + # if possible, replace the token just returned and advance the cursor search_position = rval[1] + len(rval[0]) + positionDict[find_next] = sql.find(find_next,search_position) +## print "Found next {} at {}".format(find_next,positionDict[find_next]) def split_sql(sql): """isolate complete SQL-statements from the passed-in string @@ -105,6 +131,11 @@ def split_sql(sql): separated into individual statements """ if len(sql) == 0: raise ValueError("Input appears to be empty.") + +## print "\nSTRING:\n" +## print sql +## print "\n:STRING" +## print "" # first, find the locations of all potential tokens in the input tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] @@ -124,9 +155,15 @@ def split_sql(sql): current_sql_expression += sql[previous_position:position] elif token=="\n": current_sql_expression += token -## print "Current token: {} new state: {}".format(repr(token),current_state) +## print "Current token: {}".format(repr(token)) +## print "New state from token: ( {} )".format(current_state) +## print "Current position: {}".format(position) +## print "String so far: {}".format(repr(current_sql_expression)) +## print "---" if current_state == ';': +## print "YIELDING: {}".format(repr(current_sql_expression)) yield current_sql_expression +## print "\n" current_sql_expression = '' current_state = '_' previous_state = '_' From ad4b0e04ddf638e207cb31d3f305554d29c9bec6 Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 01:24:47 -0600 Subject: [PATCH 21/22] Tidied up comments. 
--- pgsanity/sqlprep.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index f1c8c1d..2157c74 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -103,18 +103,16 @@ def get_token_gen(sql,tokens): return: (token's integer position in string, token) """ peek_tokens = ["'",'"'] - positionDict = {} + position_dict = {} search_position = 0 for token in tokens: - positionDict[token] = sql.find(token,search_position) - while positionDict.values() != []: - si = sorted(positionDict.items(), key=lambda t: t[1]) -## print "Sorted tokens: {}".format(si) + position_dict[token] = sql.find(token,search_position) + while position_dict.values() != []: + si = sorted(position_dict.items(), key=lambda t: t[1]) rval = si[0] find_next = rval[0] if rval[1]==-1: -## print "Deleting... {}".format(rval[0]) - del positionDict[rval[0]] + del position_dict[rval[0]] continue elif rval[0] in peek_tokens and rval[1]+1 < len(sql): find_next = rval[0] @@ -122,8 +120,7 @@ def get_token_gen(sql,tokens): yield rval # if possible, replace the token just returned and advance the cursor search_position = rval[1] + len(rval[0]) - positionDict[find_next] = sql.find(find_next,search_position) -## print "Found next {} at {}".format(find_next,positionDict[find_next]) + position_dict[find_next] = sql.find(find_next,search_position) def split_sql(sql): """isolate complete SQL-statements from the passed-in string @@ -131,15 +128,7 @@ def split_sql(sql): separated into individual statements """ if len(sql) == 0: raise ValueError("Input appears to be empty.") - -## print "\nSTRING:\n" -## print sql -## print "\n:STRING" -## print "" - - # first, find the locations of all potential tokens in the input tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] - # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' @@ -151,19 +140,12 @@ def split_sql(sql): if 
current_state != '--' and previous_state != '--': current_sql_expression += sql[previous_position:position+len(token)] elif current_state == '--' and previous_state != '--': - # if line-comment just started, add everything before it: + # if line-comment just started, add everything before it current_sql_expression += sql[previous_position:position] elif token=="\n": current_sql_expression += token -## print "Current token: {}".format(repr(token)) -## print "New state from token: ( {} )".format(current_state) -## print "Current position: {}".format(position) -## print "String so far: {}".format(repr(current_sql_expression)) -## print "---" if current_state == ';': -## print "YIELDING: {}".format(repr(current_sql_expression)) yield current_sql_expression -## print "\n" current_sql_expression = '' current_state = '_' previous_state = '_' @@ -172,5 +154,5 @@ def split_sql(sql): current_sql_expression += sql[previous_position:].rstrip(';') if current_sql_expression.strip(' ;'): # unless only whitespace and semicolons left, return remaining characters - # between last ; and EOF + # between last ';' and EOF yield current_sql_expression + ';' From 3fab645a5bb7e4cea80edb4e74157de833d56bcc Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 02:30:33 -0600 Subject: [PATCH 22/22] Tossed in some code to prevent chardet from hanging on large files. --- pgsanity/pgsanity.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2add213..262bafb 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -17,7 +17,7 @@ def get_config(argv=sys.argv[1:]): def remove_bom_if_exists(sql_string): """ Take the entire SQL-payload of a file (or stream) and strip the BOM-table - if one was detected. + if one was detected, returning it along with the detected encoding. sql_string -- string-representation of incoming character-data. 
Value should be passed RAW, meaning BEFORE regular decoding take @@ -25,11 +25,11 @@ def remove_bom_if_exists(sql_string): Returns a BOM-free SQL-payload. """ - encoding = detect(sql_string)["encoding"] + encoding = detect(sql_string[:10000])["encoding"] # HACK is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] # * bom_present = is_utf8 and sql_string.startswith(BOM_UTF8) # * sql_string = sql_string[len(BOM_UTF8):] if bom_present else sql_string - return sql_string + return sql_string, encoding # * The marked lines above are a tiny bit redundant given that 'UTF-8-SIG' # simply means "UTF-8 file with a BOM-table". However, older versions of # chardet don't support this, and will just detect 'UTF-8', leaving us to @@ -50,9 +50,8 @@ def check_file(filename=None, show_filename=False): else: with sys.stdin as filelike: sql_string = sys.stdin.read() - sql_string = remove_bom_if_exists(sql_string) - success, msg = check_string(sql_string.decode("utf-8")) - # ^ The above call to decode() is safe for both ASCII and UTF-8 data. + sql_string, encoding = remove_bom_if_exists(sql_string) + success, msg = check_string(sql_string.decode(encoding)) # report results result = 0