From 407e1067bc961b7bcddca3035ef14fea06e03c12 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 10:51:46 -0600 Subject: [PATCH 01/22] Attempting to deal with UTF-8 BOM-table stuff. --- pgsanity/pgsanity.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index c620863..850b9a0 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -2,6 +2,8 @@ from __future__ import print_function from __future__ import absolute_import +from chardet import detect +from codecs import BOM_UTF8 import argparse import sys @@ -13,6 +15,25 @@ def get_config(argv=sys.argv[1:]): parser.add_argument('files', nargs='*', default=None) return parser.parse_args(argv) +def check_for_bom(starting_bytes): + """ Check the first few bytes of a file to determine whether input + contains a BOM-table or not. + + Returns a boolean indicating whether a BOM-table appears to be present. + """ + minlen = len(BOM_UTF8) + if len(starting_bytes) < minlen: + raise ValueError("Starting bytes of file must be at least" + " {} bytes long to check for BOM.".format(minlen)) + encoding = detect(starting_bytes)["encoding"] + is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] + return is_utf8 and starting_bytes.startswith(BOM_UTF8) + # ^ The above is a tiny bit redundant given that 'UTF-8-SIG' simply means + # "UTF-8 file with a BOM-table". However, older versions of chardet don't + # support this, and will just detect 'UTF-8', leaving us to check for the + # BOM ourselves as we do above. The extra check is not harmful on + # systems that have a more recent chardet module. + def check_file(filename=None, show_filename=False): """ Check whether an input file is valid PostgreSQL. 
If no filename is @@ -22,8 +43,13 @@ def check_file(filename=None, show_filename=False): """ # either work with sys.stdin or open the file if filename is not None: - with open(filename, "r") as filelike: - sql_string = filelike.read() + with open(filename, "rb") as filelike: + # discard BOM if present then read remaining bytes + nose = filelike.read(len(BOM_UTF8)) + nose = '' if check_for_bom(nose) else nose + sql_string = nose + filelike.read() + sql_string = sql_string.decode("utf-8") + # ^ This is safe for both ASCII and UTF-8 files. else: with sys.stdin as filelike: sql_string = sys.stdin.read() From a4b6c984fe18bd9d2670aa25c8564efefab768a1 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:38:13 -0600 Subject: [PATCH 02/22] Attempting a modification which will strip BOM for piped input as well as regular files. --- pgsanity/pgsanity.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 850b9a0..2bdc5bd 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -44,17 +44,16 @@ def check_file(filename=None, show_filename=False): # either work with sys.stdin or open the file if filename is not None: with open(filename, "rb") as filelike: - # discard BOM if present then read remaining bytes - nose = filelike.read(len(BOM_UTF8)) - nose = '' if check_for_bom(nose) else nose - sql_string = nose + filelike.read() - sql_string = sql_string.decode("utf-8") - # ^ This is safe for both ASCII and UTF-8 files. + sql_string = filelike.read() else: with sys.stdin as filelike: sql_string = sys.stdin.read() - - success, msg = check_string(sql_string) + # check for BOM-table and discard if present + nose = sql_string[0:len(BOM_UTF8)] + bom_present = check_for_bom(nose) + sql_string = sql_string[len(nose):] if bom_present else sql_string + success, msg = check_string(sql_string.decode("utf-8")) + # ^ The above called to decode() is safe for both ASCII and UTF-8 data. 
# report results result = 0 From 7cca7a494f09bf7db1a9e6fe12f3a2ef21c74f25 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:43:59 -0600 Subject: [PATCH 03/22] Typo fix plus small experiment --- pgsanity/pgsanity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2bdc5bd..2316a21 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -43,7 +43,7 @@ def check_file(filename=None, show_filename=False): """ # either work with sys.stdin or open the file if filename is not None: - with open(filename, "rb") as filelike: + with open(filename, "r") as filelike: sql_string = filelike.read() else: with sys.stdin as filelike: @@ -53,7 +53,7 @@ def check_file(filename=None, show_filename=False): bom_present = check_for_bom(nose) sql_string = sql_string[len(nose):] if bom_present else sql_string success, msg = check_string(sql_string.decode("utf-8")) - # ^ The above called to decode() is safe for both ASCII and UTF-8 data. + # ^ The above call to decode() is safe for both ASCII and UTF-8 data. # report results result = 0 From 440942087a20e7bd2d5c026649afe80154a9da9a Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 11:54:14 -0600 Subject: [PATCH 04/22] Fixed some unescaped metacharacters in README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88d810b..038db9a 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,8 @@ errors of the SQL. - arch: sudo pacman -S postgresql-libs ###Getting PgSanity -PgSanity is available in the Python Package Index, so you can install it with either easy_install or pip. Here's [PgSanity's page on PyPI](http://pypi.python.org/pypi/pgsanity). -- sudo pip install pgsanity **or** sudo easy_install pgsanity +PgSanity is available in the Python Package Index, so you can install it with either easy\_install or pip. Here's [PgSanity's page on PyPI](http://pypi.python.org/pypi/pgsanity). 
+- sudo pip install pgsanity **or** sudo easy\_install pgsanity - If you don't have pip you can get it on ubuntu/debian by running: sudo apt-get install python-pip ##Usage From 69bf9e63696a1d57834616d3e7f3c34dddcb35b5 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:08:48 -0600 Subject: [PATCH 05/22] Experimenting with getting pgsanity to ignore psql-commands such as \timing on --- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 5c7d95b..5e92ad6 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start == "--" and not in_block_comment: + elif start in ["--","\n\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/") + bookends = ("\n", ";", "--", "/*", "*/", "\n\\") last_bookend_found = None start = 0 From 852604474b43ccf60b9f8097a24719864c1527b1 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:11:38 -0600 Subject: [PATCH 06/22] Tweaked change for ignoring psql-commands. 
--- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 5e92ad6..4150a78 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start in ["--","\n\\"] and not in_block_comment: + elif start in ["--","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/", "\n\\") + bookends = ("\n", ";", "--", "/*", "*/", "\\") last_bookend_found = None start = 0 From 7e1b7f99285be145dfe363aeab6d905f213bcf20 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:17:55 -0600 Subject: [PATCH 07/22] Tweaked change for ignoring psql-commands. Attempt 2. 
--- pgsanity/sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 4150a78..6096d61 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -24,7 +24,7 @@ def prepare_sql(sql): if start == "/*": in_block_comment = True - elif start in ["--","\\"] and not in_block_comment: + elif start in ["--","\n\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/", "\\") + bookends = ("\n\\","\n", ";", "--", "/*", "*/") last_bookend_found = None start = 0 From 07ad49de0b3bef432e8e1d83a2e69b19148fa146 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 15 May 2016 12:22:27 -0600 Subject: [PATCH 08/22] Tweaked change for ignoring psql-commands. Attempt 3. --- pgsanity/sqlprep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 6096d61..d130471 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,7 +17,7 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start != "--" and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\n\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " From b42c2ec9cb79489371757a62037a9082b8d6a6f4 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:29:31 -0600 Subject: [PATCH 09/22] Psql-commands are now being ignored as desired. Cleanup is pending. 
--- pgsanity/sqlprep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index d130471..9e7d682 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,14 +17,14 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start not in ["--","\n\\"] and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\n\\","\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " if start == "/*": in_block_comment = True - elif start in ["--","\n\\"] and not in_block_comment: + elif start in ["--","\n\\","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n\\","\n", ";", "--", "/*", "*/") + bookends = ("\n\\","\n", ";", "--", "/*", "*/","\\") last_bookend_found = None start = 0 From 0a7762338282af21f8826d41ab76443a21f4a096 Mon Sep 17 00:00:00 2001 From: Koyae Date: Sun, 15 May 2016 12:33:43 -0600 Subject: [PATCH 10/22] Minor cleanup of psql-command-stripping tweak. 
--- pgsanity/sqlprep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 9e7d682..26a4f76 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -17,14 +17,14 @@ def prepare_sql(sql): # decide where we are if not in_statement and not in_line_comment and not in_block_comment: # not currently in any block - if start not in ["--","\n\\","\\"] and start != "/*" and len(contents.strip()) > 0: + if start not in ["--","\\"] and start != "/*" and len(contents.strip()) > 0: # not starting a comment and there is contents in_statement = True precontents = "EXEC SQL " if start == "/*": in_block_comment = True - elif start in ["--","\n\\","\\"] and not in_block_comment: + elif start in ["--","\\"] and not in_block_comment: in_line_comment = True if not in_statement: start_str = "//" @@ -50,7 +50,7 @@ def split_sql(sql): """generate hunks of SQL that are between the bookends return: tuple of beginning bookend, closing bookend, and contents note: beginning & end of string are returned as None""" - bookends = ("\n\\","\n", ";", "--", "/*", "*/","\\") + bookends = ("\n", ";", "--", "/*", "*/","\\") last_bookend_found = None start = 0 From ca34532c792930a5abf9e47814c9ddfea20bbe20 Mon Sep 17 00:00:00 2001 From: Koyae Date: Mon, 16 May 2016 23:21:36 -0600 Subject: [PATCH 11/22] Minor adjustment to improve encapsulation of logic written to strip BOM-table information, per a comment by the original author. 
--- pgsanity/pgsanity.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2316a21..29a3595 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -15,24 +15,26 @@ def get_config(argv=sys.argv[1:]): parser.add_argument('files', nargs='*', default=None) return parser.parse_args(argv) -def check_for_bom(starting_bytes): - """ Check the first few bytes of a file to determine whether input - contains a BOM-table or not. +def remove_bom_if_exists(sql_string): + """ Take the entire SQL-payload of a file (or stream) and strip the BOM-table + if one was detected. - Returns a boolean indicating whether a BOM-table appears to be present. + sql_string -- string-representation of incoming character-data. Value + should be passed RAW, meaning BEFORE regular decoding take + place. Otherwise, BOM-detection may fail. + + Returns a BOM-free SQL-payload. """ - minlen = len(BOM_UTF8) - if len(starting_bytes) < minlen: - raise ValueError("Starting bytes of file must be at least" - " {} bytes long to check for BOM.".format(minlen)) - encoding = detect(starting_bytes)["encoding"] - is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] - return is_utf8 and starting_bytes.startswith(BOM_UTF8) - # ^ The above is a tiny bit redundant given that 'UTF-8-SIG' simply means - # "UTF-8 file with a BOM-table". However, older versions of chardet don't - # support this, and will just detect 'UTF-8', leaving us to check for the - # BOM ourselves as we do above. The extra check is not harmful on - # systems that have a more recent chardet module. 
+ encoding = detect(sql_string)["encoding"] + is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] # * + bom_present = is_utf8 and sql_string.startswith(BOM_UTF8) # * + sql_string = sql_string[len(BOM_UTF8):] if bom_present else sql_string + return sql_string + # * The marked lines above are a tiny bit redundant given that 'UTF-8-SIG' + # simply means "UTF-8 file with a BOM-table". However, older versions of + # chardet don't support this, and will just detect 'UTF-8', leaving us to + # check for the BOM ourselves as we do above. The extra check is not + # harmful on systems that have a more recent chardet module. def check_file(filename=None, show_filename=False): """ @@ -48,10 +50,7 @@ def check_file(filename=None, show_filename=False): else: with sys.stdin as filelike: sql_string = sys.stdin.read() - # check for BOM-table and discard if present - nose = sql_string[0:len(BOM_UTF8)] - bom_present = check_for_bom(nose) - sql_string = sql_string[len(nose):] if bom_present else sql_string + sql_string = remove_bom_if_exists(sql_string) success, msg = check_string(sql_string.decode("utf-8")) # ^ The above call to decode() is safe for both ASCII and UTF-8 data. From 92d4bfe549efa5c5eedbb8e82f157105e661426b Mon Sep 17 00:00:00 2001 From: Koyae Date: Tue, 17 May 2016 00:03:38 -0600 Subject: [PATCH 12/22] Added test for BOM-table stripping function. --- pgsanity/pgsanity.py | 2 +- test/test_pgsanity.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 29a3595..2add213 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -21,7 +21,7 @@ def remove_bom_if_exists(sql_string): sql_string -- string-representation of incoming character-data. Value should be passed RAW, meaning BEFORE regular decoding take - place. Otherwise, BOM-detection may fail. + place. Otherwise, BOM-detection may fail. Returns a BOM-free SQL-payload. 
""" diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index 8d06d43..4c5de1c 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -1,6 +1,7 @@ import unittest import tempfile import os +from codecs import BOM_UTF8 from pgsanity import pgsanity @@ -26,6 +27,15 @@ def test_check_invalid_string(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_bom_gets_stripped(self): + bomless = "SELECT 'pining for the fjords';".encode('utf-8') + bomful = BOM_UTF8 + bomless + self.assertEqual(pgsanity.remove_bom_if_exists(bomful), bomless) + + def test_bom_removal_idempotence(self): + bomless = "SELET current_setting('parrot.status);".encode('utf-8') + self.assertEqual(bomless, pgsanity.remove_bom_if_exists(bomless)) + class TestPgSanityFiles(unittest.TestCase): def setUp(self): From e7cf7e32efd672b01e80c194583702601cd966da Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 19:23:10 -0600 Subject: [PATCH 13/22] Added a test based off of issue #14. 
--- test/test_pgsanity.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index 4c5de1c..b4bb95b 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -27,6 +27,16 @@ def test_check_invalid_string(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_check_invalid_string_2(self): + text = "SELECT '\n" + text += "-- this is not really a comment' AS c;\n" + text += "SELECT '\n" + text += "-- neither is this' AS c spam;" + + (success,msg) = pgsanity.check_string(text) + self.assertFalse(success) + self.assertEqual('line 4: ERROR: syntax error at or near "spam"') + def test_bom_gets_stripped(self): bomless = "SELECT 'pining for the fjords';".encode('utf-8') bomful = BOM_UTF8 + bomless From b252c8fbe1d45f8405cc19305da5304a2dd9de09 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 23:24:19 -0600 Subject: [PATCH 14/22] Semi-stable commit before I break something trying to fix the last few errors here. 
--- pgsanity/sqlprep.py | 164 +++++++++++++++++++++++++++++------------- test/test_pgsanity.py | 2 +- test/test_sqlprep.py | 53 +++++++------- 3 files changed, 139 insertions(+), 80 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 26a4f76..161a216 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -7,63 +7,125 @@ def prepare_sql(sql): results = StringIO() - in_statement = False - in_line_comment = False - in_block_comment = False - for (start, end, contents) in split_sql(sql): - precontents = None - start_str = None - - # decide where we are - if not in_statement and not in_line_comment and not in_block_comment: - # not currently in any block - if start not in ["--","\\"] and start != "/*" and len(contents.strip()) > 0: - # not starting a comment and there is contents - in_statement = True - precontents = "EXEC SQL " - - if start == "/*": - in_block_comment = True - elif start in ["--","\\"] and not in_block_comment: - in_line_comment = True - if not in_statement: - start_str = "//" - - start_str = start_str or start or "" - precontents = precontents or "" - results.write(start_str + precontents + contents) - - if not in_line_comment and not in_block_comment and in_statement and end == ";": - in_statement = False - - if in_block_comment and end == "*/": - in_block_comment = False - - if in_line_comment and end == "\n": - in_line_comment = False + for current_sql_expression in split_sql(sql): + assert(current_sql_expression[-1] == ';') + results.write("EXEC SQL " + current_sql_expression) response = results.getvalue() results.close() return response +def get_processing_state(current_state, current_char): + """determine the current state of processing an SQL-string. + return: state symbol. + + States: + + _ -- the base state wherein SQL tokens, commands, and math and + other operators occur. This is the initial processesing state + in which the machine starts off + + /*p -- block-comment pre-entry state. 
Identical to the "_" state + except that '*' initiates entry of a block comment + + /* -- block-comment state. In block-comments, no SQL actually + occurs, meaning special characters like quotes and semicolons + have no effect + + /*2 -- block-comment pre-exit state. Identical to the "/*" state + except that '/' causes the current block-comment to be closed + + $$p -- extended-string pre-entry state. Identical to the base state + except that '$' initiates entry of an extended string + + $$ -- extended-string state. In extended strings, all characters + are interpreted as string-data, meaning SQL-commands, + operators, etc. have no effect + + $$2 -- extended-string pre-exit state. Identical to the extended- + string state except that '$' causes the current extended- + string to be closed + + --p -- line-comment pre-entry state. identical to the base state, + except that '-' initiates a line-comment + + -- -- line-comment state. All characters are ignored and not + treated as SQL except for '\n', which is the only character + that prompts a transition out of this state + + ; -- the final state which indicates a single, complete + SQL-statement has just been completed + + ' -- single-quote state. In this state, no characters are treated + as SQL. The only transition away is "'" followed by any + character other than "'" + + '2 -- single-quote pre-exit state. Identical to the single-quote + state except that encountering a character other than "'" + causes the current single-quoted string to be closed + + " -- double-quote state. Similar in nature to the single-quote + state, except that possible transition away is intiated + by '"' instead of "'". + + "2 -- double-quote pre-exit state. 
Similar in nature to the single- + quote pre-exit state except that '"' prompts a return back to + the stable double-quote state, rather than "'" + """ + transitions = { + '_': { + 0: '_', '/' : '/*p', '-': '--p', '$': '$$p', + "'": "'", '"': '"', ';': ';' + }, + "'": {0: "'", "'": "'2"}, + '"': {0: '"', '"': '"2'}, + '--p': {0: '_', '-': '--', ';': ';'}, + '/*p': {0: '_', '*': '/*', ';': ';'}, + '$$p': {0: '_', '$': '$$', ';': ';'}, + '--': {0: '--', '\n':'_'}, + '/*': {0: '/*', '*':'/*2'}, + '/*2': {0: '/*', '/':'_'}, + '$$': {0: '$$', '$': '$$2'}, + '$$2': {0: '$$', '$': '_'}, + "'2": {0: "_", "'": "'", ';': ';'}, + '"2': {0: '_', '"': '"', ';': ';'} + } + # ^ Above, transitions[current_state][0] represents the transition to take + # if no transition is explicitly defined for the passed-in symbol + if current_state not in transitions: + raise ValueError("Received an invalid state '{}'".format(current_state)) + if current_char in transitions[current_state]: + return transitions[current_state][current_char] + else: + return transitions[current_state][0] + def split_sql(sql): - """generate hunks of SQL that are between the bookends - return: tuple of beginning bookend, closing bookend, and contents - note: beginning & end of string are returned as None""" - bookends = ("\n", ";", "--", "/*", "*/","\\") - last_bookend_found = None - start = 0 - - while start <= len(sql): - results = get_next_occurence(sql, start, bookends) - if results is None: - yield (last_bookend_found, None, sql[start:]) - start = len(sql) + 1 - else: - (end, bookend) = results - yield (last_bookend_found, bookend, sql[start:end]) - start = end + len(bookend) - last_bookend_found = bookend + """isolate complete SQL-statements from the passed-in string + return: the SQL-statements from the passed-in string, + separated into individual statements """ + if len(sql) == 0: + raise ValueError("Input appears to be empty.") + previous_state = '_' + current_state = '_' + current_sql_expression = '' + for 
c in sql: + previous_state = current_state + current_state = get_processing_state(current_state,c) + # disard everything except for newlines if in line-comment state + current_sql_expression += c if ( current_state != '--' + or c == "\n" ) else '' +## print "Current char: {} new state: {}".format(repr(c),current_state) + if current_state == ';': + yield current_sql_expression + current_sql_expression = '' + current_state = '_' + elif ( previous_state == '--p' and current_state == '--' ): + # if previous character was the start of a line-comment token, discard + current_sql_expression = current_sql_expression[:-1] + if current_sql_expression and not re.match("[\s;]*",current_sql_expression): + # unless only whitespace and semicolons left, return remaining characters + # between last ; and EOF + yield current_sql_expression + ';' def get_next_occurence(haystack, offset, needles): """find next occurence of one of the needles in the haystack diff --git a/test/test_pgsanity.py b/test/test_pgsanity.py index b4bb95b..45a252c 100644 --- a/test/test_pgsanity.py +++ b/test/test_pgsanity.py @@ -35,7 +35,7 @@ def test_check_invalid_string_2(self): (success,msg) = pgsanity.check_string(text) self.assertFalse(success) - self.assertEqual('line 4: ERROR: syntax error at or near "spam"') + self.assertEqual('line 4: ERROR: syntax error at or near "spam"', msg) def test_bom_gets_stripped(self): bomless = "SELECT 'pining for the fjords';".encode('utf-8') diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index 0fe5092..caa1a08 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -5,12 +5,13 @@ class TestSqlPrep(unittest.TestCase): def test_split_sql_nothing_interesting(self): text = "abcd123" - expected = [(None, None, "abcd123")] + expected = ["abcd123;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) + # ^ Retuning the empty string [BOOKMARK] def test_split_sql_trailing_semicolon(self): text = "abcd123;" - expected = [(None, ";", "abcd123"), (";", None, 
'')] + expected = [text] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_split_sql_comment_between_statements(self): @@ -18,23 +19,14 @@ def test_split_sql_comment_between_statements(self): text += "--comment here\n" text += "select a from b;" - expected = [(None, ";", "select a from b"), - (";", "\n", ''), - ("\n", "--", ''), - ("--", "\n", 'comment here'), - ("\n", ";", 'select a from b'), - (";", None, '')] + expected = ["select a from b;","\n\nselect a from b;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_split_sql_inline_comment(self): text = "select a from b; --comment here\n" text += "select a from b;" - expected = [(None, ";", "select a from b"), - (";", "--", ' '), - ("--", "\n", 'comment here'), - ("\n", ";", 'select a from b'), - (";", None, '')] + expected = ["select a from b;", " \nselect a from b;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) def test_handles_first_column_comment_between_statements(self): @@ -42,9 +34,8 @@ def test_handles_first_column_comment_between_statements(self): text += "--comment here\n" text += "blah blah;" - expected = "EXEC SQL blah blah;\n" - expected += "//comment here\n" - expected += "EXEC SQL blah blah;" + expected = "EXEC SQL blah blah;" + expected += "EXEC SQL \n\nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) @@ -52,16 +43,18 @@ def test_handles_inline_comment_between_statements(self): text = "blah blah; --comment here\n" text += "blah blah;" - expected = "EXEC SQL blah blah; //comment here\n" - expected += "EXEC SQL blah blah;" + expected = "EXEC SQL blah blah;" + expected += "EXEC SQL \nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) + # ^ Returning the empty string [BOOKMARK] def test_does_not_mangle_inline_comment_within_statement(self): text = "blah blah--comment here\n" text += "blah blah" - expected = "EXEC SQL " + text + expected = "EXEC SQL blah blah\n" + expected += "blah blah;" self.assertEqual(expected, 
sqlprep.prepare_sql(text)) @@ -70,9 +63,13 @@ def test_does_not_mangle_first_column_comment_within_statement(self): text += "--comment here\n" text += "where c=3" - expected = "EXEC SQL " + text + expected = "select a from b\n" + expected += "\n" + expected += "where c=3;" + expected = "EXEC SQL " + expected self.assertEqual(expected, sqlprep.prepare_sql(text)) + # ^ Returning the empty string [BOOKMARK] def test_prepend_exec_sql_to_simple_statements(self): text = "create table control.myfavoritetable (id bigint);" @@ -80,8 +77,8 @@ def test_prepend_exec_sql_to_simple_statements(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_prepend_exec_sql_multiple_lines(self): - text1 = "create table control.myfavoritetable (id bigint);\n" - text2 = "create table control.myfavoritetable (id bigint);" + text1 = "create table control.myfavoritetable (id bigint);" + text2 = "\ncreate table control.myfavoritetable (id bigint);" expected = "EXEC SQL " + text1 + "EXEC SQL " + text2 self.assertEqual(expected, sqlprep.prepare_sql(text1 + text2)) @@ -112,32 +109,32 @@ def test_prepend_exec_sql_wrapped_trailing_sql(self): def test_comment_start_found_within_comment_within_statement(self): text = "select a from b --comment in comment --here\nwhere c=1;" - expected = "EXEC SQL select a from b --comment in comment --here\nwhere c=1;" + expected = "EXEC SQL select a from b \nwhere c=1;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_comment_start_found_within_comment_between_statements(self): text = "select a from b; --comment in comment --here\nselect c from d;" - expected = "EXEC SQL select a from b; //comment in comment //here\nEXEC SQL select c from d;" + expected = "EXEC SQL select a from b; EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_double_semicolon(self): text = "select a from b;;" - expected = "EXEC SQL select a from b;;" + expected = "EXEC SQL select a from b;" self.assertEqual(expected, 
sqlprep.prepare_sql(text)) def test_semi_found_in_comment_at_end_of_line(self): text = "select a\nfrom b --semi in comment;\nwhere c=1;" - expected = "EXEC SQL select a\nfrom b --semi in comment;\nwhere c=1;" + expected = "EXEC SQL select a\nfrom b \nwhere c=1;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_handles_first_line_comment(self): text = "--comment on line 1\nselect a from b;" - expected = "//comment on line 1\nEXEC SQL select a from b;" + expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_handles_block_comment_on_last_line(self): text = "select a from b;\n/*\nselect c from d;\n*/" - expected = "EXEC SQL select a from b;\n/*\nselect c from d;\n*/" + expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_semi_found_in_block_comment(self): From d69063d0d1a378148cebd2bcb72b82d909161b17 Mon Sep 17 00:00:00 2001 From: koyae Date: Sun, 29 May 2016 23:31:52 -0600 Subject: [PATCH 15/22] Fixed one more simple mistake here. Now onto the serious part. 
--- test/test_sqlprep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index caa1a08..72f2169 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -114,10 +114,10 @@ def test_comment_start_found_within_comment_within_statement(self): def test_comment_start_found_within_comment_between_statements(self): text = "select a from b; --comment in comment --here\nselect c from d;" - expected = "EXEC SQL select a from b; EXEC SQL \nselect c from d;" + expected = "EXEC SQL select a from b;EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_double_semicolon(self): + def test_double_semicolon(self): # BOOKMARK text = "select a from b;;" expected = "EXEC SQL select a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) From 46573e8db2de6f4ca9fea16b6c88b24322459597 Mon Sep 17 00:00:00 2001 From: koyae Date: Mon, 30 May 2016 00:09:30 -0600 Subject: [PATCH 16/22] ALL GREEN! 
--- pgsanity/sqlprep.py | 20 ++------------------ test/test_ecpg.py | 5 +++++ test/test_sqlprep.py | 12 +++++------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 161a216..83e9a56 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -91,7 +91,7 @@ def get_processing_state(current_state, current_char): '"2': {0: '_', '"': '"', ';': ';'} } # ^ Above, transitions[current_state][0] represents the transition to take - # if no transition is explicitly defined for the passed-in symbol + # if no transition is explicitly defined for the passed-in character if current_state not in transitions: raise ValueError("Received an invalid state '{}'".format(current_state)) if current_char in transitions[current_state]: @@ -122,23 +122,7 @@ def split_sql(sql): elif ( previous_state == '--p' and current_state == '--' ): # if previous character was the start of a line-comment token, discard current_sql_expression = current_sql_expression[:-1] - if current_sql_expression and not re.match("[\s;]*",current_sql_expression): + if current_sql_expression: # unless only whitespace and semicolons left, return remaining characters # between last ; and EOF yield current_sql_expression + ';' - -def get_next_occurence(haystack, offset, needles): - """find next occurence of one of the needles in the haystack - return: tuple of (index, needle found) - or: None if no needle was found""" - # make map of first char to full needle (only works if all needles - # have different first characters) - firstcharmap = dict([(n[0], n) for n in needles]) - firstchars = firstcharmap.keys() - while offset < len(haystack): - if haystack[offset] in firstchars: - possible_needle = firstcharmap[haystack[offset]] - if haystack[offset:offset + len(possible_needle)] == possible_needle: - return (offset, possible_needle) - offset += 1 - return None diff --git a/test/test_ecpg.py b/test/test_ecpg.py index 6d9c2c8..4f1da1b 100644 --- 
a/test/test_ecpg.py +++ b/test/test_ecpg.py @@ -14,6 +14,11 @@ def test_simple_failure(self): self.assertFalse(success) self.assertEqual('line 1: ERROR: unrecognized data type name "garbage"', msg) + def test_empty_sql_okay(self): + text = u"EXEC SQL ;" + (success, msg) = ecpg.check_syntax(text) + self.assertTrue(success) + def test_parse_error_simple(self): error = '/tmp/tmpLBKZo5.pgc:1: ERROR: unrecognized data type name "garbage"' expected = 'line 1: ERROR: unrecognized data type name "garbage"' diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index 72f2169..a6ca02a 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -47,9 +47,8 @@ def test_handles_inline_comment_between_statements(self): expected += "EXEC SQL \nblah blah;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - # ^ Returning the empty string [BOOKMARK] - def test_does_not_mangle_inline_comment_within_statement(self): + def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK text = "blah blah--comment here\n" text += "blah blah" @@ -58,7 +57,7 @@ def test_does_not_mangle_inline_comment_within_statement(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_first_column_comment_within_statement(self): + def test_does_not_mangle_first_column_comment_within_statement(self): # BOOKMARK text = "select a from b\n" text += "--comment here\n" text += "where c=3" @@ -69,7 +68,6 @@ def test_does_not_mangle_first_column_comment_within_statement(self): expected = "EXEC SQL " + expected self.assertEqual(expected, sqlprep.prepare_sql(text)) - # ^ Returning the empty string [BOOKMARK] def test_prepend_exec_sql_to_simple_statements(self): text = "create table control.myfavoritetable (id bigint);" @@ -117,9 +115,9 @@ def test_comment_start_found_within_comment_between_statements(self): expected = "EXEC SQL select a from b;EXEC SQL \nselect c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_double_semicolon(self): # 
BOOKMARK + def test_double_semicolon(self): text = "select a from b;;" - expected = "EXEC SQL select a from b;" + expected = "EXEC SQL select a from b;EXEC SQL ;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_semi_found_in_comment_at_end_of_line(self): @@ -132,7 +130,7 @@ def test_handles_first_line_comment(self): expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_handles_block_comment_on_last_line(self): + def test_handles_block_comment_on_last_line(self): # [BOOKMARK] text = "select a from b;\n/*\nselect c from d;\n*/" expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) From 0dd9de2902eb9437c4c9dd68f36109eef5afe325 Mon Sep 17 00:00:00 2001 From: koyae Date: Mon, 30 May 2016 00:17:08 -0600 Subject: [PATCH 17/22] Created additional tests from the todo-list in test_sqlprep.py --- test/test_sqlprep.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index a6ca02a..edcb52f 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -7,7 +7,6 @@ def test_split_sql_nothing_interesting(self): text = "abcd123" expected = ["abcd123;"] self.assertEqual(expected, list(sqlprep.split_sql(text))) - # ^ Retuning the empty string [BOOKMARK] def test_split_sql_trailing_semicolon(self): text = "abcd123;" @@ -48,7 +47,7 @@ def test_handles_inline_comment_between_statements(self): self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK + def test_does_not_mangle_inline_comment_within_statement(self): text = "blah blah--comment here\n" text += "blah blah" @@ -57,7 +56,7 @@ def test_does_not_mangle_inline_comment_within_statement(self): # BOOKMARK self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_does_not_mangle_first_column_comment_within_statement(self): # 
BOOKMARK + def test_does_not_mangle_first_column_comment_within_statement(self): text = "select a from b\n" text += "--comment here\n" text += "where c=3" @@ -120,6 +119,11 @@ def test_double_semicolon(self): expected = "EXEC SQL select a from b;EXEC SQL ;" self.assertEqual(expected, sqlprep.prepare_sql(text)) + def test_triple_semicolon(self): + text = "select a from b;;;" + expected = "EXEC SQL select a from b;EXEC SQL ;EXEC SQL ;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) + def test_semi_found_in_comment_at_end_of_line(self): text = "select a\nfrom b --semi in comment;\nwhere c=1;" expected = "EXEC SQL select a\nfrom b \nwhere c=1;" @@ -130,7 +134,7 @@ def test_handles_first_line_comment(self): expected = "EXEC SQL \nselect a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) - def test_handles_block_comment_on_last_line(self): # [BOOKMARK] + def test_handles_block_comment_on_last_line(self): text = "select a from b;\n/*\nselect c from d;\n*/" expected = "EXEC SQL select a from b;EXEC SQL \n/*\nselect c from d;\n*/;" self.assertEqual(expected, sqlprep.prepare_sql(text)) @@ -150,7 +154,12 @@ def test_opening_two_block_comments_only_requries_one_close(self): expected = "EXEC SQL select a\n/*\n/*\ncomment\n*/from b;EXEC SQL select c from d;" self.assertEqual(expected, sqlprep.prepare_sql(text)) -# TODO: -# semicolon followed by only whitespace / comments -# multiple semicolons in a row (legal?) 
-# line starts with semi and then has a statement + def test_trailing_whitespace_after_semicolon(self): + text = "select a from b; " + expected = "EXEC SQL select a from b;EXEC SQL ;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) + + def test_line_starts_with_semicolon(self): + text = ";select a from b;" + expected = "EXEC SQL ;EXEC SQL select a from b;" + self.assertEqual(expected, sqlprep.prepare_sql(text)) From 2f158f078b14bf93c5819145f50138b9326cf472 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Jun 2016 01:23:46 -0600 Subject: [PATCH 18/22] Completed restructure in hopes of getting better speed. Tests passing. --- pgsanity/sqlprep.py | 81 +++++++++++++++++++++++--------------------- test/test_sqlprep.py | 2 +- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 83e9a56..8cbe206 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -1,4 +1,6 @@ import re +from collections import OrderedDict + try: from cStringIO import StringIO except ImportError: @@ -15,7 +17,7 @@ def prepare_sql(sql): results.close() return response -def get_processing_state(current_state, current_char): +def get_processing_state(current_state, current_token): """determine the current state of processing an SQL-string. return: state symbol. @@ -25,30 +27,14 @@ def get_processing_state(current_state, current_char): other operators occur. This is the initial processesing state in which the machine starts off - /*p -- block-comment pre-entry state. Identical to the "_" state - except that '*' initiates entry of a block comment - /* -- block-comment state. In block-comments, no SQL actually occurs, meaning special characters like quotes and semicolons have no effect - /*2 -- block-comment pre-exit state. Identical to the "/*" state - except that '/' causes the current block-comment to be closed - - $$p -- extended-string pre-entry state. 
Identical to the base state - except that '$' initiates entry of an extended string - $$ -- extended-string state. In extended strings, all characters are interpreted as string-data, meaning SQL-commands, operators, etc. have no effect - $$2 -- extended-string pre-exit state. Identical to the extended- - string state except that '$' causes the current extended- - string to be closed - - --p -- line-comment pre-entry state. identical to the base state, - except that '-' initiates a line-comment - -- -- line-comment state. All characters are ignored and not treated as SQL except for '\n', which is the only character that prompts a transition out of this state @@ -74,28 +60,23 @@ def get_processing_state(current_state, current_char): """ transitions = { '_': { - 0: '_', '/' : '/*p', '-': '--p', '$': '$$p', + 0: '_', '/*' : '/*', '--': '--', '$$': '$$', "'": "'", '"': '"', ';': ';' }, "'": {0: "'", "'": "'2"}, + "'2": {0: "_", "'": "'", ';': ';'}, '"': {0: '"', '"': '"2'}, - '--p': {0: '_', '-': '--', ';': ';'}, - '/*p': {0: '_', '*': '/*', ';': ';'}, - '$$p': {0: '_', '$': '$$', ';': ';'}, + '"2': {0: '_', '"': '"', ';': ';'}, '--': {0: '--', '\n':'_'}, - '/*': {0: '/*', '*':'/*2'}, - '/*2': {0: '/*', '/':'_'}, - '$$': {0: '$$', '$': '$$2'}, - '$$2': {0: '$$', '$': '_'}, - "'2": {0: "_", "'": "'", ';': ';'}, - '"2': {0: '_', '"': '"', ';': ';'} + '/*': {0: '/*', '*/':'_'}, + '$$': {0: '$$', '$$': '_'}, } # ^ Above, transitions[current_state][0] represents the transition to take # if no transition is explicitly defined for the passed-in character if current_state not in transitions: raise ValueError("Received an invalid state '{}'".format(current_state)) - if current_char in transitions[current_state]: - return transitions[current_state][current_char] + if current_token in transitions[current_state]: + return transitions[current_state][current_token] else: return transitions[current_state][0] @@ -105,24 +86,46 @@ def split_sql(sql): separated into individual statements 
""" if len(sql) == 0: raise ValueError("Input appears to be empty.") + + # first, find the locations of all potential tokens in the input + tokenmap = {}; + tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] + search_position = 0 + for token in tokens: + result = sql.find(token,search_position) + while result!=-1: + tokenmap[result] = token + result = sql.find(token,search_position) + search_position = result + len(token) + search_position = 0 + + tokenmap = OrderedDict(sorted(tokenmap.items(), key=lambda t: t[0])) + + # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' current_sql_expression = '' - for c in sql: - previous_state = current_state - current_state = get_processing_state(current_state,c) + previous_position = 0 + for position, token in tokenmap.items(): + current_state = get_processing_state(current_state,token) # disard everything except for newlines if in line-comment state - current_sql_expression += c if ( current_state != '--' - or c == "\n" ) else '' -## print "Current char: {} new state: {}".format(repr(c),current_state) + if current_state != '--' and previous_state != '--': + current_sql_expression += sql[previous_position:position+len(token)] + elif current_state == '--' and previous_state != '--': + # if line-comment just started, add everything before it: + current_sql_expression += sql[previous_position:position] + elif token=="\n": + current_sql_expression += token +## print "Current token: {} new state: {}".format(repr(token),current_state) if current_state == ';': yield current_sql_expression current_sql_expression = '' current_state = '_' - elif ( previous_state == '--p' and current_state == '--' ): - # if previous character was the start of a line-comment token, discard - current_sql_expression = current_sql_expression[:-1] - if current_sql_expression: + previous_state = '_' + previous_position = position + len(token) + previous_state = current_state + current_sql_expression 
+= sql[previous_position:].rstrip(';') + if current_sql_expression.strip(' ;'): # unless only whitespace and semicolons left, return remaining characters # between last ; and EOF yield current_sql_expression + ';' diff --git a/test/test_sqlprep.py b/test/test_sqlprep.py index edcb52f..efd6df1 100644 --- a/test/test_sqlprep.py +++ b/test/test_sqlprep.py @@ -156,7 +156,7 @@ def test_opening_two_block_comments_only_requries_one_close(self): def test_trailing_whitespace_after_semicolon(self): text = "select a from b; " - expected = "EXEC SQL select a from b;EXEC SQL ;" + expected = "EXEC SQL select a from b;" self.assertEqual(expected, sqlprep.prepare_sql(text)) def test_line_starts_with_semicolon(self): From c260e0b2142282bf2b4bea90615880b740e6d0a2 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 21 Jun 2016 02:47:13 -0600 Subject: [PATCH 19/22] Unstable commit. Working on a generator to avoid creating a map of tokens on the entire input string. This should prevent memory from getting exhausted so easily. 
--- pgsanity/sqlprep.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index 8cbe206..a66c171 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -80,6 +80,25 @@ def get_processing_state(current_state, current_token): else: return transitions[current_state][0] +def get_token_gen(sql,tokens): + """ return a generator that indicates each token in turn, and the identity + of that token + return: (token's integer position in string, token) + """ + positionDict = {} + search_position = 0 + for token in tokens: + positionDict[token] = sql.find(token,search_position) + while positionDict.values() != []: + rval = sorted(positionDict.items(), key=lambda t: t[1])[0] + if rval==-1: + for key in positionDict.keys(): + if positionDict[key] == -1: + del positionDict[key] + continue + yield rval + search_position = rval[1] + len(rval[0]) + def split_sql(sql): """isolate complete SQL-statements from the passed-in string return: the SQL-statements from the passed-in string, @@ -88,25 +107,14 @@ def split_sql(sql): raise ValueError("Input appears to be empty.") # first, find the locations of all potential tokens in the input - tokenmap = {}; tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] - search_position = 0 - for token in tokens: - result = sql.find(token,search_position) - while result!=-1: - tokenmap[result] = token - result = sql.find(token,search_position) - search_position = result + len(token) - search_position = 0 - - tokenmap = OrderedDict(sorted(tokenmap.items(), key=lambda t: t[0])) # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' current_sql_expression = '' previous_position = 0 - for position, token in tokenmap.items(): + for token, position in get_token_gen(sql,tokens): current_state = get_processing_state(current_state,token) # disard everything except for newlines if in line-comment state if 
current_state != '--' and previous_state != '--': From 6cf047553a8d29b63a931415b0d2e5b3a7cfc050 Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 01:20:49 -0600 Subject: [PATCH 20/22] Stable-ish commit with loads of comments spammed everywhere. --- pgsanity/sqlprep.py | 57 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index a66c171..f1c8c1d 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -19,6 +19,18 @@ def prepare_sql(sql): def get_processing_state(current_state, current_token): """determine the current state of processing an SQL-string. + + current_state -- see 'States' further down in this docstring + + current_token -- any character or character-pair which can prompt one + or more transitions in SQL-state (quote-marks, + comment-starting symbols, etc.) + NOTE: For both double-quote and single-quote + characters, the passed-in token should consist of the + initial quote character, plus the character which + immediately follows it, because it is not possible + to determine the next state without it. + return: state symbol. 
States: @@ -61,7 +73,7 @@ def get_processing_state(current_state, current_token): transitions = { '_': { 0: '_', '/*' : '/*', '--': '--', '$$': '$$', - "'": "'", '"': '"', ';': ';' + "'": "'", '"': '"', ';': ';', "''": "'2", '""': '"2' }, "'": {0: "'", "'": "'2"}, "'2": {0: "_", "'": "'", ';': ';'}, @@ -77,27 +89,41 @@ def get_processing_state(current_state, current_token): raise ValueError("Received an invalid state '{}'".format(current_state)) if current_token in transitions[current_state]: return transitions[current_state][current_token] + elif current_token[0] in transitions[current_state]: + # if we have a double-quote + peek character or a single-quote + char, + # transition using that + temp_state = transitions[current_state][current_token[0]] + return get_processing_state(temp_state,current_token[1]) # recurse else: return transitions[current_state][0] def get_token_gen(sql,tokens): - """ return a generator that indicates each token in turn, and the identity - of that token + """ return a generator that indicates the position of each token in turn, + and the identity of that token return: (token's integer position in string, token) """ + peek_tokens = ["'",'"'] positionDict = {} search_position = 0 for token in tokens: - positionDict[token] = sql.find(token,search_position) + positionDict[token] = sql.find(token,search_position) while positionDict.values() != []: - rval = sorted(positionDict.items(), key=lambda t: t[1])[0] - if rval==-1: - for key in positionDict.keys(): - if positionDict[key] == -1: - del positionDict[key] + si = sorted(positionDict.items(), key=lambda t: t[1]) +## print "Sorted tokens: {}".format(si) + rval = si[0] + find_next = rval[0] + if rval[1]==-1: +## print "Deleting... 
{}".format(rval[0]) + del positionDict[rval[0]] continue + elif rval[0] in peek_tokens and rval[1]+1 < len(sql): + find_next = rval[0] + rval = (rval[0]+sql[rval[1]+1],rval[1]) yield rval + # if possible, replace the token just returned and advance the cursor search_position = rval[1] + len(rval[0]) + positionDict[find_next] = sql.find(find_next,search_position) +## print "Found next {} at {}".format(find_next,positionDict[find_next]) def split_sql(sql): """isolate complete SQL-statements from the passed-in string @@ -105,6 +131,11 @@ def split_sql(sql): separated into individual statements """ if len(sql) == 0: raise ValueError("Input appears to be empty.") + +## print "\nSTRING:\n" +## print sql +## print "\n:STRING" +## print "" # first, find the locations of all potential tokens in the input tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] @@ -124,9 +155,15 @@ def split_sql(sql): current_sql_expression += sql[previous_position:position] elif token=="\n": current_sql_expression += token -## print "Current token: {} new state: {}".format(repr(token),current_state) +## print "Current token: {}".format(repr(token)) +## print "New state from token: ( {} )".format(current_state) +## print "Current position: {}".format(position) +## print "String so far: {}".format(repr(current_sql_expression)) +## print "---" if current_state == ';': +## print "YIELDING: {}".format(repr(current_sql_expression)) yield current_sql_expression +## print "\n" current_sql_expression = '' current_state = '_' previous_state = '_' From ad4b0e04ddf638e207cb31d3f305554d29c9bec6 Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 01:24:47 -0600 Subject: [PATCH 21/22] Tidied up comments. 
--- pgsanity/sqlprep.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/pgsanity/sqlprep.py b/pgsanity/sqlprep.py index f1c8c1d..2157c74 100644 --- a/pgsanity/sqlprep.py +++ b/pgsanity/sqlprep.py @@ -103,18 +103,16 @@ def get_token_gen(sql,tokens): return: (token's integer position in string, token) """ peek_tokens = ["'",'"'] - positionDict = {} + position_dict = {} search_position = 0 for token in tokens: - positionDict[token] = sql.find(token,search_position) - while positionDict.values() != []: - si = sorted(positionDict.items(), key=lambda t: t[1]) -## print "Sorted tokens: {}".format(si) + position_dict[token] = sql.find(token,search_position) + while position_dict.values() != []: + si = sorted(position_dict.items(), key=lambda t: t[1]) rval = si[0] find_next = rval[0] if rval[1]==-1: -## print "Deleting... {}".format(rval[0]) - del positionDict[rval[0]] + del position_dict[rval[0]] continue elif rval[0] in peek_tokens and rval[1]+1 < len(sql): find_next = rval[0] @@ -122,8 +120,7 @@ def get_token_gen(sql,tokens): yield rval # if possible, replace the token just returned and advance the cursor search_position = rval[1] + len(rval[0]) - positionDict[find_next] = sql.find(find_next,search_position) -## print "Found next {} at {}".format(find_next,positionDict[find_next]) + position_dict[find_next] = sql.find(find_next,search_position) def split_sql(sql): """isolate complete SQL-statements from the passed-in string @@ -131,15 +128,7 @@ def split_sql(sql): separated into individual statements """ if len(sql) == 0: raise ValueError("Input appears to be empty.") - -## print "\nSTRING:\n" -## print sql -## print "\n:STRING" -## print "" - - # first, find the locations of all potential tokens in the input tokens = ['$$','*/','/*',';',"'",'"','--',"\n"] - # move through the tokens in order, appending SQL-chunks to current string previous_state = '_' current_state = '_' @@ -151,19 +140,12 @@ def split_sql(sql): if 
current_state != '--' and previous_state != '--': current_sql_expression += sql[previous_position:position+len(token)] elif current_state == '--' and previous_state != '--': - # if line-comment just started, add everything before it: + # if line-comment just started, add everything before it current_sql_expression += sql[previous_position:position] elif token=="\n": current_sql_expression += token -## print "Current token: {}".format(repr(token)) -## print "New state from token: ( {} )".format(current_state) -## print "Current position: {}".format(position) -## print "String so far: {}".format(repr(current_sql_expression)) -## print "---" if current_state == ';': -## print "YIELDING: {}".format(repr(current_sql_expression)) yield current_sql_expression -## print "\n" current_sql_expression = '' current_state = '_' previous_state = '_' @@ -172,5 +154,5 @@ def split_sql(sql): current_sql_expression += sql[previous_position:].rstrip(';') if current_sql_expression.strip(' ;'): # unless only whitespace and semicolons left, return remaining characters - # between last ; and EOF + # between last ';' and EOF yield current_sql_expression + ';' From 3fab645a5bb7e4cea80edb4e74157de833d56bcc Mon Sep 17 00:00:00 2001 From: koyae Date: Wed, 22 Jun 2016 02:30:33 -0600 Subject: [PATCH 22/22] Tossed in some code to prevent chardet from hanging on large files. --- pgsanity/pgsanity.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pgsanity/pgsanity.py b/pgsanity/pgsanity.py index 2add213..262bafb 100755 --- a/pgsanity/pgsanity.py +++ b/pgsanity/pgsanity.py @@ -17,7 +17,7 @@ def get_config(argv=sys.argv[1:]): def remove_bom_if_exists(sql_string): """ Take the entire SQL-payload of a file (or stream) and strip the BOM-table - if one was detected. + if one was detected, returning it along with the detected encoding. sql_string -- string-representation of incoming character-data. 
Value should be passed RAW, meaning BEFORE regular decoding take @@ -25,11 +25,11 @@ def remove_bom_if_exists(sql_string): Returns a BOM-free SQL-payload. """ - encoding = detect(sql_string)["encoding"] + encoding = detect(sql_string[:10000])["encoding"] # HACK is_utf8 = encoding in ["UTF-8","UTF-8-SIG"] # * bom_present = is_utf8 and sql_string.startswith(BOM_UTF8) # * sql_string = sql_string[len(BOM_UTF8):] if bom_present else sql_string - return sql_string + return sql_string, encoding # * The marked lines above are a tiny bit redundant given that 'UTF-8-SIG' # simply means "UTF-8 file with a BOM-table". However, older versions of # chardet don't support this, and will just detect 'UTF-8', leaving us to @@ -50,9 +50,8 @@ def check_file(filename=None, show_filename=False): else: with sys.stdin as filelike: sql_string = sys.stdin.read() - sql_string = remove_bom_if_exists(sql_string) - success, msg = check_string(sql_string.decode("utf-8")) - # ^ The above call to decode() is safe for both ASCII and UTF-8 data. + sql_string, encoding = remove_bom_if_exists(sql_string) + success, msg = check_string(sql_string.decode(encoding)) # report results result = 0