mysqldumpdissect

#!/usr/bin/env python3
# mysqldumpdissect (part of ossobv/vcutil) // wdoekes/2023-2024
#   // Public Domain
#
# Parse mysqldump output and split extended-insert style statements up
# into individual statements.
#
# USAGE
#   mysqldumpdissect [--coleq=TABLE=IDX:TYPE:EQ]* -f /var/backup/mysqldump.sql
#
# DESCRIPTION
#   Reads mysqldump file, parses the INSERT statements, which can be of
#   the extended-insert type, and outputs them as individual statements.
#   This allows for easier reading and/or grepping.
#
#   Additionally, binary blobs are encoded in the X'0123' notation so they
#   can be copy-pasted easily.
#
#   By using the --coleq lookup, you can search for single groups of
#   values. This can speed up the results by a lot.
#
#   For example --coleq=the_table=0:i:3 will look for INSERTs into
#   the_table, where the 0th column has an integer value of 3.
#
#   Another example --coleq=the_table='2:s:John' will look for INSERTs into
#   the_table, where the 2nd (0-based) column has a string value of "John"
#   (the comparison is an exact, case-sensitive match).
#
#   Example of record extraction:
#
#       $ mysqldumpdissect --coleq=tbl=1:i:32 <<EOF
#       INSERT INTO \`tbl\` VALUES ('alice',31),('bob',32),('charlie',12);
#       EOF
#
#       INSERT INTO `tbl` VALUES ('bob',32);
#
#   Example of blob escaping:
#
#       $ printf "INSERT INTO \`tbl\` VALUES ('Hello \x01\x02\x03');" |
#           ./mysqldumpdissect
#
#       INSERT INTO `tbl` VALUES (X'48656C6C6F20010203');
#
#   Using multiple --coleq expressions is supported. Matches on the same table
#   and different columns are AND-ed. Matches on different tables or on the
#   same column are OR-ed:
#
#       # This matches rows with 32 OR 33 in the 1st column:
#       --coleq=tbl=1:i:32
#       --coleq=tbl=1:i:33
#
#       # This matches only rows with 32 in the 1st column AND 'jack' in the
#       # 2nd column:
#       --coleq=tbl=1:i:32
#       --coleq=tbl=2:s:jack
#
# RATIONALE
#   > Why not dump with --skip-extended-insert in the first place?
#
#   For one, because a load of such a DB is a lot slower because
#   each INSERT counts as a transaction. Also such a dump is bigger.
#   And it's too late anyway, since you're looking at an extended
#   INSERT dump now.
#
#   Additionally, the binary base16 encoding is very very nice if you have
#   binary blobs in your dump. (Which the ossobv/pstore happens to have.)
#
# BUGS/CAVEATS
#   * The dump is different because:
#     - it does not contain the CREATE TABLE statements (missing feature);
#     - double quotes are not uselessly escaped (a diff, but not a problem).
#   * The --coleq rule match syntax is clunky and restrictive. Maybe do:
#     table_name[colidx]=value.
#   * The --coleq rule match probably fails on floats/decimals at the moment.
#   * It is quite slow if you don't provide a --coleq. Likely there is
#     very much time spent in _extract_rows. Some optimization or
#     rewriting in a faster language might be in order.
#   * This was mainly tested with one mysqldump output file. If the
#     mysqldump format is slightly different, then this script might
#     croak. If this is the case it should fail hard, instead of
#     producing good looking but corrupt data.
#
import sys

from argparse import ArgumentParser
from base64 import b16encode
from collections import namedtuple
from io import UnsupportedOperation
from itertools import product
from os import environ, fdopen

# When true, Row._make_safe() additionally escapes double quotes as \" inside
# quoted string output. It is off, which is why double quotes come out
# un-escaped (see BUGS/CAVEATS above).
USELESS_DQUOTE_ESCAPE = False


class MatchRule(namedtuple('MatchRule', 'table c_matches')):
    """
    A table name plus the column values a row must have to match.

    ``table`` holds the table name as bytes. ``c_matches`` is a tuple of
    ``(column_index, expected_value)`` pairs; a row matches only if all of
    them hold.
    """

    @classmethod
    def from_string(cls, s):
        """
        Build a single-column rule from a "TABLE=IDX:TYPE:VALUE" string.

        TYPE "s" keeps VALUE as encoded bytes, TYPE "i" converts VALUE with
        int(). Anything malformed raises ValueError.
        """
        table_name, column_spec = s.split('=', 1)
        raw_index, type_code, raw_value = column_spec.split(':', 2)
        column_index = int(raw_index)

        if type_code not in ('s', 'i'):
            raise ValueError(type_code)
        expected = raw_value.encode() if type_code == 's' else int(raw_value)

        return cls(table_name.encode(), ((column_index, expected),))

    def matches_row(self, row):
        """Return True when every (index, value) pair equals row.columns."""
        for column_index, expected in self.c_matches:
            if row.columns[column_index] == expected:
                continue
            return False
        return True


class Row:
    """
    One parsed row of an INSERT statement.

    ``columns`` is a list whose items are int, bytes/bytearray or None (for
    NULL); as_bytes() turns them back into the text between the parentheses
    of a VALUES group.
    """
    # Tab, LF and CR (these get backslash-escaped by _make_safe) plus the
    # printable ASCII range 0x20..0x7e. All other control characters are
    # deliberately absent so that values containing them are hex-encoded.
    BIN7_SAFE = set((0x09, 0x0a, 0x0d)) | set(range(0x20, 0x7f))
    # The above + 0x80..0xff. Values using these are only written as a
    # quoted string if they also decode as UTF-8.
    BIN8_SAFE = BIN7_SAFE | set(range(0x80, 0x100))

    def __init__(self, columns):
        self.columns = columns

    def as_bytes(self):
        """
        Return the columns as comma separated SQL literals (bytes).

        Raises NotImplementedError for column types other than int,
        bytes/bytearray and None.
        """
        ret = []
        for column in self.columns:
            if isinstance(column, int):
                ret.append(str(column).encode())
            elif isinstance(column, (bytes, bytearray)):
                ret.append(self._as_byte_column(column))
            elif column is None:
                ret.append(b'NULL')
            else:
                raise NotImplementedError((type(column), column))
        return b','.join(ret)

    @classmethod
    def _as_byte_column(cls, column):
        """
        Return a bytes value as a quoted string or as an X'..' hex literal.

        A quoted string is used when the value holds only BIN7_SAFE bytes,
        or only BIN8_SAFE bytes that form valid UTF-8. Everything else is
        considered binary and is base16 encoded.
        """
        used_chars = set(column)

        # Simple 7 bit safe?
        if used_chars.issubset(cls.BIN7_SAFE):
            column = cls._make_safe(column)
            return b"'" + column + b"'"

        # UTF-8 safe, no specials?
        if used_chars.issubset(cls.BIN8_SAFE):
            try:
                column.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                column = cls._make_safe(column)
                # TODO: Do we want to prefix the value with _utf8?
                return b"'" + column + b"'"

        # Binary.
        return b"X'" + b16encode(column) + b"'"

    @classmethod
    def _make_safe(cls, column):
        """
        Backslash-escape a value for use inside a single quoted string.

        The backslash itself must be escaped first, otherwise the
        backslashes introduced by the later replacements would be doubled.
        """
        escaped = (
            column
            .replace(b'\\', b'\\\\')
            .replace(b"'", b"\\'")
            .replace(b'\t', b'\\t')
            .replace(b'\r', b'\\r')
            .replace(b'\n', b'\\n'))
        if USELESS_DQUOTE_ESCAPE:
            escaped = escaped.replace(b'"', b'\\"')  # mysqldump output compat?
        return escaped


class MysqlDumpParser:
    """
    Line based parser for the INSERT statements in mysqldump output.

    The parser wraps a binary file object and re-emits every parsed row as
    its own single-row INSERT statement. Input that does not look as
    expected trips an assertion (or another exception) instead of being
    guessed at.
    """
    ESCAPE_MAP = {
        # https://dev.mysql.com/doc/refman/8.0/en/string-literals.html
        ord('0'): ord('\0'),
        ord("'"): ord("'"),
        ord('"'): ord('"'),
        ord('b'): ord('\b'),
        ord('n'): ord('\n'),
        ord('r'): ord('\r'),
        ord('t'): ord('\t'),
        # The ASCII 26 character can be encoded as \Z to enable you to work
        # around the problem that ASCII 26 stands for END-OF-FILE on Windows.
        ord('Z'): ord('\x1a'),
        ord('\\'): ord('\\'),
        # These may be escaped in LIKE queries, but are not escaped when not
        # needed.
        # ord('%'): ord('%'),
        # ord('_'): ord('_'),
    }

    def __init__(self, fp):
        # fp: binary file object, iterated line by line.
        self.fp = fp
        # Number of find() calls so far; needed for non-seekable files.
        self.run = 0

    def find(self, rules=()):
        """
        Yield every (matching) row as a single-row INSERT statement.

        rules is a sequence of MatchRule-like objects (having a ``table``
        attribute and a ``matches_row(row)`` method). It is iterated more
        than once, so it must not be a one-shot iterator.

        Without rules, all rows of all tables are yielded. With rules, only
        tables named by a rule are parsed and a row is yielded when any
        rule for its table matches. Each such table is handled once: after
        its run of consecutive INSERT lines it is dropped from the search,
        and scanning stops as soon as no tables are left.

        The file is rewound first. A non-seekable file can therefore be
        searched only once; a later call re-raises UnsupportedOperation.

        The yielded values are bytes without a trailing newline.
        """
        try:
            self.fp.seek(0)
        except UnsupportedOperation:
            if self.run > 0:
                raise
        self.run += 1

        self._it = iter(self.fp)
        try:
            self._line = next(self._it)
        except StopIteration:
            self._it = self._line = None

        # We'll seek to the INSERT INTO `..`, and remove the tables we've
        # found.
        tables_to_seek = set(rule.table for rule in rules) or None

        while self._it:
            try:
                startswith, current_table = self._seek_to_insert_table(
                    tables_to_seek)
            except StopIteration:
                self._it = self._line = None
            else:
                table_rules = None
                if rules:
                    table_rules = [
                        rule for rule in rules if rule.table == current_table]
                    assert table_rules, 'expected rules, else the seek failed'

                for row in self._extract_matching_rows(
                        startswith, table_rules):
                    yield startswith + b'(' + row.as_bytes() + b');'

                # If we have rules, we might be able to quit early.
                if rules:
                    # We've now processed current_table. Remove it from
                    # the tables_to_seek so we can quit early.
                    tables_to_seek.remove(current_table)
                    if not tables_to_seek:
                        # Quit early.
                        break

    def _seek_to_insert_table(self, relevant_tables=None):
        """
        Advance to the next INSERT line of a relevant table.

        relevant_tables is a collection of table names (bytes), or None to
        accept any table. The line found is stored in self._line.

        Returns a tuple of the line prefix up to (excluding) the first "("
        after the table name, and the table name. Raises StopIteration
        when the input is exhausted before a line is found.
        """
        find = b'INSERT INTO `'

        line = self._line
        while True:
            while not line.startswith(find):
                line = next(self._it)
            assert line.startswith(find), line

            backtick_idx = line.index(96, 13)  # ord('`'), len('INSERT INTO `')
            # FIXME: We might not cope with strange table names with
            # embedded backticks here.
            found_table_name = line[13:backtick_idx]
            assert line[backtick_idx + 1] == 32, line

            if relevant_tables is None or found_table_name in relevant_tables:
                self._line = line
                break

            # The current line is not a match. Seek past it.
            not_find = line[0:backtick_idx + 2]  # 'INSERT INTO `abc` '
            line = next(self._it)
            while line.startswith(not_find):
                line = next(self._it)

        up_to_lparen = line[0:line.index(40, backtick_idx + 2)]  # ord('(')
        return up_to_lparen, found_table_name

    def _extract_matching_rows(self, startswith, rules):
        """
        Yield the rows of all consecutive lines that begin with startswith.

        Parsing starts at self._line. If rules is empty or None, every row
        is yielded, otherwise only rows for which any rule matches. On
        return, self._line is the first line that did not begin with
        startswith, or self._it and self._line are None at end of input.
        """
        line = self._line
        while line.startswith(startswith):
            for row in self._extract_rows(line[len(startswith):]):
                if not rules:
                    yield row
                elif any(rule.matches_row(row) for rule in rules):
                    yield row
            try:
                line = next(self._it)
            except StopIteration:
                self._it = self._line = None
                return

        assert not line.startswith(startswith), line
        self._line = line

    def _extract_rows(self, line):
        """
        Parse b"(..),(..);" (the tail of one INSERT line) into Row objects.

        This is a byte-wise state machine with these states:
        0 = OUT_PAREN (expecting "("), 1 = IN_PAREN (between columns or
        inside an unquoted value), 2 = IN_STRING (inside a quoted value)
        and 3 = AFTER_PAREN (expecting "," or the final ";").

        Quoted values become bytes (with escapes undone), NULL becomes None
        and every other unquoted value is converted with int().
        """
        idx = 0
        state = 0
        buffer = bytearray()
        rows = []
        columns = []
        column_type = None
        while True:
            ch = line[idx]

            # OUT_PAREN
            if state == 0:
                if ch == 0x28:  # "("
                    state = 1   # IN_PAREN
                else:
                    assert False, (ch, idx, state, line[idx:idx + 10])

            # IN_PAREN
            elif state == 1:
                if ch in (0x09, 0x20):
                    assert column_type is None, (
                        idx, column_type, line[idx:idx + 10])
                elif ch == 0x27:    # "'"
                    assert column_type is None, (
                        idx, column_type, line[idx:idx + 10])
                    column_type = str
                    state = 2     # IN_STRING
                elif ch in (0x2c, 0x29):  # "," or ")"
                    assert column_type in (str, int), (idx, column_type)
                    if column_type == str:
                        # Make sure we copy. We clear the array later.
                        columns.append(bytes(buffer))
                    elif buffer == b'NULL':
                        columns.append(None)
                    else:
                        # FIXME: We do not cope with decimals/floats right now.
                        assert b'.' not in buffer, (NotImplemented, buffer)
                        columns.append(int(buffer.decode()))
                    buffer.clear()
                    column_type = None

                    if ch == 0x2c:  # ","
                        state = 1
                    elif ch == 0x29:  # ")"
                        state = 3     # AFTER_PAREN
                else:
                    # Clamp the start: a negative index would slice from
                    # the end of the line and hide the context.
                    assert column_type in (None, int), (
                        column_type, idx, hex(ch),
                        line[max(0, idx - 10):idx + 10])
                    column_type = int
                    buffer.append(ch)

            # IN_STRING
            elif state == 2:
                if ch == 0x27:    # "'"
                    if line[idx + 1] == 0x27:  # "'" so, "''" -> "'"
                        # We do this unescaping here.
                        buffer.append(0x27)
                        idx += 1
                    else:
                        state = 1   # IN_PAREN
                elif ch == 0x5c:    # "\\"
                    # We do this unescaping here.
                    try:
                        buffer.append(self.ESCAPE_MAP[line[idx + 1]])
                    except KeyError:
                        assert False, (
                            line[max(0, idx - 10):idx + 10], line[idx + 1])
                    idx += 1
                else:
                    buffer.append(ch)

            # AFTER_PAREN
            elif state == 3:
                # Validate the separator before recording the row: anything
                # but "," or ";" here used to add the row and then another
                # bogus empty row for every following byte.
                if ch == 0x2c:    # ","
                    state = 0
                elif ch == 0x3b:  # ";"
                    assert line[idx:] in (b';\n', b';'), line[idx:]
                else:
                    assert False, (ch, idx, state, line[idx:idx + 10])

                rows.append(Row(columns))
                columns = []
                column_type = None

                if ch == 0x3b:  # ";"
                    break

            idx += 1

        return rows


def merge_coleq(matchrules):
    """
    Combine single-column --coleq rules into rules with AND/OR semantics.

    Values given for the same column of a table are alternatives (OR);
    different columns of a table must all match (AND). This is expressed by
    emitting one MatchRule per combination of alternatives, so a row
    matches if any of the returned rules matches.

    BEWARE: The combinations come from itertools.product(), so the number
    of rules grows quickly with many lookups on the same table.

    Given these arguments:

        --coleq the_table=1:i:123
        --coleq the_table=1:i:65535
        --coleq the_table=2:s:alice
        --coleq the_table=2:s:bob
        --coleq other_table=3:s:other_match

    the result is:

        MatchRule('other_table', ((3, 'other_match'),))
        MatchRule('the_table', ((1, 123), (2, 'alice')))
        MatchRule('the_table', ((1, 123), (2, 'bob')))
        MatchRule('the_table', ((1, 65535), (2, 'alice')))
        MatchRule('the_table', ((1, 65535), (2, 'bob')))

    Returns a tuple of MatchRule, ordered by table name, then by column
    index, then by value. Mixing value types for one column trips an
    assertion.
    """
    # {table: {column_index: [wanted_value, ...]}}
    per_table = {}
    for matchrule in matchrules:
        per_column = per_table.setdefault(matchrule.table, {})
        for column_index, wanted in matchrule.c_matches:
            candidates = per_column.setdefault(column_index, [])
            candidates.append(wanted)
            kinds = set(type(candidate) for candidate in candidates)
            assert len(kinds) == 1, 'diff types?'

    merged = []
    for table in sorted(per_table):
        per_column = per_table[table]
        # E.g. [1, 2]
        column_indexes = sorted(per_column)
        # E.g. [[32, 33], ['john']]
        alternatives = [sorted(per_column[idx]) for idx in column_indexes]
        for combination in product(*alternatives):
            # (32, 'john') => ((1, 32), (2, 'john'))
            # (33, 'john') => ((1, 33), (2, 'john'))
            merged.append(
                MatchRule(table, tuple(zip(column_indexes, combination))))

    return tuple(merged)


def main():
    """
    Command line entry point.

    Reads the dump from -f/--filename (or from stdin when that is absent or
    "-"), applies the merged --coleq rules and writes one INSERT statement
    per line to stdout. A reader that closes the pipe early (e.g. head(1))
    ends the run quietly.
    """
    parser = ArgumentParser(
        prog='mysqldumpdissect',
        description='''
            Parse/scan through MySQL/MariaDB mysqldump output.
            Especially suited for dumps created with --extended-insert
            (the default) and for dumps which contain binary data
            (blobs).''')
    parser.add_argument(
        '-f', '--filename', help='Read SQL from file instead of stdin')
    parser.add_argument(
        '--coleq', action='append', type=MatchRule.from_string, help=(
            'Rule to match by column equality: '
            '"table_name=col_idx:valuetype:value". '
            'Note that col_idx is 0-based. '
            'The valuetype is i for integers and s for strings. '
            'Multiple --coleq expressions are allowed and are AND-ed or '
            'OR-ed depending on what they match: different column on the '
            'same table results in an AND-match.'))
    args = parser.parse_args()

    # Not writing to sys.stdout.buffer because a prematurely closed stdout
    # causes double exception noise:
    #   Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w'
    #     encoding='utf-8'>
    bin_stdout = fdopen(sys.stdout.fileno(), 'wb')

    # Remember whether we own fp, so stdin is never closed by us.
    opened_file = args.filename is not None and args.filename != '-'
    if opened_file:
        fp = open(args.filename, 'rb')
    else:
        fp = sys.stdin.buffer

    try:
        mdp = MysqlDumpParser(fp)

        # Merge coleq args so we get sane AND/OR behaviour:
        coleqs = merge_coleq(args.coleq or [])

        for statement in mdp.find(coleqs):
            try:
                bin_stdout.write(statement + b'\n')
            except BrokenPipeError:
                bin_stdout = None
                break  # happens e.g. when piping to head(1)
    finally:
        if opened_file:
            fp.close()

    if bin_stdout:
        try:
            # Closing flushes whatever is still buffered. With little
            # output, this is the first actual write to the pipe.
            bin_stdout.close()
        except BrokenPipeError:
            pass  # the reader is gone already, same as in the loop above


# Script entry point. When the RUNTESTS environment variable is set to
# anything other than '', '0' or 'n', the embedded unit tests are run
# instead of main(). For example: RUNTESTS=1 ./mysqldumpdissect
if __name__ == '__main__':
    if environ.get('RUNTESTS', '') not in ('', '0', 'n'):
        from io import BytesIO
        from unittest import TestCase, main as unittest_main

        class MddTest(TestCase):
            # Parsing of a --coleq argument: table names and string values
            # become bytes, integer values become int.
            def test_coleq_from_string(self):
                rule = MatchRule.from_string('tbl=1:i:32')
                self.assertEqual(rule.table, b'tbl')
                self.assertEqual(rule.c_matches, ((1, 32),))

                rule = MatchRule.from_string('tbl2=2:s:33')
                self.assertEqual(rule.table, b'tbl2')
                self.assertEqual(rule.c_matches, ((2, b'33'),))

            # Without rules every row of every table is emitted as its own
            # statement; non-INSERT lines are ignored, NULL and negative
            # integers survive and the binary string is hex-encoded.
            def test_example_passthrough(self):
                mdp = MysqlDumpParser(BytesIO(
                    b"-- blah at start\n"
                    b"INSERT INTO `tbl_x` VALUES "
                    b"('l\\'alice',31),('bob',32),('charlie',12);\n"
                    b"INSERT INTO `tbl_x` VALUES "
                    b"('david',NULL);\n"
                    b"INSERT INTO `tbl_y` VALUES "
                    b"('Hello \x01\x02\x03',-1);\n"
                    b"-- blah at eof\n"))
                res = list(mdp.find())
                self.assertEqual(res, [
                    b"INSERT INTO `tbl_x` VALUES ('l\\'alice',31);",
                    b"INSERT INTO `tbl_x` VALUES ('bob',32);",
                    b"INSERT INTO `tbl_x` VALUES ('charlie',12);",
                    b"INSERT INTO `tbl_x` VALUES ('david',NULL);",
                    b"INSERT INTO `tbl_y` VALUES (X'48656C6C6F20010203',-1);"])

            # With rules only the named tables are parsed, and a table is
            # considered done once its consecutive INSERT lines have passed.
            def test_example_coleq(self):
                mdp = MysqlDumpParser(BytesIO(
                    # Matches alice.
                    b"INSERT INTO `tbl_a` VALUES ('alice',26);\n"
                    # Skipped, wrong table.
                    b"INSERT INTO `tbl_x` VALUES"
                    b" ('alice',31),('bob',32),('charlie',12);\n"
                    # Skipped, no match.
                    b"INSERT INTO `tbl_y` VALUES ('aaron',1);\n"
                    # Matches bob.
                    b"INSERT INTO `tbl_y` VALUES"
                    b" ('alice',31),('bob',32),('charlie',12);\n"
                    # Matches emily.
                    b"INSERT INTO `tbl_y` VALUES"
                    b" ('david',12345),('emily',32);\n"
                    # Skipped, wrong table.
                    b"INSERT INTO `tbl_z` VALUES"
                    b" ('alice',31),('bob',32),('charlie',12);\n"
                    # This should be skipped! We do not expect another tbl_y
                    # after tbl_z!
                    b"INSERT INTO `tbl_y` VALUES ('badbob',32);\n"))
                res = list(mdp.find((
                    MatchRule.from_string('tbl_a=0:s:alice'),
                    MatchRule.from_string('tbl_y=1:i:32'),
                )))
                self.assertEqual(res, [
                    b"INSERT INTO `tbl_a` VALUES ('alice',26);",
                    b"INSERT INTO `tbl_y` VALUES ('bob',32);",
                    b"INSERT INTO `tbl_y` VALUES ('emily',32);"])

            # The blob example from the header; note that the input has no
            # trailing newline after the ";".
            def test_example_binblob(self):
                mdp = MysqlDumpParser(BytesIO(
                    b"INSERT INTO `tbl` VALUES "
                    b"('Hello \x01\x02\x03');"))
                res = list(mdp.find())
                self.assertEqual(res, [
                    b"INSERT INTO `tbl` VALUES (X'48656C6C6F20010203');"])

            def test_merge_coleq_diffdatattype(self):
                "Datatypes must be the same"
                with self.assertRaises(AssertionError):
                    merge_coleq((
                        MatchRule.from_string('tbl=1:i:32'),
                        MatchRule.from_string('tbl=1:s:33'),
                    ))

            def test_merge_coleq_diffidx_one_rule(self):
                "Different indexes mean one rule in which both must match"
                rules = merge_coleq((
                    MatchRule.from_string('tbl=1:i:32'),
                    MatchRule.from_string('tbl=2:s:john'),
                ))
                self.assertEqual(rules, (
                    MatchRule(b'tbl', ((1, 32), (2, b'john'))),
                ))

            def test_merge_coleq_sameidx_two_rules(self):
                "Same index, means two possible rules to match"
                rules = merge_coleq((
                    MatchRule.from_string('tbl=1:i:32'),
                    MatchRule.from_string('tbl=1:i:33'),
                ))
                self.assertEqual(rules, (
                    MatchRule(b'tbl', ((1, 32),)),
                    MatchRule(b'tbl', ((1, 33),)),
                ))

            def test_merge_coleq_multirules(self):
                "Mixed tables, indexes and values give sorted combinations"
                rules = merge_coleq((
                    MatchRule.from_string('tbl2=2:s:jack'),
                    MatchRule.from_string('tbl=1:i:33'),
                    MatchRule.from_string('tbl=2:s:john'),
                    MatchRule.from_string('tbl=2:s:jack'),
                    MatchRule.from_string('tbl=1:i:32'),
                ))
                self.assertEqual(rules, (
                    # Sorted and everything
                    MatchRule(b'tbl', ((1, 32), (2, b'jack'))),
                    MatchRule(b'tbl', ((1, 32), (2, b'john'))),
                    MatchRule(b'tbl', ((1, 33), (2, b'jack'))),
                    MatchRule(b'tbl', ((1, 33), (2, b'john'))),
                    MatchRule(b'tbl2', ((2, b'jack'),)),
                ))

        unittest_main()
    else:
        main()