udiff

#!/usr/bin/env python
# udiff (part of ossobv/vcutil) // wdoekes/2011,2025 // Public Domain
#
# Find minor differences in large files (microdiff, or unified-diff *grin*).
#
# Use this instead of regular diff(1) which is prone to choke with a
# "memory exhausted" error on large files. This can be useful when
# comparing database dumps before and after an operation: lots of
# invariant INSERT statements and only a handful more or less caused
# by the operation.
#
# See also: http://wjd.nu/notes/2011#diff-memory-exhausted-udiff
import datetime
import os

BUFFER_LINES = 10000


class LineByLineReader(object):
    '''A file reader that reads a file line by line which can look up if
    a similar line exists a bit down the road. The maximum number of
    lines to look forward to is defined by max_lookahead.'''

    def __init__(self, file, max_lookahead=BUFFER_LINES):
        super(LineByLineReader, self).__init__()
        self.file = file
        self.max_lookahead = max_lookahead
        self.lineno = 0
        self.buffer = []  # list of (lineno, line) tuples
        self.bigbuffer = b''

    def _fill(self):
        '''Add another line to the internal buffer. Raises
        StopIteration on EOF.'''
        if self.bigbuffer is None:
            raise StopIteration()

        eof = False
        while True:
            more = self.file.read(8192)
            if not more:
                eof = True
                break
            if b'\n' in more:
                self.bigbuffer += more
                break
            self.bigbuffer += more

        items = self.bigbuffer.split(b'\n')
        for item in items[0:-1]:
            self.lineno += 1
            self.buffer.append((self.lineno, item))

        if eof:
            if items[-1]:
                # Missing LF at eof..
                self.lineno += 1
                self.buffer.append((self.lineno, items[-1]))
            self.bigbuffer = None
            if not self.buffer:
                raise StopIteration()
        else:
            self.bigbuffer = items[-1]

    def _pop(self):
        '''Pops a line from the internal buffer.'''
        ret = self.buffer[0]
        self.buffer.pop(0)
        return ret

    def has_next(self):
        '''Returns whether there are more lines or not.'''
        if not self.buffer:
            try:
                self._fill()
            except StopIteration:
                return False
        return True

    def push_back(self, linedata):
        '''Push a line back into the buffer.'''
        self.buffer.insert(0, linedata)

    def next(self):
        '''Serve the next line or None if we're at EOF.'''
        if not self.buffer:
            try:
                self._fill()
            except StopIteration:
                return None
        return self._pop()

    def find(self, line):
        '''Returns amount of lines to skip if line is found or None.'''
        i = 0
        while True:
            for i, buf in enumerate(self.buffer[i:], i):
                if buf[1] == line:
                    return i
            if len(self.buffer) >= self.max_lookahead:
                break
            try:
                self._fill()
            except StopIteration:
                break
        return None

    def __repr__(self):
        return 'LineByLineReader(%s)' % (self.file.name,)


def filediff(file1, file2):
    max_context = 3

    reader1 = LineByLineReader(file1)
    reader2 = LineByLineReader(file2)

    # How many lines of context we still need to print:
    post_context_needed = 0
    mode = 'skip'  # 'skip', 'diff', 'context'
    has_heading = False  # any heading at all

    context = []

    lineno1 = lineno2 = -1
    while reader1.has_next() and reader2.has_next():
        lineno1, line1 = reader1.next()
        lineno2, line2 = reader2.next()
        # Ah, lines are the same.
        if line1 == line2:
            # Are we at the end of a diff?
            if mode == 'diff':
                mode = 'context'
                post_context_needed = max_context
            # If we need post-diff-context, yield that
            if mode == 'context':
                if post_context_needed:
                    yield b' %s\n' % (line1,)
                    post_context_needed -= 1
                else:
                    mode = 'skip'
            # Assert we have context for the next diff
            if mode == 'skip':
                context.append(line1)
                while len(context) > max_context:
                    context.pop(0)
        # Lines are different.
        else:
            # Dump sub-heading and context if not done yet
            if mode == 'skip':
                if not has_heading or len(context) == max_context:
                    yield b'@@ -%d,? +%d,? @@\n' % (
                        lineno1 - len(context), lineno2 - len(context))
                    has_heading = True
                while context:
                    yield b' %s\n' % (context.pop(0),)
            mode = 'diff'
            # Attempt to find a match a bit later on
            lines2 = reader2.find(line1)
            lines1 = reader1.find(line2)
            if lines1 is not None or lines2 is not None:
                # Okay.. we've found something here
                if lines1 is not None and (
                        lines2 is None or lines1 < lines2):
                    line, reader, lines, sign = line1, reader1, lines1, b'-'
                    reader2.push_back((lineno2, line2))
                elif lines2 is not None and (
                        lines1 is None or lines2 < lines1):
                    line, reader, lines, sign = line2, reader2, lines2, b'+'
                    reader1.push_back((lineno1, line1))
                else:
                    # Both differences are "smallest", 1 goes first
                    line, reader, lines, sign = line1, reader1, lines1, b'-'
                    reader2.push_back((lineno2, line2))
                yield b'%s%s\n' % (sign, line)
                while lines:
                    yield b'%s%s\n' % (sign, reader.next()[1])
                    lines -= 1
            else:
                yield b'-%s\n' % (line1,)
                yield b'+%s\n' % (line2,)

    # Dump sub-heading and context if not done yet
    if mode == 'skip' and (reader1.has_next() or reader2.has_next()):
        # Only print heading if !has_heading or the context is fresh
        if not has_heading or len(context) == max_context:
            # Handle the special case when one of the files is empty
            if lineno1 == -1 and reader1.has_next():
                lineno1 = 0
            if lineno2 == -1 and reader2.has_next():
                lineno2 = 0
            yield b'@@ -%d,? +%d,? @@\n' % (
                lineno1 - len(context) + 1, lineno2 - len(context) + 1)
            has_heading = True
        while context:
            yield b' %s\n' % (context.pop(0),)
    while reader1.has_next():
        yield b'-%s\n' % (reader1.next()[1],)
    while reader2.has_next():
        yield b'+%s\n' % (reader2.next()[1],)


def fixheadings(iterator):
    buf = []
    blank = left = right = 0
    for line in iterator:
        if line.startswith(b'@@'):
            if buf:
                yield (
                    buf[0].replace(b'?', b'%u' % (blank + left), 1)
                    .replace(b'?', b'%u' % (blank + right), 1))
                for i in buf[1:]:
                    yield i
                buf = []
                blank = left = right = 0
        elif line.startswith(b'-'):
            left += 1
        elif line.startswith(b'+'):
            right += 1
        else:
            blank += 1
        buf.append(line)

        # Safety feature: don't overrun memory to fix the numbers.
        # Flush the whole thing and exit.
        if len(buf) > BUFFER_LINES:
            for i in buf:
                yield i
            for i in iterator:
                yield i
            return

    if buf:
        yield (
            buf[0].replace(b'?', b'%u' % (blank + left), 1)
            .replace(b'?', b'%u' % (blank + right), 1))
        for i in buf[1:]:
            yield i


def diff(args, stdout, stderr):
    if len(args) != 2:
        stderr.write(b'diff: need two arguments\n')
        return 1

    file1 = open(args[0], 'rb')
    file2 = open(args[1], 'rb')

    generator = fixheadings(filediff(file1, file2))

    # Print header before first output
    for output in generator:
        mtime, mdate, nsec = [None, None], [None, None], [None, None]
        for i in range(2):
            mtime[i] = os.stat(args[i]).st_mtime
            mdate[i] = datetime.datetime.fromtimestamp(mtime[i])
            nsec[i] = '%09d' % (int((mtime[i] - int(mtime[i])) * 1000000000),)
        fmt = '%Y-%m-%d %H:%M:%S'
        tz = '+0000'  # for another rainy day
        stdout.write(('--- %s\t%s.%s %s\n' % (
            args[0], mdate[0].strftime(fmt), nsec[0], tz)).encode('utf-8'))
        stdout.write(('+++ %s\t%s.%s %s\n' % (
            args[1], mdate[1].strftime(fmt), nsec[1], tz)).encode('utf-8'))
        stdout.write(output)
        break
    # Print other output
    for output in generator:
        stdout.write(output)
        pass


if __name__ == '__main__':
    import sys
    if hasattr(sys.stdout, 'buffer'):
        stdout, stderr = sys.stdout.buffer, sys.stderr.buffer
    else:
        stdout, stderr = sys.stdout, sys.stderr  # py2
        BrokenPipeError = IOError
    try:
        ret = diff(sys.argv[1:], stdout, stderr)
    except BrokenPipeError:
        # Hard exit avoiding the flush to stdout which would trigger a:
        # > Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w'
        # >   encoding='utf-8'>
        # > BrokenPipeError: [Errno 32] Broken pipe
        os._exit(1)
    if ret != 0:
        sys.exit(ret)
# vim: set ts=8 sw=4 sts=4 et ai: