#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert HTML table into CSV / TSV / pretty-printed table."""

import sys, re, argparse, json
from codecs import open
from collections import defaultdict
from weblib.table import Table

################ Dump CSV

def simple_normalize_text(text):
    """Escape backslashes, double quotes, and newlines; replace non-breaking
    spaces with plain spaces."""
    return text.replace('\\', '\\\\').replace('"', r'\"').replace('\n', r'\n').replace(u'\xa0', ' ').strip()

def dump_csv(rows, fout):
    """Write each row as a line of comma-separated, double-quoted cells."""
    for row in rows:
        fout.write(','.join('"%s"' % simple_normalize_text(x[1]) for x in row) + '\n')
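
# For example (hypothetical cells), a quote and a newline inside a cell
# are escaped before dumping:
#   dump_csv([[('td', 'a "b"'), ('td', 'c\nd')]], sys.stdout)
# prints:
#   "a \"b\"","c\nd"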

def tab_normalize_text(text):
    """Escape backslashes, pipes, and newlines; collapse whitespace runs."""
    return re.sub(r'\s+', ' ', text.replace('\\', '\\\\').replace('|', r'\p').replace('\n', r'\n'), flags=re.U).strip()

def dump_tsv(rows, fout):
    """Write each row as a line of tab-separated cells."""
    for row in rows:
        fout.write('\t'.join(tab_normalize_text(x[1]) for x in row) + '\n')

def table_normalize_text(text):
    """Collapse whitespace runs into single spaces."""
    return re.sub(r'\s+', ' ', text, flags=re.U).strip()

def dump_table(rows, fout):
    """Pretty-print rows as a pipe-delimited table with aligned columns."""
    widths = defaultdict(int)
    for row in rows:
        for i, cell in enumerate(row):
            widths[i] = max(widths[i], len(table_normalize_text(cell[1])) + 1)
    for row in rows:
        fout.write('|')
        for i, cell in enumerate(row):
            # Left-justify each cell to its column's maximum width
            fout.write((' %-' + str(widths[i]) + 's') % table_normalize_text(cell[1]))
            fout.write('|')
        fout.write('\n')
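
# For example (hypothetical cells), dump_table writes aligned rows such as:
#   | Year | Winner |
#   | 2014 | Alice  |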

################ More table normalization

def debug_print(stuff):
    """Print normalized cell texts to stderr for debugging."""
    for x in stuff:
        print >> sys.stderr, [simple_normalize_text(y[1]) for y in x]

def transpose(rows):
    """Transpose rows into columns, padding short rows with empty cells."""
    cols = []
    n = max(len(row) for row in rows)
    for i in xrange(n):
        col = []
        for row in rows:
            try:
                col.append(row[i])
            except LookupError:
                col.append(('', ''))
        cols.append(col)
    return cols
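
# For example (hypothetical cells), transposing ragged rows pads the gap:
#   transpose([[('td', 'a'), ('td', 'b')], [('td', 'c')]])
#   == [[('td', 'a'), ('td', 'c')], [('td', 'b'), ('', '')]]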

def anti_transpose(cols):
    """Transpose columns back into rows."""
    # All columns in cols must have equal length
    assert len(set(len(col) for col in cols)) == 1
    rows = []
    n = len(cols[0])
    for i in xrange(n):
        row = []
        for col in cols:
            if col[i] is not None:
                row.append(col[i])
            else:
                row.append(('', ''))
        rows.append(row)
    return rows

def remove_full_rowspans(rows):
    """Remove rows in which all cells have the same text."""
    return [row for row in rows if len(set(row)) > 1]

def remove_empty_columns(orig_cols):
    """Remove columns with fewer than 2 non-empty cells."""
    cols = []
    for col in orig_cols:
        non_empty = sum(bool(cell[1]) for cell in col)
        if non_empty >= 2:
            cols.append(col)
    return cols
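
# For example (hypothetical cells), a column with a single non-empty cell
# is dropped:
#   remove_empty_columns([[('td', 'x'), ('td', '')]]) == []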

#### Merge columns

def are_mergeable(col1, col2):
    """Return the merged column if col1 and col2 can be merged, else None.

    Two columns are mergeable if, in every row, the two cells are equal
    or at least one of them is empty.
    """
    assert len(col1) == len(col2)
    merged = []
    for i in xrange(len(col1)):
        c1, c2 = col1[i], col2[i]
        if not c1[1]:
            merged.append(c2)
        elif not c2[1] or c1 == c2:
            merged.append(c1)
        else:
            return None
    return merged

def merge_similar_columns(orig_cols):
    """Repeatedly merge adjacent mergeable columns."""
    i = 0
    while i + 1 < len(orig_cols):
        merged = are_mergeable(orig_cols[i], orig_cols[i+1])
        if merged is not None:
            orig_cols[i:i+2] = [merged]
        else:
            i += 1
    return orig_cols
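
# For example (hypothetical cells), two complementary half-empty columns
# collapse into one:
#   merge_similar_columns([[('td', 'x'), ('td', '')],
#                          [('td', ''), ('td', 'y')]])
#   == [[('td', 'x'), ('td', 'y')]]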

#### Merge header rows

def merge_header_rows(orig_rows):
    """Merge all leading rows with no <td> cells into a single header row."""
    header_rows, body_rows = [], []
    still_header = True
    for row in orig_rows:
        if not still_header or any(cell[0] == 'td' for cell in row):
            still_header = False
            body_rows.append(row)
        else:
            header_rows.append(row)
    if len(header_rows) < 2 or not body_rows:
        return orig_rows
    # Merge header rows with '\n', skipping consecutive duplicate texts
    header_cols = transpose(header_rows)
    header_row = []
    for col in header_cols:
        texts = [None]
        for cell in col:
            if cell[1] != texts[-1]:
                texts.append(cell[1])
        header_row.append(('th', '\n'.join(texts[1:])))
    return [header_row] + body_rows
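
# For example (hypothetical cells), two stacked header rows merge into one:
#   merge_header_rows([[('th', 'Year')], [('th', 'Winner')], [('td', '2014')]])
#   == [[('th', 'Year\nWinner')], [('td', '2014')]]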

################ Main function

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
            help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
            help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
            help='also print out tsv')
    parser.add_argument('--human', action='store_true',
            help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
            help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
            help='keep hidden texts as is')
    args = parser.parse_args()
    assert not args.tsv or (args.outfile and args.outfile.endswith('.csv'))

    with open(args.turk_json) as fin:
        metadata = json.load(fin)

    # Derive the path to the HTML file from the metadata filename
    match = re.match(r'^(?:json|page)/(\d+)-(?:json|page)/(\d+)\.json$', args.turk_json)
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()
        table = Table.get_wikitable(raw, metadata['tableIndex'],
                                    normalization=Table.NORM_DUPLICATE,
                                    remove_hidden=(not args.keep_hidden))
        if args.html:
            raw_table = Table.get_wikitable(raw, metadata['tableIndex'],
                                            remove_hidden=False).table

    rows = table.rows
    # rows = list of rows; row = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)
    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        stem = re.sub(r'\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                print >> fout, unicode(raw_table)

if __name__ == '__main__':
    main()