Skip to content

Commit 0dc9ff1

Browse files
author
Panupong Pasupat
committed
Removed duplicated files + Added 2 scripts
1 parent 00d56bb commit 0dc9ff1

22 files changed

+300
-141520
lines changed

data/random-split-seed-1-test.examples

-2,831
This file was deleted.

data/random-split-seed-1-test.tsv

-2,831
This file was deleted.

data/random-split-seed-1-train.examples

-11,321
This file was deleted.

data/random-split-seed-1-train.tsv

-11,321
This file was deleted.

data/random-split-seed-2-test.examples

-2,838
This file was deleted.

data/random-split-seed-2-test.tsv

-2,838
This file was deleted.

data/random-split-seed-2-train.examples

-11,314
This file was deleted.

data/random-split-seed-2-train.tsv

-11,314
This file was deleted.

data/random-split-seed-3-test.examples

-2,838
This file was deleted.

data/random-split-seed-3-test.tsv

-2,838
This file was deleted.

data/random-split-seed-3-train.examples

-11,314
This file was deleted.

data/random-split-seed-3-train.tsv

-11,314
This file was deleted.

data/random-split-seed-4-test.examples

-2,831
This file was deleted.

data/random-split-seed-4-test.tsv

-2,831
This file was deleted.

data/random-split-seed-4-train.examples

-11,321
This file was deleted.

data/random-split-seed-4-train.tsv

-11,321
This file was deleted.

data/random-split-seed-5-test.examples

-2,836
This file was deleted.

data/random-split-seed-5-test.tsv

-2,836
This file was deleted.

data/random-split-seed-5-train.examples

-11,316
This file was deleted.

data/random-split-seed-5-train.tsv

-11,316
This file was deleted.

get-predictions.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
"""Get predictions from the log file of SEMPRE."""
4+
5+
import sys, os, shutil, re, argparse
6+
7+
# Matches SEMPRE's top-prediction log lines ("Pred@0000: ...") and captures,
# in order: the formula, the denotation value, the denotation type, and the
# score / probability / compatibility numbers from the trailing bracket.
PATTERN = re.compile(r'Pred@0000: '
                     r'\(derivation \(formula (.*)\)\) '
                     r'\(value (.*)\) '
                     r'\(type (.*)\)\) \[score=(.*), prob=(.*), comp=(.*)\]')
11+
12+
def lisptree_to_python_object(charbuffer):
    """Parse one LISP tree from a character buffer into nested Python lists.

    Args:
        charbuffer: REVERSED list of characters of the lisptree string.
            Characters are consumed (popped) from the end of the list.

    Returns:
        A nested list for a parenthesized tree, or a string for an atom
        (quoted or bare). Backslash escapes are resolved in atoms.
    """
    head = charbuffer.pop()
    if head == '(':
        # Subtree: parse children until the matching ')'.
        children = []
        while charbuffer[-1] != ')':
            if charbuffer[-1] == ' ':
                charbuffer.pop()
            else:
                children.append(lisptree_to_python_object(charbuffer))
        assert charbuffer.pop() == ')'
        return children
    if head == '"':
        # Quoted atom: read up to the closing quote, honoring backslashes.
        chars = []
        while charbuffer[-1] != '"':
            ch = charbuffer.pop()
            chars.append(charbuffer.pop() if ch == '\\' else ch)
        assert charbuffer.pop() == '"'
        return ''.join(chars)
    # Bare atom: read until a space or a closing parenthesis.
    chars = [charbuffer.pop() if head == '\\' else head]
    while charbuffer[-1] not in (' ', ')'):
        ch = charbuffer.pop()
        if ch == '\\':
            chars.append(charbuffer.pop())
        else:
            assert ch != '('
            chars.append(ch)
    return ''.join(chars)
49+
50+
def lisptree_to_values(tree):
    """Convert a SEMPRE "(list ...)" denotation string into tab-joined values.

    Each subtree is rendered as: a float for (number ...), a Y-M-D string
    for (date ...) with 'xx' substituted for -1 fields, or the
    whitespace-normalized text of a (name ...) node.
    """
    assert tree.startswith('(list ') and tree.endswith(')')
    parsed = lisptree_to_python_object(list(tree.decode('utf8'))[::-1])
    assert parsed[0] == 'list'
    values = []
    for node in parsed[1:]:
        tag = node[0]
        if tag == 'number':
            values.append(float(node[1]))
        elif tag == 'date':
            # -1 means the field (year / month / day) is unknown.
            fields = ['xx' if f == '-1' else int(f) for f in node[1:4]]
            values.append('{}-{}-{}'.format(fields[0], fields[1], fields[2]))
        else:
            assert tag == 'name'
            values.append(re.sub('\s+', ' ', node[2]).strip())
    return '\t'.join(unicode(x) for x in values)
67+
68+
def main():
    """Print "ex_id<TAB>denotation" for each example at a given iteration
    of a SEMPRE log file.

    Examples whose iter line is not followed by a Pred@0000 line are
    printed as a bare id (or as "id<TAB>None" for the final example).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='log file')
    parser.add_argument('iteration', help='iteration to extract')
    args = parser.parse_args()

    # Lines belonging to the requested iteration start with "iter=N:".
    prefix = 'iter=%s:' % args.iteration
    ex_id = None
    with open(args.infile) as fin:
        for line in fin:
            line = line.strip()
            if line.startswith(prefix):
                if ex_id is not None:
                    # No prediction for the previous example
                    print ex_id
                # The 4th whitespace token of the iter line is taken as
                # the example id.
                ex_id = line.split()[3]
            elif ex_id is not None and line.startswith('Pred@0000:'):
                match = PATTERN.match(line)
                formula, denotation, deno_type, score, prob, comp = match.groups()
                denotation = lisptree_to_values(denotation)
                print u'{}\t{}'.format(ex_id, denotation)
                # Only the first Pred@0000 per example is emitted.
                ex_id = None
    if ex_id is not None:
        # The last example in the log had no prediction.
        print '\t'.join([ex_id, 'None'])
92+
93+
# Script entry point; keeps the module importable without side effects.
if __name__ == '__main__':
    main()
95+

table-to-csv.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
"""Convert HTML table into CSV / TSV / pretty-printed table."""
4+
5+
import sys, os, re, argparse, json
6+
from codecs import open
7+
from collections import defaultdict
8+
from weblib.table import Table
9+
from itertools import izip_longest
10+
11+
################ Dump CSV
12+
13+
def simple_normalize_text(text):
    """Escape a cell's text for quoted CSV output.

    Backslashes are doubled first, then double quotes and newlines are
    backslash-escaped, non-breaking spaces become plain spaces, and
    surrounding whitespace is stripped.
    """
    # Order matters: backslashes must be doubled before adding escapes.
    replacements = [
        ('\\', '\\\\'),
        ('"', r'\"'),
        ('\n', r'\\n'),
        (u'\xa0', ' '),
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    return text.strip()
15+
16+
def dump_csv(rows, fout):
    """Write rows to fout as comma-separated, double-quoted cells."""
    for row in rows:
        cells = ['"%s"' % simple_normalize_text(cell[1]) for cell in row]
        fout.write(','.join(cells) + '\n')
19+
20+
def tab_normalize_text(text):
    """Normalize a cell's text for TSV output.

    Doubles backslashes, escapes the column separator '|' as r'\p' and
    newlines as r'\n', then collapses every whitespace run into a single
    space and strips the ends.
    """
    # BUG FIX: re.U was previously passed as re.sub's positional `count`
    # argument (silently capping the number of substitutions at 32 and
    # not enabling Unicode matching at all); it must go in `flags=`.
    escaped = (text.replace('\\', '\\\\')
                   .replace('|', r'\p')
                   .replace('\n', r'\n'))
    return re.sub(r'\s+', ' ', escaped, flags=re.U).strip()
22+
23+
def dump_tsv(rows, fout):
    """Write rows to fout as tab-separated cells, one line per row."""
    for row in rows:
        line = '\t'.join(tab_normalize_text(cell[1]) for cell in row)
        fout.write(line + '\n')
26+
27+
def table_normalize_text(text):
    """Collapse whitespace runs to single spaces and strip the ends.

    Used for the pretty-printed, fixed-width table output.
    """
    # BUG FIX: re.U was previously passed as re.sub's positional `count`
    # argument (capping substitutions at 32); pass it via `flags=` so it
    # actually enables Unicode whitespace matching.
    return re.sub(r'\s+', ' ', text, flags=re.U).strip()
29+
30+
def dump_table(rows, fout):
    """Pretty-print rows to fout as a pipe-delimited, column-aligned table."""
    # Column width = widest normalized cell text in that column, plus 1.
    widths = defaultdict(int)
    for row in rows:
        for idx, cell in enumerate(row):
            widths[idx] = max(widths[idx],
                              len(table_normalize_text(cell[1])) + 1)
    for row in rows:
        pieces = ['|']
        for idx, cell in enumerate(row):
            # Left-justify each cell to its column width.
            pieces.append(' ' + table_normalize_text(cell[1]).ljust(widths[idx]))
            pieces.append('|')
        pieces.append('\n')
        fout.write(''.join(pieces))
42+
43+
################ More table normalization
44+
45+
def debug_print(stuff):
    """Write the normalized text of each group of cells to stderr."""
    for group in stuff:
        texts = [simple_normalize_text(cell[1]) for cell in group]
        sys.stderr.write('%s\n' % texts)
48+
49+
def transpose(rows):
    """Turn a list of rows into a list of columns.

    Rows shorter than the widest row are padded with empty ('', '')
    cells so every column has one entry per row.
    """
    width = max(len(row) for row in rows)
    cols = []
    for i in range(width):
        col = [row[i] if i < len(row) else ('', '') for row in rows]
        cols.append(col)
    return cols
61+
62+
def anti_transpose(cols):
    """Turn a list of equal-length columns back into a list of rows.

    None cells are replaced with empty ('', '') cells.
    """
    # All col in cols must have equal length.
    assert len(set(len(col) for col in cols)) == 1
    height = len(cols[0])
    rows = []
    for i in range(height):
        rows.append([col[i] if col[i] is not None else ('', '')
                     for col in cols])
    return rows
76+
77+
def remove_full_rowspans(rows):
    """Drop rows whose cells are all identical (full-width rowspans)."""
    kept = []
    for row in rows:
        if len(set(row)) > 1:
            kept.append(row)
    return kept
80+
81+
def remove_empty_columns(orig_cols):
    """Keep only columns that contain at least 2 non-empty cells."""
    return [col for col in orig_cols
            if sum(1 for cell in col if cell[1]) >= 2]
89+
90+
#### Merge columns
91+
92+
def are_mergeable(col1, col2):
    """Return the merged column if col1 and col2 can be merged, else None.

    A cell pair merges when at least one text is empty or both cells are
    identical; the non-empty (or shared) cell wins.
    """
    assert len(col1) == len(col2)
    merged = []
    for c1, c2 in zip(col1, col2):
        if not c1[1]:
            merged.append(c2)
        elif not c2[1] or c1 == c2:
            merged.append(c1)
        else:
            # Conflicting non-empty texts: the columns cannot merge.
            return None
    return merged
104+
105+
def merge_similar_columns(orig_cols):
    """Merge adjacent mergeable columns, modifying orig_cols in place."""
    idx = 0
    while idx + 1 < len(orig_cols):
        combined = are_mergeable(orig_cols[idx], orig_cols[idx + 1])
        if combined is None:
            idx += 1
        else:
            # Replace the pair with the merged column and retry at the
            # same position: the result may merge with the next column too.
            orig_cols[idx:idx + 2] = [combined]
    return orig_cols
115+
116+
#### Merge header rows
117+
118+
def merge_header_rows(orig_rows):
    """Collapse a multi-row header into a single row of 'th' cells.

    Leading rows containing no 'td' cell are treated as the header.
    Header cells are merged column-wise, joining distinct consecutive
    texts with '\n'. The rows are returned unchanged when there are
    fewer than 2 header rows or no body rows.
    """
    header_rows, body_rows = [], []
    in_header = True
    for row in orig_rows:
        if in_header and all(cell[0] != 'td' for cell in row):
            header_rows.append(row)
        else:
            in_header = False
            body_rows.append(row)
    if len(header_rows) < 2 or not body_rows:
        return orig_rows
    # Merge header rows column by column, skipping consecutive duplicates.
    merged_header = []
    for col in transpose(header_rows):
        texts = []
        for cell in col:
            if not texts or cell[1] != texts[-1]:
                texts.append(cell[1])
        merged_header.append(('th', '\n'.join(texts)))
    return [merged_header] + body_rows
140+
141+
################ Main function
142+
143+
def main():
    """Locate an HTML table via MTurk JSON metadata, clean it, and dump it
    as CSV (plus optional TSV / pretty-printed table / raw HTML outputs).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
                        help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
                        help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
                        help='also print out tsv')
    parser.add_argument('--human', action='store_true',
                        help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
                        help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
                        help='keep hidden texts as is')
    args = parser.parse_args()
    # Extra outputs reuse the '.csv' stem, so --tsv requires a .csv outfile.
    assert not args.tsv or args.outfile.endswith('.csv')

    with open(args.turk_json) as fin:
        metadata = json.load(fin)

    # Get the path to the HTML file
    # This is kind of hacky
    match = re.match(r'^(?:json|page)/(\d+)-(?:json|page)/(\d+).json$', args.turk_json)
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    # Note: `open` here is codecs.open (see imports), hence the 'utf8' arg.
    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()
        table = Table.get_wikitable(raw, metadata['tableIndex'],
                                    normalization=Table.NORM_DUPLICATE,
                                    remove_hidden=(not args.keep_hidden))
        if args.html:
            # Re-extract without hiding/normalization to keep raw markup.
            raw_table = Table.get_wikitable(raw, metadata['tableIndex'],
                                            remove_hidden=False).table

    rows = table.rows
    # rows = list of rows; row = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)
    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        stem = re.sub('\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                print >> fout, unicode(raw_table)
203+
204+
# Script entry point; keeps the module importable without side effects.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)