forked from Barro/assembly-entries-metadata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
match-directory-files-to-section.py
91 lines (74 loc) · 2.97 KB
/
match-directory-files-to-section.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import argparse
import asmmetadata
import logging
import os
import pyjarowinkler.distance
import re
import sys
def normalize_remove_suffix(value):
    """Return value without its last ".suffix" part.

    The suffix is only removed when at least one character precedes the
    dot and at least one non-dot character follows it, so names such as
    ".hidden" or "name." are returned unchanged.
    """
    suffix_pattern = re.compile(r"(.)\.[^.]+$")
    # Group 1 is the character in front of the dot; substituting it back
    # keeps that character while dropping the dot and the suffix.
    return suffix_pattern.sub("\\1", value)
def normalize_remove_numeric_prefix(value):
    """Return value without a leading two-digit prefix and its separator.

    The prefix is exactly two digits at the start of the string followed
    by one of ".", "-", " " or "_". Anything else is returned unchanged.
    """
    prefix = re.match(r"^\d\d[.\- _]", value)
    if prefix is None:
        return value
    return value[prefix.end():]
def select_best_candidates(entry, filenames):
    """Return the file names that score highest against the entry's key.

    Each file name has its two-digit numeric prefix and its last suffix
    removed, is passed through asmmetadata.normalize_key(), and is then
    compared with asmmetadata.get_entry_key(entry) using
    pyjarowinkler.distance.get_jaro_distance().

    Returns:
        Every file name (as originally given, in input order) whose score
        equals the highest score seen. An empty list is returned when the
        highest score is below 0.75, which includes an empty filenames
        input.
    """
    wanted_key = asmmetadata.get_entry_key(entry)

    scored = []
    for candidate in filenames:
        candidate_key = asmmetadata.normalize_key(
            normalize_remove_suffix(
                normalize_remove_numeric_prefix(candidate)))
        score = pyjarowinkler.distance.get_jaro_distance(
            wanted_key, candidate_key)
        scored.append((candidate, score))

    # Seed with 0.0 so an empty candidate list falls under the threshold.
    top_score = max([0.0] + [score for _, score in scored])
    if top_score < 0.75:
        return []

    # Ties are kept on purpose; the caller decides between them.
    return [candidate for candidate, score in scored if score == top_score]
def select_bestest_match(entry, best_matches):
    """Pick the one file to use out of several equally scored candidates.

    Args:
        entry: mapping describing the entry; only used to label the error.
        best_matches: candidate file names, in order of preference.

    Returns:
        The first element of best_matches that does not end in ".diz".

    Raises:
        ValueError: if best_matches is empty or holds only ".diz" names.
    """
    for best_match in best_matches:
        if not best_match.endswith(".diz"):
            return best_match

    # main() in this file labels entries by their "title" key, so use the
    # same key here. Fall back to "name", the key this message used to
    # read, so entries that only carry that key keep working. Without
    # this, an entry lacking "name" raised KeyError instead of ValueError.
    try:
        label = entry["title"]
    except KeyError:
        label = entry["name"]
    raise ValueError("No best match for %s" % label)
def _list_candidate_files(directory):
    """Return the matchable file names found directly inside directory.

    Only the first level reported by os.walk() is used. Names ending in
    ".diz" and the name "index.html" are left out.

    Returns:
        A list of file names, or None when os.walk() yields nothing
        (for example because the directory does not exist).
    """
    for _, _, filenames in os.walk(directory):
        # Just get the top-level filenames, so stop at the first level.
        return [
            name for name in filenames
            if not name.endswith(".diz") and name != "index.html"
        ]
    return None


def main(argv):
    """Match the files of a directory to the entries of one section.

    For every entry of the requested section the best matching file name
    is stored as ``entry[attribute-name] = value-prefix + filename`` and
    the data file is then rewritten in place. Chosen matches, unused tied
    candidates and left-over files are printed to stdout; entries without
    any match are reported on stderr and left unchanged.

    Args:
        argv: full argument vector with the program name first, followed
            by datafile, attribute-name, value-prefix, directory and
            section.

    Returns:
        os.EX_DATAERR when the section is not found, os.EX_NOINPUT when
        the directory cannot be listed, otherwise None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("datafile")
    parser.add_argument("attribute_name", metavar="attribute-name")
    parser.add_argument("value_prefix", metavar="value-prefix")
    parser.add_argument("directory")
    parser.add_argument("section")
    args = parser.parse_args(argv[1:])

    # Close the input handle before the same path is reopened for writing.
    with open(args.datafile, "r") as datafile:
        entry_data = asmmetadata.parse_file(datafile)

    section = entry_data.getSection(args.section)
    if section is None:
        logging.error("No section with key %r", args.section)
        return os.EX_DATAERR

    filenames = _list_candidate_files(args.directory)
    if filenames is None:
        # Previously this surfaced later as a NameError on "filenames".
        logging.error("Can not list files in directory %r", args.directory)
        return os.EX_NOINPUT

    for entry in section["entries"]:
        best_matches = select_best_candidates(entry, filenames)
        if not best_matches:
            sys.stderr.write("NO MATCH: %s\n" % entry["title"])
            continue
        elif len(best_matches) > 1:
            best_match = select_bestest_match(entry, best_matches)
        else:
            best_match, = best_matches

        # Everything that tied with the chosen file but was not picked.
        not_used_matches = best_matches[:]
        not_used_matches.remove(best_match)

        print("%s -> %s" % (asmmetadata.get_entry_key(entry), best_match))
        if not_used_matches:
            print(" UNUSED: %s" % ", ".join(not_used_matches))
        entry[args.attribute_name] = "%s%s" % (args.value_prefix, best_match)

        # Tied candidates are consumed as well so that they are neither
        # matched to a later entry nor reported as extra files.
        for match in best_matches:
            filenames.remove(match)

    if filenames:
        print("EXTRA: %s" % ", ".join(filenames))

    # The context manager guarantees the rewritten file is flushed and
    # closed before the process exits.
    with open(args.datafile, "w") as datafile:
        asmmetadata.print_metadata(datafile, entry_data)
if __name__ == "__main__":
    # main() returns an os.EX_* code on failure and None on success; both
    # are valid arguments for sys.exit().
    exit_status = main(sys.argv)
    sys.exit(exit_status)