You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I figured I'd contribute back; this has worked on 5,000 legal documents from many sources. Here is a wrapper I wrote around W2N to support my use case. Some of it works around typos in the original documents or bad scans.
In my case I don't care about pennies, so I removed the xx/100-type text. Hope this helps someone.
my_name = sys._getframe().f_code.co_name
if val is not None and len(val) > 0:
val = re.sub("\$,", "", val).lower()
# remove 00/100 or other values
val = re.sub(r' AND \d+\/100(ths|th)?', '', val, flags=re.IGNORECASE)
val = re.sub(r' & \d+\/100(ths|th)?', '', val, flags=re.IGNORECASE)
val = re.sub(r' AND NO\/100?', '', val, flags=re.IGNORECASE)
# e.g. four hundred AND two
if val.isnumeric(): #or val.find('and') != -1:
return val
val = re.sub("^venty", "seventy", val, flags=re.IGNORECASE)
val = re.sub("^irty", "thirty", val, flags=re.IGNORECASE)
val = re.sub("^fty", "fifty", val, flags=re.IGNORECASE)
val = re.sub("^neteen", "nineteen", val, flags=re.IGNORECASE)
val = re.sub("^fteen", "fifteen", val, flags=re.IGNORECASE)
val = re.sub("eightfen", "eighteen", val, flags=re.IGNORECASE)
val = re.sub("^ghteen", "eighteen", val, flags=re.IGNORECASE)
val = re.sub(" ghteen", "eighteen", val, flags=re.IGNORECASE)
val = re.sub("sixten", "sixteen", val, flags=re.IGNORECASE)
val = re.sub("^iwo ", "two ", val, flags=re.IGNORECASE)
val = re.sub("^o ", "two ", val, flags=re.IGNORECASE)
val = re.sub("^ven", "seven", val, flags=re.IGNORECASE)
val = re.sub("^even", "seven", val, flags=re.IGNORECASE)
val = re.sub("^ve", "five", val, flags=re.IGNORECASE)
val = re.sub("^x", "six", val, flags=re.IGNORECASE)
val = re.sub("elght", "eight", val, flags=re.IGNORECASE)
val = re.sub("light", "eight", val, flags=re.IGNORECASE)
val = re.sub("^ght", "eight", val, flags=re.IGNORECASE)
val = re.sub("^n ", "ten ", val, flags=re.IGNORECASE)
val = re.sub("^nety", "ninety", val, flags=re.IGNORECASE)
val = re.sub("^elve", "twelve", val, flags=re.IGNORECASE)
# fix hyphenated words so they include the hyphen
for t in tens:
while True:
m = val.find(t)
dash_pos = m + len(t)
if m != -1 and len(val) > dash_pos and val[dash_pos:dash_pos+1].isalpha():
val = val[:dash_pos ] + '-' + val[dash_pos:]
else:
break
try:
val = str(w2n.word_to_num(val))
except ValueError as e:
pass
#print(my_name, ": value to be converted:", val)
return val
The text was updated successfully, but these errors were encountered:
I figured I'd contribute back; this has worked on 5,000 legal documents from many sources. Here is a wrapper I wrote around W2N to support my use case. Some of it works around typos in the original documents or bad scans.
In my case I don't care about pennies, so I removed the xx/100-type text. Hope this helps someone.
# Opening lines of word2value(val): `tens` lists the tens words that a
# `for t in tens:` loop searches for when inserting hyphens.
# NOTE(review): the listing is cut off here and has lost its indentation;
# presumably the statements shown earlier on this page are the rest of this
# function's body -- confirm against the original post.
def word2value(val):
tens = [
'twenty', 'thirty', 'forty', 'fifty','sixty', 'seventy', 'eighty', 'ninety']
The text was updated successfully, but these errors were encountered: