diff --git a/lib/tokenizer/tokenizer.rb b/lib/tokenizer/tokenizer.rb index 1016f5c..4d9bb14 100644 --- a/lib/tokenizer/tokenizer.rb +++ b/lib/tokenizer/tokenizer.rb @@ -22,7 +22,20 @@ class WhitespaceTokenizer PAIR_POST = [')', '}', ']', '>', '»', '“'] # Characters which can be both prefixes AND suffixes. - PRE_N_POST = ['"', "'"] + PRE_N_POST = ['"'] + + # Characters which can be both prefixes and suffixes but are only splittable + # if at the beginning or end of a token with the exception of being prefixed/suffixed + # by other splittables. + # taking the single quote "'" as a PRE_N_POST_ONLY splittable, + # The following would be valid uses as a splittable: + # 'test quotes' + # 'test quotes'. <- suffixed by another splittable + # ('test quotes'). <- prefixed and suffixed by another splittable + # The following would not be valid uses as a splittable: + # l'interrelation + # l'imagerie + PRE_N_POST_ONLY = ["'"] private_constant :FS @@ -49,12 +62,50 @@ def tokenize(str) splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+") + pattern_prepostonly_pfix = + Regexp.new("^[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*[#{ + Regexp.escape(PRE_N_POST_ONLY.join)}]+[#{ + Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*") + pattern_prepostonly_sfix = + Regexp.new("[#{Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*[#{ + Regexp.escape(PRE_N_POST_ONLY.join)}]+[#{ + Regexp.escape((splittables + PRE_N_POST_ONLY).join)}]*$") + #most accommodating url regex I found was here: + #http://stackoverflow.com/a/24058129/4852737 + url_pattern = %r{(([\w]+:)?\/\/)?(([\d\w]|%[a-fA-f\d]{2,2})+ + (:([\d\w]|%[a-fA-f\d]{2,2})+) + ?@)?([\d\w][-\d\w]{0,253}[\d\w]\.)+[\w]{2,63}(:[\d]+)?(\/([-+_~.\d\w] + |%[a-fA-f\d]{2,2})*)*(\?(&?([-+_~.\d\w]|%[a-fA-f\d]{2,2})=?)*)?(# + ([-+_~.\d\w]|%[a-fA-f\d]{2,2})*)?} output = [] tokens.each do |token| - prefix, stem, suffix = token.partition(pattern) - 
output << prefix.split('') unless prefix.empty? - output << stem unless stem.empty? - output << suffix.split('') unless suffix.empty? + if url_pattern.match(token) + #if token is validated as a URL and its last character is a splittable, split it out + output << (splittables.include?(token[-1]) ? + [token[0...-1],token[-1]] : token) + else + #if prefix chars are PRE_N_POST_ONLY splittable then split + prefix, stem, suffix = token.partition(pattern_prepostonly_pfix) + output << stem.split('') unless stem.empty? + token_remaining = stem.empty? ? prefix : suffix + prefix, stem, suffix = token_remaining.partition(pattern) + output << prefix.split('') unless prefix.empty? + unless stem.empty? + #if suffix chars are any splittable including PRE_N_POST_ONLY then split + prefix, stem, suffix_discard = stem.partition(pattern_prepostonly_sfix) + output << prefix unless prefix.empty? + output << stem.split('') unless stem.empty? + end + #while suffix is not empty, take the first character as a splittable token, + #and partition remaining suffix + while suffix.length > 0 + prior_suffix = suffix + output << suffix[0] + prefix, stem, suffix = prior_suffix[1..-1].partition(pattern) + output << prefix.split('') unless prefix.empty? + output << stem unless stem.empty? 
+ end + end end output.flatten diff --git a/test/development_tests/test_tokenize_urls.rb b/test/development_tests/test_tokenize_urls.rb new file mode 100644 index 0000000..fb6f276 --- /dev/null +++ b/test/development_tests/test_tokenize_urls.rb @@ -0,0 +1,62 @@ +# coding: utf-8 +require 'minitest/autorun' +require 'minitest/spec' +require 'tokenizer' + +class TestTokenizerUrls < Minitest::Test + def setup + @t = Tokenizer::WhitespaceTokenizer.new(:de) + end + + def test_url_tokenization_001 + assert_equal(@t.tokenize('test url www.google.com.'), + ['test','url','www.google.com','.']) + end + + def test_url_tokenization_002 + assert_equal(@t.tokenize('test url www.google.com.au.'), + ['test','url','www.google.com.au','.']) + end + + def test_url_tokenization_003 + assert_equal(@t.tokenize('test url http://www.google.com.au.'), + ['test','url','http://www.google.com.au','.']) + end + + def test_url_tokenization_004 + assert_equal(@t.tokenize('test url https://www.google.com.au.'), + ['test','url','https://www.google.com.au','.']) + end + + def test_url_tokenization_005 + assert_equal(@t.tokenize('test url ftp://www.google.com.au.'), + ['test','url','ftp://www.google.com.au','.']) + end + + def test_url_tokenization_006 + assert_equal(@t.tokenize('test url Google.com.'), + ['test','url','Google.com','.']) + end + + def test_url_tokenization_007 + assert_equal(@t.tokenize('test url Au.ac.'), + ['test','url','Au.ac','.']) + end + + def test_url_tokenization_008 + assert_equal(@t.tokenize('test url google.com. 
Another sentence.'), + ['test','url','google.com','.','Another','sentence','.']) + end + + def test_url_tokenization_009 + assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html another word.'), + ['test','url','www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html','another','word','.']) + end + + def test_url_tokenization_010 + assert_equal(@t.tokenize('test url www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html. Another sentence.'), + ['test','url','www.culture.gov.uk/heritage/search_frame.asp?name=/heritage/lib1.html','.','Another','sentence','.']) + end +end + + diff --git a/test/regression_tests/test_de_tokenizer.rb b/test/regression_tests/test_de_tokenizer.rb index b8a47d7..3a52ed2 100644 --- a/test/regression_tests/test_de_tokenizer.rb +++ b/test/regression_tests/test_de_tokenizer.rb @@ -31,6 +31,69 @@ def test_tokenization_002 output = @t.tokenize(input) assert_equal(etalon, output) end + + def test_tokenization_003 + input = 'Try some code: test(this).' + etalon = %w(Try some code : test ( this ) .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_004 + input = 'Try an email: test.email@example.com.' + etalon = %w(Try an email : test.email@example.com .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_005 + input = "et souligne 'l'interrelation étroite de l'imagerie' avec le comportement." + etalon = %w(et souligne ' l'interrelation étroite de l'imagerie ' avec le comportement .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_006 + input = 'Try some code: test(inner(brackets)also).' + etalon = %w(Try some code : test ( inner ( brackets ) also ) .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_007 + input = 'Try some code: test[(inner(brackets)also)].' 
+ etalon = %w(Try some code : test [ ( inner ( brackets ) also ) ] .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_008 + input = "Check single quotes: 'quoted string'." + etalon = %w(Check single quotes : ' quoted string ' .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_009 + input = "Check silly embedded single quotes: 'quoted 'embedded string' string'." + etalon = %w(Check silly embedded single quotes : ' quoted ' embedded string ' string ' .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_010 + input = "Check quotes: ('test quotes')." + etalon = %w(Check quotes : ( ' test quotes ' ) .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end + + def test_tokenization_011 + input = "Check quotes: (''test quotes'')." + etalon = %w(Check quotes : ( ' ' test quotes ' ' ) .) + output = @t.tokenize(input) + assert_equal(etalon, output) + end end describe Tokenizer do