From bad26202097afee0d515907327a22a11f15db7d9 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 01:56:23 -0400 Subject: [PATCH 01/11] boost.maybe for groups --- lib/lucene_query_parser/parser.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index c3c115b..df80087 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -76,6 +76,7 @@ def initialize(args={}) rule :group do str('(') >> space.maybe >> expr.as(:group) >> space.maybe >> str(')') + boost.maybe end rule :field do From f7ac828a85317ca3f2a6eafafe81af292001fdb2 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 01:58:56 -0400 Subject: [PATCH 02/11] Forgot >> --- lib/lucene_query_parser/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index df80087..817661a 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -75,7 +75,7 @@ def initialize(args={}) end rule :group do - str('(') >> space.maybe >> expr.as(:group) >> space.maybe >> str(')') + str('(') >> space.maybe >> expr.as(:group) >> space.maybe >> str(')') >> boost.maybe end From 36f01e908cdf6678d30a0f225a676a1737b617ad Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 08:15:56 -0400 Subject: [PATCH 03/11] Add some forgiving elements --- lib/lucene_query_parser/parser.rb | 6 ++--- spec/lucene_query_parser/parser_spec.rb | 30 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index 817661a..2d1212f 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -71,7 +71,7 @@ def initialize(args={}) end rule :distance do - str('~') >> match['0-9'].repeat(1).as(:distance) + space.maybe >> str('~') >> match['0-9'].repeat(1).as(:distance) end rule :group do @@ -107,12 +107,12 @@ def initialize(args={}) end rule :fuzzy do - str('~') >> + space.maybe >> str('~') >> ( str('0.') >> match['0-9'].repeat(1) | match['01'] ).maybe.as(:similarity) end rule :boost do - str('^') >> ( + space.maybe >> str('^') >> ( str('0.') >> match['0-9'].repeat(1) | match['0-9'].repeat(1) ).as(:boost) diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index c487102..3e52ae9 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -62,12 +62,30 @@ def show_err(input, location) ) end + it "parses a nearness query (forgiving)" do + should parse(%q("foo bar" ~2)).as( + {:phrase => "foo bar", :distance => "2"} + ) + end + it "parses a paren grouping" do should parse(%q((foo bar))).as( {:group => [{:term => "foo"}, {:term => "bar"}]} ) end + it "parses boosts in groupings" do + should parse('(foo bar)^5').as( + {:group => [{:term => "foo"}, {:term => "bar"}], :boost => "5"} + ) + end + + it "parses boosts in groupings (forgiving)" do + should parse('(foo bar) ^5').as( + {:group => [{:term => "foo"}, {:term => "bar"}], :boost => "5"} + ) + end + it "parses nested paren groups" do should parse(%q((foo (bar (baz))))).as( {:group => [ @@ -175,6 +193,18 @@ def show_err(input, location) ) end + it "parses a boost on phrase" do + should parse('"some phrase"^3').as( + {:phrase => "some phrase", :boost => "3"} + ) + end + + it "parses a boost on phrase (forgiving)" do + should parse('"some phrase" ^3').as( + {:phrase => "some phrase", :boost => "3"} + ) + end + it { should parse('year:[2010 TO 2011]').as( {:field => "year", :inclusive_range => {:from => "2010", :to => "2011"}} ) } From 5f40922c0fda18169b1752c72adaa600bc259455 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 09:02:06 -0400 Subject: [PATCH 04/11] Add negation --- lib/lucene_query_parser/parser.rb | 1 + spec/lucene_query_parser/parser_spec.rb | 28 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index 2d1212f..d33e478 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -103,6 +103,7 @@ def initialize(args={}) rule :unary_operator do str('+').as(:required) | str('-').as(:prohibited) | + str('!').as(:prohibited) | (str('NOT').as(:op) >> space) end diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index 3e52ae9..7ef9c2f 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -148,6 +148,34 @@ def show_err(input, location) ] end + it "parses negation in terms" do + should parse("foo !bar").as [ + {:term => "foo"}, + {:term => "bar", :prohibited => "!"} + ] + end + + it "parses negation in groupings" do + should parse('!(foo bar)^5').as( + {:group => [{:term => "foo"}, {:term => "bar"}], :prohibited => "!", :boost => "5"} + ) + end + + it "parses negation in phrases" do + q = %q(!"foo bar" isn't one) + should parse(q).as [ + {:phrase => "foo bar", :prohibited => "!"}, + {:term => "isn't"}, + {:term => "one"} + ] + end + + it "parses negation in field:value" do + should parse("!title:foo").as( + {:field => "title", :term => "foo", :prohibited => "!"} + ) + end + it "parses field:value" do should parse("title:foo").as( {:field => "title", :term => "foo"} From 3f9da603eee439410b23cf7501650410f066a220 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 09:20:46 -0400 Subject: [PATCH 05/11] spaces should be optional between operators and operands --- lib/lucene_query_parser/parser.rb | 2 +- spec/lucene_query_parser/parser_spec.rb | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index d33e478..fa6f4f0 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -44,7 +44,7 @@ def initialize(args={}) rule :expr do space.maybe >> - operand >> (space >> (operator >> space >> operand | operand)).repeat >> + operand >> (space.maybe >> (operator >> space.maybe >> operand | operand)).repeat >> space.maybe end diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index 7ef9c2f..87d4787 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -74,6 +74,20 @@ def show_err(input, location) ) end + it "parses grouping side by side with space" do + should parse('(foo bar) (lorem ipsum)').as([ + {:group => [{:term => "foo"}, {:term => "bar"}]}, + {:group => [{:term => "lorem"}, {:term => "ipsum"}]} + ]) + end + + it "parses grouping side by side with no space" do + should parse('(foo bar)(lorem ipsum)').as([ + {:group => [{:term => "foo"}, {:term => "bar"}]}, + {:group => [{:term => "lorem"}, {:term => "ipsum"}]} + ]) + end + it "parses boosts in groupings" do should parse('(foo bar)^5').as( {:group => [{:term => "foo"}, {:term => "bar"}], :boost => "5"} From 24ffdb8a4616ad91252649aa54e51b4aba8213f1 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 10:38:35 -0400 Subject: [PATCH 06/11] Allow for non-breaking space --- lib/lucene_query_parser/parser.rb | 2 +- spec/lucene_query_parser/parser_spec.rb | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index fa6f4f0..67a732f 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -132,7 +132,7 @@ def initialize(args={}) end rule :space do - match["\n \t"].repeat(1) + match["\n \t\u00a0"].repeat(1) end end diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index 87d4787..c75ca36 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -162,6 +162,13 @@ def show_err(input, location) ] end + it "parses NOTs with a group" do + should parse("foo NOT (bar coca)").as [ + {:term => "foo"}, + {:group => [{:term => "bar"}, {:term => "coca"}], :op => "NOT"} + ] + end + it "parses negation in terms" do should parse("foo !bar").as [ {:term => "foo"}, @@ -321,6 +328,12 @@ def show_err(input, location) should parse('fo?').as( {:term => 'fo?'} ) end + it "parses non-breaking space" do + should parse("foo bar").as [ # do not be fooled, there is a non-breaking space between foo and bar + {:term => "foo"}, + {:term => "bar"}, + ] + end end describe "#error_location" do From 6e18f0e99acc7ea6e97fe3ea9ebefd2baf6281ec Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 11:32:03 -0400 Subject: [PATCH 07/11] Add && and || --- lib/lucene_query_parser/parser.rb | 2 +- spec/lucene_query_parser/parser_spec.rb | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index 67a732f..3d8b5e5 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -49,7 +49,7 @@ def initialize(args={}) end rule :operator do - str('AND').as(:op) | str('OR').as(:op) + str('AND').as(:op) | str('OR').as(:op) | str('&&').as(:op) | str('||').as(:op) end rule :operand do diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index c75ca36..505512f 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -146,6 +146,20 @@ def show_err(input, location) ] end + it "parses && groupings" do + should parse(%q(foo && bar)).as [ + {:term => "foo"}, + {:op => "&&", :term => "bar"} + ] + end + + it "parses || groupings" do + should parse(%q(foo || bar)).as [ + {:term => "foo"}, + {:op => "||", :term => "bar"} + ] + end + it "parses a sequence of AND and OR" do should parse(%q(foo AND bar OR baz OR mumble)).as [ {:term => "foo"}, From 4abeebea830cc021503b2d2960167360e168e466 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 12:02:15 -0400 Subject: [PATCH 08/11] Be more consistent --- lib/lucene_query_parser/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index 3d8b5e5..ae04c32 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -29,7 +29,7 @@ def initialize(args={}) # must define :term rule at run-time so that it can include # the term_re_str self.class.rule :term do - match[term_re_str].repeat(1).as(:term) >> (fuzzy | boost).maybe + ( (escape_special_words | match[term_re_str]).repeat(1) ).as(:term) >> (fuzzy | boost).maybe end else self.class.rule :term do From 0e699078491396d46a145b7b456a8f00f2af8257 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 12:46:37 -0400 Subject: [PATCH 09/11] Be even more lenient --- lib/lucene_query_parser/parser.rb | 10 +++++----- spec/lucene_query_parser/parser_spec.rb | 12 ++++++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index ae04c32..a1ebf5f 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -70,10 +70,6 @@ def initialize(args={}) (distance | boost).maybe end - rule :distance do - space.maybe >> str('~') >> match['0-9'].repeat(1).as(:distance) - end - rule :group do str('(') >> space.maybe >> expr.as(:group) >> space.maybe >> str(')') >> boost.maybe @@ -107,13 +103,17 @@ def initialize(args={}) (str('NOT').as(:op) >> space) end + rule :distance do + space.maybe >> str('~') >> space.maybe >> match['0-9'].repeat(1).as(:distance) + end + rule :fuzzy do space.maybe >> str('~') >> ( str('0.') >> match['0-9'].repeat(1) | match['01'] ).maybe.as(:similarity) end rule :boost do - space.maybe >> str('^') >> ( + space.maybe >> str('^') >> space.maybe >> ( str('0.') >> match['0-9'].repeat(1) | match['0-9'].repeat(1) ).as(:boost) diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index 505512f..8e2482a 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -68,6 +68,12 @@ def show_err(input, location) ) end + it "parses a nearness query (even more forgiving)" do + should parse(%q("foo bar" ~ 2)).as( + {:phrase => "foo bar", :distance => "2"} + ) + end + it "parses a paren grouping" do should parse(%q((foo bar))).as( {:group => [{:term => "foo"}, {:term => "bar"}]} @@ -100,6 +106,12 @@ def show_err(input, location) ) end + it "parses boosts in groupings (even more forgiving)" do + should parse('(foo bar) ^ 5').as( + {:group => [{:term => "foo"}, {:term => "bar"}], :boost => "5"} + ) + end + it "parses nested paren groups" do should parse(%q((foo (bar (baz))))).as( {:group => [ From 4bc4eb6725d9420870f221c32b2627ce91d2ddf9 Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 13:46:04 -0400 Subject: [PATCH 10/11] Be more lenient with required and prohibited terms --- lib/lucene_query_parser/parser.rb | 2 +- spec/lucene_query_parser/parser_spec.rb | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index a1ebf5f..9472e21 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -53,7 +53,7 @@ def initialize(args={}) end rule :operand do - unary_operator.maybe >> ( + unary_operator.maybe >> space.maybe >> ( group | field | term | diff --git a/spec/lucene_query_parser/parser_spec.rb b/spec/lucene_query_parser/parser_spec.rb index 8e2482a..f0aeb61 100644 --- a/spec/lucene_query_parser/parser_spec.rb +++ b/spec/lucene_query_parser/parser_spec.rb @@ -128,10 +128,32 @@ def show_err(input, location) should parse("+foo").as({:term => "foo", :required => "+"}) end + it "parses a required term (lenient)" do + should parse("+ foo").as({:term => "foo", :required => "+"}) + end + + it "parses a required term (lenient) v2" do + should parse("foo + bar").as([ + {:term => "foo"}, + {:term => "bar", :required => "+"} + ]) + end + it "parses a prohibited term" do should parse("-foo").as({:term => "foo", :prohibited => "-"}) end + it "parses a prohibited term (lenient)" do + should parse("- foo").as({:term => "foo", :prohibited => "-"}) + end + + it "parses a prohibited term (lenient) v2" do + should parse("foo - bar").as([ + {:term => "foo"}, + {:term => "bar", :prohibited => "-"} + ]) + end + it "parses prohibited groups and phrases" do should parse(%q(+(foo bar) -"mumble stuff")).as [ {:group => [{:term => "foo"}, {:term => "bar"}], :required => "+"}, From ab7ca9cc7de5e1e6561a1d5019bb180626a829cb Mon Sep 17 00:00:00 2001 From: Dan Maglasang Date: Wed, 14 Sep 2016 18:49:00 -0400 Subject: [PATCH 11/11] Allow for ZERO WIDTH SPACE char --- lib/lucene_query_parser/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lucene_query_parser/parser.rb b/lib/lucene_query_parser/parser.rb index 9472e21..253928e 100644 --- a/lib/lucene_query_parser/parser.rb +++ b/lib/lucene_query_parser/parser.rb @@ -132,7 +132,7 @@ def initialize(args={}) end rule :space do - match["\n \t\u00a0"].repeat(1) + match["\n \t\u00a0\u200B"].repeat(1) end end