From 7e66c6697d212c805d46b1fb64b1cabb566ee82c Mon Sep 17 00:00:00 2001 From: Vasyl Khrystiuk Date: Sun, 22 Dec 2024 00:03:07 +0200 Subject: [PATCH] [WIP] --- .../filters/date/fuzzy/PartRecognizer.java | 12 +-- .../extractors/AllYMDPatternExtractor.java | 76 +++++++++++++++++++ .../extractors/AnyYMDPatternExtractor.java | 35 +++++---- .../EnglishDMYPatternExtractor.java | 7 -- .../date/fuzzy/extractors/Extractors.java | 11 +-- .../ISO8601YMDPatternExtractor.java | 43 ----------- .../EnglishDMYPatternExtractorTest.java | 35 --------- 7 files changed, 97 insertions(+), 122 deletions(-) create mode 100644 src/main/java/liqp/filters/date/fuzzy/extractors/AllYMDPatternExtractor.java delete mode 100644 src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java delete mode 100644 src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java delete mode 100644 src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java diff --git a/src/main/java/liqp/filters/date/fuzzy/PartRecognizer.java b/src/main/java/liqp/filters/date/fuzzy/PartRecognizer.java index fcf77706..5bc0a9ab 100644 --- a/src/main/java/liqp/filters/date/fuzzy/PartRecognizer.java +++ b/src/main/java/liqp/filters/date/fuzzy/PartRecognizer.java @@ -1,7 +1,6 @@ package liqp.filters.date.fuzzy; -import static liqp.filters.date.fuzzy.extractors.Extractors.ISO8601YMDPatternExtractor; -import static liqp.filters.date.fuzzy.extractors.Extractors.englishDateExtractor; +import static liqp.filters.date.fuzzy.extractors.Extractors.allYMDPatternExtractor; import static liqp.filters.date.fuzzy.extractors.Extractors.fullMonthExtractor; import static liqp.filters.date.fuzzy.extractors.Extractors.fullWeekdaysExtractor; import static liqp.filters.date.fuzzy.extractors.Extractors.plainYearExtractor; @@ -55,14 +54,7 @@ List recognizePart(List parts, DatePatternRecognizingContext ctx) { ctx.hasTime = false; } if (notSet(ctx.hasYear)) { - LookupResult result = 
lookup(parts, ISO8601YMDPatternExtractor.get(ctx.locale)); - if (result.found) { - ctx.hasYear = true; - ctx.hasMonth = true; - ctx.hasDay = true; - return result.parts; - } - result = lookup(parts, englishDateExtractor.get(ctx.locale)); + LookupResult result = lookup(parts, allYMDPatternExtractor.get(ctx.locale)); if (result.found) { ctx.hasYear = true; ctx.hasMonth = true; diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/AllYMDPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/AllYMDPatternExtractor.java new file mode 100644 index 00000000..ce9a90e6 --- /dev/null +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/AllYMDPatternExtractor.java @@ -0,0 +1,76 @@ +package liqp.filters.date.fuzzy.extractors; + +import static liqp.filters.date.fuzzy.extractors.AnyYMDPatternExtractor.pD; +import static liqp.filters.date.fuzzy.extractors.AnyYMDPatternExtractor.pM; +import static liqp.filters.date.fuzzy.extractors.AnyYMDPatternExtractor.pY2; +import static liqp.filters.date.fuzzy.extractors.AnyYMDPatternExtractor.pY4; +import static liqp.filters.date.fuzzy.extractors.AnyYMDPatternExtractor.pp; + +import java.util.ArrayList; +import java.util.List; +import liqp.filters.date.fuzzy.PartExtractor; + +public class AllYMDPatternExtractor implements PartExtractor { + + private final List extractors = new ArrayList<>(); + + public AllYMDPatternExtractor() { + AnyYMDPatternExtractor iSO8601Y4MDPatternExtractor = new AnyYMDPatternExtractor( + pY4(), pp("-"), pM(), pp("-"), pD()); // yyyy-MM-dd + extractors.add(iSO8601Y4MDPatternExtractor); + AnyYMDPatternExtractor americanY4MDPatternExtractor = new AnyYMDPatternExtractor( + pM(), pp("/"), pD(), pp("/"), pY4()); // MM/dd/yyyy + extractors.add(americanY4MDPatternExtractor); + // next are top-rated locale formats, according to gpt + AnyYMDPatternExtractor indianY4MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("-"), pM(), pp("-"), pY4()); // d-M-yyyy + 
extractors.add(indianY4MDPatternExtractor); + AnyYMDPatternExtractor chineseY4MDPatternExtractor = new AnyYMDPatternExtractor( + pY4(), pp("/"), pM(), pp("/"), pD()); // yyyy/M/d + extractors.add(chineseY4MDPatternExtractor); + AnyYMDPatternExtractor englishY4MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("/"), pM(), pp("/"), pY4()); // d/M/yyyy + extractors.add(englishY4MDPatternExtractor); + AnyYMDPatternExtractor slavicY4MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("."), pM(), pp("."), pY4()); // d.M.yyyy + extractors.add(slavicY4MDPatternExtractor); + AnyYMDPatternExtractor coldEuropeY4MDPatternExtractor = new AnyYMDPatternExtractor( + pY4(), pp("-"), pM(), pp("-"), pD()); // yyyy-MM-dd + extractors.add(coldEuropeY4MDPatternExtractor); + AnyYMDPatternExtractor espanaY4MDPatternExtractor = new AnyYMDPatternExtractor( + pY4(), pp("/"), pM(), pp("/"), pD()); // yyyy/MM/dd + extractors.add(espanaY4MDPatternExtractor); + AnyYMDPatternExtractor americanY2MDPatternExtractor = new AnyYMDPatternExtractor( + pM(), pp("/"), pD(), pp("/"), pY2()); // MM/dd/yy + extractors.add(americanY2MDPatternExtractor); + AnyYMDPatternExtractor indianY2MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("-"), pM(), pp("-"), pY2()); // d-M-yy + extractors.add(indianY2MDPatternExtractor); + AnyYMDPatternExtractor chineseY2MDPatternExtractor = new AnyYMDPatternExtractor( + pY2(), pp("/"), pM(), pp("/"), pD()); // yy/M/d + extractors.add(chineseY2MDPatternExtractor); + AnyYMDPatternExtractor englishY2MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("/"), pM(), pp("/"), pY2()); // d/M/yy + extractors.add(englishY2MDPatternExtractor); + AnyYMDPatternExtractor slavicY2MDPatternExtractor = new AnyYMDPatternExtractor( + pD(), pp("."), pM(), pp("."), pY2()); // d.M.yy + extractors.add(slavicY2MDPatternExtractor); + AnyYMDPatternExtractor coldEuropeY2MDPatternExtractor = new AnyYMDPatternExtractor( + pY2(), pp("-"), pM(), pp("-"), pD()); // yy-MM-dd + 
extractors.add(coldEuropeY2MDPatternExtractor); + AnyYMDPatternExtractor espanaY2MDPatternExtractor = new AnyYMDPatternExtractor( + pY2(), pp("/"), pM(), pp("/"), pD()); // yy/MM/dd + extractors.add(espanaY2MDPatternExtractor); + } + + @Override + public PartExtractorResult extract(String source) { + for (AnyYMDPatternExtractor extractor : extractors) { + PartExtractorResult result = extractor.extract(source); + if (result.found) { + return result; + } + } + return new PartExtractorResult(); + } +} diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java index ac864d4b..6d3d5715 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java @@ -7,22 +7,22 @@ import java.util.Optional; import java.util.regex.Matcher; -abstract class AnyYMDPatternExtractor extends RegexPartExtractor { +class AnyYMDPatternExtractor extends RegexPartExtractor { public enum RuleType { Y, M, D, PUNCTUATION; } public static class RulePart { private final RuleType type; - private final Integer[] length; + private final Integer length; private final String content; private RulePart(RuleType type, String content) { this.type = type; this.content = content; - this.length = new Integer[0]; + this.length = null; } - private RulePart(RuleType type, Integer[] length) { + private RulePart(RuleType type, Integer length) { this.type = type; this.length = length; this.content = null; @@ -32,17 +32,17 @@ private RulePart(RuleType type, Integer[] length) { static RulePart pp(String content) { return new RulePart(RuleType.PUNCTUATION, content); } - static RulePart pY(Integer length) { - return new RulePart(RuleType.Y, new Integer[]{length}); + static RulePart pY4() { + return new RulePart(RuleType.Y, 4); } - static RulePart pY(Integer length1, Integer length2) { - return new 
RulePart(RuleType.Y, new Integer[]{length1, length2}); + static RulePart pY2() { + return new RulePart(RuleType.Y, 2); } static RulePart pM() { - return new RulePart(RuleType.M, (Integer[])null); + return new RulePart(RuleType.M, (Integer)null); } static RulePart pD() { - return new RulePart(RuleType.D, (Integer[])null); + return new RulePart(RuleType.D, (Integer)null); } private final RulePart[] partsInOrder; protected AnyYMDPatternExtractor(RulePart... partsInOrder) { @@ -54,18 +54,17 @@ private static String reconstructPattern(RulePart[] partsInOrder) { StringBuilder sb = new StringBuilder("(?:^|.*?\\D)"); for (RulePart part : partsInOrder) { if (part.type == RuleType.PUNCTUATION) { - sb.append(part.content); + if (".".equals(part.content)) { + sb.append("\\."); + } else { + sb.append(part.content); + } } else { if (part.type == RuleType.Y) { - if (part.length == null || part.length.length == 0) { + if (part.length == null) { throw new IllegalArgumentException("Year part must have length"); } - if (part.length.length == 1) { - sb.append("(?\\d{").append(part.length[0]).append("})"); - } else { - sb.append("(?\\d{").append(part.length[0]).append("}|\\d{") - .append(part.length[1]).append("})"); - } + sb.append("(?\\d{").append(part.length).append("})"); } else if (part.type == RuleType.M) { sb.append("(?0?[1-9]|1[0-2])"); } else if (part.type == RuleType.D) { diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java deleted file mode 100644 index 48781819..00000000 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java +++ /dev/null @@ -1,7 +0,0 @@ -package liqp.filters.date.fuzzy.extractors; - -class EnglishDMYPatternExtractor extends AnyYMDPatternExtractor { - public EnglishDMYPatternExtractor() { - super(pD(), pp("/"), pM(), pp("/"), pY(2, 4)); - } -} diff --git 
a/src/main/java/liqp/filters/date/fuzzy/extractors/Extractors.java b/src/main/java/liqp/filters/date/fuzzy/extractors/Extractors.java index 24518cdf..d9af8e35 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/Extractors.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/Extractors.java @@ -82,21 +82,14 @@ public PartExtractor get(Locale locale) { * BASIC_ISO_DATE = 20111203 (yyyyMMdd) * */ - ISO8601YMDPatternExtractor { - private final PartExtractor partExtractor = new ISO8601YMDPatternExtractor(); + allYMDPatternExtractor { + private final PartExtractor partExtractor = new AllYMDPatternExtractor(); @Override public PartExtractor get(Locale locale) { return partExtractor; } }, - englishDateExtractor { - private final PartExtractor partExtractor = new EnglishDMYPatternExtractor(); - @Override - public PartExtractor get(Locale locale) { - return partExtractor; - } - }, ; public abstract PartExtractor get(Locale locale); diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java deleted file mode 100644 index e8952afa..00000000 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java +++ /dev/null @@ -1,43 +0,0 @@ -package liqp.filters.date.fuzzy.extractors; - -import java.util.regex.Matcher; - -class ISO8601YMDPatternExtractor extends RegexPartExtractor { - - public ISO8601YMDPatternExtractor() { - super("(?:^|.*?\\D)" - + "(?\\d{4})-(?0?[1-9]|1[0-2])-(?0?[1-9]|[12][0-9]|3[01])" - + "(?:$|\\D.*?)", null); - } - - @Override - public PartExtractorResult extract(String source) { - Matcher matcher = pattern.matcher(source); - if (matcher.find()) { - PartExtractorResult result = new PartExtractorResult(); - result.found = true; - result.start = matcher.start("year"); - result.end = matcher.end("date"); - result.formatterPatterns = newList(getPattern(matcher)); - return result; - } - return new 
PartExtractorResult(); - } - - private String getPattern(Matcher matcher) { - StringBuilder sbfp = new StringBuilder("yyyy"); - sbfp.append("-"); - if (matcher.group("month").length() == 1) { - sbfp.append("M"); - } else { - sbfp.append("MM"); - } - sbfp.append("-"); - if (matcher.group("date").length() == 1) { - sbfp.append("d"); - } else { - sbfp.append("dd"); - } - return sbfp.toString(); - } -} diff --git a/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java b/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java deleted file mode 100644 index 84d4767d..00000000 --- a/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java +++ /dev/null @@ -1,35 +0,0 @@ -package liqp.filters.date.fuzzy.extractors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.util.Arrays; -import java.util.Locale; -import org.junit.Test; - -public class EnglishDMYPatternExtractorTest{ - @Test - public void test() { - EnglishDMYPatternExtractor extractor = new EnglishDMYPatternExtractor(); - PartExtractorResult extract = extractor.extract(" 1/1/11 "); - assertTrue(extract.found); - assertEquals(2, extract.start); - assertEquals(8, extract.end); - assertEquals(4, extract.formatterPatterns.size()); - assertEquals(Arrays.asList("d/M/yy", "d/MM/yy", "dd/M/yy", "dd/MM/yy"), - extract.formatterPatterns); - } - - @Test - public void test2() { - EnglishDMYPatternExtractor extractor = new EnglishDMYPatternExtractor(); - PartExtractorResult extract = extractor.extract(" 31/12/11 "); - assertTrue(extract.found); - assertEquals(2, extract.start); - assertEquals(10, extract.end); - assertEquals(4, extract.formatterPatterns.size()); - // order matters! 
- assertEquals(Arrays.asList("dd/MM/yy", "dd/M/yy", "d/MM/yy", "d/M/yy"), - extract.formatterPatterns); - } -}