Commit 33890cc

Issue #576

pavels committed Feb 9, 2018
1 parent 220d9c7 commit 33890cc
Showing 11 changed files with 673 additions and 47 deletions.
2 changes: 1 addition & 1 deletion build.gradle
@@ -57,7 +57,7 @@ repositories {

allprojects {
group='cz.incad.kramerius'
version='5.4.0'
version='5.4.1'
}


@@ -6,10 +6,7 @@
import java.io.StringReader;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

@@ -66,11 +63,16 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
String q = req.getParameter("q");
String pid = req.getParameter("pid");
try {
String query = URLEncoder.encode(q+" AND PID:"+pid.replace(":", "\\:"),"UTF-8");
String searchUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/search?"+"q="+query+"&hl=true";


String filterQuery = "PID:"+URLEncoder.encode(pid.replace(":", "\\:"),"UTF-8");
String query = URLEncoder.encode(q,"UTF-8");
String fieldList = URLEncoder.encode("text text_ocr text_ocr_lemmatized text_ocr_lemmatized_ascii", "UTF-8");

String searchUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/search?"+"fq="+filterQuery+"&q="+query+"&defType=edismax&qf="+fieldList+"&hl=true";
String xml = get(searchUrl, null, null);
Document parsed = XMLUtils.parseDocument(new StringReader(xml));
List<String> hterms = findHighlightTerm(parsed.getDocumentElement(), pid);
Set<String> hterms = findHighlightTerm(parsed.getDocumentElement(), pid);
JSONObject jsonObject = new JSONObject();
for (String sterm : hterms) {
String altoUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/item/"+pid+"/streams/ALTO";
@@ -79,7 +81,6 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
byte[] bytes = alto.getBytes(Charset.forName("UTF-8"));

Document parsedAlto = XMLUtils.parseDocument(new ByteArrayInputStream(bytes));

AltoDisected disected2 = cz.incad.kramerius.utils.ALTOUtils.disectAlto(sterm, parsedAlto);
jsonObject.put(sterm, disected2.toJSON());

@@ -95,7 +96,7 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
}
}
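
The shape of the highlighting request changes here: the PID constraint moves out of q into a filter query (fq), and the user query is now expanded by the edismax parser over the plain and lemmatized OCR fields. A minimal sketch of the URL the new code assembles, using the PID from the unit test further down and a sample term; the api.point value is a hypothetical local endpoint, not taken from the commit:

    import java.net.URLEncoder;

    public class HighlightUrlSketch {
        public static void main(String[] args) throws Exception {
            String apiPoint = "http://localhost:8080/search/api/v5.0";   // hypothetical api.point value
            String pid = "uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867";    // PID used in AltoSupportServletTest
            String q = "prosa";                                          // sample user term

            String filterQuery = "PID:" + URLEncoder.encode(pid.replace(":", "\\:"), "UTF-8");
            String query = URLEncoder.encode(q, "UTF-8");
            String fieldList = URLEncoder.encode("text text_ocr text_ocr_lemmatized text_ocr_lemmatized_ascii", "UTF-8");

            // Same parameter layout as the servlet: fq pins the page, edismax searches all four text fields.
            String searchUrl = apiPoint + "/search?fq=" + filterQuery + "&q=" + query
                    + "&defType=edismax&qf=" + fieldList + "&hl=true";
            System.out.println(searchUrl);
        }
    }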

public static List<String> findHighlightTerm(final Element elm, final String pid) {
public static Set<String> findHighlightTerm(final Element elm, final String pid) {
List<Element> elmRecursive = XMLUtils.getElementsRecursive(elm, new ElementsFilter() {
@Override
public boolean acceptElement(Element itm) {
@@ -122,27 +123,39 @@ public boolean acceptElement(Element itm) {
});
found.addAll(nfound);
}

List<String> terms = new ArrayList<String>();

List<String> fieldList = Arrays.asList("text","text_ocr", "text_ocr_lemmatized", "text_ocr_lemmatized_ascii");
Set<String> terms = new HashSet<>();
for (Element docEl : found) {
List<String> textArray = SOLRUtils.array(docEl, "text", String.class);
for (String text : textArray) {
String textContent = textContent(text);
if (textContent != null) {
terms.add(textContent);

for (String fieldName : fieldList) {

List<String> textArray = SOLRUtils.array(docEl, fieldName, String.class);
for (String text : textArray) {
if (text != null) {
String textContent = textContent(text);
if (textContent != null) {
terms.add(textContent);
}
}
}
}
if (textArray.isEmpty()) {
String value = SOLRUtils.value(docEl, "text", String.class);
String textContent = textContent(value);
if (textContent != null) {
terms.add(textContent);
if (textArray.isEmpty()) {
String value = SOLRUtils.value(docEl, fieldName, String.class);
if (value != null) {
String textContent = textContent(value);
if (textContent != null) {
terms.add(textContent);
}
}
}

}

}

return terms;
} else return new ArrayList<String>();
} else return new HashSet<>();

}
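
findHighlightTerm now collects terms from all four highlight fields and returns a Set instead of a List, so a token that Solr highlights in several of those fields is dissected against the ALTO stream only once. A tiny illustration of that deduplication (sketch only, not part of the commit), using the term from the test below:

    import java.util.HashSet;
    import java.util.Set;

    public class HighlightTermDedupSketch {
        public static void main(String[] args) {
            Set<String> terms = new HashSet<>();
            terms.add("PROSA");   // reported for text_ocr
            terms.add("PROSA");   // reported again for text_ocr_lemmatized
            terms.add("PROSA");   // and for text_ocr_lemmatized_ascii
            System.out.println(terms.size());   // 1 -> disectAlto runs once per distinct term
        }
    }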

@@ -352,8 +352,10 @@ public Map<String, String[]> getUsedFilters() {

private String getBoost(String q) {
String ret = "";
ret = "&defType=edismax&qf=text+"
ret = "&defType=edismax&qf=text+text_lemmatized+text_lemmatized_ascii^0.2+"
+ fieldsConfig.getMappedField("title") + "^4.0+"
+ "title_lemmatized^4.0+"
+ "title_lemmatized_ascii+"
+ fieldsConfig.getMappedField("autor") + "^1.5&bq=(level:0)^4.5"
+ "&bq=" + fieldsConfig.getMappedField("dostupnost") + ":\"public\"^1.2"
+ "&pf=text^10";
25 changes: 22 additions & 3 deletions client/src/main/webapp/js/leftthumbs.js
@@ -235,15 +235,34 @@ LeftThumbs.prototype = {

var q = "q=";
if(searchFlag && authorFlag){
q += "(text:" + $("#q").val()+" OR dc.creator:"+this._params()['author']+")";
var qval = $("#q").val();
var nestedQueries = "";
if ( (qval.startsWith("\"") || qval.startsWith("'")) ) {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" )";
} else {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" OR ";
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized}"+qval+"\" OR "
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized_ascii}"+qval+"\" ) ";
}
q += "("+nestedQueries+" OR dc.creator:"+this._params()['author']+")";
} else if(searchFlag){
q += "(text:" + $("#q").val()+")";
var qval = $("#q").val();
var nestedQueries = "";
if ( (qval.startsWith("\"") || qval.startsWith("'")) ) {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" )";
} else {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" OR ";
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized}"+qval+"\" OR "
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized_ascii}"+qval+"\" ) ";
}

q += nestedQueries;
} else if(authorFlag){
q += "(dc.creator:"+this._params()['author']+")";
}

q += "&rows=5000&fq=pid_path:" + pid_path.replace(/:/g, "\\:") + "*";
var hl = authorFlag ? "&hl=true&hl.fl=text_ocr,dc.creator&hl.mergeContiguous=true&hl.snippets=2" : "&hl=true&hl.fl=text_ocr&hl.mergeContiguous=true&hl.snippets=2";
var hl = authorFlag ? "&hl=true&hl.fl=text_ocr+text_ocr_lemmatized+text_ocr_lemmatized_ascii+dc.creator&hl.mergeContiguous=true&hl.snippets=2" : "&hl=true&hl.fl=text_ocr+text_ocr_lemmatized+text_ocr_lemmatized_ascii&hl.mergeContiguous=true&hl.snippets=2";

K5.api.askForSolr(q + hl, _.bind(function(data) {
var numFound = data.response.numFound;
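
The thumbnail search drops the plain text:<q> clause in favour of Solr nested queries, so the typed term is run through edismax against text and both lemmatized variants; input that starts with a quote is treated as a phrase and kept on the literal text field only. For a hypothetical input praha, the generated q parameter (before rows, fq and highlighting are appended) looks like:

    q=(_query_:"{!edismax qf=text}praha" OR _query_:"{!edismax qf=text_lemmatized}praha" OR _query_:"{!edismax qf=text_lemmatized_ascii}praha" )

When an author filter is active, the whole group is additionally OR-ed with dc.creator:<author>.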
@@ -6,8 +6,10 @@
import java.net.URL;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -21,13 +23,13 @@

public class AltoSupportServletTest extends TestCase {

public void testFindTerms() throws IOException, ParserConfigurationException, SAXException {
public void testFindTerms() throws IOException, ParserConfigurationException, SAXException, TransformerException {
URL url = AltoSupportServletTest.class.getResource("solr.xml");
InputStream openStream = url.openStream();
Document document = XMLUtils.parseDocument(openStream);
List<String> terms = AltoSupportServlet.findHighlightTerm(document.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
Set<String> terms = AltoSupportServlet.findHighlightTerm(document.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
Assert.assertTrue(terms.size() == 1);
Assert.assertTrue(terms.get(0).equals("PROSA"));
Assert.assertTrue(terms.iterator().next().equals("PROSA"));
}

public void testFindAltos() throws ParserConfigurationException, SAXException, IOException {
@@ -38,8 +40,8 @@ public void testFindAltos() throws ParserConfigurationException, SAXException, I

openStream = alto.openStream();
Document altoDocument = XMLUtils.parseDocument(openStream);
List<String> terms = AltoSupportServlet.findHighlightTerm(solrDocument.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");

Set<String> terms = AltoSupportServlet.findHighlightTerm(solrDocument.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
for (String sterm : terms) {
AltoDisected disected = ALTOUtils.disectAlto(sterm, altoDocument);
Assert.assertNotNull(disected);
7 changes: 5 additions & 2 deletions indexer/src/cz/incad/kramerius/indexer/ExtendedFields.java
@@ -201,8 +201,11 @@ public String toXmlString(int pageNum) throws Exception {
String[] pids = s.split("/");
if (pageNum != 0) {
sb.append("<field name=\"parent_pid\">").append(pids[pids.length - 1]).append("</field>");
sb.append("<field name=\"text\">").append(getPDFPage(pageNum)).append("</field>");

// TODO: Do it better. For now we have to reuse a field that is already defined;
// in the future, consider introducing a dedicated text_pdf field.
String pdfText = getPDFPage(pageNum);
sb.append("<field name=\"text_ocr\">").append(pdfText).append("</field>");

} else {
if (pids.length == 1) {
sb.append("<field name=\"parent_pid\">").append(pids[0]).append("</field>");
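
The indexer builds Solr field elements for each page; with this change the text extracted from a PDF page goes into text_ocr rather than straight into text, so it flows through the same copyField rules as regular OCR (see the managed-schema excerpt below). A minimal illustration of the emitted fragment, with placeholder values:

    <field name="parent_pid">uuid:...</field>
    <field name="text_ocr">text extracted from PDF page 1</field>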
3 changes: 0 additions & 3 deletions installation/solr-6.x/kramerius/conf/managed-schema
@@ -590,7 +590,6 @@
<copyField source="dc.title" dest="text_lemmatized"/>
<copyField source="keywords" dest="text_lemmatized"/>
<copyField source="dc.description" dest="text_lemmatized"/>
<copyField source="dc.title" dest="text_lemmatized"/>
<copyField source="dc.creator" dest="text_lemmatized"/>
<copyField source="details" dest="text_lemmatized"/>
<copyField source="text_ocr" dest="text_lemmatized"/>
@@ -599,7 +598,6 @@
<copyField source="dc.title" dest="text_lemmatized_ascii"/>
<copyField source="keywords" dest="text_lemmatized_ascii"/>
<copyField source="dc.description" dest="text_lemmatized_ascii"/>
<copyField source="dc.title" dest="text_lemmatized_ascii"/>
<copyField source="dc.creator" dest="text_lemmatized_ascii"/>
<copyField source="details" dest="text_lemmatized_ascii"/>
<copyField source="text_ocr" dest="text_lemmatized_ascii"/>
@@ -608,7 +606,6 @@
<copyField source="dc.title" dest="text_lemmatized_nostopwords"/>
<copyField source="keywords" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.description" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.title" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.creator" dest="text_lemmatized_nostopwords"/>
<copyField source="details" dest="text_lemmatized_nostopwords"/>
<copyField source="text_ocr" dest="text_lemmatized_nostopwords"/>
@@ -48,6 +48,19 @@ public boolean acceptElement(Element elm) {
return batches;
}


/** find the element whose "name" attribute equals attName */
static Element findByAttribute(Element sourceDocElm, String attName) {
Element elemName = XMLUtils.findElement(sourceDocElm, new XMLUtils.ElementsFilter() {
@Override
public boolean acceptElement(Element element) {
return element.getAttribute("name").equals(attName);
}
});
return elemName;
}

/** find pid in source doc */
static String pid(Element sourceDocElm) {
Element pidElm = XMLUtils.findElement(sourceDocElm, new XMLUtils.ElementsFilter() {
@Override
@@ -72,9 +85,9 @@ public static void transform(Element sourceDocElm, Document destDocument,Element
if (node.getNodeType() == Node.ELEMENT_NODE) {
List<String> primitiveVals = Arrays.asList("str","int","bool", "date");
if (primitiveVals.contains(node.getNodeName())) {
simpleValue(pid, destDocument,destDocElem, node,null);
simpleValue(pid, destDocument,destDocElem, node,null, false);
} else {
arrayValue(pid, destDocument,destDocElem,node);
arrayValue(pid,sourceDocElm, destDocument,destDocElem,node);
}
}
}
@@ -115,34 +128,44 @@ public boolean acceptElement(Element paramElement) {

}

public static void simpleValue(String pid, Document ndoc, Element docElm, Node node, String derivedName) {
public static void simpleValue(String pid, Document feedDoc, Element feedDocElm, Node node, String derivedName, boolean dontCareAboutNonCopiingFields) {
String attributeName = derivedName != null ? derivedName : ((Element)node).getAttribute("name");
if (!nonCopiingField(attributeName)) {
Element strElm = ndoc.createElement("field");
if (dontCareAboutNonCopiingFields || !nonCopiingField(attributeName)) {
Element strElm = feedDoc.createElement("field");
strElm.setAttribute("name", attributeName);
docElm.appendChild(strElm);
feedDocElm.appendChild(strElm);
String content = StringEscapeUtils.escapeXml(node.getTextContent());
strElm.setTextContent(content);
}
}

public static void arrayValue(String pid, Document ndoc, Element docElm, Node node) {
public static void arrayValue(String pid,Element sourceDocElement, Document feedDoc, Element feedDocElement, Node node) {
String attributeName = ((Element) node).getAttribute("name");
if (!nonCopiingField(attributeName)) {
if (exceptionField(attributeName) && pid.contains("/@")) {
NodeList childNodes = node.getChildNodes();
for (int i = 0,ll=childNodes.getLength(); i < ll; i++) {
Node n = childNodes.item(i);
if (n.getNodeType() == Node.ELEMENT_NODE) {
simpleValue(pid, ndoc,docElm, n, attributeName);
// exception again !!! uuugrrrr !!;

// Bug in PDF indexing: the text is written directly into the "text" field even though "text" is a copy field.
// First look for text_ocr; if it does not exist, copy the whole text into text, text_lemmatized, text_lemmatized_ascii and text_lemmatized_nostopwords.
Element textOcr = findByAttribute(sourceDocElement, "text_ocr");
if (textOcr == null) {
simpleValue(pid, feedDoc,feedDocElement, n, attributeName, false);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized", true);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized_ascii", true);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized_nostopwords", true);
}
}
}
} else if (!exceptionField(attributeName)) {
NodeList childNodes = node.getChildNodes();
for (int i = 0,ll=childNodes.getLength(); i < ll; i++) {
Node n = childNodes.item(i);
if (n.getNodeType() == Node.ELEMENT_NODE) {
simpleValue(pid, ndoc,docElm, n, attributeName);
simpleValue(pid, feedDoc,feedDocElement, n, attributeName, false);
}
}
}
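
For context on the branch above: pages coming from a PDF indexed by the old indexer carry their extracted text directly in the text field and have no text_ocr, so when such a document is fed into the new index the lemmatized fields would stay empty (per the managed-schema excerpt above, they are only populated by copyField from text_ocr and the other listed sources). The new code therefore detects the missing text_ocr and writes the value into the lemmatized targets explicitly, using the new dontCareAboutNonCopiingFields flag to bypass the copy-field filter. Schematically, for such a page (placeholder values):

    source document (old index)        document fed to the new index
    text = "pdf page text"        ->   text                        = "pdf page text"
    text_ocr   (missing)               text_lemmatized             = "pdf page text"
                                       text_lemmatized_ascii       = "pdf page text"
                                       text_lemmatized_nostopwords = "pdf page text"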
