Commit 33890cc

Issue #576

pavels committed Feb 9, 2018
1 parent 220d9c7 commit 33890cc
Showing 11 changed files with 673 additions and 47 deletions.
2 changes: 1 addition & 1 deletion build.gradle
@@ -57,7 +57,7 @@ repositories {

allprojects {
group='cz.incad.kramerius'
version='5.4.0'
version='5.4.1'
}


@@ -6,10 +6,7 @@
import java.io.StringReader;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

@@ -66,11 +63,16 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
String q = req.getParameter("q");
String pid = req.getParameter("pid");
try {
String query = URLEncoder.encode(q+" AND PID:"+pid.replace(":", "\\:"),"UTF-8");
String searchUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/search?"+"q="+query+"&hl=true";


String filterQuery = "PID:"+URLEncoder.encode(pid.replace(":", "\\:"),"UTF-8");
String query = URLEncoder.encode(q,"UTF-8");
String fieldList = URLEncoder.encode("text text_ocr text_ocr_lemmatized text_ocr_lemmatized_ascii", "UTF-8");

String searchUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/search?"+"fq="+filterQuery+"&q="+query+"&defType=edismax&qf="+fieldList+"&hl=true";
String xml = get(searchUrl, null, null);
Document parsed = XMLUtils.parseDocument(new StringReader(xml));
List<String> hterms = findHighlightTerm(parsed.getDocumentElement(), pid);
Set<String> hterms = findHighlightTerm(parsed.getDocumentElement(), pid);
JSONObject jsonObject = new JSONObject();
for (String sterm : hterms) {
String altoUrl = KConfiguration.getInstance().getConfiguration().getString("api.point")+"/item/"+pid+"/streams/ALTO";
@@ -79,7 +81,6 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
byte[] bytes = alto.getBytes(Charset.forName("UTF-8"));

Document parsedAlto = XMLUtils.parseDocument(new ByteArrayInputStream(bytes));

AltoDisected disected2 = cz.incad.kramerius.utils.ALTOUtils.disectAlto(sterm, parsedAlto);
jsonObject.put(sterm, disected2.toJSON());

@@ -95,7 +96,7 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp)
}
}
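
The shape of the highlighting request changes here: the PID constraint moves out of q into a filter query (fq), and the user query is now expanded by the edismax parser over the plain and lemmatized OCR fields. A minimal sketch of the URL the new code assembles, using the PID from the unit test further down and a sample term; the api.point value is a hypothetical local endpoint, not taken from the commit:

    import java.net.URLEncoder;

    public class HighlightUrlSketch {
        public static void main(String[] args) throws Exception {
            String apiPoint = "http://localhost:8080/search/api/v5.0";   // hypothetical api.point value
            String pid = "uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867";    // PID used in AltoSupportServletTest
            String q = "prosa";                                          // sample user term

            String filterQuery = "PID:" + URLEncoder.encode(pid.replace(":", "\\:"), "UTF-8");
            String query = URLEncoder.encode(q, "UTF-8");
            String fieldList = URLEncoder.encode("text text_ocr text_ocr_lemmatized text_ocr_lemmatized_ascii", "UTF-8");

            // Same parameter layout as the servlet: fq pins the page, edismax searches all four text fields.
            String searchUrl = apiPoint + "/search?fq=" + filterQuery + "&q=" + query
                    + "&defType=edismax&qf=" + fieldList + "&hl=true";
            System.out.println(searchUrl);
        }
    }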

public static List<String> findHighlightTerm(final Element elm, final String pid) {
public static Set<String> findHighlightTerm(final Element elm, final String pid) {
List<Element> elmRecursive = XMLUtils.getElementsRecursive(elm, new ElementsFilter() {
@Override
public boolean acceptElement(Element itm) {
@@ -122,27 +123,39 @@ public boolean acceptElement(Element itm) {
});
found.addAll(nfound);
}

List<String> terms = new ArrayList<String>();

List<String> fieldList = Arrays.asList("text","text_ocr", "text_ocr_lemmatized", "text_ocr_lemmatized_ascii");
Set<String> terms = new HashSet<>();
for (Element docEl : found) {
List<String> textArray = SOLRUtils.array(docEl, "text", String.class);
for (String text : textArray) {
String textContent = textContent(text);
if (textContent != null) {
terms.add(textContent);

for (String fieldName : fieldList) {

List<String> textArray = SOLRUtils.array(docEl, fieldName, String.class);
for (String text : textArray) {
if (text != null) {
String textContent = textContent(text);
if (textContent != null) {
terms.add(textContent);
}
}
}
}
if (textArray.isEmpty()) {
String value = SOLRUtils.value(docEl, "text", String.class);
String textContent = textContent(value);
if (textContent != null) {
terms.add(textContent);
if (textArray.isEmpty()) {
String value = SOLRUtils.value(docEl, fieldName, String.class);
if (value != null) {
String textContent = textContent(value);
if (textContent != null) {
terms.add(textContent);
}
}
}

}

}

return terms;
} else return new ArrayList<String>();
} else return new HashSet<>();

}
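
findHighlightTerm now collects terms from all four highlight fields and returns a Set instead of a List, so a token that Solr highlights in several of those fields is dissected against the ALTO stream only once. A tiny illustration of that deduplication (sketch only, not part of the commit), using the term from the test below:

    import java.util.HashSet;
    import java.util.Set;

    public class HighlightTermDedupSketch {
        public static void main(String[] args) {
            Set<String> terms = new HashSet<>();
            terms.add("PROSA");   // reported for text_ocr
            terms.add("PROSA");   // reported again for text_ocr_lemmatized
            terms.add("PROSA");   // and for text_ocr_lemmatized_ascii
            System.out.println(terms.size());   // 1 -> disectAlto runs once per distinct term
        }
    }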

@@ -352,8 +352,10 @@ public Map<String, String[]> getUsedFilters() {

private String getBoost(String q) {
String ret = "";
ret = "&defType=edismax&qf=text+"
ret = "&defType=edismax&qf=text+text_lemmatized+text_lemmatized_ascii^0.2+"
+ fieldsConfig.getMappedField("title") + "^4.0+"
+ "title_lemmatized^4.0+"
+ "title_lemmatized_ascii+"
+ fieldsConfig.getMappedField("autor") + "^1.5&bq=(level:0)^4.5"
+ "&bq=" + fieldsConfig.getMappedField("dostupnost") + ":\"public\"^1.2"
+ "&pf=text^10";
25 changes: 22 additions & 3 deletions client/src/main/webapp/js/leftthumbs.js
@@ -235,15 +235,34 @@ LeftThumbs.prototype = {

var q = "q=";
if(searchFlag && authorFlag){
q += "(text:" + $("#q").val()+" OR dc.creator:"+this._params()['author']+")";
var qval = $("#q").val();
var nestedQueries = "";
if ( (qval.startsWith("\"") || qval.startsWith("'")) ) {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" )";
} else {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" OR ";
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized}"+qval+"\" OR "
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized_ascii}"+qval+"\" ) ";
}
q += "("+nestedQueries+" OR dc.creator:"+this._params()['author']+")";
} else if(searchFlag){
q += "(text:" + $("#q").val()+")";
var qval = $("#q").val();
var nestedQueries = "";
if ( (qval.startsWith("\"") || qval.startsWith("'")) ) {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" )";
} else {
nestedQueries = "(_query_:\"{!edismax qf=text}"+qval+"\" OR ";
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized}"+qval+"\" OR "
nestedQueries += "_query_:\"{!edismax qf=text_lemmatized_ascii}"+qval+"\" ) ";
}

q += nestedQueries;
} else if(authorFlag){
q += "(dc.creator:"+this._params()['author']+")";
}

q += "&rows=5000&fq=pid_path:" + pid_path.replace(/:/g, "\\:") + "*";
var hl = authorFlag ? "&hl=true&hl.fl=text_ocr,dc.creator&hl.mergeContiguous=true&hl.snippets=2" : "&hl=true&hl.fl=text_ocr&hl.mergeContiguous=true&hl.snippets=2";
var hl = authorFlag ? "&hl=true&hl.fl=text_ocr+text_ocr_lemmatized+text_ocr_lemmatized_ascii+dc.creator&hl.mergeContiguous=true&hl.snippets=2" : "&hl=true&hl.fl=text_ocr+text_ocr_lemmatized+text_ocr_lemmatized_ascii&hl.mergeContiguous=true&hl.snippets=2";

K5.api.askForSolr(q + hl, _.bind(function(data) {
var numFound = data.response.numFound;
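
The thumbnail search drops the plain text:<q> clause in favour of Solr nested queries, so the typed term is run through edismax against text and both lemmatized variants; input that starts with a quote is treated as a phrase and kept on the literal text field only. For a hypothetical input praha, the generated q parameter (before rows, fq and highlighting are appended) looks like:

    q=(_query_:"{!edismax qf=text}praha" OR _query_:"{!edismax qf=text_lemmatized}praha" OR _query_:"{!edismax qf=text_lemmatized_ascii}praha" )

When an author filter is active, the whole group is additionally OR-ed with dc.creator:<author>.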
@@ -6,8 +6,10 @@
import java.net.URL;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -21,13 +23,13 @@

public class AltoSupportServletTest extends TestCase {

public void testFindTerms() throws IOException, ParserConfigurationException, SAXException {
public void testFindTerms() throws IOException, ParserConfigurationException, SAXException, TransformerException {
URL url = AltoSupportServletTest.class.getResource("solr.xml");
InputStream openStream = url.openStream();
Document document = XMLUtils.parseDocument(openStream);
List<String> terms = AltoSupportServlet.findHighlightTerm(document.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
Set<String> terms = AltoSupportServlet.findHighlightTerm(document.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
Assert.assertTrue(terms.size() == 1);
Assert.assertTrue(terms.get(0).equals("PROSA"));
Assert.assertTrue(terms.iterator().next().equals("PROSA"));
}

public void testFindAltos() throws ParserConfigurationException, SAXException, IOException {
@@ -38,8 +40,8 @@ public void testFindAltos() throws ParserConfigurationException, SAXException, I

openStream = alto.openStream();
Document altoDocument = XMLUtils.parseDocument(openStream);
List<String> terms = AltoSupportServlet.findHighlightTerm(solrDocument.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");

Set<String> terms = AltoSupportServlet.findHighlightTerm(solrDocument.getDocumentElement(),"uuid:5c243d40-3425-11e3-bd38-5ef3fc9ae867");
for (String sterm : terms) {
AltoDisected disected = ALTOUtils.disectAlto(sterm, altoDocument);
Assert.assertNotNull(disected);
7 changes: 5 additions & 2 deletions indexer/src/cz/incad/kramerius/indexer/ExtendedFields.java
@@ -201,8 +201,11 @@ public String toXmlString(int pageNum) throws Exception {
String[] pids = s.split("/");
if (pageNum != 0) {
sb.append("<field name=\"parent_pid\">").append(pids[pids.length - 1]).append("</field>");
sb.append("<field name=\"text\">").append(getPDFPage(pageNum)).append("</field>");

// TODO: Do it better. For now we have to reuse a field that is already defined;
// in the future, consider introducing a dedicated text_pdf field.
String pdfText = getPDFPage(pageNum);
sb.append("<field name=\"text_ocr\">").append(pdfText).append("</field>");

} else {
if (pids.length == 1) {
sb.append("<field name=\"parent_pid\">").append(pids[0]).append("</field>");
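
The indexer builds Solr field elements for each page; with this change the text extracted from a PDF page goes into text_ocr rather than straight into text, so it flows through the same copyField rules as regular OCR (see the managed-schema excerpt below). A minimal illustration of the emitted fragment, with placeholder values:

    <field name="parent_pid">uuid:...</field>
    <field name="text_ocr">text extracted from PDF page 1</field>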
3 changes: 0 additions & 3 deletions installation/solr-6.x/kramerius/conf/managed-schema
@@ -590,7 +590,6 @@
<copyField source="dc.title" dest="text_lemmatized"/>
<copyField source="keywords" dest="text_lemmatized"/>
<copyField source="dc.description" dest="text_lemmatized"/>
<copyField source="dc.title" dest="text_lemmatized"/>
<copyField source="dc.creator" dest="text_lemmatized"/>
<copyField source="details" dest="text_lemmatized"/>
<copyField source="text_ocr" dest="text_lemmatized"/>
@@ -599,7 +598,6 @@
<copyField source="dc.title" dest="text_lemmatized_ascii"/>
<copyField source="keywords" dest="text_lemmatized_ascii"/>
<copyField source="dc.description" dest="text_lemmatized_ascii"/>
<copyField source="dc.title" dest="text_lemmatized_ascii"/>
<copyField source="dc.creator" dest="text_lemmatized_ascii"/>
<copyField source="details" dest="text_lemmatized_ascii"/>
<copyField source="text_ocr" dest="text_lemmatized_ascii"/>
@@ -608,7 +606,6 @@
<copyField source="dc.title" dest="text_lemmatized_nostopwords"/>
<copyField source="keywords" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.description" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.title" dest="text_lemmatized_nostopwords"/>
<copyField source="dc.creator" dest="text_lemmatized_nostopwords"/>
<copyField source="details" dest="text_lemmatized_nostopwords"/>
<copyField source="text_ocr" dest="text_lemmatized_nostopwords"/>
@@ -48,6 +48,19 @@ public boolean acceptElement(Element elm) {
return batches;
}


/** find the element whose "name" attribute equals attName */
static Element findByAttribute(Element sourceDocElm, String attName) {
Element elemName = XMLUtils.findElement(sourceDocElm, new XMLUtils.ElementsFilter() {
@Override
public boolean acceptElement(Element element) {
return element.getAttribute("name").equals(attName);
}
});
return elemName;
}

/** find pid in source doc */
static String pid(Element sourceDocElm) {
Element pidElm = XMLUtils.findElement(sourceDocElm, new XMLUtils.ElementsFilter() {
@Override
@@ -72,9 +85,9 @@ public static void transform(Element sourceDocElm, Document destDocument,Element
if (node.getNodeType() == Node.ELEMENT_NODE) {
List<String> primitiveVals = Arrays.asList("str","int","bool", "date");
if (primitiveVals.contains(node.getNodeName())) {
simpleValue(pid, destDocument,destDocElem, node,null);
simpleValue(pid, destDocument,destDocElem, node,null, false);
} else {
arrayValue(pid, destDocument,destDocElem,node);
arrayValue(pid,sourceDocElm, destDocument,destDocElem,node);
}
}
}
@@ -115,34 +128,44 @@ public boolean acceptElement(Element paramElement) {

}

public static void simpleValue(String pid, Document ndoc, Element docElm, Node node, String derivedName) {
public static void simpleValue(String pid, Document feedDoc, Element feedDocElm, Node node, String derivedName, boolean dontCareAboutNonCopiingFields) {
String attributeName = derivedName != null ? derivedName : ((Element)node).getAttribute("name");
if (!nonCopiingField(attributeName)) {
Element strElm = ndoc.createElement("field");
if (dontCareAboutNonCopiingFields || !nonCopiingField(attributeName)) {
Element strElm = feedDoc.createElement("field");
strElm.setAttribute("name", attributeName);
docElm.appendChild(strElm);
feedDocElm.appendChild(strElm);
String content = StringEscapeUtils.escapeXml(node.getTextContent());
strElm.setTextContent(content);
}
}

public static void arrayValue(String pid, Document ndoc, Element docElm, Node node) {
public static void arrayValue(String pid,Element sourceDocElement, Document feedDoc, Element feedDocElement, Node node) {
String attributeName = ((Element) node).getAttribute("name");
if (!nonCopiingField(attributeName)) {
if (exceptionField(attributeName) && pid.contains("/@")) {
NodeList childNodes = node.getChildNodes();
for (int i = 0,ll=childNodes.getLength(); i < ll; i++) {
Node n = childNodes.item(i);
if (n.getNodeType() == Node.ELEMENT_NODE) {
simpleValue(pid, ndoc,docElm, n, attributeName);
// exception again !!! uuugrrrr !!;

// Bug in PDF indexing: the text is written directly into the "text" field even though "text" is a copy field.
// First look for text_ocr; if it does not exist, copy the whole text into text, text_lemmatized, text_lemmatized_ascii and text_lemmatized_nostopwords.
Element textOcr = findByAttribute(sourceDocElement, "text_ocr");
if (textOcr == null) {
simpleValue(pid, feedDoc,feedDocElement, n, attributeName, false);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized", true);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized_ascii", true);
simpleValue(pid, feedDoc, feedDocElement, n,"text_lemmatized_nostopwords", true);
}
}
}
} else if (!exceptionField(attributeName)) {
NodeList childNodes = node.getChildNodes();
for (int i = 0,ll=childNodes.getLength(); i < ll; i++) {
Node n = childNodes.item(i);
if (n.getNodeType() == Node.ELEMENT_NODE) {
simpleValue(pid, ndoc,docElm, n, attributeName);
simpleValue(pid, feedDoc,feedDocElement, n, attributeName, false);
}
}
}
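
For context on the branch above: pages coming from a PDF indexed by the old indexer carry their extracted text directly in the text field and have no text_ocr, so when such a document is fed into the new index the lemmatized fields would stay empty (per the managed-schema excerpt above, they are only populated by copyField from text_ocr and the other listed sources). The new code therefore detects the missing text_ocr and writes the value into the lemmatized targets explicitly, using the new dontCareAboutNonCopiingFields flag to bypass the copy-field filter. Schematically, for such a page (placeholder values):

    source document (old index)        document fed to the new index
    text = "pdf page text"        ->   text                        = "pdf page text"
    text_ocr   (missing)               text_lemmatized             = "pdf page text"
                                       text_lemmatized_ascii       = "pdf page text"
                                       text_lemmatized_nostopwords = "pdf page text"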
