Skip to content

Commit

Permalink
Changed sorting to natural collator + ICU string rules, allowed dupli…
Browse files Browse the repository at this point in the history
…cate sort values

Updates #254
Fixes #207
  • Loading branch information
vlahoda authored and jirikrepl committed Apr 21, 2016
1 parent d4a7325 commit 5f76795
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 74 deletions.
3 changes: 2 additions & 1 deletion common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ dependencies {
compile 'org.apache.pdfbox:pdfbox:1.8.2'
compile 'com.levigo.jbig2:levigo-jbig2-imageio:1.6.1'

compile 'com.ibm.icu:icu4j:3.8'
compile 'com.ibm.icu:icu4j:56.1'
compile 'com.google.guava:guava:19.0'
compile 'org.bouncycastle:bcprov-jdk15:1.44'
compile 'org.bouncycastle:bcmail-jdk15:1.44'
/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ public interface SortingService {
* Sort given list of FOXML objects (their PIDs) based on the content of the BIBLIO-MODS datastream
* @param pids list of FOXML PIDs to sort
* @param xpath XPath expression to extract the data (upon which the objects will be sorted) from BIBLIO-MODS
* @param numeric when true, the data from xpath will be sorted as numeric (integer) values, otherwise alphabetically
* @return sorted list of PIDs
*/
List<String> sortObjects(List<String> pids, String xpath, boolean numeric);
List<String> sortObjects(List<String> pids, String xpath);
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,32 @@
package cz.incad.kramerius.service.impl;

import com.google.inject.*;
import javax.annotation.PostConstruct;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Logger;

import org.w3c.dom.Document;

import com.google.common.collect.Ordering;
import com.google.common.collect.TreeMultimap;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.Scopes;
import com.google.inject.name.Named;
import com.google.inject.name.Names;
import com.ibm.icu.text.Collator;

import cz.incad.kramerius.FedoraAccess;
import cz.incad.kramerius.FedoraNamespaceContext;
import cz.incad.kramerius.KrameriusModels;
Expand All @@ -16,17 +40,8 @@
import cz.incad.kramerius.relation.impl.RelationServiceImpl;
import cz.incad.kramerius.service.SortingService;
import cz.incad.kramerius.statistics.StatisticsAccessLog;
import cz.incad.kramerius.utils.NaturalOrderCollator;
import cz.incad.kramerius.utils.conf.KConfiguration;
import org.w3c.dom.Document;

import javax.annotation.PostConstruct;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.util.*;
import java.util.logging.Logger;

/**
* @author vlahoda
Expand All @@ -44,62 +59,74 @@ public class SortingServiceImpl implements SortingService {


RelationService relationService;
private XPathFactory xpathFactory = XPathFactory.newInstance();
private Map<String, String> sortingConfigMap = new HashMap<String, String>();

@Inject
public SortingServiceImpl(@Named("rawFedoraAccess") FedoraAccess fedoraAccess, KConfiguration configuration, RelationService relationService){
public SortingServiceImpl(@Named("rawFedoraAccess") FedoraAccess fedoraAccess, KConfiguration configuration, RelationService relationService) {
this.fedoraAccess = fedoraAccess;
this.configuration = configuration;
this.relationService = relationService;
initSortingConfigMap();
}

/**
 * Command-line entry point: sorts the relations of a single object.
 *
 * Builds a Guice injector from {@link SortingModule}, obtains the {@link SortingService}
 * from it and calls {@code sortRelations(pid, true)}, i.e. with the indexer start enabled.
 *
 * @param args {@code args[0]} is the PID of the object whose relations are sorted
 * @throws IOException declared by the original signature
 * @throws IllegalArgumentException when no PID argument is supplied
 */
public static void main(String[] args) throws IOException {
    LOGGER.info("SortRelations service: " + Arrays.toString(args));
    // Fail with an explicit usage error instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        throw new IllegalArgumentException("Missing argument: PID of the object whose relations should be sorted");
    }
    Injector injector = Guice.createInjector(new SortingModule());
    SortingService inst = injector.getInstance(SortingService.class);
    inst.sortRelations(args[0], true);
    LOGGER.info("SortRelations finished.");
}

@Override
public void sortRelations(String pid, boolean startIndexer) {
try {
//TODO: I18n
if (startIndexer){
try{
if (startIndexer) {
try {
ProcessStarter.updateName("Sort relations (" + pid + ")");
}catch(Exception ex){}
} catch (Exception ex) {
}
}
String lastTime = fedoraAccess.getAPIA().getObjectProfile(pid, null).getObjLastModDate();
RelationModel model = relationService.load(pid);
for (KrameriusModels kind : model.getRelationKinds()) {
if (KrameriusModels.DONATOR.equals(kind)) continue;
if (KrameriusModels.DONATOR.equals(kind))
continue;
List<Relation> relations = model.getRelations(kind);
List<String> originalPids = new ArrayList<String>(relations.size());
for (Relation relation : relations) {
originalPids.add(relation.getPID());
}
SortingConfig sortingConfig = sortingConfigMap.get(kind.getValue());
if (sortingConfig == null){
LOGGER.warning("Unsupported relation type for sorting: "+kind.getValue());
String xpath = sortingConfigMap.get(kind.getValue());
if (xpath == null) {
LOGGER.warning("Unsupported relation type for sorting: " + kind.getValue());
continue;
}
List<String> sortedPids = sortObjects(originalPids, sortingConfig.xpath, sortingConfig.numeric);
List<String> sortedPids = sortObjects(originalPids, xpath);
relations.clear();
for (String sortedPid : sortedPids) {
relations.add(new Relation(sortedPid, kind));
}
}
String currTime = fedoraAccess.getAPIA().getObjectProfile(pid, null).getObjLastModDate();
if (currTime.equals(lastTime)){
if (currTime.equals(lastTime)) {
relationService.save(pid, model);
if (startIndexer){
if (startIndexer) {
IndexerProcessStarter.spawnIndexer(true, "Reindexing sorted relations", pid);
}
}else{
LOGGER.warning("Cannot save sorted relations, object "+pid+" was modified.");
} else {
LOGGER.warning("Cannot save sorted relations, object " + pid + " was modified.");
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}

@Override
public List<String> sortObjects(List<String> pids, String xpathString, boolean numeric) {
TreeMap<Object, String> sortedMap = new TreeMap<Object, String>();
public List<String> sortObjects(List<String> pids, String xpathString) {
Collator stringCollator = Collator.getInstance(new Locale(configuration.getConfiguration().getString("sort.locale", "cs_CZ")));
TreeMultimap<String, String> sortedMap = TreeMultimap.create(new NaturalOrderCollator(stringCollator), Ordering.natural());
List<String> failedList = new ArrayList<String>();
XPathExpression expr = null;
try {
Expand All @@ -111,66 +138,40 @@ public List<String> sortObjects(List<String> pids, String xpathString, boolean n
}
for (String pid : pids) {
String sortingValue = null;
try{
try {
Document mods = RelationUtils.getMods(pid, fedoraAccess);
sortingValue = expr.evaluate(mods);
} catch (Exception e) {
//ignore, will be logged in next step (sortingValue test)
}
if (sortingValue == null || "".equals(sortingValue)){
if (sortingValue == null || "".equals(sortingValue)) {
failedList.add(pid);
LOGGER.info("Cannot sort relation for invalid value:"+sortingValue + " ("+pid+")");
}else{
if (numeric){
try{
Integer ordinal = Integer.parseInt(sortingValue);
String existing = sortedMap.put(ordinal,pid);
if (existing != null){
failedList.add(existing);
}
}catch (Exception ex){
failedList.add(pid);
LOGGER.info("Cannot sort relation for invalid numeric value:"+sortingValue + " ("+pid+")");
}
}else{
String existing = sortedMap.put(sortingValue,pid);
if (existing != null){
failedList.add(existing);
}
LOGGER.info("Cannot sort relation for invalid value:" + sortingValue + " (" + pid + ")");
} else {
try {
sortedMap.put(sortingValue, pid);
} catch (Exception ex) {
failedList.add(pid);
LOGGER.info("Cannot sort relation for invalid value:" + sortingValue + " (" + pid + ")");
}
}
}
List<String> result = new ArrayList<String>(pids.size());
for (Map.Entry<Object,String> entry:sortedMap.entrySet()){
result.add(entry.getValue());
for (String o : sortedMap.values()) {
result.add(o);
}
result.addAll(failedList);
return result;
}

private XPathFactory xpathFactory = XPathFactory.newInstance();
private Map<String, SortingConfig> sortingConfigMap = new HashMap<String,SortingConfig>();

@PostConstruct
private void initSortingConfigMap(){
private void initSortingConfigMap() {
String[] rawConfig = configuration.getConfiguration().getStringArray(CONFIG_KEY);
for (String modelConfig:rawConfig){
for (String modelConfig : rawConfig) {
String[] configItems = modelConfig.split(";");
SortingConfig sortingConfig = new SortingConfig();
sortingConfig.xpath = configItems[1];
sortingConfig.numeric = Boolean.parseBoolean(configItems[2]);
sortingConfigMap.put(configItems[0], sortingConfig);
}
}



public static void main(String[] args) throws IOException {
LOGGER.info("SortRelations service: " + Arrays.toString(args));
Injector injector = Guice.createInjector(new SortingModule());
SortingService inst = injector.getInstance(SortingService.class);
inst.sortRelations(args[0], true);
LOGGER.info("SortRelations finished.");
sortingConfigMap.put(configItems[0], configItems[1]);
}
}
}

Expand All @@ -185,7 +186,4 @@ protected void configure() {
}
}

class SortingConfig {
String xpath;
boolean numeric;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package cz.incad.kramerius.utils;


import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;

import com.ibm.icu.text.Collator;


/**
 * Comparator that orders strings in "natural" order: each string is split into alternating
 * chunks of ASCII digits and non-digits; digit chunks are compared by numeric value and all
 * other chunks are compared with the configured ICU {@link Collator}.
 */
public class NaturalOrderCollator implements Comparator<String> {

    /** Collator used to compare chunks when at least one of them is not a digit run. */
    Collator stringCollator;

    /**
     * Small manual demo: prints a sample list, a shuffled copy and the naturally sorted result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String[] strings = new String[]{"1-2", "1-02", "1-20", "10-20", "fred", "jane", "pic01",
                "pic2", "pic02", "pic02a", "pic3", "pic4", "pic 4 else", "pic 5", "pic05", "pic 5",
                "pic 5 something", "pic 6", "pic 7", "pic100", "pic100a", "pic120", "pic121",
                "pic02000", "tom", "x2-g8", "x2-y7", "x2-y08", "x8-y8", "anča1chtěla", "čtenář5a", "cizinec", "anča1hleděla", "an-ča1čuměla", "1", "10", "2", "20", "2a", "pic", "pic0100", ",", "#", "#1", "9", "6", "7-8", "[7,8]", "motýl noční", "motýlek"};

        List<String> orig = Arrays.asList(strings);

        System.out.println("Original: " + orig);

        // Arrays.asList is a view over its array; clone the array so that shuffling and
        // sorting the copy does not silently reorder "orig" as well.
        List<String> scrambled = Arrays.asList(strings.clone());
        Collections.shuffle(scrambled);

        System.out.println("Scrambled: " + scrambled);

        Collections.sort(scrambled, new NaturalOrderCollator());

        System.out.println("Sorted: " + scrambled);
    }

    /**
     * Creates a comparator that uses the given collator for the non-numeric chunks.
     *
     * @param stringCollator collator used to compare non-numeric chunks
     */
    public NaturalOrderCollator(Collator stringCollator) {
        this.stringCollator = stringCollator;
    }

    /**
     * Creates a comparator that uses the Czech (cs, CZ) collator for the non-numeric chunks.
     */
    public NaturalOrderCollator() {
        // Language and country are separate constructor arguments; "cs_CZ" passed as a
        // single argument would be treated as a (non-existent) language code.
        this.stringCollator = Collator.getInstance(new Locale("cs", "CZ"));
    }

    /** Returns true for ASCII digits '0'..'9' only. */
    private static boolean isDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
     * Returns the maximal run of characters starting at {@code marker} that are either all
     * digits or all non-digits, depending on the character at {@code marker}.
     * Length of string is passed in for improved efficiency (only need to calculate it once).
     */
    private static String getChunk(String s, int slength, int marker) {
        boolean digits = isDigit(s.charAt(marker));
        int end = marker + 1;
        while (end < slength && isDigit(s.charAt(end)) == digits) {
            end++;
        }
        return s.substring(marker, end);
    }

    /** Returns the index of the first character that is not a leading zero (may equal the length). */
    private static int skipLeadingZeros(String digits) {
        int i = 0;
        while (i < digits.length() && digits.charAt(i) == '0') {
            i++;
        }
        return i;
    }

    /**
     * Compares two runs of ASCII digits by numeric value without parsing them, so runs of
     * any length are supported (Integer.parseInt would throw for values beyond the int range).
     */
    private static int compareDigitRuns(String a, String b) {
        int aStart = skipLeadingZeros(a);
        int bStart = skipLeadingZeros(b);
        int aSignificant = a.length() - aStart;
        int bSignificant = b.length() - bStart;
        // Without leading zeros, the number with more digits is the larger one.
        if (aSignificant != bSignificant) {
            return aSignificant - bSignificant;
        }
        // Same number of significant digits: lexicographic order equals numeric order.
        return a.substring(aStart).compareTo(b.substring(bStart));
    }

    /**
     * Compares two strings chunk by chunk in natural order.
     *
     * @param s1 first string
     * @param s2 second string
     * @return negative, zero or positive when {@code s1} sorts before, equal to or after {@code s2}
     */
    @Override
    public int compare(String s1, String s2) {

        int thisMarker = 0;
        int thatMarker = 0;
        int s1Length = s1.length();
        int s2Length = s2.length();

        while (thisMarker < s1Length && thatMarker < s2Length) {
            String thisChunk = getChunk(s1, s1Length, thisMarker);
            thisMarker += thisChunk.length();

            String thatChunk = getChunk(s2, s2Length, thatMarker);
            thatMarker += thatChunk.length();

            // If both chunks contain numeric characters, sort them numerically
            int result;
            if (isDigit(thisChunk.charAt(0)) && isDigit(thatChunk.charAt(0))) {
                result = compareDigitRuns(thisChunk, thatChunk);
            } else {
                result = stringCollator.compare(thisChunk, thatChunk);
            }

            if (result != 0) {
                return result;
            }
        }

        // All compared chunks were equal. If one string still has unconsumed content it sorts
        // later; comparing only total lengths would report e.g. "1-02" and "1-2x" as equal.
        int remainingDiff = (s1Length - thisMarker) - (s2Length - thatMarker);
        if (remainingDiff != 0) {
            return remainingDiff;
        }

        // Both fully consumed: the shorter string (fewer leading zeros) sorts first.
        return s1Length - s2Length;
    }
}
3 changes: 3 additions & 0 deletions common/src/main/java/res/configuration.properties
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ sort.xpaths=page;//mods:mods/mods:part/mods:detail[@type='pageIndex']/mods:numbe
supplement;//mods:mods/mods:part/mods:detail[@type='pageNumber']/mods:number | //mods:mods/mods:titleInfo/mods:partNumber;true,\
picture;//mods:mods/mods:part/mods:detail[@type='pageNumber']/mods:number | //mods:mods/mods:titleInfo/mods:partNumber;true

## Locale used for relations sorting
sort.locale=cs_CZ

## Podporovane jazyky v rozhrani
interface.languages=\u010desky,cs,english,en

Expand Down

0 comments on commit 5f76795

Please sign in to comment.