Skip to content

Commit 511cd57

Browse files
committed
Merge branch 'develop'
2 parents dc5ee11 + 03b9ba9 commit 511cd57

File tree

10 files changed

+296
-148
lines changed

10 files changed

+296
-148
lines changed

goobi-viewer-indexer/pom.xml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
<modelVersion>4.0.0</modelVersion>
77
<groupId>io.goobi.viewer</groupId>
88
<artifactId>viewer-indexer</artifactId>
9-
<version>24.03</version>
9+
<version>24.04-SNAPSHOT</version>
1010

1111

1212
<name>Goobi viewer - Indexer</name>
@@ -32,32 +32,32 @@
3232
<sonar.java.target>17</sonar.java.target>
3333

3434
<!-- Checkstyle -->
35-
<checkstyle.max.violations>400</checkstyle.max.violations>
35+
<checkstyle.max.violations>180</checkstyle.max.violations>
3636
<skipCheckstyle>true</skipCheckstyle>
3737

3838
<!-- Manifest information -->
3939
<maven.build.timestamp.format>yyyy-MM-dd HH:mm</maven.build.timestamp.format>
4040
<buildDate>${maven.build.timestamp}</buildDate>
4141

4242
<!-- intranda libraries -->
43-
<alto.version>1.5.13</alto.version>
43+
<alto.version>1.5.15</alto.version>
4444
<iiif-api-model.version>2.6.3</iiif-api-model.version>
4545
<normdataimporter.version>1.10.9</normdataimporter.version>
4646

4747
<!-- other libraries -->
4848
<angus-mail.version>2.0.3</angus-mail.version>
4949
<commons-beanutils.version>1.9.4</commons-beanutils.version>
5050
<commons-configuration2.version>2.10.1</commons-configuration2.version>
51-
<commons-io.version>2.16.0</commons-io.version>
51+
<commons-io.version>2.16.1</commons-io.version>
5252
<commons-jxpath.version>1.3</commons-jxpath.version>
5353
<commons-lang3.version>3.14.0</commons-lang3.version>
54-
<commons-text.version>1.11.0</commons-text.version>
54+
<commons-text.version>1.12.0</commons-text.version>
5555
<httpclient.version>4.5.14</httpclient.version>
5656
<httpcore.version>4.4.16</httpcore.version>
57-
<icu.version>74.2</icu.version>
57+
<icu.version>75.1</icu.version>
5858
<imageio-openjpeg.version>0.6.7</imageio-openjpeg.version>
5959
<log4j.version>2.23.1</log4j.version>
60-
<jackson.version>2.17.0</jackson.version>
60+
<jackson.version>2.17.1</jackson.version>
6161
<jaxen.version>2.0.0</jaxen.version>
6262
<jai.version>1.4.0</jai.version>
6363
<jakarta.mail-api.version>2.1.3</jakarta.mail-api.version>
@@ -66,7 +66,7 @@
6666
<json.version>20240303</json.version>
6767
<junit.version>5.10.2</junit.version>
6868
<metadata-extractor.version>2.19.0</metadata-extractor.version>
69-
<solr.version>9.5.0</solr.version>
69+
<solr.version>9.6.0</solr.version>
7070
<sf-geojson.version>3.3.3</sf-geojson.version>
7171
</properties>
7272

@@ -477,7 +477,7 @@
477477
<dependency>
478478
<groupId>com.puppycrawl.tools</groupId>
479479
<artifactId>checkstyle</artifactId>
480-
<version>10.15.0</version>
480+
<version>10.16.0</version>
481481
</dependency>
482482
</dependencies>
483483
<executions>

goobi-viewer-indexer/src/main/java/io/goobi/viewer/indexer/Indexer.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2179,4 +2179,30 @@ void prerenderPagePdfsIfRequired(String pi, boolean hasNewMediaFiles) {
21792179
logger.error(e.getMessage());
21802180
}
21812181
}
2182+
2183+
/**
2184+
*
2185+
* @param fileName File name to check
2186+
* @param pageDoc
2187+
* @return Alternative file name if fileName not image and alternatives available; otherwise fileName
2188+
* @should replace value correctly
2189+
* @should return fileName if image
2190+
* @should return fileNAme if url
2191+
*/
2192+
static String checkThumbnailFileName(String fileName, SolrInputDocument pageDoc) {
2193+
String ret = fileName;
2194+
if (!Utils.isValidURL(fileName) && !FileTools.isImageFile(fileName) && pageDoc != null) {
2195+
String filenameTiffField = SolrConstants.FILENAME + "_TIFF";
2196+
String filenameJpegField = SolrConstants.FILENAME + "_JPEG";
2197+
if (pageDoc.getFieldValue(filenameTiffField) != null) {
2198+
ret = (String) pageDoc.getFieldValue(filenameTiffField);
2199+
logger.info("Using {}:{} for {}", filenameTiffField, ret, SolrConstants.THUMBNAIL);
2200+
} else if (pageDoc.getFieldValue(filenameJpegField) != null) {
2201+
ret = (String) pageDoc.getFieldValue(filenameJpegField);
2202+
logger.info("Using {}:{} for {}", filenameJpegField, ret, SolrConstants.THUMBNAIL);
2203+
}
2204+
}
2205+
2206+
return ret;
2207+
}
21822208
}

goobi-viewer-indexer/src/main/java/io/goobi/viewer/indexer/LidoIndexer.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,12 @@ boolean generatePageDocument(Element eleResourceSet, String iddoc, Integer order
714714
}
715715
}
716716

717+
// Do not create pages for resourceSet elements that have no relation to images
718+
if (StringUtils.isEmpty(filePath)) {
719+
logger.debug("No file path found for this resource set.");
720+
return false;
721+
}
722+
717723
String fileName;
718724
if (StringUtils.isNotEmpty(filePath) && filePath.contains("/")) {
719725
if (Utils.isFileNameMatchesRegex(filePath, IIIF_IMAGE_FILE_NAMES)) {
@@ -733,7 +739,7 @@ boolean generatePageDocument(Element eleResourceSet, String iddoc, Integer order
733739
}
734740

735741
// Add file size
736-
if (dataFolders != null) {
742+
if (dataFolders != null && fileName != null) {
737743
try {
738744
Path dataFolder = dataFolders.get(DataRepository.PARAM_MEDIA);
739745
// TODO other mime types/folders

goobi-viewer-indexer/src/main/java/io/goobi/viewer/indexer/MetsIndexer.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,7 @@ private List<LuceneField> mapPagesToDocstruct(IndexObject indexObj, IndexObject
736736
if (StringUtils.isEmpty(filePathBanner) && SolrIndexerDaemon.getInstance().getConfiguration().isUseFirstPageAsDefaultRepresentative()
737737
&& firstPageDoc != null) {
738738
// Add thumbnail information from the first page
739-
String thumbnailFileName = (String) firstPageDoc.getFieldValue(SolrConstants.FILENAME);
739+
String thumbnailFileName = checkThumbnailFileName((String) firstPageDoc.getFieldValue(SolrConstants.FILENAME), firstPageDoc);
740740
ret.add(new LuceneField(SolrConstants.THUMBNAIL, thumbnailFileName));
741741
if (DocType.SHAPE.name().equals(firstPageDoc.getFieldValue(SolrConstants.DOCTYPE))) {
742742
ret.add(new LuceneField(SolrConstants.THUMBPAGENO, String.valueOf(firstPageDoc.getFieldValue("ORDER_PARENT"))));
@@ -748,7 +748,7 @@ private List<LuceneField> mapPagesToDocstruct(IndexObject indexObj, IndexObject
748748
thumbnailSet = true;
749749
}
750750
for (SolrInputDocument pageDoc : pageDocs) {
751-
String pageFileName = (String) pageDoc.getFieldValue(SolrConstants.FILENAME);
751+
String pageFileName = checkThumbnailFileName((String) pageDoc.getFieldValue(SolrConstants.FILENAME), pageDoc);
752752
String pageFileBaseName = FilenameUtils.getBaseName(pageFileName);
753753
// Add thumbnail information from the representative page
754754
if (!thumbnailSet && StringUtils.isNotEmpty(filePathBanner) && filePathBanner.equals(pageFileName)) {
@@ -929,7 +929,7 @@ private List<LuceneField> mapPagesToDocstruct(IndexObject indexObj, IndexObject
929929
if (!thumbnailSet && StringUtils.isNotEmpty(filePathBanner) && firstPageDoc != null) {
930930
logger.warn("Selected representative image '{}' is not mapped to any structure element - using first mapped image instead.",
931931
filePathBanner);
932-
String pageFileName = (String) firstPageDoc.getFieldValue(SolrConstants.FILENAME);
932+
String pageFileName = checkThumbnailFileName((String) firstPageDoc.getFieldValue(SolrConstants.FILENAME), firstPageDoc);
933933
ret.add(new LuceneField(SolrConstants.THUMBNAIL, pageFileName));
934934
// THUMBNAILREPRESENT is just used to identify the presence of a custom representation thumbnail to the indexer, it is not used in the viewer
935935
ret.add(new LuceneField(SolrConstants.THUMBNAILREPRESENT, pageFileName));
@@ -1469,7 +1469,8 @@ boolean generatePageDocument(Element eleStructMapPhysical, String iddoc, String
14691469
}
14701470

14711471
// Width + height (from IIIF)
1472-
if (doc.getField(SolrConstants.WIDTH) == null && doc.getField(SolrConstants.HEIGHT) == null && !downloadExternalImages && filePath != null
1472+
if (SolrIndexerDaemon.getInstance().getConfiguration().isReadImageDimensionsFromIIIF() && doc.getField(SolrConstants.WIDTH) == null
1473+
&& doc.getField(SolrConstants.HEIGHT) == null && !downloadExternalImages && filePath != null
14731474
&& filePath.endsWith("info.json")) {
14741475
int[] dim = getImageDimensionsFromIIIF(filePath);
14751476
if (dim.length == 2) {
@@ -2267,8 +2268,7 @@ boolean isAnchor() throws FatalIndexerException {
22672268
* @should return false if relatedItem not anchor
22682269
*/
22692270
protected boolean isVolume() {
2270-
String query =
2271-
"/mets:mets/mets:dmdSec[1]/mets:mdWrap[@MDTYPE='MODS']/mets:xmlData/mods:mods/mods:relatedItem[@type='host' and not(@otherType)]/mods:recordInfo/mods:recordIdentifier";
2271+
String query = SolrIndexerDaemon.getInstance().getConfiguration().getMetsVolumeCheckXPath();
22722272
List<Element> relatedItemList = xp.evaluateToElements(query, null);
22732273

22742274
return relatedItemList != null && !relatedItemList.isEmpty();
@@ -2279,12 +2279,11 @@ protected boolean isVolume() {
22792279
* getMetsCreateDate.
22802280
* </p>
22812281
*
2282-
* @throws io.goobi.viewer.indexer.exceptions.FatalIndexerException
22832282
* @return a {@link java.time.ZonedDateTime} object.
22842283
* @should return CREATEDATE value
22852284
* @should return null if date does not exist in METS
22862285
*/
2287-
protected ZonedDateTime getMetsCreateDate() throws FatalIndexerException {
2286+
protected ZonedDateTime getMetsCreateDate() {
22882287
String dateString = xp.evaluateToAttributeStringValue("/mets:mets/mets:metsHdr/@CREATEDATE", null);
22892288
return parseCreateDate(dateString);
22902289
}

goobi-viewer-indexer/src/main/java/io/goobi/viewer/indexer/helper/Configuration.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,10 @@ public String getSolrUrl() {
303303
public String getOldSolrUrl() {
304304
return getConfiguration("oldSolrUrl");
305305
}
306-
306+
307307
/**
308308
*
309-
* @return a boolean
309+
* @return a boolean
310310
* @should return correct value
311311
*/
312312
public boolean isSolrUseHttp2() {
@@ -666,7 +666,7 @@ public boolean isHostProxyWhitelisted(String url) throws MalformedURLException {
666666
public boolean isUseFirstPageAsDefaultRepresentative() {
667667
return getBoolean("init.representativeImage.useFirstPageAsDefault", true);
668668
}
669-
669+
670670
/**
671671
*
672672
* @return Configured preferredImageFileGroup values or empty list
@@ -675,7 +675,7 @@ public boolean isUseFirstPageAsDefaultRepresentative() {
675675
public List<String> getMetsPreferredImageFileGroups() {
676676
return getStringList("init.mets.preferredImageFileGroup");
677677
}
678-
678+
679679
/**
680680
*
681681
* @return Configured physicalElementTypes/type values or empty list
@@ -685,6 +685,16 @@ public List<String> getMetsAllowedPhysicalTypes() {
685685
return getStringList("init.mets.physicalElementTypes.type");
686686
}
687687

688+
/**
689+
* Returns the XPath expression that is evaluated to check whether a METS record is a volume, read from the
* configuration key <code>init.mets.volumeCheckXPath</code>.
*
690+
* @return Configured XPath expression or default value
691+
* @should return correct value
692+
*/
693+
public String getMetsVolumeCheckXPath() {
694+
// NOTE(review): the fallback below matches every mods:relatedItem[@type='host']. The expression that used to be
// hard-coded in MetsIndexer.isVolume() additionally required not(@otherType), and the value shipped in
// config_indexer.xml also restricts @otherType. Confirm that the broader fallback is intended for
// installations whose configuration does not contain this key.
return getString("init.mets.volumeCheckXPath",
695+
"/mets:mets/mets:dmdSec[1]/mets:mdWrap[@MDTYPE='MODS']/mets:xmlData/mods:mods/mods:relatedItem[@type='host']/mods:recordInfo/mods:recordIdentifier");
696+
}
697+
688698
/**
689699
* Overrides values in the config file (for unit test purposes).
690700
*
@@ -726,4 +736,8 @@ boolean checkEmailConfiguration() {
726736
return true;
727737
}
728738

739+
/**
* Reads the boolean configuration key <code>performance.loadExternalImageInfos</code>. MetsIndexer checks this
* flag before fetching image width and height from an external IIIF <code>info.json</code>.
*
* @return Configured value; <code>true</code> is passed to the lookup as the default
*/
public boolean isReadImageDimensionsFromIIIF() {
740+
return getBoolean("performance.loadExternalImageInfos", true);
741+
}
742+
729743
}

goobi-viewer-indexer/src/main/java/io/goobi/viewer/indexer/helper/FileTools.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.commons.io.FilenameUtils;
5151
import org.apache.commons.io.IOUtils;
5252
import org.apache.commons.io.output.FileWriterWithEncoding;
53+
import org.apache.commons.lang3.StringUtils;
5354
import org.apache.logging.log4j.LogManager;
5455
import org.apache.logging.log4j.Logger;
5556

@@ -436,4 +437,38 @@ public boolean accept(Path entry) throws IOException {
436437
}
437438
}
438439
}
440+
441+
/**
442+
* <p>
443+
* isImageUrl.
444+
* </p>
445+
*
446+
* @param fileName
447+
* @return true if this is an image file name; false otherwise
448+
* @should return true for image file names
449+
*/
450+
public static boolean isImageFile(String fileName) {
451+
if (StringUtils.isEmpty(fileName)) {
452+
return false;
453+
}
454+
455+
String extension = FilenameUtils.getExtension(fileName);
456+
if (StringUtils.isEmpty(extension)) {
457+
return false;
458+
}
459+
460+
switch (extension.toLowerCase()) {
461+
case "tif":
462+
case "tiff":
463+
case "jpg":
464+
case "jpeg":
465+
case "gif":
466+
case "png":
467+
case "jp2":
468+
return true;
469+
default:
470+
return false;
471+
}
472+
}
473+
439474
}

goobi-viewer-indexer/src/main/resources/config_indexer.xml

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<root>
44
<init>
55
<!-- Hotfolder pause between scans (milliseconds) -->
6-
<sleep>1000</sleep>
6+
<sleep>500</sleep>
77

88
<!-- Minimum required free space to index (MB) -->
99
<minStorageSpace>2048</minStorageSpace>
@@ -125,15 +125,19 @@
125125
<type>audio</type>
126126
<type>video</type>
127127
</physicalElementTypes>
128+
129+
<!-- volumeCheckXPath: XPath expression to check whether the currently processed record has an anchor host. -->
130+
<volumeCheckXPath>/mets:mets/mets:dmdSec[1]/mets:mdWrap[@MDTYPE='MODS']/mets:xmlData/mods:mods/mods:relatedItem[@type='host'][not(@otherType) or @otherType='hierarchical']/mods:recordInfo/mods:recordIdentifier</volumeCheckXPath>
128131
</mets>
129132

130133
<lido>
131134
<!-- imageXPath: XPath expressions for image names/URLs relative to lido:resourceSet -->
132-
<imageXPath>lido:resourceRepresentation[@lido:type='image_master']/lido:linkResource</imageXPath>
135+
<imageXPath>lido:resourceRepresentation[@lido:type='image_master' or @lido:type='PRESENTATION']/lido:linkResource</imageXPath>
133136
<imageXPath>lido:resourceRepresentation[@lido:type='http://terminology.lido-schema.org/resourceRepresentation_type/provided_representation']/lido:linkResource</imageXPath>
134137
<imageXPath>lido:resourceRepresentation[@lido:type='http://terminology.lido-schema.org/lido00464']/lido:linkResource</imageXPath>
135138
<imageXPath>lido:resourceRepresentation[@lido:type='image_overview']/lido:linkResource</imageXPath>
136-
<imageXPath>lido:resourceID</imageXPath>
139+
<imageXPath>lido:resourceRepresentation[@lido:type='PRESENTATION']/lido:linkResource</imageXPath>
140+
<imageXPath>lido:resourceID</imageXPath>
137141
</lido>
138142

139143
<email>
@@ -185,6 +189,10 @@
185189
Should be turned off when indexing large numbers of records at once or if the storage is slow. Default is true. -->
186190
<countHotfolderFiles>true</countHotfolderFiles>
187191

192+
<!-- If true, allow the indexer to read file information of external image URLs in order to index image width and height.
193+
This is useful for caching information about image dimensions for IIIF resources and image downloads, but requires additional time and possibly resources from the targeted server. -->
194+
<loadExternalImageInfos>true</loadExternalImageInfos>
195+
188196
<!-- authorityDataCache/@enabled: If true, previously retrieved authority data records will be cached within the application
189197
for the duration of its lifetime. Default is true. -->
190198
<authorityDataCache enabled="true">
@@ -263,8 +271,6 @@
263271
<item>lido:lidoRecID</item>
264272
<item>mets:xmlData/mods:mods/mods:identifier[@type="ppn" or @type="PPN"]</item>
265273
<item>mets:xmlData/mods:mods/mods:recordInfo/mods:recordIdentifier</item>
266-
<item>mets:xmlData/mods:mods/mods:identifier[@type="kuni"]</item>
267-
<item>mets:xmlData/mods:mods/mods:identifier[@type="rosdok_document_id"]</item>
268274
<item>/mets:mets/@OBJID</item>
269275
<item>lido:administrativeMetadata/lido:recordWrap/lido:recordID </item>
270276
<item>//denkxweb:recId</item>
@@ -275,7 +281,6 @@
275281
<getnode>first</getnode>
276282
<addToDefault>true</addToDefault>
277283
<addUntokenizedVersion>false</addUntokenizedVersion>
278-
<replace string="rosdok/ppn"></replace>
279284
<replace string="http://ld.zdb-services.de/resource/organisations/DE-MUS-814819/"></replace>
280285
<replace regex="https?://hdl.handle.net/([0-9]+)/([0-9]+).([0-9]+)">$1_$2_$3</replace>
281286
</item>
@@ -405,7 +410,8 @@
405410
<item>mets:xmlData/mods:mods/mods:accessCondition[@type='moving wall']/@type</item>
406411
<item>lido:descriptiveMetadata/lido:objectClassificationWrap/lido:classificationWrap/lido:classification[@lido:type="restriction"]/lido:conceptID[@lido:type="http://terminology.lido-schema.org/identifier_type/local_identifier"]</item>
407412
<item>dc:rights[not(contains(text(),'http'))]</item>
408-
<item>ead:did/ead:accesscondition</item>
413+
<item>ead:archdesc/ead:dsc/ead:accessrestrict</item>
414+
<item>ead:dsc/ead:accessrestrict[1]</item>
409415
</list>
410416
</xpath>
411417

@@ -765,6 +771,8 @@
765771
<item>mets:xmlData/mods:mods/mods:language/mods:languageTerm[@type='code'][@authority='iso639-2b']</item>
766772
<item>mets:xmlData/mods:mods/mods:language/mods:languageTerm[@type='code'][@authority='rfc3066']</item>
767773
<item>dc:language</item>
774+
<item>ead:archdesc/ead:did/ead:langmaterial[@label='Language']/ead:language[1]</item>
775+
<item>ead:did/ead:langmaterial[@label='Language']/ead:language[1]</item>
768776
</list>
769777
</xpath>
770778
<onefield>false</onefield>
@@ -2864,6 +2872,7 @@
28642872
<xpath>lido:descriptiveMetadata/lido:objectIdentificationWrap/lido:objectMeasurementsWrap/lido:objectMeasurementsSet/lido:objectMeasurements</xpath>
28652873
<addToDefault>false</addToDefault>
28662874
<groupEntity type="OTHER">
2875+
<field name="MD_VALUE">lido:measurementsSet[lido:measurementType="diameter"]/lido:measurementValue</field>
28672876
<field name="MD_DIAMETER">lido:measurementsSet[lido:measurementType="diameter"]/lido:measurementValue</field>
28682877
<field name="MD_DIAMETERUNIT">lido:measurementsSet[lido:measurementType="diameter"]/lido:measurementUnit</field>
28692878
<field name="MD_WEIGHT">lido:measurementsSet[lido:measurementType="weight"]/lido:measurementValue</field>
@@ -3415,8 +3424,13 @@
34153424
<MD_SCOPECONTENT>
34163425
<list>
34173426
<item>
3418-
<xpath>mets:xmlData/mods:mods/mods:extension/intranda:intranda/intranda:ScopeContent
3419-
</xpath>
3427+
<xpath>
3428+
<list>
3429+
<item>mets:xmlData/mods:mods/mods:extension/intranda:intranda/intranda:ScopeContent</item>
3430+
<item>ead:archdesc/ead:dsc/ead:scopecontent[1]</item>
3431+
<item>ead:dsc/ead:scopecontent[1]</item>
3432+
</list>
3433+
</xpath>
34203434
<addToDefault>true</addToDefault>
34213435
<onefield>false</onefield>
34223436
</item>

0 commit comments

Comments
 (0)