From f28c08fa3136877130f006846a042266b4ee0ea2 Mon Sep 17 00:00:00 2001 From: cbergin Date: Tue, 24 Oct 2023 14:19:43 -0700 Subject: [PATCH 1/2] Update lambdas to not index files with metadata security classification value of Protected B or Protected C --- .../ProcessSQSMessage.java | 45 +++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/ProcessSQSMessage.java b/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/ProcessSQSMessage.java index a96e1ac..6c5018b 100644 --- a/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/ProcessSQSMessage.java +++ b/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/ProcessSQSMessage.java @@ -175,24 +175,41 @@ public String handleRequest(Map event, Context context) { String fileName = filePath.substring(filePath.lastIndexOf("/") + 1); OpenSearchRESTClient restClient = new OpenSearchRESTClient(); - restClient.addIndex(content, fileName, fileDetailsJson, scanStatus); - // Push ID onto SQS for clamAV - logger.log("\nInfo: File parsing complete. Schedule ClamAV scan."); - - // update metadata - boolean metaAdded = GetFileFromWFDMAPI.setIndexedMetadata(wfdmToken, fileId, versionNumber, fileDetailsJson); - if (!metaAdded) { - // We failed to apply the metadata regarding the virus scan status... - // Should we continue to process the data from this point, or just choke? - logger.log("\nERROR: Failed to add metadata to file resource"); + + // We are disabling indexing of files with a security classification of Protected B or Protected C + JSONArray metaArray = fileDetailsJson.getJSONArray("metadata"); + boolean skipIndexing = false; + for (int i = 0; i < metaArray.length(); i++) { + String metadataName = metaArray.getJSONObject(i).getString("metadataName"); + String metadataValue = metaArray.getJSONObject(i).getString("metadataValue"); + + if (metadataName.equals("SecurityClassification") + && (metadataValue.equals("Protected B") || metadataValue.equals("Protected C"))) { + skipIndexing = true; + } } - // after updating metadata, get file info again and update index - fileInfo = GetFileFromWFDMAPI.getFileInformation(wfdmToken, fileId); - fileDetailsJson = new JSONObject(fileInfo); + if (!skipIndexing) { - restClient.addIndex(content, fileName, fileDetailsJson, scanStatus); + restClient.addIndex(content, fileName, fileDetailsJson, scanStatus); + // Push ID onto SQS for clamAV + logger.log("\nInfo: File parsing complete. Schedule ClamAV scan."); + // update metadata + boolean metaAdded = GetFileFromWFDMAPI.setIndexedMetadata(wfdmToken, fileId, versionNumber, fileDetailsJson); + if (!metaAdded) { + // We failed to apply the metadata regarding the virus scan status... + // Should we continue to process the data from this point, or just choke? + logger.log("\nERROR: Failed to add metadata to file resource"); + } + + // after updating metadata, get file info again and update index + fileInfo = GetFileFromWFDMAPI.getFileInformation(wfdmToken, fileId); + fileDetailsJson = new JSONObject(fileInfo); + + restClient.addIndex(content, fileName, fileDetailsJson, scanStatus); + + } } } catch (UnirestException | TransformerConfigurationException | SAXException e) { logger.log("\nError: Failure to recieve file from WFDM" + e.getLocalizedMessage()); From 87d5c2ac6e64a80e13e8b998e73325ff47a147c6 Mon Sep 17 00:00:00 2001 From: cbergin Date: Tue, 24 Oct 2023 16:10:01 -0700 Subject: [PATCH 2/2] add pptx parsing to TikaParseDocument --- .../gov/nrs/wfdm/wfdm_file_index_service/TikaParseDocument.java | 1 + 1 file changed, 1 insertion(+) diff --git a/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/TikaParseDocument.java b/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/TikaParseDocument.java index 09b247c..723ecda 100644 --- a/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/TikaParseDocument.java +++ b/wfdm-file-index-service/src/main/java/ca/bc/gov/nrs/wfdm/wfdm_file_index_service/TikaParseDocument.java @@ -77,6 +77,7 @@ public static String parseStream(InputStream stream, String mimeType) case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": case "application/vnd.ms-excel.sheet.macroEnabled.12": case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": + case "application/vnd.openxmlformats-officedocument.presentationml.presentation": parser = new OOXMLParser(); break; case "application/pdf":