From e8607fe183850cb261bb504add0a98024aac3e6a Mon Sep 17 00:00:00 2001 From: krwong <69482343+krwong@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:54:02 -0400 Subject: [PATCH] BXC-4603 group by multiple fields (#95) * group by more than one field * fix group by more than one field * remove unused code * rename groupField to groupFields, remove generateOneGroupMapping, join group values in multiMemberGroupSet, other small fixes * add helper method to build key, simplify number of cases in if statement * check all entries in list are empty * split groupFields, fix test, add sync multiple group test * add another sync multiple group test --- .../migration/cdm/GroupMappingCommand.java | 3 +- .../cdm/options/GroupMappingOptions.java | 15 +- .../cdm/services/GroupMappingService.java | 91 +++++++---- .../cdm/services/sips/WorkGenerator.java | 1 - .../migration/cdm/GroupMappingCommandIT.java | 39 +++++ .../migration/cdm/PermissionsCommandIT.java | 3 +- .../AggregateFileMappingServiceTest.java | 2 +- .../cdm/services/GroupMappingServiceTest.java | 142 +++++++++++++++++- .../cdm/services/PermissionsServiceTest.java | 2 +- .../cdm/services/SipServiceTest.java | 4 +- 10 files changed, 249 insertions(+), 53 deletions(-) diff --git a/src/main/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommand.java b/src/main/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommand.java index db9810d3..78c92ae1 100644 --- a/src/main/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommand.java +++ b/src/main/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommand.java @@ -7,7 +7,6 @@ import java.nio.file.Path; import edu.unc.lib.boxc.migration.cdm.options.GroupMappingSyncOptions; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import edu.unc.lib.boxc.migration.cdm.exceptions.MigrationException; @@ -120,7 +119,7 @@ private void initialize() throws IOException { } private void validateOptions(GroupMappingOptions options) { - if (StringUtils.isBlank(options.getGroupField())) { + if (options.getGroupFields().isEmpty()) { throw new IllegalArgumentException("Must provide an group field name"); } } diff --git a/src/main/java/edu/unc/lib/boxc/migration/cdm/options/GroupMappingOptions.java b/src/main/java/edu/unc/lib/boxc/migration/cdm/options/GroupMappingOptions.java index f3b0cf96..a9f794f2 100644 --- a/src/main/java/edu/unc/lib/boxc/migration/cdm/options/GroupMappingOptions.java +++ b/src/main/java/edu/unc/lib/boxc/migration/cdm/options/GroupMappingOptions.java @@ -2,6 +2,8 @@ import picocli.CommandLine.Option; +import java.util.List; + /** * Options for generating object grouping mappings * @@ -10,10 +12,11 @@ public class GroupMappingOptions { @Option(names = {"-n", "--field-name"}, + split = ",", description = { - "Name of the CDM export field to perform grouping on."}, + "Name(s) of the CDM export field to perform grouping on."}, defaultValue = "file") - private String groupField; + private List groupFields; @Option(names = {"-u", "--update"}, description = { @@ -31,12 +34,12 @@ public class GroupMappingOptions { description = "Overwrite mapping file if one already exists") private boolean force; - public String getGroupField() { - return groupField; + public List getGroupFields() { + return groupFields; } - public void setGroupField(String groupField) { - this.groupField = groupField; + public void setGroupFields(List groupFields) { + this.groupFields = groupFields; } public boolean getUpdate() { diff --git a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingService.java b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingService.java index 975e6cad..7c5c0b24 100644 --- a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingService.java +++ b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingService.java @@ -58,7 +58,7 @@ public class GroupMappingService { private CdmFieldService fieldService; private List exportFields; - public void generateMapping(GroupMappingOptions options) throws IOException { + public void generateMapping(GroupMappingOptions options) throws Exception { assertProjectStateValid(); ensureMappingState(options); @@ -88,37 +88,7 @@ public void generateMapping(GroupMappingOptions options) throws IOException { // Return set of all group keys that have at least 2 records in them var multiMemberGroupSet = new HashSet(); - ResultSet groupRs = stmt.executeQuery("select " + options.getGroupField() - + " from " + CdmIndexService.TB_NAME - + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null" - + " group by " + options.getGroupField() - + " having count(*) > 1"); - while (groupRs.next()) { - var groupValue = groupRs.getString(1); - if (StringUtils.isBlank(groupValue)) { - continue; - } - multiMemberGroupSet.add(groupValue); - } - - ResultSet rs = stmt.executeQuery("select " + CdmFieldInfo.CDM_ID + ", " + options.getGroupField() - + " from " + CdmIndexService.TB_NAME - + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null" - + " order by " + CdmFieldInfo.CDM_ID + " ASC"); - while (rs.next()) { - String cdmId = rs.getString(1); - String matchedValue = rs.getString(2); - - // Add empty mapping for records either not in groups or in groups with fewer than 2 members - if (StringUtils.isBlank(matchedValue) || !multiMemberGroupSet.contains(matchedValue)) { - log.debug("No matching field for object {}", cdmId); - csvPrinter.printRecord(cdmId, null); - continue; - } - - String groupKey = GroupMappingInfo.GROUPED_WORK_PREFIX + options.getGroupField() + ":" + matchedValue; - csvPrinter.printRecord(cdmId, groupKey); - } + generateMultipleGroupMapping(options, stmt, multiMemberGroupSet, csvPrinter); } catch (SQLException e) { throw new MigrationException("Error interacting with export index", e); } finally { @@ -135,6 +105,55 @@ public void generateMapping(GroupMappingOptions options) throws IOException { } } + private void generateMultipleGroupMapping(GroupMappingOptions options, Statement stmt, + Set multiMemberGroupSet, CSVPrinter csvPrinter) throws Exception { + int numberGroups = options.getGroupFields().size(); + String multipleGroups = String.join(", ", options.getGroupFields()); + + ResultSet groupRs = stmt.executeQuery("select " + multipleGroups + + " from " + CdmIndexService.TB_NAME + + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null" + + " group by " + multipleGroups + + " having count(*) > 1"); + while (groupRs.next()) { + List groupValues = new ArrayList<>(); + for (int i = 1; i < numberGroups + 1; i++) { + var groupValue = groupRs.getString(i); + groupValues.add(groupValue); + } + var multipleGroupValues = buildKey(groupValues); + multiMemberGroupSet.add(multipleGroupValues); + } + + ResultSet rs = stmt.executeQuery("select " + CdmFieldInfo.CDM_ID + ", " + multipleGroups + + " from " + CdmIndexService.TB_NAME + + " where " + CdmIndexService.ENTRY_TYPE_FIELD + " is null" + + " order by " + CdmFieldInfo.CDM_ID + " ASC"); + while (rs.next()) { + String cdmId = rs.getString(1); + List matchedValues = new ArrayList<>(); + for (int i = 2; i < numberGroups + 2; i++) { + var matchedValue = rs.getString(i); + matchedValues.add(matchedValue); + } + var multipleMatchedValues = buildKey(matchedValues); + + // Add empty mapping for records either not in groups or in groups with fewer than 2 members + if (multipleMatchedValues == null || !multiMemberGroupSet.contains(multipleMatchedValues)) { + log.debug("No matching field for object {}", cdmId); + csvPrinter.printRecord(cdmId, null); + continue; + } + + List listGroups = new ArrayList<>(); + for (int i = 0; i < numberGroups; i++) { + listGroups.add(options.getGroupFields().get(i) + ":" + matchedValues.get(i)); + } + String groupKey = GroupMappingInfo.GROUPED_WORK_PREFIX + String.join(",", listGroups); + csvPrinter.printRecord(cdmId, groupKey); + } + } + private void assertProjectStateValid() { if (project.getProjectProperties().getIndexedDate() == null) { throw new InvalidProjectStateException("Project must be indexed prior to generating source mappings"); @@ -166,6 +185,14 @@ private void ensureMappingState(GroupMappingOptions options) { } } + private String buildKey(List values) { + String joinedValues = null; + if (!values.stream().allMatch(String::isEmpty)) { + joinedValues = String.join(",", values); + } + return joinedValues; + } + /** * Merge existing mappings with updated mappings, writing to temporary files as intermediates * @param options diff --git a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/sips/WorkGenerator.java b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/sips/WorkGenerator.java index e015bd26..714b1061 100644 --- a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/sips/WorkGenerator.java +++ b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/sips/WorkGenerator.java @@ -4,7 +4,6 @@ import edu.unc.lib.boxc.deposit.impl.model.DepositModelHelpers; import edu.unc.lib.boxc.migration.cdm.exceptions.InvalidProjectStateException; import edu.unc.lib.boxc.migration.cdm.model.DestinationSipEntry; -import edu.unc.lib.boxc.migration.cdm.model.GroupMappingInfo; import edu.unc.lib.boxc.migration.cdm.model.PermissionsInfo; import edu.unc.lib.boxc.migration.cdm.model.SourceFilesInfo; import edu.unc.lib.boxc.migration.cdm.options.SipGenerationOptions; diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommandIT.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommandIT.java index a2401780..2080e36d 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommandIT.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/GroupMappingCommandIT.java @@ -47,6 +47,18 @@ public void generateBasicMatchSucceedsTest() throws Exception { assertTrue(Files.exists(project.getGroupMappingPath())); } + @Test + public void generateMultipleMatchSucceedsTest() throws Exception { + indexExportSamples(); + String[] args = new String[] { + "-w", project.getProjectPath().toString(), + "group_mapping", "generate", + "-n", "groupa,dcmi"}; + executeExpectSuccess(args); + + assertTrue(Files.exists(project.getGroupMappingPath())); + } + @Test public void generateAndSyncTest() throws Exception { indexExportSamples(); @@ -74,6 +86,33 @@ public void generateAndSyncTest() throws Exception { } } + @Test + public void generateAndSyncMultipleGroupsTest() throws Exception { + indexExportSamples(); + String[] args = new String[] { + "-w", project.getProjectPath().toString(), + "group_mapping", "generate", + "-n", "groupa,dcmi" }; + executeExpectSuccess(args); + + assertTrue(Files.exists(project.getGroupMappingPath())); + + String[] args2 = new String[] { + "-w", project.getProjectPath().toString(), + "group_mapping", "sync" }; + executeExpectSuccess(args2); + + var indexService = new CdmIndexService(); + indexService.setProject(project); + Connection conn = null; + try { + conn = indexService.openDbConnection(); + assertFilesGrouped(conn, "25", "26"); + } finally { + CdmIndexService.closeDbConnection(conn); + } + } + @Test public void statusNotGenerated() throws Exception { String[] args = new String[] { diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/PermissionsCommandIT.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/PermissionsCommandIT.java index c000390a..1b07ec50 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/PermissionsCommandIT.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/PermissionsCommandIT.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -404,7 +405,7 @@ private void assertMappingCount(int count) throws IOException { private void setupGroupedIndex() throws Exception { var options = new GroupMappingOptions(); - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); testHelper.getGroupMappingService().generateMapping(options); var syncOptions = new GroupMappingSyncOptions(); syncOptions.setSortField("file"); diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/AggregateFileMappingServiceTest.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/AggregateFileMappingServiceTest.java index 5dfa21ca..cbe25e4d 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/AggregateFileMappingServiceTest.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/AggregateFileMappingServiceTest.java @@ -210,7 +210,7 @@ public void generateWithMatchAddToBottomTest() throws Exception { private void setupGroupedIndex() throws Exception { var options = new GroupMappingOptions(); - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); testHelper.getGroupMappingService().generateMapping(options); var syncOptions = new GroupMappingSyncOptions(); syncOptions.setSortField("file"); diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingServiceTest.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingServiceTest.java index 1c8842b8..44b6584b 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingServiceTest.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/GroupMappingServiceTest.java @@ -152,7 +152,7 @@ public void generateForceRunTest() throws Exception { assertGroupAMappingsPresent(info); options.setForce(true); - options.setGroupField("digitc"); + options.setGroupFields(Arrays.asList("digitc")); service.generateMapping(options); @@ -181,7 +181,7 @@ public void generateUpdateRunTest() throws Exception { assertEquals(5, info.getMappings().size()); options.setUpdate(true); - options.setGroupField("digitc"); + options.setGroupFields(Arrays.asList("digitc")); service.generateMapping(options); @@ -201,7 +201,7 @@ public void generateUpdateRunTest() throws Exception { public void generateUpdateForceRunTest() throws Exception { indexExportSamples(); GroupMappingOptions options = makeDefaultOptions(); - options.setGroupField("digitc"); + options.setGroupFields(Arrays.asList("digitc")); service.generateMapping(options); GroupMappingInfo info = service.loadMappings(); @@ -214,7 +214,7 @@ public void generateUpdateForceRunTest() throws Exception { options.setUpdate(true); options.setForce(true); - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); service.generateMapping(options); GroupMappingInfo info2 = service.loadMappings(); @@ -305,7 +305,7 @@ public void syncSingleRunTest() throws Exception { public void syncSecondRunWithCleanupTest() throws Exception { indexExportSamples(); GroupMappingOptions options = makeDefaultOptions(); - options.setGroupField("digitc"); + options.setGroupFields(Arrays.asList("digitc")); service.generateMapping(options); service.syncMappings(makeDefaultSyncOptions()); @@ -326,7 +326,7 @@ public void syncSecondRunWithCleanupTest() throws Exception { CdmIndexService.closeDbConnection(conn); } - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); options.setForce(true); service.generateMapping(options); @@ -352,6 +352,134 @@ public void syncSecondRunWithCleanupTest() throws Exception { } } + @Test + public void generateSingleRunMultipleGroupsTest() throws Exception { + indexExportSamples(); + GroupMappingOptions options = new GroupMappingOptions(); + options.setGroupFields(Arrays.asList("groupa", "dcmi")); + service.generateMapping(options); + + GroupMappingInfo info = service.loadMappings(); + + assertMappingPresent(info, "25", "groupa:group1,dcmi:Image"); + assertMappingPresent(info, "26", "groupa:group1,dcmi:Image"); + assertMappingPresent(info, "27", null); + assertMappingPresent(info, "28", null); + assertMappingPresent(info, "29", null); + assertEquals(5, info.getMappings().size()); + + assertGroupingPresent(info, "groupa:group1,dcmi:Image", "25", "26"); + assertEquals(1, info.getGroupedMappings().size()); + + assertMappedDatePresent(); + } + + @Test + public void syncSingleRunMultipleGroupsTest() throws Exception { + indexExportSamples(); + GroupMappingOptions options = new GroupMappingOptions(); + options.setGroupFields(Arrays.asList("groupa", "dcmi")); + service.generateMapping(options); + + service.syncMappings(makeDefaultSyncOptions()); + + Connection conn = null; + try { + GroupMappingInfo info = service.loadMappings(); + conn = indexService.openDbConnection(); + assertWorkSynced(conn, "groupa:group1", "Redoubt C", "2005-11-23"); + assertWorkSynced(conn, "dcmi:Image", "Redoubt C", "2005-11-23"); + assertFilesGrouped(conn, "groupa:group1,dcmi:Image", "25", "26"); + assertFileHasOrder(conn, "25", 1); + assertFileHasOrder(conn, "26", 0); + // Group key with a single child should not be grouped + assertNumberOfGroups(conn, 1); + assertParentIdsPresent(conn, "groupa:group1,dcmi:Image", null); + assertSyncedDatePresent(); + } finally { + CdmIndexService.closeDbConnection(conn); + } + } + + @Test + public void generateMultipleGroupsForceUpdateRunTest() throws Exception { + indexExportSamples(); + GroupMappingOptions options = makeDefaultOptions(); + service.generateMapping(options); + + GroupMappingInfo info = service.loadMappings(); + assertEquals(5, info.getMappings().size()); + + options.setUpdate(true); + options.setForce(true); + options.setGroupFields(Arrays.asList("groupa", "dcmi")); + + service.generateMapping(options); + + // Mapping state should have been overwritten + GroupMappingInfo info2 = service.loadMappings(); + assertMappingPresent(info2, "25", "groupa:group1,dcmi:Image"); + assertMappingPresent(info2, "26", "groupa:group1,dcmi:Image"); + assertMappingPresent(info2, "27", null); + assertMappingPresent(info2, "28", null); + assertMappingPresent(info2, "29", null); + assertEquals(5, info2.getMappings().size()); + + assertMappedDatePresent(); + } + + @Test + public void syncSecondRunWithCleanupMultipleGroupsTest() throws Exception { + indexExportSamples(); + GroupMappingOptions options = makeDefaultOptions(); + options.setGroupFields(Arrays.asList("digitc")); + service.generateMapping(options); + + service.syncMappings(makeDefaultSyncOptions()); + + Connection conn = null; + try { + GroupMappingInfo info = service.loadMappings(); + conn = indexService.openDbConnection(); + assertWorkSynced(conn, "digitc:2005-11-10", "Redoubt C", "2005-11-23"); + assertFilesGrouped(conn, "digitc:2005-11-10", "25", "28", "29"); + assertFileHasOrder(conn, "25", 0); + assertFileHasOrder(conn, "28", 1); + assertFileHasOrder(conn, "29", 2); + assertNumberOfGroups(conn, 1); + assertParentIdsPresent(conn, "digitc:2005-11-10", null); + assertSyncedDatePresent(); + } finally { + CdmIndexService.closeDbConnection(conn); + } + + options.setGroupFields(Arrays.asList("groupa", "dcmi")); + options.setForce(true); + service.generateMapping(options); + + service.syncMappings(makeDefaultSyncOptions()); + + try { + GroupMappingInfo info = service.loadMappings(); + conn = indexService.openDbConnection(); + assertWorkSynced(conn, "groupa:group1", "Redoubt C", "2005-11-23"); + assertWorkSynced(conn, "dcmi:Image", "Redoubt C", "2005-11-23"); + assertFilesGrouped(conn, "groupa:group1,dcmi:Image", "25", "26"); + assertFileHasOrder(conn, "25", 1); + assertFileHasOrder(conn, "26", 0); + // Group key with a single child should not be grouped + assertNumberOfGroups(conn, 1); + assertParentIdsPresent(conn, "groupa:group1,dcmi:Image", null); + + assertGroupingPresent(info, "groupa:group1,dcmi:Image", "25", "26"); + assertEquals(1, info.getGroupedMappings().size()); + + assertSyncedDatePresent(); + } finally { + CdmIndexService.closeDbConnection(conn); + } + } + private void assertWorkSynced(Connection conn, String workId, String expectedTitle, String expectedCreated) throws Exception { String groupKey = asGroupKey(workId); @@ -437,7 +565,7 @@ private String asGroupKey(String matchValue) { private GroupMappingOptions makeDefaultOptions() { GroupMappingOptions options = new GroupMappingOptions(); - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); return options; } diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/PermissionsServiceTest.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/PermissionsServiceTest.java index 6e84c743..63cb96f7 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/PermissionsServiceTest.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/PermissionsServiceTest.java @@ -599,7 +599,7 @@ private void assertMappingPresent(PermissionsInfo info, String cdmid, String eve private void setupGroupedIndex() throws Exception { var options = new GroupMappingOptions(); - options.setGroupField("groupa"); + options.setGroupFields(Arrays.asList("groupa")); testHelper.getGroupMappingService().generateMapping(options); var syncOptions = new GroupMappingSyncOptions(); syncOptions.setSortField("file"); diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/SipServiceTest.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/SipServiceTest.java index 211d0018..5891fe45 100644 --- a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/SipServiceTest.java +++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/SipServiceTest.java @@ -1513,9 +1513,9 @@ private GroupMappingSyncOptions makeDefaultSyncOptions() { return options; } - private void setupGroupIndex() throws IOException { + private void setupGroupIndex() throws Exception { GroupMappingOptions groupOptions = new GroupMappingOptions(); - groupOptions.setGroupField("groupa"); + groupOptions.setGroupFields(Arrays.asList("groupa")); GroupMappingService groupService = testHelper.getGroupMappingService(); groupService.generateMapping(groupOptions); groupService.syncMappings(makeDefaultSyncOptions());