Skip to content

Commit

Permalink
Bin merge test & input validation (#11275)
Browse files Browse the repository at this point in the history
* Input validation
* squash bins
  • Loading branch information
fuzhaoyuan authored Jan 8, 2025
1 parent 600691b commit 2cbefb0
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -135,42 +135,54 @@ public boolean isCategoricalClinicalDataFilter(ClinicalDataFilter clinicalDataFi
* Merge the range of numerical bins in DataFilters to reduce the number of scans that runs on the database when filtering.
*/
public static <T extends DataFilter> List<T> mergeDataFilters(List<T> filters) {
boolean isNonNumericalOnly = true;
// this should throw error or move to all binning endpoints in the future for input validation
if (!areValidFilters(filters)) {
return filters;
}

boolean hasNumericalValue = false;
List<T> mergedDataFilters = new ArrayList<>();

for (T filter : filters) {
List<DataFilterValue> mergedValues = new ArrayList<>();
List<DataFilterValue> nonNumericalValues = new ArrayList<>();

// record the start and end of current merging range
BigDecimal mergedStart = null;
BigDecimal mergedEnd = null;
// for each value
for (DataFilterValue dataFilterValue : filter.getValues()) {
// leave non-numerical values as they are
// if it is non-numerical, leave it as is
if (dataFilterValue.getValue() != null) {
nonNumericalValues.add(dataFilterValue);
continue;
}
// else it is numerical so start merging process
hasNumericalValue = true;
BigDecimal start = dataFilterValue.getStart();
BigDecimal end = dataFilterValue.getEnd();

// if current merging range is null, we take current bin's range
if (mergedStart == null && mergedEnd == null) {
mergedStart = start;
mergedEnd = end;
}
// else we already has a merging range, we check if this one is consecutive of our range
else if (mergedEnd.equals(start)) {
// if true, we expand our range
mergedEnd = end;
}
// merge adjacent numerical bins
else {
isNonNumericalOnly = false;
BigDecimal start = dataFilterValue.getStart();
BigDecimal end = dataFilterValue.getEnd();

if (mergedStart == null && mergedEnd == null) {
mergedStart = start;
mergedEnd = end;
}
else if (mergedEnd.equals(start)) {
mergedEnd = end;
} else {
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null));
mergedStart = start;
mergedEnd = end;
}
// otherwise it's a gap, so we save our current range first, and then use current bin to start the next range
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd));
mergedStart = start;
mergedEnd = end;
}
}

if (!isNonNumericalOnly) {
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd, null));
// in the end we need to save the final range, but if everything is non-numerical then no need to
if (hasNumericalValue) {
mergedValues.add(new DataFilterValue(mergedStart, mergedEnd));
}
mergedValues.addAll(nonNumericalValues);
filter.setValues(mergedValues);
Expand All @@ -179,4 +191,63 @@ else if (mergedEnd.equals(start)) {

return mergedDataFilters;
}

public static <T extends DataFilter> boolean areValidFilters(List<T> filters) {
if (filters == null || filters.isEmpty()) {
return false;
}

for (T filter : filters) {
if (!isValidFilter(filter)) {
return false;
}
}
return true;
}

private static <T extends DataFilter> boolean isValidFilter(T filter) {
if (filter == null || filter.getValues() == null || filter.getValues().isEmpty()) {
return false;
}

BigDecimal start = null;
BigDecimal end = null;
for (DataFilterValue value : filter.getValues()) {
if (!validateDataFilterValue(value, start, end)) {
return false;
}
// update start and end values to check next bin range
if (value.getStart() != null) {
start = value.getStart();
}
if (value.getEnd() != null) {
end = value.getEnd();
}
}
return true;
}

private static boolean validateDataFilterValue(DataFilterValue value, BigDecimal lastStart, BigDecimal lastEnd) {
// non-numerical value should not have numerical value
if (value.getValue() != null) {
return value.getStart() == null && value.getEnd() == null;
}

// check if start < end
if (value.getStart() != null && value.getEnd() != null
&& value.getStart().compareTo(value.getEnd()) >= 0) {
return false;
}

// check if start stays increasing and no overlapping
if (value.getStart() != null
&& ((lastStart != null && lastStart.compareTo(value.getStart()) >= 0)
|| (lastEnd != null && value.getStart().compareTo(lastEnd) < 0))) {
return false;
}

// check if end stays increasing
return value.getEnd() == null || lastEnd == null
|| lastEnd.compareTo(value.getEnd()) < 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ public class DataFilterValue implements Serializable {

public DataFilterValue() {}

public DataFilterValue(BigDecimal start, BigDecimal end, String value) {
public DataFilterValue(BigDecimal start, BigDecimal end) {
this.start = start;
this.end = end;
}

public DataFilterValue(String value) {
this.value = value;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ public class StudyViewFilterHelperTest {
public void testMergeDataFilterNumericalContinuousValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-5), BigDecimal.valueOf(-1), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1), BigDecimal.valueOf(3), null));
values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(7), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-5), BigDecimal.valueOf(-1)));
values.add(new DataFilterValue(BigDecimal.valueOf(-1), BigDecimal.valueOf(3)));
values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(7)));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
Expand All @@ -36,10 +36,10 @@ public void testMergeDataFilterNumericalContinuousValues() {
public void testMergeDataFilterNumericalDiscontinuousValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.75), BigDecimal.valueOf(-1.5), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.5), BigDecimal.valueOf(-1.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25)));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2)));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.75), BigDecimal.valueOf(-1.5)));
values.add(new DataFilterValue(BigDecimal.valueOf(-1.5), BigDecimal.valueOf(-1.25)));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
Expand All @@ -60,9 +60,9 @@ public void testMergeDataFilterNumericalDiscontinuousValues() {
public void testMergeDataFilterNumericalInfiniteValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(null, BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2), null, null));
values.add(new DataFilterValue(null, BigDecimal.valueOf(-2.25)));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2)));
values.add(new DataFilterValue(BigDecimal.valueOf(-2), null));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
Expand All @@ -79,9 +79,9 @@ public void testMergeDataFilterNumericalInfiniteValues() {
public void testMergeDataFilterNumericalNonNumericalValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25), null));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2), null));
values.add(new DataFilterValue(null, null, "NA"));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.5), BigDecimal.valueOf(-2.25)));
values.add(new DataFilterValue(BigDecimal.valueOf(-2.25), BigDecimal.valueOf(-2)));
values.add(new DataFilterValue("NA"));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
Expand All @@ -99,12 +99,116 @@ public void testMergeDataFilterNumericalNonNumericalValues() {
public void testMergeDataFilterNonNumericalOnlyValues() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(null, null, "NA"));
values.add(new DataFilterValue("NA"));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
String value = mergedDataFilterValues.getFirst().getValue();
assertEquals("NA", value);
}

// invalid null / empty -> unprocessed null / empty
@Test
public void testMergeDataFilterEmptyValidation() {
List<GenomicDataFilter> mergedUninstantiatedFilters = StudyViewFilterHelper.mergeDataFilters(null);
assertNull(mergedUninstantiatedFilters);

List<GenomicDataFilter> mergedEmptyFilters = StudyViewFilterHelper.mergeDataFilters(new ArrayList<>());
assertEquals(0, mergedEmptyFilters.size());

List<GenomicDataFilter> uninstantiatedDataFilters = new ArrayList<>();
uninstantiatedDataFilters.add(null);
List<GenomicDataFilter> mergedUninstantiatedDataFilters = StudyViewFilterHelper.mergeDataFilters(uninstantiatedDataFilters);
assertNull(mergedUninstantiatedDataFilters.getFirst());

List<GenomicDataFilter> uninstantiatedValueFilters = new ArrayList<>();
uninstantiatedValueFilters.add(new GenomicDataFilter(null, null, null));
List<GenomicDataFilter> mergedUninstantiatedValueFilters = StudyViewFilterHelper.mergeDataFilters(uninstantiatedValueFilters);
assertNull(mergedUninstantiatedValueFilters.getFirst().getValues());

List<GenomicDataFilter> emptyValueFilters = new ArrayList<>();
emptyValueFilters.add(new GenomicDataFilter(null, null, new ArrayList<>()));
List<GenomicDataFilter> mergedEmptyValueFilters = StudyViewFilterHelper.mergeDataFilters(emptyValueFilters);
assertEquals(0, mergedEmptyValueFilters.getFirst().getValues().size());
}

// invalid (2, 3, "NA"] -> unprocessed (2, 3, "NA"]
@Test
public void testMergeDataFilterNonNumericalValidation() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> invalidValues = new ArrayList<>();
DataFilterValue invalidValue = new DataFilterValue("NA");
invalidValue.setStart(BigDecimal.valueOf(2));
invalidValue.setEnd(BigDecimal.valueOf(3));
invalidValues.add(invalidValue);
genomicDataFilters.add(new GenomicDataFilter(null, null, invalidValues));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal start = mergedDataFilterValues.getFirst().getStart();
BigDecimal end = mergedDataFilterValues.getFirst().getEnd();
String value = mergedDataFilterValues.getFirst().getValue();
assertEquals(0, BigDecimal.valueOf(2).compareTo(start));
assertEquals(0, BigDecimal.valueOf(3).compareTo(end));
assertEquals("NA", value);
}

// invalid (42, 6] -> unprocessed (42, 6]
@Test
public void testMergeDataFilterRangeValidation() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> invalidValues = new ArrayList<>();
invalidValues.add(new DataFilterValue(BigDecimal.valueOf(42), BigDecimal.valueOf(6)));
genomicDataFilters.add(new GenomicDataFilter(null, null, invalidValues));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal start = mergedDataFilterValues.getFirst().getStart();
BigDecimal end = mergedDataFilterValues.getFirst().getEnd();
assertEquals(0, BigDecimal.valueOf(42).compareTo(start));
assertEquals(0, BigDecimal.valueOf(6).compareTo(end));
}

// invalid (3, 5], (1, 3] -> unprocessed (3, 5], (1, 3]
@Test
public void testMergeDataFilterContinuousValidation() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(5)));
values.add(new DataFilterValue(BigDecimal.valueOf(1), BigDecimal.valueOf(3)));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal firstStart = mergedDataFilterValues.getFirst().getStart();
BigDecimal firstEnd = mergedDataFilterValues.getFirst().getEnd();
assertEquals(0, BigDecimal.valueOf(3).compareTo(firstStart));
assertEquals(0, BigDecimal.valueOf(5).compareTo(firstEnd));
BigDecimal secondStart = mergedDataFilterValues.get(1).getStart();
BigDecimal secondEnd = mergedDataFilterValues.get(1).getEnd();
assertEquals(0, BigDecimal.valueOf(1).compareTo(secondStart));
assertEquals(0, BigDecimal.valueOf(3).compareTo(secondEnd));
}

// invalid (3, 5], (2, 4] -> unprocessed (3, 5], (2, 4]
@Test
public void testMergeDataFilterNoOverlapValidation() {
List<GenomicDataFilter> genomicDataFilters = new ArrayList<>();
List<DataFilterValue> values = new ArrayList<>();
values.add(new DataFilterValue(BigDecimal.valueOf(3), BigDecimal.valueOf(5)));
values.add(new DataFilterValue(BigDecimal.valueOf(2), BigDecimal.valueOf(4)));
genomicDataFilters.add(new GenomicDataFilter(null, null, values));

List<GenomicDataFilter> mergedGenomicDataFilters = StudyViewFilterHelper.mergeDataFilters(genomicDataFilters);
List<DataFilterValue> mergedDataFilterValues = mergedGenomicDataFilters.getFirst().getValues();
BigDecimal firstStart = mergedDataFilterValues.getFirst().getStart();
BigDecimal firstEnd = mergedDataFilterValues.getFirst().getEnd();
assertEquals(0, BigDecimal.valueOf(3).compareTo(firstStart));
assertEquals(0, BigDecimal.valueOf(5).compareTo(firstEnd));
BigDecimal secondStart = mergedDataFilterValues.get(1).getStart();
BigDecimal secondEnd = mergedDataFilterValues.get(1).getEnd();
assertEquals(0, BigDecimal.valueOf(2).compareTo(secondStart));
assertEquals(0, BigDecimal.valueOf(4).compareTo(secondEnd));
}
}
1 change: 1 addition & 0 deletions src/test/resources/clickhouse_data.sql
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) val
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (5,'2,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (11,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (12,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (14,'1,2,3,4,5,6,7,8,9,10,11,12,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values (15,'1,2,3,4,');
insert into genetic_profile_samples (genetic_profile_id,ordered_sample_list) values(10,'1,2,3,4,5,6,7,8,9,10,11,');
Expand Down

0 comments on commit 2cbefb0

Please sign in to comment.