Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve list detection. Support Korean numeration #332

Merged
merged 1 commit into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,15 @@
import org.verapdf.wcag.algorithms.entities.lists.ListIntervalsCollection;
import org.verapdf.wcag.algorithms.entities.lists.info.ListItemInfo;
import org.verapdf.wcag.algorithms.entities.lists.info.ListItemTextInfo;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm1;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.AlfaLettersListLabelsDetectionAlgorithm2;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.ArabicNumbersListLabelsDetectionAlgorithm;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.RomanNumbersListLabelsDetectionAlgorithm;
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection.*;

import java.util.*;

public class ListLabelsUtils {

private static final Set<Character> labels = new HashSet<>(
Arrays.asList('\u002D', '\u2022', '\u25CF', '\u2714', '\u2717', '\u2794', '\u27A2', '\uE00A', '\uE00C',
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼)
'\uF076', '\u2588', '\u25A0', '\u2013', '\uF0B7', '\uF0A7', '\u25A1', '\uF0A1', '\u25AA', '\u25FC', '\u25CB', '\u203B')); //office labels examples (-, •, ✔, ✗, ●, ➔, ➢), pdf files labels examples (█, ■, , □, , ▪, ◼, ○, ※)
private static final Character o = '\u006F';

public static boolean isListLabel(String value) {
Expand Down Expand Up @@ -70,6 +67,7 @@ public static boolean isListLabels(List<String> listLabels) {
}
return new RomanNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new ArabicNumbersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new KoreanLettersListLabelsDetectionAlgorithm().isListLabels(labels, commonStartLength, commonEndLength) ||
new AlfaLettersListLabelsDetectionAlgorithm1().isListLabels(labels, commonStartLength, commonEndLength) ||
new AlfaLettersListLabelsDetectionAlgorithm2().isListLabels(labels, commonStartLength, commonEndLength);
}
Expand Down Expand Up @@ -142,6 +140,7 @@ public static Set<ListInterval> getListItemsIntervals(List<ListItemTextInfo> ite
ListIntervalsCollection listIntervals = new ListIntervalsCollection(getItemsWithEqualsLabels(itemsInfo));
listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm1().getItemsIntervals(itemsInfo));
listIntervals.putAll(new AlfaLettersListLabelsDetectionAlgorithm2().getItemsIntervals(itemsInfo));
listIntervals.putAll(new KoreanLettersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
listIntervals.putAll(new RomanNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
listIntervals.putAll(new ArabicNumbersListLabelsDetectionAlgorithm().getItemsIntervals(itemsInfo));
return listIntervals.getSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.verapdf.wcag.algorithms.semanticalgorithms.utils.listLabelsDetection;

import java.util.Arrays;
import java.util.List;

/**
 * Detects list labels that enumerate items with the Korean syllables
 * 가, 나, 다, 라, ... 하 (14 labels in total).
 *
 * <p>Each label is exactly one syllable; its ordinal is its 1-based position in
 * {@link #letters}. Ordinals outside {@code 1..letters.size()} have no label and
 * are reported as {@code null}.
 */
public class KoreanLettersListLabelsDetectionAlgorithm extends LettersListLabelsDetectionAlgorithm {

    /** Label syllables in enumeration order; the element at index {@code i} has ordinal {@code i + 1}. */
    protected static final List<Character> letters = Arrays.asList('가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하');

    /**
     * Character class matching one or more label syllables. The same expression is
     * returned for the general, lower-case and upper-case regex hooks.
     */
    private static final String KOREAN_LETTER_REGEX = "[가나다라마바사아자차카타파하]+";

    /**
     * @return the regular expression matching a run of Korean label syllables
     */
    @Override
    protected String getRegex() {
        return KOREAN_LETTER_REGEX;
    }

    /**
     * @return the same expression as {@link #getRegex()}
     */
    @Override
    protected String getLowerCaseRegex() {
        return KOREAN_LETTER_REGEX;
    }

    /**
     * @return the same expression as {@link #getRegex()}
     */
    @Override
    protected String getUpperCaseRegex() {
        return KOREAN_LETTER_REGEX;
    }

    /**
     * Converts a 1-based ordinal to its label.
     *
     * @param number the ordinal of the list item, may be {@code null}
     * @return the single-syllable label, or {@code null} when {@code number} is
     *         {@code null} or lies outside {@code 1..letters.size()}
     */
    @Override
    protected String getStringFromNumber(Integer number) {
        // Guard before unboxing so a missing ordinal yields "no label" instead of an NPE.
        if (number == null) {
            return null;
        }
        return getLettersFromNumber(number);
    }

    /**
     * Converts a label to its 1-based ordinal.
     *
     * @param string the candidate label, may be {@code null}
     * @return the ordinal, or {@code null} when {@code string} is not exactly one
     *         known label syllable
     */
    @Override
    protected Integer getNumberFromString(String string) {
        return getNumberFromLetters(string);
    }

    /**
     * Looks up the syllable for a 1-based ordinal.
     *
     * @param integer the 1-based ordinal
     * @return the syllable as a string, or {@code null} if the ordinal is out of range
     */
    private static String getLettersFromNumber(int integer) {
        // Both bounds are checked: a non-positive ordinal would otherwise index the list at -1 or below.
        if (integer < 1 || integer > letters.size()) {
            return null;
        }
        return letters.get(integer - 1).toString();
    }

    /**
     * Looks up the 1-based ordinal of a single-syllable label.
     *
     * @param s the candidate label
     * @return the ordinal, or {@code null} if {@code s} is {@code null}, is not exactly
     *         one character long, or is not one of the known syllables
     */
    private static Integer getNumberFromLetters(String s) {
        if (s == null || s.length() != 1) {
            return null;
        }
        int index = letters.indexOf(s.charAt(0));
        if (index < 0) {
            return null;
        }
        return index + 1;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public Set<ListInterval> getItemsIntervals(List<ListItemTextInfo> itemsInfo) {
if (number != null) {
number++;
String s = getStringFromNumber(number);
if (!item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) ||
if (s == null || !item.toUpperCase().startsWith(s, start) || !item.startsWith(prefix) ||
isCharMatchRegex(item, start + s.length()) || isBadItem(itemInfo, item, s, start) ||
((!item.substring(start, start + s.length()).matches(getLowerCaseRegex()) || isUpperCase) &&
(!item.substring(start, start + s.length()).matches(getUpperCaseRegex()) || !isUpperCase))) {
Expand Down Expand Up @@ -104,7 +104,7 @@ public Set<ListInterval> getItemsIntervals(List<ListItemTextInfo> itemsInfo) {
continue;
}
//only Roman???
if (!substring.toUpperCase().startsWith(getStringFromNumber(number))) {
if (getStringFromNumber(number) == null || !substring.toUpperCase().startsWith(getStringFromNumber(number))) {
number = null;
continue;
}
Expand Down
Loading