Skip to content

Commit a3a7054

Browse files
committed
git-svn-id: https://unicode.org/repos/unicodetools/trunk@1566 13e8329f-0b23-4da4-9fe8-d0f6fe080806
1 parent 0a03c64 commit a3a7054

File tree

11 files changed

+462
-139
lines changed

11 files changed

+462
-139
lines changed

unicodetools/org/unicode/tools/emoji/CandidateData.java

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
import com.ibm.icu.text.DateFormat;
4040
import com.ibm.icu.text.Transform;
4141
import com.ibm.icu.text.UnicodeSet;
42+
import com.ibm.icu.text.UnicodeSet.SpanCondition;
43+
import com.ibm.icu.text.UnicodeSetSpanner;
4244
import com.ibm.icu.util.ICUException;
4345
import com.ibm.icu.util.ULocale;
4446
import com.ibm.icu.util.VersionInfo;
@@ -150,7 +152,7 @@ private CandidateData(String sourceFile) {
150152
if (line.startsWith("#") || line.isEmpty()) { // comment
151153
continue;
152154
} else if (line.startsWith("U+")) { // data
153-
fixGenderSkin(source); // old source
155+
fixGenderSkin(source); // fix old source. we do it here so we know the properties
154156

155157
source = Utility.fromHex(line);
156158
if (allCharacters.contains(source)) {
@@ -318,6 +320,7 @@ private CandidateData(String sourceFile) {
318320
textPresentation.freeze();
319321

320322
emoji_Modifier_Base.freeze();
323+
321324
emoji_Gender_Base.freeze();
322325
takesSign.freeze();
323326
emoji_Component.freeze();
@@ -392,34 +395,52 @@ private void fixGenderSkin(String source) {
392395
if (source == null) {
393396
return;
394397
}
395-
int single = UnicodeSet.getSingleCodePoint(source);
396-
if (single == Integer.MAX_VALUE) {
397-
return;
398+
if (source.equals("👩‍🦯️")) {
399+
int debug = 0;
398400
}
399-
boolean isModBase = emoji_Modifier_Base.contains(source);
400-
if (isModBase) {
401+
402+
403+
boolean hasModifierBase = emoji_Modifier_Base.containsSome(source)
404+
|| EmojiData.EMOJI_DATA_BETA.getModifierBases().containsSome(source);
405+
if (hasModifierBase) {
406+
// find the point where it occurs; not efficient but we don't care
407+
UnicodeSet all_Emoji_Modifier_Base = new UnicodeSet(emoji_Modifier_Base)
408+
.addAll(EmojiData.EMOJI_DATA_BETA.getModifierBases())
409+
.freeze();
410+
411+
int start = all_Emoji_Modifier_Base.span(source, SpanCondition.NOT_CONTAINED);
412+
int end = all_Emoji_Modifier_Base.span(source, start, SpanCondition.CONTAINED);
413+
414+
String prefix = source.substring(0, end);
415+
String postfix = source.substring(end);
401416
for (String mod : EmojiData.MODIFIERS) {
402-
addCombo(source, source + mod, "", ": " + EmojiData.EMOJI_DATA.getName(mod));
417+
addCombo(source, prefix + mod + postfix, "", ": " + EmojiData.EMOJI_DATA_BETA.getName(mod));
403418
}
404419
}
420+
421+
int single = UnicodeSet.getSingleCodePoint(source);
422+
if (single == Integer.MAX_VALUE) {
423+
return;
424+
}
425+
405426
boolean isGenderBase = emoji_Gender_Base.contains(source);
406427
if (isGenderBase) {
407428
for (String gen : Emoji.GENDER_MARKERS) {
408429
String genSuffix = Emoji.JOINER_STR + gen + Emoji.EMOJI_VARIANT_STRING;
409430
String genPrefix = gen.equals(Emoji.MALE) ? "man " : "woman ";
410431
addCombo(source, source + genSuffix, genPrefix, "");
411-
if (isModBase) {
432+
if (hasModifierBase) {
412433
for (String mod : EmojiData.MODIFIERS) {
413-
addCombo(source, source + mod + genSuffix, genPrefix, ": " + EmojiData.EMOJI_DATA.getName(mod));
434+
addCombo(source, source + mod + genSuffix, genPrefix, ": " + EmojiData.EMOJI_DATA_BETA.getName(mod));
414435
}
415436
}
416437
}
417438
}
418-
if (isGenderBase && isModBase) {
439+
if (isGenderBase && hasModifierBase) {
419440
addComment(source, "Combinations of gender and skin-tone produce 17 more emoji sequences.");
420441
} else if (isGenderBase) {
421442
addComment(source, "Combinations of gender and skin-tone produce 2 more emoji sequences.");
422-
} else if (isModBase) {
443+
} else if (hasModifierBase) {
423444
addComment(source, "Combinations of gender and skin-tone produce 5 more emoji sequences.");
424445
}
425446
// Comment=There will be 55 emoji sequences with combinations of gender and skin-tone
@@ -470,7 +491,7 @@ public int compare(String o1, String o2) {
470491

471492
String cat1 = getCategory(o1);
472493
int catOrder1 = EmojiOrder.STD_ORDER.getGroupOrder(cat1);
473-
494+
474495
String cat2 = getCategory(o2);
475496
int catOrder2 = EmojiOrder.STD_ORDER.getGroupOrder(cat2);
476497
if (catOrder1 != catOrder2) {
@@ -785,7 +806,7 @@ public String transform(String source) {
785806
break main;
786807
}
787808
if (source.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ)) {
788-
temp = EmojiData.EMOJI_DATA.getFallbackName(source);
809+
temp = EmojiData.EMOJI_DATA_BETA.getFallbackName(source);
789810
break main;
790811
}
791812
switch(CountEmoji.Category.getBucket(source)) {
@@ -965,4 +986,16 @@ public String addEmojiVariants(String s1) {
965986
public String getVersionString() {
966987
return "candidates:" + DateFormat.getInstanceForSkeleton("yyyyMMdd", ULocale.ROOT).format(date);
967988
}
989+
990+
/** We don't expect to have any more of these */
991+
@Override
992+
public UnicodeSet getExplicitGender() {
993+
return UnicodeSet.EMPTY;
994+
}
995+
996+
/** We don't expect to have any more of these */
997+
@Override
998+
public UnicodeSet getMultiPersonGroupings() {
999+
return UnicodeSet.EMPTY;
1000+
}
9681001
}

unicodetools/org/unicode/tools/emoji/CountEmoji.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ enum Attribute {
342342
singleton, zwj, skin, gender, role, family, hair, dup
343343
}
344344

345-
enum Category {
345+
public enum Category {
346346
character("char"),
347347
keycap_seq,
348348
flag_seq,
@@ -393,6 +393,10 @@ public String toString() {
393393
public String toStringPlain() {
394394
return displayName;
395395
}
396+
/** added to make migration easier */
397+
static public Category getType(String s) {
398+
return getBucket(s);
399+
}
396400
static public Category getBucket(String s) {
397401
try {
398402
String noVariants = EmojiData.removeEmojiVariants(s);

unicodetools/org/unicode/tools/emoji/EmojiData.java

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,7 @@
1616
import java.util.concurrent.ConcurrentHashMap;
1717
import java.util.regex.Pattern;
1818

19-
import javax.xml.stream.events.Characters;
20-
2119
import org.unicode.cldr.draft.FileUtilities;
22-
import org.unicode.cldr.tool.GenerateBirth;
2320
import org.unicode.cldr.util.Annotations;
2421
import org.unicode.cldr.util.Annotations.AnnotationSet;
2522
import org.unicode.cldr.util.CldrUtility;
@@ -43,10 +40,7 @@
4340
import com.google.common.collect.ImmutableList;
4441
import com.google.common.collect.ImmutableMultimap;
4542
import com.google.common.collect.ImmutableSet;
46-
import com.google.common.collect.ImmutableSet.Builder;
47-
import com.google.common.collect.ImmutableSetMultimap;
4843
import com.google.common.collect.Multimap;
49-
import com.google.common.collect.SortedSetMultimap;
5044
import com.google.common.collect.TreeMultimap;
5145
import com.ibm.icu.dev.util.UnicodeMap;
5246
import com.ibm.icu.lang.CharSequences;
@@ -55,9 +49,6 @@
5549
import com.ibm.icu.text.Transform;
5650
import com.ibm.icu.text.UTF16;
5751
import com.ibm.icu.text.UnicodeSet;
58-
import com.ibm.icu.text.UnicodeSet.SpanCondition;
59-
import com.ibm.icu.text.UnicodeSetSpanner;
60-
import com.ibm.icu.text.UnicodeSetSpanner.CountMethod;
6152
import com.ibm.icu.util.ULocale;
6253
import com.ibm.icu.util.VersionInfo;
6354

@@ -121,6 +112,7 @@ public enum DefaultPresentation {text, emoji}
121112
private UnicodeSet otherHuman;
122113
private UnicodeSet genderBase;
123114
private UnicodeMap<String> toNeutral;
115+
private UnicodeSet multiPersonGrouping;
124116

125117
public static final Splitter semi = Splitter.onPattern("[;#]").trimResults();
126118
public static final Splitter semiOnly = Splitter.onPattern(";").trimResults();
@@ -506,11 +498,13 @@ private EmojiData(VersionInfo version) {
506498

507499
if (DEBUG) System.out.println("rawHairBases: " + rawHairBases.toPattern(false));
508500

509-
explicitGender.addAll(new UnicodeSet("[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🕴 👫-👭]]"))
501+
explicitGender.addAll(new UnicodeSet("[[👦-👩 🧔 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🕴 👫-👭]]"))
510502
.freeze();
511503

512504
explicitHair.addAll(new UnicodeSet("[👱]"))
513505
.freeze();
506+
507+
multiPersonGrouping = new UnicodeSet("[👯 🤼 👫-👭 💏 💑 👪 🤝]");
514508

515509
hairBases.addAll(rawHairBases)
516510
.retainAll(modifierBases)
@@ -915,6 +909,7 @@ public UnicodeSet getSortingChars() {
915909
}
916910

917911
public static final EmojiData EMOJI_DATA = of(Emoji.VERSION_TO_GENERATE);
912+
public static final EmojiData EMOJI_DATA_BETA = of(Emoji.VERSION_BETA);
918913

919914
public UnicodeSet getFlagSequences() {
920915
return flagSequences;
@@ -1387,20 +1382,23 @@ public static void main(String[] args) {
13871382
UnicodeSet explicitGendered = new UnicodeSet()
13881383
.addAll(e11a.maleToOther.keySet())
13891384
.addAll(e11a.femaleToOther.keySet())
1385+
.add(new UnicodeSet("[🧔]"))
13901386
.freeze();
13911387

13921388
UnicodeSet gendered = new UnicodeSet()
13931389
.addAll(e11a.maleToOther.keySet())
13941390
.addAll(e11a.femaleToOther.keySet())
13951391
.addAll(e11a.otherHuman)
13961392
.freeze();
1393+
13971394
UnicodeSet people = new UnicodeSet()
13981395
.addAll(EmojiOrder.BETA_ORDER.majorGroupings.getSet(MajorGroup.People))
13991396
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("body"))
14001397
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("emotion"))
14011398
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("clothing"))
14021399
.retainAll(e11a.allEmojiWithoutDefectives)
14031400
.freeze();
1401+
14041402
diff2("gendered", gendered, "people", people);
14051403

14061404
System.out.println("genderBase:\t" + e11a.getGenderBase().size() + "\t" + e11a.getGenderBase().toPattern(false));
@@ -1891,4 +1889,9 @@ static UnicodeSet getWithoutMods(UnicodeSet chars) {
18911889
public UnicodeSet getGenderBase() {
18921890
return genderBase;
18931891
}
1892+
1893+
@Override
1894+
public UnicodeSet getMultiPersonGroupings() {
1895+
return multiPersonGrouping;
1896+
}
18941897
}

unicodetools/org/unicode/tools/emoji/EmojiDataSource.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,5 +55,7 @@ public default UnicodeSet getEmojiForSortRules() {
5555

5656
public String addEmojiVariants(String s1);
5757
public String getVersionString();
58+
public UnicodeSet getExplicitGender();
59+
public UnicodeSet getMultiPersonGroupings();
5860
}
5961

unicodetools/org/unicode/tools/emoji/EmojiDataSourceCombined.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,18 @@ public String addEmojiVariants(String s1) {
140140
public String getVersionString() {
141141
return emojiData.getVersion() + " + " + candidates.getVersionString();
142142
}
143+
144+
@Override
145+
public UnicodeSet getExplicitGender() {
146+
return add(emojiData.getExplicitGender(),
147+
candidates.getExplicitGender());
148+
}
149+
150+
@Override
151+
public UnicodeSet getMultiPersonGroupings() {
152+
return add(emojiData.getMultiPersonGroupings(),
153+
candidates.getMultiPersonGroupings());
154+
}
143155

144156
// public static void main(String[] args) {
145157
// UnicodeSet allChars = EMOJI_DATA.getAllEmojiWithDefectives();

unicodetools/org/unicode/tools/emoji/GenerateEmojiData.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,17 +129,24 @@ public static <T> void printData(UnicodeMap<String> extraNames) throws IOExcepti
129129

130130
try (TempPrintWriter outText2 = new TempPrintWriter(OUTPUT_DIR, "internal/emoji-internal.txt")) {
131131
UnicodeSet emojiGenderBase = EmojiDataSourceCombined.EMOJI_DATA.getGenderBases();
132+
UnicodeSet emojiExplicitGender = EmojiDataSourceCombined.EMOJI_DATA.getExplicitGender();
133+
UnicodeSet emojiMultiPersonGroupings = EmojiDataSourceCombined.EMOJI_DATA.getMultiPersonGroupings();
132134
outText2.println(Utility.getBaseDataHeader("emoji-internal", 51, "Emoji Data Internal", Emoji.VERSION_STRING));
133135

134136

135-
int width = maxLength("Emoji_Gender_Base");
137+
int width = maxLength("Emoji_Gender_Base",
138+
"Emoji_Explicit_Gender",
139+
"Multi_Person_Groupings"
140+
);
136141

137142
// outText2.println("# Warning: the format has changed from Version 1.0");
138143
outText2.println("# Format: ");
139144
outText2.println("# <codepoint(s)> ; <property> # <comments> ");
140145
outText2.println("# Note: there is no guarantee as to the structure of whitespace or comments");
141146
outText2.println(ORDERING_NOTE);
142147
printer.show(outText2, "Emoji_Gender_Base", null, width, 14, emojiGenderBase, true, true, false);
148+
printer.show(outText2, "Emoji_Explicit_Gender", null, width, 14, emojiExplicitGender, true, true, false);
149+
printer.show(outText2, "Multi_Person_Groupings", null, width, 14, emojiMultiPersonGroupings, true, true, false);
143150
outText2.println("\n#EOF");
144151
}
145152

0 commit comments

Comments
 (0)