11package org .unicode .jsp ;
22
3+ import com .google .common .base .Joiner ;
4+ import com .google .common .collect .Multimap ;
5+ import com .google .common .collect .TreeMultimap ;
36import com .ibm .icu .dev .util .UnicodeMap ;
47import com .ibm .icu .lang .CharSequences ;
58import com .ibm .icu .lang .UCharacter ;
1215import com .ibm .icu .text .Transform ;
1316import com .ibm .icu .text .UTF16 ;
1417import com .ibm .icu .text .UnicodeSet ;
18+ import com .ibm .icu .text .UnicodeSetIterator ;
19+ import com .ibm .icu .util .LocaleData ;
1520import com .ibm .icu .util .ULocale ;
1621import com .ibm .icu .util .VersionInfo ;
1722import java .nio .charset .Charset ;
1823import java .util .ArrayList ;
1924import java .util .Arrays ;
25+ import java .util .Collection ;
2026import java .util .List ;
2127import java .util .Locale ;
28+ import java .util .Map .Entry ;
29+ import java .util .Set ;
30+ import java .util .TreeSet ;
2231import org .unicode .idna .Idna .IdnaType ;
2332import org .unicode .idna .Idna2003 ;
2433import org .unicode .idna .Idna2008 ;
2837import org .unicode .props .UnicodeProperty .BaseProperty ;
2938import org .unicode .props .UnicodeProperty .Factory ;
3039import org .unicode .props .UnicodeProperty .SimpleProperty ;
40+ import org .unicode .text .utility .Utility ;
3141
3242public class XPropertyFactory extends UnicodeProperty .Factory {
3343
44+ private static final Joiner JOIN_COMMAS = Joiner .on ("," );
45+ private static final boolean DEBUG_MULTI = false ;
46+
3447 static final UnicodeSet ALL =
3548 new UnicodeSet ("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]" ).freeze ();
3649
@@ -96,6 +109,7 @@ public final Factory add2(UnicodeProperty sp) {
96109 add (
97110 new CodepointTransformProperty (
98111 new Transform <Integer , String >() {
112+ @ Override
99113 public String transform (Integer source ) {
100114 return Normalizer .normalize (source , Normalizer .NFC );
101115 }
@@ -105,6 +119,7 @@ public String transform(Integer source) {
105119 add (
106120 new CodepointTransformProperty (
107121 new Transform <Integer , String >() {
122+ @ Override
108123 public String transform (Integer source ) {
109124 return Normalizer .normalize (source , Normalizer .NFD );
110125 }
@@ -114,6 +129,7 @@ public String transform(Integer source) {
114129 add (
115130 new CodepointTransformProperty (
116131 new Transform <Integer , String >() {
132+ @ Override
117133 public String transform (Integer source ) {
118134 return Normalizer .normalize (source , Normalizer .NFKC );
119135 }
@@ -123,6 +139,7 @@ public String transform(Integer source) {
123139 add (
124140 new CodepointTransformProperty (
125141 new Transform <Integer , String >() {
142+ @ Override
126143 public String transform (Integer source ) {
127144 return Normalizer .normalize (source , Normalizer .NFKD );
128145 }
@@ -133,6 +150,7 @@ public String transform(Integer source) {
133150 add (
134151 new StringTransformProperty (
135152 new StringTransform () {
153+ @ Override
136154 public String transform (String source ) {
137155 return UCharacter .foldCase (source , true );
138156 }
@@ -142,6 +160,7 @@ public String transform(String source) {
142160 add (
143161 new StringTransformProperty (
144162 new StringTransform () {
163+ @ Override
145164 public String transform (String source ) {
146165 return UCharacter .toLowerCase (ULocale .ROOT , source );
147166 }
@@ -151,6 +170,7 @@ public String transform(String source) {
151170 add (
152171 new StringTransformProperty (
153172 new StringTransform () {
173+ @ Override
154174 public String transform (String source ) {
155175 return UCharacter .toUpperCase (ULocale .ROOT , source );
156176 }
@@ -160,6 +180,7 @@ public String transform(String source) {
160180 add (
161181 new StringTransformProperty (
162182 new StringTransform () {
183+ @ Override
163184 public String transform (String source ) {
164185 return UCharacter .toTitleCase (ULocale .ROOT , source , null );
165186 }
@@ -170,6 +191,7 @@ public String transform(String source) {
170191 add (
171192 new StringTransformProperty (
172193 new StringTransform () {
194+ @ Override
173195 public String transform (String source ) {
174196 StringBuilder b = new StringBuilder ();
175197 for (int cp : CharSequences .codePoints (source )) {
@@ -184,6 +206,7 @@ public String transform(String source) {
184206 add (
185207 new StringTransformProperty (
186208 new StringTransform () {
209+ @ Override
187210 public String transform (String source ) {
188211 String result = NFM .nfm .get (source );
189212 return result == null ? source : result ;
@@ -201,6 +224,7 @@ public String transform(String source) {
201224 add (
202225 new CodepointTransformProperty (
203226 new Transform <Integer , String >() {
227+ @ Override
204228 public String transform (Integer source ) {
205229 return UnicodeUtilities .getSubheader ().getSubheader (source );
206230 }
@@ -239,6 +263,9 @@ public String transform(Integer source) {
239263 .setMain ("bmp" , "bmp" , UnicodeProperty .BINARY , "6.0" ));
240264
241265 addCollationProperty ();
266+ addExamplarProperty (LocaleData .ES_STANDARD , "exem" , "exemplar" );
267+ addExamplarProperty (LocaleData .ES_AUXILIARY , "exema" , "exemplar_aux" );
268+ addExamplarProperty (LocaleData .ES_PUNCTUATION , "exemp" , "exemplar_punct" );
242269
243270 // set up the special script property
244271 UnicodeProperty scriptProp = base .getProperty ("sc" );
@@ -251,7 +278,8 @@ public String transform(Integer source) {
251278 .setMain ("Script_Extensions" , "scx" , UnicodeProperty .ENUMERATED , "1.1" )
252279 .addValueAliases (
253280 ScriptTester .getScriptSpecialsAlternates (),
254- AliasAddAction .IGNORE_IF_MISSING ));
281+ AliasAddAction .IGNORE_IF_MISSING )
282+ .setMultivalued (true ));
255283
256284 CachedProps cp = CachedProps .CACHED_PROPS ;
257285 for (String prop : cp .getAvailable ()) {
@@ -289,6 +317,81 @@ public String transform(Integer source) {
289317 .setMain ("RGI_Emoji" , "RGI_Emoji" , UnicodeProperty .BINARY , "13.0" ));
290318 }
291319
320+ private void addExamplarProperty (
321+ int exemplarType , String propertyAbbreviation , String propertyName ) {
322+ Multimap <Integer , String > data = TreeMultimap .create ();
323+ Set <String > localeSet = new TreeSet <>();
324+
325+ for (ULocale ulocale : ULocale .getAvailableLocales ()) {
326+ if (!ulocale .getCountry ().isEmpty () || !ulocale .getVariant ().isEmpty ()) {
327+ continue ;
328+ // we want to skip cases where characters are in the parent locale, but there is no
329+ // ULocale parentLocale = ulocale.getParent();
330+ }
331+ UnicodeSet exemplarSet = LocaleData .getExemplarSet (ulocale , 0 , exemplarType );
332+ if (!ulocale .getScript ().isEmpty ()) {
333+ // we can't find out the parent locale or defaultContent locale in ICU, so we hack
334+ // it
335+ String langLocale = ulocale .getLanguage ();
336+ UnicodeSet langExemplarSet =
337+ LocaleData .getExemplarSet (new ULocale (langLocale ), 0 , exemplarType );
338+ if (langExemplarSet .equals (exemplarSet )) {
339+ continue ;
340+ }
341+ }
342+ String locale = ulocale .toLanguageTag ();
343+ localeSet .add (locale );
344+ for (UnicodeSetIterator it = new UnicodeSetIterator (exemplarSet ); it .nextRange (); ) {
345+ if (it .codepoint == UnicodeSetIterator .IS_STRING ) {
346+ // flatten
347+ int cp = 0 ;
348+ for (int i = 0 ; i < it .string .length (); i += Character .charCount (cp )) {
349+ cp = it .string .codePointAt (i );
350+ data .put (cp , locale );
351+ }
352+ } else {
353+ for (int cp = it .codepoint ; cp <= it .codepointEnd ; ++cp ) {
354+ data .put (cp , locale );
355+ }
356+ }
357+ }
358+ }
359+
360+ // convert to UnicodeMap
361+ UnicodeMap <String > unicodeMap = new UnicodeMap <>();
362+ for (Entry <Integer , Collection <String >> entry : data .asMap ().entrySet ()) {
363+ String value = JOIN_COMMAS .join (entry .getValue ()).intern ();
364+ unicodeMap .put (entry .getKey (), value );
365+ }
366+ if (DEBUG_MULTI ) {
367+ System .out .println ("\n " + propertyName );
368+ for (UnicodeMap .EntryRange <String > entry : unicodeMap .entryRanges ()) {
369+ System .out .println (
370+ Utility .hex (entry .codepoint )
371+ + (entry .codepoint == entry .codepointEnd
372+ ? ""
373+ : "-" + Utility .hex (entry .codepointEnd ))
374+ + " ;\t "
375+ + entry .value );
376+ }
377+ }
378+
379+ // put locales into right format
380+ String [] localeList = localeSet .toArray (new String [localeSet .size ()]);
381+ String [][] locales = new String [][] {localeList , localeList }; // abbreviations are the same
382+
383+ add (
384+ new UnicodeProperty .UnicodeMapProperty ()
385+ .set (unicodeMap )
386+ .setMain (
387+ propertyName ,
388+ propertyAbbreviation ,
389+ UnicodeProperty .ENUMERATED ,
390+ "1.1" )
391+ .addValueAliases (locales , AliasAddAction .ADD_MAIN_ALIAS )
392+ .setMultivalued (true ));
393+ }
394+
292395 private void addCollationProperty () {
293396 RuleBasedCollator c = UnicodeSetUtilities .RAW_COLLATOR ;
294397 // (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
@@ -652,6 +755,7 @@ public StringTransformProperty(
652755 setUniformUnassigned (hasUniformUnassigned );
653756 }
654757
758+ @ Override
655759 protected String _getValue (int codepoint ) {
656760 return transform .transform (UTF16 .valueOf (codepoint ));
657761 }
@@ -666,6 +770,7 @@ public CodepointTransformProperty(
666770 setUniformUnassigned (hasUniformUnassigned );
667771 }
668772
773+ @ Override
669774 protected String _getValue (int codepoint ) {
670775 return transform .transform (codepoint );
671776 }
@@ -682,6 +787,7 @@ public static class EncodingProperty extends SimpleProperty {
682787 encoder = new CharEncoder (charset , false , false );
683788 }
684789
790+ @ Override
685791 protected String _getValue (int codepoint ) {
686792 int len = encoder .getValue (codepoint , temp , 0 );
687793 if (len < 0 ) {
@@ -697,6 +803,7 @@ protected String _getValue(int codepoint) {
697803 return result .toString ();
698804 }
699805
806+ @ Override
700807 public boolean isDefault (int codepoint ) {
701808 int len = encoder .getValue (codepoint , temp , 0 );
702809 return len < 0 ;
@@ -716,6 +823,7 @@ public static class EncodingPropertyBoolean extends SimpleProperty {
716823 encoder = new CharEncoder (charset , true , true );
717824 }
718825
826+ @ Override
719827 protected String _getValue (int codepoint ) {
720828 return (encoder .getValue (codepoint , null , 0 ) > 0 ) ? "Yes" : "No" ;
721829 }
@@ -731,6 +839,7 @@ public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) {
731839 return this ;
732840 }
733841
842+ @ Override
734843 protected UnicodeMap <String > _getUnicodeMap () {
735844 UnicodeMap <String > result = new UnicodeMap <String >();
736845 result .putAll (unicodeSet , "Yes" );
@@ -743,10 +852,12 @@ public XPropertyFactory.UnicodeSetProperty set(String string) {
743852 return set (new UnicodeSet (string ).freeze ());
744853 }
745854
855+ @ Override
746856 protected String _getValue (int codepoint ) {
747857 return YESNO_ARRAY [unicodeSet .contains (codepoint ) ? 0 : 1 ];
748858 }
749859
860+ @ Override
750861 protected List _getAvailableValues (List result ) {
751862 return YESNO ;
752863 }
0 commit comments