@@ -100,11 +100,7 @@ public static boolean startsWithIgnoreCase(final String s, final String expected
100
100
}
101
101
102
102
/**
103
- * Escapes the characters '<', '>' and '&' into their XML entity equivalents. Note that
104
- * sometimes we have to use this method instead of
105
- * {@link org.apache.commons.lang3.StringEscapeUtils#escapeXml(String)} or
106
- * {@link org.apache.commons.lang3.StringEscapeUtils#escapeHtml4(String)} because those methods
107
- * escape some unicode characters as well.
103
+ * Escapes the characters '<', '>' and '&' into their XML entity equivalents.
108
104
*
109
105
* @param s the string to escape
110
106
* @return the escaped form of the specified string
@@ -114,6 +110,83 @@ public static String escapeXmlChars(final String s) {
114
110
replaceEach (s , new String [] {"&" , "<" , ">" }, new String [] {"&" , "<" , ">" });
115
111
}
116
112
113
+ /**
114
+ * Escape the string to be used as xml 1.0 content be replacing the
115
+ * characters '"', '&', ''', '<', and '>' into their XML entity equivalents.
116
+ * @param text the attribute value
117
+ * @return the escaped value
118
+ */
119
+ public static String escapeXml (final String text ) {
120
+ if (text == null ) {
121
+ return null ;
122
+ }
123
+
124
+ StringBuilder escaped = null ;
125
+
126
+ final int offset = 0 ;
127
+ final int max = text .length ();
128
+
129
+ int readOffset = offset ;
130
+
131
+ for (int i = offset ; i < max ; i ++) {
132
+ final int codepoint = Character .codePointAt (text , i );
133
+ final boolean codepointValid = supportedByXML10 (codepoint );
134
+
135
+ if (!codepointValid
136
+ || codepoint == '<'
137
+ || codepoint == '>'
138
+ || codepoint == '&'
139
+ || codepoint == '\''
140
+ || codepoint == '"' ) {
141
+
142
+ // replacement required
143
+ if (escaped == null ) {
144
+ escaped = new StringBuilder (max );
145
+ }
146
+
147
+ if (i > readOffset ) {
148
+ escaped .append (text , readOffset , i );
149
+ }
150
+
151
+ if (Character .charCount (codepoint ) > 1 ) {
152
+ i ++;
153
+ }
154
+ readOffset = i + 1 ;
155
+
156
+ // skip
157
+ if (!codepointValid ) {
158
+ continue ;
159
+ }
160
+
161
+ if (codepoint == '<' ) {
162
+ escaped .append ("<" );
163
+ }
164
+ else if (codepoint == '>' ) {
165
+ escaped .append (">" );
166
+ }
167
+ else if (codepoint == '&' ) {
168
+ escaped .append ("&" );
169
+ }
170
+ else if (codepoint == '\'' ) {
171
+ escaped .append ("'" );
172
+ }
173
+ else if (codepoint == '\"' ) {
174
+ escaped .append (""" );
175
+ }
176
+ }
177
+ }
178
+
179
+ if (escaped == null ) {
180
+ return text ;
181
+ }
182
+
183
+ if (max > readOffset ) {
184
+ escaped .append (text , readOffset , max );
185
+ }
186
+
187
+ return escaped .toString ();
188
+ }
189
+
117
190
/**
118
191
* Escape the string to be used as attribute value.
119
192
* Only {@code <}, {@code &} and {@code "} have to be escaped (see
@@ -122,33 +195,96 @@ public static String escapeXmlChars(final String s) {
122
195
* @return the escaped value
123
196
*/
124
197
public static String escapeXmlAttributeValue (final String attValue ) {
125
- final int len = attValue .length ();
126
- StringBuilder sb = null ;
127
- for (int i = len - 1 ; i >= 0 ; --i ) {
128
- final char c = attValue .charAt (i );
129
- String replacement = null ;
130
- if (c == '<' ) {
131
- replacement = "<" ;
132
- }
133
- else if (c == '&' ) {
134
- replacement = "&" ;
135
- }
136
- else if (c == '\"' ) {
137
- replacement = """ ;
138
- }
198
+ if (attValue == null ) {
199
+ return null ;
200
+ }
201
+
202
+ StringBuilder escaped = null ;
203
+
204
+ final int offset = 0 ;
205
+ final int max = attValue .length ();
206
+
207
+ int readOffset = offset ;
208
+
209
+ for (int i = offset ; i < max ; i ++) {
210
+ final int codepoint = Character .codePointAt (attValue , i );
211
+ final boolean codepointValid = supportedByXML10 (codepoint );
212
+
213
+ if (!codepointValid
214
+ || codepoint == '<'
215
+ || codepoint == '&'
216
+ || codepoint == '"' ) {
217
+
218
+ // replacement required
219
+ if (escaped == null ) {
220
+ escaped = new StringBuilder (max );
221
+ }
222
+
223
+ if (i > readOffset ) {
224
+ escaped .append (attValue , readOffset , i );
225
+ }
226
+
227
+ if (Character .charCount (codepoint ) > 1 ) {
228
+ i ++;
229
+ }
230
+ readOffset = i + 1 ;
231
+
232
+ // skip
233
+ if (!codepointValid ) {
234
+ continue ;
235
+ }
139
236
140
- if (replacement != null ) {
141
- if (sb == null ) {
142
- sb = new StringBuilder (attValue );
237
+ if (codepoint == '<' ) {
238
+ escaped .append ("<" );
239
+ }
240
+ else if (codepoint == '&' ) {
241
+ escaped .append ("&" );
242
+ }
243
+ else if (codepoint == '\"' ) {
244
+ escaped .append (""" );
143
245
}
144
- sb .replace (i , i + 1 , replacement );
145
246
}
146
247
}
147
248
148
- if (sb != null ) {
149
- return sb .toString ();
249
+ if (escaped == null ) {
250
+ return attValue ;
251
+ }
252
+
253
+ if (max > readOffset ) {
254
+ escaped .append (attValue , readOffset , max );
150
255
}
151
- return attValue ;
256
+
257
+ return escaped .toString ();
258
+ }
259
+
260
+ /*
261
+ * XML 1.0 does not allow control characters or unpaired Unicode surrogate codepoints.
262
+ * We will remove characters that do not fit in the following ranges:
263
+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
264
+ */
265
+ private static boolean supportedByXML10 (final int codepoint ) {
266
+ if (codepoint < 0x20 ) {
267
+ return codepoint == 0x9 || codepoint == 0xA || codepoint == 0xD ;
268
+ }
269
+ if (codepoint <= 0xD7FF ) {
270
+ return true ;
271
+ }
272
+
273
+ if (codepoint < 0xE000 ) {
274
+ return false ;
275
+ }
276
+ if (codepoint <= 0xFFFD ) {
277
+ return true ;
278
+ }
279
+
280
+ if (codepoint < 0x10000 ) {
281
+ return false ;
282
+ }
283
+ if (codepoint <= 0x10FFFF ) {
284
+ return true ;
285
+ }
286
+
287
+ return true ;
152
288
}
153
289
154
290
/**
0 commit comments