/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;

/**
 * Character stream that handles all the necessary Voodoo to figure out the
 * charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 * This one IS a character stream.
 * </p>
 * <p>
 * All this has to be done without consuming characters from the stream, if not
 * the XML parser will not recognize the document as a valid XML. This is not
 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
 * right now, XmlStreamReader handles it and things work in all parsers).
 * </p>
 * <p>
 * The XmlStreamReader class handles the charset encoding of XML documents in
 * Files, raw streams and HTTP streams by offering a wide set of constructors.
 * </p>
 * <p>
 * By default the charset encoding detection is lenient, the constructor with
 * the lenient flag can be used for a strict detection (following HTTP MIME and
 * XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * </p>
 * <p>
 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 * Apache License 2.0.
 * </p>
 *
 * @see org.apache.commons.io.output.XmlStreamWriter
 * @since 2.0
 */
public class XmlStreamReader extends Reader {
    private static final String UTF_8 = "UTF-8";

    private static final String US_ASCII = "US-ASCII";

    private static final String UTF_16BE = "UTF-16BE";

    private static final String UTF_16LE = "UTF-16LE";

    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    private static final String UTF_16 = "UTF-16";

    private static final String UTF_32 = "UTF-32";

    private static final String EBCDIC = "CP1047";

    private static final ByteOrderMark[] BOMS = {
        ByteOrderMark.UTF_8,
        ByteOrderMark.UTF_16BE,
        ByteOrderMark.UTF_16LE,
        ByteOrderMark.UTF_32BE,
        ByteOrderMark.UTF_32LE
    };

    // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
    private static final ByteOrderMark[] XML_GUESS_BYTES = {
        new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
        new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
        new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
        new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
                0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
        new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
                0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
        new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
    };

    private static final Pattern CHARSET_PATTERN = Pattern
            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");

    /**
     * Pattern capturing the encoding of the "xml" processing instruction.
     */
    public static final Pattern ENCODING_PATTERN = Pattern.compile(
            "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
            Pattern.MULTILINE);

    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    private static final String HTTP_EX_1 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    private static final String HTTP_EX_2 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";

    /**
     * Closes a stream that was opened by a constructor of this class after
     * that constructor failed, recording any secondary close failure on the
     * original exception instead of masking it.
     *
     * @param inputStream the stream to close
     * @param cause the exception that made the constructor fail
     */
    private static void closeOnFailure(final InputStream inputStream, final Throwable cause) {
        try {
            inputStream.close();
        } catch (final IOException closeEx) {
            cause.addSuppressed(closeEx);
        }
    }

    /**
     * Returns charset parameter value, NULL if not present, NULL if
     * httpContentType is NULL.
     *
     * @param httpContentType the HTTP content type
     * @return The content type encoding (upcased)
     */
    static String getContentTypeEncoding(final String httpContentType) {
        String encoding = null;
        if (httpContentType != null) {
            final int i = httpContentType.indexOf(";");
            if (i > -1) {
                // Only the parameters after the MIME type are searched for "charset=".
                final String postMime = httpContentType.substring(i + 1);
                final Matcher m = CHARSET_PATTERN.matcher(postMime);
                encoding = m.find() ? m.group(1) : null;
                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
            }
        }
        return encoding;
    }

    /**
     * Returns MIME type or NULL if httpContentType is NULL.
     *
     * @param httpContentType the HTTP content type
     * @return The mime content type
     */
    static String getContentTypeMime(final String httpContentType) {
        String mime = null;
        if (httpContentType != null) {
            final int i = httpContentType.indexOf(";");
            if (i >= 0) {
                mime = httpContentType.substring(0, i);
            } else {
                mime = httpContentType;
            }
            mime = mime.trim();
        }
        return mime;
    }

    /**
     * Returns the encoding declared in the {@code <?xml encoding=...?>}, NULL if none.
     * <p>
     * The stream is marked before reading and reset afterwards, so the bytes
     * inspected here remain available to the caller. Nothing is read when
     * {@code guessedEnc} is NULL.
     * </p>
     *
     * @param inputStream InputStream to create the reader from.
     * @param guessedEnc guessed encoding
     * @return the encoding declared in the {@code <?xml encoding=...?>}
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc)
            throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            final byte[] bytes = IOUtils.byteArray();
            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
            int offset = 0;
            int firstGT = -1;
            boolean endOfStream = false;
            String xmlProlog = "";
            // Inspect what has been read so far BEFORE asking for more bytes, so that
            // a stream which has already delivered the prolog is never read again
            // (a further read may block on a slow or interactive source).
            while (firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
                final int c = inputStream.read(bytes, offset, IOUtils.DEFAULT_BUFFER_SIZE - offset);
                if (c == -1) {
                    endOfStream = true;
                    break;
                }
                offset += c;
                xmlProlog = new String(bytes, 0, offset, guessedEnc);
                firstGT = xmlProlog.indexOf('>');
            }
            if (firstGT == -1) {
                if (endOfStream) {
                    throw new IOException("Unexpected end of XML stream");
                }
                throw new IOException(
                        "XML prolog or ROOT element not found on first "
                                + offset + " bytes");
            }
            // A '>' was found, hence at least one byte was read: rewind to the mark.
            inputStream.reset();
            // Join the lines up to and including the first '>' so that the pattern
            // can match a prolog that spans several lines.
            final BufferedReader bReader = new BufferedReader(new StringReader(
                    xmlProlog.substring(0, firstGT + 1)));
            final StringBuilder prolog = new StringBuilder();
            String line;
            while ((line = bReader.readLine()) != null) {
                prolog.append(line);
            }
            final Matcher m = ENCODING_PATTERN.matcher(prolog);
            if (m.find()) {
                encoding = m.group(1).toUpperCase(Locale.ROOT);
                // Strip the surrounding quotes captured by the pattern.
                encoding = encoding.substring(1, encoding.length() - 1);
            }
        }
        return encoding;
    }

    /**
     * Indicates if the MIME type belongs to the APPLICATION XML family.
     *
     * @param mime The mime type
     * @return true if the mime type belongs to the APPLICATION XML family,
     * otherwise false
     */
    static boolean isAppXml(final String mime) {
        return mime != null &&
               (mime.equals("application/xml") ||
                mime.equals("application/xml-dtd") ||
                mime.equals("application/xml-external-parsed-entity") ||
                mime.startsWith("application/") && mime.endsWith("+xml"));
    }

    /**
     * Indicates if the MIME type belongs to the TEXT XML family.
     *
     * @param mime The mime type
     * @return true if the mime type belongs to the TEXT XML family,
     * otherwise false
     */
    static boolean isTextXml(final String mime) {
        return mime != null &&
              (mime.equals("text/xml") ||
               mime.equals("text/xml-external-parsed-entity") ||
               mime.startsWith("text/") && mime.endsWith("+xml"));
    }

    private final Reader reader;

    private final String encoding;

    private final String defaultEncoding;

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
     * if this is also missing defaults to UTF-8.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * <p>
     * The file stream opened here is closed again if the encoding detection
     * fails, since the caller has no handle on it.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    @SuppressWarnings("resource") // FileInputStream is managed through another reader in this instance.
    public XmlStreamReader(final File file) throws IOException {
        Objects.requireNonNull(file, "file");
        this.defaultEncoding = null;
        final InputStream inputStream = new FileInputStream(file);
        try {
            final BOMInputStream bom = new BOMInputStream(
                    new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
            final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
            this.encoding = doRawStream(bom, pis, true);
            this.reader = new InputStreamReader(pis, encoding);
        } catch (final IOException | RuntimeException e) {
            // This constructor opened the stream, so it must not leak it on failure.
            closeOnFailure(inputStream, e);
            throw e;
        }
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        this(inputStream, lenient, null);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding)
            throws IOException {
        Objects.requireNonNull(inputStream, "inputStream");
        this.defaultEncoding = defaultEncoding;
        final BOMInputStream bom = new BOMInputStream(
                new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
        this.encoding = doRawStream(bom, pis, lenient);
        this.reader = new InputStreamReader(pis, encoding);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType)
            throws IOException {
        this(inputStream, httpContentType, true);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
            final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
            final boolean lenient, final String defaultEncoding) throws IOException {
        Objects.requireNonNull(inputStream, "inputStream");
        this.defaultEncoding = defaultEncoding;
        final BOMInputStream bom = new BOMInputStream(
                new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
        this.encoding = processHttpStream(bom, pis, httpContentType, lenient);
        this.reader = new InputStreamReader(pis, encoding);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in
     * the fetched data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the
     * fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is not
     * 'content-type' header in the fetched data it uses the same logic used for
     * files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
     * the fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * <p>
     * The connection stream obtained here is closed again if the encoding
     * detection fails.
     *
     * @param conn URLConnection to create a Reader from.
     * @param defaultEncoding The default encoding
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URLConnection.
     */
    @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
    public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
        Objects.requireNonNull(conn, "conn");
        this.defaultEncoding = defaultEncoding;
        final boolean lenient = true;
        final String contentType = conn.getContentType();
        final InputStream inputStream = conn.getInputStream();
        try {
            final BOMInputStream bom = new BOMInputStream(
                    new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
            final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
            if (conn instanceof HttpURLConnection || contentType != null) {
                this.encoding = processHttpStream(bom, pis, contentType, lenient);
            } else {
                this.encoding = doRawStream(bom, pis, lenient);
            }
            this.reader = new InputStreamReader(pis, encoding);
        } catch (final IOException | RuntimeException e) {
            // This constructor obtained the stream, so it must not leak it on failure.
            closeOnFailure(inputStream, e);
            throw e;
        }
    }

    /**
     * Calculate the HTTP encoding.
     *
     * @param httpContentType The HTTP content type
     * @param bomEnc BOM encoding
     * @param xmlGuessEnc XML Guess encoding
     * @param xmlEnc XML encoding
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the HTTP encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    String calculateHttpEncoding(final String httpContentType,
            final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
            final boolean lenient) throws IOException {

        // Lenient and has XML encoding
        if (lenient && xmlEnc != null) {
            return xmlEnc;
        }

        // Determine mime/encoding content types from HTTP Content Type
        final String cTMime = getContentTypeMime(httpContentType);
        final String cTEnc  = getContentTypeEncoding(httpContentType);
        final boolean appXml  = isAppXml(cTMime);
        final boolean textXml = isTextXml(cTMime);

        // Mime type NOT "application/xml" or "text/xml"
        if (!appXml && !textXml) {
            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        // No content type encoding
        if (cTEnc == null) {
            if (appXml) {
                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
            }
            return defaultEncoding == null ? US_ASCII : defaultEncoding;
        }

        // UTF-16BE or UTF-16LE content type encoding
        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
            if (bomEnc != null) {
                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return cTEnc;
        }

        // UTF-16 content type encoding
        if (cTEnc.equals(UTF_16)) {
            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                return bomEnc;
            }
            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        // UTF-32BE or UTF-32LE content type encoding
        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
            if (bomEnc != null) {
                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return cTEnc;
        }

        // UTF-32 content type encoding
        if (cTEnc.equals(UTF_32)) {
            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
                return bomEnc;
            }
            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        }

        return cTEnc;
    }

    /**
     * Calculate the raw encoding.
     *
     * @param bomEnc BOM encoding
     * @param xmlGuessEnc XML Guess encoding
     * @param xmlEnc XML encoding
     * @return the raw encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
            final String xmlEnc) throws IOException {

        // BOM is Null
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return defaultEncoding == null ? UTF_8 : defaultEncoding;
            }
            if (xmlEnc.equals(UTF_16) &&
               (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                return xmlGuessEnc;
            }
            return xmlEnc;
        }

        // BOM is UTF-8
        if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is UTF-16BE or UTF-16LE
        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is UTF-32BE or UTF-32LE
        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
            }
            return bomEnc;
        }

        // BOM is something else
        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
    }

    /**
     * Closes the XmlStreamReader stream.
     *
     * @throws IOException thrown if there was a problem closing the stream.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Do lenient detection.
     *
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param ex The thrown exception
     * @return the encoding
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String doLenientDetection(String httpContentType,
            XmlStreamReaderException ex) throws IOException {
        if (httpContentType != null && httpContentType.startsWith("text/html")) {
            // Retry the HTTP rules pretending the document was served as text/xml.
            httpContentType = httpContentType.substring("text/html".length());
            httpContentType = "text/xml" + httpContentType;
            try {
                return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
                        ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
            } catch (final XmlStreamReaderException ex2) {
                ex = ex2;
            }
        }
        // Fall back: XML prolog encoding, then content-type encoding, then the default.
        String encoding = ex.getXmlEncoding();
        if (encoding == null) {
            encoding = ex.getContentTypeEncoding();
        }
        if (encoding == null) {
            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
        }
        return encoding;
    }

    /**
     * Process the raw stream.
     *
     * @param bom BOMInputStream to detect byte order marks
     * @param pis BOMInputStream to guess XML encoding
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the encoding to be used
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
            throws IOException {
        final String bomEnc      = bom.getBOMCharsetName();
        final String xmlGuessEnc = pis.getBOMCharsetName();
        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        try {
            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
        } catch (final XmlStreamReaderException ex) {
            if (lenient) {
                return doLenientDetection(null, ex);
            }
            throw ex;
        }
    }

    /**
     * Returns the default encoding to use if none is set in HTTP content-type,
     * XML prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is NULL the content-type based rules are used.
     *
     * @return the default encoding to use.
     */
    public String getDefaultEncoding() {
        return defaultEncoding;
    }

    /**
     * Returns the charset encoding of the XmlStreamReader.
     *
     * @return charset encoding.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Process a HTTP stream.
     *
     * @param bom BOMInputStream to detect byte order marks
     * @param pis BOMInputStream to guess XML encoding
     * @param httpContentType The HTTP content type
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @return the encoding to be used
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
            final boolean lenient) throws IOException {
        final String bomEnc      = bom.getBOMCharsetName();
        final String xmlGuessEnc = pis.getBOMCharsetName();
        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        try {
            return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient);
        } catch (final XmlStreamReaderException ex) {
            if (lenient) {
                return doLenientDetection(httpContentType, ex);
            }
            throw ex;
        }
    }

    /**
     * Invokes the underlying reader's {@code read(char[], int, int)} method.
     *
     * @param buf the buffer to read the characters into
     * @param offset The start offset
     * @param len The number of bytes to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        return reader.read(buf, offset, len);
    }

}