001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.util.Arrays; 024import java.util.Comparator; 025import java.util.List; 026import org.apache.commons.io.ByteOrderMark; 027import org.apache.commons.io.IOUtils; 028 029/** 030 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 031 * <p> 032 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the 033 * first byte in the stream. 034 * </p> 035 * <p> 036 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs: 037 * </p> 038 * <ul> 039 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 040 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 041 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 042 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 043 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 044 * </ul> 045 * 046 * <h2>Example 1 - Detect and exclude a UTF-8 BOM</h2> 047 * 048 * <pre> 049 * BOMInputStream bomIn = new BOMInputStream(in); 050 * if (bomIn.hasBOM()) { 051 * // has a UTF-8 BOM 052 * } 053 * </pre> 054 * 055 * <h2>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h2> 056 * 057 * <pre> 058 * boolean include = true; 059 * BOMInputStream bomIn = new BOMInputStream(in, include); 060 * if (bomIn.hasBOM()) { 061 * // has a UTF-8 BOM 062 * } 063 * </pre> 064 * 065 * <h2>Example 3 - Detect Multiple BOMs</h2> 066 * 067 * <pre> 068 * BOMInputStream bomIn = new BOMInputStream(in, 069 * ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, 070 * ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE 071 * ); 072 * if (bomIn.hasBOM() == false) { 073 * // No BOM found 074 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 075 * // has a UTF-16LE BOM 076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 077 * // has a UTF-16BE BOM 078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 079 * // has a UTF-32LE BOM 080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 081 * // has a UTF-32BE BOM 082 * } 083 * </pre> 084 * 085 * @see org.apache.commons.io.ByteOrderMark 086 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 087 * @since 2.0 088 */ 089public class BOMInputStream extends ProxyInputStream { 090 private final boolean include; 091 /** 092 * BOMs are sorted from longest to shortest. 093 */ 094 private final List<ByteOrderMark> boms; 095 private ByteOrderMark byteOrderMark; 096 private int[] firstBytes; 097 private int fbLength; 098 private int fbIndex; 099 private int markFbIndex; 100 private boolean markedAtStart; 101 102 /** 103 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 104 * 105 * @param delegate 106 * the InputStream to delegate to 107 */ 108 public BOMInputStream(final InputStream delegate) { 109 this(delegate, false, ByteOrderMark.UTF_8); 110 } 111 112 /** 113 * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it. 114 * 115 * @param delegate 116 * the InputStream to delegate to 117 * @param include 118 * true to include the UTF-8 BOM or false to exclude it 119 */ 120 public BOMInputStream(final InputStream delegate, final boolean include) { 121 this(delegate, include, ByteOrderMark.UTF_8); 122 } 123 124 /** 125 * Constructs a new BOM InputStream that excludes the specified BOMs. 126 * 127 * @param delegate 128 * the InputStream to delegate to 129 * @param boms 130 * The BOMs to detect and exclude 131 */ 132 public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) { 133 this(delegate, false, boms); 134 } 135 136 /** 137 * Compares ByteOrderMark objects in descending length order. 138 */ 139 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = (bom1, bom2) -> { 140 final int len1 = bom1.length(); 141 final int len2 = bom2.length(); 142 return Integer.compare(len2, len1); 143 }; 144 145 /** 146 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 147 * 148 * @param delegate 149 * the InputStream to delegate to 150 * @param include 151 * true to include the specified BOMs or false to exclude them 152 * @param boms 153 * The BOMs to detect and optionally exclude 154 */ 155 public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) { 156 super(delegate); 157 if (IOUtils.length(boms) == 0) { 158 throw new IllegalArgumentException("No BOMs specified"); 159 } 160 this.include = include; 161 final List<ByteOrderMark> list = Arrays.asList(boms); 162 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 163 list.sort(ByteOrderMarkLengthComparator); 164 this.boms = list; 165 166 } 167 168 /** 169 * Indicates whether the stream contains one of the specified BOMs. 170 * 171 * @return true if the stream has one of the specified BOMs, otherwise false if it does not 172 * @throws IOException 173 * if an error reading the first bytes of the stream occurs 174 */ 175 public boolean hasBOM() throws IOException { 176 return getBOM() != null; 177 } 178 179 /** 180 * Indicates whether the stream contains the specified BOM. 181 * 182 * @param bom 183 * The BOM to check for 184 * @return true if the stream has the specified BOM, otherwise false if it does not 185 * @throws IllegalArgumentException 186 * if the BOM is not one the stream is configured to detect 187 * @throws IOException 188 * if an error reading the first bytes of the stream occurs 189 */ 190 public boolean hasBOM(final ByteOrderMark bom) throws IOException { 191 if (!boms.contains(bom)) { 192 throw new IllegalArgumentException("Stream not configure to detect " + bom); 193 } 194 getBOM(); 195 return byteOrderMark != null && byteOrderMark.equals(bom); 196 } 197 198 /** 199 * Return the BOM (Byte Order Mark). 200 * 201 * @return The BOM or null if none 202 * @throws IOException 203 * if an error reading the first bytes of the stream occurs 204 */ 205 public ByteOrderMark getBOM() throws IOException { 206 if (firstBytes == null) { 207 fbLength = 0; 208 // BOMs are sorted from longest to shortest 209 final int maxBomSize = boms.get(0).length(); 210 firstBytes = new int[maxBomSize]; 211 // Read first maxBomSize bytes 212 for (int i = 0; i < firstBytes.length; i++) { 213 firstBytes[i] = in.read(); 214 fbLength++; 215 if (firstBytes[i] < 0) { 216 break; 217 } 218 } 219 // match BOM in firstBytes 220 byteOrderMark = find(); 221 if ((byteOrderMark != null) && !include) { 222 if (byteOrderMark.length() < firstBytes.length) { 223 fbIndex = byteOrderMark.length(); 224 } else { 225 fbLength = 0; 226 } 227 } 228 } 229 return byteOrderMark; 230 } 231 232 /** 233 * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 234 * 235 * @return The BOM charset Name or null if no BOM found 236 * @throws IOException 237 * if an error reading the first bytes of the stream occurs 238 * 239 */ 240 public String getBOMCharsetName() throws IOException { 241 getBOM(); 242 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 243 } 244 245 /** 246 * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte 247 * {@code read()} method, either returning a valid byte or -1 to indicate that the initial bytes have been 248 * processed already. 249 * 250 * @return the byte read (excluding BOM) or -1 if the end of stream 251 * @throws IOException 252 * if an I/O error occurs 253 */ 254 private int readFirstBytes() throws IOException { 255 getBOM(); 256 return fbIndex < fbLength ? firstBytes[fbIndex++] : EOF; 257 } 258 259 /** 260 * Find a BOM with the specified bytes. 261 * 262 * @return The matched BOM or null if none matched 263 */ 264 private ByteOrderMark find() { 265 for (final ByteOrderMark bom : boms) { 266 if (matches(bom)) { 267 return bom; 268 } 269 } 270 return null; 271 } 272 273 /** 274 * Check if the bytes match a BOM. 275 * 276 * @param bom 277 * The BOM 278 * @return true if the bytes match the bom, otherwise false 279 */ 280 private boolean matches(final ByteOrderMark bom) { 281 // if (bom.length() != fbLength) { 282 // return false; 283 // } 284 // firstBytes may be bigger than the BOM bytes 285 for (int i = 0; i < bom.length(); i++) { 286 if (bom.get(i) != firstBytes[i]) { 287 return false; 288 } 289 } 290 return true; 291 } 292 293 // ---------------------------------------------------------------------------- 294 // Implementation of InputStream 295 // ---------------------------------------------------------------------------- 296 297 /** 298 * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM. 299 * 300 * @return the byte read (excluding BOM) or -1 if the end of stream 301 * @throws IOException 302 * if an I/O error occurs 303 */ 304 @Override 305 public int read() throws IOException { 306 final int b = readFirstBytes(); 307 return b >= 0 ? b : in.read(); 308 } 309 310 /** 311 * Invokes the delegate's {@code read(byte[], int, int)} method, detecting and optionally skipping BOM. 312 * 313 * @param buf 314 * the buffer to read the bytes into 315 * @param off 316 * The start offset 317 * @param len 318 * The number of bytes to read (excluding BOM) 319 * @return the number of bytes read or -1 if the end of stream 320 * @throws IOException 321 * if an I/O error occurs 322 */ 323 @Override 324 public int read(final byte[] buf, int off, int len) throws IOException { 325 int firstCount = 0; 326 int b = 0; 327 while (len > 0 && b >= 0) { 328 b = readFirstBytes(); 329 if (b >= 0) { 330 buf[off++] = (byte) (b & 0xFF); 331 len--; 332 firstCount++; 333 } 334 } 335 final int secondCount = in.read(buf, off, len); 336 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount; 337 } 338 339 /** 340 * Invokes the delegate's {@code read(byte[])} method, detecting and optionally skipping BOM. 341 * 342 * @param buf 343 * the buffer to read the bytes into 344 * @return the number of bytes read (excluding BOM) or -1 if the end of stream 345 * @throws IOException 346 * if an I/O error occurs 347 */ 348 @Override 349 public int read(final byte[] buf) throws IOException { 350 return read(buf, 0, buf.length); 351 } 352 353 /** 354 * Invokes the delegate's {@code mark(int)} method. 355 * 356 * @param readlimit 357 * read ahead limit 358 */ 359 @Override 360 public synchronized void mark(final int readlimit) { 361 markFbIndex = fbIndex; 362 markedAtStart = firstBytes == null; 363 in.mark(readlimit); 364 } 365 366 /** 367 * Invokes the delegate's {@code reset()} method. 368 * 369 * @throws IOException 370 * if an I/O error occurs 371 */ 372 @Override 373 public synchronized void reset() throws IOException { 374 fbIndex = markFbIndex; 375 if (markedAtStart) { 376 firstBytes = null; 377 } 378 379 in.reset(); 380 } 381 382 /** 383 * Invokes the delegate's {@code skip(long)} method, detecting and optionally skipping BOM. 384 * 385 * @param n 386 * the number of bytes to skip 387 * @return the number of bytes to skipped or -1 if the end of stream 388 * @throws IOException 389 * if an I/O error occurs 390 */ 391 @Override 392 public long skip(final long n) throws IOException { 393 int skipped = 0; 394 while ((n > skipped) && (readFirstBytes() >= 0)) { 395 skipped++; 396 } 397 return in.skip(n - skipped) + skipped; 398 } 399}