001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream; 025import org.apache.commons.compress.utils.ByteUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p>This implementation uses an internal buffer in order to handle 031 * the back-references that are at the heart of the LZ77 algorithm. 032 * The size of the buffer must be at least as big as the biggest 033 * offset used in the compressed stream. The current version of the 034 * Snappy algorithm as defined by Google works on 32k blocks and 035 * doesn't contain offsets bigger than 32k which is the default block 036 * size used by this class.</p> 037 * 038 * @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a> 039 * @since 1.7 040 */ 041public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream { 042 043 /** Mask used to determine the type of "tag" is being processed */ 044 private static final int TAG_MASK = 0x03; 045 046 /** Default block size */ 047 public static final int DEFAULT_BLOCK_SIZE = 32768; 048 049 /** The size of the uncompressed data */ 050 private final int size; 051 052 /** Number of uncompressed bytes still to be read. */ 053 private int uncompressedBytesRemaining; 054 055 /** Current state of the stream */ 056 private State state = State.NO_BLOCK; 057 058 private boolean endReached = false; 059 060 /** 061 * Constructor using the default buffer size of 32k. 062 * 063 * @param is 064 * An InputStream to read compressed data from 065 * 066 * @throws IOException if reading fails 067 */ 068 public SnappyCompressorInputStream(final InputStream is) throws IOException { 069 this(is, DEFAULT_BLOCK_SIZE); 070 } 071 072 /** 073 * Constructor using a configurable buffer size. 074 * 075 * @param is 076 * An InputStream to read compressed data from 077 * @param blockSize 078 * The block size used in compression 079 * 080 * @throws IOException if reading fails 081 * @throws IllegalArgumentException if blockSize is not bigger than 0 082 */ 083 public SnappyCompressorInputStream(final InputStream is, final int blockSize) 084 throws IOException { 085 super(is, blockSize); 086 uncompressedBytesRemaining = size = (int) readSize(); 087 } 088 089 /** 090 * {@inheritDoc} 091 */ 092 @Override 093 public int read(final byte[] b, final int off, final int len) throws IOException { 094 if (endReached) { 095 return -1; 096 } 097 switch (state) { 098 case NO_BLOCK: 099 fill(); 100 return read(b, off, len); 101 case IN_LITERAL: 102 int litLen = readLiteral(b, off, len); 103 if (!hasMoreDataInBlock()) { 104 state = State.NO_BLOCK; 105 } 106 return litLen > 0 ? litLen : read(b, off, len); 107 case IN_BACK_REFERENCE: 108 int backReferenceLen = readBackReference(b, off, len); 109 if (!hasMoreDataInBlock()) { 110 state = State.NO_BLOCK; 111 } 112 return backReferenceLen > 0 ? backReferenceLen : read(b, off, len); 113 default: 114 throw new IOException("Unknown stream state " + state); 115 } 116 } 117 118 /** 119 * Try to fill the buffer with the next block of data. 120 */ 121 private void fill() throws IOException { 122 if (uncompressedBytesRemaining == 0) { 123 endReached = true; 124 return; 125 } 126 127 int b = readOneByte(); 128 if (b == -1) { 129 throw new IOException("Premature end of stream reading block start"); 130 } 131 int length = 0; 132 int offset = 0; 133 134 switch (b & TAG_MASK) { 135 136 case 0x00: 137 138 length = readLiteralLength(b); 139 if (length < 0) { 140 throw new IOException("Illegal block with a negative literal size found"); 141 } 142 uncompressedBytesRemaining -= length; 143 startLiteral(length); 144 state = State.IN_LITERAL; 145 break; 146 147 case 0x01: 148 149 /* 150 * These elements can encode lengths between [4..11] bytes and 151 * offsets between [0..2047] bytes. (len-4) occupies three bits 152 * and is stored in bits [2..4] of the tag byte. The offset 153 * occupies 11 bits, of which the upper three are stored in the 154 * upper three bits ([5..7]) of the tag byte, and the lower 155 * eight are stored in a byte following the tag byte. 156 */ 157 158 length = 4 + ((b >> 2) & 0x07); 159 if (length < 0) { 160 throw new IOException("Illegal block with a negative match length found"); 161 } 162 uncompressedBytesRemaining -= length; 163 offset = (b & 0xE0) << 3; 164 b = readOneByte(); 165 if (b == -1) { 166 throw new IOException("Premature end of stream reading back-reference length"); 167 } 168 offset |= b; 169 170 try { 171 startBackReference(offset, length); 172 } catch (IllegalArgumentException ex) { 173 throw new IOException("Illegal block with bad offset found", ex); 174 } 175 state = State.IN_BACK_REFERENCE; 176 break; 177 178 case 0x02: 179 180 /* 181 * These elements can encode lengths between [1..64] and offsets 182 * from [0..65535]. (len-1) occupies six bits and is stored in 183 * the upper six bits ([2..7]) of the tag byte. The offset is 184 * stored as a little-endian 16-bit integer in the two bytes 185 * following the tag byte. 186 */ 187 188 length = (b >> 2) + 1; 189 if (length < 0) { 190 throw new IOException("Illegal block with a negative match length found"); 191 } 192 uncompressedBytesRemaining -= length; 193 194 offset = (int) ByteUtils.fromLittleEndian(supplier, 2); 195 196 try { 197 startBackReference(offset, length); 198 } catch (IllegalArgumentException ex) { 199 throw new IOException("Illegal block with bad offset found", ex); 200 } 201 state = State.IN_BACK_REFERENCE; 202 break; 203 204 case 0x03: 205 206 /* 207 * These are like the copies with 2-byte offsets (see previous 208 * subsection), except that the offset is stored as a 32-bit 209 * integer instead of a 16-bit integer (and thus will occupy 210 * four bytes). 211 */ 212 213 length = (b >> 2) + 1; 214 if (length < 0) { 215 throw new IOException("Illegal block with a negative match length found"); 216 } 217 uncompressedBytesRemaining -= length; 218 219 offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff; 220 221 try { 222 startBackReference(offset, length); 223 } catch (IllegalArgumentException ex) { 224 throw new IOException("Illegal block with bad offset found", ex); 225 } 226 state = State.IN_BACK_REFERENCE; 227 break; 228 default: 229 // impossible as TAG_MASK is two bits and all four possible cases have been covered 230 break; 231 } 232 } 233 234 /* 235 * For literals up to and including 60 bytes in length, the 236 * upper six bits of the tag byte contain (len-1). The literal 237 * follows immediately thereafter in the bytestream. - For 238 * longer literals, the (len-1) value is stored after the tag 239 * byte, little-endian. The upper six bits of the tag byte 240 * describe how many bytes are used for the length; 60, 61, 62 241 * or 63 for 1-4 bytes, respectively. The literal itself follows 242 * after the length. 243 */ 244 private int readLiteralLength(final int b) throws IOException { 245 int length; 246 switch (b >> 2) { 247 case 60: 248 length = readOneByte(); 249 if (length == -1) { 250 throw new IOException("Premature end of stream reading literal length"); 251 } 252 break; 253 case 61: 254 length = (int) ByteUtils.fromLittleEndian(supplier, 2); 255 break; 256 case 62: 257 length = (int) ByteUtils.fromLittleEndian(supplier, 3); 258 break; 259 case 63: 260 length = (int) ByteUtils.fromLittleEndian(supplier, 4); 261 break; 262 default: 263 length = b >> 2; 264 break; 265 } 266 267 return length + 1; 268 } 269 270 /** 271 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 272 * 1), stored as a little-endian varint. Varints consist of a series of 273 * bytes, where the lower 7 bits are data and the upper bit is set iff there 274 * are more bytes to be read. In other words, an uncompressed length of 64 275 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) 276 * would be stored as 0xFE 0xFF 0x7F. 277 * 278 * @return The size of the uncompressed data 279 * 280 * @throws IOException 281 * Could not read a byte 282 */ 283 private long readSize() throws IOException { 284 int index = 0; 285 long sz = 0; 286 int b = 0; 287 288 do { 289 b = readOneByte(); 290 if (b == -1) { 291 throw new IOException("Premature end of stream reading size"); 292 } 293 sz |= (b & 0x7f) << (index++ * 7); 294 } while (0 != (b & 0x80)); 295 return sz; 296 } 297 298 /** 299 * Get the uncompressed size of the stream 300 * 301 * @return the uncompressed size 302 */ 303 @Override 304 public int getSize() { 305 return size; 306 } 307 308 private enum State { 309 NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE 310 } 311}