001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.io.InputStream; 023import java.util.Arrays; 024 025import org.apache.commons.compress.compressors.CompressorInputStream; 026import org.apache.commons.compress.utils.ByteUtils; 027import org.apache.commons.compress.utils.IOUtils; 028import org.apache.commons.compress.utils.InputStreamStatistics; 029import org.apache.commons.io.input.BoundedInputStream; 030 031/** 032 * Encapsulates code common to LZ77 decompressors. 033 * 034 * <p> 035 * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for 036 * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only 037 * used for back-references. 038 * </p> 039 * 040 * <p> 041 * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg 042 * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise. 043 * </p> 044 * 045 * <p> 046 * The contract for subclasses' {@code read} implementation is: 047 * </p> 048 * <ul> 049 * 050 * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li> 051 * 052 * <li>Use {@link #readOneByte} to access the underlying stream directly.</li> 053 * 054 * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it 055 * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li> 056 * 057 * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference} 058 * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return 059 * {@code 0}.</li> 060 * 061 * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li> 062 * 063 * </ul> 064 * 065 * <p> 066 * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read. 067 * </p> 068 * 069 * @since 1.14 070 */ 071public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics { 072 073 /** Size of the window - must be bigger than the biggest offset expected. */ 074 private final int windowSize; 075 076 /** 077 * Buffer to write decompressed bytes to for back-references, will be three times windowSize big. 078 * 079 * <p> 080 * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy 081 * back-references. 082 * </p> 083 */ 084 private final byte[] buf; 085 086 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */ 087 private int writeIndex; 088 089 /** Index of the next byte to be read. */ 090 private int readIndex; 091 092 /** The underlying stream to read compressed data from */ 093 private final BoundedInputStream in; 094 095 /** Number of bytes still to be read from the current literal or back-reference. */ 096 private long bytesRemaining; 097 098 /** Offset of the current back-reference. */ 099 private int backReferenceOffset; 100 101 /** Uncompressed size */ 102 private int size; 103 104 // used in no-arg read method 105 private final byte[] oneByte = new byte[1]; 106 107 /** 108 * Supplier that delegates to {@link #readOneByte}. 109 */ 110 protected final ByteUtils.ByteSupplier supplier = this::readOneByte; 111 112 /** 113 * Creates a new LZ77 input stream. 114 * 115 * @param is An InputStream to read compressed data from 116 * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected. 117 * @throws IllegalArgumentException if windowSize is not bigger than 0 118 */ 119 public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) { 120 this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get(); 121 if (windowSize <= 0) { 122 throw new IllegalArgumentException("windowSize must be bigger than 0"); 123 } 124 this.windowSize = windowSize; 125 buf = new byte[3 * windowSize]; 126 writeIndex = readIndex = 0; 127 bytesRemaining = 0; 128 } 129 130 /** {@inheritDoc} */ 131 @Override 132 public int available() { 133 return writeIndex - readIndex; 134 } 135 136 /** {@inheritDoc} */ 137 @Override 138 public void close() throws IOException { 139 in.close(); 140 } 141 142 /** 143 * @since 1.17 144 */ 145 @Override 146 public long getCompressedCount() { 147 return in.getCount(); 148 } 149 150 /** 151 * Gets the uncompressed size of the stream 152 * 153 * @return the uncompressed size 154 */ 155 public int getSize() { 156 return size; 157 } 158 159 /** 160 * Is there still data remaining inside the current block? 161 * 162 * @return true if there is still data remaining inside the current block. 163 */ 164 protected final boolean hasMoreDataInBlock() { 165 return bytesRemaining > 0; 166 } 167 168 /** 169 * Adds some initial data to fill the window with. 170 * 171 * <p> 172 * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the 173 * LZ4 frame format using block dependency. 174 * </p> 175 * 176 * @param data the data to fill the window with. 177 * @throws IllegalStateException if the stream has already started to read data 178 */ 179 public void prefill(final byte[] data) { 180 if (writeIndex != 0) { 181 throw new IllegalStateException("The stream has already been read from, can't prefill anymore"); 182 } 183 // we don't need more data than the big offset could refer to, so cap it 184 final int len = Math.min(windowSize, data.length); 185 // we need the last data as we are dealing with *back*-references 186 System.arraycopy(data, data.length - len, buf, 0, len); 187 writeIndex += len; 188 readIndex += len; 189 } 190 191 /** {@inheritDoc} */ 192 @Override 193 public int read() throws IOException { 194 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 195 } 196 197 /** 198 * Reads data from the current back-reference. 199 * 200 * @param b buffer to write data to 201 * @param off offset to start writing to 202 * @param len maximum amount of data to read 203 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass 204 * @throws NullPointerException if {@code b} is null 205 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off} 206 */ 207 protected final int readBackReference(final byte[] b, final int off, final int len) { 208 final int avail = available(); 209 if (len > avail) { 210 tryToCopy(len - avail); 211 } 212 return readFromBuffer(b, off, len); 213 } 214 215 private int readFromBuffer(final byte[] b, final int off, final int len) { 216 final int readable = Math.min(len, available()); 217 if (readable > 0) { 218 System.arraycopy(buf, readIndex, b, off, readable); 219 readIndex += readable; 220 if (readIndex > 2 * windowSize) { 221 slideBuffer(); 222 } 223 } 224 size += readable; 225 return readable; 226 } 227 228 /** 229 * Reads data from the current literal block. 230 * 231 * @param b buffer to write data to 232 * @param off offset to start writing to 233 * @param len maximum amount of data to read 234 * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass 235 * @throws IOException if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read 236 * @throws NullPointerException if {@code b} is null 237 * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off} 238 */ 239 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException { 240 final int avail = available(); 241 if (len > avail) { 242 tryToReadLiteral(len - avail); 243 } 244 return readFromBuffer(b, off, len); 245 } 246 247 /** 248 * Reads a single byte from the real input stream and ensures the data is accounted for. 249 * 250 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached. 251 * @throws IOException if the underlying stream throws 252 */ 253 protected final int readOneByte() throws IOException { 254 final int b = in.read(); 255 if (b != -1) { 256 count(1); 257 return b & 0xFF; 258 } 259 return -1; 260 } 261 262 private void slideBuffer() { 263 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2); 264 writeIndex -= windowSize; 265 readIndex -= windowSize; 266 } 267 268 /** 269 * Used by subclasses to signal the next block contains a back-reference with the given coordinates. 270 * 271 * @param offset the offset of the back-reference 272 * @param length the length of the back-reference 273 * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative 274 */ 275 protected final void startBackReference(final int offset, final long length) { 276 if (offset <= 0 || offset > writeIndex) { 277 throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number of bytes available for back-references"); 278 } 279 if (length < 0) { 280 throw new IllegalArgumentException("length must not be negative"); 281 } 282 backReferenceOffset = offset; 283 bytesRemaining = length; 284 } 285 286 /** 287 * Used by subclasses to signal the next block contains the given amount of literal data. 288 * 289 * @param length the length of the block 290 * @throws IllegalArgumentException if length is negative 291 */ 292 protected final void startLiteral(final long length) { 293 if (length < 0) { 294 throw new IllegalArgumentException("length must not be negative"); 295 } 296 bytesRemaining = length; 297 } 298 299 private void tryToCopy(final int bytesToCopy) { 300 // this will fit into the buffer without sliding and not 301 // require more than is available inside the back-reference 302 final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex); 303 if (copy == 0) { 304 // NOP 305 } else if (backReferenceOffset == 1) { // pretty common special case 306 final byte last = buf[writeIndex - 1]; 307 Arrays.fill(buf, writeIndex, writeIndex + copy, last); 308 writeIndex += copy; 309 } else if (copy < backReferenceOffset) { 310 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy); 311 writeIndex += copy; 312 } else { 313 // back-reference overlaps with the bytes created from it 314 // like go back two bytes and then copy six (by copying 315 // the last two bytes three time). 316 final int fullRots = copy / backReferenceOffset; 317 for (int i = 0; i < fullRots; i++) { 318 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset); 319 writeIndex += backReferenceOffset; 320 } 321 322 final int pad = copy - backReferenceOffset * fullRots; 323 if (pad > 0) { 324 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad); 325 writeIndex += pad; 326 } 327 } 328 bytesRemaining -= copy; 329 } 330 331 private void tryToReadLiteral(final int bytesToRead) throws IOException { 332 // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer" 333 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex); 334 final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */; 335 count(bytesRead); 336 if (reallyTryToRead != bytesRead) { 337 throw new IOException("Premature end of stream reading literal"); 338 } 339 writeIndex += reallyTryToRead; 340 bytesRemaining -= reallyTryToRead; 341 } 342}