/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package focusedCrawler.crawler.crawlercommons.fetcher;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Static helpers that expand compressed byte arrays (gzip and raw DEFLATE)
 * into memory, optionally stopping once a caller-supplied size limit is hit.
 *
 * <p>Expansion is best-effort: if the compressed data turns out to be
 * truncated or corrupt part-way through, the bytes successfully expanded so
 * far are returned and the failure is only logged at trace level.
 */
public class EncodingUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(EncodingUtils.class);

    // Guesses used only to pre-size the output buffer; they do not limit the output.
    private static final int EXPECTED_GZIP_COMPRESSION_RATIO = 5;
    private static final int EXPECTED_DEFLATE_COMPRESSION_RATIO = 5;

    // Size of the scratch buffer used for each read from the decompressing stream.
    private static final int BUF_SIZE = 4096;

    // Upper bound for the pre-sized output buffer; stays a little below
    // Integer.MAX_VALUE because some VMs cannot allocate arrays that large.
    private static final int MAX_INITIAL_CAPACITY = Integer.MAX_VALUE - 8;

    /**
     * Outcome of a size-limited expansion: the expanded bytes plus a flag
     * telling whether the output was cut off at the size limit.
     */
    public static class ExpandedResult {
        private byte[] _expanded;
        private boolean _isTruncated;

        /**
         * @param expanded    the expanded bytes (stored by reference, not copied)
         * @param isTruncated {@code true} if the expansion stopped at the size limit
         */
        public ExpandedResult(byte[] expanded, boolean isTruncated) {
            _expanded = expanded;
            _isTruncated = isTruncated;
        }

        /** @return the expanded bytes (the internal array, not a copy) */
        public byte[] getExpanded() {
            return _expanded;
        }

        public void setExpanded(byte[] expanded) {
            _expanded = expanded;
        }

        /** @return {@code true} if the output was cut off at the size limit */
        public boolean isTruncated() {
            return _isTruncated;
        }

        public void setTruncated(boolean isTruncated) {
            _isTruncated = isTruncated;
        }
    }

    /**
     * Expands gzip data with no practical size limit
     * ({@code Integer.MAX_VALUE} bytes).
     *
     * @param compressed gzip-encoded bytes; must not be {@code null}
     * @return the expanded bytes (possibly partial if the data is corrupt)
     * @throws IOException if the gzip header cannot be read
     */
    public static byte[] processGzipEncoded(byte[] compressed) throws IOException {
        return processGzipEncoded(compressed, Integer.MAX_VALUE).getExpanded();
    }

    /**
     * Expands gzip data, keeping at most {@code sizeLimit} bytes of output.
     *
     * @param compressed gzip-encoded bytes; must not be {@code null}
     * @param sizeLimit  maximum number of expanded bytes to keep; a value of
     *                   zero or less keeps nothing
     * @return the expanded bytes and whether they were cut off at the limit
     * @throws IOException if the gzip header cannot be read; failures while
     *                     reading the body are logged and yield a partial result
     */
    public static ExpandedResult processGzipEncoded(byte[] compressed, int sizeLimit) throws IOException {
        int capacity = initialCapacity(compressed.length, EXPECTED_GZIP_COMPRESSION_RATIO, sizeLimit);
        // The constructor parses the gzip header and is deliberately outside
        // the best-effort handling, so header problems reach the caller.
        GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(compressed));
        try {
            return expand(inStream, capacity, sizeLimit, "Exception unzipping content");
        } finally {
            // Closing the stream releases the inflater it created internally.
            safeClose(inStream);
        }
    }

    // TODO KKr The following routines are designed to support the deflate
    // compression standard (RFC 1951) for HTTP 1.1 (RFC 2616). However,
    // I was unable to verify that they really work correctly, so I've
    // removed deflate from SimpleHttpFetcher.DEFAULT_ACCEPT_ENCODING.

    /**
     * Expands raw DEFLATE data with no practical size limit
     * ({@code Integer.MAX_VALUE} bytes).
     *
     * @param content raw DEFLATE bytes; must not be {@code null}
     * @return the expanded bytes (possibly partial or empty if the data is corrupt)
     * @throws IOException declared for API compatibility
     */
    public static byte[] processDeflateEncoded(byte[] content) throws IOException {
        return processDeflateEncoded(content, Integer.MAX_VALUE);
    }

    /**
     * Expands raw DEFLATE data, keeping at most {@code sizeLimit} bytes of
     * output. Unlike the gzip variant, the caller is not told whether the
     * output was cut off at the limit.
     *
     * @param compressed raw DEFLATE bytes; must not be {@code null}
     * @param sizeLimit  maximum number of expanded bytes to keep; a value of
     *                   zero or less keeps nothing
     * @return the expanded bytes (possibly partial or empty if the data is corrupt)
     * @throws IOException declared for API compatibility
     */
    public static byte[] processDeflateEncoded(byte[] compressed, int sizeLimit) throws IOException {
        int capacity = initialCapacity(compressed.length, EXPECTED_DEFLATE_COMPRESSION_RATIO, sizeLimit);
        // "true" selects raw DEFLATE with no zlib (RFC 1950) header/trailer.
        // NOTE(review): RFC 2616 section 3.5 describes the "deflate" coding as
        // zlib-wrapped data; zlib-wrapped input is not handled here - confirm
        // which form the servers being crawled actually send.
        Inflater inflater = new Inflater(true);
        try {
            InflaterInputStream inStream = new InflaterInputStream(new ByteArrayInputStream(compressed), inflater);
            return expand(inStream, capacity, sizeLimit, "Exception inflating content").getExpanded();
        } finally {
            // An inflater handed to InflaterInputStream is not released by
            // closing the stream, so it has to be ended explicitly.
            inflater.end();
        }
    }

    /**
     * Chooses the initial output buffer size: the expected expanded size,
     * capped by the size limit and by the largest safe array size. Computed
     * in {@code long} so that very large inputs cannot overflow.
     */
    private static int initialCapacity(int compressedLength, int expectedRatio, int sizeLimit) {
        long estimate = (long) compressedLength * expectedRatio;
        long capped = Math.min(estimate, Math.min((long) sizeLimit, (long) MAX_INITIAL_CAPACITY));
        return (int) Math.max(0L, capped);
    }

    /**
     * Copies decompressed bytes from {@code inStream} into memory until the
     * stream ends, {@code sizeLimit} bytes have been kept, or a read fails.
     * A read failure is logged at trace level with {@code failureMessage}
     * and whatever was expanded up to that point is returned.
     */
    private static ExpandedResult expand(InputStream inStream, int initialCapacity, int sizeLimit,
            String failureMessage) {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream(initialCapacity);
        byte[] buf = new byte[BUF_SIZE];
        boolean isTruncated = false;
        int written = 0;

        try {
            while (true) {
                int size = inStream.read(buf);
                if (size <= 0) {
                    break;
                }

                // Subtracting (instead of adding written + size) cannot
                // overflow; clamping covers a negative sizeLimit.
                int remaining = Math.max(0, sizeLimit - written);
                if (size > remaining) {
                    isTruncated = true;
                    outStream.write(buf, 0, remaining);
                    break;
                }

                outStream.write(buf, 0, size);
                written += size;
            }
        } catch (IOException e) {
            // Truncated or corrupt input: keep what has been expanded so far.
            LOGGER.trace(failureMessage, e);
        }

        return new ExpandedResult(outStream.toByteArray(), isTruncated);
    }

    /**
     * Closes {@code stream} if it is not {@code null}, logging (rather than
     * propagating) any {@link IOException} raised by the close.
     */
    private static void safeClose(Closeable stream) {
        if (stream == null) {
            return;
        }

        try {
            stream.close();
        } catch (IOException e) {
            LOGGER.warn("IOException closing stream", e);
        }
    }
}