package org.commoncrawl.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataOutputBuffer;
/**
* A collection of utility methods for working on GZIPed data.
*/
public class GZIPUtils {
private static final Log LOG = LogFactory
.getLog(GZIPUtils.class);
private static final int EXPECTED_COMPRESSION_RATIO = 5;
private static final int BUF_SIZE = 4096;
public static class UnzipResult {
public UnzipResult(byte data[],int offset,int length, boolean truncated) {
this.data = new FlexBuffer(data,offset,length);
this.wasTruncated = truncated;
}
public FlexBuffer data;
public boolean wasTruncated = false;
}
/**
* Returns an gunzipped copy of the input array. If the gzipped input has been
* truncated or corrupted, a best-effort attempt is made to unzip as much as
* possible. If no data can be extracted <code>null</code> is returned.
*/
public static final UnzipResult unzipBestEffort(byte[] in) {
return unzipBestEffort(in, 0, in.length, Integer.MAX_VALUE);
}
public static final UnzipResult unzipBestEffort(byte[] in, int sizeLimit) {
return unzipBestEffort(in, 0, in.length, sizeLimit);
}
/**
* Returns an gunzipped copy of the input array, truncated to
* <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
* truncated or corrupted, a best-effort attempt is made to unzip as much as
* possible. If no data can be extracted <code>null</code> is returned.
*/
public static final UnzipResult unzipBestEffort(byte[] in, int offset,
int sizeIn, int sizeLimit) {
try {
// decompress using GZIPInputStream
DataOutputBuffer outStream = new DataOutputBuffer(
EXPECTED_COMPRESSION_RATIO * in.length);
boolean truncated = false;
GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(
in, offset, sizeIn));
byte[] buf = new byte[BUF_SIZE];
int written = 0;
while (true) {
try {
int size = inStream.read(buf);
if (size <= 0)
break;
if ((written + size) > sizeLimit) {
outStream.write(buf, 0, sizeLimit - written);
truncated = true;
break;
}
outStream.write(buf, 0, size);
written += size;
} catch (Exception e) {
break;
}
}
try {
outStream.close();
} catch (IOException e) {
}
return new UnzipResult(outStream.getData(),0,outStream.getLength(), truncated);
} catch (IOException e) {
return null;
} catch (OutOfMemoryError e) {
LOG.fatal(CCStringUtils.stringifyException(e));
return null;
}
}
/**
* Returns an gunzipped copy of the input array.
*
* @throws IOException
* if the input cannot be properly decompressed
*/
public static final byte[] unzip(byte[] in) throws IOException {
// decompress using GZIPInputStream
ByteArrayOutputStream outStream = new ByteArrayOutputStream(
EXPECTED_COMPRESSION_RATIO * in.length);
GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in));
byte[] buf = new byte[BUF_SIZE];
while (true) {
int size = inStream.read(buf);
if (size <= 0)
break;
outStream.write(buf, 0, size);
}
outStream.close();
return outStream.toByteArray();
}
/**
* Returns an gzipped copy of the input array.
*/
public static final byte[] zip(byte[] in) {
try {
// compress using GZIPOutputStream
ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
/ EXPECTED_COMPRESSION_RATIO);
GZIPOutputStream outStream = new GZIPOutputStream(byteOut);
try {
outStream.write(in);
} catch (Exception e) {
LOG.warn(CCStringUtils.stringifyException(e));
}
try {
outStream.close();
} catch (IOException e) {
LOG.warn(CCStringUtils.stringifyException(e));
}
return byteOut.toByteArray();
} catch (IOException e) {
LOG.warn(CCStringUtils.stringifyException(e));
return null;
}
}
}