/*
* #%L
* gitools-core
* %%
* Copyright (C) 2013 Universitat Pompeu Fabra - Biomedical Genomics group
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-3.0.html>.
* #L%
*/
package org.gitools.matrix.model.compressmatrix;
import org.gitools.api.matrix.MatrixDimensionKey;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.zip.Deflater;
/**
 * Base class for matrix compressors.
 * <p>
 * {@link #initialize(IMatrixReader)} scans the input once to collect the row and
 * column identifiers, the value header, and a frequency-based preset dictionary
 * for the deflate algorithm. {@link #compressRow(NotCompressRow)} then compresses
 * individual rows using that dictionary.
 * <p>
 * A single {@link Deflater} and a single output buffer are reused by every call
 * to {@link #compressRow(NotCompressRow)}, so one instance must not be used to
 * compress rows from several threads at the same time.
 * <p>
 * NOTE(review): the {@link Deflater} is never released with {@code end()}; its
 * native memory is only reclaimed when the instance is garbage collected.
 */
public abstract class AbstractCompressor {

    /** Field separator used in the flat text representation of the matrix. */
    protected static final char SEPARATOR = '\t';

    /** Only lines read before this count contribute to the frequency table. */
    private static final int MAX_LINES_TO_DICTIONARY = 5000000;

    /** Minimum number of occurrences a value needs to enter the dictionary. */
    private static final int MIN_FREQUENCY = 2;

    /** Maximum number of values stored in the dictionary. */
    private static final int MAX_DICTIONARY_ENTRIES = 10000;

    /** Largest array size that is safe to request from the JVM. */
    private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE - 8;

    /** Smallest size the output buffer is grown to when it turns out to be too small. */
    private static final int MIN_BUFFER_SIZE = 64;

    private CompressDimension rows;
    private CompressDimension columns;
    private String[] header;
    private byte[] dictionary;
    private byte[] outBuffer;
    private final Deflater deflater = new Deflater();
    private long fileLinesCount;
    private long maxLineLength;

    public AbstractCompressor() {
    }

    /**
     * @return the column dimension found by {@link #initialize(IMatrixReader)}
     */
    protected CompressDimension getColumns() {
        return columns;
    }

    /**
     * @return the row dimension found by {@link #initialize(IMatrixReader)}
     */
    protected CompressDimension getRows() {
        return rows;
    }

    /**
     * @return the header fields after the first two (identifier) fields
     */
    protected String[] getHeader() {
        return header;
    }

    /**
     * @return the preset deflate dictionary built by {@link #initialize(IMatrixReader)}
     */
    protected byte[] getDictionary() {
        return dictionary;
    }

    /**
     * @return the largest sum of value-field lengths (each plus one separator) seen on a line
     */
    protected long getMaxLineLength() {
        return maxLineLength;
    }

    /**
     * @return the number of data lines read (the header line is not counted)
     */
    protected long getTotalLines() {
        return fileLinesCount;
    }

    /**
     * Convert an array of Strings into a byte array. Every value, including the
     * last one, is followed by a {@link #SEPARATOR}.
     *
     * @param values The strings
     * @return The UTF-8 encoded bytes
     * @throws java.io.UnsupportedEncodingException if UTF-8 is not supported
     */
    public static byte[] stringToByteArray(String[] values) throws UnsupportedEncodingException {
        StringBuilder buffer = new StringBuilder(values.length * 10);
        for (String value : values) {
            buffer.append(value).append(SEPARATOR);
        }
        return buffer.toString().getBytes("UTF-8");
    }

    /**
     * Fast field split.
     * <p>
     * Double quotes in the selected field are replaced by spaces and the result
     * is trimmed.
     *
     * @param str The string to split using SEPARATOR
     * @param num The zero-based position to return
     * @return The string at 'num' position using 'SEPARATOR' in 'str' string, or
     *         {@code null} if 'str' contains fewer than 'num' separators.
     */
    protected static String parseField(String str, int num) {
        // Skip 'num' separators; 'start' ends on the separator preceding the field
        int start = -1;
        for (int i = 0; i < num; i++) {
            start = str.indexOf(SEPARATOR, start + 1);
            if (start == -1) {
                return null;
            }
        }

        int end = str.indexOf(SEPARATOR, start + 1);
        if (end == -1) {
            end = str.length();
        }

        String result = str.substring(start + 1, end);
        return result.replace('"', ' ').trim();
    }

    /**
     * Compress one row.
     *
     * @param row The uncompressed row
     * @return A compressed row holding the uncompressed length and the deflated bytes
     * @throws Exception if the row cannot be serialized or compressed
     */
    protected CompressRow compressRow(NotCompressRow row) throws Exception {
        // Prepare a buffer with all the columns of this row
        byte[] input = row.toByteArray();

        // Compress the columns into 'outBuffer' and keep only the bytes written
        int length = compressDeflater(input);
        byte[] content = Arrays.copyOf(outBuffer, length);

        return new CompressRow(input.length, content);
    }

    /**
     * Compress an 'input' byte array into the 'outBuffer'.
     * <p>
     * The buffer is grown when the compressed output does not fit, so the result
     * is never silently truncated.
     *
     * @param input Array to compress
     * @return The length of the compressed buffer
     * @throws Exception if the compressor has not been initialized or the output
     *                   cannot fit in a byte array
     */
    private int compressDeflater(byte[] input) throws Exception {
        if (dictionary == null || outBuffer == null) {
            throw new IllegalStateException("initialize() must be called before compressing rows");
        }

        deflater.reset();
        deflater.setInput(input);
        deflater.setDictionary(dictionary);
        deflater.finish();

        int length = deflater.deflate(outBuffer);

        // A single deflate() call stops when the buffer is full; keep going
        // with a bigger buffer until the whole stream has been produced.
        while (!deflater.finished()) {
            if (outBuffer.length >= MAX_BUFFER_SIZE) {
                throw new IllegalStateException("Compressed row does not fit in a byte array");
            }
            long grown = Math.max(2L * outBuffer.length, (long) MIN_BUFFER_SIZE);
            outBuffer = Arrays.copyOf(outBuffer, (int) Math.min(grown, (long) MAX_BUFFER_SIZE));
            length += deflater.deflate(outBuffer, length, outBuffer.length - length);
        }

        return length;
    }

    /**
     * Builds a frequency based compression dictionary and looks for all the
     * columns and rows identifiers available.
     * <p>
     * The first line returned by the reader is the header. On every following
     * line, field 0 is collected as a column identifier, field 1 as a row
     * identifier and the remaining fields are treated as values. The reader is
     * always closed, even when reading fails.
     *
     * @param reader Input matrix reader
     * @throws Exception if the header or a data line has fewer than two fields,
     *                   or if reading fails
     */
    protected void initialize(IMatrixReader reader) throws Exception {

        Map<String, Integer> frequencies = new HashMap<>();
        Set<String> rowIds = new HashSet<>(1000);
        Set<String> columnIds = new HashSet<>(1000);

        String[] headers;
        long lines = 0;
        int maxLength = 0;

        try {
            // Read the headers
            headers = reader.readNext();
            if (headers == null || headers.length < 2) {
                throw new IllegalArgumentException("The matrix header must have at least two fields");
            }

            String[] fields;
            while ((fields = reader.readNext()) != null) {
                lines++;

                if (fields.length < 2) {
                    throw new IllegalArgumentException("Data line " + lines + " has fewer than two fields");
                }

                columnIds.add(fields[0]);
                rowIds.add(fields[1]);

                // Only the first lines feed the frequency table, but the line
                // length must be tracked for every line because the output
                // buffer size depends on the real maximum.
                boolean sampled = lines < MAX_LINES_TO_DICTIONARY;
                int length = 0;
                for (int i = 2; i < fields.length; i++) {
                    length += fields[i].length() + 1;

                    if (sampled) {
                        Integer freq = frequencies.get(fields[i]);
                        frequencies.put(fields[i], (freq == null) ? 1 : freq + 1);
                    }
                }

                if (length > maxLength) {
                    maxLength = length;
                }
            }
        } finally {
            reader.close();
        }

        this.header = Arrays.copyOfRange(headers, 2, headers.length);
        this.fileLinesCount = lines;
        this.maxLineLength = maxLength;
        this.dictionary = buildDictionary(frequencies);

        // Initial size of the buffer used when compressing; computed in 'long'
        // to avoid integer overflow and clamped to a valid array size.
        long required = 2L * (maxLength + 1L) * columnIds.size();
        this.outBuffer = new byte[(int) Math.min(required, (long) MAX_BUFFER_SIZE)];

        // Initialize rows and columns
        this.rows = new CompressDimension(MatrixDimensionKey.ROWS, rowIds.toArray(new String[rowIds.size()]));
        this.columns = new CompressDimension(MatrixDimensionKey.COLUMNS, columnIds.toArray(new String[columnIds.size()]));
    }

    /**
     * Converts a frequency table into a deflate dictionary.
     * <p>
     * Values seen fewer than {@link #MIN_FREQUENCY} times are discarded. The
     * remaining values are written in ascending frequency order, so the most
     * frequent ones sit at the end of the dictionary; when there are more than
     * {@link #MAX_DICTIONARY_ENTRIES} values only the most frequent are kept.
     * A single {@link #SEPARATOR} is appended at the end.
     *
     * @param frequencies Number of occurrences of each value
     * @return The UTF-8 encoded dictionary
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    private static byte[] buildDictionary(Map<String, Integer> frequencies) throws UnsupportedEncodingException {

        // Discard the values that are too rare to be worth a dictionary slot
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(frequencies.size());
        for (Map.Entry<String, Integer> entry : frequencies.entrySet()) {
            if (entry.getValue() >= MIN_FREQUENCY) {
                entries.add(entry);
            }
        }

        // Sort the frequency table by ascending frequency
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o1.getValue().compareTo(o2.getValue());
            }
        });

        // Keep the tail of the list: those are the most frequent values
        int first = Math.max(0, entries.size() - MAX_DICTIONARY_ENTRIES);

        StringBuilder buffer = new StringBuilder();
        for (Map.Entry<String, Integer> entry : entries.subList(first, entries.size())) {
            buffer.append(entry.getKey());
        }
        buffer.append(SEPARATOR);

        return buffer.toString().getBytes("UTF-8");
    }

    /**
     * Source of matrix lines, already split into fields.
     */
    public interface IMatrixReader {

        /**
         * @return the fields of the next line, or {@code null} when there are no more lines
         */
        String[] readNext();

        /**
         * Releases the underlying resources.
         */
        void close();
    }
}