package org.wikibrain.matrix; import gnu.trove.map.hash.TIntLongHashMap; import org.apache.commons.io.FileUtils; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.IntBuffer; import java.nio.LongBuffer; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.util.Iterator; import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Implementation of a dense matrix. * The rows are memory mapped, so they can be immediately read from disk. * All rows must have the same columns in the same order. */ public class DenseMatrix implements Matrix<DenseMatrixRow> { public static final Logger LOG = LoggerFactory.getLogger(DenseMatrix.class); public static final int FILE_HEADER = 0xabccba; private int numRows; private IntBuffer rowIds; private LongBuffer rowOffsets; private int colIds[]; private FileChannel channel; private File path; MemoryMappedMatrix rowBuffers; private ValueConf vconf; // default header page size is 100MB, will be expanded if necessary public static final int DEFAULT_HEADER_SIZE = 100 * 1024 * 1024; /** * Create a dense matrix based on the data in a particular file. * @param path Path to the matrix data file. * @throws java.io.IOException */ public DenseMatrix(File path) throws IOException { this.path = path; info("initializing sparse matrix with file length " + FileUtils.sizeOf(path)); this.channel = (new FileInputStream(path)).getChannel(); readHeaders(); rowBuffers = new MemoryMappedMatrix(path, channel, rowIds, rowOffsets); } private void readHeaders() throws IOException { long size = Math.min(channel.size(), DEFAULT_HEADER_SIZE); MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, size); // read header if (buffer.getInt(0) != FILE_HEADER) { throw new IOException("invalid file header: " + buffer.getInt(0)); } this.vconf = new ValueConf(buffer.getFloat(4), buffer.getFloat(8)); this.numRows = buffer.getInt(12); int numCols = buffer.getInt(16); int headerSize = 20 + 12 * numRows + 4 * numCols; if (headerSize > DEFAULT_HEADER_SIZE) { info("maxPageSize not large enough for entire header. Resizing to " + headerSize); buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, headerSize); } debug("preparing buffer for " + numRows + " rows"); buffer.position(20); buffer.limit(buffer.position() + 4 * numRows); rowIds = buffer.slice().asIntBuffer(); if (rowIds.capacity() != numRows) { throw new IllegalStateException(); } buffer.position(20 + 4 * numRows); buffer.limit(buffer.position() + 8 * numRows); rowOffsets = buffer.slice().asLongBuffer(); if (rowOffsets.capacity() != numRows) { throw new IllegalStateException(); } // read column ids buffer.limit(headerSize); int pos = 20 + 12 * numRows; colIds = new int[numCols]; for (int i = 0; i < numCols; i++) { colIds[i] = buffer.getInt(pos); pos += 4; } if (!SparseMatrixUtils.isIncreasing(colIds)) { throw new IllegalArgumentException("Columns must be sorted by id"); } info("read " + colIds.length + " column ids"); } @Override public DenseMatrixRow getRow(int rowId) throws IOException { ByteBuffer bb = rowBuffers.getRow(rowId); if (bb == null) { return null; } else { return new DenseMatrixRow(vconf, colIds, bb); } } @Override public int[] getRowIds() { return rowBuffers.getRowIdsInDiskOrder(); } public int[] getColIds() { return colIds; } @Override public int getNumRows() { return numRows; } public ValueConf getValueConf() { return vconf; } @Override public Iterator<DenseMatrixRow> iterator() { return new DenseMatrixIterator(); } public class DenseMatrixIterator implements Iterator<DenseMatrixRow> { private AtomicInteger i = new AtomicInteger(); private int[] rowIds = rowBuffers.getRowIdsInDiskOrder(); @Override public boolean hasNext() { return i.get() < numRows; } @Override public DenseMatrixRow next() { try { return getRow(rowIds[i.getAndIncrement()]); } catch (IOException e) { LOG.error("getRow failed", e); return null; } } @Override public void remove() { throw new UnsupportedOperationException(); } } @Override public File getPath() { return path; } @Override public void close() throws IOException { rowBuffers.close(); } private void info(String message) { LOG.error("dense matrix " + path + ": " + message); } private void debug(String message) { LOG.error("dense matrix " + path + ": " + message); } }