/* * Copyright (C) 2008-2015 by Holger Arndt * * This file is part of the Universal Java Matrix Package (UJMP). * See the NOTICE file distributed with this work for additional * information regarding copyright ownership and licensing. * * UJMP is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * UJMP is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with UJMP; if not, write to the * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, * Boston, MA 02110-1301 USA */ package org.ujmp.core.util.io; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * This stream searches all line end characters (hex 0A) in a file. For Linux * line splitting is OK, for Windows the character hex 0D has to be eliminated * with String.trim() */ public class SeekableLineInputStream extends InputStream { private static final long MAXLINECOUNT = 1000; private final long[] countsTotal = new long[256]; private final long[] diffSum = new long[256]; private int bufferSize = 65536; private BufferedRandomAccessFile in; private final List<Long> lineEnds = new ArrayList<Long>(); private long maxLineLength = 0; public SeekableLineInputStream(String file) throws IOException { this(new File(file)); } public SeekableLineInputStream(File file) throws IOException { in = new BufferedRandomAccessFile(file, "r", bufferSize); long totalLength = in.length(); long last = -1; long lineCount = 0; final long[][] countsPerLine = new long[2][256]; long[] c = countsPerLine[0]; final long[] c0 = countsPerLine[0]; final long[] c1 = countsPerLine[1]; final byte[] bytes = new byte[bufferSize]; byte b; for (long pos = 0; pos < totalLength; pos += bufferSize) { Arrays.fill(bytes, (byte) 0); in.read(pos, bytes); for (int i = 0; i < bufferSize; i++) { b = bytes[i]; // count characters if (lineCount < MAXLINECOUNT) { c[b + 128]++; } // when a new line comes if (b == 10) { if (lineCount < MAXLINECOUNT) { // sum up character frequencies for (int j = 256; --j != -1;) { countsTotal[j] += c[j]; } for (int j = 256; --j != -1;) { diffSum[j] += Math.abs(c0[j] - c1[j]); } // ignore difference for first line if (lineCount == 0) { Arrays.fill(diffSum, 0); } c = countsPerLine[(int) (++lineCount % 2)]; Arrays.fill(c, 0); } long length = pos + i - last; if (length > maxLineLength) { maxLineLength = length; } lineEnds.add(pos + i); last = pos + i; } } } // remove last newline, if it is the last byte in the file lineEnds.remove(totalLength - 1); for (int i = 0; i < 256; i++) { if (countsTotal[i] > 0) System.out.println((i - 128) + " " + countsTotal[i] + " " + diffSum[i]); } System.out.println("This file has " + getLineCount() + " lines"); // if initial buffer size was too small, we have to increase it now if (maxLineLength + 1 > bufferSize) { bufferSize = (int) maxLineLength + 1; in.close(); in = new BufferedRandomAccessFile(file, "r", bufferSize); } } public long getMaxLineLength() { return maxLineLength; } public void close() throws IOException { in.close(); } public int getLineCount() { return lineEnds.size() + 1; } public int read() throws IOException { return in.read(); } public String getMostProbableDelimiter() { long lines = Math.min(MAXLINECOUNT, lineEnds.size()); return null; } public String readLine(int lineNumber) throws IOException { String line; long start = 0; if (lineNumber > 0) { start = lineEnds.get(lineNumber - 1) + 1; } long end = 0; if (lineNumber < getLineCount() - 1) { end = lineEnds.get(lineNumber); } else { end = in.length(); } int length = (int) (end - start); if (length == 0) { return ""; } byte[] bytes = new byte[length]; in.read(start, bytes); // eliminate Windows line end if (bytes[bytes.length - 1] == 13) { line = new String(bytes, 0, bytes.length - 1); } else { line = new String(bytes); } return line; } }