package it.unimi.dsi.io; /* * DSI utilities * * Copyright (C) 2005-2009 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectList; import it.unimi.dsi.lang.MutableString; import java.io.Closeable; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.AbstractCollection; import java.util.Iterator; import java.util.NoSuchElementException; import java.util.zip.GZIPInputStream; /** A wrapper exhibiting the lines of a file as a {@link java.util.Collection}. * * <P><strong>Warning</strong>: the lines returned by iterators generated by * instances of this class <em>are not cacheable</em>. The returned value is * a {@link it.unimi.dsi.lang.MutableString} instance that is reused * at each call, and that is <em>modified by a call to {@link java.util.Iterator#hasNext() hasNext()}</em>. * Thus, for instance, * <pre> * ObjectIterators.unwrap( fileLinesColletion.iterator() ); * </pre> * will not give the expected results. Use {@link #allLines()} to get * the list of all lines (again, under the form of compact {@link it.unimi.dsi.lang.MutableString}s). * Note also that {@link #toString()} will return a single string containing all * file lines separated by the string associated to the system property <samp>line.separator</samp>. * * <P>An instance of this class allows to access the lines of a file as a * {@link java.util.Collection}. Using {@linkplain java.util.Collection#contains(java.lang.Object) * direct access} is strongly discouraged (it will require a full scan of the file), but * the {@link #iterator()} can be fruitfully used to scan the file, and can be called any * number of times, as it opens an independent input stream at each call. For the * same reason, the returned iterator type ({@link it.unimi.dsi.io.FileLinesCollection.FileLinesIterator}) * is {@link java.io.Closeable}, and should be closed after usage. * * <p>Using a suitable {@linkplain #FileLinesCollection(CharSequence, String, boolean) constructor}, it is possible * to specify that the file is compresse in <samp>gzip</samp> format (in this case, it will be opened using a {@link GZIPInputStream}). * * <P>Note that the first call to {@link #size()} will require a full file scan. * * @author Sebastiano Vigna * @since 0.9.2 */ public class FileLinesCollection extends AbstractCollection<MutableString> { /** The filename upon which this file-lines collection is based. */ private final String filename; /** The encoding of {@link #filename}, or <code>null</code> for the standard platform encoding. */ private final String encoding; /** The cached size of the collection. */ private int size = -1; /** Whether {@link #filename} is zipped. */ private final boolean zipped; /** Creates a file-lines collection for the specified filename with the specified encoding. * * @param filename a filename. * @param encoding an encoding. */ public FileLinesCollection( final CharSequence filename, final String encoding ) { this( filename, encoding, false ); } /** Creates a file-lines collection for the specified filename with the specified encoding, optionally assuming * that the file is compressed using <samp>gzip</samp> format. * * @param filename a filename. * @param encoding an encoding. * @param zipped whether <samp>filename</samp> is zipped. */ public FileLinesCollection( final CharSequence filename, final String encoding, final boolean zipped ) { this.zipped = zipped; this.filename = filename.toString(); this.encoding = encoding; } /** An iterator over the lines of a {@link FileLinesCollection}. * * <p>Instances of this class open an {@link java.io.InputStream}, and thus should be {@linkplain Closeable#close() closed} after * usage. A “safety-net” finaliser tries to take care of the cases in which * closing an instance is impossible. An exhausted iterator, however, will be closed automagically. */ public static final class FileLinesIterator implements Iterator<MutableString>, SafelyCloseable { private FastBufferedReader fbr; MutableString s = new MutableString(), next; boolean toAdvance = true; private FileLinesIterator( final String filename, final String encoding, final boolean zipped ) { try { fbr = encoding != null ? new FastBufferedReader( new InputStreamReader( zipped ? new GZIPInputStream( new FileInputStream( filename ) ) : new FileInputStream( filename ), encoding ) ) : new FastBufferedReader( new FileReader( filename ) ); } catch (IOException e) { throw new RuntimeException( e ); } } public boolean hasNext() { if ( toAdvance ) { try { next = fbr.readLine( s ); if ( next == null ) close(); } catch (IOException e) { throw new RuntimeException( e ); } toAdvance = false; } return next != null; } public MutableString next() { if ( ! hasNext() ) throw new NoSuchElementException(); toAdvance = true; return s; } public void remove() { throw new UnsupportedOperationException(); } public synchronized void close() { if ( fbr == null ) return; try { fbr.close(); } catch ( IOException e ) { throw new RuntimeException( e ); } finally { fbr = null; } } protected synchronized void finalize() throws Throwable { try { if ( fbr != null ) close(); } finally { super.finalize(); } } } public FileLinesIterator iterator() { return new FileLinesIterator( filename, encoding, zipped ); } public synchronized int size() { if ( size == -1 ) { FileLinesIterator i = iterator(); size = 0; while( i.hasNext() ) { size++; i.next(); } i.close(); } return size; } /** Returns all lines of the file wrapped by this file-lines collection. * * @return all lines of the file wrapped by this file-lines collection. */ public ObjectList<MutableString> allLines() { final ObjectArrayList<MutableString> result = new ObjectArrayList<MutableString>(); for( Iterator<MutableString> i = iterator(); i.hasNext(); ) result.add( i.next().copy() ); return result; } public String toString() { final MutableString separator = new MutableString( System.getProperty( "line.separator" ) ); final MutableString s = new MutableString(); for( MutableString l: this ) s.append( l ).append( separator ); return s.toString(); } }