package edu.stanford.nlp.io;

import edu.stanford.nlp.util.*;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.util.*;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Helper Class for various I/O related things.
 * All members are static; the private constructor prevents instantiation.
 *
 * @author Kayur Patel, Teg Grenager
 */
public class IOUtils {

  /** Size (in chars) of the buffers used when slurping readers. */
  private static final int SLURPBUFFSIZE = 16000;

  /** The platform line separator, read from the "line.separator" system property. */
  public static final String eolChar = System.getProperty("line.separator");

  /** Encoding used when a caller does not specify one (e.g. by slurpFile(String)). */
  public static final String defaultEncoding = "utf-8";

  // A class of static methods
  private IOUtils() { }

  /**
   * Write object to a file with the specified name.  The file is silently gzipped if the filename ends with .gz.
   * Delegates to {@link #writeObjectToFile(Object, File)}.
   *
   * @param o Object to be written to file
   * @param filename Name of the file to write to
   * @throws IOException If can't write file.
   * @return File containing the object
   */
  public static File writeObjectToFile(Object o, String filename)
          throws IOException {
    return writeObjectToFile(o, new File(filename));
  }

  /**
   * Write an object to a specified File.  The file is silently gzipped if the filename ends with .gz.
   * The file is overwritten (append is passed as false).
   *
   * @param o Object to be written to file
   * @param file The File to write to
   * @throws IOException If File cannot be written
   * @return File containing the object
   */
  public static File writeObjectToFile(Object o, File file) throws IOException {
    return writeObjectToFile(o, file, false);
  }

  /**
   * Write an object to a specified File. The file is silently gzipped if the filename ends with .gz.
* * @param o Object to be written to file * @param file The temp File * @param append If true, append to this file instead of overwriting it * @throws IOException If File cannot be written * @return File containing the object */ public static File writeObjectToFile(Object o, File file, boolean append) throws IOException { // file.createNewFile(); // cdm may 2005: does nothing needed OutputStream os = new FileOutputStream(file, append); if (file.getName().endsWith(".gz")) { os = new GZIPOutputStream(os); } os = new BufferedOutputStream(os); ObjectOutputStream oos = new ObjectOutputStream(os); oos.writeObject(o); oos.close(); return file; } /** * Write object to a file with the specified name. * * @param o Object to be written to file * @param filename Name of the temp file * @return File containing the object, or null if an exception was caught */ public static File writeObjectToFileNoExceptions(Object o, String filename) { File file = null; ObjectOutputStream oos = null; try { file = new File(filename); // file.createNewFile(); // cdm may 2005: does nothing needed oos = new ObjectOutputStream(new BufferedOutputStream( new GZIPOutputStream(new FileOutputStream(file)))); oos.writeObject(o); oos.close(); } catch (Exception e) { e.printStackTrace(); } finally { closeIgnoringExceptions(oos); } return file; } /** * Write object to temp file which is destroyed when the program exits. * * @param o Object to be written to file * @param filename Name of the temp file * @throws IOException If file cannot be written * @return File containing the object */ public static File writeObjectToTempFile(Object o, String filename) throws IOException { File file = File.createTempFile(filename, ".tmp"); file.deleteOnExit(); writeObjectToFile(o, file); return file; } /** * Write object to a temp file and ignore exceptions. 
* * @param o Object to be written to file * @param filename Name of the temp file * @return File containing the object */ public static File writeObjectToTempFileNoExceptions(Object o, String filename) { try { return writeObjectToTempFile(o, filename); } catch (Exception e) { System.err.println("Error writing object to file " + filename); e.printStackTrace(); return null; } } private static OutputStream getBufferedOutputStream(String path) throws IOException { OutputStream os = new BufferedOutputStream(new FileOutputStream(path)); if (path.endsWith(".gz")) { os = new GZIPOutputStream(os); } return os; } //++ todo [cdm, Aug 2012]: Do we need the below methods? They're kind of weird in unnecessarily bypassing using a Writer. /** * Writes a string to a file. * * @param contents The string to write * @param path The file path * @param encoding The encoding to encode in * @throws IOException In case of failure */ public static void writeStringToFile(String contents, String path, String encoding) throws IOException { OutputStream writer = getBufferedOutputStream(path); writer.write(contents.getBytes(encoding)); writer.close(); } /** * Writes a string to a file, as UTF-8. 
* * @param contents The string to write * @param path The file path * @throws IOException In case of failure */ /** * Writes a string to a file, squashing exceptions * * @param contents The string to write * @param path The file path * @param encoding The encoding to encode in * */ public static void writeStringToFileNoExceptions(String contents, String path, String encoding) { OutputStream writer = null; try{ if (path.endsWith(".gz")) { writer = new GZIPOutputStream(new FileOutputStream(path)); } else { writer = new BufferedOutputStream(new FileOutputStream(path)); } writer.write(contents.getBytes(encoding)); } catch (Exception e) { e.printStackTrace(); } finally { if(writer != null){ closeIgnoringExceptions(writer); } } } /** * Writes a string to a temporary file * * @param contents The string to write * @param path The file path * @param encoding The encoding to encode in * @throws IOException In case of failure * @return The File written to */ public static File writeStringToTempFile(String contents, String path, String encoding) throws IOException { OutputStream writer; File tmp = File.createTempFile(path,".tmp"); if (path.endsWith(".gz")) { writer = new GZIPOutputStream(new FileOutputStream(tmp)); } else { writer = new BufferedOutputStream(new FileOutputStream(tmp)); } writer.write(contents.getBytes(encoding)); return tmp; } /** * Writes a string to a temporary file, as UTF-8 * * @param contents The string to write * @param path The file path * @throws IOException In case of failure */ public static void writeStringToTempFile(String contents, String path) throws IOException { writeStringToTempFile(contents, path, "UTF-8"); } /** * Writes a string to a temporary file, squashing exceptions * * @param contents The string to write * @param path The file path * @param encoding The encoding to encode in * @return The File that was written to */ public static File writeStringToTempFileNoExceptions(String contents, String path, String encoding) { OutputStream writer 
= null; File tmp = null; try { tmp = File.createTempFile(path,".tmp"); if (path.endsWith(".gz")) { writer = new GZIPOutputStream(new FileOutputStream(tmp)); } else { writer = new BufferedOutputStream(new FileOutputStream(tmp)); } writer.write(contents.getBytes(encoding)); } catch (Exception e) { e.printStackTrace(); } finally { closeIgnoringExceptions(writer); } return tmp; } /** * Writes a string to a temporary file with UTF-8 encoding, squashing exceptions * * @param contents The string to write * @param path The file path */ public static void writeStringToTempFileNoExceptions(String contents, String path) { writeStringToTempFileNoExceptions(contents, path, "UTF-8"); } //-- todo [cdm, Aug 2012]: Do we need the below methods? They're kind of weird in unnecessarily bypassing using a Writer. // todo [cdm, Sep 2013]: Can we remove this next method and its friends? (Weird in silently gzipping, overlaps other functionality.) /** * Read an object from a stored file. It is silently ungzipped, regardless of name. * * @param file The file pointing to the object to be retrieved * @throws IOException If file cannot be read * @throws ClassNotFoundException If reading serialized object fails * @return The object read from the file. 
*/
  public static <T> T readObjectFromFile(File file) throws IOException, ClassNotFoundException {
    Object o;
    try {
      o = readObjectClosingStream(file, true);
    } catch (java.util.zip.ZipException e) {
      // Not readable as gzip: fall back to a plain serialized stream.
      o = readObjectClosingStream(file, false);
    }
    return ErasureUtils.uncheckedCast(o);
  }

  /**
   * Reads one serialized object from a file and always closes the file,
   * whether or not reading succeeds.
   *
   * @param file The file to read from
   * @param gzipped Whether to gunzip the file contents while reading
   * @return The deserialized object
   * @throws IOException If the file cannot be read
   * @throws ClassNotFoundException If reading serialized object fails
   */
  private static Object readObjectClosingStream(File file, boolean gzipped)
          throws IOException, ClassNotFoundException {
    // "is" always refers to the outermost stream opened so far.
    InputStream is = new FileInputStream(file);
    try {
      if (gzipped) {
        is = new GZIPInputStream(is);
      }
      ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
      is = ois;
      return ois.readObject();
    } finally {
      closeIgnoringExceptions(is);
    }
  }

  /**
   * Opens a DataInputStream on a resource found via URL, classpath or file system.
   *
   * @param filenameUrlOrClassPath The resource to read from
   * @return The opened DataInputStream
   * @throws IOException If the resource cannot be opened
   */
  public static DataInputStream getDataInputStream(String filenameUrlOrClassPath) throws IOException {
    return new DataInputStream(getInputStreamFromURLOrClasspathOrFileSystem(filenameUrlOrClassPath));
  }

  /**
   * Opens a DataOutputStream on a file; output is gzipped if the name ends with ".gz".
   *
   * @param filename The file to write to
   * @return The opened DataOutputStream
   * @throws IOException If the file cannot be opened
   */
  public static DataOutputStream getDataOutputStream(String filename) throws IOException {
    return new DataOutputStream(getBufferedOutputStream((filename)));
  }

  /**
   * Read an object from a stored file.  The file can be anything obtained
   * via a URL, the filesystem, or the classpath (eg in a jar file).
   * The underlying stream is closed even if reading fails.
   *
   * @param filename The file pointing to the object to be retrieved
   * @throws IOException If file cannot be read
   * @throws ClassNotFoundException If reading serialized object fails
   * @return The object read from the file.
   */
  public static <T> T readObjectFromURLOrClasspathOrFileSystem(String filename) throws IOException, ClassNotFoundException {
    InputStream is = getInputStreamFromURLOrClasspathOrFileSystem(filename);
    try {
      ObjectInputStream ois = new ObjectInputStream(is);
      is = ois;
      Object o = ois.readObject();
      return ErasureUtils.uncheckedCast(o);
    } finally {
      closeIgnoringExceptions(is);
    }
  }

  /**
   * Reads the next object from an already open ObjectInputStream.
   * The stream is left open.
   *
   * @param ois The stream to read from
   * @return The object read, cast (unchecked) to the requested type
   * @throws IOException If the stream cannot be read
   * @throws ClassNotFoundException If reading serialized object fails
   */
  public static <T> T readObjectFromObjectStream(ObjectInputStream ois) throws IOException, ClassNotFoundException {
    Object o = ois.readObject();
    return ErasureUtils.uncheckedCast(o);
  }

  /**
   * Read an object from a stored file.
* * @param filename The filename of the object to be retrieved * @throws IOException If file cannot be read * @throws ClassNotFoundException If reading serialized object fails * @return The object read from the file. */ public static <T> T readObjectFromFile(String filename) throws IOException, ClassNotFoundException { return ErasureUtils.uncheckedCast(readObjectFromFile(new File(filename))); } /** * Read an object from a stored file without throwing exceptions. * * @param file The file pointing to the object to be retrieved * @return The object read from the file, or null if an exception occurred. */ public static <T> T readObjectFromFileNoExceptions(File file) { Object o = null; try { ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream( new GZIPInputStream(new FileInputStream(file)))); o = ois.readObject(); ois.close(); } catch (IOException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } return ErasureUtils.uncheckedCast(o); } public static int lineCount(String textFileOrUrl) throws IOException { BufferedReader r = readerFromString(textFileOrUrl); int numLines = 0; while (r.readLine() != null) { numLines++; } return numLines; } public static ObjectOutputStream writeStreamFromString(String serializePath) throws IOException { ObjectOutputStream oos; if (serializePath.endsWith(".gz")) { oos = new ObjectOutputStream(new BufferedOutputStream( new GZIPOutputStream(new FileOutputStream(serializePath)))); } else { oos = new ObjectOutputStream(new BufferedOutputStream( new FileOutputStream(serializePath))); } return oos; } public static ObjectInputStream readStreamFromString(String filenameOrUrl) throws IOException { InputStream is = getInputStreamFromURLOrClasspathOrFileSystem(filenameOrUrl); return new ObjectInputStream(is); } /** * Locates this file either in the CLASSPATH or in the file system. The CLASSPATH takes priority. 
* @param name The file or resource name * @throws FileNotFoundException If the file does not exist * @return The InputStream of name, or null if not found */ private static InputStream findStreamInClasspathOrFileSystem(String name) throws FileNotFoundException { // ms 10-04-2010: // - even though this may look like a regular file, it may be a path inside a jar in the CLASSPATH // - check for this first. This takes precedence over the file system. InputStream is = IOUtils.class.getClassLoader().getResourceAsStream(name); // windows File.separator is \, but getting resources only works with / if (is == null) { is = IOUtils.class.getClassLoader().getResourceAsStream(name.replaceAll("\\\\", "/")); } // if not found in the CLASSPATH, load from the file system if (is == null) is = new FileInputStream(name); return is; } /** * Check if this path exists either in the classpath or on the filesystem. * * @param name The file or resource name. * @return true if a call to {@link IOUtils#getBufferedReaderFromClasspathOrFileSystem(String)} would return a valid stream. */ public static boolean existsInClasspathOrFileSystem(String name) { InputStream is = IOUtils.class.getClassLoader().getResourceAsStream(name); if (is == null) { is = IOUtils.class.getClassLoader().getResourceAsStream(name.replaceAll("\\\\", "/")); } return is != null || new File(name).exists(); } /** * Locates this file either using the given URL, or in the CLASSPATH, or in the file system * The CLASSPATH takes priority over the file system! * This stream is buffered and gunzipped (if necessary). 
* * @param textFileOrUrl * @return An InputStream for loading a resource * @throws IOException */ public static InputStream getInputStreamFromURLOrClasspathOrFileSystem(String textFileOrUrl) throws IOException { InputStream in; if (textFileOrUrl.matches("https?://.*")) { URL u = new URL(textFileOrUrl); URLConnection uc = u.openConnection(); in = uc.getInputStream(); } else { try { in = findStreamInClasspathOrFileSystem(textFileOrUrl); } catch (FileNotFoundException e) { try { // Maybe this happens to be some other format of URL? URL u = new URL(textFileOrUrl); URLConnection uc = u.openConnection(); in = uc.getInputStream(); } catch (IOException e2) { // Don't make the original exception a cause, since it is almost certainly bogus throw new IOException("Unable to resolve \"" + textFileOrUrl + "\" as either " + "class path, filename or URL"); // , e2); } } } if (textFileOrUrl.endsWith(".gz")) { // gunzip it if necessary in = new GZIPInputStream(in, 65536); } // buffer this stream in = new BufferedInputStream(in); return in; } /** * Quietly opens a File. If the file ends with a ".gz" extension, * automatically opens a GZIPInputStream to wrap the constructed * FileInputStream. */ public static InputStream inputStreamFromFile(File file) throws RuntimeIOException { try { InputStream is = new BufferedInputStream(new FileInputStream(file)); if (file.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } return is; } catch (IOException e) { throw new RuntimeIOException(e); } } /** * Open a BufferedReader to a File. If the file's getName() ends in .gz, * it is interpreted as a gzipped file (and uncompressed). The file is then * interpreted as a utf-8 text file. 
* * @param file What to read from * @return The BufferedReader * @throws RuntimeIOException If there is an I/O problem */ public static BufferedReader readerFromFile(File file) { InputStream is = null; try { is = inputStreamFromFile(file); return new BufferedReader(new InputStreamReader(is, "UTF-8")); } catch (IOException ioe) { throw new RuntimeIOException(ioe); } finally { IOUtils.closeIgnoringExceptions(is); } } /** * Open a BufferedReader on stdin. Use the user's default encoding. * * @return The BufferedReader * @throws IOException If there is an I/O problem */ public static BufferedReader readerFromStdin() throws IOException { return new BufferedReader(new InputStreamReader(System.in)); } /** * Open a BufferedReader on stdin. Use the specified character encoding. * * @param encoding CharSet encoding. Maybe be null, in which case the * platform default encoding is used * @return The BufferedReader * @throws IOException If there is an I/O problem */ public static BufferedReader readerFromStdin(String encoding) throws IOException { if (encoding == null) { return new BufferedReader(new InputStreamReader(System.in)); } return new BufferedReader(new InputStreamReader(System.in, encoding)); } /** * Open a BufferedReader to a file or URL specified by a String name. If the * String starts with https?://, then it is first tried as a URL, otherwise it * is next tried as a resource on the CLASSPATH, and then finally it is tried * as a local file or other network-available file. If the String ends in .gz, it * is interpreted as a gzipped file (and uncompressed). The file is then * interpreted as a utf-8 text file. 
* * @param textFileOrUrl What to read from * @return The BufferedReader * @throws IOException If there is an I/O problem */ public static BufferedReader readerFromString(String textFileOrUrl) throws IOException { return new BufferedReader(new InputStreamReader( getInputStreamFromURLOrClasspathOrFileSystem(textFileOrUrl), "UTF-8")); } /** * Open a BufferedReader to a file or URL specified by a String name. If the * String starts with https?://, then it is first tried as a URL, otherwise it * is next tried as a resource on the CLASSPATH, and then finally it is tried * as a local file or other network-available file . If the String ends in .gz, it * is interpreted as a gzipped file (and uncompressed), else it is interpreted as * a regular text file in the given encoding. * * @param textFileOrUrl What to read from * @param encoding CharSet encoding. Maybe be null, in which case the * platform default encoding is used * @return The BufferedReader * @throws IOException If there is an I/O problem */ public static BufferedReader readerFromString(String textFileOrUrl, String encoding) throws IOException { InputStream is = getInputStreamFromURLOrClasspathOrFileSystem(textFileOrUrl); if (encoding == null) { return new BufferedReader(new InputStreamReader(is)); } return new BufferedReader(new InputStreamReader(is, encoding)); } /** * Returns an Iterable of the lines in the file. * * The file reader will be closed when the iterator is exhausted. IO errors * will throw an (unchecked) RuntimeIOException * * @param path The file whose lines are to be read. * @return An Iterable containing the lines from the file. */ public static Iterable<String> readLines(String path) { return readLines(path, null); } /** * Returns an Iterable of the lines in the file. * * The file reader will be closed when the iterator is exhausted. IO errors * will throw an (unchecked) RuntimeIOException * * @param path The file whose lines are to be read. 
* @param encoding The encoding to use when reading lines. * @return An Iterable containing the lines from the file. */ public static Iterable<String> readLines(String path, String encoding) { return new GetLinesIterable(path, null, encoding); } /** * Returns an Iterable of the lines in the file. * * The file reader will be closed when the iterator is exhausted. * * @param file The file whose lines are to be read. * @return An Iterable containing the lines from the file. */ public static Iterable<String> readLines(final File file) { return readLines(file, null, null); } /** * Returns an Iterable of the lines in the file. * * The file reader will be closed when the iterator is exhausted. * * @param file The file whose lines are to be read. * @param fileInputStreamWrapper * The class to wrap the InputStream with, e.g. GZIPInputStream. Note * that the class must have a constructor that accepts an * InputStream. * @return An Iterable containing the lines from the file. */ public static Iterable<String> readLines(final File file, final Class<? extends InputStream> fileInputStreamWrapper) { return readLines(file, fileInputStreamWrapper, null); } /** * Returns an Iterable of the lines in the file, wrapping the generated * FileInputStream with an instance of the supplied class. IO errors will * throw an (unchecked) RuntimeIOException * * @param file The file whose lines are to be read. * @param fileInputStreamWrapper * The class to wrap the InputStream with, e.g. GZIPInputStream. Note * that the class must have a constructor that accepts an * InputStream. * @param encoding The encoding to use when reading lines. * @return An Iterable containing the lines from the file. */ public static Iterable<String> readLines(final File file, final Class<? 
extends InputStream> fileInputStreamWrapper, final String encoding) { return new GetLinesIterable(file, fileInputStreamWrapper, encoding); } static class GetLinesIterable implements Iterable<String> { final File file; final String path; final Class<? extends InputStream> fileInputStreamWrapper; final String encoding; // TODO: better programming style would be to make this two // separate classes, but we don't expect to make more versions of // this class anyway GetLinesIterable(final File file, final Class<? extends InputStream> fileInputStreamWrapper, final String encoding) { this.file = file; this.path = null; this.fileInputStreamWrapper = fileInputStreamWrapper; this.encoding = encoding; } GetLinesIterable(final String path, final Class<? extends InputStream> fileInputStreamWrapper, final String encoding) { this.file = null; this.path = path; this.fileInputStreamWrapper = fileInputStreamWrapper; this.encoding = encoding; } private InputStream getStream() throws IOException { if (file != null) { return new FileInputStream(file); } else if (path != null) { return getInputStreamFromURLOrClasspathOrFileSystem(path); } else { throw new AssertionError("No known path to read"); } } public Iterator<String> iterator() { return new Iterator<String>() { protected BufferedReader reader = this.getReader(); protected String line = this.getLine(); public boolean hasNext() { return this.line != null; } public String next() { String nextLine = this.line; if (nextLine == null) { throw new NoSuchElementException(); } line = getLine(); return nextLine; } protected String getLine() { try { String result = this.reader.readLine(); if (result == null) { this.reader.close(); } return result; } catch (IOException e) { throw new RuntimeIOException(e); } } protected BufferedReader getReader() { try { InputStream stream = getStream(); if (fileInputStreamWrapper != null) { stream = fileInputStreamWrapper.getConstructor(InputStream.class).newInstance(stream); } if (encoding == null) { return 
new BufferedReader(new InputStreamReader(stream)); } else { return new BufferedReader(new InputStreamReader(stream, encoding)); } } catch (Exception e) { throw new RuntimeIOException(e); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } } /** * Given a reader, returns the lines from the reader as an Iterable. * * @param r input reader * @param includeEol whether to keep eol-characters in the returned strings * @return iterable of lines (as strings) */ public static Iterable<String> getLineIterable( Reader r, boolean includeEol) { if (includeEol) { return new EolPreservingLineReaderIterable(r); } else { return new LineReaderIterable( (r instanceof BufferedReader)? (BufferedReader) r:new BufferedReader(r) ); } } public static Iterable<String> getLineIterable( Reader r, int bufferSize, boolean includeEol) { if (includeEol) { return new EolPreservingLineReaderIterable(r, bufferSize); } else { return new LineReaderIterable( (r instanceof BufferedReader)? (BufferedReader) r:new BufferedReader(r, bufferSize) ); } } /** * Line iterator that uses BufferedReader.readLine() * EOL-characters are automatically discarded and not included in the strings returns */ private static final class LineReaderIterable implements Iterable<String> { private final BufferedReader reader; private LineReaderIterable( BufferedReader reader ) { this.reader = reader; } @Override public Iterator<String> iterator() { return new Iterator<String>() { private String next = getNext(); private String getNext() { try { return reader.readLine(); } catch (IOException ex) { throw new RuntimeIOException(ex); } } @Override public boolean hasNext() { return this.next != null; } @Override public String next() { String nextLine = this.next; if (nextLine == null) { throw new NoSuchElementException(); } next = getNext(); return nextLine; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } } /** * Line iterator that preserves the eol-character 
exactly as read from reader. * Line endings are: \r\n,\n,\r * Lines returns by this iterator will include the eol-characters **/ private static final class EolPreservingLineReaderIterable implements Iterable<String> { private final Reader reader; private final int bufferSize; private EolPreservingLineReaderIterable( Reader reader ) { this(reader, SLURPBUFFSIZE); } private EolPreservingLineReaderIterable( Reader reader, int bufferSize ) { this.reader = reader; this.bufferSize = bufferSize; } @Override public Iterator<String> iterator() { return new Iterator<String>() { private String next; private boolean done = false; private StringBuilder sb = new StringBuilder(80); private char[] charBuffer = new char[bufferSize]; private int charBufferPos = -1; private int charsInBuffer = 0; boolean lastWasLF = false; private String getNext() { try { while (true) { if (charBufferPos < 0) { charsInBuffer = reader.read(charBuffer); if (charsInBuffer < 0) { // No more!!! if (sb.length() > 0) { String line = sb.toString(); // resets the buffer sb.setLength(0); return line; } else { return null; } } charBufferPos = 0; } boolean eolReached = copyUntilEol(); if (eolReached) { // eol reached String line = sb.toString(); // resets the buffer sb.setLength(0); return line; } } } catch (IOException ex) { throw new RuntimeIOException(ex); } } private boolean copyUntilEol() { for (int i = charBufferPos; i < charsInBuffer; i++) { if (charBuffer[i] == '\n') { // line end // copy into our string builder sb.append(charBuffer, charBufferPos, i - charBufferPos + 1); // advance character buffer pos charBufferPos = i+1; lastWasLF = false; return true; // end of line reached } else if (lastWasLF) { // not a '\n' here - still need to terminate line (but don't include current character) if (i > charBufferPos) { sb.append(charBuffer, charBufferPos, i - charBufferPos); // advance character buffer pos charBufferPos = i; lastWasLF = false; return true; // end of line reached } } if (charBuffer[i] == '\r') { 
lastWasLF = true; } else { lastWasLF = false; } } sb.append(charBuffer, charBufferPos, charsInBuffer - charBufferPos); // reset character buffer pos charBufferPos = -1; return false; } @Override public boolean hasNext() { if (done) return false; if (next == null) { next = getNext(); } if (next == null) { done = true; } return !done; } @Override public String next() { if (!hasNext()) { throw new NoSuchElementException(); } String res = next; next = null; return res; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } } /** * Provides an implementation of closing a file for use in a finally block so * you can correctly close a file without even more exception handling stuff. * From a suggestion in a talk by Josh Bloch. * * @param c The IO resource to close (e.g., a Stream/Reader) */ public static void closeIgnoringExceptions(Closeable c) { if (c != null) { try { c.close(); } catch (IOException ioe) { // ignore } } } /** * Iterate over all the files in the directory, recursively. * * @param dir * The root directory. * @return All files within the directory. */ public static Iterable<File> iterFilesRecursive(final File dir) { return iterFilesRecursive(dir, (Pattern) null); } /** * Iterate over all the files in the directory, recursively. * * @param dir * The root directory. * @param ext * A string that must be at the end of all files (e.g. ".txt") * @return All files within the directory ending in the given extension. */ public static Iterable<File> iterFilesRecursive(final File dir, final String ext) { return iterFilesRecursive(dir, Pattern.compile(Pattern.quote(ext) + "$")); } /** * Iterate over all the files in the directory, recursively. * * @param dir * The root directory. * @param pattern * A regular expression that the file path must match. This uses * Matcher.find(), so use ^ and $ to specify endpoints. * @return All files within the directory. 
*/ public static Iterable<File> iterFilesRecursive(final File dir, final Pattern pattern) { return new Iterable<File>() { public Iterator<File> iterator() { return new AbstractIterator<File>() { private final Queue<File> files = new LinkedList<File>(Collections .singleton(dir)); private File file = this.findNext(); @Override public boolean hasNext() { return this.file != null; } @Override public File next() { File result = this.file; if (result == null) { throw new NoSuchElementException(); } this.file = this.findNext(); return result; } private File findNext() { File next = null; while (!this.files.isEmpty() && next == null) { next = this.files.remove(); if (next.isDirectory()) { files.addAll(Arrays.asList(next.listFiles())); next = null; } else if (pattern != null) { if (!pattern.matcher(next.getPath()).find()) { next = null; } } } return next; } }; } }; } /** * Returns all the text in the given File. */ public static String slurpFile(File file) throws IOException { return slurpFile(file, null); } /** * Returns all the text in the given File. * * @param file The file to read from * @param encoding The character encoding to assume. This may be null, and * the platform default character encoding is used. */ public static String slurpFile(File file, String encoding) throws IOException { return IOUtils.slurpReader(IOUtils.encodedInputStreamReader( new FileInputStream(file), encoding)); } /** * Returns all the text in the given File. */ public static String slurpGZippedFile(String filename) throws IOException { Reader r = encodedInputStreamReader(new GZIPInputStream(new FileInputStream( filename)), null); return IOUtils.slurpReader(r); } /** * Returns all the text in the given File. */ public static String slurpGZippedFile(File file) throws IOException { Reader r = encodedInputStreamReader(new GZIPInputStream(new FileInputStream( file)), null); return IOUtils.slurpReader(r); } /** * Returns all the text in the given file with the given encoding. 
*/ public static String slurpFile(String filename, String encoding) throws IOException { Reader r = new InputStreamReader(getInputStreamFromURLOrClasspathOrFileSystem(filename), encoding); return IOUtils.slurpReader(r); } /** * Returns all the text in the given file with the given * encoding. If the file cannot be read (non-existent, etc.), then * the method throws an unchecked RuntimeIOException. If the caller * is willing to tolerate missing files, they should catch that * exception. */ public static String slurpFileNoExceptions(String filename, String encoding) { try { return slurpFile(filename, encoding); } catch (IOException e) { throw new RuntimeIOException("slurpFile IO problem", e); } } /** * Returns all the text in the given file * * @return The text in the file. */ public static String slurpFile(String filename) throws IOException { return slurpFile(filename, defaultEncoding); } /** * Returns all the text at the given URL. */ public static String slurpGBURL(URL u) throws IOException { return IOUtils.slurpURL(u, "GB18030"); } /** * Returns all the text at the given URL. */ public static String slurpURLNoExceptions(URL u, String encoding) { try { return IOUtils.slurpURL(u, encoding); } catch (Exception e) { e.printStackTrace(); return null; } } /** * Returns all the text at the given URL. */ public static String slurpURL(URL u, String encoding) throws IOException { String lineSeparator = System.getProperty("line.separator"); URLConnection uc = u.openConnection(); uc.setReadTimeout(30000); InputStream is; try { is = uc.getInputStream(); } catch (SocketTimeoutException e) { // e.printStackTrace(); System.err.println("Time out. 
Return empty string"); return ""; } BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding)); String temp; StringBuilder buff = new StringBuilder(16000); // make biggish while ((temp = br.readLine()) != null) { buff.append(temp); buff.append(lineSeparator); } br.close(); return buff.toString(); } public static String getUrlEncoding(URLConnection connection) { String contentType = connection.getContentType(); String[] values = contentType.split(";"); String charset = defaultEncoding; // might or might not be right.... for (String value : values) { value = value.trim(); if (value.toLowerCase(Locale.ENGLISH).startsWith("charset=")) { charset = value.substring("charset=".length()); } } return charset; } /** * Returns all the text at the given URL. */ public static String slurpURL(URL u) throws IOException { String lineSeparator = System.getProperty("line.separator"); URLConnection uc = u.openConnection(); String encoding = getUrlEncoding(uc); InputStream is = uc.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding)); String temp; StringBuilder buff = new StringBuilder(16000); // make biggish while ((temp = br.readLine()) != null) { buff.append(temp); buff.append(lineSeparator); } br.close(); return buff.toString(); } /** * Returns all the text at the given URL. */ public static String slurpURLNoExceptions(URL u) { try { return slurpURL(u); } catch (Exception e) { e.printStackTrace(); return null; } } /** * Returns all the text at the given URL. */ public static String slurpURL(String path) throws Exception { return slurpURL(new URL(path)); } /** * Returns all the text at the given URL. If the file cannot be read * (non-existent, etc.), then and only then the method returns * <code>null</code>. */ public static String slurpURLNoExceptions(String path) { try { return slurpURL(path); } catch (Exception e) { e.printStackTrace(); return null; } } /** * Returns all the text in the given file with the given * encoding. 
If the file cannot be read (non-existent, etc.), then * the method throws an unchecked RuntimeIOException. If the caller * is willing to tolerate missing files, they should catch that * exception. */ public static String slurpFileNoExceptions(File file) { try { return IOUtils.slurpReader(encodedInputStreamReader(new FileInputStream(file), null)); } catch (IOException e) { throw new RuntimeIOException(e); } } /** * Returns all the text in the given file with the given * encoding. If the file cannot be read (non-existent, etc.), then * the method throws an unchecked RuntimeIOException. If the caller * is willing to tolerate missing files, they should catch that * exception. */ public static String slurpFileNoExceptions(String filename) { try { return slurpFile(filename); } catch (IOException e) { throw new RuntimeIOException(e); } } /** * Returns all the text from the given Reader. * Closes the Reader when done. * * @return The text in the file. */ public static String slurpReader(Reader reader) { BufferedReader r = new BufferedReader(reader); StringBuilder buff = new StringBuilder(); try { char[] chars = new char[SLURPBUFFSIZE]; while (true) { int amountRead = r.read(chars, 0, SLURPBUFFSIZE); if (amountRead < 0) { break; } buff.append(chars, 0, amountRead); } r.close(); } catch (Exception e) { throw new RuntimeIOException("slurpReader IO problem", e); } return buff.toString(); } /** * Send all bytes from the input stream to the output stream. * * @param input * The input bytes. * @param output * Where the bytes should be written. 
*/
  public static void writeStreamToStream(InputStream input, OutputStream output)
      throws IOException {
    byte[] buffer = new byte[4096];
    int len;
    // read() reports end of stream with -1
    while ((len = input.read(buffer)) != -1) {
      output.write(buffer, 0, len);
    }
  }

  /**
   * Read in a CSV formatted file with a header row.
   * The first line supplies the column labels; every later line becomes one
   * map from label to cell content. Header and data lines are split with the
   * same quote and escape characters.
   *
   * @param path - path to CSV file
   * @param quoteChar - character for enclosing strings; the one-argument overload passes "
   * @param escapeChar - character for escaping quotes appearing in quoted strings;
   *        the one-argument overload passes " (i.e. "" is used for " inside quotes,
   *        consistent with Excel)
   * @return a list of maps representing the rows of the csv. The maps' keys are the
   *         header strings and their values are the row contents
   * @throws IOException If the file cannot be read, or a row has fewer cells than the header
   */
  public static List<Map<String,String>> readCSVWithHeader(String path, char quoteChar, char escapeChar) throws IOException {
    String[] labels = null;
    List<Map<String,String>> rows = Generics.newArrayList();
    for (String line : IOUtils.readLines(path)) {
      if (labels == null) {
        // Header row: honour the caller's quote character rather than a fixed '"'
        labels = StringUtils.splitOnCharWithQuoting(line, ',', quoteChar, escapeChar);
      } else {
        String[] cells = StringUtils.splitOnCharWithQuoting(line, ',', quoteChar, escapeChar);
        assert(cells.length == labels.length);
        if (cells.length < labels.length) {
          // Without this check a short row fails with an index error when assertions are off
          throw new IOException("Malformed CSV row in " + path + ": expected " + labels.length
              + " cells but found " + cells.length + ": " + line);
        }
        Map<String,String> cellMap = Generics.newHashMap();
        for (int i = 0; i < labels.length; i++) {
          cellMap.put(labels[i], cells[i]);
        }
        rows.add(cellMap);
      }
    }
    return rows;
  }

  /**
   * Read in a CSV formatted file with a header row, using " both as the
   * quote character and as the escape character.
   *
   * @param path - path to CSV file
   * @return a list of maps representing the rows of the csv
   * @throws IOException If the file cannot be read, or a row has fewer cells than the header
   */
  public static List<Map<String,String>> readCSVWithHeader(String path) throws IOException {
    return readCSVWithHeader(path, '"', '"');
  }

  /**
   * Read a CSV file character by character.
Allows for multi-line CSV files (in quotes), but * is less flexible and likely slower than readCSVWithHeader() * @param csvContents The char[] array corresponding to the contents of the file * @param numColumns The number of columns in the file (for verification, primarily) * @return A list of lines in the file */ public static LinkedList<String[]> readCSVStrictly(char[] csvContents, int numColumns){ //--Variables StringBuilder[] buffer = new StringBuilder[numColumns]; buffer[0] = new StringBuilder(); LinkedList<String[]> lines = new LinkedList<String[]>(); //--State boolean inQuotes = false; boolean nextIsEscaped = false; int columnI = 0; //--Read for(int offset=0; offset<csvContents.length; offset++){ if(nextIsEscaped){ buffer[columnI].append(csvContents[offset]); nextIsEscaped = false; } else { switch(csvContents[offset]){ case '"': //(case: quotes) inQuotes = !inQuotes; break; case ',': //(case: field separator) if(inQuotes){ buffer[columnI].append(','); } else { columnI += 1; if(columnI >= numColumns){ throw new IllegalArgumentException("Too many columns: "+columnI+"/"+numColumns+" (offset: " + offset + ")"); } buffer[columnI] = new StringBuilder(); } break; case '\n': //(case: newline) if(inQuotes){ buffer[columnI].append('\n'); } else { //((error checks)) if(columnI != numColumns-1){ throw new IllegalArgumentException("Too few columns: "+columnI+"/"+numColumns+" (offset: " + offset + ")"); } //((create line)) String[] rtn = new String[buffer.length]; for(int i=0; i<buffer.length; i++){ rtn[i] = buffer[i].toString(); } lines.add(rtn); //((update state)) columnI = 0; buffer[columnI] = new StringBuilder(); } break; case '\\': nextIsEscaped = true; break; default: buffer[columnI].append(csvContents[offset]); } } } //--Return return lines; } public static LinkedList<String[]> readCSVStrictly(String filename, int numColumns) throws IOException { return readCSVStrictly(slurpFile(filename).toCharArray(), numColumns); } /** * Get a input file stream (automatically 
gunzip/bunzip2 depending on file extension) * @param filename Name of file to open * @return Input stream that can be used to read from the file * @throws IOException if there are exceptions opening the file */ public static InputStream getFileInputStream(String filename) throws IOException { InputStream in = new FileInputStream(filename); if (filename.endsWith(".gz")) { in = new GZIPInputStream(in); } else if (filename.endsWith(".bz2")) { //in = new CBZip2InputStream(in); in = getBZip2PipedInputStream(filename); } return in; } /** * Get a output file stream (automatically gzip/bzip2 depending on file extension) * @param filename Name of file to open * @return Output stream that can be used to write to the file * @throws IOException if there are exceptions opening the file */ public static OutputStream getFileOutputStream(String filename) throws IOException { OutputStream out = new FileOutputStream(filename); if (filename.endsWith(".gz")) { out = new GZIPOutputStream(out); } else if (filename.endsWith(".bz2")) { //out = new CBZip2OutputStream(out); out = getBZip2PipedOutputStream(filename); } return out; } public static BufferedReader getBufferedFileReader(String filename) throws IOException { return getBufferedFileReader(filename, defaultEncoding); } public static BufferedReader getBufferedFileReader(String filename, String encoding) throws IOException { InputStream in = getFileInputStream(filename); return new BufferedReader(new InputStreamReader(in, encoding)); } public static BufferedReader getBufferedReaderFromClasspathOrFileSystem(String filename) throws IOException { return getBufferedReaderFromClasspathOrFileSystem(filename, defaultEncoding); } public static BufferedReader getBufferedReaderFromClasspathOrFileSystem(String filename, String encoding) throws IOException { InputStream in = findStreamInClasspathOrFileSystem(filename); return new BufferedReader(new InputStreamReader(in, encoding)); } public static PrintWriter getPrintWriter(File textFile) throws 
IOException { return getPrintWriter(textFile, null); } public static PrintWriter getPrintWriter(File textFile, String encoding) throws IOException { File f = textFile.getAbsoluteFile(); if (encoding == null) { encoding = defaultEncoding; } return new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), encoding)), true); } public static PrintWriter getPrintWriter(String filename) throws IOException { return getPrintWriter(filename, defaultEncoding); } public static PrintWriter getPrintWriterIgnoringExceptions(String filename) { try { return getPrintWriter(filename, defaultEncoding); } catch (IOException ioe) { return null; } } public static PrintWriter getPrintWriterOrDie(String filename) { try { return getPrintWriter(filename, defaultEncoding); } catch (IOException ioe) { throw new RuntimeIOException(ioe); } } public static PrintWriter getPrintWriter(String filename, String encoding) throws IOException { OutputStream out = getFileOutputStream(filename); if (encoding == null) { encoding = defaultEncoding; } return new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, encoding)), true); } public static InputStream getBZip2PipedInputStream(String filename) throws IOException { String bzcat = System.getProperty("bzcat", "bzcat"); Runtime rt = Runtime.getRuntime(); String cmd = bzcat + " " + filename; //System.err.println("getBZip2PipedInputStream: Running command: "+cmd); Process p = rt.exec(cmd); Writer errWriter = new BufferedWriter(new OutputStreamWriter(System.err)); StreamGobbler errGobler = new StreamGobbler(p.getErrorStream(), errWriter); errGobler.start(); return p.getInputStream(); } public static OutputStream getBZip2PipedOutputStream(String filename) throws IOException { return new BZip2PipedOutputStream(filename); } private static final Pattern tab = Pattern.compile("\t"); /** * Read column as set * @param infile - filename * @param field index of field to read * @return a set of the entries in column field * @throws 
IOException */ public static Set<String> readColumnSet(String infile, int field) throws IOException { BufferedReader br = IOUtils.getBufferedFileReader(infile); String line; Set<String> set = Generics.newHashSet(); while ((line = br.readLine()) != null) { line = line.trim(); if (line.length() > 0) { if (field < 0) { set.add(line); } else { String[] fields = tab.split(line); if (field < fields.length) { set.add(fields[field]); } } } } br.close(); return set; } public static <C> List<C> readObjectFromColumns(Class objClass, String filename, String[] fieldNames, String delimiter) throws IOException, InstantiationException, IllegalAccessException, NoSuchFieldException, NoSuchMethodException, InvocationTargetException { Pattern delimiterPattern = Pattern.compile(delimiter); List<C> list = new ArrayList<C>(); BufferedReader br = IOUtils.getBufferedFileReader(filename); String line; while ((line = br.readLine()) != null) { line = line.trim(); if (line.length() > 0) { C item = StringUtils.columnStringToObject(objClass, line, delimiterPattern, fieldNames); list.add(item); } } br.close(); return list; } public static Map<String,String> readMap(String filename) throws IOException { Map<String,String> map = Generics.newHashMap(); try { BufferedReader br = IOUtils.getBufferedFileReader(filename); String line; while ((line = br.readLine()) != null) { String[] fields = tab.split(line,2); map.put(fields[0], fields[1]); } br.close(); } catch (IOException ex) { throw new RuntimeException(ex); } return map; } /** * Returns the contents of a file as a single string. The string may be * empty, if the file is empty. If there is an IOException, it is caught * and null is returned. */ public static String stringFromFile(String filename) { return stringFromFile(filename, defaultEncoding); } /** * Returns the contents of a file as a single string. The string may be * empty, if the file is empty. If there is an IOException, it is caught * and null is returned. Encoding can also be specified. 
*/ public static String stringFromFile(String filename, String encoding) { try { StringBuilder sb = new StringBuilder(); BufferedReader in = new BufferedReader(new EncodingFileReader(filename,encoding)); String line; while ((line = in.readLine()) != null) { sb.append(line); sb.append(eolChar); } in.close(); return sb.toString(); } catch (IOException e) { e.printStackTrace(); return null; } } /** * Returns the contents of a file as a list of strings. The list may be * empty, if the file is empty. If there is an IOException, it is caught * and null is returned. */ public static List<String> linesFromFile(String filename) { return linesFromFile(filename, defaultEncoding); } /** * Returns the contents of a file as a list of strings. The list may be * empty, if the file is empty. If there is an IOException, it is caught * and null is returned. Encoding can also be specified */ public static List<String> linesFromFile(String filename,String encoding) { return linesFromFile(filename, encoding, false); } public static List<String> linesFromFile(String filename,String encoding, boolean ignoreHeader) { try { List<String> lines = new ArrayList<String>(); BufferedReader in = new BufferedReader(new EncodingFileReader(filename,encoding)); String line; int i = 0; while ((line = in.readLine()) != null) { i++; if(ignoreHeader && i == 1) continue; lines.add(line); } in.close(); return lines; } catch (IOException e) { e.printStackTrace(); return null; } } public static String backupName(String filename) { return backupFile(new File(filename)).toString(); } public static File backupFile(File file) { int max = 1000; String filename = file.toString(); File backup = new File(filename + "~"); if (!backup.exists()) { return backup; } for (int i = 1; i <= max; i++) { backup = new File(filename + ".~" + i + ".~"); if (!backup.exists()) { return backup; } } return null; } public static boolean renameToBackupName(File file) { return file.renameTo(backupFile(file)); } /** * A JavaNLP specific 
convenience routine for obtaining the current * scratch directory for the machine you're currently running on. */ public static File getJNLPLocalScratch() { try { String machineName = InetAddress.getLocalHost().getHostName().split("\\.")[0]; String username = System.getProperty("user.name"); return new File("/"+machineName+"/scr1/"+username); } catch (Exception e) { return new File("./scr/"); // default scratch } } /** * Given a filepath, makes sure a directory exists there. If not, creates and returns it. * Same as ENSURE-DIRECTORY in CL. * * @param tgtDir The directory that you wish to ensure exists * @throws IOException If directory can't be created, is an existing file, or for other reasons */ public static File ensureDir(File tgtDir) throws IOException { if (tgtDir.exists()) { if (tgtDir.isDirectory()) { return tgtDir; } else { throw new IOException("Could not create directory "+tgtDir.getAbsolutePath()+", as a file already exists at that path."); } } else { if ( ! tgtDir.mkdirs()) { throw new IOException("Could not create directory "+tgtDir.getAbsolutePath()); } return tgtDir; } } /** * Given a filepath, delete all files in the directory recursively * @param dir * @return */ public static boolean deleteDirRecursively(File dir) { if (dir.isDirectory()) { for (File f : dir.listFiles()) { boolean success = deleteDirRecursively(f); if (!success) return false; } } return dir.delete(); } public static String getExtension(String fileName) { if(!fileName.contains(".")) return null; int idx = fileName.lastIndexOf('.'); return fileName.substring(idx+1); } /** Create a Reader with an explicit encoding around an InputStream. * This static method will treat null as meaning to use the platform default, * unlike the Java library methods that disallow a null encoding. 
* * @param stream An InputStream * @param encoding A charset encoding * @return A Reader * @throws IOException If any IO problem */ public static Reader encodedInputStreamReader(InputStream stream, String encoding) throws IOException { // InputStreamReader doesn't allow encoding to be null; if (encoding == null) { return new InputStreamReader(stream); } else { return new InputStreamReader(stream, encoding); } } /** Create a Reader with an explicit encoding around an InputStream. * This static method will treat null as meaning to use the platform default, * unlike the Java library methods that disallow a null encoding. * * @param stream An InputStream * @param encoding A charset encoding * @return A Reader * @throws IOException If any IO problem */ public static Writer encodedOutputStreamWriter(OutputStream stream, String encoding) throws IOException { // OutputStreamWriter doesn't allow encoding to be null; if (encoding == null) { return new OutputStreamWriter(stream); } else { return new OutputStreamWriter(stream, encoding); } } /** Create a Reader with an explicit encoding around an InputStream. * This static method will treat null as meaning to use the platform default, * unlike the Java library methods that disallow a null encoding. * * @param stream An InputStream * @param encoding A charset encoding * @param autoFlush Whether to make an autoflushing Writer * @return A Reader * @throws IOException If any IO problem */ public static PrintWriter encodedOutputStreamPrintWriter(OutputStream stream, String encoding, boolean autoFlush) throws IOException { // PrintWriter doesn't allow encoding to be null; or to have charset and flush if (encoding == null) { return new PrintWriter(stream, autoFlush); } else { return new PrintWriter(new OutputStreamWriter(stream, encoding), autoFlush); } } /** * A raw file copy function -- this is not public since no error checks are made as to the * consistency of the filed being copied. 
Use instead: * @see IOUtils#cp(java.io.File, java.io.File, boolean) * @param source The source file. This is guaranteed to exist, and is guaranteed to be a file. * @param target The target file. * @throws IOException Throws an exception if the copy fails. */ private static void copyFile(File source, File target) throws IOException { FileChannel sourceChannel = new FileInputStream( source ).getChannel(); FileChannel targetChannel = new FileOutputStream( target ).getChannel(); sourceChannel.transferTo(0, sourceChannel.size(), targetChannel); sourceChannel.close(); targetChannel.close(); } /** * <p>An implementation of cp, as close to the Unix command as possible. * Both directories and files are valid for either the source or the target; * if the target exists, the semantics of Unix cp are [intended to be] obeyed.</p> * * @param source The source file or directory. * @param target The target to write this file or directory to. * @param recursive If true, recursively copy directory contents * @throws IOException If either the copy fails (standard IO Exception), or the command is invalid * (e.g., copying a directory without the recursive flag) */ public static void cp(File source, File target, boolean recursive) throws IOException { // Error checks if (source.isDirectory() && !recursive) { // cp a b -- a is a directory throw new IOException("cp: omitting directory: " + source); } if (!target.getParentFile().exists()) { // cp a b/c/d/e -- b/c/d doesn't exist throw new IOException("cp: cannot copy to directory: " + recursive + " (parent doesn't exist)"); } if (!target.getParentFile().isDirectory()) { // cp a b/c/d/e -- b/c/d is a regular file throw new IOException("cp: cannot copy to directory: " + recursive + " (parent isn't a directory)"); } // Get true target File trueTarget; if (target.exists() && target.isDirectory()) { trueTarget = new File(target.getPath() + File.separator + source.getName()); } else { trueTarget = target; } // Copy if (source.isFile()) { // Case: 
copying a file copyFile(source, trueTarget); } else if (source.isDirectory()) { // Case: copying a directory File[] children = source.listFiles(); if (children == null) { throw new IOException("cp: could not list files in source: " + source); } if (target.exists()) { // Case: cp -r a b -- b exists if (!target.isDirectory()) { // cp -r a b -- b is a regular file throw new IOException("cp: cannot copy directory into regular file: " + target); } if (trueTarget.exists() && !trueTarget.isDirectory()) { // cp -r a b -- b/a is not a directory throw new IOException("cp: overwriting a file with a directory: " + trueTarget); } if (!trueTarget.exists() && !trueTarget.mkdir()) { // cp -r a b -- b/a cannot be created throw new IOException("cp: could not create directory: " + trueTarget); } } else { // Case: cp -r a b -- b does not exist assert trueTarget == target; if (!trueTarget.mkdir()) { // cp -r a b -- canot create b as a directory throw new IOException("cp: could not create target directory: " + trueTarget); } } // Actually do the copy for (File child : children) { File childTarget = new File(trueTarget.getPath() + File.separator + child.getName()); cp(child, childTarget, recursive); } } else { throw new IOException("cp: unknown file type: " + source); } } /** * @see IOUtils#cp(java.io.File, java.io.File, boolean) */ public static void cp(File source, File target) throws IOException { cp(source, target, false); } }