/* * Copyright 2008 Glencoe Software, Inc. All rights reserved. * Use is subject to license terms supplied in LICENSE.txt */ package ome.services.fulltext; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.Reader; import java.util.Iterator; import java.util.NoSuchElementException; import ome.services.messages.ParserOpenFileMessage; import ome.system.OmeroContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.BeansException; import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContextAware; /** * Object which attempts to parse any file given to it. On an exception or * empty/missing file, an empty {@link Iterable} should be returned * rather than throwing an exception. * * Subclasses should follow * * @author Josh Moore, josh at glencoesoftware.com * @since 3.0-Beta3 */ public class FileParser implements ApplicationContextAware { private final static Logger log = LoggerFactory.getLogger(FileParser.class); protected OmeroContext context; protected long maxFileSize = 10000L; // default test is 8.8KB public void setApplicationContext(ApplicationContext arg0) throws BeansException { context = (OmeroContext) arg0; } public void setMaxFileSize(Long size) { if (size.floatValue() / Runtime.getRuntime().maxMemory() > 0.5) { log.warn("Indexer maximum file size is set to more than half of " + "total heap size. Excessively large text files may " + "cause search index corruption. Consider decreasing the " + "maximum file size or increasing the Indexer heap size."); } this.maxFileSize = size; } /** * {@link Iterable} which returns an empty {@link Iterator}. This will be * used in case */ public final static Iterable<Reader> EMPTY = new Iterable<Reader>() { public Iterator<Reader> iterator() { return new Iterator<Reader>() { public boolean hasNext() { return false; } public Reader next() { throw new NoSuchElementException(); } public void remove() { throw new UnsupportedOperationException(); } }; } }; /** * Uses {@link #doParse(File)} to create manageable chunks of a file for * indexing. If the {@link File} argument is null or unreadable, then the * {@link #EMPTY} {@link Iterable} will be returned. The same holds if a * null {@link Iterable} is returned or an {@link Exception} is thrown. * * The {@link Iterator} returned from the instance should always be * completely iterated through so that resources can be released. For * example, <code> * for (String string : parse(file)) { * /* possibly ignore string *\/ * } * </code> * * @param file * Can be null. * @return An {@link Iterable} which is never null. */ final public Iterable<Reader> parse(File file) { if (file == null) { log.warn("Argument null. Returning EMPTY:"); return EMPTY; } if (!file.exists() && !file.canRead()) { log.debug("empty|unreadable file: " + file.getAbsoluteFile()); return EMPTY; } if (file.length() > this.maxFileSize) { log.info("File too large for indexing. Skipping: " + file.getAbsoluteFile()); return EMPTY; } try { Iterable<Reader> it = doParse(file); if (it == null) { log.debug("Implementation returned null."); return EMPTY; } else { return it; } } catch (Exception e) { log.warn("Implementation threw an exception.", e); return EMPTY; } } /** * Template method to parse a {@link File} into manageable chunks. * * The default implementation reads from the file lazily with chunks * overlapping on the final white space. For example a file with: * <code>The quick brown fox jumps over the lazy dog</code> might be * parsed to: <code>The quick brown fox jumps</code> and * <code>jumps over the lazy dog</code>. * * Receives a non-null, {@link File#canRead() readable} {@link File} * instance from {@link #parse(File)} and can return a possible null * {@link Iterable} or throw an {@link Exception}. * * In any of the non-successful cases, the {@link #EMPTY} {@link Iterable} * will be returned to the consumer. */ public Iterable<Reader> doParse(File file) throws Exception { FileReader reader = new FileReader(file); BufferedReader buffered = new BufferedReader(reader); context.publishEvent(new ParserOpenFileMessage(this, buffered) { @Override public void close() { try { Reader r = (Reader) resource; r.close(); } catch (Exception e) { log.debug("Error closing " + resource, e); } } }); Iterator<Reader> it = new SingleIterator(buffered); return wrap(it); } /** * Wraps an {@link Iterator} with an {@link Iterable} instance. If the * {@link Iterator} is null, the {@link #EMPTY} {@link Iterable} will be * returned. * * @param it * Can be null. * @return Will never be null */ public Iterable<Reader> wrap(Iterator<Reader> it) { if (it == null) { return EMPTY; } return new IteratorWrapper(it); } public Iterable<Reader> wrap(Reader r) { if (r == null) { return EMPTY; } return wrap(new SingleIterator(r)); } private static class SingleIterator implements Iterator<Reader> { Reader r; SingleIterator(Reader r) { this.r = r; } public boolean hasNext() { return r != null; } public Reader next() { Reader rv = r; r = null; return rv; } public void remove() { throw new UnsupportedOperationException(); } } private static class IteratorWrapper implements Iterable<Reader> { private final Iterator<Reader> it; public IteratorWrapper(Iterator<Reader> it) { this.it = it; } public Iterator<Reader> iterator() { return it; } } private static class OverlappingChunkFileIterator implements Iterator<String> { private static final String linesep = System .getProperty("line.separator"); private static final int size = 10000; private final long fileSize; private final char[] buf; private String next; /* * will be closed nulled out when finished. */ private BufferedReader reader; public OverlappingChunkFileIterator(File file) throws Exception { this.fileSize = file.length(); if (fileSize > Integer.MAX_VALUE) { throw new RuntimeException(String.format( "%s file is too large for current implementation: %s", file, fileSize)); } this.reader = new BufferedReader(new FileReader(file), size); this.buf = new char[size]; } public boolean hasNext() { if (next == null) { next = doRead(); } return next != null; } public String next() { if (!hasNext()) { // does doRead() throw new NoSuchElementException(); } String rv = next; next = null; return rv; } public void remove() { throw new UnsupportedOperationException(); } /** * Intermediate method which parses whole file into a single String. * Please see the restriction in the constructor on filesize. */ private String doRead() { if (reader == null) { return null; } StringBuffer sb = new StringBuffer((int) fileSize); int rv = -1; try { while ((rv = reader.read(buf)) != -1) { sb.append(buf, 0, rv); } } catch (Exception e) { throw new RuntimeException("Error while parsing file", e); } closeReader(); return sb.toString(); } private void closeReader() { if (reader != null) { try { reader.close(); } catch (Exception e) { // must ignore } finally { reader = null; } } } } }