package edu.stanford.nlp.objectbank;
import edu.stanford.nlp.util.AbstractIterator;
import java.util.function.Function;
import java.util.List;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.io.Reader;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
/**
* The ObjectBank class is designed to make it easy to change the format/source
* of data read in by other classes and to standardize how data is read in
* javaNLP classes.
* This should make reuse of existing code (by non-authors of the code)
* easier because one has to just create a new ObjectBank which knows where to
* look for the data and how to turn it into Objects, and then use the new
* ObjectBank in the class. This will also make it easier to reuse code for
* reading in the same data.
* <p/>
* An ObjectBank is a Collection of Objects. These objects are taken
* from input sources and then tokenized and parsed into the desired
* kind of Object. An ObjectBank requires a ReaderIteratorFactory and an
* IteratorFromReaderFactory. The ReaderIteratorFactory is used to get
* an Iterator over java.io.Readers which contain representations of
* the Objects. A ReaderIteratorFactory resembles a collection that
* takes input sources and dispenses Iterators over java.io.Readers
* of those sources. An IteratorFromReaderFactory is used to turn a single
* java.io.Reader into an Iterator over Objects. The
* IteratorFromReaderFactory splits the contents of the java.io.Reader
* into Strings and then parses them into appropriate Objects.
* <p/>
*
* <h3>Example Usages:</h3>
*
* The general case is covered below, but the most common thing people
* <i>actually</i> want to do is read lines from a file. There are special
* methods to make this easy! You use the <code>getLineIterator</code> method.
* In its simplest use, it returns an {@code ObjectBank<String>}, which is a
* {@code Collection<String>}. So, statements like these work:
* <blockquote>
* <code>
* for (String str : ObjectBank.getLineIterator(filename)) { <br>
* System.out.println(str); <br>
* } <br><br>
* String[] strings = ObjectBank.getLineIterator(filename).toArray(new String[0]); <br><br>
* String[] strings = ObjectBank.getLineIterator(filename, "GB18030").toArray(new String[0]);
* </code>
* </blockquote>
* More complex uses of getLineIterator let you interpret each line of a file
* as an object of arbitrary type via a transformer Function.
*
* For more general uses with existing classes, you first construct a collection of sources, then a class that
* will make the objects of interest from instances of those sources, and then set up an ObjectBank that can
* vend those objects:
* <pre><code>
* ReaderIteratorFactory rif = new ReaderIteratorFactory(Arrays.asList(new String[] { "file1", "file2", "file3" }));
* IteratorFromReaderFactory&lt;Mention&gt; corefIFRF = new MUCCorefIteratorFromReaderFactory(true);
* for (Mention m : new ObjectBank&lt;Mention&gt;(rif, corefIFRF)) {
* ...
* }
* </code></pre>
* <p/>
* As an example of the general power of this class, suppose you have
* a collection of files in the directory /u/nlp/data/gre/questions. Each file
* contains several Puzzle documents which look like:
* <pre>
* &lt;puzzle&gt;
* &lt;preamble&gt; some text &lt;/preamble&gt;
* &lt;question&gt; some intro text
* &lt;answer&gt; answer1 &lt;/answer&gt;
* &lt;answer&gt; answer2 &lt;/answer&gt;
* &lt;answer&gt; answer3 &lt;/answer&gt;
* &lt;answer&gt; answer4 &lt;/answer&gt;
* &lt;/question&gt;
* &lt;question&gt; another question
* &lt;answer&gt; answer1 &lt;/answer&gt;
* &lt;answer&gt; answer2 &lt;/answer&gt;
* &lt;answer&gt; answer3 &lt;/answer&gt;
* &lt;answer&gt; answer4 &lt;/answer&gt;
* &lt;/question&gt;
* &lt;/puzzle&gt;
* </pre>
* <p/>
* First you need to build a ReaderIteratorFactory which will provide java.io.Readers
* over all the files in your directory:
* <p/>
* <pre>
* Collection c = new FileSequentialCollection("/u/nlp/data/gre/questions/", "", false);
* ReaderIteratorFactory rif = new ReaderIteratorFactory(c);
* </pre>
* <p/>
* Next you need to make an IteratorFromReaderFactory which will take the
* java.io.Readers vended by the ReaderIteratorFactory, split them up into
* documents (Strings) and
* then convert the Strings into Objects. In this case we want to keep everything
* between each set of &lt;puzzle&gt; &lt;/puzzle&gt; tags so we would use a BeginEndTokenizerFactory.
* You would also need to write a class which implements Function and whose apply method
* converts the String between the &lt;puzzle&gt; &lt;/puzzle&gt; tags into Puzzle objects.
* <p/>
* <pre>
* public class PuzzleParser implements Function {
* public Object apply (Object o) {
* String s = (String)o;
* ...
* Puzzle p = new Puzzle(...);
* ...
* return p;
* }
* }
* </pre>
* <p/>
* Now to build the IteratorFromReaderFactory:
* <p/>
* <pre>
* IteratorFromReaderFactory rtif = new BeginEndTokenizerFactory("&lt;puzzle&gt;", "&lt;/puzzle&gt;", new PuzzleParser());
* </pre>
* <p/>
* Now, to create your ObjectBank you just give it the ReaderIteratorFactory and
* IteratorFromReaderFactory that you just created:
* <p/>
* <pre>
* ObjectBank puzzles = new ObjectBank(rif, rtif);
* </pre>
* <p/>
* Now, if you get a new set of puzzles that are located elsewhere and formatted differently
* you create a new ObjectBank for reading them in and use that ObjectBank instead with only
* trivial changes (or possibly none at all if the ObjectBank is read in on a constructor)
* to your code. Or even better, if someone else wants to use your code to evaluate their puzzles,
* which are located elsewhere and formatted differently, they already know what they have to do
* to make your code work for them.
* <p/>
*
* @author Jenny Finkel <A HREF="mailto:jrfinkel@stanford.edu">jrfinkel@stanford.edu</A>
* @author Sarah Spikes (sdspikes@cs.stanford.edu) - cleanup and filling in types
*/
public class ObjectBank<E> implements Collection<E>, Serializable {
/**
* This creates a new ObjectBank with the given ReaderIteratorFactory
* and ObjectIteratorFactory.
*
* @param rif The {@link ReaderIteratorFactory} from which to get Readers
* @param ifrf The {@link IteratorFromReaderFactory} which turns java.io.Readers
* into Iterators of Objects
*/
public ObjectBank(ReaderIteratorFactory rif, IteratorFromReaderFactory<E> ifrf) {
this.rif = rif;
this.ifrf = ifrf;
}
@SuppressWarnings({"NonSerializableFieldInSerializableClass"})
protected ReaderIteratorFactory rif;
@SuppressWarnings({"NonSerializableFieldInSerializableClass"})
protected IteratorFromReaderFactory<E> ifrf;
@SuppressWarnings({"NonSerializableFieldInSerializableClass"})
private List<E> contents; // = null;
public static ObjectBank<String> getLineIterator(String filename) {
return getLineIterator(new File(filename));
}
public static <X> ObjectBank<X> getLineIterator(String filename, Function<String,X> op) {
return getLineIterator(new File(filename), op);
}
public static ObjectBank<String> getLineIterator(String filename, String encoding) {
return getLineIterator(new File(filename), encoding);
}
public static ObjectBank<String> getLineIterator(Reader reader) {
return getLineIterator(reader, new IdentityFunction<>());
}
public static <X> ObjectBank<X> getLineIterator(Reader reader, Function<String,X> op) {
ReaderIteratorFactory rif = new ReaderIteratorFactory(reader);
IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
return new ObjectBank<>(rif, ifrf);
}
public static ObjectBank<String> getLineIterator(File file) {
return getLineIterator(Collections.singleton(file), new IdentityFunction<>());
}
public static <X> ObjectBank<X> getLineIterator(File file, Function<String,X> op) {
return getLineIterator(Collections.singleton(file), op);
}
public static ObjectBank<String> getLineIterator(File file, String encoding) {
return getLineIterator(file, new IdentityFunction<>(), encoding);
}
public static <X> ObjectBank<X> getLineIterator(File file, Function<String,X> op, String encoding) {
ReaderIteratorFactory rif = new ReaderIteratorFactory(file, encoding);
IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
return new ObjectBank<>(rif, ifrf);
}
public static <X> ObjectBank<X> getLineIterator(Collection<?> filesStringsAndReaders, Function<String,X> op) {
ReaderIteratorFactory rif = new ReaderIteratorFactory(filesStringsAndReaders);
IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
return new ObjectBank<>(rif, ifrf);
}
public static ObjectBank<String> getLineIterator(Collection<?> filesStringsAndReaders, String encoding) {
return getLineIterator(filesStringsAndReaders, new IdentityFunction<>(), encoding);
}
public static <X> ObjectBank<X> getLineIterator(Collection<?> filesStringsAndReaders, Function<String,X> op, String encoding) {
ReaderIteratorFactory rif = new ReaderIteratorFactory(filesStringsAndReaders, encoding);
IteratorFromReaderFactory<X> ifrf = LineIterator.getFactory(op);
return new ObjectBank<>(rif, ifrf);
}
/** This is handy for having getLineIterator return a collection of files for feeding into another ObjectBank. */
public static class PathToFileFunction implements Function<String, File> {
@Override
public File apply(String str) {
return new File(str);
}
}
@Override
public Iterator<E> iterator() {
// basically concatenates Iterator's made from
// each java.io.Reader.
if (keepInMemory) {
if (contents == null) {
contents = new ArrayList<>();
Iterator<E> iter = new OBIterator();
while (iter.hasNext()) {
contents.add(iter.next());
}
}
return contents.iterator();
}
return new OBIterator();
}
private boolean keepInMemory; // = false;
/**
* Tells the ObjectBank to store all of
* its contents in memory so that it doesn't
* have to be recomputed each time you iterate
* through it. This is useful when the data
* is small enough that it can be kept in
* memory, but reading/processing it
* is expensive/slow. Defaults to false.
*
* @param keep Whether to keep contents in memory
*/
public void keepInMemory(boolean keep) {
keepInMemory = keep;
}
/**
* If you are keeping the contents in memory,
* this will clear the memory, and they will be
* recomputed the next time iterator() is
* called.
*/
public void clearMemory(){
contents = null;
}
@Override
public boolean isEmpty() {
return !iterator().hasNext();
}
/**
* Can be slow. Usage not recommended.
*/
@Override
public boolean contains(Object o) {
for (E e : this) {
if (e == o) {
return true;
}
}
return false;
}
/**
* Can be slow. Usage not recommended.
*/
@Override
public boolean containsAll(Collection<?> c) {
for (Object obj : c) {
if ( ! contains(obj)) {
return false;
}
}
return true;
}
/**
* Can be slow. Usage not recommended.
*/
@Override
public int size() {
Iterator<E> iter = iterator();
int size = 0;
while (iter.hasNext()) {
size++;
iter.next();
}
return size;
}
@Override
public void clear() {
rif = new ReaderIteratorFactory();
}
@Override
public Object[] toArray() {
Iterator<E> iter = iterator();
ArrayList<Object> al = new ArrayList<>();
while (iter.hasNext()) {
al.add(iter.next());
}
return al.toArray();
}
/**
* Can be slow. Usage not recommended.
*/
@Override
@SuppressWarnings({"SuspiciousToArrayCall"})
public <T> T[] toArray(T[] o) {
Iterator<E> iter = iterator();
ArrayList<E> al = new ArrayList<>();
while (iter.hasNext()) {
al.add(iter.next());
}
return al.toArray(o);
}
/**
* Unsupported Operation. If you wish to add a new data source,
* do so in the underlying ReaderIteratorFactory
*/
@Override
public boolean add(E o) {
throw new UnsupportedOperationException();
}
/**
* Unsupported Operation. If you wish to remove a data source,
* do so in the underlying ReaderIteratorFactory
*/
@Override
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
/**
* Unsupported Operation. If you wish to add new data sources,
* do so in the underlying ReaderIteratorFactory
*/
@Override
public boolean addAll(Collection<? extends E> c) {
throw new UnsupportedOperationException();
}
/**
* Unsupported Operation. If you wish to remove data sources,
* remove, do so in the underlying ReaderIteratorFactory.
*/
@Override
public boolean removeAll(Collection<?> c) {
throw new UnsupportedOperationException();
}
/**
* Unsupported Operation. If you wish to retain only certain data
* sources, do so in the underlying ReaderIteratorFactory.
*/
@Override
public boolean retainAll(Collection<?> c) {
throw new UnsupportedOperationException();
}
/**
* Iterator of Objects.
*/
class OBIterator extends AbstractIterator<E> {
private final Iterator<Reader> readerIterator;
private Iterator<E> tok;
private E nextObject;
private Reader currReader; // = null;
public OBIterator() {
readerIterator = rif.iterator();
setNextObject();
}
private void setNextObject() {
if (tok != null && tok.hasNext()) {
nextObject = tok.next();
return;
}
while (true) {
try {
if (currReader != null) {
currReader.close();
}
} catch (IOException e) {
throw new RuntimeException(e);
}
if (readerIterator.hasNext()) {
currReader = readerIterator.next();
tok = ifrf.getIterator(currReader);
} else {
nextObject = null;
return;
}
if (tok.hasNext()) {
nextObject = tok.next();
return;
}
}
}
@Override
public boolean hasNext() {
return nextObject != null;
}
@Override
public E next() {
if (nextObject == null) {
throw new NoSuchElementException();
}
E tmp = nextObject;
setNextObject();
return tmp;
}
} // end class OBIterator
private static final long serialVersionUID = -4030295596701541770L;
}