package edu.berkeley.nlp.lm.io;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.collections.Iterators;
import edu.berkeley.nlp.lm.util.Logger;
import edu.berkeley.nlp.lm.util.LongRef;
/**
 * Reader that tokenizes newline-separated plain-text sentences and feeds each
 * one to an {@link LmReaderCallback}, one call per input line.
 *
 * @author adampauls
 *
 * @param <W>
 *            the word type handled by the {@link WordIndexer}
 */
public class TextReader<W> implements LmReader<LongRef, LmReaderCallback<LongRef>>
{

	private final WordIndexer<W> wordIndexer;

	private final Iterable<String> lineIterator;

	/**
	 * Reads lines from the given files. A file named "-" is read from stdin;
	 * files with a .gz suffix are unzipped as necessary (see {@link IOUtils}).
	 *
	 * @param inputFiles
	 *            names of files to read, in order
	 * @param wordIndexer
	 *            indexer used to map word strings to integer ids
	 */
	public TextReader(final List<String> inputFiles, final WordIndexer<W> wordIndexer) {
		this(getLineIterator(inputFiles), wordIndexer);
	}

	/**
	 * @param lineIterator
	 *            source of newline-separated sentences
	 * @param wordIndexer
	 *            indexer used to map word strings to integer ids
	 */
	public TextReader(final Iterable<String> lineIterator, final WordIndexer<W> wordIndexer) {
		this.lineIterator = lineIterator;
		this.wordIndexer = wordIndexer;
	}

	/**
	 * Reads whitespace-delimited sentences from the underlying line source and
	 * passes each one, bracketed by the indexer's start/end symbols, to the
	 * callback with a count of 1.
	 *
	 * @param callback
	 *            receives one {@code call} per input line, then {@code cleanup}
	 */
	@Override
	public void parse(final LmReaderCallback<LongRef> callback) {
		readFromFiles(callback);
	}

	private void readFromFiles(final LmReaderCallback<LongRef> callback) {
		Logger.startTrack("Reading in ngrams from raw text");
		countNgrams(lineIterator, callback);
		Logger.endTrack();
	}

	/**
	 * Tokenizes each line on whitespace, wraps the tokens with the indexer's
	 * start and end symbols, and forwards the resulting int sentence to the
	 * callback with a count of 1.
	 *
	 * @param allLinesIterator
	 *            lines to process
	 * @param callback
	 *            receives each sentence; {@code cleanup} is called at the end
	 */
	private void countNgrams(final Iterable<String> allLinesIterator, final LmReaderCallback<LongRef> callback) {
		long numLines = 0;
		for (final String line : allLinesIterator) {
			// Progress logging every 10k lines (logs line 0 as well).
			if (numLines % 10000 == 0) Logger.logs("On line " + numLines);
			numLines++;
			// NOTE(review): an empty line still yields one empty token from
			// split(), so a spurious "" word gets indexed — confirm intended.
			final String[] words = line.split("\\s+");
			// Two extra slots for the start and end symbols.
			final int[] sent = new int[words.length + 2];
			sent[0] = wordIndexer.getOrAddIndex(wordIndexer.getStartSymbol());
			sent[sent.length - 1] = wordIndexer.getOrAddIndex(wordIndexer.getEndSymbol());
			for (int i = 0; i < words.length; ++i) {
				sent[i + 1] = wordIndexer.getOrAddIndexFromString(words[i]);
			}
			callback.call(sent, 0, sent.length, new LongRef(1L), line);
		}
		callback.cleanup();
	}

	/**
	 * Lazily concatenates the lines of the given files into a single Iterable.
	 *
	 * @param files
	 *            file names to read, in order; "-" denotes stdin
	 * @return an Iterable over all lines of all files
	 */
	private static Iterable<String> getLineIterator(final Iterable<String> files) {
		return Iterators.flatten(new Iterators.Transform<String, Iterator<String>>(files.iterator())
		{
			@Override
			protected Iterator<String> transform(final String file) {
				try {
					if (file.equals("-")) {
						return IOUtils.lineIterator(IOUtils.getReader(System.in));
					} else {
						return IOUtils.lineIterator(file);
					}
				} catch (final IOException e) {
					// Wrap as unchecked so the lazy Iterable interface stays clean.
					throw new RuntimeException(e);
				}
			}
		});
	}
}