// Copyright 2014 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.tokenize.preprocess;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.commons.compress.compressors.bzip2.*;
public class WikiReader implements Iterator<Pair> {
private Pair pair_;
private InternalReader untokenized_;
private InternalReader tokenized_;
private boolean expand_;
public WikiReader(InternalReader untokenized, InternalReader tokenized, boolean expand) {
untokenized_ = untokenized;
tokenized_ = tokenized;
expand_ = expand;
}
public WikiReader(String untokenized_file, String tokenized_file, boolean expand) {
this(openFile(untokenized_file), openFile(tokenized_file), expand);
}
public static InternalReader openFile(String file) {
try {
return new BufferedReaderWrapper(new BufferedReader(
new InputStreamReader(new BZip2CompressorInputStream(
new FileInputStream(file)), "UTF-8")));
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean hasNext() {
readNext();
return pair_ != null;
}
protected String fixLine(String line) {
if (line == null) {
return line;
}
line = line.replace((char)0xa0, ' ');
return line;
}
protected String readNonEmptyLine(InternalReader reader) {
String line = fixLine(reader.readLine());
if (line == null) {
throw new NoSuchElementException();
}
line = line.trim();
while (line.isEmpty()) {
line = fixLine(reader.readLine());
if (line == null) {
throw new NoSuchElementException();
}
line = line.trim();
}
return line;
}
public void readNext() {
if (pair_ != null) {
return;
}
try {
String tokenized = readNonEmptyLine(tokenized_);
String untokenized = readNonEmptyLine(untokenized_);
pair_ = new Pair(tokenized, untokenized);
if (expand_)
expandPair();
if (pair_.score > 0.7 && pair_.tokenized.length() > 20) {
throw new RuntimeException(String.format("Alignment error: %s --- %s : %g", pair_.tokenized, pair_.untokenized, pair_.score));
}
} catch (NoSuchElementException e) {
}
}
protected void expandPair() {
Pair pair;
boolean expanded = false;
// expand left:
try {
tokenized_.mark();
pair = new Pair(pair_.tokenized + readNonEmptyLine(tokenized_),
pair_.untokenized);
if (pair.score < pair_.score) {
pair_ = pair;
expanded = true;
} else {
tokenized_.reset(); // not supported in BufferedReaderWrapper!
}
} catch (NoSuchElementException e) {
}
// expand right:
untokenized_.mark();
pair = new Pair(pair_.tokenized, pair_.untokenized
+ readNonEmptyLine(untokenized_));
if (pair.score < pair_.score) {
pair_ = pair;
expanded = true;
} else {
untokenized_.reset(); // not supported in BufferedReaderWrapper!
}
if (expanded) {
expandPair();
}
}
@Override
public Pair next() {
readNext();
if (pair_ == null) {
throw new NoSuchElementException();
}
Pair pair = pair_;
pair_ = null;
return pair;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
public List<Pair> readAll() {
List<Pair> pairs = new LinkedList<Pair>();
while (hasNext()) {
pairs.add(next());
}
return pairs;
}
}