// Copyright 2014 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.tokenize.preprocess;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
/**
 * Filters the sentence pairs delivered by a {@link WikiReader}.
 *
 * <p>A pair is selected when its tokenized side splits into more than
 * {@code token_threshold} whitespace-separated pieces and its score is
 * greater than {@code score_threshold}. Iteration stops once
 * {@code num_sentences} pairs have been selected; a value of zero or less
 * disables that limit.
 *
 * <p>Every call to {@link #iterator()} creates a fresh {@link WikiReader}
 * over the configured files.
 */
public class WikiSelector implements Iterable<Pair> {

    /** Whitespace run used for token counting; compiled once instead of once per sentence. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private final int num_sentences_;
    private final boolean expand_;
    private final String tokenized_file_;
    private final String untokenized_file_;
    private final int token_threshold_;
    private final double score_threshold_;

    /**
     * @param untokenized_file first argument handed to the {@link WikiReader} constructor
     * @param tokenized_file second argument handed to the {@link WikiReader} constructor
     * @param expand third argument handed to the {@link WikiReader} constructor
     * @param num_sentences maximum number of pairs to select; values {@code <= 0} mean no limit
     * @param token_threshold a pair is kept only if it has strictly more pieces than this
     * @param score_threshold a pair is kept only if its score is strictly greater than this
     */
    public WikiSelector(String untokenized_file, String tokenized_file, boolean expand, int num_sentences, int token_threshold, double score_threshold) {
        num_sentences_ = num_sentences;
        expand_ = expand;
        untokenized_file_ = untokenized_file;
        tokenized_file_ = tokenized_file;
        token_threshold_ = token_threshold;
        score_threshold_ = score_threshold;
    }

    /**
     * Convenience constructor using a token threshold of 5 and a score
     * threshold of 0.01.
     *
     * @param untokenized_file first argument handed to the {@link WikiReader} constructor
     * @param tokenized_file second argument handed to the {@link WikiReader} constructor
     * @param expand third argument handed to the {@link WikiReader} constructor
     * @param max_sentences maximum number of pairs to select; values {@code <= 0} mean no limit
     */
    public WikiSelector(String untokenized_file, String tokenized_file, boolean expand, int max_sentences) {
        this(untokenized_file, tokenized_file, expand, max_sentences, 5, 0.01);
    }

    /**
     * Returns an iterator over the selected pairs.
     *
     * <p>The returned iterator does not support {@code remove()}, and its
     * {@code next()} throws {@link NoSuchElementException} when no further
     * pair is available.
     */
    @Override
    public Iterator<Pair> iterator() {
        final WikiReader reader = new WikiReader(untokenized_file_, tokenized_file_, expand_);
        return new Iterator<Pair>() {
            /** Look-ahead slot: the next pair to hand out, or null if none is buffered. */
            private Pair pair_ = null;
            private int num_selected_sentences_ = 0;
            /** Set once the reader has returned null, so it is not read again. */
            private boolean exhausted_ = false;

            @Override
            public boolean hasNext() {
                return advance();
            }

            /**
             * Ensures the look-ahead slot is filled if another acceptable pair exists.
             *
             * @return true if a pair is buffered, false if the limit was hit or input ended
             */
            private boolean advance() {
                if (pair_ != null) {
                    return true;
                }
                if (exhausted_) {
                    return false;
                }
                // Iterative on purpose: a long run of rejected pairs must not grow the call stack.
                while (num_sentences_ <= 0 || num_selected_sentences_ < num_sentences_) {
                    Pair candidate = reader.next();
                    if (candidate == null) {
                        // NOTE(review): WikiReader's end-of-input behaviour is not visible here;
                        // a null pair is treated as "no more data" rather than dereferenced.
                        exhausted_ = true;
                        return false;
                    }
                    int num_tokens = WHITESPACE.split(candidate.tokenized).length;
                    if (num_tokens > token_threshold_ && candidate.score > score_threshold_) {
                        num_selected_sentences_ += 1;
                        pair_ = candidate;
                        return true;
                    }
                }
                return false;
            }

            @Override
            public Pair next() {
                if (!advance()) {
                    throw new NoSuchElementException();
                }
                Pair result = pair_;
                pair_ = null;
                return result;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }
}