/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.dumpcheck;
import org.languagetool.Language;
import org.languagetool.tokenizers.Tokenizer;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/**
* Source of sentences to be checked/indexed. Subclasses provide access to XML files
* or plain text sources.
* @since 2.4
*/
public abstract class SentenceSource implements Iterator<Sentence> {

  // Bounds applied to the trimmed sentence by acceptSentence().
  private static final int MIN_SENTENCE_SIZE = 10;
  private static final int MIN_SENTENCE_TOKEN_COUNT = 4;
  private static final int MAX_SENTENCE_LENGTH = 300;

  private final Tokenizer wordTokenizer;
  // May be null, in which case no pattern pre-filtering takes place.
  private final Pattern acceptPattern;

  /**
   * Creates a source without a pre-filter pattern.
   * @param language language whose word tokenizer is used to count tokens
   */
  SentenceSource(Language language) {
    this(language, null);
  }

  /**
   * Creates a source that optionally pre-filters sentences by a pattern.
   * @param language language whose word tokenizer is used to count tokens
   * @param acceptPattern pattern a sentence has to contain a match for in order
   *        to be accepted, or {@code null} to disable this filter
   * @since 3.0
   */
  SentenceSource(Language language, Pattern acceptPattern) {
    this.acceptPattern = acceptPattern;
    this.wordTokenizer = language.getWordTokenizer();
  }

  @Override
  public abstract boolean hasNext();

  /**
   * Return the next sentence. Sentences from the source are filtered by length
   * to remove very short and very long sentences.
   */
  @Override
  public abstract Sentence next();

  /**
   * Returns a string identifying this source; it is used as the prefix
   * of {@link #toString()}.
   */
  public abstract String getSource();

  /**
   * Not supported.
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("remove not supported");
  }

  /**
   * Returns the value of {@link #getSource()}, a dash, and the default
   * {@code Object} string representation of this instance.
   */
  @Override
  public String toString() {
    String source = getSource();
    String identity = super.toString();
    return source + "-" + identity;
  }

  /**
   * Decides whether a sentence should be used. A sentence is accepted if it
   * contains a match for the accept pattern (when one was given) and if, after
   * trimming, its length is within the allowed bounds and it has at least the
   * minimum number of non-whitespace tokens.
   * @param sentence the candidate sentence; the pattern is applied to it untrimmed
   * @return {@code true} if the sentence passes all filters
   */
  protected boolean acceptSentence(String sentence) {
    // Useful speedup: sentences that cannot match anyway are rejected before
    // any trimming or tokenizing happens.
    boolean cannotMatch = acceptPattern != null && !acceptPattern.matcher(sentence).find();
    if (cannotMatch) {
      return false;
    }
    String trimmed = sentence.trim();
    int length = trimmed.length();
    if (length < MIN_SENTENCE_SIZE || length > MAX_SENTENCE_LENGTH) {
      return false;
    }
    return countTokens(trimmed) >= MIN_SENTENCE_TOKEN_COUNT;
  }

  /**
   * Counts the tokens of the sentence that are not empty after trimming,
   * i.e. whitespace-only tokens produced by the word tokenizer are skipped.
   */
  private int countTokens(String sentence) {
    int count = 0;
    for (String token : wordTokenizer.tokenize(sentence)) {
      if (token.trim().isEmpty()) {
        continue;
      }
      count++;
    }
    return count;
  }

}