/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.jar.Manifest;
import javax.xml.parsers.ParserConfigurationException;
import org.languagetool.databroker.DefaultResourceDataBroker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.rules.Category;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.RuleMatchFilter;
import org.languagetool.rules.SameRuleGroupFilter;
import org.languagetool.rules.patterns.FalseFriendRuleLoader;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import org.languagetool.rules.spelling.SpellingCheckRule;
import org.languagetool.rules.spelling.SuggestionExtractor;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tokenizers.Tokenizer;
import org.xml.sax.SAXException;
/**
* The main class used for checking text against different rules:
* <ul>
* <li>the built-in rules (<i>a</i> vs. <i>an</i>, whitespace after commas, ...)
* <li>pattern rules loaded from external XML files with
* {@link #loadPatternRules(String)}
* <li>your own implementation of the abstract {@link Rule} classes added with
* {@link #addRule(Rule)}
* </ul>
*
* <p>Note that the constructors create a language checker that uses the built-in
* rules only. Other rules (e.g. from XML) need to be added explicitly or
* activated using {@link #activateDefaultPatternRules()}.
*/
@SuppressWarnings({"UnusedDeclaration"})
public final class JLanguageTool {
  public static final String VERSION = "2.2-SNAPSHOT"; // keep in sync with build.properties!
  // Build date read from the JAR manifest; null if not run from a JAR (see getBuildDate()).
  public static final String BUILD_DATE = getBuildDate();
  // File names of the XML rule resources (language-specific paths are resolved elsewhere):
  public static final String PATTERN_FILE = "grammar.xml";
  public static final String FALSE_FRIEND_FILE = "false-friends.xml";
  // POS tag names used to mark sentence and paragraph boundaries in analyzed text:
  public static final String SENTENCE_START_TAGNAME = "SENT_START";
  public static final String SENTENCE_END_TAGNAME = "SENT_END";
  public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
  // Base name of the i18n message resource bundles:
  public static final String MESSAGE_BUNDLE = "org.languagetool.MessagesBundle";
/**
* Returns the build date or <code>null</code> if not run from JAR.
*/
private static String getBuildDate() {
try {
final URL res = JLanguageTool.class.getResource(JLanguageTool.class.getSimpleName() + ".class");
final Object connObj = res.openConnection();
if (connObj instanceof JarURLConnection) {
final JarURLConnection conn = (JarURLConnection) connObj;
final Manifest manifest = conn.getManifest();
return manifest.getMainAttributes().getValue("Implementation-Date");
} else {
return null;
}
} catch (IOException e) {
throw new RuntimeException("Could not get build date from JAR", e);
}
}
  // Broker that locates /resource and /rules data; lazily re-created when null (see getDataBroker()).
  private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();
  private final List<Rule> builtinRules = new ArrayList<Rule>();
  private final List<Rule> userRules = new ArrayList<Rule>(); // rules added via addRule() method
  private final Set<String> disabledRules = new HashSet<String>();
  private final Set<String> enabledRules = new HashSet<String>(); // default-off rules explicitly enabled
  private final Set<String> disabledCategories = new HashSet<String>();
  private Language language;
  private Language motherTongue;
  private Disambiguator disambiguator;
  private Tagger tagger;
  private Tokenizer sentenceTokenizer;
  private Tokenizer wordTokenizer;
  private PrintStream printStream; // verbose output target; null = verbose output disabled
  private int sentenceCount; // sentences processed by the latest check() call
  private boolean listUnknownWords; // whether check() records words the tagger doesn't know
  private Set<String> unknownWords; // filled by check() when listUnknownWords is true
  /**
   * Constants for correct paragraph-rule handling:
   * <ul>
   * <li>NORMAL - all kinds of rules run</li>
   * <li>ONLYPARA - only paragraph-level rules</li>
   * <li>ONLYNONPARA - only sentence-level rules</li></ul>
   **/
  public static enum ParagraphHandling {
    /**
     * Handle normally - all kinds of rules run.
     */
    NORMAL,
    /**
     * Run only paragraph-level rules.
     */
    ONLYPARA,
    /**
     * Run only sentence-level rules.
     */
    ONLYNONPARA
  }
  // Temporary files registered via addTemporaryFile(), deleted by removeTemporaryFiles().
  private static List<File> temporaryFiles = new ArrayList<File>();
  /**
   * Create a JLanguageTool and setup the built-in rules appropriate for the
   * given language, ignoring false friend hints.
   *
   * @param language the language to be used - must not be <code>null</code>
   */
  public JLanguageTool(final Language language) throws IOException {
    // delegate with no mother tongue, so no false friend checking is possible:
    this(language, null);
  }
/**
* Create a JLanguageTool and setup the built-in rules appropriate for the
* given language.
*
* @param language the language to be used.
* @param motherTongue the user's mother tongue or <code>null</code>. The mother tongue
* may also be used as a source language for checking bilingual texts.
*/
public JLanguageTool(final Language language, final Language motherTongue)
throws IOException {
if (language == null) {
throw new NullPointerException("language cannot be null");
}
this.language = language;
this.motherTongue = motherTongue;
final ResourceBundle messages = getMessageBundle(language);
final Rule[] allBuiltinRules = getAllBuiltinRules(language, messages);
for (final Rule element : allBuiltinRules) {
if (element.supportsLanguage(language)) {
builtinRules.add(element);
}
}
disambiguator = language.getDisambiguator();
tagger = language.getTagger();
sentenceTokenizer = language.getSentenceTokenizer();
wordTokenizer = language.getWordTokenizer();
}
/**
* The grammar checker needs resources from following
* directories:
* <ul>
* <li>{@code /resource}</li>
* <li>{@code /rules}</li>
* </ul>
*
* This method is thread-safe.
*
* @return The currently set data broker which allows to obtain
* resources from the mentioned directories above. If no
* data broker was set, a new {@link DefaultResourceDataBroker} will
* be instantiated and returned.
* @since 1.0.1
*/
public static synchronized ResourceDataBroker getDataBroker() {
if (JLanguageTool.dataBroker == null) {
JLanguageTool.dataBroker = new DefaultResourceDataBroker();
}
return JLanguageTool.dataBroker;
}
  /**
   * The grammar checker needs resources from following
   * directories:
   *
   * <ul>
   * <li>{@code /resource}</li>
   * <li>{@code /rules}</li>
   * </ul>
   *
   * This method is thread-safe.
   *
   * @param broker The new resource broker to be used. Passing <code>null</code>
   *   makes {@link #getDataBroker()} lazily re-create a {@link DefaultResourceDataBroker}.
   * @since 1.0.1
   */
  public static synchronized void setDataBroker(ResourceDataBroker broker) {
    JLanguageTool.dataBroker = broker;
  }
  /**
   * Whether the check() method stores unknown words. If set to
   * <code>true</code> (default: false), you can get the list of unknown words
   * using getUnknownWords().
   *
   * @param listUnknownWords whether unknown words should be collected during checking
   */
  public void setListUnknownWords(final boolean listUnknownWords) {
    this.listUnknownWords = listUnknownWords;
  }
/**
* Gets the ResourceBundle for the default language of the user's system.
*/
public static ResourceBundle getMessageBundle() {
try {
final ResourceBundle bundle = ResourceBundle.getBundle(MESSAGE_BUNDLE);
final ResourceBundle fallbackBundle = ResourceBundle.getBundle(
MESSAGE_BUNDLE, Locale.ENGLISH);
return new ResourceBundleWithFallback(bundle, fallbackBundle);
} catch (final MissingResourceException e) {
return ResourceBundle.getBundle(
MESSAGE_BUNDLE, Locale.ENGLISH);
}
}
  /**
   * Gets the ResourceBundle for the given user interface language, falling
   * back to English for messages missing in that bundle. Lookup order:
   * language+country locale, then language-only locale, then the default
   * country variant of the language, then plain English.
   */
  static ResourceBundle getMessageBundle(final Language lang) {
    try {
      ResourceBundle bundle = ResourceBundle.getBundle(MESSAGE_BUNDLE, lang.getLocaleWithCountry());
      if (!isValidBundleFor(lang, bundle)) {
        // the bundle found was only the default one, try language without country:
        bundle = ResourceBundle.getBundle(MESSAGE_BUNDLE, lang.getLocale());
        if (!isValidBundleFor(lang, bundle)) {
          // happens if 'xx' is requested but only a MessagesBundle_xx_YY.properties exists:
          final Language defaultVariant = lang.getDefaultVariant();
          if (defaultVariant != null && defaultVariant.getCountryVariants().length > 0) {
            final Locale locale = new Locale(defaultVariant.getShortName(), defaultVariant.getCountryVariants()[0]);
            bundle = ResourceBundle.getBundle(MESSAGE_BUNDLE, locale);
          }
        }
      }
      // wrap so that keys missing in the language bundle resolve from English:
      final ResourceBundle fallbackBundle = ResourceBundle.getBundle(
          MESSAGE_BUNDLE, Locale.ENGLISH);
      return new ResourceBundleWithFallback(bundle, fallbackBundle);
    } catch (final MissingResourceException e) {
      return ResourceBundle.getBundle(
          MESSAGE_BUNDLE, Locale.ENGLISH);
    }
  }
private static boolean isValidBundleFor(final Language lang, final ResourceBundle bundle) {
return lang.getLocale().getLanguage().equals(bundle.getLocale().getLanguage());
}
private Rule[] getAllBuiltinRules(final Language language, final ResourceBundle messages) {
final List<Rule> rules = new ArrayList<Rule>();
final List<Class<? extends Rule>> languageRules = language.getRelevantRules();
for (Class<? extends Rule> ruleClass : languageRules) {
final Constructor[] constructors = ruleClass.getConstructors();
try {
if (constructors.length > 0) {
final Constructor constructor = constructors[0];
final Class[] paramTypes = constructor.getParameterTypes();
if (paramTypes.length == 1
&& paramTypes[0].equals(ResourceBundle.class)) {
rules.add((Rule) constructor.newInstance(messages));
} else if (paramTypes.length == 2
&& paramTypes[0].equals(ResourceBundle.class)
&& paramTypes[1].equals(Language.class)) {
rules.add((Rule) constructor.newInstance(messages, language));
} else {
throw new RuntimeException("No matching constructor found for rule class: " + ruleClass.getName());
}
} else {
throw new RuntimeException("No public constructor for rule class: " + ruleClass.getName());
}
} catch (Exception e) {
throw new RuntimeException("Failed to load built-in Java rules for language " + language, e);
}
}
return rules.toArray(new Rule[rules.size()]);
}
  /**
   * Set a PrintStream that will receive verbose output. Set to
   * <code>null</code> to disable verbose output.
   *
   * @param printStream the stream for verbose output, or <code>null</code>
   */
  public void setOutput(final PrintStream printStream) {
    this.printStream = printStream;
  }
/**
* Load pattern rules from an XML file. Use {@link #addRule(Rule)} to add these
* rules to the checking process.
*
* @return a List of {@link PatternRule} objects
*/
public List<PatternRule> loadPatternRules(final String filename)
throws IOException {
final PatternRuleLoader ruleLoader = new PatternRuleLoader();
final InputStream is = this.getClass().getResourceAsStream(filename);
if (is == null) {
// happens for external rules plugged in as an XML file:
return ruleLoader.getRules(new File(filename));
} else {
return ruleLoader.getRules(is, filename);
}
}
/**
* Load false friend rules from an XML file. Only those pairs will be loaded
* that match the current text language and the mother tongue specified in the
* JLanguageTool constructor. Use {@link #addRule(Rule)} to add these rules to the
* checking process.
*
* @return a List of {@link PatternRule} objects
*/
public List<PatternRule> loadFalseFriendRules(final String filename)
throws ParserConfigurationException, SAXException, IOException {
if (motherTongue == null) {
return new ArrayList<PatternRule>();
}
final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader();
return ruleLoader.getRules(this.getClass().getResourceAsStream(filename),
language, motherTongue);
}
/**
* Loads and activates the pattern rules from
* <code>org/languagetool/rules/<languageCode>/grammar.xml</code>.
*/
public void activateDefaultPatternRules() throws IOException {
final List<PatternRule> patternRules = new ArrayList<PatternRule>();
for (String patternRuleFileName : language.getRuleFileName()) {
patternRules.addAll(loadPatternRules(patternRuleFileName));
}
userRules.addAll(patternRules);
}
/**
* Loads and activates the false friend rules from
* <code>rules/false-friends.xml</code>.
*/
public void activateDefaultFalseFriendRules()
throws ParserConfigurationException, SAXException, IOException {
final String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE;
final List<PatternRule> patternRules = loadFalseFriendRules(falseFriendRulesFilename);
userRules.addAll(patternRules);
}
/**
* Add a rule to be used by the next call to {@link #check(String)}.
*/
public void addRule(final Rule rule) {
userRules.add(rule);
final SuggestionExtractor extractor = new SuggestionExtractor();
final List<String> suggestionTokens = extractor.getSuggestionTokens(rule, language);
final List<Rule> allActiveRules = getAllActiveRules();
addIgnoreWords(suggestionTokens, allActiveRules);
}
private void addIgnoreWords(List<String> suggestionTokens, List<Rule> allActiveRules) {
for (Rule activeRule : allActiveRules) {
if (activeRule instanceof SpellingCheckRule) {
((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens);
}
}
}
private void setIgnoreWords(List<String> suggestionTokens, List<Rule> allActiveRules) {
for (Rule activeRule : allActiveRules) {
if (activeRule instanceof SpellingCheckRule) {
((SpellingCheckRule)activeRule).resetIgnoreTokens();
((SpellingCheckRule)activeRule).addIgnoreTokens(suggestionTokens);
}
}
}
  /**
   * Disable a given rule so {@link #check(String)} won't use it.
   *
   * @param ruleId the id of the rule to disable - no error will be given if the id does not exist
   */
  public void disableRule(final String ruleId) {
    disabledRules.add(ruleId);
    // keep the spell checkers' ignore lists in sync with the active rule set:
    reInitSpellCheckIgnoreWords();
  }
  // Rebuilds the spell checkers' ignore-token lists from the currently active rules,
  // e.g. after a rule or category has been disabled.
  private void reInitSpellCheckIgnoreWords() {
    final List<Rule> allActiveRules = getAllActiveRules();
    final List<String> ignoreTokens = getAllIgnoreWords(allActiveRules);
    setIgnoreWords(ignoreTokens, allActiveRules);
  }
private List<String> getAllIgnoreWords(List<Rule> allActiveRules) {
final List<String> suggestionTokens = new ArrayList<String>();
for (Rule activeRule : allActiveRules) {
if (activeRule instanceof PatternRule) {
final SuggestionExtractor extractor = new SuggestionExtractor();
suggestionTokens.addAll(extractor.getSuggestionTokens(activeRule, language));
}
}
return suggestionTokens;
}
  /**
   * Disable a given category so {@link #check(String)} won't use it.
   *
   * @param categoryName the name of the category to disable - no error will be given if it does not exist
   */
  public void disableCategory(final String categoryName) {
    disabledCategories.add(categoryName);
    // keep the spell checkers' ignore lists in sync with the active rule set:
    reInitSpellCheckIgnoreWords();
  }
  /**
   * Get the language that was used to configure this instance.
   */
  public Language getLanguage() {
    return language;
  }
  /**
   * Get rule ids of the rules that have been explicitly disabled.
   * NOTE(review): this returns the internal mutable set, not a copy -
   * callers could bypass disableRule()/enableRule(); consider returning an
   * unmodifiable view if no caller relies on mutating it.
   */
  public Set<String> getDisabledRules() {
    return disabledRules;
  }
  /**
   * Enable a rule that was switched off by default.
   *
   * @param ruleId the id of the turned off rule to enable.
   */
  public void enableDefaultOffRule(final String ruleId) {
    enabledRules.add(ruleId);
  }
  /**
   * Get category names of the categories that have been explicitly disabled.
   * NOTE(review): returns the internal mutable set, not a copy - see
   * {@link #getDisabledRules()}.
   */
  public Set<String> getDisabledCategories() {
    return disabledCategories;
  }
/**
* Re-enable a given rule so {@link #check(String)} will use it.
*
* @param ruleId the id of the rule to enable
*/
public void enableRule(final String ruleId) {
if (disabledRules.contains(ruleId)) {
disabledRules.remove(ruleId);
}
}
  /**
   * Returns the text split into sentences, using the language's sentence tokenizer.
   */
  public List<String> sentenceTokenize(final String text) {
    return sentenceTokenizer.tokenize(text);
  }
  /**
   * The main check method. Tokenizes the text into sentences and matches these
   * sentences against all currently active rules.
   *
   * @param text the text to check
   * @return a List of {@link RuleMatch} objects
   */
  public List<RuleMatch> check(final String text) throws IOException {
    // default: tokenize into sentences and run all kinds of rules
    return check(text, true, ParagraphHandling.NORMAL);
  }
  /**
   * The main check method. Tokenizes the text into sentences and matches these
   * sentences against all currently active rules.
   *
   * @param text
   *          The text to check. Call this method with the complete text to check. If you call it
   *          with smaller chunks like paragraphs or sentence, those rules that work across
   *          paragraphs/sentences won't work (their status gets reset whenever this method is called).
   * @param tokenizeText
   *          If true, then the text is tokenized into sentences.
   *          Otherwise, it is assumed it's already tokenized.
   * @param paraMode
   *          Uses paragraph-level rules only if true.
   * @return a List of {@link RuleMatch} objects
   */
  public List<RuleMatch> check(final String text, boolean tokenizeText, final ParagraphHandling paraMode) throws IOException {
    sentenceCount = 0;
    final List<String> sentences;
    if (tokenizeText) {
      sentences = sentenceTokenize(text);
    } else {
      // treat the whole text as a single sentence:
      sentences = new ArrayList<String>();
      sentences.add(text);
    }
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final List<Rule> allRules = getAllRules();
    printIfVerbose(allRules.size() + " rules activated for language " + language);
    // running position of the current sentence within the complete text:
    int charCount = 0;
    int lineCount = 0;
    int columnCount = 1;
    unknownWords = new HashSet<String>();
    for (final String sentence : sentences) {
      sentenceCount++;
      AnalyzedSentence analyzedSentence = getAnalyzedSentence(sentence);
      rememberUnknownWords(analyzedSentence);
      if (sentenceCount == sentences.size()) {
        // mark the last token of the last sentence as the paragraph end:
        final AnalyzedTokenReadings[] anTokens = analyzedSentence.getTokens();
        anTokens[anTokens.length - 1].setParaEnd();
        analyzedSentence = new AnalyzedSentence(anTokens);
      }
      printIfVerbose(analyzedSentence.toString());
      printIfVerbose(analyzedSentence.getAnnotations());
      final List<RuleMatch> sentenceMatches =
          checkAnalyzedSentence(paraMode, allRules, charCount, lineCount,
              columnCount, sentence, analyzedSentence);
      Collections.sort(sentenceMatches);
      ruleMatches.addAll(sentenceMatches);
      // advance position counters to the start of the next sentence:
      charCount += sentence.length();
      lineCount += countLineBreaks(sentence);
      // calculate matching column:
      final int lineBreakPos = sentence.lastIndexOf('\n');
      if (lineBreakPos == -1) {
        // no line break in this sentence - the next sentence continues on the same line:
        columnCount += sentence.length();
      } else {
        if (lineBreakPos == 0) {
          columnCount = sentence.length();
          if (!language.getSentenceTokenizer().singleLineBreaksMarksPara()) {
            columnCount--;
          }
        } else {
          // column restarts after the last line break within the sentence:
          columnCount = sentence.length() - lineBreakPos;
        }
      }
    }
    if (!ruleMatches.isEmpty() && !paraMode.equals(ParagraphHandling.ONLYNONPARA)) {
      // removing false positives in paragraph-level rules
      for (final Rule rule : allRules) {
        if (rule.isParagraphBackTrack() && (rule.getMatches() != null)) {
          final List<RuleMatch> rm = rule.getMatches();
          for (final RuleMatch r : rm) {
            if (rule.isInRemoved(r)) {
              ruleMatches.remove(r);
            }
          }
        }
      }
    }
    return ruleMatches;
  }
  /**
   * Matches all applicable rules against one analyzed sentence and returns the
   * matches with positions adjusted to the complete text.
   *
   * @param paraMode which kinds of rules to run (paragraph-level only, sentence-level only, or all)
   * @param allRules the candidate rules; disabled rules and categories are skipped here
   * @param tokenCount character offset of this sentence's start within the complete text
   *          (despite the name - it is added to the matches' character positions)
   * @param lineCount number of line breaks before this sentence
   * @param columnCount column at which this sentence starts
   * @param sentence the plain text of the sentence
   * @param analyzedSentence the analyzed form of {@code sentence}
   * @return the matches, filtered so only one match per rule group remains
   */
  public List<RuleMatch> checkAnalyzedSentence(final ParagraphHandling paraMode,
      final List<Rule> allRules, int tokenCount, int lineCount,
      int columnCount, final String sentence, AnalyzedSentence analyzedSentence)
      throws IOException {
    final List<RuleMatch> sentenceMatches = new ArrayList<RuleMatch>();
    for (final Rule rule : allRules) {
      // skip rules turned off explicitly, and default-off rules not turned on explicitly:
      if (disabledRules.contains(rule.getId())
          || (rule.isDefaultOff() && !enabledRules.contains(rule.getId()))) {
        continue;
      }
      final Category category = rule.getCategory();
      if (category != null && disabledCategories.contains(category.getName())) {
        continue;
      }
      // filter by paragraph-handling mode (the 'continue' targets the rule loop):
      switch (paraMode) {
        case ONLYNONPARA: {
          if (rule.isParagraphBackTrack()) {
            continue;
          }
          break;
        }
        case ONLYPARA: {
          if (!rule.isParagraphBackTrack()) {
            continue;
          }
          break;
        }
        case NORMAL:
        default:
      }
      final RuleMatch[] thisMatches = rule.match(analyzedSentence);
      for (final RuleMatch element1 : thisMatches) {
        // change positions so they are relative to the complete text, not just the sentence:
        final RuleMatch thisMatch = adjustRuleMatchPos(element1,
            tokenCount, columnCount, lineCount, sentence);
        sentenceMatches.add(thisMatch);
        if (rule.isParagraphBackTrack()) {
          // remember the match so paragraph-level post-processing can revoke it later:
          rule.addRuleMatch(thisMatch);
        }
      }
    }
    final RuleMatchFilter filter = new SameRuleGroupFilter();
    return filter.filter(sentenceMatches);
  }
  /**
   * Change RuleMatch positions so they are relative to the complete text,
   * not just to the sentence.
   *
   * @param rm RuleMatch with sentence-relative positions
   * @param sentLen character offset of the sentence start within the complete text
   * @param columnCount column at which the sentence starts
   * @param lineCount number of line breaks before the sentence
   * @param sentence the text of the sentence being checked
   * @return a new RuleMatch object with text-relative positions
   */
  public RuleMatch adjustRuleMatchPos(final RuleMatch rm, int sentLen,
      int columnCount, int lineCount, final String sentence) {
    final RuleMatch thisMatch = new RuleMatch(rm.getRule(),
        rm.getFromPos() + sentLen, rm.getToPos() + sentLen, rm.getMessage(), rm.getShortMessage());
    thisMatch.setSuggestedReplacements(rm.getSuggestedReplacements());
    final String sentencePartToError = sentence.substring(0, rm.getFromPos());
    final String sentencePartToEndOfError = sentence.substring(0,rm.getToPos());
    final int lastLineBreakPos = sentencePartToError.lastIndexOf('\n');
    final int column;
    final int endColumn;
    if (lastLineBreakPos == -1) {
      // error is on the sentence's first line - add the sentence's start column:
      column = sentencePartToError.length() + columnCount;
    } else {
      // error is on a later line - column counts from the last line break:
      column = sentencePartToError.length() - lastLineBreakPos;
    }
    final int lastLineBreakPosInError = sentencePartToEndOfError.lastIndexOf('\n');
    if (lastLineBreakPosInError == -1) {
      endColumn = sentencePartToEndOfError.length() + columnCount;
    } else {
      endColumn = sentencePartToEndOfError.length() - lastLineBreakPosInError;
    }
    final int lineBreaksToError = countLineBreaks(sentencePartToError);
    final int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError);
    thisMatch.setLine(lineCount + lineBreaksToError);
    thisMatch.setEndLine(lineCount + lineBreaksToEndOfError);
    thisMatch.setColumn(column);
    thisMatch.setEndColumn(endColumn);
    thisMatch.setOffset(rm.getFromPos() + sentLen);
    return thisMatch;
  }
  // Records tokens the tagger could not assign a POS tag to (only when
  // listUnknownWords is enabled; results available via getUnknownWords()).
  private void rememberUnknownWords(final AnalyzedSentence analyzedText) {
    if (listUnknownWords) {
      final AnalyzedTokenReadings[] atr = analyzedText
          .getTokensWithoutWhitespace();
      for (final AnalyzedTokenReadings t : atr) {
        // NOTE(review): relies on AnalyzedTokenReadings.toString() rendering
        // untagged readings as "...null]" - fragile; confirm against that class.
        if (t.getReadings().toString().contains("null]")) {
          unknownWords.add(t.getToken());
        }
      }
    }
  }
/**
* Get the alphabetically sorted list of unknown words in the latest run of the check() method.
*
* @throws IllegalStateException if {@link #setListUnknownWords(boolean)} has been set to <code>false</code>
*/
public List<String> getUnknownWords() {
if (!listUnknownWords) {
throw new IllegalStateException("listUnknownWords is set to false, unknown words not stored");
}
final List<String> words = new ArrayList<String>(unknownWords);
Collections.sort(words);
return words;
}
// non-private only for test case
static int countLineBreaks(final String s) {
int pos = -1;
int count = 0;
while (true) {
final int nextPos = s.indexOf('\n', pos + 1);
if (nextPos == -1) {
break;
}
pos = nextPos;
count++;
}
return count;
}
  /**
   * Tokenizes the given <code>sentence</code> into words and analyzes it,
   * and then disambiguates POS tags.
   */
  public AnalyzedSentence getAnalyzedSentence(final String sentence) throws IOException {
    // disambiguate assigned tags & return
    return disambiguator.disambiguate(getRawAnalyzedSentence(sentence));
  }
  /**
   * Tokenizes the given <code>sentence</code> into words and analyzes it.
   * Soft hyphens (U+00AD) inside words are stripped before tagging and the
   * original spelling is kept as an extra reading; a SENT_START token is
   * prepended and SENT_END (plus PARA_END on a final line break) is set on
   * the last non-whitespace token.
   *
   * @param sentence Sentence to be analyzed
   * @since 0.9.8
   */
  public AnalyzedSentence getRawAnalyzedSentence(final String sentence) throws IOException {
    final List<String> tokens = wordTokenizer.tokenize(sentence);
    final Map<Integer, String> softHyphenTokens = new HashMap<Integer, String>();
    //for soft hyphens inside words, happens especially in OOo:
    for (int i = 0; i < tokens.size(); i++) {
      if (tokens.get(i).indexOf('\u00ad') != -1) {
        // remember the original (with soft hyphen) so it can be re-added as a reading:
        softHyphenTokens.put(i, tokens.get(i));
        tokens.set(i, tokens.get(i).replaceAll("\u00ad", ""));
      }
    }
    final List<AnalyzedTokenReadings> aTokens = tagger.tag(tokens);
    final int numTokens = aTokens.size();
    // posFix compensates for the characters removed by soft-hyphen stripping:
    int posFix = 0;
    for (int i = 1; i < numTokens; i++) {
      aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace());
      aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix);
      if (!softHyphenTokens.isEmpty()) {
        if (softHyphenTokens.get(i) != null) {
          // keep the original spelling (with soft hyphen) as an additional reading:
          aTokens.get(i).addReading(tagger.createToken(softHyphenTokens.get(i), null));
          posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length();
        }
      }
    }
    // one extra slot for the artificial SENT_START token:
    final AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens
        .size() + 1];
    final AnalyzedToken[] startTokenArray = new AnalyzedToken[1];
    int toArrayCount = 0;
    final AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null);
    startTokenArray[0] = sentenceStartToken;
    tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0);
    int startPos = 0;
    for (final AnalyzedTokenReadings posTag : aTokens) {
      posTag.setStartPos(startPos);
      tokenArray[toArrayCount++] = posTag;
      startPos += posTag.getToken().length();
    }
    // add additional tags
    int lastToken = toArrayCount - 1;
    // make SENT_END appear at last not whitespace token
    for (int i = 0; i < toArrayCount - 1; i++) {
      if (!tokenArray[lastToken - i].isWhitespace()) {
        lastToken -= i;
        break;
      }
    }
    tokenArray[lastToken].setSentEnd();
    // a sentence ending in a line break also marks a paragraph end:
    if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) {
      tokenArray[lastToken].setParaEnd();
    }
    return new AnalyzedSentence(tokenArray);
  }
/**
* Get all rules for the current language that are built-in or that have been
* added using {@link #addRule(Rule)}.
* @return a List of {@link Rule} objects
*/
public List<Rule> getAllRules() {
final List<Rule> rules = new ArrayList<Rule>();
rules.addAll(builtinRules);
rules.addAll(userRules);
// Some rules have an internal state so they can do checks over sentence
// boundaries. These need to be reset so the checks don't suddenly
// work on different texts with the same data. However, it could be useful
// to keep the state information if we're checking a continuous text.
for (final Rule rule : rules) {
rule.reset();
}
return rules;
}
/**
* Get all active (not disabled) rules for the current language that are built-in or that
* have been added using e.g. {@link #addRule(Rule)}.
* @return a List of {@link Rule} objects
*/
public List<Rule> getAllActiveRules() {
final List<Rule> rules = new ArrayList<Rule>();
final List<Rule> rulesActive = new ArrayList<Rule>();
rules.addAll(builtinRules);
rules.addAll(userRules);
// Some rules have an internal state so they can do checks over sentence
// boundaries. These need to be reset so the checks don't suddenly
// work on different texts with the same data. However, it could be useful
// to keep the state information if we're checking a continuous text.
for (final Rule rule : rules) {
rule.reset();
if (!disabledRules.contains(rule.getId())) {
rulesActive.add(rule);
}
}
return rulesActive;
}
  /**
   * Number of sentences the latest call to {@link #check(String)} has checked.
   */
  public int getSentenceCount() {
    return sentenceCount;
  }
  // Prints the given string to the verbose-output stream, if one was set via setOutput().
  private void printIfVerbose(final String s) {
    if (printStream != null) {
      printStream.println(s);
    }
  }
  /**
   * Adds a temporary file to the internal list so it can be deleted later
   * via {@link #removeTemporaryFiles()}.
   *
   * @param file the file to be added.
   */
  public static void addTemporaryFile(final File file) {
    temporaryFiles.add(file);
  }
/**
* Clean up all temporary files, if there are any.
*/
public static void removeTemporaryFiles() {
for (File file : temporaryFiles) {
file.delete();
}
}
}