/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.index;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Counter;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.*;
import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD;
import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_FIELD_VAL;
import static org.languagetool.dev.wikipedia.WikipediaIndexHandler.MAX_DOC_COUNT_VALUE;
/**
* A class with a main() method that takes a rule id and the location of the
* index that runs the query on that index and prints all matches.
* Will transparently handle rules that are not supported, i.e. run on the candidate matches
* up to a limit.
*
* @author Tao Lin
* @author Daniel Naber
*/
public class Searcher {
private int maxHits = 1000;
private int maxSearchTimeMillis = 5000;
private Directory directory;
private IndexSearcher indexSearcher;
private DirectoryReader reader;
public Searcher(Directory directory) throws IOException {
//openIndex(directory);
this.directory = directory;
}
private void open() throws IOException {
reader = DirectoryReader.open(directory);
indexSearcher = new IndexSearcher(reader);
//System.out.println("Opened index " + directory + " with " + indexSearcher.getIndexReader().numDocs() + " docs");
}
private void close() throws IOException {
if (reader != null) {
reader.close();
}
}
public int getDocCount() throws IOException {
final DirectoryReader reader = DirectoryReader.open(directory);
try {
final IndexSearcher indexSearcher = new IndexSearcher(reader);
return getDocCount(indexSearcher);
} finally {
reader.close();
}
}
private int getDocCount(IndexSearcher indexSearcher) throws IOException {
final Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL);
final TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1);
if (search.totalHits != 1) {
return -1;
}
final ScoreDoc scoreDoc = search.scoreDocs[0];
final Document doc = indexSearcher.doc(scoreDoc.doc);
return Integer.parseInt(doc.get(MAX_DOC_COUNT_VALUE));
}
public int getMaxHits() {
return maxHits;
}
public void setMaxHits(int maxHits) {
this.maxHits = maxHits;
}
public int getMaxSearchTimeMillis() {
return maxSearchTimeMillis;
}
public void setMaxSearchTimeMillis(int maxSearchTimeMillis) {
this.maxSearchTimeMillis = maxSearchTimeMillis;
}
public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language) throws IOException, UnsupportedPatternRuleException {
// it seems wasteful to re-open the index every time, but I had strange problems (OOM, Array out of bounds, ...)
// when not doing so...
open();
try {
final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language);
final Query query = patternRuleQueryBuilder.buildRelaxedQuery(rule);
if (query == null) {
throw new NullPointerException("Cannot search on null query for rule: " + rule.getId());
}
final SearchRunnable runnable = new SearchRunnable(indexSearcher, query, language, rule);
final Thread searchThread = new Thread(runnable);
searchThread.start();
try {
// using a TimeLimitingCollector is not enough, as it doesn't cover all time required to
// search for a complicated regex, so interrupt the whole thread instead:
searchThread.join(maxSearchTimeMillis);
searchThread.interrupt();
} catch (InterruptedException e) {
throw new RuntimeException("Search thread got interrupted for query " + query, e);
}
if (searchThread.isInterrupted()) {
throw new SearchTimeoutException("Search timeout of " + maxSearchTimeMillis + "ms reached");
}
final Exception exception = runnable.getException();
if (exception != null) {
if (exception instanceof SearchTimeoutException) {
throw (SearchTimeoutException)exception;
}
throw new RuntimeException("Exception during search for query " + query + " on rule " + rule.getId(), exception);
}
final List<MatchingSentence> matchingSentences = runnable.getMatchingSentences();
final int sentencesChecked = getSentenceCheckCount(query, indexSearcher);
final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query);
searcherResult.setDocCount(getDocCount(indexSearcher));
//TODO: the search itself could also timeout, don't just ignore that:
//searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited);
return searcherResult;
} finally {
close();
}
}
private PossiblyLimitedTopDocs getTopDocs(Query query, Sort sort) throws IOException {
final TopFieldCollector topCollector = TopFieldCollector.create(sort, maxHits, true, false, false, false);
final Counter clock = Counter.newCounter(true);
final int waitMillis = 1000;
// TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector?
final TimeLimitingCollector collector = new TimeLimitingCollector(topCollector, clock, maxSearchTimeMillis / waitMillis);
collector.setBaseline(0);
final Thread counterThread = new Thread() {
@Override
public void run() {
final long startTime = System.currentTimeMillis();
while (true) {
final long runTimeMillis = System.currentTimeMillis() - startTime;
if (runTimeMillis > maxSearchTimeMillis) {
// make sure there's no lingering thread for too long
return;
}
clock.addAndGet(1);
try {
Thread.sleep(waitMillis);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
};
counterThread.setName("LuceneSearchTimeoutThread");
counterThread.start();
boolean timeLimitActivated = false;
try {
indexSearcher.search(query, collector);
} catch (TimeLimitingCollector.TimeExceededException e) {
timeLimitActivated = true;
}
return new PossiblyLimitedTopDocs(topCollector.topDocs(), timeLimitActivated);
}
PatternRule getRuleById(String ruleId, File xmlRuleFile) throws IOException {
final PatternRuleLoader ruleLoader = new PatternRuleLoader();
final List<PatternRule> rules = ruleLoader.getRules(xmlRuleFile);
for (PatternRule rule : rules) {
if (rule.getId().equals(ruleId)) {
return rule;
}
}
throw new PatternRuleNotFoundException(ruleId, xmlRuleFile);
}
private int getSentenceCheckCount(Query query, IndexSearcher indexSearcher) {
final int indexSize = indexSearcher.getIndexReader().numDocs();
// we actually check up to maxHits sentences:
// TODO: ??
final int sentencesChecked = Math.min(maxHits, indexSize);
return sentencesChecked;
}
private List<MatchingSentence> findMatchingSentences(IndexSearcher indexSearcher, TopDocs topDocs, JLanguageTool languageTool) throws IOException {
final List<MatchingSentence> matchingSentences = new ArrayList<MatchingSentence>();
for (ScoreDoc match : topDocs.scoreDocs) {
final Document doc = indexSearcher.doc(match.doc);
final String sentence = doc.get(FIELD_NAME);
final List<RuleMatch> ruleMatches = languageTool.check(sentence);
if (ruleMatches.size() > 0) {
final AnalyzedSentence analyzedSentence = languageTool.getAnalyzedSentence(sentence);
final MatchingSentence matchingSentence = new MatchingSentence(sentence, analyzedSentence, ruleMatches);
matchingSentences.add(matchingSentence);
}
}
return matchingSentences;
}
private JLanguageTool getLanguageToolWithOneRule(Language lang, PatternRule patternRule) throws IOException {
final JLanguageTool langTool = new JLanguageTool(lang);
for (Rule rule : langTool.getAllActiveRules()) {
langTool.disableRule(rule.getId());
}
langTool.addRule(patternRule);
return langTool;
}
class PossiblyLimitedTopDocs {
TopDocs topDocs;
boolean resultIsTimeLimited;
PossiblyLimitedTopDocs(TopDocs topDocs, boolean resultIsTimeLimited) {
this.topDocs = topDocs;
this.resultIsTimeLimited = resultIsTimeLimited;
}
}
private static void ensureCorrectUsageOrExit(String[] args) {
if (args.length != 4) {
System.err.println("Usage: Searcher <ruleId> <ruleXML> <languageCode> <indexDir>");
System.err.println("\truleId Id of the rule to search for");
System.err.println("\truleXML path to a rule file, e.g. en/grammar.xml");
System.err.println("\tlanguageCode short language code, e.g. en for English");
System.err.println("\tindexDir path to a directory containing the index");
System.exit(1);
}
}
class SearchRunnable implements Runnable {
private final IndexSearcher indexSearcher;
private final Query query;
private final Language language;
private final PatternRule rule;
private List<MatchingSentence> matchingSentences;
private Exception exception;
SearchRunnable(IndexSearcher indexSearcher, Query query, Language language, PatternRule rule) {
this.indexSearcher = indexSearcher;
this.query = query;
this.language = language;
this.rule = rule;
}
@Override
public void run() {
try {
final Sort sort = new Sort(new SortField("docCount", SortField.Type.INT)); // do not sort by relevance as this will move the shortest documents to the top
final long t1 = System.currentTimeMillis();
final JLanguageTool languageTool = getLanguageToolWithOneRule(language, rule);
final long langToolCreationTime = System.currentTimeMillis() - t1;
final long t2 = System.currentTimeMillis();
final PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query, sort);
final long luceneTime = System.currentTimeMillis() - t2;
final long t3 = System.currentTimeMillis();
matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool);
System.out.println("Check done in " + langToolCreationTime + "/" + luceneTime + "/" + (System.currentTimeMillis() - t3)
+ "ms (LT creation/Lucene/matching) for " + limitedTopDocs.topDocs.scoreDocs.length + " docs, query " + query.toString(FIELD_NAME_LOWERCASE));
} catch (Exception e) {
exception = e;
}
}
Exception getException() {
return exception;
}
List<MatchingSentence> getMatchingSentences() {
return matchingSentences;
}
}
public static void main(String[] args) throws Exception {
ensureCorrectUsageOrExit(args);
final long startTime = System.currentTimeMillis();
final String[] ruleIds = args[0].split(",");
final File ruleFile = new File(args[1]);
final String languageCode = args[2];
final Language language = Language.getLanguageForShortName(languageCode);
final File indexDir = new File(args[3]);
final Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir));
for (String ruleId : ruleIds) {
final long ruleStartTime = System.currentTimeMillis();
final PatternRule rule = searcher.getRuleById(ruleId, ruleFile);
final SearcherResult searcherResult = searcher.findRuleMatchesOnIndex(rule, language);
int i = 1;
if (searcherResult.getMatchingSentences().size() == 0) {
System.out.println("[no matches]");
}
for (MatchingSentence ruleMatch : searcherResult.getMatchingSentences()) {
System.out.println(i + ": " + ruleMatch.getSentence());
i++;
}
System.out.println("Time: " + (System.currentTimeMillis() - ruleStartTime) + "ms");
System.out.println("==============================================================");
}
System.out.println("Total time: " + (System.currentTimeMillis() - startTime) + "ms");
}
}