/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.index; import java.io.*; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.languagetool.JLanguageTool; import org.languagetool.Language; import org.languagetool.tokenizers.SentenceTokenizer; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME; import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE; /** * A class with a main() method that takes a text file and indexes its sentences, including POS tags * * @author Tao Lin, Miaojuan Dai */ public class Indexer { private static final Version LUCENE_VERSION = Version.LUCENE_41; private final IndexWriter writer; private final SentenceTokenizer sentenceTokenizer; public Indexer(Directory dir, Language language) { try { final Map<String, Analyzer> analyzerMap = new HashMap<String, Analyzer>(); analyzerMap.put(FIELD_NAME, new LanguageToolAnalyzer(LUCENE_VERSION, new JLanguageTool(language), false)); analyzerMap.put(FIELD_NAME_LOWERCASE, new LanguageToolAnalyzer(LUCENE_VERSION, new JLanguageTool(language), true)); final Analyzer analyzer = new PerFieldAnalyzerWrapper(new DoNotUseAnalyzer(), analyzerMap); final IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION, analyzer); writerConfig.setOpenMode(OpenMode.CREATE); writer = new IndexWriter(dir, writerConfig); sentenceTokenizer = language.getSentenceTokenizer(); } catch (Exception e) { throw new RuntimeException(e); } } public static void main(String[] args) throws IOException { ensureCorrectUsageOrExit(args); run(args[0], args[1], args[2]); } private static void ensureCorrectUsageOrExit(String[] args) { if (args.length != 3) { System.err.println("Usage: Indexer <textFile> <indexDir> <languageCode>"); System.err.println("\ttextFile path to a text file to be indexed"); System.err.println("\tindexDir path to a directory storing the index"); System.err.println("\tlanguageCode short language code, e.g. en for English"); System.exit(1); } } private static void run(String textFile, String indexDir, String languageCode) throws IOException { final File file = new File(textFile); if (!file.exists() || !file.canRead()) { System.out.println("Text file '" + file.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } final BufferedReader reader = new BufferedReader(new FileReader(file)); System.out.println("Indexing to directory '" + indexDir + "'..."); final FSDirectory directory = FSDirectory.open(new File(indexDir)); try { final Language language = Language.getLanguageForShortName(languageCode); final Indexer indexer = new Indexer(directory, language); try { run(reader, indexer, false); } finally { indexer.close(); } } finally { directory.close(); } System.out.println("Index complete!"); } public static void run(String content, Directory dir, Language language, boolean isSentence) throws IOException { final BufferedReader br = new BufferedReader(new StringReader(content)); final Indexer indexer = new Indexer(dir, language); try { run(br, indexer, isSentence); } finally { indexer.close(); } } public static void run(BufferedReader reader, Indexer indexer, boolean isSentence) throws IOException { indexer.index(reader, isSentence, -1); } public void index(String content, boolean isSentence, int docCount) throws IOException { final BufferedReader br = new BufferedReader(new StringReader(content)); index(br, isSentence, docCount); } public void index(BufferedReader reader, boolean isSentence, int docCount) throws IOException { String line = ""; while ((line = reader.readLine()) != null) { if (isSentence) { add(-1, line); } else { final List<String> sentences = sentenceTokenizer.tokenize(line); for (String sentence : sentences) { add(docCount, sentence); } } } } public void add(Document doc) throws IOException { writer.addDocument(doc); } private void add(int docCount, String sentence) throws IOException { final Document doc = new Document(); final FieldType type = new FieldType(); type.setStored(true); type.setIndexed(true); type.setTokenized(true); doc.add(new Field(FIELD_NAME, sentence, type)); doc.add(new Field(FIELD_NAME_LOWERCASE, sentence, type)); if (docCount != -1) { final FieldType countType = new FieldType(); countType.setStored(true); countType.setIndexed(false); doc.add(new Field("docCount", docCount + "", countType)); } writer.addDocument(doc); } public void close() throws IOException { writer.close(); } }