/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.wikipedia; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.languagetool.Language; import org.languagetool.TextFilter; import org.languagetool.dev.index.Indexer; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * Wikipedia handler for indexing. See {@link org.languagetool.dev.index.Searcher} for a * class that lets you use this index. 
*
* @author Tao Lin
*/
public class WikipediaIndexHandler extends DefaultHandler {

  public static final String MAX_DOC_COUNT_VALUE = "maxDocCountValue";
  public static final String MAX_DOC_COUNT_FIELD = "maxDocCount";
  public static final String MAX_DOC_COUNT_FIELD_VAL = "1";

  private final Indexer indexer;
  // language-specific filter that turns wiki markup into plain text
  private final TextFilter textFilter;
  // number of the first wiki page to index (1-based, inclusive)
  private final int start;
  // wiki page number at which indexing stops; 0 means "no limit"
  private final int end;

  private int articleCount = 0;
  private boolean inText = false;
  private boolean inTitle = false;
  private StringBuilder text = new StringBuilder();
  private StringBuilder title = new StringBuilder();

  /**
   * @param dir Lucene directory the index is written to
   * @param language language of the dump, used to select the text filter
   * @param start number of the first wiki page to index (1-based, inclusive)
   * @param end wiki page number at which indexing stops, 0 for no limit
   * @throws IllegalArgumentException if {@code start > end} and {@code end != 0}
   */
  public WikipediaIndexHandler(Directory dir, Language language, int start, int end) {
    // validate before allocating the Indexer so we don't leak one on bad arguments
    if (start > end && end != 0) {
      throw new IllegalArgumentException("\"start\" should be smaller than \"end\": " + start + ", " + end);
    }
    this.indexer = new Indexer(dir, language);
    this.start = start;
    this.end = end;
    this.textFilter = TextFilterTools.getTextFilter(language);
  }

  // ===========================================================
  // SAX DocumentHandler methods
  // ===========================================================

  @Override
  @SuppressWarnings("unused")
  public void startElement(String namespaceURI, String lName, String qName, Attributes attrs) throws SAXException {
    if (qName.equals("title")) {
      inTitle = true;
    } else if (qName.equals("text")) {
      inText = true;
    }
  }

  @Override
  @SuppressWarnings("unused")
  public void endElement(String namespaceURI, String sName, String qName) {
    try {
      if (qName.equals("title")) {
        inTitle = false;
      } else if (qName.equals("text")) {
        // capture the title before clearing it, so error messages can still show it
        // (previously the buffer was cleared first and the message printed an empty title)
        final String currentTitle = title.toString();
        System.out.println(++articleCount + ": " + currentTitle);
        title = new StringBuilder();
        if (articleCount < start) {
          return;
        }
        if (articleCount >= end && end != 0) {
          throw new DocumentLimitReachedException(end);
        }
        indexArticle(currentTitle, text.toString());
      }
    } finally {
      // always reset the text buffer - the former early return for articles below
      // "start" skipped this cleanup, leaking their text into the next article
      text = new StringBuilder();
      inText = false;
    }
  }

  /**
   * Filters the wiki markup to plain text and indexes it, skipping redirects and
   * effectively empty pages. Filter or indexing errors are printed and the article
   * is skipped (best-effort, matching the original per-article error handling).
   */
  private void indexArticle(String articleTitle, String wikiMarkup) {
    try {
      final String textToCheck = textFilter.filter(wikiMarkup);
      if (!textToCheck.contains("#REDIRECT") && !textToCheck.trim().equals("")) {
        indexer.index(textToCheck, false, articleCount);
      }
    } catch (Exception e) {
      System.err.println("Exception when filtering '" + articleTitle + "' - skipping file. Stacktrace follows:");
      e.printStackTrace();
    }
  }

  @Override
  public void characters(char buf[], int offset, int len) {
    // append straight from the parser's buffer, avoiding an intermediate String;
    // SAX may deliver one element's content in several characters() calls
    if (inText) {
      text.append(buf, offset, len);
    } else if (inTitle) {
      title.append(buf, offset, len);
    }
  }

  /** Closes the underlying indexer; must be called when parsing is finished. */
  public void close() throws Exception {
    indexer.close();
  }

  /** Adds a meta document recording how many articles were indexed. */
  private void writeMetaDocuments() throws IOException {
    final Document doc = new Document();
    doc.add(new StringField(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL, Field.Store.YES));
    doc.add(new StringField(MAX_DOC_COUNT_VALUE, articleCount + "", Field.Store.YES));
    indexer.add(doc);
  }

  public static void main(String... args) throws Exception {
    if (args.length != 4) {
      System.out.println("Usage: " + WikipediaIndexHandler.class.getSimpleName() + " <wikipediaDump> <indexDir> <languageCode> <maxDocs>");
      System.out.println("\t<wikipediaDump> a Wikipedia XML dump");
      System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
      System.out.println("\t<languageCode> short code like en for English, de for German etc");
      System.out.println("\t<maxDocs> maximum number of documents to be indexed, use 0 for no limit");
      System.exit(1);
    }
    final File dumpFile = new File(args[0]);
    final File indexDir = new File(args[1]);
    final String languageCode = args[2];
    final int maxDocs = Integer.parseInt(args[3]);
    final Language language = Language.getLanguageForShortName(languageCode);
    if (maxDocs == 0) {
      System.out.println("Going to index all documents from " + dumpFile);
    } else {
      System.out.println("Going to index up to " + maxDocs + " documents from " + dumpFile);
    }
    System.out.println("Output index dir: " + indexDir);
    final long startTime = System.currentTimeMillis();
    final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
    final FSDirectory fsDirectory = FSDirectory.open(indexDir);
    try {
      final WikipediaIndexHandler handler = new WikipediaIndexHandler(fsDirectory, language, 1, maxDocs);
      // the dump stream was previously never closed - close it in the cleanup path
      final FileInputStream input = new FileInputStream(dumpFile);
      try {
        saxParser.parse(input, handler);
      } catch (DocumentLimitReachedException e) {
        System.out.println("Document limit (" + e.limit + ") reached, stopping indexing");
      } finally {
        try {
          input.close();
        } catch (IOException e) {
          // best-effort close; don't mask a primary exception from parsing/indexing
        }
        handler.writeMetaDocuments();
        handler.close();
      }
    } finally {
      fsDirectory.close();
    }
    final long endTime = System.currentTimeMillis();
    final float minutes = (endTime - startTime) / (float)(1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
  }

  /**
   * Internal control-flow exception used to abort SAX parsing once the article
   * limit is reached. Static: it needs no reference to the enclosing handler.
   */
  private static class DocumentLimitReachedException extends RuntimeException {
    final int limit;
    DocumentLimitReachedException(int limit) {
      this.limit = limit;
    }
  }

}