/**
 * Copyright 2010 T Jake Luciani
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra.wikipedia;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Command-line tool that reads an XML file made of {@code <doc>} elements
 * (each carrying {@code <title>}, {@code <url>} and {@code <abstract>}) and
 * adds one Lucene document per {@code <doc>} to a file-system index.
 *
 * <p>The input is scanned line by line with plain string matching instead of
 * an XML parser, so each tag is expected to start on its own line. Only the
 * abstract may continue over several lines.
 *
 * <p>An instance is single-use: {@link #readFile(String)} closes the index
 * writer when it finishes.
 */
public class LuceneOnlyWikipediaImporter {

    /** Directory of the Lucene index; it is re-created on every run. */
    private static final String INDEX_PATH = "/tmp/wikassandra";

    /** Charset used to decode the input file and to encode/decode article text. */
    private static final String ENCODING = "UTF-8";

    /** A progress line is considered every time this many pages have been read. */
    private static final int PROGRESS_INTERVAL = 5000;

    private static final String ABSTRACT_OPEN = "<abstract>";
    private static final String ABSTRACT_CLOSE = "</abstract>";

    /** Number of {@code </doc>} tags seen so far. */
    private int pageCount;

    /** Total number of abstract bytes handed to the index so far. */
    private long size;

    /** Wall-clock time (ms) at which this importer was created. */
    private long startTime;

    /** Wall-clock time (ms) of the most recent progress line. */
    private long lastTime;

    private Analyzer analyzer;
    private IndexWriter indexWriter;

    /**
     * Opens a fresh index at {@value #INDEX_PATH}, replacing any existing one.
     *
     * @throws IllegalStateException if the index cannot be created or locked;
     *         failing here avoids a later NullPointerException on a writer
     *         that was never initialised
     */
    public LuceneOnlyWikipediaImporter() {
        pageCount = 0;
        size = 0;
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

        try {
            indexWriter = new IndexWriter(FSDirectory.open(new File(INDEX_PATH)), analyzer, true,
                    MaxFieldLength.UNLIMITED);
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            throw new IllegalStateException("Unable to open Lucene index at " + INDEX_PATH, e);
        }

        startTime = System.currentTimeMillis();
        lastTime = startTime;
    }

    /** Prints the command-line synopsis and terminates with a failure status. */
    private static void usage() {
        System.err.println("LuceneOnlyWikipediaImporter file.xml");
        System.exit(1);
    }

    /**
     * Imports every {@code <doc>} of the given file, then commits and closes
     * the index writer.
     *
     * <p>The input stream and the writer are released even when the import
     * fails. Closing the writer commits, so documents added before a failure
     * are kept in the index.
     *
     * @param fileName path of the XML file to import
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void readFile(String fileName) throws IOException {
        try {
            InputStream inputFile = new FileInputStream(fileName);
            try {
                importPages(new BufferedReader(new InputStreamReader(inputFile, ENCODING)));
            } finally {
                inputFile.close();
            }
            indexWriter.commit();
        } finally {
            indexWriter.close();
        }

        long now = System.currentTimeMillis();
        System.err.println("Loaded (" + pageCount + ") " + size / 1000.0 + "Kb, in "
                + (now - startTime) / 1000.0);
        System.err.println("done");
    }

    /**
     * Scans the input line by line, builds one {@code Article} per
     * {@code <doc>} element and indexes it when {@code </doc>} is reached.
     *
     * @param fileStream reader positioned at the start of the input
     * @throws IOException if reading or indexing fails
     */
    private void importPages(BufferedReader fileStream) throws IOException {
        // rather than xml parse, just do something fast & simple.
        Article page = new Article();
        boolean inText = false; // true while inside an <abstract> that spans several lines
        String line;

        while ((line = fileStream.readLine()) != null) {

            // Page
            if (line.contains("<doc>")) {
                page = new Article();
                inText = false; // never carry an unterminated abstract into the next page
                continue;
            }

            if (line.contains("</doc>")) {
                inText = false;
                indexPage(page); // index each page
                if (page.text != null) {
                    size += page.text.length;
                }
                if (++pageCount % PROGRESS_INTERVAL == 0) {
                    reportProgress();
                }
            }

            // title
            if (line.contains("<title>")) {
                page.title = textAfter(line, "<title>", "</title>");
                continue;
            }

            // url (only the first one of a page is kept)
            if (line.contains("<url>")) {
                if (page.url == null) {
                    page.url = textAfter(line, "<url>", "</url>");
                }
                continue;
            }

            // article text, first line
            int open = line.indexOf(ABSTRACT_OPEN);
            if (open >= 0) {
                int start = open + ABSTRACT_OPEN.length();
                int end = line.indexOf(ABSTRACT_CLOSE, start);
                inText = end < 0; // closing tag is on a later line
                String text = inText ? line.substring(start) : line.substring(start, end);
                page.text = text.getBytes(ENCODING);
                continue;
            }

            // article text, continuation lines
            if (inText) {
                int end = line.indexOf(ABSTRACT_CLOSE);
                String text = end < 0 ? line : line.substring(0, end);
                page.text = appendLine(page.text, text);
                inText = end < 0;
            }
        }
    }

    /**
     * Prints a progress line to stderr, at most once per second.
     */
    private void reportProgress() {
        long now = System.currentTimeMillis();
        if ((now - lastTime) / 1000.0 > 1) {
            // More than a second has passed since lastTime >= startTime, so this is > 1.
            double elapsedSeconds = (now - startTime) / 1000.0;
            System.err.println("Loaded (" + pageCount + ") " + size / 1000.0 + "Kb, in " + elapsedSeconds
                    + ", avg " + (long) (pageCount / elapsedSeconds) + " docs/sec");
            lastTime = now;
        }
    }

    /**
     * Returns the text that follows {@code openTag} on the line, up to
     * {@code closeTag} or, when the closing tag is missing, up to the end of
     * the line.
     *
     * @return the extracted text, or {@code null} if {@code openTag} is absent
     */
    private static String textAfter(String line, String openTag, String closeTag) {
        int open = line.indexOf(openTag);
        if (open < 0) {
            return null;
        }
        int start = open + openTag.length();
        int end = line.indexOf(closeTag, start);
        return end < 0 ? line.substring(start) : line.substring(start, end);
    }

    /**
     * Returns {@code head} followed by a newline and the encoded {@code tail}.
     * The newline restores the separator that {@code readLine()} stripped, so
     * words at a line break are not glued together.
     */
    private static byte[] appendLine(byte[] head, String tail) throws IOException {
        if (head == null) {
            return tail.getBytes(ENCODING);
        }
        byte[] more = ('\n' + tail).getBytes(ENCODING);
        byte[] joined = new byte[head.length + more.length];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(more, 0, joined, head.length, more.length);
        return joined;
    }

    /**
     * Adds one article to the index as a document with the stored fields
     * {@code title} (analyzed), {@code text} (analyzed) and {@code url}
     * (not analyzed). Fields whose value is {@code null} are left out,
     * because a Lucene {@code Field} cannot hold a null value.
     *
     * @param article the article to index; its text is decoded as UTF-8
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be written
     */
    public void indexPage(Article article) throws CorruptIndexException, IOException {
        Document d = new Document();

        if (article.title != null) {
            d.add(new Field("title", article.title, Store.YES, Index.ANALYZED));
        }
        if (article.text != null) {
            d.add(new Field("text", new String(article.text, ENCODING), Store.YES, Index.ANALYZED));
        }
        if (article.url != null) {
            d.add(new Field("url", article.url, Store.YES, Index.NOT_ANALYZED));
        }

        indexWriter.addDocument(d, analyzer);
    }

    /**
     * Entry point: imports the file named by the first argument.
     *
     * @param args {@code args[0]} is the path of the XML file; with no
     *        arguments the usage line is printed instead
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            LuceneOnlyWikipediaImporter.usage();
            return;
        }

        try {
            new LuceneOnlyWikipediaImporter().readFile(args[0]);
        } catch (IOException e) {
            // Also covers FileNotFoundException for a missing input file.
            e.printStackTrace();
            System.exit(1);
        }
    }
}