LuceneOnlyWikipediaImporter.java example

Explorer
Solandra-master
/**
 * Copyright 2010 T Jake Luciani
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra.wikipedia;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Command-line tool that reads an XML-like dump line by line and indexes every
 * {@code <doc>} entry (its {@code <title>}, first {@code <url>} and
 * {@code <abstract>}) into a Lucene index on the local file system.
 *
 * <p>The dump is not parsed as XML; tags are located with plain string
 * searches, so each tag is expected to start on the line where its content
 * begins. An instance is single-use: {@link #readFile(String)} closes the
 * index writer when it finishes.
 */
public class LuceneOnlyWikipediaImporter {

    /** Charset used to decode the dump and to encode/decode {@code Article.text}. */
    private static final String CHARSET = "UTF-8";

    /** Directory holding the Lucene index; the writer is opened in "create" mode. */
    private static final String INDEX_PATH = "/tmp/wikassandra";

    private static final String ABSTRACT_OPEN = "<abstract>";
    private static final String ABSTRACT_CLOSE = "</abstract>";

    /** Number of {@code </doc>} markers seen so far. */
    private int pageCount;
    /** Total number of abstract bytes handed to the index so far. */
    private long size;
    /** Wall-clock time (ms) at which this importer was created. */
    private long startTime;
    /** Wall-clock time (ms) of the most recent progress report. */
    private long lastTime;
    private Analyzer analyzer;
    private IndexWriter indexWriter;

    /**
     * Opens a fresh index at {@code /tmp/wikassandra} and starts the timers.
     *
     * @throws IllegalStateException if the index cannot be opened (corrupt
     *         index, lock held by another writer, or any other I/O failure);
     *         the original exception is attached as the cause
     */
    public LuceneOnlyWikipediaImporter() {
        pageCount = 0;
        size = 0;

        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        try {
            indexWriter = new IndexWriter(FSDirectory.open(new File(INDEX_PATH)), analyzer, true, MaxFieldLength.UNLIMITED);
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            // Fail fast: continuing with a null writer only defers the failure to an
            // uninformative NullPointerException on first use.
            throw new IllegalStateException("Unable to open Lucene index at " + INDEX_PATH, e);
        }
        startTime = System.currentTimeMillis();
        lastTime = System.currentTimeMillis();
    }

    /** Prints the command-line synopsis to stderr and terminates the JVM. */
    private static void usage() {
        System.err.println("LuceneOnlyWikipediaImporter file.xml");
        System.exit(0);
    }

    /**
     * Reads the dump at {@code fileName}, indexes one document per
     * {@code </doc>} marker, commits, and prints a summary to stderr.
     *
     * <p>The input stream and the index writer are always closed, even when
     * parsing or indexing fails.
     *
     * @param fileName path of the dump file to import
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void readFile(String fileName) throws IOException {

        InputStream inputFile = null;
        try {
            inputFile = new FileInputStream(fileName);
            BufferedReader fileStream = new BufferedReader(new InputStreamReader(inputFile, CHARSET));

            // rather than xml parse, just do something fast & simple.
            String line;
            Article page = new Article();
            boolean inText = false; // true while inside a multi-line <abstract>

            while ((line = fileStream.readLine()) != null) {

                // Page
                if (line.contains("<doc>")) {
                    page = new Article();
                    // An unterminated abstract must not leak into the next page.
                    inText = false;
                    continue;
                }

                if (line.contains("</doc>")) {
                    if (++pageCount % 5000 == 0) {
                        reportProgress();
                    }

                    indexPage(page); // index each page
                }

                // title
                if (line.contains("<title>")) {
                    page.title = between(line, "<title>", "</title>");
                    continue;
                }

                // url: only the first one seen for a page is kept
                if (line.contains("<url>")) {
                    if (page.url == null) {
                        page.url = between(line, "<url>", "</url>");
                    }
                    continue;
                }

                // article text
                int open = line.indexOf(ABSTRACT_OPEN);
                if (open >= 0) {
                    int start = open + ABSTRACT_OPEN.length();
                    int close = line.indexOf(ABSTRACT_CLOSE, start);
                    if (close >= 0) {
                        // whole abstract on one line
                        page.text = line.substring(start, close).getBytes(CHARSET);
                        inText = false;
                    } else {
                        // abstract continues on the following lines
                        page.text = line.substring(start).getBytes(CHARSET);
                        inText = true;
                    }
                    continue;
                }

                if (inText) {
                    int close = line.indexOf(ABSTRACT_CLOSE);
                    String text = close >= 0 ? line.substring(0, close) : line;

                    // readLine() strips the line terminator; put one back so the last
                    // word of one line and the first word of the next do not fuse.
                    page.text = concat(page.text, ("\n" + text).getBytes(CHARSET));

                    if (close >= 0) {
                        inText = false;
                    }
                }
            }

            indexWriter.commit();
        } finally {
            try {
                if (inputFile != null) {
                    inputFile.close();
                }
            } finally {
                indexWriter.close();
            }
        }

        long now = System.currentTimeMillis();
        // Total elapsed time is measured from startTime, matching the total page count.
        System.err.println("Loaded (" + pageCount + ") " + size / 1000 + "Kb, in " + (now - startTime) / 1000.0);

        System.err.println("done");

    }

    /**
     * Prints a progress line to stderr, at most once per second, and records
     * the time of the report.
     */
    private void reportProgress() {
        long now = System.currentTimeMillis();
        if ((now - lastTime) / 1000.0 > 1) {
            // Clamp to one second so the integer division can never divide by zero.
            long elapsedSeconds = Math.max(1, (now - startTime) / 1000);
            System.err.println("Loaded (" + pageCount + ") " + size / 1000.0 + "Kb, in " + (now - startTime) / 1000.0 + ", avg "
                    + (pageCount / elapsedSeconds) + " docs/sec");
            lastTime = now;
        }
    }

    /**
     * Returns the part of {@code line} that follows the first {@code openTag}
     * up to the next {@code closeTag}, or up to the end of the line when the
     * closing tag is missing. The caller must ensure {@code openTag} occurs
     * in {@code line}.
     */
    private static String between(String line, String openTag, String closeTag) {
        int start = line.indexOf(openTag) + openTag.length();
        int end = line.indexOf(closeTag, start);
        return end >= 0 ? line.substring(start, end) : line.substring(start);
    }

    /** Returns a new array holding {@code head} followed by {@code tail}. */
    private static byte[] concat(byte[] head, byte[] tail) {
        byte[] joined = new byte[head.length + tail.length];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(tail, 0, joined, head.length, tail.length);
        return joined;
    }

    /**
     * Adds one article to the index as a document with the stored fields
     * {@code title} (analyzed), {@code text} (analyzed, decoded as UTF-8) and
     * {@code url} (not analyzed). Fields whose value is {@code null} are
     * omitted, because the {@code Field} constructor rejects null values.
     *
     * @param article the article to index
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the document cannot be written
     */
    public void indexPage(Article article) throws CorruptIndexException, IOException {

        Document d = new Document();

        if (article.title != null) {
            d.add(new Field("title", article.title, Store.YES, Index.ANALYZED));
        }

        if (article.text != null) {
            d.add(new Field("text", new String(article.text, CHARSET), Store.YES, Index.ANALYZED));
        }

        if (article.url != null) {
            d.add(new Field("url", article.url, Store.YES, Index.NOT_ANALYZED));
        }

        indexWriter.addDocument(d, analyzer);

        // Feed the "Kb" figure of the progress and summary lines.
        if (article.text != null) {
            size += article.text.length;
        }

    }

    /**
     * Entry point: imports the file named by the first argument, or prints
     * the usage line when no argument is given.
     *
     * @param args command-line arguments; {@code args[0]} is the dump file
     */
    public static void main(String[] args) {

        try {

            if (args.length > 0) {
                new LuceneOnlyWikipediaImporter().readFile(args[0]);
            } else {
                LuceneOnlyWikipediaImporter.usage();
            }

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

}