// WikipediaImporter.java example
//
// Explorer
// Solandra-master
/**
 * Copyright 2010 T Jake Luciani
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra.wikipedia;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Queue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * Command-line loader that scans a Wikipedia abstract dump line by line and
 * submits every article it finds to a pool of {@link WikipediaIndexWorker}s.
 *
 * <p>
 * The dump is not parsed as XML; the reader only looks for the literal tags
 * {@code <doc>}, {@code <title>}, {@code <url>} and {@code <abstract>}. All
 * text is decoded and re-encoded as UTF-8.
 *
 * <p>
 * An instance is single-use: {@code readFile} shuts the worker pool down when
 * it finishes. The counters are only touched by the thread that reads the
 * file, so the class itself needs no synchronization.
 */
public class WikipediaImporter {

    /** Number of worker threads indexing pages concurrently. */
    private static final int WORKER_THREADS = 64;

    /** Finished work is collected (and progress reported) every this many pages. */
    private static final int DRAIN_INTERVAL = 5000;

    /** Charset used both for reading the dump and for the article bytes. */
    private static final String ENCODING = "UTF-8";

    private static final String ABSTRACT_OPEN = "<abstract>";
    private static final String ABSTRACT_CLOSE = "</abstract>";

    private ExecutorService threadPool;
    private Queue<Future<Integer>> resultSet;
    private int pageCount; // pages seen in the dump
    private int loadCount; // pages whose worker has completed successfully
    private long size; // sum of the values returned by the workers
    private long startTime;
    private long lastTime; // time of the last progress line

    public WikipediaImporter() {
        threadPool = Executors.newFixedThreadPool(WORKER_THREADS);
        resultSet = new LinkedBlockingQueue<Future<Integer>>();
        pageCount = 0;
        loadCount = 0;
        size = 0;

        startTime = System.currentTimeMillis();
        lastTime = startTime;
    }

    /** Prints the command-line synopsis and terminates with a failure status. */
    private static void usage() {
        System.err.println("WikipediaImporter file.xml");
        System.exit(1);
    }

    /**
     * Reads the dump, indexes every page and prints a final summary.
     *
     * @param fileName
     *            path of the dump file
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private void readFile(String fileName) throws IOException {

        InputStream inputFile = new FileInputStream(fileName);
        try {
            parsePages(new BufferedReader(new InputStreamReader(inputFile, ENCODING)));
        } finally {
            // Always stop accepting work, even on a read error; otherwise the
            // idle pool threads would keep the JVM alive.
            threadPool.shutdown();
            inputFile.close();
        }

        try {
            threadPool.awaitTermination(90, TimeUnit.SECONDS);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
        }

        // Fold the remaining results into the running totals so the summary
        // covers the whole run, not just the last batch.
        drainResults(false);

        long now = System.currentTimeMillis();
        System.err.println("Loaded (" + pageCount + ") " + size / 1000.0 + "Kb, in " + (now - startTime) / 1000.0);

        System.err.println("done");

    }

    /**
     * Scans the dump line by line, building one {@code Article} per
     * {@code <doc>} element and submitting it when {@code </doc>} is seen.
     */
    private void parsePages(BufferedReader fileStream) throws IOException {

        // rather than xml parse, just do something fast & simple.
        String line;
        Article page = new Article();
        boolean inText = false;

        while ((line = fileStream.readLine()) != null) {

            // Page
            if (line.contains("<doc>")) {
                page = new Article();
                inText = false; // an unterminated abstract must not leak into the next page
                continue;
            }

            if (line.contains("</doc>")) {

                // Periodically wait for the submitted work; this also bounds
                // the number of pending futures held in memory.
                if (++pageCount % DRAIN_INTERVAL == 0)
                    drainResults(true);

                indexPage(page); // index each page
            }

            // title
            if (line.contains("<title>")) {
                page.title = tagValue(line, "<title>", "</title>");
                continue;
            }

            // url (only the first one of a page is kept)
            if (line.contains("<url>")) {
                if (page.url == null)
                    page.url = tagValue(line, "<url>", "</url>");

                continue;
            }

            // article text
            int open = line.indexOf(ABSTRACT_OPEN);
            if (open >= 0) {
                int start = open + ABSTRACT_OPEN.length();
                int end = line.indexOf(ABSTRACT_CLOSE, start);

                if (end >= 0) {
                    page.text = line.substring(start, end).getBytes(ENCODING);
                    inText = false;
                } else {
                    // abstract continues on the following lines
                    page.text = line.substring(start).getBytes(ENCODING);
                    inText = true;
                }
                continue;
            }

            if (inText) {
                // Continuation lines are appended directly; no line separator
                // is re-inserted between them.
                int end = line.indexOf(ABSTRACT_CLOSE);
                String text = (end >= 0) ? line.substring(0, end) : line;

                page.text = concat(page.text, text.getBytes(ENCODING));

                if (end >= 0)
                    inText = false;
            }
        }
    }

    /**
     * Waits for every queued future and adds its result to the totals.
     *
     * @param reportProgress
     *            when true, prints a progress line at most about once per
     *            second
     */
    private void drainResults(boolean reportProgress) {
        Future<Integer> result;

        while ((result = resultSet.poll()) != null) {
            try {
                size += result.get();
                loadCount++;

                if (!reportProgress)
                    continue;

                long now = System.currentTimeMillis();
                // lastTime never precedes startTime, so once this guard holds
                // the whole-second divisor below is at least 1.
                if ((now - lastTime) / 1000.0 > 1) {
                    System.err.println("Loaded (" + loadCount + ") " + size / 1000.0 + "Kb, in " + (now - startTime) / 1000.0 + ", avg "
                            + (loadCount / ((now - startTime) / 1000)) + " docs/sec");
                    lastTime = now;
                }
            } catch (InterruptedException e) {
                // Stop waiting and leave the interrupt visible to the caller.
                e.printStackTrace();
                Thread.currentThread().interrupt();
                return;
            } catch (ExecutionException e) {
                // A failed page is reported but does not abort the import.
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the text following {@code open} up to {@code close}, or up to
     * the end of the line when the closing tag is missing. The caller must
     * have checked that {@code open} occurs in {@code line}.
     */
    private static String tagValue(String line, String open, String close) {
        int start = line.indexOf(open) + open.length();
        int end = line.indexOf(close, start);
        return (end >= 0) ? line.substring(start, end) : line.substring(start);
    }

    /** Returns a new array holding {@code head} followed by {@code tail}. */
    private static byte[] concat(byte[] head, byte[] tail) {
        byte[] joined = new byte[head.length + tail.length];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(tail, 0, joined, head.length, tail.length);
        return joined;
    }

    /**
     * Submits a page to the worker pool and queues the resulting future so
     * its outcome can be collected later.
     *
     * @param page
     *            the article to index
     */
    public void indexPage(Article page) {

        Future<Integer> result = threadPool.submit(new WikipediaIndexWorker(page));
        resultSet.add(result);

    }

    /**
     * Entry point: expects the dump file as the first argument, otherwise
     * prints the usage line and exits.
     */
    public static void main(String[] args) {

        try {

            if (args.length > 0)
                new WikipediaImporter().readFile(args[0]);
            else
                WikipediaImporter.usage();

        } catch (IOException e) {
            // also covers FileNotFoundException
            e.printStackTrace();
        }

    }

}