Indexer.java example

Explorer
h2-bitmap-master
- h2
  - src
/*
 * Copyright 2004-2011 H2 Group. Multiple-Licensed under the H2 License,
 * Version 1.0, and under the Eclipse Public License, Version 1.0
 * (http://h2database.com/html/license.html).
 * Initial Developer: H2 Group
 */
package org.h2.build.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import org.h2.util.IOUtils;
import org.h2.util.StringUtils;

/**
 * The indexer creates the fulltext index of the HTML documentation.
 * It is used for the built-in HTML javascript search.
 */
public class Indexer {

    private static final int MIN_WORD_SIZE = 3;
    private static final int MAX_RELATIONS = 30;
    private static final String VERY_COMMON =
        ";the;be;to;of;and;a;in;that;have;i;it;for;not;on;with;he;as;you;do;at;" +
        "this;but;his;by;from;they;we;say;her;she;or;an;will;my;one;all;would;" +
        "there;their;what;so;up;out;if;about;who;get;which;go;me;when;make;" +
        "can;like;no;just;him;know;take;into;your;good;some;" +
        "could;them;see;other;than;then;now;look;only;come;its;over;think;" +
        "also;back;after;use;two;how;our;work;first;well;way;even;new;want;" +
        "because;any;these;give;most;us;";

    private ArrayList<Page> pages = new ArrayList<Page>();

    /**
     * Lower case word to Word map.
     */
    private HashMap<String, Word> words = new HashMap<String, Word>();
    private HashSet<String> noIndex = new HashSet<String>();
    private ArrayList <Word>wordList;
    private PrintWriter output;
    private Page page;
    private boolean title;
    private boolean heading;
    private String ignored;

    /**
     * This method is called when executing this application from the command
     * line.
     *
     * @param args the command line parameters
     */
    public static void main(String... args) throws Exception {
        new Indexer().run(args);
    }

    private void run(String... args) throws Exception {
        String dir = "docs";
        String destDir = "docs/html";
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-dir")) {
                dir = args[++i];
            } else if (args[i].equals("-destDir")) {
                destDir = args[++i];
            }
        }
        File file = new File(dir);
        setNoIndex("index.html", "html/header.html", "html/search.html",
                "html/frame.html", "html/fragments.html",
                "html/sourceError.html", "html/source.html", "html/mainWeb.html",
                "javadoc/index.html", "javadoc/classes.html", "javadoc/allclasses-frame.html",
                "javadoc/allclasses-noframe.html", "javadoc/constant-values.html", "javadoc/overview-frame.html",
                "javadoc/overview-summary.html", "javadoc/serialized-form.html");
        output = new PrintWriter(new FileWriter(destDir + "/index.js"));
        readPages("", file, 0);
        output.println("var pages=new Array();");
        output.println("var ref=new Array();");
        output.println("var ignored='';");
        output.println("function Page(title, file) { this.title=title; this.file=file; }");
        output.println("function load() {");
        sortWords();
        removeOverflowRelations();
        sortPages();
        listPages();
        listWords();
        output.println("}");
        output.close();
    }

    private void setNoIndex(String... strings) {
        for (String s : strings) {
            noIndex.add(s);
        }
    }

    private void sortWords() {
        for (String name : new ArrayList<String>(words.keySet())) {
            if (name.endsWith("s")) {
                String singular = name.substring(0, name.length() - 1);
                if (words.containsKey(singular)) {
                    Word wp = words.get(name);
                    Word ws = words.get(singular);
                    ws.addAll(wp);
                    words.remove(name);
                }
            } else if (name.startsWith("abc")) {
                words.remove(name);
            }
        }
        wordList = new ArrayList<Word>(words.values());
        // ignored very common words (to shrink the index)
        StringBuilder ignoredBuff = new StringBuilder(";");
        int maxSize = pages.size() / 4;
        for (int i = 0; i < wordList.size(); i++) {
            Word word = wordList.get(i);
            String search = ";" + word.name.toLowerCase() + ";";
            int idxCommon = VERY_COMMON.indexOf(search);
            if (word.pages.size() >= maxSize || idxCommon >= 0) {
                wordList.remove(i);
                ignoredBuff.append(word.name);
                ignoredBuff.append(';');
                i--;
            }
        }
        ignored = ignoredBuff.toString();
        // TODO support A, B, C,... class links in the index file and use them
        // for combined AND searches
        Collections.sort(wordList, new Comparator<Word>() {
            public int compare(Word w0, Word w1) {
                return w0.name.compareToIgnoreCase(w1.name);
            }
        });
    }

    private void removeOverflowRelations() {
        for (Word word : wordList) {
            ArrayList<Weight> weights = word.getSortedWeights();
            int max = MAX_RELATIONS;
            if (weights.size() > max) {
                while (max < weights.size()) {
                    Weight weight = weights.get(max);
                    if (weight.value < Weight.HEADER) {
                        break;
                    }
                    max++;
                }
            }
            while (max < weights.size()) {
                Weight weight = weights.get(max);
                weights.remove(max);
                weight.page.relations--;
            }
        }
    }

    private void sortPages() {
        Collections.sort(pages, new Comparator<Page>() {
            public int compare(Page p0, Page p1) {
                return p0.relations == p1.relations ? 0 : p0.relations < p1.relations ? 1 : -1;
            }
        });
        for (int i = 0; i < pages.size(); i++) {
            pages.get(i).id = i;
        }
    }

    private void listPages() {
        for (Page p : pages) {
            output.println("pages[" + p.id + "]=new Page('" + convertUTF(p.title) + "', '" + p.fileName
                    + "');");
        }
    }

    private void readPages(String dir, File file, int level) throws Exception {
        String name = file.getName();
        String fileName = dir.length() > 0 ? dir + "/" + name : level > 0 ? name : "";
        if (file.isDirectory()) {
            for (File f : file.listFiles()) {
                readPages(fileName, f, level + 1);
            }
            return;
        }
        String lower = StringUtils.toLowerEnglish(name);
        if (!lower.endsWith(".html") && !lower.endsWith(".htm")) {
            return;
        }
        if (lower.indexOf("_ja.") >= 0) {
            return;
        }
        if (!noIndex.contains(fileName)) {
            page = new Page(pages.size(), fileName);
            pages.add(page);
            readPage(file);
        }
    }

    private void listWords() {
        output.println("// words: " + wordList.size());
        StringBuilder buff = new StringBuilder();
        String first = "";
        int firstLen = 1;
        int totalRelations = 0;
        for (Word word : wordList) {
            ArrayList<Weight> weights = word.getSortedWeights();
            String lower = StringUtils.toLowerEnglish(word.name);
            if (!first.equals(lower.substring(0, firstLen))) {
                if (buff.length() > 0) {
                    output.println("ref['" + convertUTF(first) + "']='" + buff.toString() + "';");
                    buff = new StringBuilder();
                }
                first = lower.substring(0, firstLen);
            }
            if (buff.length() > 0) {
                buff.append(';');
            }
            buff.append(convertUTF(word.name));
            buff.append('=');
            String weightString = "r";
            totalRelations += weights.size();
            for (int j = 0; j < weights.size(); j++) {
                Weight weight = weights.get(j);
                Page p = weight.page;
                if (j > 0) {
                    buff.append(",");
                }
                String ws;
                if (weight.value >= Weight.TITLE) {
                    ws = "t";
                } else if (weight.value >= Weight.HEADER) {
                    ws = "h";
                } else {
                    ws = "r";
                }
                if (ws != weightString) {
                    weightString = ws;
                    buff.append(ws);
                }
                buff.append(p.id);
            }
        }
        output.println("ref['" + convertUTF(first) + "']='" + buff.toString() + "';");
        output.println("// totalRelations: " + totalRelations);
        output.println("ignored='" + ignored.toLowerCase() + "';");
    }

    private void readPage(File file) throws Exception {
        byte[] data = IOUtils.readBytesAndClose(new FileInputStream(file), 0);
        String text = new String(data, "UTF-8");
        StringTokenizer t = new StringTokenizer(text, "<> \r\n", true);
        boolean inTag = false;
        title = false;
        heading = false;
        while (t.hasMoreTokens()) {
            String token = t.nextToken();
            if (token.length() == 1) {
                char c = token.charAt(0);
                switch (c) {
                case '<': {
                    if (inTag) {
                        process("???");
                    }
                    inTag = true;
                    if (!t.hasMoreTokens()) {
                        break;
                    }
                    token = t.nextToken();
                    if (token.startsWith("/")) {
                        title = false;
                        heading = false;
                    } else if (token.equalsIgnoreCase("title")) {
                        title = true;
                    } else if (token.length() == 2 && Character.toLowerCase(token.charAt(0)) == 'h'
                            && Character.isDigit(token.charAt(1))) {
                        heading = true;
                    }
                    // TODO maybe skip script tags?
                    break;
                }
                case '>': {
                    if (!inTag) {
                        process("???");
                    }
                    inTag = false;
                    break;
                }
                case '\r':
                case '\n':
                case ' ':
                    break;
                default:
                    if (!inTag) {
                        process(token);
                    }
                }
            } else {
                if (!inTag) {
                    process(token);
                }
            }
        }

        if (page.title == null || page.title.trim().length() == 0) {
            System.out.println("Error: not title found in " + file.getName());
            page.title = file.getName();
        }
        page.title = page.title.trim();
    }

    private void process(String text) {
        text = HtmlConverter.convertHtmlToString(text);
        if (title) {
            if (page.title == null) {
                page.title = text;
            } else {
                page.title = page.title + " " + text;
            }
        }
        int weight;
        if (title) {
            weight = Weight.TITLE;
        } else if (heading) {
            weight = Weight.HEADER;
        } else {
            weight = Weight.PARAGRAPH;
        }
        // this list of constants needs to be the same in search.js
        // (char) 160: nbsp
        StringTokenizer t = new StringTokenizer(text, " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" + (char) 160, false);
        while (t.hasMoreTokens()) {
            String token = t.nextToken();
            if (token.length() < MIN_WORD_SIZE) {
                continue;
            }
            if (Character.isDigit(token.charAt(0))) {
                continue;
            }
            String lower = StringUtils.toLowerEnglish(token);
            Word word = words.get(lower);
            if (word == null) {
                word = new Word(token);
                words.put(lower, word);
            } else if (!word.name.equals(token)) {
                word.name = token.compareTo(word.name) > 0 ? token : word.name;
            }
            page.totalWeight += weight;
            word.addPage(page, weight);
        }
    }

    private static String convertUTF(String s) {
        s = StringUtils.quoteJavaString(s);
        s = s.substring(1, s.length() - 1);
        return s;
    }

}