/* * Copyright 2004-2011 H2 Group. Multiple-Licensed under the H2 License, * Version 1.0, and under the Eclipse Public License, Version 1.0 * (http://h2database.com/html/license.html). * Initial Developer: H2 Group */ package org.h2.build.indexer; import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.StringTokenizer; import org.h2.util.IOUtils; import org.h2.util.StringUtils; /** * The indexer creates the fulltext index of the HTML documentation. * It is used for the built-in HTML javascript search. */ public class Indexer { private static final int MIN_WORD_SIZE = 3; private static final int MAX_RELATIONS = 30; private static final String VERY_COMMON = ";the;be;to;of;and;a;in;that;have;i;it;for;not;on;with;he;as;you;do;at;" + "this;but;his;by;from;they;we;say;her;she;or;an;will;my;one;all;would;" + "there;their;what;so;up;out;if;about;who;get;which;go;me;when;make;" + "can;like;no;just;him;know;take;into;your;good;some;" + "could;them;see;other;than;then;now;look;only;come;its;over;think;" + "also;back;after;use;two;how;our;work;first;well;way;even;new;want;" + "because;any;these;give;most;us;"; private ArrayList<Page> pages = new ArrayList<Page>(); /** * Lower case word to Word map. */ private HashMap<String, Word> words = new HashMap<String, Word>(); private HashSet<String> noIndex = new HashSet<String>(); private ArrayList <Word>wordList; private PrintWriter output; private Page page; private boolean title; private boolean heading; private String ignored; /** * This method is called when executing this application from the command * line. * * @param args the command line parameters */ public static void main(String... args) throws Exception { new Indexer().run(args); } private void run(String... args) throws Exception { String dir = "docs"; String destDir = "docs/html"; for (int i = 0; i < args.length; i++) { if (args[i].equals("-dir")) { dir = args[++i]; } else if (args[i].equals("-destDir")) { destDir = args[++i]; } } File file = new File(dir); setNoIndex("index.html", "html/header.html", "html/search.html", "html/frame.html", "html/fragments.html", "html/sourceError.html", "html/source.html", "html/mainWeb.html", "javadoc/index.html", "javadoc/classes.html", "javadoc/allclasses-frame.html", "javadoc/allclasses-noframe.html", "javadoc/constant-values.html", "javadoc/overview-frame.html", "javadoc/overview-summary.html", "javadoc/serialized-form.html"); output = new PrintWriter(new FileWriter(destDir + "/index.js")); readPages("", file, 0); output.println("var pages=new Array();"); output.println("var ref=new Array();"); output.println("var ignored='';"); output.println("function Page(title, file) { this.title=title; this.file=file; }"); output.println("function load() {"); sortWords(); removeOverflowRelations(); sortPages(); listPages(); listWords(); output.println("}"); output.close(); } private void setNoIndex(String... strings) { for (String s : strings) { noIndex.add(s); } } private void sortWords() { for (String name : new ArrayList<String>(words.keySet())) { if (name.endsWith("s")) { String singular = name.substring(0, name.length() - 1); if (words.containsKey(singular)) { Word wp = words.get(name); Word ws = words.get(singular); ws.addAll(wp); words.remove(name); } } else if (name.startsWith("abc")) { words.remove(name); } } wordList = new ArrayList<Word>(words.values()); // ignored very common words (to shrink the index) StringBuilder ignoredBuff = new StringBuilder(";"); int maxSize = pages.size() / 4; for (int i = 0; i < wordList.size(); i++) { Word word = wordList.get(i); String search = ";" + word.name.toLowerCase() + ";"; int idxCommon = VERY_COMMON.indexOf(search); if (word.pages.size() >= maxSize || idxCommon >= 0) { wordList.remove(i); ignoredBuff.append(word.name); ignoredBuff.append(';'); i--; } } ignored = ignoredBuff.toString(); // TODO support A, B, C,... class links in the index file and use them // for combined AND searches Collections.sort(wordList, new Comparator<Word>() { public int compare(Word w0, Word w1) { return w0.name.compareToIgnoreCase(w1.name); } }); } private void removeOverflowRelations() { for (Word word : wordList) { ArrayList<Weight> weights = word.getSortedWeights(); int max = MAX_RELATIONS; if (weights.size() > max) { while (max < weights.size()) { Weight weight = weights.get(max); if (weight.value < Weight.HEADER) { break; } max++; } } while (max < weights.size()) { Weight weight = weights.get(max); weights.remove(max); weight.page.relations--; } } } private void sortPages() { Collections.sort(pages, new Comparator<Page>() { public int compare(Page p0, Page p1) { return p0.relations == p1.relations ? 0 : p0.relations < p1.relations ? 1 : -1; } }); for (int i = 0; i < pages.size(); i++) { pages.get(i).id = i; } } private void listPages() { for (Page p : pages) { output.println("pages[" + p.id + "]=new Page('" + convertUTF(p.title) + "', '" + p.fileName + "');"); } } private void readPages(String dir, File file, int level) throws Exception { String name = file.getName(); String fileName = dir.length() > 0 ? dir + "/" + name : level > 0 ? name : ""; if (file.isDirectory()) { for (File f : file.listFiles()) { readPages(fileName, f, level + 1); } return; } String lower = StringUtils.toLowerEnglish(name); if (!lower.endsWith(".html") && !lower.endsWith(".htm")) { return; } if (lower.indexOf("_ja.") >= 0) { return; } if (!noIndex.contains(fileName)) { page = new Page(pages.size(), fileName); pages.add(page); readPage(file); } } private void listWords() { output.println("// words: " + wordList.size()); StringBuilder buff = new StringBuilder(); String first = ""; int firstLen = 1; int totalRelations = 0; for (Word word : wordList) { ArrayList<Weight> weights = word.getSortedWeights(); String lower = StringUtils.toLowerEnglish(word.name); if (!first.equals(lower.substring(0, firstLen))) { if (buff.length() > 0) { output.println("ref['" + convertUTF(first) + "']='" + buff.toString() + "';"); buff = new StringBuilder(); } first = lower.substring(0, firstLen); } if (buff.length() > 0) { buff.append(';'); } buff.append(convertUTF(word.name)); buff.append('='); String weightString = "r"; totalRelations += weights.size(); for (int j = 0; j < weights.size(); j++) { Weight weight = weights.get(j); Page p = weight.page; if (j > 0) { buff.append(","); } String ws; if (weight.value >= Weight.TITLE) { ws = "t"; } else if (weight.value >= Weight.HEADER) { ws = "h"; } else { ws = "r"; } if (ws != weightString) { weightString = ws; buff.append(ws); } buff.append(p.id); } } output.println("ref['" + convertUTF(first) + "']='" + buff.toString() + "';"); output.println("// totalRelations: " + totalRelations); output.println("ignored='" + ignored.toLowerCase() + "';"); } private void readPage(File file) throws Exception { byte[] data = IOUtils.readBytesAndClose(new FileInputStream(file), 0); String text = new String(data, "UTF-8"); StringTokenizer t = new StringTokenizer(text, "<> \r\n", true); boolean inTag = false; title = false; heading = false; while (t.hasMoreTokens()) { String token = t.nextToken(); if (token.length() == 1) { char c = token.charAt(0); switch (c) { case '<': { if (inTag) { process("???"); } inTag = true; if (!t.hasMoreTokens()) { break; } token = t.nextToken(); if (token.startsWith("/")) { title = false; heading = false; } else if (token.equalsIgnoreCase("title")) { title = true; } else if (token.length() == 2 && Character.toLowerCase(token.charAt(0)) == 'h' && Character.isDigit(token.charAt(1))) { heading = true; } // TODO maybe skip script tags? break; } case '>': { if (!inTag) { process("???"); } inTag = false; break; } case '\r': case '\n': case ' ': break; default: if (!inTag) { process(token); } } } else { if (!inTag) { process(token); } } } if (page.title == null || page.title.trim().length() == 0) { System.out.println("Error: not title found in " + file.getName()); page.title = file.getName(); } page.title = page.title.trim(); } private void process(String text) { text = HtmlConverter.convertHtmlToString(text); if (title) { if (page.title == null) { page.title = text; } else { page.title = page.title + " " + text; } } int weight; if (title) { weight = Weight.TITLE; } else if (heading) { weight = Weight.HEADER; } else { weight = Weight.PARAGRAPH; } // this list of constants needs to be the same in search.js // (char) 160: nbsp StringTokenizer t = new StringTokenizer(text, " \t\r\n\"'.,:;!&/\\?%@`[]{}()+-=<>|*^~#$" + (char) 160, false); while (t.hasMoreTokens()) { String token = t.nextToken(); if (token.length() < MIN_WORD_SIZE) { continue; } if (Character.isDigit(token.charAt(0))) { continue; } String lower = StringUtils.toLowerEnglish(token); Word word = words.get(lower); if (word == null) { word = new Word(token); words.put(lower, word); } else if (!word.name.equals(token)) { word.name = token.compareTo(word.name) > 0 ? token : word.name; } page.totalWeight += weight; word.addPage(page, weight); } } private static String convertUTF(String s) { s = StringUtils.quoteJavaString(s); s = s.substring(1, s.length() - 1); return s; } }