// HTMLSearchIndexer.java example
//
// Explorer
// Triana-master
/*
 * The University of Wales, Cardiff Triana Project Software License (Based
 * on the Apache Software License Version 1.1)
 *
 * Copyright (c) 2007 University of Wales, Cardiff. All rights reserved.
 *
 * Redistribution and use of the software in source and binary forms, with
 * or without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * 3. The end-user documentation included with the redistribution, if any,
 *    must include the following acknowledgment: "This product includes
 *    software developed by the University of Wales, Cardiff for the Triana
 *    Project (http://www.trianacode.org)." Alternately, this
 *    acknowledgment may appear in the software itself, if and wherever
 *    such third-party acknowledgments normally appear.
 *
 * 4. The names "Triana" and "University of Wales, Cardiff" must not be
 *    used to endorse or promote products derived from this software
 *    without prior written permission. For written permission, please
 *    contact triana@trianacode.org.
 *
 * 5. Products derived from this software may not be called "Triana," nor
 *    may Triana appear in their name, without prior written permission of
 *    the University of Wales, Cardiff.
 *
 * 6. This software may not be sold, used or incorporated into any product
 *    for sale to third parties.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
 * NO EVENT SHALL UNIVERSITY OF WALES, CARDIFF OR ITS CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ------------------------------------------------------------------------
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Triana Project. For more information on the
 * Triana Project, please see. http://www.trianacode.org.
 *
 * This license is based on the BSD license as adopted by the Apache
 * Foundation and is governed by the laws of England and Wales.
 *
 */
package org.trianacode.gui.help.search;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;

/**
 * A {@link SearchIndexer} that builds a word index over HTML help files.
 * <p>
 * Files are selected by extension (<code>.html</code> / <code>.htm</code>,
 * case-insensitively) and parsed one line at a time. Runs of letters and
 * digits found outside of tags, outside of HTML comments and outside of the
 * <code>&lt;title&gt;</code> element are added to an
 * {@link HTMLSearchResults} together with the {@link HTMLDocumentInfo} of the
 * file being parsed. The text of the <code>&lt;title&gt;</code> element is
 * collected separately and stored as the document title.
 * <p>
 * The parser keeps its state in instance fields between calls to
 * {@link #parseLine(File, String)}, so tags, comments and titles may span
 * several lines. Because of that state an instance must not be used by more
 * than one thread at a time.
 *
 * @version $Revision: 4048 $
 */
public class HTMLSearchIndexer extends SearchIndexer {

    /** Character sequence that opens an HTML comment. */
    private static final char[] HTML_COMMENT_ON = "<!--".toCharArray();

    /** Character sequence that closes an HTML comment. */
    private static final char[] HTML_COMMENT_OFF = "-->".toCharArray();

    /** Index being built; created by {@link #getHTMLSearchResults()}. */
    private HTMLSearchResults results;

    /** Lower-cased content of the tag currently being read (without '<' and '>'). */
    private StringBuffer tag = new StringBuffer();

    /** True while the parser is between "&lt;!--" and "--&gt;". */
    private boolean inComment = false;

    /** True while the parser is between '<' and '>' of a tag. */
    private boolean inTag = false;

    // Part of nasty fudge: the title is collected as raw text rather than
    // being handled by a proper HTML parser.
    private StringBuffer title = new StringBuffer();

    /** True while the parser is between the opening and closing title tag. */
    private boolean inTitle = false;

    /** Document information attached to every word of the file being parsed. */
    private HTMLDocumentInfo docInfo;

    /**
     * Accepts directories (so that they can be recursed into) and files whose
     * extension is "html" or "htm", ignoring case.
     */
    class HTMLFilenameFilter implements FilenameFilter {
        public boolean accept(File dir, String file) {
            // Recurse in to directories
            if (new File(dir, file).isDirectory()) {
                return true;
            }

            // Ignore files without extensions
            int index = file.lastIndexOf('.');
            if (index < 0) {
                return false;
            }

            // Compare the extension case-insensitively
            String ext = file.substring(index + 1);
            return ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm");
        }
    }

    /**
     * Indexes the file or directory named by the first argument and saves the
     * resulting index to the file "help.idx" in the current directory.
     *
     * @param args command line arguments; <code>args[0]</code> is the file or
     *             directory to index
     */
    public final static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: HTMLSearchIndexer <file-or-directory>");
            return;
        }

        try {
            /*
            Example of reading a saved index back and querying it:

            HTMLSearchResults results = HTMLSearchResults.loadHTMLSearchResults(new File("help.idx"));
            Vector vector = results.get(args[0]);
            System.out.println(vector.toString());
            */
            HTMLSearchIndexer indexer = new HTMLSearchIndexer(new File(args[0]));
            HTMLSearchResults results = indexer.getHTMLSearchResults();
            results.save(new File("help.idx"));
        }
        catch (Exception ex) {
            ex.printStackTrace();
        }

        // System.exit(0) was removed due to a bug in ant (the build tool).
        // With it in, ant will exit after this has been run.
        //System.exit(0);
    }

    /**
     * Creates an indexer for the given file or directory.
     *
     * @param searchFile    the file or directory to index
     * @param caseSensitive passed on to the superclass; when
     *                      {@link #isCaseSensitive()} reports false, words
     *                      are lower-cased before being added to the index
     */
    public HTMLSearchIndexer(File searchFile, boolean caseSensitive) {
        super(searchFile, caseSensitive);
    }

    /**
     * Creates an indexer for the given file or directory using the
     * superclass' default case sensitivity.
     *
     * @param searchFile the file or directory to index
     */
    public HTMLSearchIndexer(File searchFile) {
        super(searchFile);
    }

    /**
     * Returns the characters of a string as a newly allocated array.
     *
     * @param string the string to convert; must not be null
     * @return a new array holding the characters of <code>string</code>
     */
    protected char[] stringToArray(String string) {
        return string.toCharArray();
    }

    /**
     * Parses a single HTML file and records its title.
     * <p>
     * The parser state is reset before and after parsing so that an
     * unterminated tag, comment or title in one file (or an I/O error while
     * reading it) cannot affect the parsing of the next file. If the file has
     * no title, a blank title, or the title "Untitled", the file name is used
     * as the document title instead.
     *
     * @param file the file to parse
     * @throws IOException if the superclass fails to read the file
     */
    protected void parseFile(File file) throws IOException {
        resetParserState();
        docInfo = new HTMLDocumentInfo(file, "");

        try {
            super.parseFile(file);

            String parsedTitle = title.toString().trim();
            if (parsedTitle.length() == 0 || parsedTitle.equals("Untitled")) {
                docInfo.setTitle(file.getName());
            } else {
                docInfo.setTitle(parsedTitle);
            }
        }
        finally {
            resetParserState();
        }
    }

    /** Puts the line parser back in to its initial "plain text" state. */
    private void resetParserState() {
        tag = new StringBuffer();
        title = new StringBuffer();
        inComment = false;
        inTag = false;
        inTitle = false;
    }

    /**
     * Tests whether <code>smallArray</code> occurs in <code>bigArray</code>
     * starting at index <code>offset</code>.
     *
     * @param bigArray   the array to look in
     * @param offset     index in <code>bigArray</code> at which the match
     *                   must start
     * @param smallArray the characters to look for
     * @return true if every character of <code>smallArray</code> matches;
     *         false otherwise, including when <code>offset</code> is negative
     *         or <code>smallArray</code> does not fit in the remainder of
     *         <code>bigArray</code>
     */
    protected boolean subArrayEquals(char[] bigArray, int offset,
                                     char[] smallArray) {
        if (offset < 0 || (bigArray.length - offset) < smallArray.length) {
            return false;
        }

        for (int i = 0; i < smallArray.length; i++) {
            if (bigArray[offset + i] != smallArray[i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Parses one line of an HTML file, adding every word found in the visible
     * text to the index.
     * <p>
     * State is carried over from the previous line, so the line may start in
     * the middle of a comment, tag or title. Comment state takes precedence:
     * while inside a comment everything up to the closing "--&gt;" is
     * skipped, including any '&lt;' or '&gt;' characters.
     *
     * @param file the file the line was read from (not used here; the
     *             document information set up by {@link #parseFile(File)} is
     *             used instead)
     * @param line the line to parse
     */
    protected void parseLine(File file, String line) {
        StringBuffer word = new StringBuffer();
        char[] chars = line.toCharArray();
        int ptr = 0;

        while (ptr < chars.length) {
            char c = chars[ptr];

            if (inComment) {
                // Only the comment terminator is significant inside a comment
                if (subArrayEquals(chars, ptr, HTML_COMMENT_OFF)) {
                    inComment = false;
                    ptr += HTML_COMMENT_OFF.length;
                } else {
                    ptr++;
                }
            } else if (inTag) {
                if (c == '>') {
                    // The tag buffer is already lower case (see below)
                    String tagString = tag.toString();

                    // Fudge time - quick and nasty solution
                    if (tagString.startsWith("title")) {
                        inTitle = true;
                    } else if (tagString.startsWith("/title")) {
                        inTitle = false;
                    }

                    inTag = false;
                } else {
                    tag.append(Character.toLowerCase(c));
                }
                ptr++;
            } else if (subArrayEquals(chars, ptr, HTML_COMMENT_ON)) {
                flushWord(word);
                inComment = true;
                ptr += HTML_COMMENT_ON.length;
            } else if (c == '<') {
                flushWord(word);
                inTag = true;
                tag = new StringBuffer();
                ptr++;
            } else if (inTitle) {
                // Nasty fudge time - must improve: title text is collected
                // verbatim and is not added to the word index
                title.append(c);
                ptr++;
            } else if (Character.isLetterOrDigit(c)) {
                word.append(c);
                ptr++;
            } else {
                // Any other character ends the current word
                flushWord(word);
                ptr++;
            }
        }

        // A word may run right up to the end of the line
        flushWord(word);
    }

    /**
     * Adds the buffered word, if any, to the index and empties the buffer.
     * The word is lower-cased first unless the indexer is case sensitive.
     *
     * @param word buffer holding the characters of the current word
     */
    private void flushWord(StringBuffer word) {
        if (word.length() == 0) {
            return;
        }

        String text = word.toString();
        if (!isCaseSensitive()) {
            text = text.toLowerCase();
        }
        results.add(text, docInfo);
        word.setLength(0);
    }

    /**
     * Indexes the search file given at construction time and returns the
     * resulting index. A fresh index is created on every call.
     * <p>
     * An {@link IOException} raised while indexing is printed to standard
     * error and the (possibly partial) index built so far is returned.
     *
     * @return the index built from the search file
     */
    public HTMLSearchResults getHTMLSearchResults() {
        HTMLFilenameFilter filter = new HTMLFilenameFilter();
        results = new HTMLSearchResults();

        try {
            indexFile(getSearchFile(), filter);
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }

        return results;
    }
}