/* * The University of Wales, Cardiff Triana Project Software License (Based * on the Apache Software License Version 1.1) * * Copyright (c) 2007 University of Wales, Cardiff. All rights reserved. * * Redistribution and use of the software in source and binary forms, with * or without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, * must include the following acknowledgment: "This product includes * software developed by the University of Wales, Cardiff for the Triana * Project (http://www.trianacode.org)." Alternately, this * acknowledgment may appear in the software itself, if and wherever * such third-party acknowledgments normally appear. * * 4. The names "Triana" and "University of Wales, Cardiff" must not be * used to endorse or promote products derived from this software * without prior written permission. For written permission, please * contact triana@trianacode.org. * * 5. Products derived from this software may not be called "Triana," nor * may Triana appear in their name, without prior written permission of * the University of Wales, Cardiff. * * 6. This software may not be sold, used or incorporated into any product * for sale to third parties. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL UNIVERSITY OF WALES, CARDIFF OR ITS CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. * * ------------------------------------------------------------------------ * * This software consists of voluntary contributions made by many * individuals on behalf of the Triana Project. For more information on the * Triana Project, please see. http://www.trianacode.org. * * This license is based on the BSD license as adopted by the Apache * Foundation and is governed by the laws of England and Wales. * */ package org.trianacode.gui.help.search; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; /** * @version $Revision: 4048 $ */ public class HTMLSearchIndexer extends SearchIndexer { private char[] html_comment_on; private char[] html_comment_off; private HTMLSearchResults results; private StringBuffer tag = new StringBuffer(); private boolean inComment = false; private boolean inTag = false; // Part of nasty fudge private StringBuffer title = new StringBuffer(); private boolean inTitle = false; private HTMLDocumentInfo docInfo; class HTMLFilenameFilter implements FilenameFilter { public boolean accept(File dir, String file) { String ext; int index; // Recurse in to directories if ((new File(dir, file)).isDirectory()) { return true; } // Ignore files without extensions if ((index = file.lastIndexOf(".")) < 0) { return false; } // Get the extension ext = file.substring(index + 1).toLowerCase(); if (ext.equals("html")) { return true; } if (ext.equals("htm")) { return true; } return false; } } public final static void main(String[] args) { try { /* HTMLSearchResults results = HTMLSearchResults.loadHTMLSearchResults(new File("help.idx")); Vector vector = results.get(args[0]); System.out.println(vector.toString()); */ HTMLSearchIndexer indexer = new HTMLSearchIndexer(new File(args[0])); HTMLSearchResults results = indexer.getHTMLSearchResults(); results.save(new File("help.idx")); } catch (Exception ex) { ex.printStackTrace(); } //This was removed due to a bug in ant (the build tool). //With this in, it ant will exit after this has been run. //System.exit(0); } public HTMLSearchIndexer(File searchFile, boolean caseSensitive) { super(searchFile, caseSensitive); initArrays(); } public HTMLSearchIndexer(File searchFile) { super(searchFile); initArrays(); } private void initArrays() { html_comment_on = stringToArray("<--"); html_comment_off = stringToArray("-->"); } protected char[] stringToArray(String string) { char[] array = new char[string.length()]; string.getChars(0, string.length(), array, 0); return array; } protected void parseFile(File file) throws IOException { docInfo = new HTMLDocumentInfo(file, ""); super.parseFile(file); if (!title.toString().equals("Untitled")) { docInfo.setTitle(title.toString()); } else { docInfo.setTitle(file.getName()); } title = new StringBuffer(); } protected boolean subArrayEquals(char[] bigArray, int offset, char[] smallArray) { if (bigArray.length < (smallArray.length + offset)) { return false; } for (int i = smallArray.length - 1; i >= 0; i--) { if (bigArray[i + offset] != smallArray[i]) { return false; } } return true; } protected void parseLine(File file, String line) { StringBuffer sb = new StringBuffer(); char[] charArray; int ptr; // Get an array of the characters in the line charArray = new char[line.length()]; line.getChars(0, line.length(), charArray, 0); // Start parsing the line ptr = 0; for (; ;) { if (ptr >= charArray.length) { break; } if (inTag) { if (charArray[ptr] == '>') { String tagString = tag.toString().toLowerCase(); // Fudge time - quick and nasty solution if (tagString.startsWith("title")) { inTitle = true; } else if (tagString.startsWith("/title")) { inTitle = false; } inTag = false; } else { tag.append(Character.toLowerCase(charArray[ptr])); } ptr++; } else if (inTitle) { // Nasty fudge time - must improve if (charArray[ptr] == '<') { inTag = true; tag = new StringBuffer(); } else { title.append(charArray[ptr]); } ptr++; } else { if (Character.isLetterOrDigit(charArray[ptr])) { if (!inTag && !inComment) { sb.append(charArray[ptr]); } ptr++; } else { if (sb.length() > 0) { if (isCaseSensitive()) { results.add(sb.toString(), docInfo); } else { results.add(sb.toString().toLowerCase(), docInfo); } sb = new StringBuffer(); } if (subArrayEquals(charArray, ptr, html_comment_on)) { inComment = true; ptr += html_comment_on.length; } else if (subArrayEquals(charArray, ptr, html_comment_off)) { inComment = false; ptr += html_comment_off.length; } else if (charArray[ptr] == '<') { inTag = true; tag = new StringBuffer(); ptr++; } else { ptr++; } } } } if (sb.length() > 0) { if (isCaseSensitive()) { results.add(sb.toString(), docInfo); } else { results.add(sb.toString().toLowerCase(), docInfo); } } } public HTMLSearchResults getHTMLSearchResults() { HTMLFilenameFilter filter = new HTMLFilenameFilter(); results = new HTMLSearchResults(); try { indexFile(getSearchFile(), filter); } catch (IOException ex) { ex.printStackTrace(); } return results; } }