AsciiDocIndex.java example

Explorer
persistit-master
/**
 * Copyright 2011-2012 Akiban Technologies, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.persistit.exception.PersistitException;

/**
 * Builds an in-memory index of class and method links from the HTML index
 * pages generated by the standard Javadoc doclet (<tt>index-all.html</tt> or
 * the files of an <tt>index-files</tt> directory).
 * <p>
 * Only anchors whose <tt>href</tt> starts with <tt>./com/persistit</tt> are
 * recognized. Classes and interfaces are collected in the class map, methods
 * in the method map; both map a dotted term to the URL of its documentation.
 * Instances keep mutable state and perform no synchronization.
 */
public class AsciiDocIndex {
    // Markup and line breaks stripped from anchor text by cleanupNoise.
    private final static String[] NOISY_STRINGS = { "\r", "\n", "<B>", "</B>", "<b>", "</b>", "<CODE>", "</CODE>",
            "<code>", "</code>", "<TT>", "</TT>", "<tt>", "</tt>", "<FONT>", "</FONT>", "<font>", "</font>" };

    // Charset and decoder for ISO-8859-15
    private final static Charset CHARSET = Charset.forName("ISO-8859-15");

    private final static CharsetDecoder DECODER = CHARSET.newDecoder();

    //
    // Regex Pattern to pull various attributes and fields out of the anchor
    // tags in Javadoc index-NN.html files. Groups: 1 = whole tag, 2 = href,
    // 4 = title attribute value (null when the tag has no title), 5 = text.
    //
    private final static Pattern PATTERN = Pattern.compile(
            "(<a href=\"(\\./com/persistit.*?)\" *(title=\"(.*?)\")?.*?>(.*?)</a>)", Pattern.CASE_INSENSITIVE);

    // Number of terms stored in classMap and methodMap so far.
    private int _count;

    final SortedMap<String, String> classMap = new TreeMap<String, String>();
    final SortedMap<String, String> methodMap = new TreeMap<String, String>();

    /**
     * Indexes the specified Javadoc index file or directory. If
     * <tt>pathName</tt> names a directory that is not itself called
     * <tt>index-files</tt>, then its <tt>index-all.html</tt> file is used if
     * present, otherwise its <tt>index-files</tt> subdirectory if present,
     * otherwise the directory itself. A file is indexed directly; a directory
     * has each of its regular files indexed.
     * 
     * @param pathName
     *            path of a Javadoc index file, or of a directory containing
     *            Javadoc index files
     * @param base
     *            prefix prepended (followed by "/") to each href to form the
     *            URL stored in the maps; if <tt>null</tt>, the parent path of
     *            the resolved index file or directory is used
     * 
     * @return The number of class, interface and method terms this instance
     *         has stored so far, including terms from earlier calls
     * 
     * @throws IOException
     *             if an index file or directory cannot be read
     * @throws IllegalArgumentException
     *             if the resolved file or directory does not exist
     */
    public int buildIndex(final String pathName, String base) throws IOException {
        File file = new File(pathName);

        // The index generated by the standard Javadoc Doclet is either
        // at the root of the api tree, in a file called index-all.html, or
        // in a subdirectory called index-files. This code tries each case.
        //
        if (file.isDirectory() && !file.getPath().endsWith("index-files")) {
            final File indexAll = new File(file, "index-all.html");
            final File indexDir = new File(file, "index-files");
            if (indexAll.exists() && !indexAll.isDirectory()) {
                file = indexAll;
            } else if (indexDir.isDirectory()) {
                file = indexDir;
            }
        }
        if (!file.exists()) {
            throw new IllegalArgumentException("Requires the name of a Javadoc API index file, "
                    + "or of a directory containing Javadoc API index files.");
        }
        if (base == null) {
            base = file.getParent();
        }
        if (file.isDirectory()) {
            indexOneDirectory(file, base);
        } else {
            indexOneFile(file, base);
        }
        return _count;
    }

    /**
     * Indexes every regular file directly inside the supplied directory.
     * Entries that are not regular files (for example subdirectories) are
     * skipped.
     * 
     * @param indexDir
     *            directory containing Javadoc index files
     * @param base
     *            URL prefix for the indexed terms
     * 
     * @throws IOException
     *             if the directory cannot be listed or a file cannot be read
     */
    public void indexOneDirectory(final File indexDir, final String base) throws IOException {
        final File[] indexFiles = indexDir.listFiles();
        if (indexFiles == null) {
            // listFiles returns null rather than throwing when the path is
            // not a directory or an I/O error occurs.
            throw new IOException("Unable to list the contents of directory " + indexDir);
        }
        for (final File indexFile : indexFiles) {
            if (indexFile.isFile()) {
                indexOneFile(indexFile, base);
            }
        }
    }

    /**
     * Decodes one index file as ISO-8859-15 text and indexes every anchor tag
     * in it that matches the expected Javadoc link pattern.
     * 
     * @param file
     *            the index file to read
     * @param base
     *            URL prefix for the indexed terms
     * 
     * @throws IOException
     *             if the file cannot be opened, mapped or decoded; the stack
     *             trace is also printed to standard error before rethrowing
     */
    public void indexOneFile(final File file, final String base) throws IOException {
        FileChannel fc = null;
        try {
            System.out.println("Indexing " + file);

            fc = new FileInputStream(file).getChannel();

            final MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
            final CharBuffer cb = DECODER.decode(bb);
            final Matcher matcher = PATTERN.matcher(cb);

            while (matcher.find()) {
                final String wholeTag = matcher.group(1);
                final String href = matcher.group(2);
                final String url = base + "/" + fixDotSlash(href);
                // The title attribute is optional, so this may be null.
                final String title = matcher.group(4);
                final String text = matcher.group(5);

                indexOneTerm(wholeTag, href, url, title, text);
            }
        } catch (final IOException e) {
            System.err.println();
            e.printStackTrace();
            throw e;
        } finally {
            // Closing the channel also closes the stream it was obtained from.
            if (fc != null) {
                fc.close();
            }
        }
    }

    /**
     * Classifies one anchor by the shape of its href (package summary, class
     * or interface, field or constant, method) and hands the resulting term
     * or terms to {@link #saveTerm(String, String, String)}.
     * 
     * @param wholeTag
     *            the complete anchor tag, used only for diagnostics
     * @param href
     *            the href attribute value, with or without a leading "./"
     * @param url
     *            the URL to associate with the term
     * @param title
     *            the title attribute value, or <tt>null</tt> if absent
     * @param text
     *            the text enclosed by the anchor
     */
    private void indexOneTerm(final String wholeTag, String href, final String url, final String title, String text) {

        // The cleaned anchor text is not stored by the current maps, which
        // key on the href alone.
        text = cleanupNoise(text);
        href = fixDotSlash(href);

        final int pHtml = href.lastIndexOf(".html");
        if (pHtml == -1) {
            return;
        }

        final int pPackageSummary = href.indexOf("/package-summary");
        if (pPackageSummary > 0) {
            //
            // Enumerate the package name segments
            //
            final String packageName = href.substring(0, pPackageSummary).replace('/', '.');

            int q = -1;
            while (q < packageName.length()) {
                final int p = q + 1;
                q = packageName.indexOf('.', p);
                if (q < 0) {
                    q = packageName.length();
                }
                saveTerm("Package", packageName.substring(p, q), url);
            }
            return;
        }

        final boolean knownRoot = href.startsWith("com/") || href.startsWith("java/") || href.startsWith("javax/")
                || href.startsWith("org/") || href.startsWith("COM/") || href.startsWith("ORG/");
        if (!knownRoot) {
            return;
        }

        final String className = href.substring(0, pHtml).replace('/', '.');
        final int pHash = href.indexOf('#');
        if (pHash == -1) {
            // This is a class or interface name. An anchor without a title
            // attribute is treated as a class.
            final boolean isInterface = title != null && title.startsWith("interface");
            saveTerm(isInterface ? "Interface" : "Class", className, url);
            return;
        }

        final String name = href.substring(pHash + 1);
        final int pLeftParen = name.indexOf('(');
        if (pLeftParen == -1) {
            //
            // This is a field or a constant. We'll call it a constant if
            // it is spelled in upper case.
            //
            final String category = name.equals(name.toUpperCase()) ? "Constant" : "Field";
            saveTerm(category, name, url);
            return;
        }

        //
        // This is a method name. We will index it as a method,
        // and then if it conforms to the pattern for property
        // set/get methods, we'll also index the property name.
        //
        final int pRightParen = name.indexOf(')', pLeftParen);
        if (pRightParen == -1) {
            System.out.println("Missing right paren");
            System.out.println(wholeTag);
            return;
        }
        final String paramList = name.substring(pLeftParen + 1, pRightParen).trim();

        // The method term is the href with ".html" removed and slashes
        // turned into dots, e.g. "com.persistit.Exchange#fetch()".
        final String term = (href.substring(0, pHtml) + href.substring(pHtml + 5)).replace('/', '.');
        saveTerm("Method", term, url);

        final boolean noParams = paramList.length() == 0;
        final boolean oneParam = !noParams && paramList.indexOf(',') == -1;
        final boolean getter = noParams && (name.startsWith("get") || name.startsWith("is"));
        final boolean setter = oneParam && name.startsWith("set");
        if (getter || setter) {
            // Strip the get/is/set prefix from the simple method name, not
            // from the fully qualified term.
            final int prefixLength = name.startsWith("is") ? 2 : 3;
            saveTerm("Property", name.substring(prefixLength, pLeftParen), url);
        }
    }

    /**
     * Stores a term in the map that corresponds to its type and counts it.
     * "Method" terms go to the method map; "Class" and "Interface" terms go
     * to the class map. Terms of any other type are neither stored nor
     * counted.
     * 
     * @param type
     *            the category assigned by indexOneTerm
     * @param term
     *            the key under which the URL is stored
     * @param url
     *            the documentation URL for the term
     */
    private void saveTerm(final String type, final String term, final String url) {
        if ("Method".equals(type)) {
            methodMap.put(term, url);
            _count++;
        } else if ("Class".equals(type) || "Interface".equals(type)) {
            classMap.put(term, url);
            _count++;
        }
    }

    /**
     * Removes a leading "./" from the supplied URL, if there is one.
     */
    private String fixDotSlash(final String url) {
        return url.startsWith("./") ? url.substring(2) : url;
    }

    /**
     * Returns the supplied text with every occurrence of the strings listed
     * in NOISY_STRINGS removed. The original instance is returned when
     * nothing was removed.
     */
    private String cleanupNoise(final String term) {
        boolean changed = false;
        final StringBuilder sb = new StringBuilder(term);
        for (final String tag : NOISY_STRINGS) {
            for (int p; (p = sb.indexOf(tag)) >= 0;) {
                sb.delete(p, p + tag.length());
                changed = true;
            }
        }
        return changed ? sb.toString() : term;
    }

    /**
     * Indexes the Javadoc at the supplied path, using the Akiban API
     * documentation site as the base of every stored URL.
     * 
     * @param javaDocPathname
     *            path of a Javadoc index file or directory, as accepted by
     *            {@link #buildIndex(String, String)}
     * 
     * @throws Exception
     *             if the index cannot be built
     */
    public void index(final String javaDocPathname) throws Exception {
        final String base = "http://www.akiban.com/documentation/apidocs";
        buildIndex(javaDocPathname, base);
    }

    /**
     * @return the live map from dotted class or interface name to URL
     */
    public SortedMap<String, String> getClassMap() {
        return classMap;
    }

    /**
     * @return the live map from method term (dotted class name, "#", method
     *         signature) to URL
     */
    public SortedMap<String, String> getMethodMap() {
        return methodMap;
    }
}