// RubyParser.java example

/****************************************************************************************
 * Copyright (c) 2009 Brennan D'Aguilar <brennan.daguilar@gmail.com>                    *
 *                                                                                      *
 * This program is free software; you can redistribute it and/or modify it under        *
 * the terms of the GNU General Public License as published by the Free Software        *
 * Foundation; either version 3 of the License, or (at your option) any later           *
 * version.                                                                             *
 *                                                                                      *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY      *
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A      *
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.             *
 *                                                                                      *
 * You should have received a copy of the GNU General Public License along with         *
 * this program.  If not, see <http://www.gnu.org/licenses/>.                           *
 ****************************************************************************************/

package com.ichi2.utils;

/**
 * Parses text input from Anki cards to display ruby text correctly in AnkiDroid. Anki's Japanese language support
 * writes ruby text as {@code baseText[rubyText]}, where the base text is the run of characters directly in front of
 * the opening bracket, beginning after the nearest preceding space or {@code >} (or at the beginning of the text if
 * there is none). This is converted to basic ruby markup:
 * {@code <ruby><rb>baseText</rb><rt>rubyText</rt></ruby>}. While webkit on android devices does not support ruby
 * markup yet, the text can be adjusted adequately using css.
 */
public class RubyParser {

    /**
     * Matches a single {@code baseText[rubyText]} pair.
     * <ul>
     * <li>{@code " ?"} - an optional single space in front of the base text; it is part of the match and therefore
     * dropped from the output.</li>
     * <li>group 1 - the base text: one or more characters that are neither a space nor {@code >}.</li>
     * <li>{@code (?!sound:)} - a negative lookahead so that {@code [sound:...]} media tags are left untouched. This
     * must be a lookahead and not a character class such as {@code [^(sound:)]}, which would instead reject every
     * reading starting with any one of the characters {@code ( s o u n d : )}.</li>
     * <li>group 2 - the ruby text: one or more characters up to the first closing bracket.</li>
     * </ul>
     * The class name is fully qualified because this file declares no imports.
     */
    private static final java.util.regex.Pattern RUBY_PATTERN =
            java.util.regex.Pattern.compile(" ?([^ >]+?)\\[(?!sound:)(.+?)\\]");

    /** Replacement template producing html ruby markup from the two groups of {@link #RUBY_PATTERN}. */
    private static final String RUBY_MARKUP_REPLACEMENT = "<ruby><rb>$1</rb><rt>$2</rt></ruby>";

    /** Replacement template keeping only the ruby text (group 2) of {@link #RUBY_PATTERN}. */
    private static final String READING_ONLY_REPLACEMENT = "$2";


    /**
     * Converts ruby text from the format used by Anki's Japanese support plugin to html ruby markup.
     * <p>
     * Every {@code baseText[rubyText]} pair becomes {@code <ruby><rb>baseText</rb><rt>rubyText</rt></ruby>}. A single
     * space directly in front of the base text is removed. Bracketed text starting with {@code sound:} is not treated
     * as ruby text and is passed through unchanged, as is all text that is not part of a pair.
     *
     * @param sourceText the japanese text containing ruby text
     * @return html ruby markup equivalent of the input text.
     * @throws NullPointerException if {@code sourceText} is {@code null}
     */
    public static String ankiRubyToMarkup(String sourceText) {
        return RUBY_PATTERN.matcher(sourceText).replaceAll(RUBY_MARKUP_REPLACEMENT);
    }


    /**
     * Strips kanji from ruby text in the Anki format, leaving only the reading. Used for showing the reading in the
     * question.
     * <p>
     * Every {@code baseText[rubyText]} pair (including a single space directly in front of the base text) is replaced
     * by {@code rubyText}. Bracketed text starting with {@code sound:} is passed through unchanged.
     *
     * @param sourceText the japanese text containing ruby text
     * @return text with kanji substituted by its reading
     * @throws NullPointerException if {@code sourceText} is {@code null}
     */
    public static String ankiStripKanji(String sourceText) {
        return RUBY_PATTERN.matcher(sourceText).replaceAll(READING_ONLY_REPLACEMENT);
    }
}