WordScanner.java example

Explorer
libreveris-master
- src
- target
  - generated-sources
    - java
      - omr
        ProgramId.java
//----------------------------------------------------------------------------//
//                                                                            //
//                           W o r d S c a n n e r                            //
//                                                                            //
//----------------------------------------------------------------------------//
// <editor-fold defaultstate="collapsed" desc="hdr">                          //
//  Copyright © Hervé Bitteur and others 2000-2013. All rights reserved.      //
//  This software is released under the GNU General Public License.           //
//  Goto http://kenai.com/projects/audiveris to report bugs or suggestions.   //
//----------------------------------------------------------------------------//
// </editor-fold>
package omr.text;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

/**
 * Class {@code WordScanner} is a scanner to retrieve words out of
 * a string content, while mapping each word to a sequence of TextChar
 * instances.
 *
 * @author Hervé Bitteur
 */
public abstract class WordScanner
{
    //~ Static fields/initializers ---------------------------------------------

    /** Usual logger utility */
    private static final Logger logger = LoggerFactory.getLogger(
            WordScanner.class);

    //~ Instance fields --------------------------------------------------------
    /**
     * When true, separator characters (as detected by
     * {@code BasicContent.isSeparator}) are returned as stand-alone words.
     */
    private final boolean bySyllable;

    /** The content string */
    private final String content;

    /** The current index in the content string (-1 before scanning starts) */
    private int strIndex = -1;

    /** Precise description of each (non blank) character */
    private final List<TextChar> chars;

    /** Current word and its positions in chars sequence */
    private String currentWord = null;

    private int currentWordStart = 0;

    private int currentWordStop = 0;

    /** Next word and its positions in chars sequence */
    private String nextWord = null;

    private int nextWordStart = -1;

    private int nextWordStop = -1;

    //~ Constructors -----------------------------------------------------------
    //-------------//
    // WordScanner //
    //-------------//
    /**
     * Creates a new WordScanner object.
     * <p>
     * This constructor only records its arguments. Each concrete subclass is
     * in charge of calling {@link #lookAhead()} at the end of its own
     * constructor, once its own state is ready, to load the first word.
     *
     * @param content    the string value to scan
     * @param bySyllable true for returning separators as stand-alone words
     * @param chars      the sequence of chars descriptors
     */
    public WordScanner (String content,
                        boolean bySyllable,
                        List<TextChar> chars)
    {
        this.content = content;
        this.bySyllable = bySyllable;
        this.chars = chars;
    }

    //~ Methods ----------------------------------------------------------------
    //
    //--------------//
    // getWordChars //
    //--------------//
    /**
     * Report the sequence of TextChar instances that correspond to
     * the current word.
     * <p>
     * The range bounds are clamped to the actual size of the chars sequence,
     * so that a position computed beyond the last TextChar (or an empty
     * chars sequence) results in a shorter, possibly empty, list rather than
     * an IndexOutOfBoundsException.
     *
     * @return the word sequence of TextChar's (a view on the chars sequence)
     */
    public List<TextChar> getWordChars ()
    {
        final int size = chars.size();
        final int from = Math.max(0, Math.min(currentWordStart, size));
        final int to = Math.max(from, Math.min(currentWordStop + 1, size));

        return chars.subList(from, to);
    }

    //---------//
    // hasNext //
    //---------//
    /**
     * Tell whether there is a next word.
     *
     * @return true if not finished, false otherwise
     */
    public boolean hasNext ()
    {
        return nextWord != null;
    }

    //------//
    // next //
    //------//
    /**
     * Make the next word current, and return it.
     *
     * @return the next word content, or null if there is no next word
     */
    public String next ()
    {
        // Promote 'next' as 'current'
        currentWord = nextWord;
        currentWordStart = nextWordStart;
        currentWordStop = nextWordStop;

        // Prepare the new 'next' if any
        lookAhead();

        return currentWord;
    }

    //--------------//
    // stringToDesc //
    //--------------//
    /**
     * Knowing the char strIndex in string content, determine the
     * related position in the sequence of TextChar instances
     *
     * @param strIndex strIndex in content
     * @return position in sequence of TextChar instances
     */
    protected abstract int stringToDesc (int strIndex);

    //-------------//
    // getNextWord //
    //-------------//
    /**
     * Retrieve positions for the next word, whose content is returned.
     * <p>
     * Scanning resumes right after the current index in content.
     * Space characters end the word being built (leading spaces are skipped).
     * In 'bySyllable' mode, a separator character is returned as a
     * stand-alone word.
     * The related TextChar positions are stored as the 'next' word range,
     * which {@link #next()} promotes as the current range, available through
     * {@link #getWordChars()}.
     *
     * @return the next word content, or null if the end of content has been
     *         reached without finding any word character
     */
    protected String getNextWord ()
    {
        final StringBuilder wordSb = new StringBuilder();

        for (strIndex += 1; strIndex < content.length(); strIndex++) {
            final String charValue = content.substring(strIndex, strIndex + 1);

            // Position in sequence of TextChar instances
            final int charPos = stringToDesc(strIndex);

            if (charValue.equals(" ")) {
                // White space: ends the word if any, otherwise is skipped
                if (wordSb.length() > 0) {
                    return wordSb.toString();
                }
            } else if (bySyllable && BasicContent.isSeparator(charValue)) {
                // Special characters (returned as stand-alone words)
                if (wordSb.length() > 0) {
                    strIndex--; // To get back to this index, next time
                } else {
                    nextWordStart = charPos;
                    nextWordStop = charPos;
                    wordSb.append(charValue);
                }

                return wordSb.toString();
            } else {
                // Standard word character
                if (wordSb.length() == 0) {
                    nextWordStart = charPos;
                }

                nextWordStop = charPos;
                wordSb.append(charValue);
            }
        }

        // We have reached the end
        return (wordSb.length() > 0) ? wordSb.toString() : null;
    }

    //-----------//
    // lookAhead //
    //-----------//
    /**
     * Prepare positions for the next word.
     */
    protected void lookAhead ()
    {
        nextWord = getNextWord();
    }

    //~ Inner Classes ----------------------------------------------------------
    //---------------//
    // ManualScanner //
    //---------------//
    /**
     * Class {@code ManualScanner} is a specific scanner using manual
     * text content, whose length may be different from the sequence of
     * TextChar instances.
     */
    public static class ManualScanner
            extends WordScanner
    {
        //~ Instance fields ----------------------------------------------------

        /** Ratio of number of TextChar instances / content length. */
        private final double ratio;

        /** Number of TextChar instances, used to bound computed positions. */
        private final int charCount;

        //~ Constructors -------------------------------------------------------
        /**
         * Creates a new ManualScanner object.
         *
         * @param content    the string value to scan
         * @param bySyllable true for returning separators as stand-alone words
         * @param chars      the sequence of chars descriptors
         */
        public ManualScanner (String content,
                              boolean bySyllable,
                              List<TextChar> chars)
        {
            super(content, bySyllable, chars);

            charCount = chars.size();

            // Avoid a NaN or infinite ratio on empty content
            ratio = content.isEmpty() ? 0
                    : (charCount / (double) content.length());

            // Fields above must be set before the first look-ahead
            lookAhead();
            logger.debug("ManualScanner on ''{}''", content);
        }

        //~ Methods ------------------------------------------------------------
        /**
         * Compute charPos proportionally to strIndex.
         * <p>
         * The rounded value is kept within [0 .. charCount-1], since rounding
         * may otherwise reach charCount when content is much longer than the
         * chars sequence (for example 3 characters mapped on 1 TextChar).
         */
        @Override
        protected int stringToDesc (int strIndex)
        {
            final int pos = (int) Math.rint(strIndex * ratio);

            return Math.max(0, Math.min(pos, charCount - 1));
        }
    }

    //------------//
    // OcrScanner //
    //------------//
    /**
     * Class {@code OcrScanner} is a basic scanner for which
     * the sequence of TextChar's is parallel to String content.
     *
     * @author Hervé Bitteur
     */
    public static class OcrScanner
            extends WordScanner
    {
        //~ Constructors -------------------------------------------------------

        /**
         * Creates a new OcrScanner object.
         *
         * @param content    the string value to scan
         * @param bySyllable true for returning separators as stand-alone words
         * @param chars      the sequence of chars descriptors
         */
        public OcrScanner (String content,
                           boolean bySyllable,
                           List<TextChar> chars)
        {
            super(content, bySyllable, chars);

            lookAhead();
            logger.debug("OcrScanner on ''{}''", content);
        }

        //~ Methods ------------------------------------------------------------
        /**
         * CharPos and strIndex are always equal.
         */
        @Override
        protected int stringToDesc (int strIndex)
        {
            return strIndex;
        }
    }
}