//----------------------------------------------------------------------------//
// //
// W o r d S c a n n e r //
// //
//----------------------------------------------------------------------------//
// <editor-fold defaultstate="collapsed" desc="hdr"> //
// Copyright © Hervé Bitteur and others 2000-2013. All rights reserved. //
// This software is released under the GNU General Public License. //
// Goto http://kenai.com/projects/audiveris to report bugs or suggestions. //
//----------------------------------------------------------------------------//
// </editor-fold>
package omr.text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
* Class {@code WordScanner} is a scanner to retrieve words out of
* a string content, while mapping each word to a sequence of TextChar
* instances.
*
* @author Hervé Bitteur
*/
public abstract class WordScanner
{
    //~ Static fields/initializers ---------------------------------------------

    /** Usual logger utility */
    private static final Logger logger = LoggerFactory.getLogger(
            WordScanner.class);

    //~ Instance fields --------------------------------------------------------

    /** True to return each separator character as a stand-alone word */
    private final boolean bySyllable;

    /** The content string */
    private final String content;

    /** The current index in the content string */
    private int strIndex = -1;

    /** Precise description of each (non blank) character */
    private final List<TextChar> chars;

    /** Current word and its positions in chars sequence */
    private String currentWord = null;

    private int currentWordStart = 0;

    private int currentWordStop = 0;

    /** Next word and its positions in chars sequence */
    private String nextWord = null;

    private int nextWordStart = -1;

    private int nextWordStop = -1;

    //~ Constructors -----------------------------------------------------------

    //-------------//
    // WordScanner //
    //-------------//
    /**
     * Creates a new WordScanner object.
     * <p>
     * Note that this constructor does not look ahead for the first word,
     * the nested subclasses call {@link #lookAhead()} on their own, once
     * their specific fields are set.
     *
     * @param content    the string value to scan
     * @param bySyllable true to have separator characters returned as
     *                   stand-alone words
     * @param chars      the sequence of chars descriptors
     */
    public WordScanner (String content,
                        boolean bySyllable,
                        List<TextChar> chars)
    {
        this.content = content;
        this.bySyllable = bySyllable;
        this.chars = chars;
    }

    //~ Methods ----------------------------------------------------------------

    //--------------//
    // getWordChars //
    //--------------//
    /**
     * Report the sequence of TextChar instances that correspond to
     * the current word.
     *
     * @return the word sequence of TextChar's, as a view on the range
     *         [currentWordStart .. currentWordStop] of the chars sequence
     */
    public List<TextChar> getWordChars ()
    {
        // Stop position is inclusive, hence the + 1 for subList
        return chars.subList(currentWordStart, currentWordStop + 1);
    }

    //---------//
    // hasNext //
    //---------//
    /**
     * Tell whether there is a next word.
     *
     * @return true if not finished, false otherwise
     */
    public boolean hasNext ()
    {
        return nextWord != null;
    }

    //------//
    // next //
    //------//
    /**
     * Make the next word current, and return it.
     *
     * @return the next word content, or null if there is no next word
     *         (see {@link #hasNext()})
     */
    public String next ()
    {
        // Promote 'next' as 'current'
        currentWord = nextWord;
        currentWordStart = nextWordStart;
        currentWordStop = nextWordStop;

        // Prepare the new 'next' if any
        lookAhead();

        return currentWord;
    }

    //--------------//
    // stringToDesc //
    //--------------//
    /**
     * Knowing the char strIndex in string content, determine the
     * related position in the sequence of TextChar instances
     *
     * @param strIndex strIndex in content
     * @return position in sequence of TextChar instances
     */
    protected abstract int stringToDesc (int strIndex);

    //-------------//
    // getNextWord //
    //-------------//
    /**
     * Retrieve positions for the next word, whose content is returned.
     * As a side effect, the scanning index is moved forward and the range
     * [nextWordStart .. nextWordStop] is set to the positions of the
     * related TextChar instances.
     *
     * @return the next word content, or null if the end of content has been
     *         reached
     */
    protected String getNextWord ()
    {
        StringBuilder wordSb = new StringBuilder();

        for (strIndex += 1; strIndex < content.length(); strIndex++) {
            String charValue = content.substring(strIndex, strIndex + 1);

            // Position in sequence of TextChar instances
            int charPos = stringToDesc(strIndex);

            if (charValue.equals(" ")) {
                // White space: it ends the word being built, if any,
                // otherwise it is simply skipped
                if (wordSb.length() > 0) {
                    return wordSb.toString();
                }
            } else if (bySyllable && BasicContent.isSeparator(charValue)) {
                // Special characters (returned as stand-alone words)
                if (wordSb.length() > 0) {
                    strIndex--; // To get back to this index, next time
                } else {
                    nextWordStart = charPos;
                    nextWordStop = charPos;
                    wordSb.append(charValue);
                }

                return wordSb.toString();
            } else {
                // Standard word character
                if (wordSb.length() == 0) {
                    nextWordStart = charPos;
                }

                nextWordStop = charPos;
                wordSb.append(charValue);
            }
        }

        // We have reached the end
        return (wordSb.length() > 0) ? wordSb.toString() : null;
    }

    //-----------//
    // lookAhead //
    //-----------//
    /**
     * Prepare positions for the next word.
     */
    protected void lookAhead ()
    {
        nextWord = getNextWord();
    }

    //~ Inner Classes ----------------------------------------------------------

    //---------------//
    // ManualScanner //
    //---------------//
    /**
     * Class {@code ManualScanner} is a specific scanner using manual
     * text content, whose length may be different from the sequence of
     * TextChar instances.
     */
    public static class ManualScanner
            extends WordScanner
    {
        //~ Instance fields ----------------------------------------------------

        /** Ratio of number of TextChar instances / content length. */
        private final double ratio;

        /** Highest valid position in the sequence of TextChar instances. */
        private final int lastCharPos;

        //~ Constructors -------------------------------------------------------

        /**
         * Creates a new ManualScanner object.
         *
         * @param content    the string value to scan
         * @param bySyllable true to have separator characters returned as
         *                   stand-alone words
         * @param chars      the sequence of chars descriptors
         */
        public ManualScanner (String content,
                              boolean bySyllable,
                              List<TextChar> chars)
        {
            super(content, bySyllable, chars);

            ratio = chars.size() / (double) content.length();
            lastCharPos = chars.size() - 1;

            // Must come last, since it relies on ratio and lastCharPos
            lookAhead();

            logger.debug("ManualScanner on ''{}''", content);
        }

        //~ Methods ------------------------------------------------------------

        /**
         * Compute charPos proportionally to strIndex.
         * <p>
         * The rounded value is kept within the bounds of the sequence of
         * TextChar instances: when content is much longer than the sequence,
         * rounding the last string indices would otherwise point just past
         * the last TextChar.
         *
         * @param strIndex strIndex in content
         * @return position in sequence of TextChar instances
         */
        @Override
        protected int stringToDesc (int strIndex)
        {
            final int charPos = (int) Math.rint(strIndex * ratio);

            return Math.max(0, Math.min(charPos, lastCharPos));
        }
    }

    //------------//
    // OcrScanner //
    //------------//
    /**
     * Class {@code OcrScanner} is a basic scanner for which
     * the sequence of TextChar's is parallel to String content.
     *
     * @author Hervé Bitteur
     */
    public static class OcrScanner
            extends WordScanner
    {
        //~ Constructors -------------------------------------------------------

        /**
         * Creates a new OcrScanner object.
         *
         * @param content    the string value to scan
         * @param bySyllable true to have separator characters returned as
         *                   stand-alone words
         * @param chars      the sequence of chars descriptors
         */
        public OcrScanner (String content,
                           boolean bySyllable,
                           List<TextChar> chars)
        {
            super(content, bySyllable, chars);

            lookAhead();

            logger.debug("OcrScanner on ''{}''", content);
        }

        //~ Methods ------------------------------------------------------------

        /**
         * CharPos and strIndex are always equal.
         *
         * @param strIndex strIndex in content
         * @return the very same value, used as position in sequence of
         *         TextChar instances
         */
        @Override
        protected int stringToDesc (int strIndex)
        {
            return strIndex;
        }
    }
}