TextBuilder.java example

Explorer
libreveris-master
- src
- target
  - generated-sources
    - java
      - omr
        ProgramId.java
//----------------------------------------------------------------------------//
//                                                                            //
//                           T e x t B u i l d e r                            //
//                                                                            //
//----------------------------------------------------------------------------//
// <editor-fold defaultstate="collapsed" desc="hdr">                          //
//  Copyright © Hervé Bitteur and others 2000-2013. All rights reserved.      //
//  This software is released under the GNU General Public License.           //
//  Goto http://kenai.com/projects/audiveris to report bugs or suggestions.   //
//----------------------------------------------------------------------------//
// </editor-fold>
package omr.text;

import omr.constant.Constant;
import omr.constant.ConstantSet;

import omr.glyph.Shape;
import omr.glyph.facets.Glyph;

import omr.lag.Section;

import omr.math.LineUtil;

import omr.score.entity.Page;

import omr.sheet.Scale;
import omr.sheet.SystemInfo;

import omr.text.tesseract.TesseractOCR;

import omr.util.GeoUtil;
import omr.util.LiveParam;
import omr.util.WrappedBoolean;
import omr.util.XmlUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.awt.Point;
import java.awt.Rectangle;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * Class {@code TextBuilder} provides features to check, build and
 * reorganize text items, including interacting with the OCR engine.
 *
 * @author Hervé Bitteur
 */
public class TextBuilder
{
    //~ Static fields/initializers ---------------------------------------------

    /** Specific application parameters (confidence and ratio thresholds). */
    private static final Constants constants = new Constants();

    /** Usual logger utility. */
    private static final Logger logger = LoggerFactory.getLogger(TextBuilder.class);

    /** The related OCR engine, shared by all instances (see {@link #getOcr()}). */
    private static final OCR ocr = TesseractOCR.getInstance();

    /**
     * Abnormal characters: a word whose value contains any of them is
     * rejected by {@code isValid(TextWord)}. Currently only the backslash.
     */
    private static final char[] ABNORMAL_CHARS = new char[]{'\\'};

    /**
     * Regexp for abnormal words: a word whose whole value matches it is
     * rejected by {@code isValid(TextWord)}.
     * May be null, in which case this check is simply skipped.
     */
    private static final Pattern ABNORMAL_WORDS = getAbnormalWords();

    //~ Instance fields --------------------------------------------------------
    //
    /** Related system, the one all lines and glyphs are handled for. */
    private final SystemInfo system;

    /** Scale-dependent parameters, built from the scale of the system sheet. */
    private final Parameters params;

    //~ Constructors -----------------------------------------------------------
    //
    //-------------//
    // TextBuilder //
    //-------------//
    /**
     * Build a TextBuilder dedicated to the provided system.
     * <p>
     * The scale-dependent parameters are derived from the scale of the sheet
     * which contains the system.
     *
     * @param system the dedicated system
     */
    public TextBuilder (SystemInfo system)
    {
        this.system = system;
        this.params = new Parameters(system.getSheet().getScale());
    }

    //~ Methods ----------------------------------------------------------------
    //
    //--------//
    // getOcr //
    //--------//
    /**
     * Give access to the OCR engine shared by all TextBuilder instances.
     *
     * @return the OCR engine, or null if none is available
     */
    public static OCR getOcr ()
    {
        return TextBuilder.ocr;
    }

    //----------------//
    // isMainlyItalic //
    //----------------//
    /**
     * Tell whether italic words prevail in the provided line.
     * <p>
     * Only "reliable" words are considered: those whose confidence reaches
     * the minimum confidence constant and whose length exceeds 1.
     * A tie between italic and non-italic reliable words counts as italic.
     *
     * @param line the line to check
     * @return true if at least half of the reliable words are italic,
     *         false otherwise (including when no word is reliable)
     */
    public boolean isMainlyItalic (TextLine line)
    {
        int reliable = 0;
        int italic = 0;

        for (TextWord word : line.getWords()) {
            final boolean isReliable =
                    (word.getConfidence() >= constants.minConfidence.getValue())
                    && (word.getLength() > 1);

            if (!isReliable) {
                continue;
            }

            reliable++;

            if (word.getFontInfo().isItalic) {
                italic++;
            }
        }

        // Without any reliable word, there is no evidence of italics
        return (reliable != 0) && ((2 * italic) >= reliable);
    }

    //---------//
    // isValid //
    //---------//
    /**
     * Check the validity of a line as provided by the OCR.
     * <p>
     * The line is rejected if its confidence is unknown or below the minimum
     * confidence constant, if one of its words uses a too big font, or if
     * the ratio of invalid words exceeds the maximum invalid ratio constant.
     *
     * @param textLine the ocr output
     * @return true if valid, false otherwise
     */
    public boolean isValid (TextLine textLine)
    {
        // Confidence
        final Integer confidence = textLine.getConfidence();
        final int threshold = constants.minConfidence.getValue();
        final boolean confident = (confidence != null)
                                  && (confidence >= threshold);

        if (!confident) {
            logger.debug("      Too low confidence {} vs {} for {}",
                    confidence, threshold, textLine);

            return false;
        }

        // Font size
        if (!isValidFontSize(textLine)) {
            return false;
        }

        // Count the words that fail the word-level check
        int invalid = 0;

        for (TextWord word : textLine.getWords()) {
            if (!isValid(word)) {
                invalid++;
            }
        }

        // NOTA: with no word at all, the ratio is NaN and the comparison
        // below is false, hence a word-less line is reported as not valid
        final double ratio = (double) invalid / textLine.getWords().size();

        return ratio <= constants.maxInvalidRatio.getValue();
    }

    //---------//
    // isValid //
    //---------//
    /**
     * Check the validity of a word as provided by the OCR.
     * <p>
     * A word is rejected when its value contains one of the abnormal
     * characters, when it contains characters that the XML utility had to
     * strip, or when its whole value matches the pattern of abnormal words
     * (if such pattern is defined).
     *
     * @param word the word to check
     * @return true if valid, false otherwise
     */
    public boolean isValid (TextWord word)
    {
        final String text = word.getValue();

        // Abnormal characters
        for (char abnormal : ABNORMAL_CHARS) {
            if (text.indexOf(abnormal) >= 0) {
                logger.debug("Abnormal char {} in {}", abnormal, word);

                return false;
            }
        }

        // Characters not valid for XML (the flag is set by the stripper)
        final WrappedBoolean xmlStripped = new WrappedBoolean(false);
        XmlUtil.stripNonValidXMLCharacters(text, xmlStripped);

        if (xmlStripped.isSet()) {
            logger.warn("Invalid XML chars in {}", word);

            return false;
        }

        // Abnormal word values, only if a pattern is available
        if ((ABNORMAL_WORDS != null)
            && ABNORMAL_WORDS.matcher(text).matches()) {
            logger.debug("Abnormal word value {}", word);

            return false;
        }

        // Disabled check on word aspect ratio, kept for reference
//        Rectangle box = word.getBounds();
//        String str = word.getValue();
//        Font font = new TextFont(word.getFontInfo());
//        TextLayout layout = new TextLayout(str, font, frc);
//        Rectangle2D rect = layout.getBounds();
//        double xRatio = box.width / rect.getWidth();
//        double yRatio = box.height / rect.getHeight();
//        double aRatio = yRatio / xRatio;
////        logger.debug("{} xRatio:{} yRatio:{} aRatio:{}", textLine,
////                    (float) xRatio, (float) yRatio, aRatio);
//
//        // Sign of something wrong
//        if ((aRatio < constants.minAspectRatio.getValue())
//                || (aRatio > constants.maxAspectRatio.getValue())) {
//            logger.debug("      Invalid ratio {} vs [{}-{}] for {}",
//                        aRatio,
//                        constants.minAspectRatio.getValue(),
//                        constants.maxAspectRatio.getValue(), word);
//            return false;
//        }
//
        return true;
    }

    //-----------------//
    // isValidFontSize //
    //-----------------//
    /**
     * Check that no word of the provided line uses a font whose point size
     * exceeds the maximum font size parameter.
     *
     * @param textLine the line to check
     * @return false as soon as one word exhibits a too big font,
     *         true otherwise
     */
    public boolean isValidFontSize (TextLine textLine)
    {
        for (TextWord word : textLine.getWords()) {
            final FontInfo info = word.getFontInfo();
            final boolean tooBig = info.pointsize > params.maxFontSize;

            if (tooBig) {
                logger.debug("Too big font {} vs {} on {}",
                        info.pointsize, params.maxFontSize, textLine);

                return false;
            }
        }

        return true;
    }

    //-----------//
    // mapGlyphs //
    //-----------//
    /**
     * Map each word of the provided lines to a glyph built from the
     * sections it encloses, and register each line as a system sentence.
     * <p>
     * Words for which no section can be found are removed from their line.
     * Once all lines are handled, system sentences are purged.
     *
     * @param lines       the lines (and contained words) to be mapped
     * @param allSections the population of sections to browse
     * @param language    the OCR language specification
     */
    public void mapGlyphs (List<TextLine> lines,
                           Collection<Section> allSections,
                           String language)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} mapGlyphs", system.idString());
        }

        // Reset the "processed" flag, which is then used to guarantee that
        // a given section is assigned to at most one word
        for (Section section : allSections) {
            section.setProcessed(false);
        }

        for (TextLine line : lines) {
            logger.debug("   mapping {}", line);

            // Work on a sorted copy of the words (shorter ones first)
            final List<TextWord> bySize = new ArrayList<>(line.getWords());
            Collections.sort(bySize, TextWord.bySize);

            final List<TextWord> orphans = new ArrayList<>();

            for (TextWord word : bySize) {
                // Sections enclosed by the chars of this word
                final SortedSet<Section> sections = retrieveSections(
                        word.getChars(),
                        allSections);

                if (sections.isEmpty()) {
                    logger.debug("No section found for {}", word);
                    orphans.add(word);

                    continue;
                }

                final Glyph glyph = system.addGlyph(
                        system.buildGlyph(sections));

                // Link TextWord -> Glyph
                word.setGlyph(glyph);

                if (word.isVip()) {
                    line.setVip();
                }

                if (word.isVip() || logger.isDebugEnabled()) {
                    logger.info("      mapped {}", word);
                }

                // Link Glyph -> TextWord
                glyph.setTextWord(language, word);
            }

            // Get rid of words left without any section
            line.removeWords(orphans);

            // Shape depends on the number of characters in the word value
            for (TextWord word : line.getWords()) {
                final Glyph glyph = word.getGlyph();

                if (glyph == null) {
                    continue;
                }

                final Shape shape = (word.getValue().length() > 1)
                        ? Shape.TEXT : Shape.CHARACTER;
                glyph.setShape(shape);
            }

            logger.debug("  mapGlyphs adding {}", line);
            system.getSentences().add(line);
        }

        // Purge duplications, if any, in system sentences
        purgeSentences();
    }

    //----------------//
    // purgeSentences //
    //----------------//
    /**
     * Clean up the system sentences.
     * <p>
     * In every sentence, a word is discarded when it has no glyph or when
     * its glyph does not point back to this very word.
     * A sentence left with no word is then removed from the system.
     */
    public void purgeSentences ()
    {
        final Iterator<TextLine> lines = system.getSentences().iterator();

        while (lines.hasNext()) {
            final TextLine line = lines.next();
            final List<TextWord> stale = new ArrayList<>();

            for (TextWord word : line.getWords()) {
                final Glyph glyph = word.getGlyph();
                final boolean linked = (glyph != null)
                                       && (glyph.getTextWord() == word);

                if (!linked) {
                    logger.debug("{} purging old {}", system.idString(), word);
                    stale.add(word);
                }
            }

            if (!stale.isEmpty()) {
                line.removeWords(stale);
            }

            // Removal goes through the iterator, to keep the iteration safe
            if (line.getWords().isEmpty()) {
                logger.debug("{} purging empty {}", system.idString(), line);
                lines.remove();
            }
        }
    }

    //---------------//
    // dumpSentences //
    //---------------//
    /**
     * Debugging aid which logs, at info level, the number of sentences
     * currently in the system, followed by one line per sentence.
     *
     * @param title a label written at the beginning of the first log line
     */
    public void dumpSentences (String title)
    {
        final Set<TextLine> all = system.getSentences();

        logger.info("{} {} sentences: {}", title, system.idString(), all.size());

        for (final TextLine item : all) {
            logger.info("   {}", item);
        }
    }

    //------------//
    // mergeLines //
    //------------//
    /**
     * Build a single TextLine out of the words of several TextLine
     * instances.
     * <p>
     * Each provided line is flagged as processed, and the gathered words are
     * sorted by abscissa before the new line is created.
     *
     * @param lines the lines to merge
     * @return a single TextLine
     */
    public TextLine mergeLines (List<TextLine> lines)
    {
        final List<TextWord> gathered = new ArrayList<>();

        for (final TextLine source : lines) {
            source.setProcessed(true);
            gathered.addAll(source.getWords());
        }

        Collections.sort(gathered, TextWord.byAbscissa);

        final TextLine merged = new TextLine(system, gathered);

        return merged;
    }

    //----------------//
    // recomposeLines //
    //----------------//
    /**
     * Check and modify the provided raw TextLine instances for correct
     * composition.
     *
     * <ul>
     * <li>Except for lyrics lines, a too large inter-word gap triggers a
     * line split</li>
     * <li>A too small inter-word gap triggers a word merge</li>
     * <li>For lyrics, separate lines with similar ordinate trigger a line
     * merge</li>
     * <li>For lyrics, a separation character triggers a word split into
     * syllables</li>
     * </ul>
     *
     * @param oldLines the lines to process
     * @return the new sequence of recomposed lines, lyrics and standards
     *         together, sorted by ordinate
     */
    public List<TextLine> recomposeLines (Collection<TextLine> oldLines)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} recomposeLines", system.idString());
        }

        // Dispatch the lines between the two populations
        List<TextLine> lyricsLines = new ArrayList<>();
        List<TextLine> standardLines = new ArrayList<>();
        separatePopulations(oldLines, standardLines, lyricsLines);

        // Lyrics: purge, merge chunks, then split words
        lyricsLines = purgeInvalidLines(lyricsLines);
        lyricsLines = mergeLyricsLines(lyricsLines);

        if (logger.isDebugEnabled()) {
            logger.info("{} splitWords for lyrics", system.idString());
        }

        for (TextLine lyricsLine : lyricsLines) {
            splitWords(lyricsLine.getWords(), lyricsLine);
        }

        // Standards: first split, then discard the invalid lines
        standardLines = splitStandardLines(standardLines);
        standardLines = purgeInvalidLines(standardLines);

        // Standards: recut lines (merge then split again)
        standardLines = mergeStandardLines(standardLines);
        standardLines = splitStandardLines(standardLines);

        // Standards: recut words
        for (TextLine standardLine : standardLines) {
            recutStandardWords(standardLine);
        }

        // Final result: lyrics followed by standards, sorted by ordinate
        final List<TextLine> result = new ArrayList<>(lyricsLines);
        result.addAll(standardLines);
        Collections.sort(result, TextLine.byOrdinate);

        return result;
    }

    //---------------------//
    // separatePopulations //
    //---------------------//
    /**
     * Dispatch the provided lines between lyrics lines and standard
     * (non-lyrics) lines.
     * <p>
     * A line whose trimmed value is empty goes to neither population and is
     * flagged as processed. Any other line is flagged as not processed, and
     * a standard line gets its role reset to null.
     *
     * @param lines     the global population
     * @param standards (output) the non-lyrics population
     * @param lyrics    (output) the lyrics population
     */
    private void separatePopulations (Collection<TextLine> lines,
                                      List<TextLine> standards,
                                      List<TextLine> lyrics)
    {
        for (TextLine line : lines) {
            final boolean blank = line.getValue().trim().isEmpty();

            if (blank) {
                logger.debug("Empty line {}", line);
                line.setProcessed(true);

                continue;
            }

            line.setProcessed(false);

            if (!line.isLyrics()) {
                line.setRole(null); // To force role recomputing later
                standards.add(line);
            } else {
                lyrics.add(line);
            }

            if (logger.isDebugEnabled()) {
                logger.info("   Initial {}", line);

                for (TextWord word : line.getWords()) {
                    logger.debug("      {}", word);
                }
            }
        }
    }

    //-----------------//
    // retrieveOcrLine //
    //-----------------//
    /**
     * Run the OCR engine on the image of the provided glyph, in single
     * block layout mode, to retrieve the TextLine instance(s) this glyph
     * represents.
     * <p>
     * The OCR call is labelled with the system id and the glyph id, and the
     * top left corner of the glyph bounds is passed along with the image.
     *
     * @param glyph    the glyph to OCR
     * @param language the probable language
     * @return a list, not null but perhaps empty, of TextLine instances with
     *         absolute coordinates.
     */
    public List<TextLine> retrieveOcrLine (Glyph glyph,
                                           String language)
    {
        final String label = "s" + glyph.getSystem().getId()
                             + "-g" + glyph.getId();
        final OCR engine = getOcr();

        return engine.recognize(
                glyph.getImage(),
                glyph.getBounds().getLocation(),
                language,
                OCR.LayoutMode.SINGLE_BLOCK,
                system,
                label);
    }

    //------------------//
    // retrieveSections //
    //------------------//
    /**
     * Collect the sections which lie within the bounds of the provided
     * TextChar instances.
     * <p>
     * Only sections not yet flagged as processed are considered, and any
     * section collected here gets flagged as processed, so that it cannot
     * be collected a second time.
     *
     * @param chars       the OCR char descriptors
     * @param allSections the candidate sections
     * @return the corresponding set of sections, perhaps empty
     */
    public SortedSet<Section> retrieveSections (List<TextChar> chars,
                                                Collection<Section> allSections)
    {
        final SortedSet<Section> found = new TreeSet<>();

        for (TextChar textChar : chars) {
            final Rectangle box = textChar.getBounds();

            for (Section candidate : allSections) {
                // Already assigned?
                if (candidate.isProcessed()) {
                    continue;
                }

                // The section must be fully contained in the char box
                if (box.contains(candidate.getBounds())) {
                    found.add(candidate);
                    candidate.setProcessed(true);
                }
            }
        }

        return found;
    }

    //-------------//
    // mergeChunks //
    //-------------//
    /**
     * Merge line chunks horizontally.
     * <p>
     * The provided list is sorted by abscissa in place. A lone chunk is
     * returned as is, otherwise all chunks are merged into one line.
     *
     * @param chunks the (sub) lines to merge
     * @return the resulting merged line
     */
    private TextLine mergeChunks (List<TextLine> chunks)
    {
        Collections.sort(chunks, TextLine.byAbscissa);

        // Nothing to merge with a single chunk
        if (chunks.size() == 1) {
            return chunks.get(0);
        }

        if (logger.isDebugEnabled()) {
            for (TextLine chunk : chunks) {
                logger.debug("   chunk {}", chunk);
            }
        }

        final TextLine merged = mergeLines(chunks);

        if (merged.isVip() || logger.isDebugEnabled()) {
            logger.info("      merge result {}", merged);
        }

        return merged;
    }

    //------------------//
    // mergeLyricsLines //
    //------------------//
    /**
     * For lyrics, merge the line chunks which have similar ordinates.
     * <p>
     * The provided list is sorted by ordinate in place, then browsed
     * top down: a chunk joins the pending group as long as its (deskewed)
     * ordinate is not farther than maxLyricsDy from the previous chunk,
     * otherwise the pending group is merged and a new group is started.
     *
     * @param oldLyrics collection of lyrics chunks
     * @return resulting lyrics lines
     */
    private List<TextLine> mergeLyricsLines (List<TextLine> oldLyrics)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} mergeLyricsLines", system.idString());
        }

        final List<TextLine> merged = new ArrayList<>();
        Collections.sort(oldLyrics, TextLine.byOrdinate);

        final List<TextLine> pending = new ArrayList<>();
        double lastY = 0;

        for (TextLine line : oldLyrics) {
            final double y = line.getDskOrigin().getY();
            final boolean compatible = pending.isEmpty()
                                       || ((y - lastY) <= params.maxLyricsDy);

            if (!compatible) {
                // Flush the pending group, which cannot be empty here
                merged.add(mergeChunks(pending));
                pending.clear();
            }

            pending.add(line);
            lastY = y;
        }

        // Flush the last pending group, if any
        if (!pending.isEmpty()) {
            merged.add(mergeChunks(pending));
        }

        return merged;
    }

    //--------------------//
    // mergeStandardLines //
    //--------------------//
    /**
     * For standards, separate lines with similar ordinate and small
     * abscissa gap trigger a line merge.
     * <p>
     * The provided list is sorted by ordinate in place. Each line, in turn,
     * is checked against the lines that precede it in the sorted list
     * ("heads"). When the deskewed core of a head intersects the fattened
     * core of the candidate, the words of the candidate are added to the
     * head, the candidate is flagged as processed, and the head becomes the
     * new candidate for further merging.
     * <p>
     * A head which is a chord item is merged only if the actual abscissa gap
     * with the candidate does not exceed maxChordDx.
     *
     * @param oldStandards collection of standard candidates
     * @return resulting standard lines (those not merged into another one)
     */
    private List<TextLine> mergeStandardLines (List<TextLine> oldStandards)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} mergeStandardLines", system.idString());
        }
        Collections.sort(oldStandards, TextLine.byOrdinate);

        for (TextLine current : oldStandards) {
            current.setProcessed(false);
            TextLine candidate = current;

            CandidateLoop:
            while (true) {
                // Candidate core, fattened by the word gap horizontally and
                // by maxLyricsDy vertically
                final Rectangle candidateBounds = getDeskewedCore(candidate);
                final Rectangle candidateFatBox = new Rectangle(candidateBounds);
                candidateFatBox.grow(getWordGap(candidate), params.maxLyricsDy);

                HeadsLoop:
                for (TextLine head : oldStandards) {
                    // Only lines located before current one are considered
                    if (head == current) {
                        break CandidateLoop;
                    }
                    if (head != candidate && !head.isProcessed()) {
                        Rectangle headBounds = getDeskewedCore(head);
                        if (headBounds.intersects(candidateFatBox)) {
                            if (head.isChord()) {
                                // Check actual dx between head & candidate:
                                // beyond the maximum, chord items are kept
                                // apart, so this head is skipped
                                int gap = GeoUtil.xGap(headBounds, candidateBounds);
                                if (gap > params.maxChordDx) {
                                    continue;
                                }
                            }

                            if (candidate.isVip() || head.isVip()
                                || logger.isDebugEnabled()) {
                                logger.info("   merging {} into {}",
                                        candidate, head);
                            }

                            // Merge candidate into head, and go on with head
                            head.addWords(candidate.getWords());
                            candidate.setProcessed(true);
                            candidate = head;
                            break HeadsLoop;
                        }
                    }
                }

            }
        }

        // Remove unavailable lines
        List<TextLine> newStandards = new ArrayList<>();
        for (TextLine line : oldStandards) {
            if (!line.isProcessed()) {
                newStandards.add(line);
            }
        }

        return newStandards;
    }

    //-----------------//
    // getDeskewedCore //
    //-----------------//
    /**
     * Build the rectangle which spans the deskewed baseline of the provided
     * line, from its deskewed origin to the deskewed end point of its
     * baseline, with coordinates rounded to integers and a height of at
     * least 1 pixel.
     *
     * @param line the TextLine entity
     * @return the deskewed core
     */
    private Rectangle getDeskewedCore (TextLine line)
    {
        final Point2D start = line.getDskOrigin();
        final Point2D stop = system.getSkew().deskewed(
                line.getBaseline().getP2());

        // Empty rectangle located on start point, then extended to stop point
        final Rectangle core = new Rectangle(
                (int) Math.rint(start.getX()),
                (int) Math.rint(start.getY()),
                0,
                0);
        core.add((int) Math.rint(stop.getX()), (int) Math.rint(stop.getY()));

        // To allow containment test
        if (core.height < 1) {
            core.height = 1;
        }

        return core;
    }

    //--------------------//
    // recutStandardWords //
    //--------------------//
    /**
     * Recut the words of a standard TextLine: first merge the words
     * separated by a tiny gap, then split the resulting words.
     *
     * @param line the line to recut words
     */
    public void recutStandardWords (TextLine line)
    {
        // Step 1: merge
        mergeStandardWords(line);

        // Step 2: split, working on the words as they are after the merge
        final List<TextWord> currentWords = line.getWords();
        splitWords(currentWords, line);
    }

    //--------------------//
    // mergeStandardWords //
    //--------------------//
    /**
     * Within the provided line, merge any two consecutive words whose
     * horizontal gap is smaller than minWordDx.
     * <p>
     * Merges can be chained: the result of a merge is used as the previous
     * word when checking the next one. Modifications are applied to the line
     * only once all its words have been browsed.
     *
     * @param line the line whose words are to be merged
     */
    private void mergeStandardWords (TextLine line)
    {
        logger.debug("   mergeLineWords for {}", line);

        final List<TextWord> created = new ArrayList<>();
        final List<TextWord> obsolete = new ArrayList<>();
        TextWord previous = null;

        for (TextWord word : line.getWords()) {
            TextWord latest = word;

            // Look for tiny inter-word gap
            if (previous != null) {
                final Rectangle prevBox = previous.getBounds();
                final int prevEnd = prevBox.x + prevBox.width;
                final int gap = word.getBounds().x - prevEnd;
                logger.debug("      gap {} vs {} to {}",
                        gap, params.minWordDx, word);

                if (gap < params.minWordDx) {
                    obsolete.add(previous);
                    obsolete.add(word);

                    final TextWord merged = TextWord.mergeOf(previous, word);
                    logger.debug("         merged into {}", merged);
                    created.add(merged);
                    latest = merged;
                }
            }

            previous = latest;
        }

        if (created.isEmpty()) {
            return;
        }

        // Intermediate results of chained merges appear in both lists,
        // there is no use to add & remove the same words
        final List<TextWord> both = new ArrayList<>(created);
        both.retainAll(obsolete);
        created.removeAll(both);
        obsolete.removeAll(both);

        // Perform the modifications
        line.addWords(created);
        line.removeWords(obsolete);
    }

    //------------//
    // splitWords //
    //------------//
    /**
     * Check each word in the provided collection and split it in place
     * according to separating characters ('-' etc).
     * <p>
     * A word with no glyph is scanned on its OCR value. A word with a glyph
     * is scanned only if the glyph text value differs from the word internal
     * value (manual modification) and the word has chars; in that case, if
     * the scan gives no subword, a new word is allocated to match the manual
     * value.
     * <p>
     * The line sequence of words may get modified, because of the addition
     * of new (sub)words and the removal of words that got split.
     * The line sequence of words remains sorted.
     *
     * @param words the collection of words to check and split
     * @param line  the containing TextLine instance
     */
    public void splitWords (Collection<TextWord> words,
                            TextLine line)
    {
        // Modifications are deferred, to avoid concurrent modification errors
        final Collection<TextWord> created = new ArrayList<>();
        final Collection<TextWord> obsolete = new ArrayList<>();

        for (TextWord word : words) {
            final Glyph glyph = word.getGlyph();
            List<TextWord> parts = null; // Results of split

            if (glyph == null) {
                // Plain OCR value
                parts = getSubWords(word,
                        line,
                        new WordScanner.OcrScanner(
                        word.getValue(),
                        line.isLyrics(),
                        word.getChars()));
            } else if (!glyph.getTextValue().equals(word.getInternalValue())
                       && !word.getChars().isEmpty()) {
                // A manual text modification has occurred
                // Check for a separator in the new manual value
                logger.debug("Manual modif for {}", glyph.idString());
                parts = getSubWords(word,
                        line,
                        new WordScanner.ManualScanner(
                        glyph.getTextValue(),
                        line.isLyrics(),
                        word.getChars()));

                // If no subdivision was made, allocate a new TextWord
                // just to match the new manual value
                if (parts.isEmpty()) {
                    final TextWord manualWord = new TextWord(
                            word.getBaseline(),
                            glyph.getTextValue(),
                            word.getFontInfo(),
                            word.getConfidence(),
                            word.getChars(),
                            line);
                    manualWord.setGlyph(glyph);
                    parts.add(manualWord);
                    glyph.setTextWord(glyph.getOcrLanguage(), manualWord);
                }
            }

            final boolean replaced = (parts != null) && !parts.isEmpty();

            if (replaced) {
                obsolete.add(word);
                created.addAll(parts);
            }
        }

        // Now perform modification on the line sequence of words, if so needed
        if (!obsolete.isEmpty()) {
            line.addWords(created);
            line.removeWords(obsolete);
        }
    }

    //--------------------//
    // splitStandardLines //
    //--------------------//
    /**
     * For standard (non-lyrics) lines, a really wide gap between two
     * words indicates the need to split the line in two.
     * <p>
     * The provided list is sorted by ordinate in place. Each line is then
     * browsed for an inter-word gap larger than the word gap computed for
     * the line: at every such gap, the words located before the gap are
     * moved to a new line which inherits the role of the original line.
     * A line with no such gap is kept as is.
     *
     * @param oldStandards collection of initial standard lines
     * @return resulting standard lines
     */
    private List<TextLine> splitStandardLines (List<TextLine> oldStandards)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} splitStandardLines", system.idString());
        }
        Collections.sort(oldStandards, TextLine.byOrdinate);

        List<TextLine> newStandards = new ArrayList<>();

        for (TextLine line : oldStandards) {
            if (line.isVip() || logger.isDebugEnabled()) {
                logger.info("split checking {}", line);
            }

            // Gap threshold is computed once, on the original line
            final int maxAbscissaGap = getWordGap(line);
            // Words still to be checked (shrinks after each split)
            List<TextWord> words = line.getWords();
            boolean splitting = true;

            // Each pass looks for the first huge gap in the remaining words
            while (splitting) {
                splitting = false;

                // Look for huge inter-word gap
                // Right side of previous word (null for the first word)
                Integer stop = null;

                for (TextWord word : words) {
                    Rectangle bounds = word.getBounds();

                    if (stop != null) {
                        int gap = bounds.x - stop;

                        if (gap > maxAbscissaGap) {
                            // Words before the gap go to a new line
                            // (subList is a view on the current words list)
                            int splitPos = words.indexOf(word);
                            List<TextWord> lineWords = words.subList(0,
                                    splitPos);
                            TextLine newLine = new TextLine(system, lineWords);
                            newLine.setRole(line.getRole());
                            if (line.isVip() || logger.isDebugEnabled()) {
                                logger.info("      subLine {}", newLine);
                            }
                            newStandards.add(newLine);

                            // Go on with the words after the gap
                            words = words.subList(splitPos, words.size());
                            splitting = true;

                            break;
                        }
                    }

                    stop = bounds.x + bounds.width;
                }
            }

            // Pending words?
            // Fewer words than in the original line means that at least one
            // split occurred, and the remaining words need their own line
            if (words.size() < line.getWords().size()) {
                TextLine newLine = new TextLine(system, words);
                newLine.setRole(line.getRole());
                if (line.isVip() || logger.isDebugEnabled()) {
                    logger.info("      subLine {}", newLine);
                }
                newStandards.add(newLine);
            } else {
                // No split, the line is kept untouched
                newStandards.add(line);
            }
        }

        return newStandards;
    }

    //-------------//
    // getSubWords //
    //-------------//
    /**
     * Collect the sub-words that the provided scanner extracts from the
     * provided word, the scanner adapting to Ocr or Manual values.
     * <p>
     * Only the scanned values strictly shorter than the whole word value are
     * retained. Each retained value gets its own baseline, built from the
     * intersections of the word baseline with the vertical at the left side
     * of its first char and the vertical at the right side of its last char.
     *
     * @param word    the word to process
     * @param line    the containing line
     * @param scanner how to scan the word
     * @return the sequence of created (sub)words, perhaps empty
     */
    private List<TextWord> getSubWords (TextWord word,
                                        TextLine line,
                                        WordScanner scanner)
    {
        final int fullLength = word.getValue().length();
        final List<TextWord> result = new ArrayList<>();

        while (scanner.hasNext()) {
            final String value = scanner.next();

            if (value.length() >= fullLength) {
                // As long as the whole word: not a real subword, skip it
                continue;
            }

            final List<TextChar> chars = scanner.getWordChars();
            final Line2D wordBase = word.getBaseline();

            // Baseline point at the left side of the first char
            final int xLeft = chars.get(0).getBounds().x;
            final Point2D start = LineUtil.intersection(
                    wordBase.getP1(),
                    wordBase.getP2(),
                    new Point2D.Double(xLeft, 0),
                    new Point2D.Double(xLeft, 100));

            // Baseline point at the right side of the last char
            final Rectangle lastBox = chars.get(chars.size() - 1).getBounds();
            final int xRight = (lastBox.x + lastBox.width) - 1;
            final Point2D stop = LineUtil.intersection(
                    wordBase.getP1(),
                    wordBase.getP2(),
                    new Point2D.Double(xRight, 0),
                    new Point2D.Double(xRight, 100));

            final Line2D partBase = new Line2D.Double(start, stop);

            // The sub word inherits font info and confidence from the word
            final TextWord part = new TextWord(
                    partBase,
                    value,
                    word.getFontInfo(),
                    word.getConfidence(),
                    chars,
                    line);

            logger.debug("      subWord ''{}'' from ''{}''",
                    part.getValue(), word.getValue());
            result.add(part);
        }

        return result;
    }

    //-------------------//
    // purgeInvalidLines //
    //-------------------//
    /**
     * Filter out the lines that fail the validity check.
     * <p>
     * Each rejected line is flagged as processed, and its words are traced
     * when debug logging is enabled.
     *
     * @param lines the lines to purge
     * @return the remaining lines, in their original order
     */
    private List<TextLine> purgeInvalidLines (List<TextLine> lines)
    {
        if (logger.isDebugEnabled()) {
            logger.info("{} purgeInvalidLines", system.idString());
        }

        final List<TextLine> kept = new ArrayList<>();

        for (TextLine candidate : lines) {
            logger.debug("   checking {}", candidate);

            if (isValid(candidate)) {
                kept.add(candidate);

                continue;
            }

            // Invalid line: discard it and mark it as already processed
            candidate.setProcessed(true);

            if (logger.isDebugEnabled()) {
                for (TextWord rejected : candidate.getWords()) {
                    logger.debug("      {}", rejected);
                }
            }
        }

        return kept;
    }

    //---------------------//
    // switchLanguageTexts //
    //---------------------//
    /**
     * Run OCR again on every sentence of the system, using the target
     * text language of the page, and update the existing words for which
     * the new OCR result has an equal or higher confidence.
     * <p>
     * The target language is first recorded as the actual language of the
     * page text parameter. A sentence is left untouched when OCR does not
     * deliver exactly one line for it.
     */
    public void switchLanguageTexts ()
    {
        final LiveParam<String> textParam = system.getSheet().getPage()
                .getTextParam();
        final String language = textParam.getTarget();

        if (logger.isDebugEnabled()) {
            logger.info("{} switchLanguageTexts lan:{}",
                    system.idString(), language);
        }

        textParam.setActual(language);

        // Work on a copy of the system sentences
        for (TextLine sentence : new ArrayList<>(system.getSentences())) {
            // OCR is launched on the whole line image: either the single
            // word glyph, or a compound registered from all word glyphs
            final List<Glyph> glyphs = sentence.getWordGlyphs();
            final Glyph compound;

            if (glyphs.size() == 1) {
                compound = glyphs.get(0);
            } else {
                compound = system.registerGlyph(
                        system.buildTransientCompound(glyphs));
            }

            final List<TextLine> ocrLines = retrieveOcrLine(compound, language);

            if ((ocrLines == null) || (ocrLines.size() != 1)) {
                logger.debug("{} No valid replacement for {}",
                        system.idString(), sentence);

                continue;
            }

            final TextLine newLine = ocrLines.get(0);
            recutStandardWords(newLine);

            if (logger.isDebugEnabled()) {
                logger.info("{} refreshing {} by {}",
                        system.idString(), sentence, newLine);
                sentence.dump();
                newLine.dump();
            }

            // Pair each old word with its counterpart in the new line
            final List<TextWord> obsolete = new ArrayList<>();
            final List<TextWord> fresh = new ArrayList<>();

            for (TextWord oldWord : sentence.getWords()) {
                final TextWord newWord = findNewWord(oldWord, newLine);

                if (newWord == null) {
                    logger.debug("{} no word for {} in {}",
                            system.idString(), oldWord, newLine);

                    continue;
                }

                if (newWord.getConfidence() >= oldWord.getConfidence()) {
                    // The new word takes over the glyph of the old word
                    newWord.setGlyph(oldWord.getGlyph());
                    newWord.getGlyph().setTextWord(language, newWord);
                    obsolete.add(oldWord);
                    fresh.add(newWord);
                }
            }

            // Update words in place: additions first, then removals
            if (!fresh.isEmpty()) {
                sentence.addWords(fresh);
                sentence.removeWords(obsolete);
            }
        }
    }

    //-------------//
    // findNewWord //
    //-------------//
    /**
     * Look up, among the words of the provided new line, the first one
     * whose bounds are equal to the bounds of the provided old word.
     *
     * @param oldWord old word
     * @param newLine the line to search
     * @return the corresponding new word, or null if not found
     */
    private TextWord findNewWord (TextWord oldWord,
                                  TextLine newLine)
    {
        final Rectangle target = oldWord.getBounds();
        TextWord match = null;

        for (TextWord candidate : newLine.getWords()) {
            if (candidate.getBounds().equals(target)) {
                // Stop at the first word with identical bounds
                match = candidate;

                break;
            }
        }

        return match;
    }

    //------------------//
    // getAbnormalWords //
    //------------------//
    /**
     * Build the pattern used to detect abnormal words, by compiling the
     * regular expression read from constant {@code abnormalWordRegexp}.
     * <p>
     * A syntax error in the regular expression is logged as a warning.
     *
     * @return the pattern for abnormal words, or null if the regular
     *         expression could not be compiled
     */
    private static Pattern getAbnormalWords ()
    {
        Pattern pattern = null;

        try {
            final String regexp = constants.abnormalWordRegexp.getValue();
            pattern = Pattern.compile(regexp);
        } catch (PatternSyntaxException ex) {
            logger.warn("Error in regexp for abnormal words", ex);
        }

        return pattern;
    }

    //------------//
    // getWordGap //
    //------------//
    /**
     * Report the maximum abscissa gap allowed between two consecutive
     * words of the provided line.
     * A chord line uses parameter {@code maxChordDx}, any other line uses
     * parameter {@code maxWordDx}.
     *
     * @param line the line provided
     * @return the maximum abscissa gap to use
     */
    private int getWordGap (TextLine line)
    {
        if (line.isChord()) {
            return params.maxChordDx;
        }

        return params.maxWordDx;
    }

    //~ Inner Classes ----------------------------------------------------------
    //-----------//
    // Constants //
    //-----------//
    /**
     * Set of named constants for this class, gathered in a
     * {@link ConstantSet}.
     * Each constant is built from a default value and a description string,
     * some of them with a leading label string as well.
     */
    private static final class Constants
            extends ConstantSet
    {
        //~ Instance fields ----------------------------------------------------

        // The default regexp matches a word made of exactly one character
        // among '.', '°' and '>'. It is compiled by getAbnormalWords().
        Constant.String abnormalWordRegexp = new Constant.String(
                "^[\\.°>]$",
                "Regular expression to detect abnormal words");

        // NOTE(review): the first argument ("0..100") looks like a unit or
        // range label for the value -- confirm against Constant.Integer
        Constant.Integer minConfidence = new Constant.Integer(
                "0..100",
                70,
                "Minimum confidence for OCR validity");

        Constant.Integer maxCharCountForAspectCheck = new Constant.Integer(
                "CharCount",
                3,
                "Maximum character count to apply aspect check");

        Constant.Ratio minAspectRatio = new Constant.Ratio(
                0.35,
                "Minimum ratio between ocr aspect and glyph aspect");

        Constant.Ratio maxAspectRatio = new Constant.Ratio(
                2.0,
                "Maximum ratio between ocr aspect and glyph aspect");

        // The Scale.Fraction constants below are converted to pixels by the
        // Parameters constructor, through Scale.toPixels()
        Scale.Fraction maxFontSize = new Scale.Fraction(
                7.0,
                "Max font size wrt interline");

        Scale.Fraction maxLyricsDy = new Scale.Fraction(
                1.0,
                "Max vertical gap between two lyrics chunks");

        Scale.Fraction maxWordDx = new Scale.Fraction(
                5.0,
                "Max horizontal gap between two non-lyrics words");

        Scale.Fraction minWordDx = new Scale.Fraction(
                0.25,
                "Min horizontal gap between two non-lyrics words");

        Scale.Fraction maxChordDx = new Scale.Fraction(
                1.0,
                "Max horizontal gap between two chord words");

        Constant.Ratio maxInvalidRatio = new Constant.Ratio(
                0.33,
                "Maximum ratio of invalid words in a line");

    }

    //------------//
    // Parameters //
    //------------//
    /**
     * Pixel-based parameters, each one computed from the scale-dependent
     * constant of the same name.
     */
    private static class Parameters
    {
        //~ Instance fields ----------------------------------------------------

        /** Pixel value of constant {@code maxFontSize}. */
        final int maxFontSize;

        /** Pixel value of constant {@code maxLyricsDy}. */
        final int maxLyricsDy;

        /** Pixel value of constant {@code maxWordDx}. */
        final int maxWordDx;

        /** Pixel value of constant {@code minWordDx}. */
        final int minWordDx;

        /** Pixel value of constant {@code maxChordDx}. */
        final int maxChordDx;

        //~ Constructors -------------------------------------------------------
        /**
         * Convert every scale-dependent constant to pixels, using the
         * provided scale.
         *
         * @param scale the scale used for the conversions
         */
        public Parameters (Scale scale)
        {
            this.maxFontSize = scale.toPixels(constants.maxFontSize);
            this.maxLyricsDy = scale.toPixels(constants.maxLyricsDy);
            this.maxWordDx = scale.toPixels(constants.maxWordDx);
            this.minWordDx = scale.toPixels(constants.minWordDx);
            this.maxChordDx = scale.toPixels(constants.maxChordDx);
        }
    }
}