// Segmenter.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.segmentation;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.omegat.util.Language;
import org.omegat.util.PatternConsts;

/**
 * Splits paragraphs into sentences according to the segmentation rules held
 * by an {@link SRX}, and glues translated sentences back together to form a
 * paragraph.
 *
 * @author Maxym Mykhalchuk
 */
public final class Segmenter {

    /**
     * Used in place of a rule's "before break" pattern when the rule only
     * defines an "after break" pattern: matches any single character, line
     * terminators included.
     */
    private static final Pattern DEFAULT_BEFOREBREAK_PATTERN = Pattern.compile(".", Pattern.DOTALL);

    /**
     * For non-space-delimited languages: a (possibly empty) run of leading
     * spaces, captured as group 1, followed by a CR, LF or tab.
     */
    private static final Pattern LINE_BREAK_OR_TAB_PATTERN = Pattern.compile("^( *)[\\r\\n\\t]");

    private final SRX srx;

    /**
     * Creates a segmenter that takes its rules from the given SRX.
     *
     * @param srx
     *            the segmentation rules to use
     */
    public Segmenter(SRX srx) {
        this.srx = srx;
    }

    /** Returns the SRX this segmenter was created with. */
    public SRX getSRX() {
        return srx;
    }

    /**
     * Segments the paragraph to sentences according to currently setup rules.
     * <p>
     * Bugfix for <a href="https://sourceforge.net/p/omegat/bugs/83/">bug 83</a>
     * : Sentences are returned without spaces in the beginning and at the end
     * of a sentence.
     * <p>
     * An additional list with space information is returned to be able to glue
     * translation together with the same spaces between them as in original
     * paragraph. For every returned sentence two entries are appended to
     * <code>spaces</code>: the whitespace that preceded it and the whitespace
     * that followed it inside its chunk.
     *
     * @param lang
     *            the language whose rules are looked up in the SRX
     * @param paragraph
     *            the paragraph text
     * @param spaces
     *            list to store information about spaces between sentences (can
     *            be null); it is cleared first
     * @param brules
     *            list to store rules that account to breaks (can be null); it
     *            is cleared first
     * @return list of sentences (String objects), or <code>null</code> if
     *         <code>paragraph</code> is <code>null</code>
     */
    public List<String> segment(Language lang, String paragraph, List<StringBuilder> spaces,
            List<Rule> brules) {
        if (paragraph == null) {
            return null;
        }
        List<String> segments = breakParagraph(lang, paragraph, brules);
        List<String> sentences = new ArrayList<String>(segments.size());
        if (spaces != null) {
            spaces.clear();
        }
        for (String one : segments) {
            int len = one.length();

            // Index just past the leading whitespace, stepping by code point.
            int b = 0;
            while (b < len) {
                int cp = one.codePointAt(b);
                if (!Character.isWhitespace(cp)) {
                    break;
                }
                b += Character.charCount(cp);
            }

            // Index where the trailing whitespace starts; never moves left of b,
            // so an all-whitespace chunk counts entirely as leading whitespace.
            int e = len;
            while (e > b) {
                int cp = one.codePointBefore(e);
                if (!Character.isWhitespace(cp)) {
                    break;
                }
                e -= Character.charCount(cp);
            }

            sentences.add(one.substring(b, e));
            if (spaces != null) {
                spaces.add(new StringBuilder(one.substring(0, b)));
                spaces.add(new StringBuilder(one.substring(e)));
            }
        }
        return sentences;
    }

    /**
     * Returns pre-sentences (sentences with spaces between), computed by
     * breaking paragraph into chunks of text. Also returns the list with "the
     * reasons" why the breaks were made, i.e. the list of break rules that
     * contributed to each of the breaks made.
     * <p>
     * If glued back together, these strings form the same paragraph text as
     * this function was fed.
     *
     * @param lang
     *            the language whose rules are looked up in the SRX
     * @param paragraph
     *            the paragraph text
     * @param brules
     *            list to store rules that account to breaks (can be null); it
     *            is cleared first
     */
    private List<String> breakParagraph(Language lang, String paragraph, List<Rule> brules) {
        List<Rule> rules = srx.lookupRulesForLanguage(lang);

        // Determine the applicable break positions. Rules are walked from last
        // to first, and each rule overrides the opposite verdict recorded so
        // far for the positions it matches, so a rule earlier in the list wins
        // over a later one.
        // NOTE: TreeSet.addAll() keeps an element that is already present, so
        // when several break rules hit the same position the recorded reason
        // is the rule that added it first in this loop.
        Set<BreakPosition> dontbreakpositions = new TreeSet<BreakPosition>();
        Set<BreakPosition> breakpositions = new TreeSet<BreakPosition>();
        for (int i = rules.size() - 1; i >= 0; i--) {
            Rule rule = rules.get(i);
            List<BreakPosition> rulebreaks = getBreaks(paragraph, rule);
            if (rule.isBreakRule()) {
                breakpositions.addAll(rulebreaks);
                dontbreakpositions.removeAll(rulebreaks);
            } else {
                dontbreakpositions.addAll(rulebreaks);
                breakpositions.removeAll(rulebreaks);
            }
        }
        breakpositions.removeAll(dontbreakpositions);

        // And now break the string according to the (sorted) positions.
        List<String> segments = new ArrayList<String>();
        if (brules != null) {
            brules.clear();
        }
        int prevpos = 0;
        for (BreakPosition bposition : breakpositions) {
            segments.add(paragraph.substring(prevpos, bposition.position));
            if (brules != null) {
                brules.add(bposition.reason);
            }
            prevpos = bposition.position;
        }

        // prevpos is either 0 or a match end offset inside the paragraph, so
        // taking the tail cannot go out of bounds.
        String tail = paragraph.substring(prevpos);

        // Sometimes the last segment may be empty or whitespace only, it
        // happens for paragraphs like "Rains. ". In that case it is appended
        // to the previous segment (if there is one) instead of standing alone.
        if (tail.trim().isEmpty() && !segments.isEmpty()) {
            int last = segments.size() - 1;
            segments.set(last, segments.get(last) + tail);
        } else {
            segments.add(tail);
        }

        return segments;
    }

    /**
     * Returns the places of possible breaks between sentences for one rule: the
     * offsets where a "before break" match ends and, if the rule has an "after
     * break" pattern, an "after break" match starts at that same offset.
     *
     * @param paragraph
     *            the paragraph text
     * @param rule
     *            the rule whose patterns are applied
     * @return the matching positions in ascending order; empty if the rule has
     *         neither pattern
     */
    private static List<BreakPosition> getBreaks(String paragraph, Rule rule) {
        List<BreakPosition> res = new ArrayList<BreakPosition>();

        Matcher bbm = null;
        if (rule.getBeforebreak() != null) {
            bbm = rule.getCompiledBeforebreak().matcher(paragraph);
        }
        Matcher abm = null;
        if (rule.getAfterbreak() != null) {
            abm = rule.getCompiledAfterbreak().matcher(paragraph);
        }

        if (bbm == null && abm == null) {
            return res;
        }

        // Position the "after" matcher on its first match; without any match
        // there can be no break at all.
        if (abm != null && !abm.find()) {
            return res;
        }

        if (bbm == null) {
            bbm = DEFAULT_BEFOREBREAK_PATTERN.matcher(paragraph);
        }

        while (bbm.find()) {
            int bbe = bbm.end();
            if (abm == null) {
                res.add(new BreakPosition(bbe, rule));
                continue;
            }
            // Advance the "after" matcher until its match starts at or beyond
            // the end of the current "before" match.
            int abs = abm.start();
            while (abs < bbe) {
                if (!abm.find()) {
                    return res;
                }
                abs = abm.start();
            }
            if (abs == bbe) {
                res.add(new BreakPosition(bbe, rule));
            }
        }

        return res;
    }

    /** A class for a break position that knows which rule contributed to it. */
    static class BreakPosition implements Comparable<BreakPosition> {
        /** Break/Exception position. */
        int position;
        /** Rule that contributed to the break. */
        Rule reason;

        /** Creates a new break position. */
        BreakPosition(int position, Rule reason) {
            this.position = position;
            this.reason = reason;
        }

        /**
         * Other BreakPosition is "equal to" this one iff it has the same
         * position; the rule is not taken into account.
         */
        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof BreakPosition)) {
                return false;
            }
            return this.position == ((BreakPosition) obj).position;
        }

        /** Returns a hash code == position for the object. */
        @Override
        public int hashCode() {
            return this.position;
        }

        /**
         * Compares this break position with another by position only.
         *
         * @return a negative integer if its position is less than the
         *         another's, zero if they are equal, or a positive integer as
         *         its position is greater than the another's.
         */
        @Override
        public int compareTo(BreakPosition that) {
            return Integer.compare(this.position, that.position);
        }
    }

    /**
     * Glues segments back into a paragraph.
     * <p>
     * As segments are returned by
     * {@link #segment(Language, String, List, List)} without spaces before and
     * after them, this method adds spaces if needed:
     * <ul>
     * <li>For translation <i>to</i> non-space-delimited languages (Japanese,
     * Chinese, Tibetan) it does <b>not</b> add any spaces.
     * <p>
     * Special exceptions are the Break SRX rules that break on space, i.e.
     * before and after patterns consist of spaces (they get trimmed to an empty
     * string). For such rules all the spaces are added. Spaces are also kept
     * when the text glued so far ends with a full stop, and when the spaces
     * start with a line break or tab only the plain spaces in front of it are
     * dropped.
     * <li>For translation <i>from</i> non-space-delimited languages it adds one
     * space if the original had none.
     * <li>For all other language combinations it restores the spaces present
     * before segmenting.
     * </ul>
     *
     * @param sourceLang
     *            language of the original paragraph
     * @param targetLang
     *            language of the translated sentences
     * @param sentences
     *            list of translated sentences
     * @param spaces
     *            information about spaces in original paragraph
     * @param brules
     *            rules that account to breaks
     * @return glued translated paragraph
     */
    public String glue(Language sourceLang, Language targetLang, List<String> sentences,
            List<StringBuilder> spaces, List<Rule> brules) {
        if (sentences.isEmpty()) {
            return "";
        }
        StringBuilder res = new StringBuilder();
        res.append(sentences.get(0));

        for (int i = 1; i < sentences.size(); i++) {
            // Whitespace between sentence i-1 and sentence i: what trailed the
            // former plus what led the latter.
            StringBuilder sp = new StringBuilder();
            sp.append(spaces.get(2 * i - 1));
            sp.append(spaces.get(2 * i));

            if (!targetLang.isSpaceDelimited()) {
                Rule rule = brules.get(i - 1);
                if (res.length() > 0) {
                    char lastChar = res.charAt(res.length() - 1);
                    Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString());
                    if (matcher.find()) {
                        // If we found line break or tab, trim left spaces.
                        // Right spaces are left for indentation of the next line.
                        sp.delete(0, matcher.group(1).length());
                    } else if (lastChar != '.'
                            && !(isSpacy(rule.getBeforebreak()) && isSpacy(rule.getAfterbreak()))) {
                        sp.setLength(0);
                    }
                }
            } else if (!sourceLang.isSpaceDelimited() && sp.length() == 0) {
                sp.append(" ");
            }
            res.append(sp);
            res.append(sentences.get(i));
        }
        return res.toString();
    }

    /**
     * Tells whether a rule pattern matches {@link PatternConsts#SPACY_REGEX}
     * in full. A missing (<code>null</code>) pattern is not spacy.
     */
    private static boolean isSpacy(String pattern) {
        return pattern != null && PatternConsts.SPACY_REGEX.matcher(pattern).matches();
    }

    /**
     * Segments a source and a target entry from TMX and appends the results to
     * the given lists.
     * <p>
     * The segmented sentences are appended only if resegmenting is requested,
     * both entries are present and both split into the same number of
     * sentences. In every other case the two entries are appended unsplit, as
     * one element each.
     *
     * @param needResegment
     *            whether the entries should be segmented at all
     * @param sourceLang
     *            language of the source entry
     * @param sourceEntry
     *            source text
     * @param targetLang
     *            language of the target entry
     * @param targetEntry
     *            target text; can be null when there is no translation for this
     *            entry, because for instance it's a note on an untranslated
     *            entry
     * @param sourceSegments
     *            list that receives the source sentences
     * @param targetSegments
     *            list that receives the target sentences
     */
    public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry,
            Language targetLang, String targetEntry, List<String> sourceSegments, List<String> targetSegments) {
        if (needResegment && sourceEntry != null && targetEntry != null) {
            List<String> srcSegments = segment(sourceLang, sourceEntry, null, null);
            List<String> tarSegments = segment(targetLang, targetEntry, null, null);

            if (srcSegments.size() == tarSegments.size()) {
                sourceSegments.addAll(srcSegments);
                targetSegments.addAll(tarSegments);
                return;
            }
        }
        // No need to resegment, or segments counts not equals, or no translation
        sourceSegments.add(sourceEntry);
        targetSegments.add(targetEntry);
    }
}