/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2016 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.core.segmentation; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.omegat.util.Language; import org.omegat.util.PatternConsts; /** * The class that sentences the paragraphs into sentences and glues translated * sentences together to form a paragraph. * * @author Maxym Mykhalchuk */ public final class Segmenter { private final SRX srx; public Segmenter(SRX srx) { this.srx = srx; } public SRX getSRX() { return srx; } /** * Segments the paragraph to sentences according to currently setup rules. * <p> * Bugfix for <a href="https://sourceforge.net/p/omegat/bugs/83/">bug 83</a> * : Sentences are returned without spaces in the beginning and at the end * of a sentence. * <p> * An additional list with space information is returned to be able to glue * translation together with the same spaces between them as in original * paragraph. * * @param paragraph * the paragraph text * @param spaces * list to store information about spaces between sentences (can be null) * @param brules * list to store rules that account to breaks (can be null) * @return list of sentences (String objects) */ public List<String> segment(Language lang, String paragraph, List<StringBuilder> spaces, List<Rule> brules) { if (paragraph == null) { return null; } List<String> segments = breakParagraph(lang, paragraph, brules); List<String> sentences = new ArrayList<String>(segments.size()); if (spaces != null) { spaces.clear(); } for (String one : segments) { int len = one.length(); int b = 0; StringBuilder bs = new StringBuilder(); for (int cp; b < len; b += Character.charCount(cp)) { cp = one.codePointAt(b); if (!Character.isWhitespace(cp)) { break; } bs.appendCodePoint(cp); } int e = len; StringBuilder es = new StringBuilder(); for (int cp; e > b; e -= Character.charCount(cp)) { cp = one.codePointBefore(e); if (!Character.isWhitespace(cp)) { break; } es.appendCodePoint(cp); } es.reverse(); String trimmed = one.substring(b, e); sentences.add(trimmed); if (spaces != null) { spaces.add(bs); spaces.add(es); } } return sentences; } /** * Returns pre-sentences (sentences with spaces between), computed by breaking paragraph into chunks of * text. Also returns the list with "the reasons" why the breaks were made, i.e. the list of break rules * that contributed to each of the breaks made. * <p> * If glued back together, these strings form the same paragraph text as this function was fed. * * @param paragraph * the paragraph text * @param brules * list to store rules that account to breaks (can be null) */ private List<String> breakParagraph(Language lang, String paragraph, List<Rule> brules) { List<Rule> rules = srx.lookupRulesForLanguage(lang); // determining the applicable break positions Set<BreakPosition> dontbreakpositions = new TreeSet<BreakPosition>(); Set<BreakPosition> breakpositions = new TreeSet<BreakPosition>(); for (int i = rules.size() - 1; i >= 0; i--) { Rule rule = rules.get(i); List<BreakPosition> rulebreaks = getBreaks(paragraph, rule); if (rule.isBreakRule()) { breakpositions.addAll(rulebreaks); dontbreakpositions.removeAll(rulebreaks); } else { dontbreakpositions.addAll(rulebreaks); breakpositions.removeAll(rulebreaks); } } breakpositions.removeAll(dontbreakpositions); // and now breaking the string according to the positions List<String> segments = new ArrayList<String>(); if (brules != null) { brules.clear(); } int prevpos = 0; for (BreakPosition bposition : breakpositions) { String oneseg = paragraph.substring(prevpos, bposition.position); segments.add(oneseg); if (brules != null) { brules.add(bposition.reason); } prevpos = bposition.position; } try { String oneseg = paragraph.substring(prevpos); // Sometimes the last segment may be empty, // it happens for paragraphs like "Rains. " // So if it's an empty segment and there's a previous segment if (oneseg.trim().isEmpty() && !segments.isEmpty()) { String prev = segments.get(segments.size() - 1); prev += oneseg; segments.set(segments.size() - 1, prev); } else segments.add(oneseg); } catch (IndexOutOfBoundsException iobe) { } return segments; } private static Pattern DEFAULT_BEFOREBREAK_PATTERN = Pattern.compile(".", Pattern.DOTALL); /** * Returns the places of possible breaks between sentences. */ private static List<BreakPosition> getBreaks(String paragraph, Rule rule) { List<BreakPosition> res = new ArrayList<BreakPosition>(); Matcher bbm = null; if (rule.getBeforebreak() != null) bbm = rule.getCompiledBeforebreak().matcher(paragraph); Matcher abm = null; if (rule.getAfterbreak() != null) abm = rule.getCompiledAfterbreak().matcher(paragraph); if (bbm == null && abm == null) return res; if (abm != null) if (!abm.find()) return res; if (bbm == null) bbm = DEFAULT_BEFOREBREAK_PATTERN.matcher(paragraph); while (bbm.find()) { int bbe = bbm.end(); if (abm == null) res.add(new BreakPosition(bbe, rule)); else { int abs = abm.start(); while (abs < bbe) { boolean found = abm.find(); if (!found) return res; abs = abm.start(); } if (abs == bbe) res.add(new BreakPosition(bbe, rule)); } } return res; } /** A class for a break position that knows which rule contributed to it. */ static class BreakPosition implements Comparable<BreakPosition> { /** Break/Exception position. */ int position; /** Rule that contributed to the break. */ Rule reason; /** Creates a new break position. */ BreakPosition(int position, Rule reason) { this.position = position; this.reason = reason; } /** * Other BreakPosition is "equal to" this one iff it has the same position. */ public boolean equals(Object obj) { if (obj == null) return false; if (!(obj instanceof BreakPosition)) return false; BreakPosition that = (BreakPosition) obj; return this.position == that.position; } /** Returns a hash code == position for the object. */ public int hashCode() { return this.position; } /** * Compares this break position with another. * * @return a negative integer if its position is less than the another's, zero if they are equal, or a * positive integer as its position is greater than the another's. * @throws ClassCastException * if the specified object's type prevents it from being compared to this Object. */ public int compareTo(BreakPosition that) { return this.position - that.position; } } /** * Glues segments back into a paragraph. * <p> * As segments are returned by * {@link #segment(Language, String, List, List)} without spaces before and * after them, this method adds spaces if needed: * <ul> * <li>For translation <i>to</i> non-space-delimited languages (Japanese, * Chinese, Tibetan) it does <b>not</b> add any spaces. * <p> * A special exceptions are the Break SRX rules that break on space, i.e. * before and after patterns consist of spaces (they get trimmed to an empty * string). For such rules all the spaces are added. * <li>For translation <i>from</i> non-space-delimited languages it adds one * space. * <li>For all other language combinations it restores the spaces present * before segmenting. * </ul> * * @param sentences * list of translated sentences * @param spaces * information about spaces in original paragraph * @param brules * rules that account to breaks * @return glued translated paragraph */ public String glue(Language sourceLang, Language targetLang, List<String> sentences, List<StringBuilder> spaces, List<Rule> brules) { if (sentences.size() <= 0) { return ""; } StringBuilder res = new StringBuilder(); res.append(sentences.get(0)); for (int i = 1; i < sentences.size(); i++) { StringBuilder sp = new StringBuilder(); sp.append(spaces.get(2 * i - 1)); sp.append(spaces.get(2 * i)); if (!targetLang.isSpaceDelimited()) { Rule rule = brules.get(i - 1); if (res.length() > 0) { char lastChar = res.charAt(res.length() - 1); Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString()); if (matcher.find()) { // If we found line break or tab, trim left spaces. // Right spaces are left for indentation of the next line. String leftSpaces = matcher.group(1); if (!leftSpaces.isEmpty()) { sp.replace(0, leftSpaces.length(), ""); } } else if ((lastChar != '.') && (!PatternConsts.SPACY_REGEX.matcher(rule.getBeforebreak()).matches() || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) { sp.setLength(0); } } } else if (!sourceLang.isSpaceDelimited() && sp.length() == 0) { sp.append(" "); } res.append(sp); res.append(sentences.get(i)); } return res.toString(); } /** * Segment source and target entries from TMX when counts are equals. */ public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry, Language targetLang, String targetEntry, List<String> sourceSegments, List<String> targetSegments) { if (needResegment) { List<String> srcSegments = segment(sourceLang, sourceEntry, null, null); if (targetEntry != null) { // There is no translation for this entry, because for instance it's a note // on an untranslated entry List<String> tarSegments = segment(targetLang, targetEntry, null, null); if (srcSegments.size() == tarSegments.size()) { sourceSegments.addAll(srcSegments); targetSegments.addAll(tarSegments); return; } } } // No need to resegment, or segments counts not equals, or no translation sourceSegments.add(sourceEntry); targetSegments.add(targetEntry); } /** For non-space-delimited languages. */ private static final Pattern LINE_BREAK_OR_TAB_PATTERN = Pattern.compile("^( *)[\\r\\n\\t]"); }