/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2016 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.gui.align;
import java.io.File;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.omegat.core.Core;
import org.omegat.core.data.ParseEntry;
import org.omegat.core.data.ParseEntry.ParseEntryResult;
import org.omegat.core.data.ProtectedPart;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IFilter;
import org.omegat.filters2.IParseCallback;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OStrings;
import org.omegat.util.StringUtil;
import org.omegat.util.TMXWriter2;
import net.loomchild.maligna.calculator.Calculator;
import net.loomchild.maligna.calculator.length.NormalDistributionCalculator;
import net.loomchild.maligna.calculator.length.PoissonDistributionCalculator;
import net.loomchild.maligna.calculator.length.counter.CharCounter;
import net.loomchild.maligna.calculator.length.counter.Counter;
import net.loomchild.maligna.calculator.length.counter.SplitCounter;
import net.loomchild.maligna.coretypes.Alignment;
import net.loomchild.maligna.coretypes.Category;
import net.loomchild.maligna.coretypes.CategoryDefaults;
import net.loomchild.maligna.filter.Filter;
import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm;
import net.loomchild.maligna.filter.aligner.align.hmm.fb.ForwardBackwardAlgorithm;
import net.loomchild.maligna.filter.aligner.align.hmm.viterbi.ViterbiAlgorithm;
import net.loomchild.maligna.matrix.FullMatrixFactory;
import net.loomchild.maligna.matrix.MatrixFactory;
/**
* Class to drive alignment of input files. Responsible for filtering and performing automatic alignment with
* mALIGNa.
*
* @author Aaron Madlon-Kay
*
* @see <a href="https://github.com/loomchild/maligna">mALIGNa</a>
*/
public class Aligner {

    /** Path of the source-language input file. */
    final String srcFile;
    /** Language of the source file. */
    final Language srcLang;
    /** Path of the target-language input file. */
    final String trgFile;
    /** Language of the target file. */
    final Language trgLang;

    /** Whether the extracted text is segmented before being aligned. */
    boolean segment = true;
    /** Whether tags are removed while parsing the input files (read by the file parsing step). */
    boolean removeTags = false;

    /**
     * Modes indicating the ways in which the source text can be sent to the alignment algorithm.
     */
    enum ComparisonMode {
        /**
         * Take all source lines and align against all target lines. This is the default as it makes no
         * demands of the input files.
         */
        HEAPWISE,
        /**
         * This mode is only available when the source and target files extract to the same number of text
         * units. Source and target strings with the same index are aligned separately.
         */
        PARSEWISE,
        /**
         * This mode is only available when the source and target files provide IDs for all their text units.
         * Each unit with matching ID is aligned separately.
         */
        ID
    }

    /**
     * The mALIGNa alignment algorithms that can be selected.
     */
    enum AlgorithmClass {
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/viterbi/ViterbiAlgorithm.java">
         *      ViterbiAlgorithm.java</a>
         */
        VITERBI,
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/fb/ForwardBackwardAlgorithm.java">
         *      ForwardBackwardAlgorithm.java</a>
         */
        FB
    }

    /**
     * The mALIGNa length-based calculators that can be selected.
     */
    enum CalculatorType {
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/calculator/length/NormalDistributionCalculator.java">
         *      NormalDistributionCalculator.java</a>
         */
        NORMAL,
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/calculator/length/PoissonDistributionCalculator.java">
         *      PoissonDistributionCalculator.java</a>
         */
        POISSON
    }

    /**
     * The unit in which segment length is counted by the calculator.
     */
    enum CounterType {
        /** Lengths are counted with mALIGNa's {@link CharCounter}. */
        CHAR,
        /** Lengths are counted with mALIGNa's {@link SplitCounter}. */
        WORD
    }

    ComparisonMode comparisonMode;
    AlgorithmClass algorithmClass;
    CalculatorType calculatorType;
    CounterType counterType;

    /** Text units extracted from the source file; <code>null</code> until loaded or after clearing. */
    private List<String> srcRaw;
    /** Text units extracted from the target file; <code>null</code> until loaded or after clearing. */
    private List<String> trgRaw;
    /** Source/target text pairs matched by ID; empty when {@link ComparisonMode#ID} is not available. */
    private List<Entry<String, String>> idPairs;

    /** The comparison modes usable with the loaded files; assigned by {@link #loadFiles()}. */
    List<ComparisonMode> allowedModes;

    /**
     * Create an aligner for the given pair of files. The alignment settings are initialized via
     * {@link #restoreDefaults()}; no file is read until {@link #loadFiles()} or an align method is called.
     *
     * @param srcFile
     *            Path to the source-language file
     * @param srcLang
     *            Language of the source file
     * @param trgFile
     *            Path to the target-language file
     * @param trgLang
     *            Language of the target file
     */
    public Aligner(String srcFile, Language srcLang, String trgFile, Language trgLang) {
        this.srcFile = srcFile;
        this.srcLang = srcLang;
        this.trgFile = trgFile;
        this.trgLang = trgLang;
        restoreDefaults();
    }

    /**
     * Parse the input files and extract the alignable text, which is retained in memory so that different
     * alignment settings can be tried without re-parsing the files. This determines the available
     * {@link ComparisonMode}s, available in {@link #allowedModes}.
     * <p>
     * Note that when {@link ComparisonMode#ID} turns out to be available, it is also made the current
     * {@link #comparisonMode}.
     *
     * @throws Exception
     *             If the parsing fails for whatever reason
     */
    void loadFiles() throws Exception {
        Entry<List<String>, List<String>> srcResult = parseFile(srcFile);
        srcRaw = srcResult.getValue();
        Entry<List<String>, List<String>> trgResult = parseFile(trgFile);
        trgRaw = trgResult.getValue();

        List<ComparisonMode> allowed = new ArrayList<>();
        // Heapwise makes no demands of the input, so it is always possible.
        allowed.add(ComparisonMode.HEAPWISE);
        if (srcRaw.size() == trgRaw.size()) {
            allowed.add(ComparisonMode.PARSEWISE);
        }

        List<String> srcIds = srcResult.getKey();
        List<String> trgIds = trgResult.getKey();
        // IDs are only collected for units that have one, so equal sizes mean every unit has an ID.
        if (srcIds.size() == srcRaw.size() && trgIds.size() == trgRaw.size()) {
            allowed.add(ComparisonMode.ID);
            comparisonMode = ComparisonMode.ID;
            idPairs = pairById(srcIds, srcRaw, trgIds, trgRaw);
        } else {
            idPairs = Collections.emptyList();
        }
        allowedModes = Collections.unmodifiableList(allowed);
    }

    /**
     * Match source texts with target texts that share the same ID. Source units whose ID has no
     * counterpart on the target side are dropped. If a target ID occurs more than once, the last
     * occurrence wins.
     *
     * @param srcIds
     *            IDs of the source units, parallel to <code>srcTexts</code>
     * @param srcTexts
     *            Source text units
     * @param trgIds
     *            IDs of the target units, parallel to <code>trgTexts</code>
     * @param trgTexts
     *            Target text units
     * @return List of (source text, target text) pairs in source order
     */
    private static List<Entry<String, String>> pairById(List<String> srcIds, List<String> srcTexts,
            List<String> trgIds, List<String> trgTexts) {
        Map<String, String> trgById = new HashMap<>();
        for (int i = 0; i < trgTexts.size(); i++) {
            trgById.put(trgIds.get(i), trgTexts.get(i));
        }
        List<Entry<String, String>> pairs = new ArrayList<>();
        for (int i = 0; i < srcTexts.size(); i++) {
            String src = srcTexts.get(i);
            String trg = trgById.get(srcIds.get(i));
            if (src != null && trg != null) {
                pairs.add(new AbstractMap.SimpleImmutableEntry<>(src, trg));
            }
        }
        return pairs;
    }

    /**
     * Release all content loaded from the input files. {@link #allowedModes} is left untouched; the files
     * are parsed again on the next call to {@link #loadFiles()} or an align method.
     */
    void clearLoaded() {
        srcRaw = null;
        trgRaw = null;
        idPairs = null;
    }

    /**
     * Reset the alignment settings: heapwise comparison, Viterbi algorithm, normal-distribution
     * calculator. Lengths are counted in words only when both languages are space-delimited; otherwise
     * they are counted in characters.
     */
    void restoreDefaults() {
        comparisonMode = ComparisonMode.HEAPWISE;
        algorithmClass = AlgorithmClass.VITERBI;
        calculatorType = CalculatorType.NORMAL;
        boolean bothSpaceDelimited = srcLang.isSpaceDelimited() && trgLang.isSpaceDelimited();
        counterType = bothSpaceDelimited ? CounterType.WORD : CounterType.CHAR;
    }

    /**
     * Parse the specified file and return the contents as a pair of lists:
     * <ul>
     * <li>Key: A list of IDs for the parsed text units
     * <li>Value: A list of parsed text units
     * </ul>
     * Units that are blank after cleanup are skipped entirely. An ID is recorded only for units that have
     * one, so the two lists have the same length only when every retained unit has an ID.
     *
     * @param file
     *            Path to input file
     * @return Pair of lists
     * @throws Exception
     *             If parsing fails
     */
    private Entry<List<String>, List<String>> parseFile(String file) throws Exception {
        final List<String> ids = new ArrayList<>();
        final List<String> rawSegs = new ArrayList<>();
        Core.getFilterMaster().loadFile(file, new FilterContext(srcLang, trgLang, true).setRemoveAllTags(removeTags),
                new IParseCallback() {
                    @Override
                    public void linkPrevNextSegments() {
                    }

                    @Override
                    public void addEntry(String id, String source, String translation, boolean isFuzzy,
                            String comment, IFilter filter) {
                        process(source, id);
                    }

                    @Override
                    public void addEntry(String id, String source, String translation, boolean isFuzzy,
                            String comment, String path, IFilter filter, List<ProtectedPart> protectedParts) {
                        // Fall back to the path as identifier when the filter supplies no ID.
                        process(source, id != null ? id : path);
                    }

                    @Override
                    public void addEntryWithProperties(String id, String source, String translation,
                            boolean isFuzzy, String[] props, String path, IFilter filter,
                            List<ProtectedPart> protectedParts) {
                        // Fall back to the path as identifier when the filter supplies no ID.
                        process(source, id != null ? id : path);
                    }

                    /**
                     * Clean up and normalize one extracted text unit, then record it (and its ID, if
                     * any) unless it is blank.
                     */
                    private void process(String text, String id) {
                        boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
                        text = StringUtil.normalizeUnicode(ParseEntry.stripSomeChars(text,
                                new ParseEntryResult(), removeTags, removeSpaces));
                        if (text.trim().isEmpty()) {
                            return;
                        }
                        if (id != null) {
                            ids.add(id);
                        }
                        rawSegs.add(text);
                    }
                });
        return new AbstractMap.SimpleImmutableEntry<>(ids, rawSegs);
    }

    /**
     * Segment a single text and discard empty segments.
     *
     * @param language
     *            The language of the text to be segmented
     * @param text
     *            Text to be segmented
     * @return List of non-empty segments
     */
    private static List<String> segmentText(Language language, String text) {
        return Core.getSegmenter().segment(language, text, null, null).stream()
                .filter(s -> !s.isEmpty()).collect(Collectors.toList());
    }

    /**
     * Segment the specified list of strings into a flat list of strings. The resulting list will be free of
     * empty strings.
     *
     * @param language
     *            The language of the texts to be segmented
     * @param rawTexts
     *            List of texts to be segmented
     * @return Flattened list of segments
     */
    private List<String> segmentAll(Language language, List<String> rawTexts) {
        return rawTexts.stream().flatMap(text -> segmentText(language, text).stream())
                .collect(Collectors.toList());
    }

    /**
     * Ensure the given mode is among {@link #allowedModes}.
     *
     * @param mode
     *            The mode about to be used
     * @throws UnsupportedOperationException
     *             If the mode is not allowed
     */
    private void requireAllowed(ComparisonMode mode) {
        if (!allowedModes.contains(mode)) {
            throw new UnsupportedOperationException("Comparison mode not allowed: " + mode);
        }
    }

    /**
     * Segment one source text and one target text, then align the resulting segments by algorithm
     * according to the current settings.
     *
     * @param source
     *            Source text
     * @param target
     *            Target text
     * @return List of beads for this pair of texts
     */
    private List<Alignment> alignSegmented(String source, String target) {
        List<String> srcSegs = segmentText(srcLang, source);
        List<String> trgSegs = segmentText(trgLang, target);
        return doAlign(algorithmClass, calculatorType, counterType, srcSegs, trgSegs);
    }

    /**
     * Align {@link ComparisonMode#PARSEWISE} without first segmenting the source and target strings. No
     * alignment algorithm is applied.
     *
     * @return List of beads where each entry of {@link #srcRaw} is aligned by index with each entry of
     *         {@link #trgRaw}
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#PARSEWISE} is not allowed for the loaded files
     */
    private Stream<Alignment> alignParsewiseNotSegmented() {
        requireAllowed(ComparisonMode.PARSEWISE);
        return IntStream.range(0, srcRaw.size())
                .mapToObj(i -> new Alignment(Arrays.asList(srcRaw.get(i)), Arrays.asList(trgRaw.get(i))));
    }

    /**
     * Align {@link ComparisonMode#PARSEWISE} the source and target strings. Each pair is segmented and
     * aligned separately by algorithm.
     *
     * @return List of beads where each entry of {@link #srcRaw} is aligned by index with each entry of
     *         {@link #trgRaw}
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#PARSEWISE} is not allowed for the loaded files
     */
    private Stream<Alignment> alignParsewiseSegmented() {
        requireAllowed(ComparisonMode.PARSEWISE);
        return IntStream.range(0, srcRaw.size())
                .mapToObj(i -> alignSegmented(srcRaw.get(i), trgRaw.get(i)))
                .flatMap(List::stream);
    }

    /**
     * Align by {@link ComparisonMode#ID} without first segmenting the source and target strings. No alignment
     * algorithm is applied.
     *
     * @return List of beads aligned by ID
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#ID} is not allowed for the loaded files
     */
    private Stream<Alignment> alignByIdNotSegmented() {
        requireAllowed(ComparisonMode.ID);
        return idPairs.stream()
                .map(e -> new Alignment(Arrays.asList(e.getKey()), Arrays.asList(e.getValue())));
    }

    /**
     * Align source and target strings by {@link ComparisonMode#ID}. Each pair is segmented and aligned
     * separately by algorithm.
     *
     * @return List of beads aligned by ID
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#ID} is not allowed for the loaded files
     */
    private Stream<Alignment> alignByIdSegmented() {
        requireAllowed(ComparisonMode.ID);
        return idPairs.stream()
                .map(e -> alignSegmented(e.getKey(), e.getValue()))
                .flatMap(List::stream);
    }

    /**
     * Align {@link ComparisonMode#HEAPWISE}. Input text is optionally segmented, then aligned by algorithm.
     *
     * @param doSegmenting
     *            Whether to segment the text
     * @return List of beads aligned heapwise
     */
    private Stream<Alignment> alignHeapwise(boolean doSegmenting) {
        List<String> srcSegs = doSegmenting ? segmentAll(srcLang, srcRaw) : srcRaw;
        List<String> trgSegs = doSegmenting ? segmentAll(trgLang, trgRaw) : trgRaw;
        return doAlign(algorithmClass, calculatorType, counterType, srcSegs, trgSegs).stream();
    }

    /**
     * Write the given source/target pairs to a TMX file. Every entry is written with the application name
     * followed by <code>" Aligner"</code> as creator and the time of this call as timestamp.
     * <p>
     * A failure while closing the writer is logged and not propagated.
     *
     * @param outFile
     *            The file to write
     * @param pairs
     *            Pairs where key = source text and value = target text
     * @throws Exception
     *             If creating the writer or writing an entry fails
     */
    public void writePairsToTMX(File outFile, List<Entry<String, String>> pairs) throws Exception {
        TMXWriter2 writer = null;
        String creator = OStrings.getApplicationName() + " Aligner";
        long time = System.currentTimeMillis();
        try {
            writer = new TMXWriter2(outFile, srcLang, trgLang, true, true, true);
            for (Entry<String, String> e : pairs) {
                writer.writeEntry(e.getKey(), e.getValue(), null, creator, time, null, 0L, null);
            }
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception ex) {
                    Log.log(ex);
                }
            }
        }
    }

    /**
     * Perform alignment according to the current settings and return the resulting list of beads. Will call
     * {@link #loadFiles()} if it has not yet been called.
     *
     * @return List of beads
     * @throws Exception
     *             If parsing the input files fails
     * @throws UnsupportedOperationException
     *             If the current {@link #comparisonMode} is not allowed for the loaded files
     */
    Stream<Alignment> alignImpl() throws Exception {
        if (srcRaw == null || trgRaw == null) {
            loadFiles();
        }
        switch (comparisonMode) {
        case PARSEWISE:
            return segment ? alignParsewiseSegmented() : alignParsewiseNotSegmented();
        case HEAPWISE:
            return alignHeapwise(segment);
        case ID:
            return segment ? alignByIdSegmented() : alignByIdNotSegmented();
        }
        throw new UnsupportedOperationException("Unknown comparison mode: " + comparisonMode);
    }

    /**
     * Align the input files according to the current settings to a list of pairs where
     * <ol>
     * <li>key = source text
     * <li>value = target text
     * </ol>
     * The segments of each bead are joined into a single string per side.
     * <p>
     * Calls {@link #loadFiles()} if it has not yet been called.
     *
     * @return List of aligned (source text, target text) pairs
     * @throws Exception
     *             If parsing the input files fails
     */
    public List<Entry<String, String>> align() throws Exception {
        return alignImpl().map(bead -> {
            String srcOut = Util.join(srcLang, bead.getSourceSegmentList());
            String trgOut = Util.join(trgLang, bead.getTargetSegmentList());
            return new AbstractMap.SimpleImmutableEntry<String, String>(srcOut, trgOut);
        }).collect(Collectors.toList());
    }

    /**
     * Obtain appropriate calculator according to the specified {@link CalculatorType}.
     *
     * @param calculatorType
     *            The kind of calculator to create
     * @param counterType
     *            The kind of length counter the calculator should use
     * @param aligns
     *            The alignments to be processed; passed to the Poisson calculator only
     * @return A new calculator
     * @throws UnsupportedOperationException
     *             If the calculator type is not handled
     */
    private static Calculator getCalculator(CalculatorType calculatorType, CounterType counterType,
            List<Alignment> aligns) {
        Counter counter = getCounter(counterType);
        switch (calculatorType) {
        case NORMAL:
            return new NormalDistributionCalculator(counter);
        case POISSON:
            return new PoissonDistributionCalculator(counter, aligns);
        }
        throw new UnsupportedOperationException("Unsupported calculator type: " + calculatorType);
    }

    /**
     * Obtain appropriate counter according to the specified {@link CounterType}.
     *
     * @param counterType
     *            The kind of counter to create
     * @return A new counter
     * @throws UnsupportedOperationException
     *             If the counter type is not handled
     */
    private static Counter getCounter(CounterType counterType) {
        switch (counterType) {
        case CHAR:
            return new CharCounter();
        case WORD:
            return new SplitCounter();
        }
        throw new UnsupportedOperationException("Unsupported counter type: " + counterType);
    }

    /**
     * Obtain appropriate align algorithm object according to the specified {@link AlgorithmClass}. The
     * algorithm is always configured with {@link CategoryDefaults#BEST_CATEGORY_MAP} and a
     * {@link FullMatrixFactory}.
     *
     * @param algorithmClass
     *            The kind of algorithm to create
     * @param calculator
     *            The calculator the algorithm should use
     * @return A new align algorithm
     * @throws UnsupportedOperationException
     *             If the algorithm class is not handled
     */
    private static AlignAlgorithm getAlgorithm(AlgorithmClass algorithmClass, Calculator calculator) {
        MatrixFactory matrixFactory = new FullMatrixFactory();
        Map<Category, Float> map = CategoryDefaults.BEST_CATEGORY_MAP;
        switch (algorithmClass) {
        case VITERBI:
            return new ViterbiAlgorithm(calculator, map, matrixFactory);
        case FB:
            return new ForwardBackwardAlgorithm(calculator, map, matrixFactory);
        }
        throw new UnsupportedOperationException("Unsupported algorithm class: " + algorithmClass);
    }

    /**
     * Use mALIGNa to align the specified source and target texts, according to the specified parameters.
     * All source segments and all target segments are wrapped in one initial alignment, which the mALIGNa
     * aligner filter then processes.
     *
     * @param algorithmClass
     *            The algorithm to align with
     * @param calculatorType
     *            The calculator the algorithm should use
     * @param counterType
     *            The length counter the calculator should use
     * @param source
     *            Source segments
     * @param target
     *            Target segments
     * @return The beads produced by the aligner filter
     */
    private static List<Alignment> doAlign(AlgorithmClass algorithmClass, CalculatorType calculatorType,
            CounterType counterType, List<String> source, List<String> target) {
        List<Alignment> aligns = Arrays.asList(new Alignment(source, target));
        Calculator calculator = getCalculator(calculatorType, counterType, aligns);
        AlignAlgorithm algorithm = getAlgorithm(algorithmClass, calculator);
        Filter filter = new net.loomchild.maligna.filter.aligner.Aligner(algorithm);
        // filter = FilterDecorators.decorate(filter);
        return filter.apply(aligns);
    }

    /**
     * Re-align the content of the given beads according to the current settings. The source lines and
     * target lines of all beads are each pooled, in bead order, and the two pools are aligned against each
     * other by algorithm.
     *
     * @param beads
     *            The beads whose content should be re-aligned
     * @return Newly created beads holding the result
     */
    List<MutableBead> doAlign(List<MutableBead> beads) {
        List<String> source = new ArrayList<>();
        List<String> target = new ArrayList<>();
        for (MutableBead bead : beads) {
            source.addAll(bead.sourceLines);
            target.addAll(bead.targetLines);
        }
        return doAlign(algorithmClass, calculatorType, counterType, source, target).stream()
                .map(MutableBead::new).collect(Collectors.toList());
    }
}