// Aligner.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.gui.align;

import java.io.File;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.omegat.core.Core;
import org.omegat.core.data.ParseEntry;
import org.omegat.core.data.ParseEntry.ParseEntryResult;
import org.omegat.core.data.ProtectedPart;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IFilter;
import org.omegat.filters2.IParseCallback;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OStrings;
import org.omegat.util.StringUtil;
import org.omegat.util.TMXWriter2;

import net.loomchild.maligna.calculator.Calculator;
import net.loomchild.maligna.calculator.length.NormalDistributionCalculator;
import net.loomchild.maligna.calculator.length.PoissonDistributionCalculator;
import net.loomchild.maligna.calculator.length.counter.CharCounter;
import net.loomchild.maligna.calculator.length.counter.Counter;
import net.loomchild.maligna.calculator.length.counter.SplitCounter;
import net.loomchild.maligna.coretypes.Alignment;
import net.loomchild.maligna.coretypes.Category;
import net.loomchild.maligna.coretypes.CategoryDefaults;
import net.loomchild.maligna.filter.Filter;
import net.loomchild.maligna.filter.aligner.align.AlignAlgorithm;
import net.loomchild.maligna.filter.aligner.align.hmm.fb.ForwardBackwardAlgorithm;
import net.loomchild.maligna.filter.aligner.align.hmm.viterbi.ViterbiAlgorithm;
import net.loomchild.maligna.matrix.FullMatrixFactory;
import net.loomchild.maligna.matrix.MatrixFactory;

/**
 * Class to drive alignment of input files. Responsible for filtering and performing automatic alignment with
 * mALIGNa.
 * <p>
 * The input files are parsed by {@link #loadFiles()} and the extracted text is retained in memory, so the
 * package-visible settings ({@link #segment}, {@link #comparisonMode}, {@link #algorithmClass},
 * {@link #calculatorType}, {@link #counterType}) can be changed and {@link #align()} called again without
 * re-parsing. {@link #removeTags} is read while parsing, so changing it only affects content loaded
 * afterwards.
 *
 * @author Aaron Madlon-Kay
 *
 * @see <a href="https://github.com/loomchild/maligna">mALIGNa</a>
 */
public class Aligner {

    /** Path of the source-language input file. */
    final String srcFile;
    /** Language of the source file. */
    final Language srcLang;
    /** Path of the target-language input file. */
    final String trgFile;
    /** Language of the target file. */
    final Language trgLang;

    /** Whether the extracted text units are segmented before being aligned. */
    boolean segment = true;
    /** Whether tags are removed from the text while parsing the input files. */
    boolean removeTags = false;

    /**
     * Modes indicating the ways in which the source text can be sent to the alignment algorithm.
     */
    enum ComparisonMode {
        /**
         * Take all source lines and align against all target lines. This is the default as it makes no
         * demands of the input files.
         */
        HEAPWISE,

        /**
         * This mode is only available when the source and target files extract to the same number of text
         * units. Source and target strings with the same index are aligned separately.
         */
        PARSEWISE,

        /**
         * This mode is only available when the source and target files provide IDs for all their text units.
         * Each unit with matching ID is aligned separately.
         */
        ID
    }

    /**
     * The mALIGNa alignment algorithms that can be selected.
     */
    enum AlgorithmClass {
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/viterbi/ViterbiAlgorithm.java">
         *      ViterbiAlgorithm.java</a>
         */
        VITERBI,

        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/filter/aligner/align/hmm/fb/ForwardBackwardAlgorithm.java">
         *      ForwardBackwardAlgorithm.java</a>
         */
        FB
    }

    /**
     * The mALIGNa length-based calculators that can be selected.
     */
    enum CalculatorType {
        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/calculator/length/NormalDistributionCalculator.java">
         *      NormalDistributionCalculator.java</a>
         */
        NORMAL,

        /**
         * @see <a href=
         *      "https://github.com/loomchild/maligna/blob/3.0.0/maligna/src/main/java/net/loomchild/maligna/calculator/length/PoissonDistributionCalculator.java">
         *      PoissonDistributionCalculator.java</a>
         */
        POISSON
    }

    /**
     * The unit in which segment length is measured by the calculator.
     */
    enum CounterType {
        /** Length is measured with mALIGNa's {@link CharCounter}. */
        CHAR,
        /** Length is measured with mALIGNa's {@link SplitCounter}. */
        WORD
    }

    ComparisonMode comparisonMode;
    AlgorithmClass algorithmClass;
    CalculatorType calculatorType;
    CounterType counterType;

    /** Text units extracted from the source file; <code>null</code> until loaded. */
    private List<String> srcRaw;
    /** Text units extracted from the target file; <code>null</code> until loaded. */
    private List<String> trgRaw;
    /** Source/target text pairs matched by ID; empty when {@link ComparisonMode#ID} is not available. */
    private List<Entry<String, String>> idPairs;
    /** Unmodifiable list of the modes usable with the loaded files; set by {@link #loadFiles()}. */
    List<ComparisonMode> allowedModes;

    /**
     * Create an aligner for the given pair of files. Settings are initialized via
     * {@link #restoreDefaults()}; the files are not parsed until {@link #loadFiles()} or an align method is
     * called.
     *
     * @param srcFile
     *            Path to the source-language file
     * @param srcLang
     *            Language of the source file
     * @param trgFile
     *            Path to the target-language file
     * @param trgLang
     *            Language of the target file
     */
    public Aligner(String srcFile, Language srcLang, String trgFile, Language trgLang) {
        this.srcFile = srcFile;
        this.srcLang = srcLang;
        this.trgFile = trgFile;
        this.trgLang = trgLang;
        restoreDefaults();
    }

    /**
     * Parse the input files and extract the alignable text, which is retained in memory so that different
     * alignment settings can be tried without re-parsing the files. This determines the available
     * {@link ComparisonMode}s, available in {@link #allowedModes}.
     * <p>
     * Note that when {@link ComparisonMode#ID} turns out to be available, {@link #comparisonMode} is switched
     * to it as a side effect.
     *
     * @throws Exception
     *             If the parsing fails for whatever reason
     */
    void loadFiles() throws Exception {
        Entry<List<String>, List<String>> srcResult = parseFile(srcFile);
        srcRaw = srcResult.getValue();
        Entry<List<String>, List<String>> trgResult = parseFile(trgFile);
        trgRaw = trgResult.getValue();

        List<ComparisonMode> allowed = new ArrayList<>();
        allowed.add(ComparisonMode.HEAPWISE);
        if (srcRaw.size() == trgRaw.size()) {
            allowed.add(ComparisonMode.PARSEWISE);
        }
        List<String> srcIds = srcResult.getKey();
        List<String> trgIds = trgResult.getKey();
        // parseFile() records an ID only when it is non-null, so equal sizes mean every unit has an ID.
        if (srcIds.size() == srcRaw.size() && trgIds.size() == trgRaw.size()) {
            allowed.add(ComparisonMode.ID);
            comparisonMode = ComparisonMode.ID;
            idPairs = pairById(srcIds, srcRaw, trgIds, trgRaw);
        } else {
            idPairs = Collections.emptyList();
        }
        allowedModes = Collections.unmodifiableList(allowed);
    }

    /**
     * Pair each source text with the target text that has the same ID. Source texts whose ID has no target
     * counterpart are dropped. If several target units share an ID, the last one is used.
     *
     * @param srcIds
     *            IDs of the source texts, parallel to <code>srcTexts</code>
     * @param srcTexts
     *            Source texts
     * @param trgIds
     *            IDs of the target texts, parallel to <code>trgTexts</code>
     * @param trgTexts
     *            Target texts
     * @return List of pairs (key = source text, value = target text) in source order
     */
    private static List<Entry<String, String>> pairById(List<String> srcIds, List<String> srcTexts,
            List<String> trgIds, List<String> trgTexts) {
        Map<String, String> trgById = new HashMap<>();
        for (int i = 0; i < trgTexts.size(); i++) {
            trgById.put(trgIds.get(i), trgTexts.get(i));
        }
        return IntStream.range(0, srcTexts.size()).mapToObj(i -> {
            String src = srcTexts.get(i);
            String trg = trgById.get(srcIds.get(i));
            if (src != null && trg != null) {
                return new AbstractMap.SimpleImmutableEntry<>(src, trg);
            } else {
                return null;
            }
        }).filter(Objects::nonNull).collect(Collectors.toList());
    }

    /**
     * Release all content loaded from the input files. The next call to {@link #alignImpl()} or
     * {@link #align()} will parse the files again.
     */
    void clearLoaded() {
        srcRaw = null;
        trgRaw = null;
        idPairs = null;
    }

    /**
     * Reset the comparison mode, algorithm, calculator and counter to their defaults. The counter defaults to
     * {@link CounterType#WORD} only when both languages are space-delimited; otherwise
     * {@link CounterType#CHAR} is used.
     */
    void restoreDefaults() {
        comparisonMode = ComparisonMode.HEAPWISE;
        algorithmClass = AlgorithmClass.VITERBI;
        calculatorType = CalculatorType.NORMAL;
        if (!srcLang.isSpaceDelimited() || !trgLang.isSpaceDelimited()) {
            counterType = CounterType.CHAR;
        } else {
            counterType = CounterType.WORD;
        }
    }

    /**
     * Parse the specified file and return the contents as a pair of lists:
     * <ul>
     * <li>Key: A list of IDs for the parsed text units
     * <li>Value: A list of parsed text units
     * </ul>
     * Text units that are blank after cleanup are skipped. The ID list only contains the non-null IDs, so it
     * is shorter than the text list whenever some unit has no ID.
     *
     * @param file
     *            Path to input file
     * @return Pair of lists
     * @throws Exception
     *             If parsing fails
     */
    private Entry<List<String>, List<String>> parseFile(String file) throws Exception {
        final List<String> ids = new ArrayList<>();
        final List<String> rawSegs = new ArrayList<>();
        Core.getFilterMaster().loadFile(file, new FilterContext(srcLang, trgLang, true).setRemoveAllTags(removeTags),
                new IParseCallback() {
                    @Override
                    public void linkPrevNextSegments() {
                    }

                    @Override
                    public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment,
                            IFilter filter) {
                        process(source, id);
                    }

                    @Override
                    public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment,
                            String path, IFilter filter, List<ProtectedPart> protectedParts) {
                        // Fall back to the path when the filter supplies no explicit ID
                        process(source, id != null ? id : path);
                    }

                    @Override
                    public void addEntryWithProperties(String id, String source, String translation,
                            boolean isFuzzy, String[] props, String path, IFilter filter,
                            List<ProtectedPart> protectedParts) {
                        // Fall back to the path when the filter supplies no explicit ID
                        process(source, id != null ? id : path);
                    }

                    private void process(String text, String id) {
                        boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
                        text = StringUtil.normalizeUnicode(ParseEntry.stripSomeChars(text,
                                new ParseEntryResult(), removeTags, removeSpaces));
                        if (!text.trim().isEmpty()) {
                            if (id != null) {
                                ids.add(id);
                            }
                            rawSegs.add(text);
                        }
                    }
                });
        return new AbstractMap.SimpleImmutableEntry<>(ids, rawSegs);
    }

    /**
     * Segment a single text and drop any empty segments.
     *
     * @param language
     *            The language of the text to be segmented
     * @param text
     *            The text to be segmented
     * @return List of non-empty segments
     */
    private static List<String> segmentNonEmpty(Language language, String text) {
        return Core.getSegmenter().segment(language, text, null, null).stream().filter(s -> !s.isEmpty())
                .collect(Collectors.toList());
    }

    /**
     * Segment the specified list of strings into a flat list of strings. The resulting list will be free of
     * empty strings.
     *
     * @param language
     *            The language of the texts to be segmented
     * @param rawTexts
     *            List of texts to be segmented
     * @return Flattened list of segments
     */
    private List<String> segmentAll(Language language, List<String> rawTexts) {
        return rawTexts.stream().flatMap(text -> segmentNonEmpty(language, text).stream())
                .collect(Collectors.toList());
    }

    /**
     * Ensure that the given mode can be used with the currently loaded files.
     *
     * @param mode
     *            The mode about to be used
     * @throws UnsupportedOperationException
     *             If the mode is not contained in {@link #allowedModes}
     */
    private void requireAllowed(ComparisonMode mode) {
        if (!allowedModes.contains(mode)) {
            throw new UnsupportedOperationException("Comparison mode not available for loaded files: " + mode);
        }
    }

    /**
     * Align {@link ComparisonMode#PARSEWISE} without first segmenting the source and target strings. No
     * alignment algorithm is applied.
     *
     * @return List of beads where each entry of {@link #srcRaw} is aligned by index with each entry of
     *         {@link #trgRaw}
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#PARSEWISE} is not allowed for the loaded files
     */
    private Stream<Alignment> alignParsewiseNotSegmented() {
        requireAllowed(ComparisonMode.PARSEWISE);
        return IntStream.range(0, srcRaw.size())
                .mapToObj(i -> new Alignment(Arrays.asList(srcRaw.get(i)), Arrays.asList(trgRaw.get(i))));
    }

    /**
     * Align {@link ComparisonMode#PARSEWISE} the source and target strings. Each pair is segmented and
     * aligned separately by algorithm.
     *
     * @return List of beads where each entry of {@link #srcRaw} is aligned by index with each entry of
     *         {@link #trgRaw}
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#PARSEWISE} is not allowed for the loaded files
     */
    private Stream<Alignment> alignParsewiseSegmented() {
        requireAllowed(ComparisonMode.PARSEWISE);
        return IntStream.range(0, srcRaw.size()).mapToObj(i -> {
            List<String> source = segmentNonEmpty(srcLang, srcRaw.get(i));
            List<String> target = segmentNonEmpty(trgLang, trgRaw.get(i));
            return doAlign(algorithmClass, calculatorType, counterType, source, target);
        }).flatMap(List::stream);
    }

    /**
     * Align by {@link ComparisonMode#ID} without first segmenting the source and target strings. No alignment
     * algorithm is applied.
     *
     * @return List of beads aligned by ID
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#ID} is not allowed for the loaded files
     */
    private Stream<Alignment> alignByIdNotSegmented() {
        requireAllowed(ComparisonMode.ID);
        return idPairs.stream()
                .map(e -> new Alignment(Arrays.asList(e.getKey()), Arrays.asList(e.getValue())));
    }

    /**
     * Align source and target strings by {@link ComparisonMode#ID}. Each pair is segmented and aligned
     * separately by algorithm.
     *
     * @return List of beads aligned by ID
     * @throws UnsupportedOperationException
     *             If {@link ComparisonMode#ID} is not allowed for the loaded files
     */
    private Stream<Alignment> alignByIdSegmented() {
        requireAllowed(ComparisonMode.ID);
        return idPairs.stream().map(e -> {
            List<String> source = segmentNonEmpty(srcLang, e.getKey());
            List<String> target = segmentNonEmpty(trgLang, e.getValue());
            return doAlign(algorithmClass, calculatorType, counterType, source, target);
        }).flatMap(List::stream);
    }

    /**
     * Align {@link ComparisonMode#HEAPWISE}. Input text is optionally segmented, then aligned by algorithm.
     *
     * @param doSegmenting
     *            Whether to segment the text
     * @return List of beads aligned heapwise
     */
    private Stream<Alignment> alignHeapwise(boolean doSegmenting) {
        List<String> srcSegs = doSegmenting ? segmentAll(srcLang, srcRaw) : srcRaw;
        List<String> trgSegs = doSegmenting ? segmentAll(trgLang, trgRaw) : trgRaw;
        return doAlign(algorithmClass, calculatorType, counterType, srcSegs, trgSegs).stream();
    }

    /**
     * Write the given source/target pairs to a TMX file. Every entry is written with the creator set to the
     * application name followed by " Aligner" and with the time at which this method was called.
     * <p>
     * A failure while closing the writer is logged and not propagated.
     *
     * @param outFile
     *            The TMX file to write
     * @param pairs
     *            Pairs where key = source text and value = target text
     * @throws Exception
     *             If creating the writer or writing an entry fails
     */
    public void writePairsToTMX(File outFile, List<Entry<String, String>> pairs) throws Exception {
        TMXWriter2 writer = null;
        String creator = OStrings.getApplicationName() + " Aligner";
        long time = System.currentTimeMillis();
        try {
            writer = new TMXWriter2(outFile, srcLang, trgLang, true, true, true);
            for (Entry<String, String> e : pairs) {
                writer.writeEntry(e.getKey(), e.getValue(), null, creator, time, null, 0L, null);
            }
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception ex) {
                    Log.log(ex);
                }
            }
        }
    }

    /**
     * Perform alignment according to the current settings and return the resulting list of beads. Will call
     * {@link #loadFiles()} if it has not yet been called.
     *
     * @return List of beads
     * @throws Exception
     *             If parsing the input files fails
     * @throws UnsupportedOperationException
     *             If the current {@link #comparisonMode} is not allowed for the loaded files
     */
    Stream<Alignment> alignImpl() throws Exception {
        if (srcRaw == null || trgRaw == null) {
            loadFiles();
        }
        switch (comparisonMode) {
        case PARSEWISE:
            return segment ? alignParsewiseSegmented() : alignParsewiseNotSegmented();
        case HEAPWISE:
            return alignHeapwise(segment);
        case ID:
            return segment ? alignByIdSegmented() : alignByIdNotSegmented();
        }
        throw new UnsupportedOperationException("Unknown comparison mode: " + comparisonMode);
    }

    /**
     * Align the input files according to the current settings to a list of pairs where
     * <ol>
     * <li>key = source text
     * <li>value = target text
     * </ol>
     *
     * The segments of each bead are combined into a single string per side with <code>Util.join</code>.
     * Calls {@link #loadFiles()} if it has not yet been called.
     *
     * @return List of source/target text pairs, one per bead
     * @throws Exception
     *             If parsing the input files fails
     */
    public List<Entry<String, String>> align() throws Exception {
        return alignImpl().map(bead -> {
            String srcOut = Util.join(srcLang, bead.getSourceSegmentList());
            String trgOut = Util.join(trgLang, bead.getTargetSegmentList());
            return new AbstractMap.SimpleImmutableEntry<String, String>(srcOut, trgOut);
        }).collect(Collectors.toList());
    }

    /**
     * Obtain appropriate calculator according to the specified {@link CalculatorType}.
     *
     * @param calculatorType
     *            The kind of calculator to create
     * @param counterType
     *            The kind of length counter the calculator should use
     * @param aligns
     *            The alignments to be processed; only handed to the {@link CalculatorType#POISSON} calculator
     * @return A new calculator
     * @throws UnsupportedOperationException
     *             If the calculator type is not handled
     */
    private static Calculator getCalculator(CalculatorType calculatorType, CounterType counterType,
            List<Alignment> aligns) {
        Counter counter = getCounter(counterType);
        switch (calculatorType) {
        case NORMAL:
            return new NormalDistributionCalculator(counter);
        case POISSON:
            return new PoissonDistributionCalculator(counter, aligns);
        }
        throw new UnsupportedOperationException("Unsupported calculator type: " + calculatorType);
    }

    /**
     * Obtain appropriate counter according to the specified {@link CounterType}.
     *
     * @param counterType
     *            The kind of counter to create
     * @return A new counter
     * @throws UnsupportedOperationException
     *             If the counter type is not handled
     */
    private static Counter getCounter(CounterType counterType) {
        switch (counterType) {
        case CHAR:
            return new CharCounter();
        case WORD:
            return new SplitCounter();
        }
        throw new UnsupportedOperationException("Unsupported counter type: " + counterType);
    }

    /**
     * Obtain appropriate align algorithm object according to the specified {@link AlgorithmClass}. The
     * algorithm is configured with {@link CategoryDefaults#BEST_CATEGORY_MAP} and a
     * {@link FullMatrixFactory}.
     *
     * @param algorithmClass
     *            The kind of algorithm to create
     * @param calculator
     *            The calculator the algorithm should use
     * @return A new algorithm object
     * @throws UnsupportedOperationException
     *             If the algorithm class is not handled
     */
    private static AlignAlgorithm getAlgorithm(AlgorithmClass algorithmClass, Calculator calculator) {
        MatrixFactory matrixFactory = new FullMatrixFactory();
        Map<Category, Float> map = CategoryDefaults.BEST_CATEGORY_MAP;
        switch (algorithmClass) {
        case VITERBI:
            return new ViterbiAlgorithm(calculator, map, matrixFactory);
        case FB:
            return new ForwardBackwardAlgorithm(calculator, map, matrixFactory);
        }
        throw new UnsupportedOperationException("Unsupported algorithm class: " + algorithmClass);
    }

    /**
     * Use mALIGNa to align the specified source and target texts, according to the specified parameters.
     * The texts are wrapped in one all-to-all {@link Alignment} which is then handed to the mALIGNa aligner
     * filter.
     *
     * @param algorithmClass
     *            The algorithm to use
     * @param calculatorType
     *            The calculator to use
     * @param counterType
     *            The length counter to use
     * @param source
     *            Source segments
     * @param target
     *            Target segments
     * @return The alignments returned by the mALIGNa filter
     */
    private static List<Alignment> doAlign(AlgorithmClass algorithmClass, CalculatorType calculatorType,
            CounterType counterType, List<String> source, List<String> target) {
        List<Alignment> aligns = Arrays.asList(new Alignment(source, target));
        Calculator calculator = getCalculator(calculatorType, counterType, aligns);
        AlignAlgorithm algorithm = getAlgorithm(algorithmClass, calculator);
        Filter filter = new net.loomchild.maligna.filter.aligner.Aligner(algorithm);
        return filter.apply(aligns);
    }

    /**
     * Re-align the content of the given beads using the current algorithm, calculator and counter settings.
     * The source and target lines of all beads are pooled, in order, and aligned as a single unit.
     *
     * @param beads
     *            The beads whose lines are to be re-aligned
     * @return Newly created beads holding the result
     */
    List<MutableBead> doAlign(List<MutableBead> beads) {
        List<String> source = new ArrayList<>();
        List<String> target = new ArrayList<>();
        for (MutableBead bead : beads) {
            source.addAll(bead.sourceLines);
            target.addAll(bead.targetLines);
        }
        return doAlign(algorithmClass, calculatorType, counterType, source, target).stream()
                .map(MutableBead::new).collect(Collectors.toList());
    }
}