ParseEntry.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2005-2006 Henry Pijffers
               2006 Martin Wunderlich
               2006-2007 Didier Briel
               2008 Martin Fleurke
               2011 Alex Buloichik
               2012 Wildrich Fourie
               2013 Didier Briel
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.data;

import java.util.ArrayList;
import java.util.List;

import org.omegat.core.Core;
import org.omegat.core.data.IProject.FileInfo;
import org.omegat.core.segmentation.Rule;
import org.omegat.filters2.IFilter;
import org.omegat.filters2.IParseCallback;
import org.omegat.util.Language;
import org.omegat.util.PatternConsts;
import org.omegat.util.StringUtil;

/**
 * Process one entry on parse source file.
 *
 * This class caches segments for one file, then flushes they. It required to ability to link prev/next
 * segments.
 *
 * @author Maxym Mykhalchuk
 * @author Henry Pijffers
 * @author Alex Buloichik <alex73mail@gmail.com>
 */
public abstract class ParseEntry implements IParseCallback {

    private final ProjectProperties config;

    /** Cached segments. */
    private List<ParseEntryQueueItem> parseQueue = new ArrayList<ParseEntryQueueItem>();

    public ParseEntry(final ProjectProperties conf) {
        this.config = conf;
    }

    protected void setCurrentFile(FileInfo fi) {
    }

    protected void fileFinished() {
        /**
         * Flush queue.
         */
        for (ParseEntryQueueItem item : parseQueue) {
            addSegment(item.id, item.segmentIndex, item.segmentSource, item.protectedParts, item.segmentTranslation,
                    item.segmentTranslationFuzzy, item.props, item.prevSegment, item.nextSegment, item.path);
        }

        /**
         * Clear queue for next file.
         */
        parseQueue.clear();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void linkPrevNextSegments() {
        for (int i = 0; i < parseQueue.size(); i++) {
            ParseEntryQueueItem item = parseQueue.get(i);
            try {
                item.prevSegment = parseQueue.get(i - 1).segmentSource;
            } catch (IndexOutOfBoundsException ex) {
                // first entry - previous will be empty
                item.prevSegment = "";
            }
            try {
                item.nextSegment = parseQueue.get(i + 1).segmentSource;
            } catch (IndexOutOfBoundsException ex) {
                // last entry - next will be empty
                item.nextSegment = "";
            }
        }
    }

    /**
     * This method is called by filters to add new entry in OmegaT after read it from source file.
     *
     * @param id
     *            ID of entry, if format supports it
     * @param source
     *            Translatable source string
     * @param translation
     *            translation of the source string, if format supports it
     * @param isFuzzy
     *            flag for fuzzy translation. If a translation is fuzzy, it is not added to the projects TMX,
     *            but it is added to the generated 'reference' TMX, a special TMX that is used as extra
     *            reference during translation.
     * @param props
     *            a staggered array of non-uniquely-identifying key=value properties (metadata) for the entry
     * @param path
     *            path of entry in file
     * @param filter
     *            filter which produces entry
     * @param protectedParts
     *            protected parts
     */
    @Override
    public void addEntryWithProperties(String id, String source, String translation, boolean isFuzzy, String[] props,
            String path, IFilter filter, List<ProtectedPart> protectedParts) {
        if (StringUtil.isEmpty(source)) {
            // empty string - not need to save
            return;
        }

        if (props != null && props.length % 2 != 0) {
            throw new IllegalArgumentException(
                    "Entry properties must be in a key=value array with an even number of items.");
        }

        ParseEntryResult tmp = new ParseEntryResult();

        boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
        source = stripSomeChars(source, tmp, config.isRemoveTags(), removeSpaces);
        source = StringUtil.normalizeUnicode(source);
        if (config.isRemoveTags() && protectedParts != null) {
            for (int i = 0; i < protectedParts.size(); i++) {
                ProtectedPart p = protectedParts.get(i);
                String s = p.getTextInSourceSegment();
                s = PatternConsts.OMEGAT_TAG.matcher(s).replaceAll("");
                if (s.isEmpty()) {
                    protectedParts.remove(i);
                    i--;
                } else {
                    p.setTextInSourceSegment(s);
                }
            }
        }
        if (translation != null) {
            translation = stripSomeChars(translation, tmp, config.isRemoveTags(), removeSpaces);
            translation = StringUtil.normalizeUnicode(translation);
        }

        if (config.isSentenceSegmentingEnabled()) {
            List<StringBuilder> spaces = new ArrayList<StringBuilder>();
            List<Rule> brules = new ArrayList<Rule>();
            Language sourceLang = config.getSourceLanguage();
            List<String> segments = Core.getSegmenter().segment(sourceLang, source, spaces, brules);
            if (segments.size() == 1) {
                internalAddSegment(id, (short) 0, segments.get(0), translation, isFuzzy, props, path,
                        protectedParts);
            } else {
                for (short i = 0; i < segments.size(); i++) {
                    String onesrc = segments.get(i);
                    List<ProtectedPart> segmentProtectedParts = ProtectedPart.extractFor(protectedParts,
                            onesrc);
                    internalAddSegment(id, i, onesrc, null, false, props, path, segmentProtectedParts);
                }
            }
        } else {
            internalAddSegment(id, (short) 0, source, translation, isFuzzy, props, path, protectedParts);
        }
    }

    /**
     * This method is called by filters to add new entry in OmegaT after read it from source file.
     * <p>
     * Old call for filters that only support extracting a "comment" property. Kept for compatibility.
     */
    @Override
    public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment,
            String path, IFilter filter, List<ProtectedPart> protectedParts) {
        String[] props = comment == null ? null : new String[] { SegmentProperties.COMMENT, comment };
        addEntryWithProperties(id, source, translation, isFuzzy, props, path, filter, protectedParts);
    }

    /**
     * This method is called by filters to add new entry in OmegaT after read it from source file.
     * <p>
     * Old call without path, for compatibility.
     *
     * @param id
     *            ID of entry, if format supports it
     * @param source
     *            Translatable source string
     * @param translation
     *            translation of the source string, if format supports it
     * @param isFuzzy
     *            flag for fuzzy translation. If a translation is fuzzy, it is not added to the projects TMX,
     *            but it is added to the generated 'reference' TMX, a special TMX that is used as extra
     *            reference during translation.
     * @param comment
     *            entry's comment, if format supports it
     * @param filter
     *            filter which produces entry
     */
    @Override
    public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment,
            IFilter filter) {
        addEntry(id, source, translation, isFuzzy, comment, null, filter, null);
    }

    /**
     * Add segment to queue because we possible need to link prev/next segments.
     */
    private void internalAddSegment(String id, short segmentIndex, String segmentSource, String segmentTranslation,
            boolean segmentTranslationFuzzy, String[] props, String path, List<ProtectedPart> protectedParts) {
        if (segmentSource.trim().isEmpty()) {
            // skip empty segments
            return;
        }
        ParseEntryQueueItem item = new ParseEntryQueueItem();
        item.id = id;
        item.segmentIndex = segmentIndex;
        item.segmentSource = segmentSource;
        item.protectedParts = protectedParts;
        item.segmentTranslation = segmentTranslation;
        item.segmentTranslationFuzzy = segmentTranslationFuzzy;
        item.props = props;
        item.path = path;
        parseQueue.add(item);
    }

    /**
     * Adds a segment to the project. If a translation is given, it it added to
     * the projects TMX.
     *
     * @param id
     *            ID of entry, if format supports it
     * @param segmentIndex
     *            Number of the segment-part of the original source string.
     * @param segmentSource
     *            Translatable source string
     * @param protectedParts
     *            protected parts
     * @param segmentTranslation
     *            translation of the source string, if format supports it
     * @param segmentTranslationFuzzy
     *            fuzzy flag of translation of the source string, if format
     *            supports it
     * @param comment
     *            entry's comment, if format supports it
     * @param prevSegment
     *            previous segment's text
     * @param nextSegment
     *            next segment's text
     * @param path
     *            path of segment
     */
    protected abstract void addSegment(String id, short segmentIndex, String segmentSource,
            List<ProtectedPart> protectedParts, String segmentTranslation, boolean segmentTranslationFuzzy,
            String[] props, String prevSegment, String nextSegment, String path);

    /**
     * Strip some chars for represent string in UI.
     *
     * @param src
     *            source string to strip chars
     * @return result
     */
    public static String stripSomeChars(final String src, final ParseEntryResult per, boolean removeTags,
            boolean removeSpaces) {
        String r = src;

        /**
         * AB: we need to find begin/end spaces first, then replace \r,\n chars.
         * Since \r,\n are spaces, we will not need to store spaces in buffer,
         * but we can just remember spaces count at the begin and at the end,
         * then restore spaces from original string.
         */

        /*
         * Some special space handling: skip leading and trailing whitespace and
         * non-breaking-space
         */
        int len = r.length();
        int b = 0;
        if (removeSpaces) {
            for (int cp; b < len; b += Character.charCount(cp)) {
                cp = r.codePointAt(b);
                if (!Character.isWhitespace(cp) && cp != '\u00A0') {
                    break;
                }
            }
        }
        per.spacesAtBegin = b;

        int e = len;
        if (removeSpaces) {
            for (int cp; e > b; e -= Character.charCount(cp)) {
                cp = r.codePointBefore(e);
                if (!Character.isWhitespace(cp) && cp != '\u00A0') {
                    break;
                }
            }
        }
        per.spacesAtEnd = len - e;

        r = r.substring(b, e);

        /*
         * Replacing all occurrences of single CR (\r) or CRLF (\r\n) by LF
         * (\n). This is reversed on create translation. (fix for bug 1462566)
         * We don't need to remember crlf/cr presents on parse, but only on
         * translate.
         */
        per.crlf = r.indexOf("\r\n") > 0;
        if (per.crlf) {
            r = r.replace("\r\n", "\n");
        }
        per.cr = r.indexOf("\r") > 0;
        if (per.cr) {
            r = r.replace("\r", "\n");
        }
        if (removeTags) {
            r = PatternConsts.OMEGAT_TAG.matcher(r).replaceAll("");
        }

        r = StringUtil.removeXMLInvalidChars(r);

        return r;
    }

    /**
     * Storage for results of entry parsing, i.e. cr/crlf flags, spaces counts
     * on the begin and end.
     */
    public static class ParseEntryResult {
        public boolean crlf, cr;
        int spacesAtBegin, spacesAtEnd;
    }

    /**
     * Storage for collected segments.
     */
    protected static class ParseEntryQueueItem {
        String id;
        short segmentIndex;
        String segmentSource;
        List<ProtectedPart> protectedParts;
        String segmentTranslation;
        boolean segmentTranslationFuzzy;
        String[] props;
        String prevSegment;
        String nextSegment;
        String path;
    }
}