TagUtil.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2013-2014 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.statistics.StatisticsSettings;

/**
 * A collection of tag-related static utilities.
 *
 * @author Aaron Madlon-Kay
 */
public class TagUtil {

    /**
     * Orders tags by ascending {@link Tag#pos}.
     * <p>
     * The comparison goes through {@link Comparator#comparingInt} instead of
     * subtracting the two positions: a subtraction can overflow for operands
     * of opposite sign and then report the wrong ordering.
     */
    private static final Comparator<Tag> TAG_COMPARATOR = Comparator.comparingInt(tag -> tag.pos);

    /**
     * A tag occurrence: the literal tag text paired with the offset at which
     * it was found.
     */
    public static class Tag {
        /** Offset of the tag within the text it was located in. */
        public final int pos;
        /** The literal text of the tag. */
        public final String tag;

        /**
         * @param pos
         *            offset of the tag
         * @param tag
         *            literal text of the tag
         */
        public Tag(int pos, String tag) {
            this.pos = pos;
            this.tag = tag;
        }

        /**
         * Classifies this tag by decomposing it with
         * {@link PatternConsts#OMEGAT_TAG_DECOMPILE}.
         *
         * @return {@link TagType#SINGLE} if the pattern does not match or if
         *         group 4 is a slash; {@link TagType#END} if only group 1 is a
         *         slash; {@link TagType#START} if neither group is a slash
         */
        public TagType getType() {
            Matcher parts = PatternConsts.OMEGAT_TAG_DECOMPILE.matcher(tag);
            if (!parts.find()) {
                return TagType.SINGLE;
            }
            if ("/".equals(parts.group(4))) {
                // A trailing slash means SINGLE whatever group 1 holds.
                return TagType.SINGLE;
            }
            return "/".equals(parts.group(1)) ? TagType.END : TagType.START;
        }

        /**
         * Extracts the tag's name, i.e. groups 2 and 3 of
         * {@link PatternConsts#OMEGAT_TAG_DECOMPILE} joined together.
         *
         * @return the name, or the full tag text when the pattern does not
         *         match or when both the leading and the trailing slash are
         *         present
         */
        public String getName() {
            Matcher parts = PatternConsts.OMEGAT_TAG_DECOMPILE.matcher(tag);
            if (parts.find()) {
                boolean slashBefore = "/".equals(parts.group(1));
                boolean slashAfter = "/".equals(parts.group(4));
                if (!(slashBefore && slashAfter)) {
                    return parts.group(2) + parts.group(3);
                }
            }
            return tag;
        }

        /**
         * Builds the text of the counterpart of this tag.
         *
         * @return the closing form for a {@link TagType#START} tag, the
         *         opening form for an {@link TagType#END} tag, or
         *         <code>null</code> for any other type
         */
        public String getPairedTag() {
            switch (getType()) {
            case END:
                return "<" + getName() + ">";
            case START:
                return "</" + getName() + ">";
            default:
                return null;
            }
        }

        @Override
        public int hashCode() {
            // Same value as the conventional prime-31 accumulation seeded with 1.
            int tagHash = (tag == null) ? 0 : tag.hashCode();
            return 31 * (31 + pos) + tagHash;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            Tag that = (Tag) obj;
            if (pos != that.pos) {
                return false;
            }
            return (tag == null) ? that.tag == null : tag.equals(that.tag);
        }

        @Override
        public String toString() {
            return new StringBuilder().append(tag).append('@').append(pos).toString();
        }
    }

    /**
     * The kind of a tag. Examples:
     * <ul>
     * <li>{@code <foo>} is a {@link #START} tag</li>
     * <li>{@code </foo>} is an {@link #END} tag</li>
     * <li>{@code <bar/>} is a {@link #SINGLE} tag</li>
     * </ul>
     */
    public static enum TagType {
        /** An opening tag, e.g. {@code <foo>}. */
        START,
        /** A closing tag, e.g. {@code </foo>}. */
        END,
        /** A standalone tag, e.g. {@code <bar/>}. */
        SINGLE
    }

    /**
     * An immutable pair describing a tag:
     * <ul>
     * <li>the tag's name</li>
     * <li>the tag's {@link TagType}</li>
     * </ul>
     */
    public static class TagInfo {
        /** The tag's name. */
        public final String name;
        /** The tag's type. */
        public final TagType type;

        /**
         * @param name
         *            the tag's name
         * @param type
         *            the tag's type
         */
        public TagInfo(String name, TagType type) {
            this.type = type;
            this.name = name;
        }
    }

    /**
     * Marker placed between the two tags of a suggested pair by
     * {@link #getGroupedMissingTagsFromTarget()} so that the cursor can be
     * relocated there. It is U+E100, a code point in the Unicode Private Use
     * Area.
     */
    public static final String TAG_SEPARATOR_SENTINEL = "\uE100";

    /**
     * Character used to overwrite tags that were already found in a working
     * buffer, so that the same occurrence is not matched twice. Same code
     * point (U+E100) as {@link #TAG_SEPARATOR_SENTINEL}.
     */
    public static final char TEXT_REPLACEMENT = '\uE100';

    /**
     * Collects the tags of the entry currently open in the editor, using its
     * source text and its protected parts.
     *
     * @return the tags as produced by
     *         {@link #buildTagList(String, ProtectedPart[])}
     */
    public static List<Tag> getAllTagsInSource() {
        // NOTE(review): no null check on the current entry - confirm that the
        // editor always has one when this is called.
        SourceTextEntry currentEntry = Core.getEditor().getCurrentEntry();
        String sourceText = currentEntry.getSrcText();
        ProtectedPart[] parts = currentEntry.getProtectedParts();
        return buildTagList(sourceText, parts);
    }

    /**
     * Determines which tags of the current source are absent from the
     * translation currently held by the editor.
     * <p>
     * Each source tag consumes at most one occurrence in the translation: a
     * matched occurrence is blanked out in a working copy, so a tag that
     * appears twice in the source needs two occurrences in the target.
     *
     * @return the source tags without a counterpart in the translation, in
     *         the order returned by {@link #getAllTagsInSource()}
     */
    public static List<Tag> getAllTagsMissingFromTarget() {
        StringBuilder remaining = new StringBuilder(Core.getEditor().getCurrentTranslation());
        List<Tag> missing = new ArrayList<Tag>();

        for (Tag sourceTag : getAllTagsInSource()) {
            int found = remaining.indexOf(sourceTag.tag);
            if (found < 0) {
                missing.add(sourceTag);
                continue;
            }
            // Blank out the match so it cannot satisfy another source tag.
            replaceWith(remaining, found, found + sourceTag.tag.length(), TEXT_REPLACEMENT);
        }
        return missing;
    }

    /**
     * Builds the list of insertion candidates for the tags missing from the
     * current translation. For every missing tag, in order, the following are
     * offered:
     * <ol>
     * <li>the concatenation of the contiguous run of tags starting at it, if
     * that run has more than one member;</li>
     * <li>the tag and its successor joined by
     * {@link #TAG_SEPARATOR_SENTINEL}, if the successor is the tag's paired
     * tag or if both are {@link TagType#SINGLE};</li>
     * <li>each tag of the run on its own.</li>
     * </ol>
     *
     * @return the candidates with duplicates removed, first occurrence kept
     */
    public static List<String> getGroupedMissingTagsFromTarget() {
        List<Tag> missing = getAllTagsMissingFromTarget();
        List<String> candidates = new ArrayList<String>();

        for (int idx = 0; idx < missing.size(); idx++) {
            Tag current = missing.get(idx);

            // Longest contiguous run starting here, offered as one unit.
            List<String> run = new ArrayList<String>();
            for (Tag member : getGroupAt(missing, idx)) {
                run.add(member.tag);
            }
            if (run.size() > 1) {
                candidates.add(String.join("", run));
            }

            // This tag plus the next one as a set, contiguous or not: either
            // a real pair like <foo></foo> or a potential pair like
            // <foo/><foo/>.
            boolean hasSuccessor = idx + 1 < missing.size();
            if (hasSuccessor) {
                Tag following = missing.get(idx + 1);
                String counterpart = current.getPairedTag();
                if (following.tag.equals(counterpart)
                        || (current.getType() == TagType.SINGLE && following.getType() == TagType.SINGLE)) {
                    // The sentinel marks where the cursor should be relocated.
                    candidates.add(current.tag + TAG_SEPARATOR_SENTINEL + following.tag);
                }
            }

            candidates.addAll(run);
        }

        return candidates.stream().distinct().collect(Collectors.toList());
    }

    /**
     * Returns the run of contiguous tags that begins at <code>index</code>.
     * Two tags are contiguous when the first one ends exactly where the
     * second one starts.
     *
     * @param tags
     *            the tags to inspect
     * @param index
     *            index of the tag at which the run should begin
     * @return only the tag at <code>index</code> if it directly follows its
     *         predecessor (it is then inside a run, not at its start);
     *         otherwise that tag followed by all tags contiguous with it
     */
    private static List<Tag> getGroupAt(List<Tag> tags, int index) {
        Tag head = tags.get(index);

        boolean continuesPrevious = false;
        if (index > 0) {
            Tag before = tags.get(index - 1);
            continuesPrevious = before.pos + before.tag.length() == head.pos;
        }
        if (continuesPrevious) {
            // In the middle of a run: hand back just this tag.
            return Arrays.asList(head);
        }

        List<Tag> run = new ArrayList<>();
        run.add(head);
        Tag last = head;
        for (int j = index + 1; j < tags.size(); j++) {
            Tag candidate = tags.get(j);
            if (last.pos + last.tag.length() != candidate.pos) {
                break;
            }
            run.add(candidate);
            last = candidate;
        }
        return run;
    }

    /**
     * Builds a list of the format tags within the supplied string. The tags
     * looked for are the texts of the supplied protected parts (e.g. OmegaT
     * style tags such as {@code <xx02>} or {@code </yy01>}).
     * <p>
     * A tag that occurs several times in the string is reported once per
     * occurrence. Protected parts whose text is <code>null</code> or empty
     * are ignored.
     *
     * @param str
     *            the string to search; may be <code>null</code>
     * @param protectedParts
     *            the protected parts whose texts are searched for; may be
     *            <code>null</code>
     * @return the tags found, sorted by position; empty if either argument is
     *         <code>null</code>
     */
    public static List<Tag> buildTagList(String str, ProtectedPart[] protectedParts) {
        List<Tag> tags = new ArrayList<Tag>();
        if (str == null || protectedParts == null) {
            return tags;
        }

        // Work on a temporary buffer and overwrite each tag as it is found.
        // This ensures that identical tags are only found multiple times if
        // they are actually present multiple times.
        StringBuilder sb = new StringBuilder(str);
        boolean foundAny;
        do {
            foundAny = false;
            for (ProtectedPart pp : protectedParts) {
                String text = pp.getTextInSourceSegment();
                if (text == null || text.isEmpty()) {
                    // An empty text matches at offset 0 on every pass and
                    // overwrites nothing, so the loop would never terminate.
                    continue;
                }
                int pos = sb.indexOf(text);
                if (pos != -1) {
                    tags.add(new Tag(pos, text));
                    replaceWith(sb, pos, pos + text.length(), TEXT_REPLACEMENT);
                    foundAny = true;
                }
            }
        } while (foundAny);

        Collections.sort(tags, TAG_COMPARATOR);
        return tags;
    }

    /**
     * Overwrites the characters of <code>sb</code> in the range
     * <code>[start, end)</code> with <code>replacement</code>. The length of
     * the buffer is not changed.
     *
     * @param sb
     *            the buffer to modify in place
     * @param start
     *            index of the first character to overwrite (inclusive)
     * @param end
     *            index at which to stop (exclusive)
     * @param replacement
     *            the character to write
     */
    private static void replaceWith(StringBuilder sb, int start, int end, char replacement) {
        int cursor = start;
        while (cursor < end) {
            sb.setCharAt(cursor, replacement);
            cursor++;
        }
    }

    /**
     * Concatenates every format tag found in the supplied string. Format tags
     * are whatever {@link PatternConsts#OMEGAT_TAG} matches, i.e. OmegaT style
     * tags such as {@code <xx02>} or {@code </yy01>}.
     *
     * @param str
     *            the string to scan
     * @return the matched tags joined together in order of appearance, with
     *         no separator; an empty string if there are none
     */
    public static String buildTagListForRemove(String str) {
        Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(str);
        StringBuilder collected = new StringBuilder();
        for (boolean found = tagMatcher.find(); found; found = tagMatcher.find()) {
            collected.append(tagMatcher.group());
        }
        return collected.toString();
    }

    /**
     * Finds the first tag in a segment, a tag being whatever
     * {@link PatternConsts#OMEGAT_TAG} matches.
     *
     * @param str
     *            a segment
     * @return the first tag in the segment, or <code>null</code> if it
     *         contains no tags
     */
    public static String getFirstTag(String str) {
        Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(str);
        return tagMatcher.find() ? tagMatcher.group() : null;
    }

    /**
     * Finds the protected parts defined in the Tag Validation Options dialog:
     * printf variables, Java MessageFormat patterns and user-defined custom
     * tags.
     * <p>
     * These protected parts should not affect statistics; they are only
     * displayed in gray in the editor and take part in tag validation.
     * <p>
     * Texts of the already defined protected parts are masked out of the
     * source first, so that the custom pattern cannot match inside them.
     *
     * @param source
     *            the source text to scan
     * @param protectedPartsPatterns
     *            the pattern describing the custom protected parts
     * @param protectedParts
     *            the protected parts already defined; may be <code>null</code>
     * @return <code>protectedParts</code> itself with the newly found parts
     *         appended to it, or a new list holding only the newly found
     *         parts if <code>protectedParts</code> was <code>null</code>
     */
    public static List<ProtectedPart> applyCustomProtectedParts(String source,
            Pattern protectedPartsPatterns, List<ProtectedPart> protectedParts) {
        String masked = source;
        List<ProtectedPart> result = protectedParts;
        if (result == null) {
            result = new ArrayList<ProtectedPart>();
        } else {
            // Mask the parts that are already defined to prevent intersections.
            for (ProtectedPart existing : result) {
                masked = masked.replace(existing.getTextInSourceSegment(), StaticUtils.TAG_REPLACEMENT);
            }
        }

        Matcher customMatcher = protectedPartsPatterns.matcher(masked);
        while (customMatcher.find()) {
            ProtectedPart found = new ProtectedPart();
            String matchedText = customMatcher.group();
            found.setTextInSourceSegment(matchedText);
            found.setDetailsFromSourceFile(matchedText);
            if (StatisticsSettings.isCountingCustomTags()) {
                found.setReplacementWordsCountCalculation(matchedText);
            } else {
                found.setReplacementWordsCountCalculation(StaticUtils.TAG_REPLACEMENT);
            }
            found.setReplacementUniquenessCalculation(matchedText);
            found.setReplacementMatchCalculation(matchedText);
            result.add(found);
        }
        return result;
    }

    /**
     * Strips all tags from the supplied text, converting it to plain text.
     * Tags are detected by {@link PatternConsts#OMEGAT_TAG} only; protected
     * parts are not consulted.
     *
     * @param xml
     *            the text to strip
     * @return the text with every pattern match removed
     */
    public static String stripXmlTags(String xml) {
        Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(xml);
        String plainText = tagMatcher.replaceAll("");
        return plainText;
    }
}