/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2013-2014 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.statistics.StatisticsSettings;
/**
* A collection of tag-related static utilities.
*
* @author Aaron Madlon-Kay
*/
public class TagUtil {
/**
 * Orders tags by ascending {@link Tag#pos}.
 * <p>
 * The positions are compared rather than subtracted so that the result has
 * the correct sign for every pair of int values (a subtraction can overflow).
 */
private static final Comparator<Tag> TAG_COMPARATOR = Comparator.comparingInt(t -> t.pos);
/**
 * A tag's text together with the position recorded for it.
 */
public static class Tag {
    /** Position recorded for this tag. */
    public final int pos;
    /** Literal text of the tag. */
    public final String tag;

    public Tag(int pos, String tag) {
        this.pos = pos;
        this.tag = tag;
    }

    /**
     * Classifies the tag text using {@link PatternConsts#OMEGAT_TAG_DECOMPILE}.
     *
     * @return {@link TagType#END} when only the leading slash is present,
     *         {@link TagType#START} when neither slash is present, and
     *         {@link TagType#SINGLE} in every other case, including text the
     *         pattern does not find a match in
     */
    public TagType getType() {
        Matcher parts = PatternConsts.OMEGAT_TAG_DECOMPILE.matcher(tag);
        if (parts.find()) {
            boolean leadingSlash = "/".equals(parts.group(1));
            boolean trailingSlash = "/".equals(parts.group(4));
            if (!trailingSlash) {
                return leadingSlash ? TagType.END : TagType.START;
            }
        }
        return TagType.SINGLE;
    }

    /**
     * Returns the tag name, i.e. groups 2 and 3 of
     * {@link PatternConsts#OMEGAT_TAG_DECOMPILE} joined together.
     *
     * @return the name, or the whole tag text when the pattern finds no match
     *         or when both the leading and the trailing slash are present
     */
    public String getName() {
        Matcher parts = PatternConsts.OMEGAT_TAG_DECOMPILE.matcher(tag);
        boolean decomposable = parts.find();
        if (!decomposable) {
            return tag;
        }
        boolean slashOnBothSides = "/".equals(parts.group(1)) && "/".equals(parts.group(4));
        return slashOnBothSides ? tag : parts.group(2) + parts.group(3);
    }

    /**
     * Builds the text of the tag that would complete a pair with this one.
     *
     * @return the closing form for a {@link TagType#START} tag, the opening
     *         form for an {@link TagType#END} tag, otherwise <code>null</code>
     */
    public String getPairedTag() {
        TagType type = getType();
        if (type == TagType.START) {
            return "</" + getName() + ">";
        }
        if (type == TagType.END) {
            return "<" + getName() + ">";
        }
        return null;
    }

    @Override
    public int hashCode() {
        int textHash = (tag == null) ? 0 : tag.hashCode();
        return 31 * (31 + pos) + textHash;
    }

    /**
     * Two tags are equal when they are of the same class and have the same
     * position and the same text.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        Tag that = (Tag) obj;
        if (pos != that.pos) {
            return false;
        }
        return (tag == null) ? that.tag == null : tag.equals(that.tag);
    }

    @Override
    public String toString() {
        return new StringBuilder().append(tag).append("@").append(pos).toString();
    }
}
/**
 * Indicates the type of a tag, e.g.:
 * <ul>
 * <li>{@code <foo>} = START</li>
 * <li>{@code </foo>} = END</li>
 * <li>{@code <bar/>} = SINGLE</li>
 * </ul>
 */
public static enum TagType {
START, END, SINGLE
}
/**
 * An immutable pair made of:
 * <ul>
 * <li>a tag's name</li>
 * <li>the tag's {@link TagType}</li>
 * </ul>
 */
public static class TagInfo {
    /** The tag's type. */
    public final TagType type;
    /** The tag's name. */
    public final String name;

    /**
     * @param name the tag's name
     * @param type the tag's type
     */
    public TagInfo(String name, TagType type) {
        this.type = type;
        this.name = name;
    }
}
/**
 * Marker inserted between the two tags of a suggested pair by
 * {@link #getGroupedMissingTagsFromTarget()} to allow cursor relocating.
 */
final public static String TAG_SEPARATOR_SENTINEL = "\uE100";
/**
 * Character written over the text of a tag once it has been found, so that
 * the same occurrence is not found a second time.
 */
final public static char TEXT_REPLACEMENT = '\uE100';
/**
 * Lists the tags of the entry currently active in the editor.
 *
 * @return the result of {@link #buildTagList(String, ProtectedPart[])} for
 *         the current entry's source text and protected parts
 */
public static List<Tag> getAllTagsInSource() {
    SourceTextEntry current = Core.getEditor().getCurrentEntry();
    String sourceText = current.getSrcText();
    ProtectedPart[] parts = current.getProtectedParts();
    return buildTagList(sourceText, parts);
}
/**
 * Lists the source tags that cannot be found in the current translation.
 * <p>
 * Each source tag is looked up in a working copy of the translation. A tag
 * that is found gets overwritten with {@link #TEXT_REPLACEMENT} so that a
 * later identical source tag has to match a different occurrence.
 *
 * @return the source tags with no occurrence left in the translation, in the
 *         order returned by {@link #getAllTagsInSource()}
 */
public static List<Tag> getAllTagsMissingFromTarget() {
    StringBuilder remaining = new StringBuilder(Core.getEditor().getCurrentTranslation());
    List<Tag> missing = new ArrayList<Tag>();
    for (Tag candidate : getAllTagsInSource()) {
        int found = remaining.indexOf(candidate.tag);
        if (found == -1) {
            missing.add(candidate);
            continue;
        }
        // Consume this occurrence so it cannot satisfy another source tag.
        replaceWith(remaining, found, found + candidate.tag.length(), TEXT_REPLACEMENT);
    }
    return missing;
}
/**
 * Builds the insertion suggestions for the tags missing from the translation.
 * <p>
 * For every missing tag, the following are offered in this order:
 * <ol>
 * <li>the longest run of contiguous tags starting at it, joined into one
 * string (only when the run has more than one member);</li>
 * <li>the tag and the next missing tag separated by
 * {@link #TAG_SEPARATOR_SENTINEL}, when the next one is its paired tag or
 * when both are {@link TagType#SINGLE} (contiguous or not);</li>
 * <li>each member of the run on its own.</li>
 * </ol>
 *
 * @return the suggestions with duplicates removed, first occurrence kept
 */
public static List<String> getGroupedMissingTagsFromTarget() {
    List<Tag> missing = getAllTagsMissingFromTarget();
    List<String> suggestions = new ArrayList<String>();
    int count = missing.size();
    for (int idx = 0; idx < count; idx++) {
        Tag current = missing.get(idx);

        // Texts of the contiguous run that starts at this tag.
        List<String> run = new ArrayList<String>();
        for (Tag member : getGroupAt(missing, idx)) {
            run.add(member.tag);
        }
        if (run.size() > 1) {
            suggestions.add(String.join("", run));
        }

        // Offer this tag and the following one as a set when they form an
        // actual pair like <foo></foo> or a potential pair like <foo/><foo/>.
        boolean hasFollowing = idx + 1 < count;
        if (hasFollowing) {
            Tag following = missing.get(idx + 1);
            String expectedPartner = current.getPairedTag();
            if (following.tag.equals(expectedPartner)
                    || (current.getType() == TagType.SINGLE && following.getType() == TagType.SINGLE)) {
                // The sentinel allows cursor relocating.
                suggestions.add(current.tag + TAG_SEPARATOR_SENTINEL + following.tag);
            }
        }

        suggestions.addAll(run);
    }
    return suggestions.stream().distinct().collect(Collectors.toList());
}
/**
 * Collects the run of contiguous tags that begins at <code>index</code>.
 * Two tags are contiguous when the first one ends exactly where the second
 * one starts.
 *
 * @param tags  the tags to examine
 * @param index index of the tag the run should start at
 * @return the tag at <code>index</code> followed by every tag contiguous
 *         with its predecessor; only the tag itself when it is contiguous
 *         with the tag before it (i.e. it is in the middle of a run)
 */
private static List<Tag> getGroupAt(List<Tag> tags, int index) {
    Tag first = tags.get(index);
    if (index > 0) {
        Tag before = tags.get(index - 1);
        boolean continuesEarlierRun = before.pos + before.tag.length() == first.pos;
        if (continuesEarlierRun) {
            return Arrays.asList(first);
        }
    }
    List<Tag> run = new ArrayList<>();
    run.add(first);
    Tag last = first;
    for (Tag candidate : tags.subList(index + 1, tags.size())) {
        if (last.pos + last.tag.length() != candidate.pos) {
            break;
        }
        run.add(candidate);
        last = candidate;
    }
    return run;
}
/**
 * Builds a list of format tags within the supplied string. Format tags are
 * 'protected parts' and OmegaT style tags: {@code <xx02>} or {@code </yy01>}.
 * <p>
 * The protected parts are searched for round-robin: on each pass every part
 * is looked up once, and the passes repeat until one of them finds nothing.
 * Protected parts whose text is <code>null</code> or empty are ignored.
 *
 * @param str            the string to search
 * @param protectedParts the protected parts to look for; may be
 *                       <code>null</code>, in which case no tag is reported
 * @return the tags found, sorted by position
 */
public static List<Tag> buildTagList(String str, ProtectedPart[] protectedParts) {
    List<Tag> tags = new ArrayList<Tag>();
    if (protectedParts == null) {
        return tags;
    }
    // Work on a temporary buffer and overwrite tags as we find them. This
    // ensures that we don't find identical tags multiple times unless they
    // are actually present multiple times.
    StringBuilder sb = new StringBuilder(str);
    boolean foundAny;
    do {
        foundAny = false;
        for (ProtectedPart pp : protectedParts) {
            String text = pp.getTextInSourceSegment();
            if (text == null || text.isEmpty()) {
                // An empty string is found at index 0 on every pass and
                // overwriting it changes nothing, so the search would never
                // terminate.
                continue;
            }
            int pos = sb.indexOf(text);
            if (pos != -1) {
                tags.add(new Tag(pos, text));
                replaceWith(sb, pos, pos + text.length(), TEXT_REPLACEMENT);
                foundAny = true;
            }
        }
    } while (foundAny);
    Collections.sort(tags, TAG_COMPARATOR);
    return tags;
}
/**
 * Overwrites every character of <code>sb</code> from <code>start</code>
 * (inclusive) to <code>end</code> (exclusive) with <code>replacement</code>.
 * The length of the buffer is not changed; nothing happens when
 * <code>start</code> is not smaller than <code>end</code>.
 */
private static void replaceWith(StringBuilder sb, int start, int end, char replacement) {
    int cursor = start;
    while (cursor < end) {
        sb.setCharAt(cursor, replacement);
        cursor++;
    }
}
/**
 * Concatenates every match of {@link PatternConsts#OMEGAT_TAG} (OmegaT style
 * tags: {@code <xx02>} or {@code </yy01>}) found in the supplied string, in
 * order of appearance.
 *
 * @param str the string to search
 * @return a string containing the tags, empty when there is none
 */
public static String buildTagListForRemove(String str) {
    Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(str);
    StringBuilder collected = new StringBuilder();
    for (boolean more = tagMatcher.find(); more; more = tagMatcher.find()) {
        collected.append(tagMatcher.group());
    }
    return collected.toString();
}
/**
 * Finds the first match of {@link PatternConsts#OMEGAT_TAG} in a segment.
 *
 * @param str A segment
 * @return the first tag in the segment, or null if there are no tags
 */
public static String getFirstTag(String str) {
    Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(str);
    return tagMatcher.find() ? tagMatcher.group() : null;
}
/**
 * Finds the protected parts defined in the Tag Validation Options dialog:
 * printf variables, java MessageFormat patterns, user defined custom tags.
 *
 * These protected parts shouldn't affect statistics but just be displayed in
 * gray in the editor and take part in tag validation.
 *
 * @param source                 the source text to search
 * @param protectedPartsPatterns the pattern whose matches become protected
 *                               parts
 * @param protectedParts         the protected parts already known; may be
 *                               <code>null</code>. When not null, this list
 *                               itself receives the new parts.
 * @return <code>protectedParts</code> with one new part appended per match,
 *         or a new list holding the new parts when <code>protectedParts</code>
 *         is <code>null</code>
 */
public static List<ProtectedPart> applyCustomProtectedParts(String source,
        Pattern protectedPartsPatterns, List<ProtectedPart> protectedParts) {
    String searchable = source;
    List<ProtectedPart> result = protectedParts;
    if (result == null) {
        result = new ArrayList<ProtectedPart>();
    } else {
        // Blank out the already defined protected parts first so that the
        // custom pattern cannot match inside them.
        for (ProtectedPart known : result) {
            searchable = searchable.replace(known.getTextInSourceSegment(), StaticUtils.TAG_REPLACEMENT);
        }
    }

    Matcher customMatcher = protectedPartsPatterns.matcher(searchable);
    while (customMatcher.find()) {
        String matched = customMatcher.group();
        ProtectedPart custom = new ProtectedPart();
        custom.setTextInSourceSegment(matched);
        custom.setDetailsFromSourceFile(matched);
        // The statistics settings decide whether the matched text or the
        // generic tag replacement is used for the word count.
        custom.setReplacementWordsCountCalculation(
                StatisticsSettings.isCountingCustomTags() ? matched : StaticUtils.TAG_REPLACEMENT);
        custom.setReplacementUniquenessCalculation(matched);
        custom.setReplacementMatchCalculation(matched);
        result.add(custom);
    }
    return result;
}
/**
 * Removes every match of {@link PatternConsts#OMEGAT_TAG} from the given
 * text (converts to plain text). Tags are detected only by pattern;
 * protected parts are not used.
 *
 * @param xml the text to strip
 * @return the text with all matched tags removed
 */
public static String stripXmlTags(String xml) {
    Matcher tagMatcher = PatternConsts.OMEGAT_TAG.matcher(xml);
    return tagMatcher.replaceAll("");
}
}