/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2005-2006 Henry Pijffers 2006 Martin Wunderlich 2006-2007 Didier Briel 2008 Martin Fleurke 2011 Alex Buloichik 2012 Wildrich Fourie 2013 Didier Briel Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.core.data; import java.util.ArrayList; import java.util.List; import org.omegat.core.Core; import org.omegat.core.data.IProject.FileInfo; import org.omegat.core.segmentation.Rule; import org.omegat.filters2.IFilter; import org.omegat.filters2.IParseCallback; import org.omegat.util.Language; import org.omegat.util.PatternConsts; import org.omegat.util.StringUtil; /** * Process one entry on parse source file. * * This class caches segments for one file, then flushes they. It required to ability to link prev/next * segments. * * @author Maxym Mykhalchuk * @author Henry Pijffers * @author Alex Buloichik <alex73mail@gmail.com> */ public abstract class ParseEntry implements IParseCallback { private final ProjectProperties config; /** Cached segments. */ private List<ParseEntryQueueItem> parseQueue = new ArrayList<ParseEntryQueueItem>(); public ParseEntry(final ProjectProperties conf) { this.config = conf; } protected void setCurrentFile(FileInfo fi) { } protected void fileFinished() { /** * Flush queue. */ for (ParseEntryQueueItem item : parseQueue) { addSegment(item.id, item.segmentIndex, item.segmentSource, item.protectedParts, item.segmentTranslation, item.segmentTranslationFuzzy, item.props, item.prevSegment, item.nextSegment, item.path); } /** * Clear queue for next file. */ parseQueue.clear(); } /** * {@inheritDoc} */ @Override public void linkPrevNextSegments() { for (int i = 0; i < parseQueue.size(); i++) { ParseEntryQueueItem item = parseQueue.get(i); try { item.prevSegment = parseQueue.get(i - 1).segmentSource; } catch (IndexOutOfBoundsException ex) { // first entry - previous will be empty item.prevSegment = ""; } try { item.nextSegment = parseQueue.get(i + 1).segmentSource; } catch (IndexOutOfBoundsException ex) { // last entry - next will be empty item.nextSegment = ""; } } } /** * This method is called by filters to add new entry in OmegaT after read it from source file. * * @param id * ID of entry, if format supports it * @param source * Translatable source string * @param translation * translation of the source string, if format supports it * @param isFuzzy * flag for fuzzy translation. If a translation is fuzzy, it is not added to the projects TMX, * but it is added to the generated 'reference' TMX, a special TMX that is used as extra * reference during translation. * @param props * a staggered array of non-uniquely-identifying key=value properties (metadata) for the entry * @param path * path of entry in file * @param filter * filter which produces entry * @param protectedParts * protected parts */ @Override public void addEntryWithProperties(String id, String source, String translation, boolean isFuzzy, String[] props, String path, IFilter filter, List<ProtectedPart> protectedParts) { if (StringUtil.isEmpty(source)) { // empty string - not need to save return; } if (props != null && props.length % 2 != 0) { throw new IllegalArgumentException( "Entry properties must be in a key=value array with an even number of items."); } ParseEntryResult tmp = new ParseEntryResult(); boolean removeSpaces = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg(); source = stripSomeChars(source, tmp, config.isRemoveTags(), removeSpaces); source = StringUtil.normalizeUnicode(source); if (config.isRemoveTags() && protectedParts != null) { for (int i = 0; i < protectedParts.size(); i++) { ProtectedPart p = protectedParts.get(i); String s = p.getTextInSourceSegment(); s = PatternConsts.OMEGAT_TAG.matcher(s).replaceAll(""); if (s.isEmpty()) { protectedParts.remove(i); i--; } else { p.setTextInSourceSegment(s); } } } if (translation != null) { translation = stripSomeChars(translation, tmp, config.isRemoveTags(), removeSpaces); translation = StringUtil.normalizeUnicode(translation); } if (config.isSentenceSegmentingEnabled()) { List<StringBuilder> spaces = new ArrayList<StringBuilder>(); List<Rule> brules = new ArrayList<Rule>(); Language sourceLang = config.getSourceLanguage(); List<String> segments = Core.getSegmenter().segment(sourceLang, source, spaces, brules); if (segments.size() == 1) { internalAddSegment(id, (short) 0, segments.get(0), translation, isFuzzy, props, path, protectedParts); } else { for (short i = 0; i < segments.size(); i++) { String onesrc = segments.get(i); List<ProtectedPart> segmentProtectedParts = ProtectedPart.extractFor(protectedParts, onesrc); internalAddSegment(id, i, onesrc, null, false, props, path, segmentProtectedParts); } } } else { internalAddSegment(id, (short) 0, source, translation, isFuzzy, props, path, protectedParts); } } /** * This method is called by filters to add new entry in OmegaT after read it from source file. * <p> * Old call for filters that only support extracting a "comment" property. Kept for compatibility. */ @Override public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment, String path, IFilter filter, List<ProtectedPart> protectedParts) { String[] props = comment == null ? null : new String[] { SegmentProperties.COMMENT, comment }; addEntryWithProperties(id, source, translation, isFuzzy, props, path, filter, protectedParts); } /** * This method is called by filters to add new entry in OmegaT after read it from source file. * <p> * Old call without path, for compatibility. * * @param id * ID of entry, if format supports it * @param source * Translatable source string * @param translation * translation of the source string, if format supports it * @param isFuzzy * flag for fuzzy translation. If a translation is fuzzy, it is not added to the projects TMX, * but it is added to the generated 'reference' TMX, a special TMX that is used as extra * reference during translation. * @param comment * entry's comment, if format supports it * @param filter * filter which produces entry */ @Override public void addEntry(String id, String source, String translation, boolean isFuzzy, String comment, IFilter filter) { addEntry(id, source, translation, isFuzzy, comment, null, filter, null); } /** * Add segment to queue because we possible need to link prev/next segments. */ private void internalAddSegment(String id, short segmentIndex, String segmentSource, String segmentTranslation, boolean segmentTranslationFuzzy, String[] props, String path, List<ProtectedPart> protectedParts) { if (segmentSource.trim().isEmpty()) { // skip empty segments return; } ParseEntryQueueItem item = new ParseEntryQueueItem(); item.id = id; item.segmentIndex = segmentIndex; item.segmentSource = segmentSource; item.protectedParts = protectedParts; item.segmentTranslation = segmentTranslation; item.segmentTranslationFuzzy = segmentTranslationFuzzy; item.props = props; item.path = path; parseQueue.add(item); } /** * Adds a segment to the project. If a translation is given, it it added to * the projects TMX. * * @param id * ID of entry, if format supports it * @param segmentIndex * Number of the segment-part of the original source string. * @param segmentSource * Translatable source string * @param protectedParts * protected parts * @param segmentTranslation * translation of the source string, if format supports it * @param segmentTranslationFuzzy * fuzzy flag of translation of the source string, if format * supports it * @param comment * entry's comment, if format supports it * @param prevSegment * previous segment's text * @param nextSegment * next segment's text * @param path * path of segment */ protected abstract void addSegment(String id, short segmentIndex, String segmentSource, List<ProtectedPart> protectedParts, String segmentTranslation, boolean segmentTranslationFuzzy, String[] props, String prevSegment, String nextSegment, String path); /** * Strip some chars for represent string in UI. * * @param src * source string to strip chars * @return result */ public static String stripSomeChars(final String src, final ParseEntryResult per, boolean removeTags, boolean removeSpaces) { String r = src; /** * AB: we need to find begin/end spaces first, then replace \r,\n chars. * Since \r,\n are spaces, we will not need to store spaces in buffer, * but we can just remember spaces count at the begin and at the end, * then restore spaces from original string. */ /* * Some special space handling: skip leading and trailing whitespace and * non-breaking-space */ int len = r.length(); int b = 0; if (removeSpaces) { for (int cp; b < len; b += Character.charCount(cp)) { cp = r.codePointAt(b); if (!Character.isWhitespace(cp) && cp != '\u00A0') { break; } } } per.spacesAtBegin = b; int e = len; if (removeSpaces) { for (int cp; e > b; e -= Character.charCount(cp)) { cp = r.codePointBefore(e); if (!Character.isWhitespace(cp) && cp != '\u00A0') { break; } } } per.spacesAtEnd = len - e; r = r.substring(b, e); /* * Replacing all occurrences of single CR (\r) or CRLF (\r\n) by LF * (\n). This is reversed on create translation. (fix for bug 1462566) * We don't need to remember crlf/cr presents on parse, but only on * translate. */ per.crlf = r.indexOf("\r\n") > 0; if (per.crlf) { r = r.replace("\r\n", "\n"); } per.cr = r.indexOf("\r") > 0; if (per.cr) { r = r.replace("\r", "\n"); } if (removeTags) { r = PatternConsts.OMEGAT_TAG.matcher(r).replaceAll(""); } r = StringUtil.removeXMLInvalidChars(r); return r; } /** * Storage for results of entry parsing, i.e. cr/crlf flags, spaces counts * on the begin and end. */ public static class ParseEntryResult { public boolean crlf, cr; int spacesAtBegin, spacesAtEnd; } /** * Storage for collected segments. */ protected static class ParseEntryQueueItem { String id; short segmentIndex; String segmentSource; List<ProtectedPart> protectedParts; String segmentTranslation; boolean segmentTranslationFuzzy; String[] props; String prevSegment; String nextSegment; String path; } }