/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2007 Didier Briel, Zoltan Bartko 2008 Martin Fleurke 2009 Didier Briel, Martin Fleurke 2010 Didier Briel 2012 Martin Fleurke 2015 Didier Briel Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.util; import java.util.regex.Pattern; /** * Constant patterns, used in different other classes. * * @author Maxym Mykhalchuk * @author Didier Briel * @author Zoltan Bartko (bartkozoltan@bartkozoltan.com) * @author Martin Fleurke */ public class PatternConsts { private static final String RE_OMEGAT_TAG = "<\\/?[a-zA-Z]+[0-9]+\\/?>"; private static final String RE_PRINTF_VARS = "%([1-9]+\\$)?([+-])?('.)?(-)?([0-9]*)(\\.[0-9]*)?[bcdeEfFgGinopsuxX%]"; private static final String RE_SIMPLE_PRINTF_VARS = "%([1-9]+\\$)?([0-9]*)(\\.[0-9]*)?[bcdeEfFgGinopsuxX%]"; private static final String RE_SIMPLE_JAVA_MESSAGEFORMAT_PATTERN_VARS = "\\{([0-9])+\\}"; /** * Compiled pattern to extract the encoding from XML file, if any. Found * encoding is stored in group #1. */ public static final Pattern XML_ENCODING = Pattern .compile("<\\?xml.*?encoding\\s*=\\s*\"(\\S+?)\".*?\\?>"); /** compiled pattern to match XML header */ public static final Pattern XML_HEADER = Pattern.compile("(<\\?xml.*?\\?>)"); /** * Compiled pattern to extract the DOCTYPE declaration from XML file, if * any. Groups: <br> * #1 - DOCTYPE name <br> * #3 - PUBLIC DOCTYPE URL <br> * #5 - SYSTEM DOCTYPE URL */ public static final Pattern XML_DOCTYPE = Pattern .compile("<\\!DOCTYPE\\s+(\\w+)\\s+(PUBLIC\\s+\"(-//.*)\"\\s+)?"); // (SYSTEM\\s+)?\"(.*?)\"\\s+>"); /** * Compiled pattern to extract the root tag from XML file, if any. Group #1 * should contain the root tag. */ public static final Pattern XML_ROOTTAG = Pattern.compile("<(\\w+)"); /** * Compiled pattern to extract the xlmns declaration from an XML file, if * any. Group #2 should contain the xmlns declaration. E.g., * http://www.w3.org/2001/XMLSchema-instance */ public static final Pattern XML_XMLNS = Pattern.compile("xmlns(:\\w+)?=\"(.*?)\""); /** compiled pattern to extract the encoding from HTML file, if any */ public static final Pattern HTML_ENCODING = Pattern.compile( "<meta.*?content\\s*=\\s*[\"']\\s*text/html\\s*;\\s*charset\\s*=\\s*(\\S+?)[\"'].*?/?\\s*>", Pattern.CASE_INSENSITIVE); /** compiled pattern to extract the encoding from HTML5 file, if any */ public static final Pattern HTML5_ENCODING = Pattern.compile( "<meta.*?charset\\s*=\\s*[\"'](\\S+?)[\"'].*?/?\\s*>", Pattern.CASE_INSENSITIVE); /** Compiled pattern to look for HTML file HEAD declaration. Using [^e] instead of . prevents confusing <head> with <header> from HTML 5*/ public static final Pattern HTML_HEAD = Pattern.compile("<head[^e]*?>", Pattern.CASE_INSENSITIVE); /** compiled pattern to look for HTML file HTML declaration */ public static final Pattern HTML_HTML = Pattern.compile("<html.*?>", Pattern.CASE_INSENSITIVE); /** Pattern for detecting html <BR> tags */ public static final Pattern HTML_BR = Pattern.compile("<BR>", Pattern.CASE_INSENSITIVE); /** * Pattern that matches full string containing in full and only * omegat-specific tag (without leading < and trailing >). */ public static final Pattern OMEGAT_TAG_ONLY = Pattern.compile("^\\/?[a-zA-Z]+[0-9]+\\/?$"); /** * Pattern that matches omegat-specific tags (with leading < and trailing * > in any place of a string). */ public static final Pattern OMEGAT_TAG = Pattern.compile(RE_OMEGAT_TAG); /** * Pattern that matches omegat-specific tags (with leading < and trailing * > in any place of a string) plus a space after it. */ public static final Pattern OMEGAT_TAG_SPACE = Pattern.compile("<\\/?[a-zA-Z]+[0-9]+\\/?>\\s"); /** * Pattern that matches omegat-specific tags (with leading < and trailing * > in any place of a string) with a space before it. */ public static final Pattern SPACE_OMEGAT_TAG = Pattern.compile("\\s<\\/?[a-zA-Z]+[0-9]+\\/?>"); /** * Pattern that matches omegat-specific tags (with leading < and trailing * > in any place of a string) and decompiles them into pieces: * <ol> * <li>leading /, if any * <li>tag shortcut * <li>tag number * <li>trailing /, if any * </ol> * Call <code>matcher.group(n)</code> to get each piece. */ public static final Pattern OMEGAT_TAG_DECOMPILE = Pattern.compile("<(\\/?)([a-zA-Z]+)([0-9]+)(\\/?)>"); /** * Pattern that matches paired tag in protected parts * <ol> * <li>opening tag * <li>text between tags * <li>closing tag * </ol> * Call <code>matcher.group(n)</code> to get each piece. */ public static final Pattern PROTECTED_PARTS_PAIRED_TAG_DECOMPILE = Pattern.compile("^(<.+?>)([^<]+?)(<\\/.+?>)"); /** * Pattern that matches an equiv-text attribute as used in XLIFF inline codes. */ public static final Pattern EQUIV_TEXT_ATTRIBUTE_DECOMPILE = Pattern.compile("equiv-text=\"([^\"]+)\""); /** Pattern that detects space-only regular expressions. */ public static final Pattern SPACY_REGEX = Pattern.compile("((\\s|\\\\n|\\\\t|\\\\s)(\\+|\\*)?)+"); /** Pattern that detects language and country, with an optionnal script in the middle. */ public static final Pattern LANG_AND_COUNTRY = Pattern .compile("([A-Za-z]{1,8})(?:(?:-|_)(?:[A-Za-z]{4}(?:-|_))?([A-Za-z0-9]{1,8}))?"); /** Pattern for detecting remote dictionary file archives */ public static final Pattern DICTIONARY_ZIP = Pattern.compile( // "\"([a-z]{1,8})(_([A-Z]{1,8})?)?\\.zip\""); // Hardcoded pattern to get the French dictionary // (fr_FR_1-3-2.zip) in addition to the others // The initial pattern is above. // [ 2138846 ] French dictionary cannot be downloaded and installed "\"([a-z]{1,8})(_([A-Z]{1,8})?)(_1-3-2)?\\.zip\""); public static final Pattern SPACE_TAB = Pattern.compile("( |\t)+"); /** * Pattern for detecting the placeholders in a printf-function string which * can occur in languages like php, C and others. placeholder ::= "%" * [ARGUMENTSWAPSPECIFIER] [SIGNSPECIFIER] [PADDINGSPECIFIER] * [ALIGNMENTSPECIFIER] [WIDTHSPECIFIER] [PRECISIONSPECIFIER] TYPESPECIFIER * NUMBER ::= { "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" } * ARGUMENTSWAPSPECIFIER = NUMBER "$" SIGNSPECIFIER ::= "+" | "-" * PADDINGSPECIFIER ::= " " | "0" | "'" CHARACTER ALIGNMENTSPECIFIER ::= "" * | "-" WIDTHSPECIFIER ::= NUMBER PRECISIONSPECIFIER ::= "." NUMBER * TYPESPECIFIER ::= "b" | "c" | "d" | "e" | "E" | "f" | "F" | "g" | "G" | * "i" | "n" | "o" | "p" | "s" | "u" | "x" | "X" | "%" //c++: * [cdieEfgGosuxXpn%] //php: [bcdeufFosxX%] NB: Because having space as * paddingspecifier leads to many false matches in regular text, and space * being the default padding specifier in php, and being able to have space * or 0 as padding specifier by prefixing it with ', and having the padding * specifier not being used frequently in most cases, the regular expression * only corresponds with quote+paddingspecifier. NB2: The argument swap * specifier gives explicit ordering of variables, without it, the ordering * is implicit (first in sequence is first in order) Example in code: * <code>echo printf(gettext("%s is very %s"), "OmegaT", "great");</code> */ public static final Pattern PRINTF_VARS = Pattern .compile(RE_PRINTF_VARS); /** * Pattern for detecting the placeholders in a printf-function string. It * detects only simple placeholders, without SIGN-, PADDING-, ALIGNMENT- and * WIDTH specifier. * * @see #PRINTF_VARS */ public static final Pattern SIMPLE_PRINTF_VARS = Pattern.compile(RE_SIMPLE_PRINTF_VARS); public static final Pattern SIMPLE_JAVA_MESSAGEFORMAT_PATTERN_VARS = Pattern .compile(RE_SIMPLE_JAVA_MESSAGEFORMAT_PATTERN_VARS); /** * Pattern for detecting OmegaT-tags and other placeholders (extended sprintf-variant) in texts */ public static final Pattern SIMPLE_PLACEHOLDERS = Pattern.compile(RE_OMEGAT_TAG + "|" + RE_PRINTF_VARS); /** * combined pattern for all placeholder tags */ private static Pattern PLACEHOLDERS; /** * pattern for text that should be removed from translation. Can be null! */ private static Pattern REMOVE; /** * Pattern for text that should be considered a custom tag. Can be null! */ private static Pattern CUSTOM_TAGS; /** * Returns the placeholder pattern (OmegaT tags, printf tags, java * MessageFomat tags, custom tags, combined according to user configuration) * * @return the pattern * @see #updatePlaceholderPattern() */ public static Pattern getPlaceholderPattern() { if (PLACEHOLDERS == null) { String regexp = RE_OMEGAT_TAG; if ("true".equalsIgnoreCase(Preferences.getPreference(Preferences.CHECK_ALL_PRINTF_TAGS))) { regexp += "|" + RE_PRINTF_VARS; } else if ("true".equalsIgnoreCase(Preferences.getPreference(Preferences.CHECK_SIMPLE_PRINTF_TAGS))) { regexp += "|" + RE_SIMPLE_PRINTF_VARS; } if ("true".equalsIgnoreCase(Preferences.getPreference(Preferences.CHECK_JAVA_PATTERN_TAGS))) { regexp += "|" + RE_SIMPLE_JAVA_MESSAGEFORMAT_PATTERN_VARS; } // assume: customRegExp has already been validated. String customRegExp = Preferences.getPreference(Preferences.CHECK_CUSTOM_PATTERN); if (!"".equalsIgnoreCase(customRegExp)) { regexp += "|" + customRegExp; } PLACEHOLDERS = Pattern.compile(regexp); } return PLACEHOLDERS; } /** * Resets the placeholder pattern. Use it when the user has changed tagvalidation configuration. */ public static void updatePlaceholderPattern() { PLACEHOLDERS = null; } public static Pattern getRemovePattern() { if (REMOVE == null) { String removeRegExp = Preferences.getPreference(Preferences.CHECK_REMOVE_PATTERN); if (!"".equalsIgnoreCase(removeRegExp)) { REMOVE = Pattern.compile(removeRegExp); } } return REMOVE; } /** * Resets the remove pattern. Use it when the user has changed tagvalidation configuration. */ public static void updateRemovePattern() { REMOVE = null; } public static Pattern getCustomTagPattern() { if (CUSTOM_TAGS == null) { String customTagsRegex = Preferences.getPreference(Preferences.CHECK_CUSTOM_PATTERN); if (!"".equalsIgnoreCase(customTagsRegex)) { CUSTOM_TAGS = Pattern.compile(customTagsRegex); } } return CUSTOM_TAGS; } /** * Resets the remove pattern. Use it when the user has changed tagvalidation configuration. */ public static void updateCustomTagPattern() { CUSTOM_TAGS = null; } }