/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2006 Martin Wunderlich 2011 Alex Buloichik, Didier Briel, 2012 Guido Leenders 2015 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.filters2; import java.awt.Dialog; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; import org.omegat.util.EncodingDetector; import org.omegat.util.NullBufferedWriter; import org.omegat.util.OStrings; /** * The base class for all filters (aka file handlers). Each filter should extend this class or one of its * descendants. * <p> * The process how the filter works is the following: * <ol> * <li>Source text is extracted. * <li>Tags are converted into shortcuts and these shortcuts are temporarily stored * <li>Source text with shortened tags is sent to OmegaT core * <li>Core returns a translation (or the same text if there's no translation) * <li>Tags shortcuts are expanded in translated text * <li>Translated text is written into the output file * </ol> * * @author Maxym Mykhalchuk * @author Martin Wunderlich * @author Alex Buloichik (alex73mail@gmail.com) * @author Didier Briel * @author Guido Leenders * @author Aaron Madlon-Kay */ public abstract class AbstractFilter implements IFilter { /** * This value represents to the user that the encoding is determined by the filter itself. "In code" the * <code>null</code> is used to represent automatic encoding selection. */ public static final String ENCODING_AUTO_HUMAN = OStrings.getString("ENCODING_AUTO"); /** The original filename (with extension). */ public static final String TFP_FILENAME = "${filename}"; /** The original filename without extension. */ public static final String TFP_NAMEONLY = "${nameOnly}"; /** The original file extension. */ public static final String TFP_EXTENSION = "${extension}"; /** "xx_YY", locale code */ public static final String TFP_TARGET_LOCALE = "${targetLocale}"; /** "XX-YY", the TMX/XML language code */ public static final String TFP_TARGET_LANGUAGE = "${targetLanguage}"; /** language "XX" only */ public static final String TFP_TARGET_LANG_CODE = "${targetLanguageCode}"; /** country "YY" only */ public static final String TFP_TARGET_COUNTRY_CODE = "${targetCountryCode}"; /** Old spelling of the variable for country "YY" only */ public static final String TFP_TARGET_COUTRY_CODE = "${targetCoutryCode}"; /** System time at generation time in various patterns. */ public static final String TFP_TIMESTAMP_LA = "${timestamp-a}"; public static final String TFP_TIMESTAMP_LD = "${timestamp-d}"; public static final String TFP_TIMESTAMP_LDD = "${timestamp-dd}"; public static final String TFP_TIMESTAMP_LH = "${timestamp-h}"; public static final String TFP_TIMESTAMP_LHH = "${timestamp-hh}"; public static final String TFP_TIMESTAMP_LM = "${timestamp-m}"; public static final String TFP_TIMESTAMP_LMM = "${timestamp-mm}"; public static final String TFP_TIMESTAMP_LS = "${timestamp-s}"; public static final String TFP_TIMESTAMP_LSS = "${timestamp-ss}"; public static final String TFP_TIMESTAMP_LYYYY = "${timestamp-yyyy}"; public static final String TFP_TIMESTAMP_UD = "${timestamp-D}"; public static final String TFP_TIMESTAMP_UEEE = "${timestamp-EEE}"; public static final String TFP_TIMESTAMP_UEEEE = "${timestamp-EEEE}"; public static final String TFP_TIMESTAMP_UH = "${timestamp-H}"; public static final String TFP_TIMESTAMP_UHH = "${timestamp-HH}"; public static final String TFP_TIMESTAMP_UM = "${timestamp-M}"; public static final String TFP_TIMESTAMP_UMM = "${timestamp-MM}"; public static final String TFP_TIMESTAMP_UMMM = "${timestamp-MMM}"; /** Workstation properties. */ public static final String TFP_SYSTEM_OS_NAME = "${system-os-name}"; public static final String TFP_SYSTEM_OS_VERSION = "${system-os-version}"; public static final String TFP_SYSTEM_OS_ARCH = "${system-os-arch}"; public static final String TFP_SYSTEM_USER_NAME = "${system-user-name}"; public static final String TFP_SYSTEM_HOST_NAME = "${system-host-name}"; /** File properties. */ public static final String TFP_FILE_SOURCE_ENCODING = "${file-source-encoding}"; public static final String TFP_FILE_TARGET_ENCODING = "${file-target-encoding}"; public static final String TFP_FILE_FILTER_NAME = "${file-filter-name}"; /** Microsoft. */ public static final String TFP_TARGET_LOCALE_LCID = "${targetLocaleLCID}"; protected String inEncodingLastParsedFile; /** All target filename patterns. */ private static final String[] TARGET_FILENAME_PATTERNS = new String[] { TFP_FILENAME, TFP_NAMEONLY, TFP_EXTENSION, TFP_TARGET_LOCALE, TFP_TARGET_LOCALE_LCID, TFP_TARGET_LANGUAGE, TFP_TARGET_LANG_CODE, TFP_TARGET_COUNTRY_CODE, TFP_TIMESTAMP_LA, TFP_TIMESTAMP_LD, TFP_TIMESTAMP_LDD, TFP_TIMESTAMP_LH, TFP_TIMESTAMP_LHH, TFP_TIMESTAMP_LM, TFP_TIMESTAMP_LMM, TFP_TIMESTAMP_LS, TFP_TIMESTAMP_LSS, TFP_TIMESTAMP_LYYYY, TFP_TIMESTAMP_UD, TFP_TIMESTAMP_UEEE, TFP_TIMESTAMP_UEEEE, TFP_TIMESTAMP_UH, TFP_TIMESTAMP_UHH, TFP_TIMESTAMP_UM, TFP_TIMESTAMP_UMM, TFP_TIMESTAMP_UMMM, TFP_SYSTEM_OS_NAME, TFP_SYSTEM_OS_VERSION, TFP_SYSTEM_OS_ARCH, TFP_SYSTEM_USER_NAME, TFP_SYSTEM_HOST_NAME, TFP_FILE_SOURCE_ENCODING, TFP_FILE_TARGET_ENCODING, TFP_FILE_FILTER_NAME }; public static List<String> getTargetFilenamePatterns() { return Collections.unmodifiableList(Arrays.asList(TARGET_FILENAME_PATTERNS)); } /** Callback for parse. */ protected IParseCallback entryParseCallback; /** Callback for translate. */ protected ITranslateCallback entryTranslateCallback; /** Callback for align. */ protected IAlignCallback entryAlignCallback; /** Options for processing time. */ protected Map<String, String> processOptions; /** * The default output filename pattern. * <p> * It is equal to "${filename}", which means that the name of the translated file should be the same as * the name of the input file. */ public static final String TARGET_DEFAULT = TFP_FILENAME; /** * Human-readable name of the File Format this filter supports. * * @return File format name */ @Override public abstract String getFileFormatName(); /** * The default list of filter instances that this filter class has. One filter class may have different * filter instances, different by source file mask, encoding of the source file etc. * <p> * Note that the user may change the instances freely. * * @return Default filter instances */ @Override public abstract Instance[] getDefaultInstances(); /** * Whether source encoding can be varied by the user. * <p> * True means that OmegaT should handle all the encoding mess. * <p> * Return false to state that your filter doesn't need encoding management provided by OmegaT, because it * either autodetects the encoding based on file contents (like HTML filter does) or the encoding is fixed * (like in OpenOffice files). * * @return whether source encoding can be changed by the user */ @Override public abstract boolean isSourceEncodingVariable(); /** * Whether target encoding can be varied by the user. * <p> * True means that OmegaT should handle all the encoding mess. * <p> * Return false to state that your filter doesn't need encoding management provided by OmegaT, because the * encoding is fixed (like in OpenOffice files), or for some other reason. * * @return whether target encoding can be changed by the user */ @Override public abstract boolean isTargetEncodingVariable(); /** * Returns whether the file is supported by the filter, given the reader with file's contents. There * exists a version of this method that takes file and encoding {@link #isFileSupported(File, Map, FilterContext))}. You * should override only one of the two. * <p> * By default returns true, because this method should be overriden only by filters that differentiate * input files not by extensions, but by file's content. * <p> * For example, DocBook files have .xml extension, as possibly many other XML files, so the filter should * check a DTD of the document. * * @param reader * The reader of the source file * @return Does the filter support the file */ protected boolean isFileSupported(BufferedReader reader) { return true; } /** * Returns whether the file is supported by the filter, given the file and possible file's encoding ( * <code>null</code> encoding means autodetect). Default implementation creates a reader and calls * {@link #isFileSupported(BufferedReader)}. You should override only one of the two. * <p> * For example, DocBook files have .xml extension, as possibly many other XML files, so the filter should * check a DTD of the document. * * @param inFile * Source file. * @param fc * Filter context. * @return Does the filter support the file. */ @Override public boolean isFileSupported(File inFile, Map<String, String> config, FilterContext fc) { try (BufferedReader reader = createReader(inFile, fc.getInEncoding())) { return isFileSupported(reader); } catch (IOException | TranslationException e) { return false; } } /** * Define fuzzy mark prefix for source which will be stored in TM. It's 'fuzzy' by default, but each * filter can redefine it. * * @return fuzzy mark prefix */ @Override public String getFuzzyMark() { return "fuzzy"; } /** * Returns the hint displayed while the user edits the filter, and when she adds/edits the instance of * this filter. The hint may be any string, preferably in a non-geek language. * * @return The hint for editing the filter in a non-geek language. */ @Override public String getHint() { return ""; } /** * OmegaT calls this to see whether the filter has any options. By default returns false, so filter * authors should override this to tell OmegaT core that this filter has options. * * @return True if the filter has any options, and false otherwise. */ @Override public boolean hasOptions() { return false; } /** * {@inheritDoc} */ @Deprecated @Override public Map<String, String> changeOptions(Dialog parent, Map<String, String> config) { return null; } /** * Creates a reader of an input file. * * @param inFile * The source file. * @param inEncoding * Encoding of the input file, if the filter supports it. Otherwise null. * @return The reader for the source file * @throws UnsupportedEncodingException * Thrown if JVM doesn't support the specified inEncoding * @throws IOException * If any I/O Error occurs upon reader creation * @throws TranslationException * Should be thrown when processed file has any format defects. */ protected BufferedReader createReader(File inFile, String inEncoding) throws UnsupportedEncodingException, IOException, TranslationException { InputStreamReader isr; if (inEncoding == null) { isr = new InputStreamReader(new FileInputStream(inFile), Charset.defaultCharset()); } else { isr = new InputStreamReader(new FileInputStream(inFile), inEncoding); } return new BufferedReader(isr); } /** * Creates a writer of the translated file. * * @param outFile * The target file * @param outEncoding * Encoding of the target file, if the filter supports it. Otherwise null. * @return The writer for the target file * @throws UnsupportedEncodingException * Thrown if JVM doesn't support the specified outEncoding * @throws IOException * If any I/O Error occurs upon writer creation */ protected BufferedWriter createWriter(File outFile, String outEncoding) throws UnsupportedEncodingException, IOException { OutputStreamWriter osw; if (outEncoding == null) { osw = new OutputStreamWriter(new FileOutputStream(outFile), Charset.defaultCharset()); } else { osw = new OutputStreamWriter(new FileOutputStream(outFile), outEncoding); } return new BufferedWriter(osw); } /** * Processes a single file given a reader and a writer. Generally this * method should read strings from the input reader and write them to the * output reader. In order to let OmegaT know what strings are translatable * and to get their translation, filter should call * {@link #processEntry(String)} method. * <p> * Note that outFile is never null, even when the project is loading. (in * this case it writes no nowhere, but anyway you may use it...) * <p> * If you need more control over processed files, override * {@link #processFile(File, File, FilterContext)} instead. * * @param inFile * Reader of the source file. It's the result of calling * {@link #createReader(File,String)}. * @param outFile * Writer of the target file on compilation (the result of * calling {@link #createWriter(File, String)}), or a fictive * writer to /dev/null. * @throws TranslationException * Should be thrown when processed file has any format defects. * @throws IOException * In case of any I/O error. */ protected abstract void processFile(BufferedReader inFile, BufferedWriter outFile, FilterContext fc) throws IOException, TranslationException; /** * Processes a single file given an input and output files (output file may be null while loading files). * This method can be used to create a filter that works with the source/target files directly, rather * than using BufferedReader/BufferedWriter. * <p> * Generally this method should read strings from the input reader and write them to the output reader. In * order to let OmegaT know what strings are translatable and to get thair translation, filter should call * {@link #processEntry(String)} method. * <p> * If you override this method and do all the processing here, you should simply implement * {@link #processFile(BufferedReader,BufferedWriter)} with a stub. * <p> * Default implementation calls {@link #createReader(File,String)} to create a reader, * <code>new BufferedWriter(new StringWriter())</code> to create a writer for <code>null</code> output * file, or {@link #createWriter(File,String)} to create a writer if output file is not <code>null</code>; * then calls {@link #processFile(BufferedReader,BufferedWriter)} to process source file, and then closes * reader and writer. * * @param inFile * The source file. * @param outFile * The target file. * @param fc * Filter context. * @returns List of processed files (each element of type {@link File}) or null if the filter can not/did * not process multiple files. * * @throws IOException * In case of any I/O error. * @throws TranslationException * Should be thrown when processed file has any format defects. */ protected void processFile(File inFile, File outFile, FilterContext fc) throws IOException, TranslationException { String encoding = getInputEncoding(fc, inFile); BufferedReader reader = createReader(inFile, encoding); inEncodingLastParsedFile = encoding == null ? Charset.defaultCharset().name() : encoding; try { BufferedWriter writer; if (outFile != null) { String outEncoding = getOutputEncoding(fc); writer = createWriter(outFile, outEncoding); } else { writer = new NullBufferedWriter(); } try { processFile(reader, writer, fc); } finally { writer.close(); } } finally { reader.close(); } } /** * Get the input encoding. If it's not set in the FilterContext (setting is "<auto>") * and the filter allows ({@link #isSourceEncodingVariable()}), try to detect it. The result may be null. * @param fc * @param inFile * @return * @throws IOException */ protected String getInputEncoding(FilterContext fc, File inFile) throws IOException { String encoding = fc.getInEncoding(); if (encoding == null && isSourceEncodingVariable()) { encoding = EncodingDetector.detectEncoding(inFile); } return encoding; } /** * Get the output encoding. If it's not set in the FilterContext (setting is "<auto>") * and the filter allows ({@link #isTargetEncodingVariable()}): * <ul><li>Reuse the input encoding if it's Unicode * <li>If the input was not Unicode, fall back to UTF-8. * </ul> * The result may be null. * @param fc * @return */ protected String getOutputEncoding(FilterContext fc) { String encoding = fc.getOutEncoding(); if (encoding == null && isTargetEncodingVariable()) { // Use input encoding if it's Unicode; otherwise default to UTF-8 if (inEncodingLastParsedFile != null && inEncodingLastParsedFile.toLowerCase().startsWith("utf-")) { encoding = inEncodingLastParsedFile; } else { encoding = "UTF-8"; } } return encoding; } @Override public final void parseFile(File inFile, Map<String, String> config, FilterContext fc, IParseCallback callback) throws Exception { entryParseCallback = callback; entryTranslateCallback = null; entryAlignCallback = null; processOptions = config; try { processFile(inFile, null, fc); if (requirePrevNextFields()) { // parsing - need to link prev/next entryParseCallback.linkPrevNextSegments(); } } finally { entryParseCallback = null; processOptions = null; } } @Override public final void alignFile(File inFile, File outFile, Map<String, String> config, FilterContext fc, IAlignCallback callback) throws Exception { entryParseCallback = null; entryTranslateCallback = null; entryAlignCallback = callback; processOptions = config; BufferedReader readerIn = createReader(inFile, fc.getInEncoding()); BufferedReader readerOut = createReader(outFile, fc.getOutEncoding()); try { alignFile(readerIn, readerOut, fc); } finally { readerIn.close(); readerOut.close(); } } /** * Align source file against translated file. * * @param sourceFile * source file * @param translatedFile * translated file */ protected void alignFile(BufferedReader sourceFile, BufferedReader translatedFile, FilterContext fc) throws Exception { } /** * Method can be overrided for return true. It means what two-pass parsing and translating will be * processed and prev/next segments will be linked. */ protected boolean requirePrevNextFields() { return false; } @Override public final void translateFile(File inFile, File outFile, Map<String, String> config, FilterContext fc, ITranslateCallback callback) throws Exception { entryParseCallback = null; entryTranslateCallback = callback; entryAlignCallback = null; processOptions = config; try { entryTranslateCallback.setPass(1); processFile(inFile, outFile, fc); if (requirePrevNextFields()) { entryTranslateCallback.linkPrevNextSegments(); entryTranslateCallback.setPass(2); processFile(inFile, outFile, fc); } } finally { entryTranslateCallback = null; processOptions = null; } } /** * Call this method to: * <ul> * <li>Instruct OmegaT what source strings are translatable. * <li>Get the translation of each source string. * </ul> * * @param entry * Translatable source string * @return Translation of the source string. If there's no translation, returns the source string itself. */ protected final String processEntry(String entry) { return processEntry(entry, null); } /** * Call this method to: * <ul> * <li>Instruct OmegaT what source strings are translatable. * <li>Get the translation of each source string. * </ul> * * @param entry * Translatable source string * @param comment comment on the source string in the source file (if available) * @return Translation of the source string. If there's no translation, returns the source string itself. */ protected final String processEntry(String entry, String comment) { if (entryParseCallback != null) { entryParseCallback.addEntry(null, entry, null, false, comment, null, this, null); return entry; } else { String translation = entryTranslateCallback.getTranslation(null, entry, null); return translation != null ? translation : entry; } } /** * Set both callbacks. Used for child XML filters only. * * @param parseCallback * @param translateCallback */ public void setCallbacks(IParseCallback parseCallback, ITranslateCallback translateCallback) { this.entryParseCallback = parseCallback; this.entryTranslateCallback = translateCallback; } @Override public String getInEncodingLastParsedFile() { return inEncodingLastParsedFile; } }