// AbstractFilter.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2006 Martin Wunderlich
               2011 Alex Buloichik, Didier Briel,
               2012 Guido Leenders
               2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters2;

import java.awt.Dialog;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.omegat.util.EncodingDetector;
import org.omegat.util.NullBufferedWriter;
import org.omegat.util.OStrings;

/**
 * The base class for all filters (aka file handlers). Each filter should extend this class or one of its
 * descendants.
 * <p>
 * A filter works as follows:
 * <ol>
 * <li>Source text is extracted.
 * <li>Tags are converted into shortcuts and these shortcuts are temporarily stored.
 * <li>Source text with shortened tags is sent to OmegaT core.
 * <li>Core returns a translation (or the same text if there's no translation).
 * <li>Tag shortcuts are expanded in the translated text.
 * <li>Translated text is written into the output file.
 * </ol>
 *
 * @author Maxym Mykhalchuk
 * @author Martin Wunderlich
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Didier Briel
 * @author Guido Leenders
 * @author Aaron Madlon-Kay
 */
public abstract class AbstractFilter implements IFilter {

    /**
     * This value represents to the user that the encoding is determined by the filter itself. "In code" the
     * <code>null</code> is used to represent automatic encoding selection.
     */
    public static final String ENCODING_AUTO_HUMAN = OStrings.getString("ENCODING_AUTO");

    /** The original filename (with extension). */
    public static final String TFP_FILENAME = "${filename}";
    /** The original filename without extension. */
    public static final String TFP_NAMEONLY = "${nameOnly}";
    /** The original file extension. */
    public static final String TFP_EXTENSION = "${extension}";
    /** "xx_YY", locale code */
    public static final String TFP_TARGET_LOCALE = "${targetLocale}";
    /** "XX-YY", the TMX/XML language code */
    public static final String TFP_TARGET_LANGUAGE = "${targetLanguage}";
    /** language "XX" only */
    public static final String TFP_TARGET_LANG_CODE = "${targetLanguageCode}";
    /** country "YY" only */
    public static final String TFP_TARGET_COUNTRY_CODE = "${targetCountryCode}";
    /** Old spelling of the variable for country "YY" only */
    public static final String TFP_TARGET_COUTRY_CODE = "${targetCoutryCode}";
    /** System time at generation time in various patterns. */
    public static final String TFP_TIMESTAMP_LA = "${timestamp-a}";
    public static final String TFP_TIMESTAMP_LD = "${timestamp-d}";
    public static final String TFP_TIMESTAMP_LDD = "${timestamp-dd}";
    public static final String TFP_TIMESTAMP_LH = "${timestamp-h}";
    public static final String TFP_TIMESTAMP_LHH = "${timestamp-hh}";
    public static final String TFP_TIMESTAMP_LM = "${timestamp-m}";
    public static final String TFP_TIMESTAMP_LMM = "${timestamp-mm}";
    public static final String TFP_TIMESTAMP_LS = "${timestamp-s}";
    public static final String TFP_TIMESTAMP_LSS = "${timestamp-ss}";
    public static final String TFP_TIMESTAMP_LYYYY = "${timestamp-yyyy}";
    public static final String TFP_TIMESTAMP_UD = "${timestamp-D}";
    public static final String TFP_TIMESTAMP_UEEE = "${timestamp-EEE}";
    public static final String TFP_TIMESTAMP_UEEEE = "${timestamp-EEEE}";
    public static final String TFP_TIMESTAMP_UH = "${timestamp-H}";
    public static final String TFP_TIMESTAMP_UHH = "${timestamp-HH}";
    public static final String TFP_TIMESTAMP_UM = "${timestamp-M}";
    public static final String TFP_TIMESTAMP_UMM = "${timestamp-MM}";
    public static final String TFP_TIMESTAMP_UMMM = "${timestamp-MMM}";
    /** Workstation properties. */
    public static final String TFP_SYSTEM_OS_NAME = "${system-os-name}";
    public static final String TFP_SYSTEM_OS_VERSION = "${system-os-version}";
    public static final String TFP_SYSTEM_OS_ARCH = "${system-os-arch}";
    public static final String TFP_SYSTEM_USER_NAME = "${system-user-name}";
    public static final String TFP_SYSTEM_HOST_NAME = "${system-host-name}";
    /** File properties. */
    public static final String TFP_FILE_SOURCE_ENCODING = "${file-source-encoding}";
    public static final String TFP_FILE_TARGET_ENCODING = "${file-target-encoding}";
    public static final String TFP_FILE_FILTER_NAME = "${file-filter-name}";
    /** Microsoft. */
    public static final String TFP_TARGET_LOCALE_LCID = "${targetLocaleLCID}";

    /**
     * Name of the encoding used to read the source file in the most recent call to
     * {@link #processFile(File, File, FilterContext)}; <code>null</code> until that method has run.
     */
    protected String inEncodingLastParsedFile;

    /**
     * All target filename patterns. The old spelling {@link #TFP_TARGET_COUTRY_CODE} is not part of
     * this list.
     */
    private static final String[] TARGET_FILENAME_PATTERNS = new String[] {
                TFP_FILENAME,
                TFP_NAMEONLY,
                TFP_EXTENSION,
                TFP_TARGET_LOCALE,
                TFP_TARGET_LOCALE_LCID,
                TFP_TARGET_LANGUAGE,
                TFP_TARGET_LANG_CODE,
                TFP_TARGET_COUNTRY_CODE,
                TFP_TIMESTAMP_LA,
                TFP_TIMESTAMP_LD,
                TFP_TIMESTAMP_LDD,
                TFP_TIMESTAMP_LH,
                TFP_TIMESTAMP_LHH,
                TFP_TIMESTAMP_LM,
                TFP_TIMESTAMP_LMM,
                TFP_TIMESTAMP_LS,
                TFP_TIMESTAMP_LSS,
                TFP_TIMESTAMP_LYYYY,
                TFP_TIMESTAMP_UD,
                TFP_TIMESTAMP_UEEE,
                TFP_TIMESTAMP_UEEEE,
                TFP_TIMESTAMP_UH,
                TFP_TIMESTAMP_UHH,
                TFP_TIMESTAMP_UM,
                TFP_TIMESTAMP_UMM,
                TFP_TIMESTAMP_UMMM,
                TFP_SYSTEM_OS_NAME,
                TFP_SYSTEM_OS_VERSION,
                TFP_SYSTEM_OS_ARCH,
                TFP_SYSTEM_USER_NAME,
                TFP_SYSTEM_HOST_NAME,
                TFP_FILE_SOURCE_ENCODING,
                TFP_FILE_TARGET_ENCODING,
                TFP_FILE_FILTER_NAME
        };

    /**
     * Returns the target filename pattern variables as a read-only list.
     *
     * @return unmodifiable list of the pattern variables
     */
    public static List<String> getTargetFilenamePatterns() {
        return Collections.unmodifiableList(Arrays.asList(TARGET_FILENAME_PATTERNS));
    }

    /** Callback for parse. */
    protected IParseCallback entryParseCallback;

    /** Callback for translate. */
    protected ITranslateCallback entryTranslateCallback;

    /** Callback for align. */
    protected IAlignCallback entryAlignCallback;

    /** Options for processing time. */
    protected Map<String, String> processOptions;

    /**
     * The default output filename pattern.
     * <p>
     * It is equal to "${filename}", which means that the name of the translated file should be the same as
     * the name of the input file.
     */
    public static final String TARGET_DEFAULT = TFP_FILENAME;

    /**
     * Human-readable name of the File Format this filter supports.
     *
     * @return File format name
     */
    @Override
    public abstract String getFileFormatName();

    /**
     * The default list of filter instances that this filter class has. One filter class may have different
     * filter instances, different by source file mask, encoding of the source file etc.
     * <p>
     * Note that the user may change the instances freely.
     *
     * @return Default filter instances
     */
    @Override
    public abstract Instance[] getDefaultInstances();

    /**
     * Whether source encoding can be varied by the user.
     * <p>
     * True means that OmegaT should handle all the encoding mess.
     * <p>
     * Return false to state that your filter doesn't need encoding management provided by OmegaT, because it
     * either autodetects the encoding based on file contents (like HTML filter does) or the encoding is fixed
     * (like in OpenOffice files).
     *
     * @return whether source encoding can be changed by the user
     */
    @Override
    public abstract boolean isSourceEncodingVariable();

    /**
     * Whether target encoding can be varied by the user.
     * <p>
     * True means that OmegaT should handle all the encoding mess.
     * <p>
     * Return false to state that your filter doesn't need encoding management provided by OmegaT, because the
     * encoding is fixed (like in OpenOffice files), or for some other reason.
     *
     * @return whether target encoding can be changed by the user
     */
    @Override
    public abstract boolean isTargetEncodingVariable();

    /**
     * Returns whether the file is supported by the filter, given the reader with file's contents. There
     * exists a version of this method that takes the file and the filter context,
     * {@link #isFileSupported(File, Map, FilterContext)}. You should override only one of the two.
     * <p>
     * By default returns true, because this method should be overridden only by filters that differentiate
     * input files not by extensions, but by file's content.
     * <p>
     * For example, DocBook files have .xml extension, as possibly many other XML files, so the filter should
     * check a DTD of the document.
     *
     * @param reader
     *            The reader of the source file
     * @return Does the filter support the file
     */
    protected boolean isFileSupported(BufferedReader reader) {
        return true;
    }

    /**
     * Returns whether the file is supported by the filter, given the file and the filter context, which
     * carries the possible file's encoding (<code>null</code> encoding means autodetect). Default
     * implementation creates a reader and calls {@link #isFileSupported(BufferedReader)}. You should
     * override only one of the two.
     * <p>
     * If the reader cannot be created or the check fails with an {@link IOException} or a
     * {@link TranslationException}, the file is reported as not supported.
     * <p>
     * For example, DocBook files have .xml extension, as possibly many other XML files, so the filter should
     * check a DTD of the document.
     *
     * @param inFile
     *            Source file.
     * @param config
     *            Filter configuration; not used by the default implementation.
     * @param fc
     *            Filter context.
     * @return Does the filter support the file.
     */
    @Override
    public boolean isFileSupported(File inFile, Map<String, String> config, FilterContext fc) {
        try (BufferedReader reader = createReader(inFile, fc.getInEncoding())) {
            return isFileSupported(reader);
        } catch (IOException | TranslationException e) {
            // Deliberate best effort: a file that cannot be read is simply "not supported".
            return false;
        }
    }

    /**
     * Define fuzzy mark prefix for source which will be stored in TM. It's 'fuzzy' by default, but each
     * filter can redefine it.
     *
     * @return fuzzy mark prefix
     */
    @Override
    public String getFuzzyMark() {
        return "fuzzy";
    }

    /**
     * Returns the hint displayed while the user edits the filter, and when she adds/edits the instance of
     * this filter. The hint may be any string, preferably in a non-geek language. The default is an empty
     * string.
     *
     * @return The hint for editing the filter in a non-geek language.
     */
    @Override
    public String getHint() {
        return "";
    }

    /**
     * OmegaT calls this to see whether the filter has any options. By default returns false, so filter
     * authors should override this to tell OmegaT core that this filter has options.
     *
     * @return True if the filter has any options, and false otherwise.
     */
    @Override
    public boolean hasOptions() {
        return false;
    }

    /**
     * {@inheritDoc}
     * <p>
     * The default implementation does nothing and returns <code>null</code>.
     */
    @Deprecated
    @Override
    public Map<String, String> changeOptions(Dialog parent, Map<String, String> config) {
        return null;
    }

    /**
     * Creates a reader of an input file.
     * <p>
     * When <code>inEncoding</code> is <code>null</code> the JVM default charset is used. If the reader
     * cannot be built after the file was opened (for example because the encoding is not supported), the
     * underlying file stream is closed before the exception is propagated.
     *
     * @param inFile
     *            The source file.
     * @param inEncoding
     *            Encoding of the input file, if the filter supports it. Otherwise null.
     * @return The reader for the source file
     * @throws UnsupportedEncodingException
     *             Thrown if JVM doesn't support the specified inEncoding
     * @throws IOException
     *             If any I/O Error occurs upon reader creation
     * @throws TranslationException
     *             Should be thrown when processed file has any format defects.
     */
    protected BufferedReader createReader(File inFile, String inEncoding)
            throws UnsupportedEncodingException, IOException, TranslationException {
        FileInputStream fis = new FileInputStream(inFile);
        try {
            InputStreamReader isr;
            if (inEncoding == null) {
                isr = new InputStreamReader(fis, Charset.defaultCharset());
            } else {
                isr = new InputStreamReader(fis, inEncoding);
            }
            return new BufferedReader(isr);
        } catch (UnsupportedEncodingException | RuntimeException e) {
            // Nobody else holds the stream yet, so it must be released here.
            closeAfterFailure(fis, e);
            throw e;
        }
    }

    /**
     * Creates a writer of the translated file.
     * <p>
     * When <code>outEncoding</code> is <code>null</code> the JVM default charset is used. If the writer
     * cannot be built after the file was opened (for example because the encoding is not supported), the
     * underlying file stream is closed before the exception is propagated.
     *
     * @param outFile
     *            The target file
     * @param outEncoding
     *            Encoding of the target file, if the filter supports it. Otherwise null.
     * @return The writer for the target file
     * @throws UnsupportedEncodingException
     *             Thrown if JVM doesn't support the specified outEncoding
     * @throws IOException
     *             If any I/O Error occurs upon writer creation
     */
    protected BufferedWriter createWriter(File outFile, String outEncoding)
            throws UnsupportedEncodingException, IOException {
        FileOutputStream fos = new FileOutputStream(outFile);
        try {
            OutputStreamWriter osw;
            if (outEncoding == null) {
                osw = new OutputStreamWriter(fos, Charset.defaultCharset());
            } else {
                osw = new OutputStreamWriter(fos, outEncoding);
            }
            return new BufferedWriter(osw);
        } catch (UnsupportedEncodingException | RuntimeException e) {
            // Nobody else holds the stream yet, so it must be released here.
            closeAfterFailure(fos, e);
            throw e;
        }
    }

    /**
     * Closes a resource while another exception is already propagating. A failure to close is attached
     * to the primary exception as suppressed instead of replacing it.
     *
     * @param resource
     *            resource to close
     * @param primary
     *            the exception that is about to be rethrown
     */
    private static void closeAfterFailure(AutoCloseable resource, Throwable primary) {
        try {
            resource.close();
        } catch (Exception closeFailure) {
            primary.addSuppressed(closeFailure);
        }
    }

    /**
     * Processes a single file given a reader and a writer. Generally this
     * method should read strings from the input reader and write them to the
     * output writer. In order to let OmegaT know what strings are translatable
     * and to get their translation, filter should call
     * {@link #processEntry(String)} method.
     * <p>
     * Note that outFile is never null, even when the project is loading (in
     * this case it writes to nowhere, but anyway you may use it...)
     * <p>
     * If you need more control over processed files, override
     * {@link #processFile(File, File, FilterContext)} instead.
     *
     * @param inFile
     *            Reader of the source file. It's the result of calling
     *            {@link #createReader(File,String)}.
     * @param outFile
     *            Writer of the target file on compilation (the result of
     *            calling {@link #createWriter(File, String)}), or a fictive
     *            writer to /dev/null.
     * @param fc
     *            Filter context.
     * @throws TranslationException
     *             Should be thrown when processed file has any format defects.
     * @throws IOException
     *             In case of any I/O error.
     */
    protected abstract void processFile(BufferedReader inFile, BufferedWriter outFile, FilterContext fc) throws IOException,
            TranslationException;

    /**
     * Processes a single file given an input and output files (output file may be null while loading files).
     * This method can be used to create a filter that works with the source/target files directly, rather
     * than using BufferedReader/BufferedWriter.
     * <p>
     * Generally this method should read strings from the input reader and write them to the output writer. In
     * order to let OmegaT know what strings are translatable and to get their translation, filter should call
     * {@link #processEntry(String)} method.
     * <p>
     * If you override this method and do all the processing here, you should simply implement
     * {@link #processFile(BufferedReader, BufferedWriter, FilterContext)} with a stub.
     * <p>
     * Default implementation resolves the source encoding with {@link #getInputEncoding(FilterContext, File)},
     * calls {@link #createReader(File,String)} to create a reader and records the encoding in
     * {@link #inEncodingLastParsedFile}. It then uses a {@link NullBufferedWriter} for a <code>null</code>
     * output file, or {@link #createWriter(File,String)} with the result of
     * {@link #getOutputEncoding(FilterContext)} if the output file is not <code>null</code>; then calls
     * {@link #processFile(BufferedReader, BufferedWriter, FilterContext)} to process the source file, and
     * finally closes the writer and then the reader.
     *
     * @param inFile
     *            The source file.
     * @param outFile
     *            The target file, or <code>null</code> when nothing should be written.
     * @param fc
     *            Filter context.
     *
     * @throws IOException
     *             In case of any I/O error.
     * @throws TranslationException
     *             Should be thrown when processed file has any format defects.
     */
    protected void processFile(File inFile, File outFile, FilterContext fc) throws IOException,
            TranslationException {
        String encoding = getInputEncoding(fc, inFile);
        try (BufferedReader reader = createReader(inFile, encoding)) {
            // Must be recorded before getOutputEncoding() is consulted, because that method reads it.
            inEncodingLastParsedFile = encoding == null ? Charset.defaultCharset().name() : encoding;

            try (BufferedWriter writer = outFile != null
                    ? createWriter(outFile, getOutputEncoding(fc))
                    : new NullBufferedWriter()) {
                processFile(reader, writer, fc);
            }
        }
    }

    /**
     * Get the input encoding. If it's not set in the FilterContext (setting is "&lt;auto&gt;")
     * and the filter allows ({@link #isSourceEncodingVariable()}), try to detect it. The result may be null.
     *
     * @param fc
     *            Filter context providing the configured input encoding.
     * @param inFile
     *            Source file whose encoding is detected when none is configured.
     * @return The configured encoding, otherwise the detected one; may be <code>null</code>.
     * @throws IOException
     *             If encoding detection fails with an I/O error.
     */
    protected String getInputEncoding(FilterContext fc, File inFile) throws IOException {
        String encoding = fc.getInEncoding();
        if (encoding == null && isSourceEncodingVariable()) {
            encoding = EncodingDetector.detectEncoding(inFile);
        }
        return encoding;
    }

    /**
     * Get the output encoding. If it's not set in the FilterContext (setting is "&lt;auto&gt;")
     * and the filter allows ({@link #isTargetEncodingVariable()}):
     * <ul><li>Reuse the input encoding if it's Unicode (its name starts with "utf-", ignoring case)
     * <li>If the input was not Unicode, fall back to UTF-8.
     * </ul>
     * The result may be null.
     *
     * @param fc
     *            Filter context providing the configured output encoding.
     * @return The output encoding to use; may be <code>null</code>.
     */
    protected String getOutputEncoding(FilterContext fc) {
        String encoding = fc.getOutEncoding();
        if (encoding == null && isTargetEncodingVariable()) {
            // Use input encoding if it's Unicode; otherwise default to UTF-8.
            // regionMatches(ignoreCase = true, ...) compares case-insensitively without
            // depending on the default locale.
            boolean inputIsUnicode = inEncodingLastParsedFile != null
                    && inEncodingLastParsedFile.regionMatches(true, 0, "utf-", 0, 4);
            encoding = inputIsUnicode ? inEncodingLastParsedFile : "UTF-8";
        }
        return encoding;
    }

    /**
     * Parses a source file: installs the parse callback and options, runs
     * {@link #processFile(File, File, FilterContext)} with a <code>null</code> output file and, if
     * {@link #requirePrevNextFields()} is true, asks the callback to link previous/next segments. The
     * parse callback and options are cleared again afterwards, even on failure.
     */
    @Override
    public final void parseFile(File inFile, Map<String, String> config, FilterContext fc,
            IParseCallback callback) throws Exception {
        entryParseCallback = callback;
        entryTranslateCallback = null;
        entryAlignCallback = null;
        processOptions = config;

        try {
            processFile(inFile, null, fc);
            if (requirePrevNextFields()) {
                // parsing - need to link prev/next
                entryParseCallback.linkPrevNextSegments();
            }
        } finally {
            entryParseCallback = null;
            processOptions = null;
        }
    }

    /**
     * Aligns a source file against its translated file: installs the align callback and options, opens
     * readers for both files (using the input and output encodings from the filter context) and
     * delegates to {@link #alignFile(BufferedReader, BufferedReader, FilterContext)}. Both readers are
     * closed afterwards, even on failure.
     */
    @Override
    public final void alignFile(File inFile, File outFile, Map<String, String> config, FilterContext fc,
            IAlignCallback callback) throws Exception {
        entryParseCallback = null;
        entryTranslateCallback = null;
        entryAlignCallback = callback;
        processOptions = config;

        // try-with-resources guarantees readerIn is released even when opening readerOut fails.
        try (BufferedReader readerIn = createReader(inFile, fc.getInEncoding());
                BufferedReader readerOut = createReader(outFile, fc.getOutEncoding())) {
            alignFile(readerIn, readerOut, fc);
        }
    }

    /**
     * Align source file against translated file. The default implementation does nothing.
     *
     * @param sourceFile
     *            source file
     * @param translatedFile
     *            translated file
     * @param fc
     *            Filter context.
     */
    protected void alignFile(BufferedReader sourceFile, BufferedReader translatedFile, FilterContext fc) throws Exception {
    }

    /**
     * Can be overridden to return true. That means that two-pass parsing and translating will be
     * performed and prev/next segments will be linked.
     *
     * @return false by default
     */
    protected boolean requirePrevNextFields() {
        return false;
    }

    /**
     * Translates a source file into the target file: installs the translate callback and options and
     * runs {@link #processFile(File, File, FilterContext)} as pass 1. If
     * {@link #requirePrevNextFields()} is true, the callback links previous/next segments and the file
     * is processed a second time as pass 2. The translate callback and options are cleared again
     * afterwards, even on failure.
     */
    @Override
    public final void translateFile(File inFile, File outFile, Map<String, String> config, FilterContext fc,
       ITranslateCallback callback) throws Exception {
        entryParseCallback = null;
        entryTranslateCallback = callback;
        entryAlignCallback = null;
        processOptions = config;

        try {
            entryTranslateCallback.setPass(1);
            processFile(inFile, outFile, fc);
            if (requirePrevNextFields()) {
                entryTranslateCallback.linkPrevNextSegments();
                entryTranslateCallback.setPass(2);
                processFile(inFile, outFile, fc);
            }
        } finally {
            entryTranslateCallback = null;
            processOptions = null;
        }
    }

    /**
     * Call this method to:
     * <ul>
     * <li>Instruct OmegaT what source strings are translatable.
     * <li>Get the translation of each source string.
     * </ul>
     * Equivalent to {@link #processEntry(String, String)} with a <code>null</code> comment.
     *
     * @param entry
     *            Translatable source string
     * @return Translation of the source string. If there's no translation, returns the source string itself.
     */
    protected final String processEntry(String entry) {
        return processEntry(entry, null);
    }

    /**
     * Call this method to:
     * <ul>
     * <li>Instruct OmegaT what source strings are translatable.
     * <li>Get the translation of each source string.
     * </ul>
     * When a parse callback is set the entry is registered with it and returned unchanged; otherwise
     * the translate callback is asked for the translation, so one of the two callbacks must be set when
     * this method is called.
     *
     * @param entry
     *            Translatable source string
     * @param comment comment on the source string in the source file (if available)
     * @return Translation of the source string. If there's no translation, returns the source string itself.
     */
    protected final String processEntry(String entry, String comment) {
        if (entryParseCallback != null) {
            entryParseCallback.addEntry(null, entry, null, false, comment, null, this, null);
            return entry;
        } else {
            String translation = entryTranslateCallback.getTranslation(null, entry, null);
            return translation != null ? translation : entry;
        }
    }

    /**
     * Set both callbacks. Used for child XML filters only.
     *
     * @param parseCallback
     *            callback to use for parsing
     * @param translateCallback
     *            callback to use for translating
     */
    public void setCallbacks(IParseCallback parseCallback, ITranslateCallback translateCallback) {
        this.entryParseCallback = parseCallback;
        this.entryTranslateCallback = translateCallback;
    }

    /**
     * Returns the current value of {@link #inEncodingLastParsedFile}.
     */
    @Override
    public String getInEncodingLastParsedFile() {
        return inEncodingLastParsedFile;
    }

}