/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2007-2008 Martin Fleurke
2012 Didier Briel
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.filters2.html2;
import java.awt.Window;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.htmlparser.Parser;
import org.htmlparser.util.ParserException;
import org.omegat.filters2.AbstractFilter;
import org.omegat.filters2.Instance;
import org.omegat.filters2.TranslationException;
import org.omegat.util.Log;
import org.omegat.util.OStrings;
import org.omegat.util.StringUtil;
/**
* A filter to translate HTML and XHTML files.
* <p>
* Some useful discussion why HTML filter should behave like it does, happened
* on a <a href="https://sourceforge.net/p/omegat/bugs/108/">bug report</a>
* devoted to compressing space.
*
* @author Maxym Mykhalchuk
* @author Martin Fleurke
* @author Didier Briel
*/
public class HTMLFilter2 extends AbstractFilter {
/** Creates a new instance of HTMLFilter2 */
public HTMLFilter2() {
}
/** Stores the source encoding of HTML file. */
private String sourceEncoding;
/** Stores the target encoding of HTML file. */
private String targetEncoding;
/**
* A regular Expression Pattern to be matched to the strings to be
* translated. If there is a match, the string should not be translated
*/
private Pattern skipRegExpPattern;
/**
* A map of attribute-name and attribute value pairs that, if it exist in a
* meta-tag, indicates that the meta-tag should not be translated
*/
private HashMap<String, String> skipMetaAttributes;
/**
* A map of attribute-name and attribute value pairs that, if exist in a
* tag, indicate that this tag should not be translated
*/
private HashMap<String, String> ignoreTagsAttributes;
@Override
protected boolean requirePrevNextFields() {
return true;
}
/**
* Customized version of creating input reader for HTML files, aware of
* encoding by using <code>EncodingAwareReader</code> class.
*
* @see HTMLReader
*/
@Override
public BufferedReader createReader(File infile, String encoding) throws UnsupportedEncodingException,
IOException {
HTMLReader hreader = new HTMLReader(infile.getAbsolutePath(), encoding);
sourceEncoding = hreader.getEncoding();
return new BufferedReader(hreader);
}
/**
* Customized version of creating an output stream for HTML files, appending
* charset meta by using <code>HTMLWriter</code> class.
*
* @see HTMLWriter
*/
@Override
public BufferedWriter createWriter(File outfile, String encoding) throws UnsupportedEncodingException,
IOException {
HTMLWriter hwriter;
HTMLOptions options = new HTMLOptions(processOptions);
if (encoding == null) {
this.targetEncoding = sourceEncoding;
} else {
this.targetEncoding = encoding;
}
hwriter = new HTMLWriter(outfile.getAbsolutePath(), this.targetEncoding, options);
return new BufferedWriter(hwriter);
}
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile,
org.omegat.filters2.FilterContext fc) throws IOException, TranslationException {
StringBuilder all = null;
try {
all = new StringBuilder();
char[] cbuf = new char[1000];
int len = -1;
while ((len = infile.read(cbuf)) > 0) {
all.append(cbuf, 0, len);
}
} catch (OutOfMemoryError e) {
// out of memory?
all = null;
System.gc();
throw new IOException(OStrings.getString("HTML__FILE_TOO_BIG"));
}
HTMLOptions options = new HTMLOptions(processOptions);
// Prepare matcher
String skipRegExp = options.getSkipRegExp();
if (!StringUtil.isEmpty(skipRegExp)) {
try {
this.skipRegExpPattern = Pattern.compile(skipRegExp, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) {
Log.log(e);
}
}
// prepare set of attributes that indicate not to translate a meta-tag
String skipMetaString = options.getSkipMeta();
skipMetaAttributes = new HashMap<String, String>();
String[] skipMetaAttributesStringarray = skipMetaString.split(",");
for (int i = 0; i < skipMetaAttributesStringarray.length; i++) {
String keyvalue = skipMetaAttributesStringarray[i].trim().toUpperCase();
skipMetaAttributes.put(keyvalue, "");
}
// Prepare set of attributes that indicate not to translate a tag
String ignoreTagString = options.getIgnoreTags();
ignoreTagsAttributes = new HashMap<String, String>();
String[] ignoreTagsAttributesStringarray = ignoreTagString.split(",");
for (int i = 0; i < ignoreTagsAttributesStringarray.length; i++) {
String keyvalue = ignoreTagsAttributesStringarray[i].trim().toUpperCase();
ignoreTagsAttributes.put(keyvalue, "");
}
Parser parser = new Parser();
try {
parser.setInputHTML(all.toString());
parser.visitAllNodesWith(new FilterVisitor(this, outfile, options));
} catch (ParserException pe) {
System.out.println(pe);
} catch (StringIndexOutOfBoundsException se) {
throw new StringIndexOutOfBoundsException(OStrings.getString("HTML__INVALID_HTML"));
}
}
// ////////////////////////////////////////////////////////////////////////
/** Package-internal processEntry to give it to FilterVisitor */
public String privateProcessEntry(String entry, String comment) {
if (skipRegExpPattern != null) {
if (skipRegExpPattern.matcher(entry).matches()) {
// System.out.println("Skipping \""+entry+"\"");
return entry;
} else {
// System.out.println("Using: \""+entry+"\"");
return super.processEntry(entry, comment);
}
}
return super.processEntry(entry, comment);
}
// ////////////////////////////////////////////////////////////////////////
public boolean isTargetEncodingVariable() {
return true;
}
public boolean isSourceEncodingVariable() {
return true;
}
public String getFileFormatName() {
return OStrings.getString("HTML__FILTER_NAME");
}
public Instance[] getDefaultInstances() {
return new Instance[] { new Instance("*.htm", null, "UTF-8"), new Instance("*.html", null, "UTF-8"),
new Instance("*.xhtml", null, "UTF-8"), new Instance("*.xht", null, "UTF-8") };
}
/**
* Returns the editing hint for HTML filter.
* <p>
* In English, the hint is as follows: <br>
* Note: Source File Encoding setting affects only the HTML files that have
* no encoding declaration inside. If HTML file has the encoding
* declaration, it will be used disregarding any value you set in this
* dialog.
*/
@Override
public String getHint() {
return OStrings.getString("HTML_NOTE");
}
/**
* Returns true to indicate that (X)HTML filter has options.
*
* @return True, because (X)HTML filter has options.
*/
@Override
public boolean hasOptions() {
return true;
}
/**
* (X)HTML Filter shows a <b>modal</b> dialog to edit its own options.
*
* @param currentOptions
* Current options to edit.
* @return Updated filter options if user confirmed the changes, and current
* options otherwise.
*/
@Override
public Map<String, String> changeOptions(Window parent, Map<String, String> config) {
try {
EditOptionsDialog dialog = new EditOptionsDialog(parent, config);
dialog.setVisible(true);
if (EditOptionsDialog.RET_OK == dialog.getReturnStatus()) {
return dialog.getOptions().getOptionsMap();
} else {
return null;
}
} catch (Exception e) {
Log.logErrorRB("HTML_EXC_EDIT_OPTIONS");
Log.log(e);
return null;
}
}
/**
* Returns the encoding of the html writer (if already set)
*
* @return the target encoding
*/
public String getTargetEncoding() {
return this.targetEncoding;
}
public boolean checkDoSkipMetaTag(String key, String value) {
return skipMetaAttributes.containsKey(key.toUpperCase() + "=" + value.toUpperCase());
}
public boolean checkIgnoreTags(String key, String value) {
return ignoreTagsAttributes.containsKey(key.toUpperCase() + "=" + value.toUpperCase());
}
@Override
public String getInEncodingLastParsedFile() {
return sourceEncoding;
}
}