/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2007-2008 Didier Briel 2013 Didier Briel, Alex Buloichik 2015 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.filters3.xml; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.omegat.core.data.ProtectedPart; import org.omegat.filters2.AbstractFilter; import org.omegat.filters2.FilterContext; import org.omegat.filters2.TranslationException; import org.omegat.util.Language; import org.omegat.util.OConsts; import org.omegat.util.PatternConsts; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Abstract basis filter for XML format filters: OpenDocument, DocBook etc. Ideally should allow creation of a * new XML dialect filter by simply specifying translatable tags and attributes. * * @author Maxym Mykhalchuk * @author Didier Briel * @author Alex Buloichik * @author Aaron Madlon-Kay */ public abstract class XMLFilter extends AbstractFilter implements Translator { /** Factory for SAX parsers. */ private SAXParserFactory parserFactory; /** XML dialect this filter handles. */ private XMLDialect dialect; /** Creates a new instance of XMLFilter */ public XMLFilter(XMLDialect dialect) { parserFactory = SAXParserFactory.newInstance(); // parserFactory.setValidating(false); try { parserFactory.setFeature("http://xml.org/sax/features/validation", true); } catch (Exception e) { } this.dialect = dialect; } /** Gives the dialect */ public XMLDialect getDialect() { return dialect; } /** Detected encoding of the input XML file. */ private String encoding; /** Detected EOL chars. */ private String eol; /** * Creates a special XML-encoding-aware reader of an input file. * * @param inFile * The source file. * @param outEncoding * Encoding of the source file, if the filter supports it. Otherwise null. * @return The reader of the source file. * * @throws UnsupportedEncodingException * Thrown if JVM doesn't support the specified inEncoding. * @throws IOException * If any I/O Error occurs upon reader creation. */ @Override public BufferedReader createReader(File inFile, String inEncoding) throws UnsupportedEncodingException, IOException { XMLReader xmlreader = new XMLReader(inFile, inEncoding); this.encoding = xmlreader.getEncoding(); this.eol = xmlreader.getEol(); return new BufferedReader(xmlreader); } /** * Creates a writer of the translated file. Accepts <code>null</code> output file -- returns a writer to * <code>/dev/null</code> in this case ;-) * * @param outFile * The target file. * @param outEncoding * Encoding of the target file, if the filter supports it. Otherwise null. * @return The writer for the target file. * * @throws UnsupportedEncodingException * Thrown if JVM doesn't support the specified outEncoding * @throws IOException * If any I/O Error occurs upon writer creation */ @Override public BufferedWriter createWriter(File outFile, String outEncoding) throws UnsupportedEncodingException, IOException { if (outEncoding == null) { outEncoding = this.encoding; } if (outFile == null) { return new BufferedWriter(new StringWriter()); } else { return new BufferedWriter(new XMLWriter(outFile, outEncoding, eol)); } } /** * Target language of the project */ private Language targetLanguage; /** * @return The target language of the project */ @Override public Language getTargetLanguage() { return targetLanguage; } /** Processes an XML file. */ @Override public void processFile(File inFile, File outFile, FilterContext fc) throws IOException, TranslationException { try (BufferedReader inReader = createReader(inFile, fc.getInEncoding())) { inEncodingLastParsedFile = this.encoding; targetLanguage = fc.getTargetLang(); InputSource source = new InputSource(inReader); source.setSystemId(inFile.toURI().toString()); SAXParser parser = parserFactory.newSAXParser(); Handler handler = new Handler(this, dialect, inFile, outFile, fc); parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler); parser.setProperty("http://xml.org/sax/properties/declaration-handler", handler); parser.parse(source, handler); } catch (ParserConfigurationException e) { throw new TranslationException(e); } catch (SAXException e) { throw new TranslationException(e); } } @Override protected void processFile(BufferedReader inFile, BufferedWriter outFile, FilterContext fc) throws IOException, TranslationException { throw new UnsupportedOperationException( "XMLFilter.processFile(BufferedReader,BufferedWriter) should never be called!"); } /** * Whether source encoding can be varied by the user. If XML file has no encoding declaration, UTF-8 will * be used, hence returns <code>false</code> by default. * * @return <code>false</code> */ @Override public boolean isSourceEncodingVariable() { return false; } /** * Target encoding can be varied by the user. * * @return <code>true</code> */ @Override public boolean isTargetEncodingVariable() { return true; } /** * The method the Handler would call to pass translatable content to OmegaT core and receive translation. */ @Override public String translate(String entry, List<ProtectedPart> protectedParts) { if (entryParseCallback != null) { entryParseCallback.addEntry(null, entry, null, false, null, null, this, protectedParts); return entry; } else if (entryTranslateCallback != null) { String translation = entryTranslateCallback.getTranslation(null, entry, null); return translation != null ? translation : entry; } else { // We're not supposed to be there, (parsing called from inside isFileSupported, for instance) return entry; // so what we return is not important } } /** * Returns whether the XML file is supported by the filter. <br> * Reads {@link org.omegat.util.OConsts#READ_AHEAD_LIMIT} and tries to detect constrained text and match * constraints defined in {@link XMLDialect} against them. */ @Override public boolean isFileSupported(BufferedReader reader) { if (dialect.getConstraints() == null || dialect.getConstraints().isEmpty()) { return true; } try { char[] cbuf = new char[OConsts.READ_AHEAD_LIMIT]; int cbufLen = reader.read(cbuf); String buf = new String(cbuf, 0, cbufLen); Matcher matcher = PatternConsts.XML_DOCTYPE.matcher(buf); if (matcher.find()) { Pattern doctype = dialect.getConstraints().get(XMLDialect.CONSTRAINT_DOCTYPE); if (doctype != null && (matcher.group(1) == null || !doctype.matcher(matcher.group(1)).matches())) { return false; } Pattern publicc = dialect.getConstraints().get(XMLDialect.CONSTRAINT_PUBLIC_DOCTYPE); if (publicc != null && (matcher.group(3) == null || !publicc.matcher(matcher.group(3)).matches())) { return false; } Pattern system = dialect.getConstraints().get(XMLDialect.CONSTRAINT_SYSTEM_DOCTYPE); if (system != null && (matcher.group(5) == null || !system.matcher(matcher.group(5)).matches())) { return false; } } else if (dialect.getConstraints().containsKey(XMLDialect.CONSTRAINT_DOCTYPE) || dialect.getConstraints().containsKey(XMLDialect.CONSTRAINT_PUBLIC_DOCTYPE) || dialect.getConstraints().containsKey(XMLDialect.CONSTRAINT_SYSTEM_DOCTYPE)) { return false; } matcher = PatternConsts.XML_ROOTTAG.matcher(buf); if (matcher.find()) { Pattern root = dialect.getConstraints().get(XMLDialect.CONSTRAINT_ROOT); if (root != null && (matcher.group(1) == null || !root.matcher(matcher.group(1)).matches())) { return false; } } else if (dialect.getConstraints().containsKey(XMLDialect.CONSTRAINT_ROOT)) { return false; } matcher = PatternConsts.XML_XMLNS.matcher(buf); if (matcher.find()) { Pattern xmlns = dialect.getConstraints().get(XMLDialect.CONSTRAINT_XMLNS); if (xmlns != null && (matcher.group(2) == null || !xmlns.matcher(matcher.group(2)).matches())) { return false; } } else if (dialect.getConstraints().containsKey(XMLDialect.CONSTRAINT_XMLNS)) { return false; } return true; } catch (Exception e) { return false; } } @Override public void tagStart(String path, Attributes atts) { } @Override public void tagEnd(String path) { } @Override public void comment(String comment) { } @Override public void text(String text) { } @Override public boolean isInIgnored() { return false; } }