/** * * Copyright 2009-2013 The MITRE Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * ************************************************************************** * NOTICE This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** */ package org.opensextant.xtext.converters; import java.io.IOException; import java.io.UnsupportedEncodingException; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; import org.apache.commons.io.IOUtils; import org.opensextant.xtext.ConvertedDocument; import org.opensextant.util.FileUtility; import org.opensextant.util.TextUtils; /** * Text converter that detects the original encoding, including pure ASCII, Latin1, UTF-8, etc., * whereas Tika does not make a note of pure-ASCII texts. * * @author Marc C.
Ubaldino, MITRE, ubaldino at mitre dot org */ public class TextTranscodingConverter extends ConverterAdapter { private final static CharsetDetector chardet = new CharsetDetector(); private final static int IGNORE_THRESHOLD_SIZE = 1024; // 1KB private final static int IGNORE_THRESHOLD_CONF = 65; // 0 to 100 /** * A converter that tries to get a decent encoding ASCII, UTF-8 or other, * and then the buffer converted or not. * * IF ASCII OR UTF-8 accept file as is, do not convert, alter buffer... * ELSE file must be read in and converted. * * CAVEAT: If file is short and low-confidence for encoding detection ALSO * do not convert. Treat as a plain text file. */ @Override protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException { ConvertedDocument textdoc = new ConvertedDocument(doc); byte[] data = null; if (in != null) { // Get byte data from input stream or file if (doc != null) { data = FileUtility.readBytesFrom(doc); } else { data = IOUtils.toByteArray(in); } in.close(); } // Encoding heuristics here..... // // Objective: mark small plain text payloads with unknown character set // as not worthy of conversion. 
Leave them as plain/text // indeed they might even be straight Unicode // // Test for ASCII only first, otherwise try to detect the best charset for the text // textdoc.is_plaintext = true; boolean is_ascii = TextUtils.isASCII(data); if (is_ascii) { textdoc.do_convert = false; textdoc.setEncoding("ASCII"); textdoc.setText(new String(data)); } else { chardet.setText(data); CharsetMatch cs = chardet.detect(); if (ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(cs.getName())) { textdoc.do_convert = false; } else if (data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF) { textdoc.do_convert = false; } textdoc.setEncoding(cs.getName()); textdoc.setText(new String(data, cs.getName())); } return textdoc; } /** * If you have a buffer of text for a document and are unable to get a provided charset, * try this static method. Better than nothing. This does not imply that the original document is a plain text doc. * It could be an object that was parsed adhoc. We cannot make any assumption about * the state of the conversion. This only sets String buffer and charset. * * @param doc the doc * @param data the byte data to test * @throws UnsupportedEncodingException on err */ public static void setTextAndEncoding(ConvertedDocument doc, byte[] data) throws UnsupportedEncodingException { boolean is_ascii = TextUtils.isASCII(data); if (is_ascii) { doc.setEncoding("ASCII"); doc.setText(new String(data)); return; } chardet.setText(data); CharsetMatch cs = chardet.detect(); doc.setEncoding(cs.getName()); doc.setText(new String(data, cs.getName())); } }