/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.parse.text;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import no.trank.openpipe.parse.api.ParseData;
/**
 * Decodes the bytes of a {@link ParseData} into a string by trying a number of candidate
 * character encodings until one of them decodes the whole input without errors.
 * <p>
 * Candidates are tried in this order:
 * <ol>
 * <li>the charset guessed by the optional {@link CharsetDetector}, if its confidence is high
 * enough and the charset is available in this JVM,</li>
 * <li>the configured encodings, in the order they were given.</li>
 * </ol>
 * All decoders report malformed input and unmappable characters as errors, so an encoding that
 * does not fit the data is rejected rather than producing replacement characters.
 * <p>
 * Instances keep mutable per-call state (buffers, decoders, the last encoding/language) in
 * fields without any synchronization, so an instance must not be used by several threads at
 * the same time.
 *
 * @version $Revision$
 */
public class TextDecoder implements Closeable {
    private static final Logger log = LoggerFactory.getLogger(TextDecoder.class);
    /** Size, in bytes respectively chars, of the byte and char buffers. */
    private static final int CAPACITY = 4096 * 16;
    /** A detector match is only used when its confidence is strictly above this value. */
    private static final int MIN_CONFIDENCE = 50;

    /** Decoders for the configured encodings, in configuration order. */
    private final Set<CharsetDecoder> decoders = new LinkedHashSet<CharsetDecoder>();
    /** Cache of all decoders created so far, keyed by charset. */
    private final Map<Charset, CharsetDecoder> decoderMap = new HashMap<Charset, CharsetDecoder>();
    private final CharsetDetector detector;
    private String encoding;
    private CharBuffer charBuffer;
    private ByteBuffer directByteBuffer;
    private ByteBuffer byteBuffer;
    private String language;

    /**
     * Creates a decoder that only tries <tt>UTF-8</tt>.
     */
    public TextDecoder() {
        this("UTF-8");
    }

    /**
     * Creates a decoder that only tries the given encoding.
     *
     * @param encoding the name of the charset to try.
     */
    public TextDecoder(String encoding) {
        this(Arrays.asList(encoding));
    }

    /**
     * Creates a decoder that tries the given encodings in order, without charset detection.
     *
     * @param encodings the names of the charsets to try.
     *
     * @throws IllegalArgumentException if <tt>encodings</tt> is empty.
     */
    public TextDecoder(List<String> encodings) {
        this(encodings, null);
    }

    /**
     * Creates a decoder that relies solely on the given detector.
     *
     * @param detector the detector used for guessing the charset.
     *
     * @throws IllegalArgumentException if <tt>detector</tt> is <tt>null</tt>.
     */
    public TextDecoder(CharsetDetector detector) {
        // Same rule as the two-argument constructor: without encodings a detector is mandatory.
        if (detector == null) {
            throw new IllegalArgumentException("No encodings/detector");
        }
        this.detector = detector;
    }

    /**
     * Creates a decoder that first tries the charset guessed by <tt>detector</tt> (if any) and
     * then the given encodings in order.
     *
     * @param encodings the names of the charsets to try; may be empty if a detector is given.
     * @param detector the detector used for guessing the charset, or <tt>null</tt>.
     *
     * @throws IllegalArgumentException if there are neither encodings nor a detector, or if an
     * encoding name is illegal or not supported (as thrown by {@link Charset#forName(String)}).
     */
    public TextDecoder(List<String> encodings, CharsetDetector detector) {
        this.detector = detector;
        for (String enc : encodings) {
            decoders.add(getDecoder(Charset.forName(enc)));
        }
        if (decoders.isEmpty() && detector == null) {
            throw new IllegalArgumentException("No encodings/detector");
        }
    }

    /**
     * Returns the cached decoder for <tt>charset</tt>, creating (and caching) a strict one,
     * i.e. one that reports malformed input and unmappable characters, if none exists yet.
     */
    private CharsetDecoder getDecoder(Charset charset) throws UnsupportedCharsetException {
        final CharsetDecoder cached = decoderMap.get(charset);
        if (cached != null) {
            return cached;
        }
        final CharsetDecoder decoder = charset.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        decoderMap.put(charset, decoder);
        return decoder;
    }

    /**
     * Returns a decoder for a charset name supplied by the detector, or <tt>null</tt> if the name
     * is <tt>null</tt>, illegal or not supported by this JVM. Detector output is not under our
     * control, so an unusable name must not abort decoding; the configured encodings are still
     * worth trying.
     */
    private CharsetDecoder getDetectedDecoder(String enc) {
        if (enc == null) {
            return null;
        }
        try {
            return getDecoder(Charset.forName(enc));
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException
            log.debug("Detected charset {} is not available, ignoring it", enc);
            return null;
        }
    }

    /**
     * Decodes the content of <tt>data</tt>. The encoding that succeeded is afterwards available
     * through {@link #getEncoding()}, and the language guessed by the detector (if any) through
     * {@link #getLanguage()}.
     * <p>
     * The stream(s) obtained from <tt>data</tt> are closed before this method returns.
     * <tt>data.getInputStream()</tt> may be called several times (once per attempt) when the
     * stream is not a plain {@link FileInputStream}, so it is assumed to return a fresh stream
     * positioned at the start of the data on each call.
     *
     * @param data the data to decode.
     *
     * @return the decoded text, or <tt>null</tt> if none of the candidate encodings could decode
     * the data.
     *
     * @throws IOException if reading the data fails.
     */
    public String decode(ParseData data) throws IOException {
        encoding = null;
        language = null;
        final InputStream in = data.getInputStream();
        final FileChannel channel = getChannel(in);
        final StringBuilder buf = new StringBuilder(data.getLength());
        if (channel != null) {
            return decode(channel, getDirectByteBuffer(), getCharBuffer(), buf);
        }
        return decode(in, data, getByteBuffer(), getCharBuffer(), buf);
    }

    /**
     * Stream based decoding. A stream cannot be rewound, so every attempt consumes its own stream;
     * a new one is requested from <tt>data</tt> only when an attempt actually needs it.
     */
    private String decode(InputStream in, ParseData data, ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf)
            throws IOException {
        // The already opened, still unread stream; null once it has been handed to an attempt.
        InputStream unused = in;
        try {
            if (detector != null) {
                unused = null;
                final CharsetDecoder detected = getDetectedDecoder(detectFromStream(in, bBuf));
                if (detected != null) {
                    final String text = decode(data.getInputStream(), bBuf, cBuf, buf, detected);
                    if (text != null) {
                        foundEncoding(detected.charset());
                        return text;
                    }
                }
            }
            for (CharsetDecoder decoder : decoders) {
                final InputStream stream = unused != null ? unused : data.getInputStream();
                unused = null;
                String text = decode(stream, bBuf, cBuf, buf, decoder);
                if (text != null) {
                    foundEncoding(decoder.charset());
                    return text;
                } else if (decoder.isAutoDetecting() && decoder.isCharsetDetected()) {
                    // An auto-detecting decoder failed, but told us which charset it saw
                    final Charset charset = decoder.detectedCharset();
                    text = decode(data.getInputStream(), bBuf, cBuf, buf, getDecoder(charset));
                    if (text != null) {
                        foundEncoding(charset);
                        return text;
                    }
                }
            }
        } finally {
            if (unused != null) {
                close(unused);
            }
        }
        encoding = null;
        return null;
    }

    /**
     * Reads a sample (a single read of at most the buffer capacity) from <tt>in</tt>, runs the
     * detector on it and closes <tt>in</tt>, also when reading fails.
     *
     * @return the detected charset name, or <tt>null</tt> if the stream is empty or the detector
     * is not confident enough.
     */
    private String detectFromStream(InputStream in, ByteBuffer bBuf) throws IOException {
        try {
            final int len = read(in, bBuf, bBuf.array());
            if (len <= 0) {
                // Nothing to sample (read returns -1 at end of stream)
                return null;
            }
            final byte[] sample = bBuf.capacity() != len ? Arrays.copyOf(bBuf.array(), len) : bBuf.array();
            return detect(sample);
        } finally {
            close(in);
        }
    }

    /**
     * Records <tt>charset</tt> as the encoding that successfully decoded the current data.
     */
    private void foundEncoding(Charset charset) {
        encoding = charset.name();
        log.debug("Decoded stream with detected charset {}", encoding);
    }

    /**
     * Decodes all of <tt>in</tt> with <tt>decoder</tt> and closes <tt>in</tt>.
     *
     * @return the decoded text, or <tt>null</tt> if the decoder reported an error.
     */
    private static String decode(InputStream in, ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf,
            CharsetDecoder decoder) throws IOException {
        try {
            reset(bBuf, cBuf, buf, decoder);
            final byte[] bytes = bBuf.array();
            while (read(in, bBuf, bytes) >= 0) {
                if (decodeBuffer(bBuf, cBuf, buf, decoder)) {
                    return null;
                }
            }
            return flushDecoder(bBuf, cBuf, buf, decoder);
        } finally {
            close(in);
        }
    }

    /**
     * Closes <tt>in</tt>, ignoring any failure; at this point the stream is only being released.
     */
    private static void close(InputStream in) {
        try {
            in.close();
        } catch (IOException e) {
            // Ignoring
        }
    }

    /**
     * Reads from <tt>in</tt> into the free space of <tt>bBuf</tt> (whose backing array is
     * <tt>bytes</tt>) and advances the buffer position by the number of bytes read.
     *
     * @return the value returned by {@link InputStream#read(byte[], int, int)}.
     */
    private static int read(InputStream in, ByteBuffer bBuf, byte[] bytes) throws IOException {
        final int pos = bBuf.position();
        final int read = in.read(bytes, pos, bBuf.remaining());
        if (read > 0) {
            bBuf.position(pos + read);
        }
        return read;
    }

    /**
     * Channel based decoding. The channel is rewound to position 0 for every attempt and closed
     * before returning.
     */
    private String decode(FileChannel channel, ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf) throws IOException {
        try {
            if (detector != null) {
                final CharsetDecoder detected = getDetectedDecoder(detectFromChannel(channel, bBuf));
                if (detected != null) {
                    final String text = decode(channel, bBuf, cBuf, buf, detected);
                    if (text != null) {
                        foundEncoding(detected.charset());
                        return text;
                    }
                }
            }
            for (CharsetDecoder decoder : decoders) {
                String text = decode(channel, bBuf, cBuf, buf, decoder);
                if (text != null) {
                    foundEncoding(decoder.charset());
                    return text;
                } else if (decoder.isAutoDetecting() && decoder.isCharsetDetected()) {
                    // An auto-detecting decoder failed, but told us which charset it saw
                    final Charset charset = decoder.detectedCharset();
                    text = decode(channel, bBuf, cBuf, buf, getDecoder(charset));
                    if (text != null) {
                        foundEncoding(charset);
                        return text;
                    }
                }
            }
        } finally {
            try {
                channel.close();
            } catch (IOException e) {
                // Ignoring
            }
        }
        return null;
    }

    /**
     * Reads a sample (a single read of at most the buffer capacity) from the current position of
     * <tt>channel</tt> and runs the detector on it.
     *
     * @return the detected charset name, or <tt>null</tt> if nothing could be read or the detector
     * is not confident enough.
     */
    private String detectFromChannel(FileChannel channel, ByteBuffer bBuf) throws IOException {
        final int len = channel.read(bBuf);
        if (len <= 0) {
            // Nothing to sample (read returns -1 at end of file)
            return null;
        }
        bBuf.flip();
        // The buffer handed in is direct, so the sample has to be copied out for the detector
        final byte[] sample = new byte[len];
        bBuf.get(sample);
        return detect(sample);
    }

    /**
     * Runs the detector on <tt>bytes</tt>. On a sufficiently confident match the detected language
     * is stored in {@link #language}.
     *
     * @return the name of the detected charset, or <tt>null</tt> if there is no match or its
     * confidence is not above {@link #MIN_CONFIDENCE}.
     */
    private String detect(byte[] bytes) {
        detector.setText(bytes);
        final CharsetMatch match = detector.detect();
        if (match == null) {
            // detect() yields null when no charset matches at all
            log.debug("Detector found no matching charset");
            return null;
        }
        if (log.isDebugEnabled()) {
            log.debug("Detector has confidence: {} encoding {} lang {}",
                    new Object[] {match.getConfidence(), match.getName(), match.getLanguage()});
        }
        if (match.getConfidence() > MIN_CONFIDENCE) {
            language = match.getLanguage();
            return match.getName();
        }
        return null;
    }

    /**
     * Decodes all of <tt>channel</tt>, from position 0, with <tt>decoder</tt>. The channel is left
     * open so that it can be reused for another attempt.
     *
     * @return the decoded text, or <tt>null</tt> if the decoder reported an error.
     */
    private static String decode(FileChannel channel, ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf,
            CharsetDecoder decoder) throws IOException {
        channel.position(0);
        reset(bBuf, cBuf, buf, decoder);
        while (channel.read(bBuf) >= 0) {
            if (decodeBuffer(bBuf, cBuf, buf, decoder)) {
                return null;
            }
        }
        return flushDecoder(bBuf, cBuf, buf, decoder);
    }

    /**
     * Puts the decoder, both buffers and the output builder back into their initial state before
     * a new decoding attempt.
     */
    private static void reset(ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf, CharsetDecoder decoder) {
        decoder.reset();
        bBuf.clear();
        cBuf.clear();
        buf.setLength(0);
    }

    /**
     * Decodes whatever is left in <tt>bBuf</tt> as the end of the input, flushes the decoder and
     * appends the resulting chars to <tt>buf</tt>.
     *
     * @return the complete decoded text, or <tt>null</tt> if the decoder reported an error.
     */
    private static String flushDecoder(ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf, CharsetDecoder decoder) {
        // flip() also covers the empty case: position 0 gives limit 0
        bBuf.flip();
        if (decoder.decode(bBuf, cBuf, true).isError()) {
            return null;
        }
        decoder.flush(cBuf);
        if (cBuf.position() > 0) {
            cBuf.flip();
            buf.append(cBuf.array(), cBuf.arrayOffset(), cBuf.remaining());
        }
        return buf.toString();
    }

    /**
     * Decodes the bytes collected in <tt>bBuf</tt> as a non-final chunk, appends the resulting
     * chars to <tt>buf</tt> and prepares both buffers for the next read. Bytes the decoder did not
     * consume are kept at the start of <tt>bBuf</tt>.
     *
     * @return <tt>true</tt> if the decoder reported an error, otherwise <tt>false</tt>.
     */
    private static boolean decodeBuffer(ByteBuffer bBuf, CharBuffer cBuf, StringBuilder buf, CharsetDecoder decoder) {
        bBuf.flip();
        if (decoder.decode(bBuf, cBuf, false).isError()) {
            return true;
        }
        cBuf.flip();
        buf.append(cBuf.array(), cBuf.arrayOffset(), cBuf.remaining());
        bBuf.compact();
        cBuf.clear();
        return false;
    }

    /**
     * Returns the channel of <tt>in</tt> if it is exactly a {@link FileInputStream} (subclasses
     * are not accepted), otherwise <tt>null</tt>.
     */
    private static FileChannel getChannel(InputStream in) {
        if (FileInputStream.class.equals(in.getClass())) {
            return ((FileInputStream)in).getChannel();
        }
        return null;
    }

    /**
     * Returns the (lazily allocated, reused) direct byte buffer, cleared.
     */
    private ByteBuffer getDirectByteBuffer() {
        if (directByteBuffer == null) {
            directByteBuffer = ByteBuffer.allocateDirect(CAPACITY);
        } else {
            directByteBuffer.clear();
        }
        return directByteBuffer;
    }

    /**
     * Returns the (lazily allocated, reused) heap byte buffer, cleared.
     */
    private ByteBuffer getByteBuffer() {
        if (byteBuffer == null) {
            byteBuffer = ByteBuffer.allocate(CAPACITY);
        } else {
            byteBuffer.clear();
        }
        return byteBuffer;
    }

    /**
     * Returns the (lazily allocated, reused) char buffer, cleared.
     */
    private CharBuffer getCharBuffer() {
        if (charBuffer == null) {
            charBuffer = CharBuffer.allocate(CAPACITY);
        } else {
            charBuffer.clear();
        }
        return charBuffer;
    }

    /**
     * Returns the name of the charset that decoded the data in the last call to
     * {@link #decode(ParseData)}.
     *
     * @return the charset name, or <tt>null</tt> if decoding failed or has not been attempted.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Returns the language reported by the detector during the last call to
     * {@link #decode(ParseData)}.
     *
     * @return the language, or <tt>null</tt> if no sufficiently confident detection took place.
     */
    public String getLanguage() {
        return language;
    }

    /**
     * Drops the decoder cache and the buffers so that they can be garbage collected. The decoders
     * for the configured encodings are kept, and buffers are allocated again on demand.
     */
    @Override
    public void close() throws IOException {
        decoderMap.clear();
        byteBuffer = null;
        directByteBuffer = null;
        charBuffer = null;
    }
}