package org.archive.wayback.replay.charset;
import java.io.IOException;
import org.archive.wayback.core.Resource;
import org.mozilla.universalchardet.UniversalDetector;
/**
* {@link EncodingSniffer} that runs {@link UniversalDetector} on
* the content.
* <p>Note: as of version 1.0.3, UniversalDetector returns {@code null}
* for ASCII-only text encoded as either {@code UTF-8} or {@code UTF-16}.</p>
*/
public class UniversalChardetSniffer extends BaseEncodingSniffer {
    /** Maximum number of leading content bytes handed to the chardet library. */
    protected static final int MAX_CHARSET_READAHEAD = 65536;

    /**
     * Guesses the charset of {@code resource} by running
     * {@link UniversalDetector} over up to {@link #MAX_CHARSET_READAHEAD}
     * leading bytes of its content.
     * <p>The resource is marked before reading and reset afterwards (also
     * when the read fails), so that the sniffed bytes can be read again by
     * the caller.</p>
     *
     * @param resource content to examine
     * @return the detected charset name if the detector produced one and
     *         {@code isCharsetSupported} accepts it; {@code null} otherwise,
     *         including when the content cannot be read or reset
     */
    @Override
    public String sniff(Resource resource) {
        byte[] buffer = new byte[MAX_CHARSET_READAHEAD];
        int len;

        resource.mark(MAX_CHARSET_READAHEAD);
        try {
            try {
                len = readAhead(resource, buffer);
            } finally {
                // Always attempt to rewind, even if the read failed part-way.
                resource.reset();
            }
        } catch (IOException ignored) {
            // Sniffing is best-effort: if the content cannot be read (or the
            // resource cannot be rewound) there is nothing to detect from.
            return null;
        }

        UniversalDetector detector = new UniversalDetector(null);
        if (len > 0) {
            detector.handleData(buffer, 0, len);
        }
        detector.dataEnd();
        String charsetName = detector.getDetectedCharset();
        detector.reset();

        // The detector reports null when it could not decide on a charset.
        if (charsetName != null && isCharsetSupported(charsetName)) {
            return charsetName;
        }
        return null;
    }

    /**
     * Fills {@code buffer} from {@code resource}, repeating the read until
     * the buffer is full or no more bytes are returned. A single
     * {@code read} call may legitimately return fewer bytes than requested.
     *
     * @param resource source of the bytes
     * @param buffer destination; at most {@code buffer.length} bytes are read
     * @return number of bytes stored in {@code buffer} (0 at immediate EOF)
     * @throws IOException if reading from {@code resource} fails
     */
    private static int readAhead(Resource resource, byte[] buffer)
            throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = resource.read(buffer, total, buffer.length - total);
            if (n <= 0) {
                // -1 is EOF; treat 0 as "no progress" to avoid spinning.
                break;
            }
            total += n;
        }
        return total;
    }
}