/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutchbase.util.hbase.ImmutableRowPart;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/**
* A simple class for detecting character encodings.
*
* <p>
* Broadly this encompasses two functions, which are distinctly separate:
*
* <ol>
* <li>Auto detecting a set of "clues" from input text.</li>
* <li>Taking a set of clues and making a "best guess" as to the
* "real" encoding.</li>
* </ol>
* </p>
*
* <p>
* A caller will often have some extra information about what the
* encoding might be (e.g. from the HTTP header or HTML meta-tags, often
* wrong but still potentially useful clues). The types of clues may differ
* from caller to caller. Thus a typical calling sequence is:
* <ul>
* <li>Run step (1) to generate a set of auto-detected clues;</li>
* <li>Combine these clues with the caller-dependent "extra clues"
* available;</li>
* <li>Run step (2) to guess what the most probable answer is.</li>
* </ul>
* </p>
*/
public class EncodingDetector {
private class EncodingClue {
  /** Charset name, normalized to lower case on construction. */
  private final String value;
  /** Short label for where the clue came from ("detect", "header", ...). */
  private final String source;
  /** Detection confidence in percent, or NO_THRESHOLD when absent. */
  private final int confidence;

  /** Creates a clue without a confidence value (thresholds are ignored for it). */
  public EncodingClue(String value, String source) {
    this(value, source, NO_THRESHOLD);
  }

  public EncodingClue(String value, String source, int confidence) {
    this.value = value.toLowerCase();
    this.source = source;
    this.confidence = confidence;
  }

  public String getSource() {
    return source;
  }

  public String getValue() {
    return value;
  }

  public String toString() {
    StringBuilder buf = new StringBuilder(value).append(" (").append(source);
    if (confidence >= 0) {
      buf.append(", ").append(confidence).append("% confidence");
    }
    return buf.append(')').toString();
  }

  /** Returns true when no usable charset name is present. */
  public boolean isEmpty() {
    return value == null || value.isEmpty();
  }

  /**
   * Returns true when the clue carries no confidence value, or when its
   * confidence reaches the configured minimum.
   */
  public boolean meetsThreshold() {
    if (confidence < 0) {
      return true;
    }
    return minConfidence >= 0 && confidence >= minConfidence;
  }
}
/** Shared commons-logging logger for encoding detection. */
public static final Log LOG = LogFactory.getLog(EncodingDetector.class);
/** Sentinel confidence value meaning "no confidence attached; ignore thresholds". */
public static final int NO_THRESHOLD = -1;
/** Configuration key for the minimum detection confidence (in percent). */
public static final String MIN_CONFIDENCE_KEY =
"encodingdetector.charset.min.confidence";
// Maps frequently mislabelled charset names to the charset to use instead;
// see the explanatory comment inside the static initializer below.
private static final HashMap<String, String> ALIASES =
new HashMap<String, String>();
// Content types on which ICU4J charset auto-detection is attempted.
private static final HashSet<String> DETECTABLES = new HashSet<String>();
// CharsetDetector will die without a minimum amount of data.
private static final int MIN_LENGTH=4;
static {
// Text-like content types eligible for auto-detection.
DETECTABLES.add("text/html");
DETECTABLES.add("text/plain");
DETECTABLES.add("text/richtext");
DETECTABLES.add("text/rtf");
DETECTABLES.add("text/sgml");
DETECTABLES.add("text/tab-separated-values");
DETECTABLES.add("text/xml");
DETECTABLES.add("application/rss+xml");
DETECTABLES.add("application/xhtml+xml");
/*
 * the following map is not an alias mapping table, but
 * maps character encodings which are often used in mislabelled
 * documents to their correct encodings. For instance,
 * there are a lot of documents labelled 'ISO-8859-1' which contain
 * characters not covered by ISO-8859-1 but covered by windows-1252.
 * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
 * for the common part), it's better to treat ISO-8859-1 as
 * synonymous with windows-1252 than to reject, as invalid, documents
 * labelled as ISO-8859-1 that have characters outside ISO-8859-1.
 */
ALIASES.put("ISO-8859-1", "windows-1252");
ALIASES.put("EUC-KR", "x-windows-949");
ALIASES.put("x-EUC-CN", "GB18030");
ALIASES.put("GBK", "GB18030");
//ALIASES.put("Big5", "Big5HKSCS");
//ALIASES.put("TIS620", "Cp874");
//ALIASES.put("ISO-8859-11", "Cp874");
}
// Minimum confidence (percent) a detected charset must reach; a negative
// value disables confidence-based selection. Read from MIN_CONFIDENCE_KEY.
private int minConfidence;
// Reused ICU4J detector instance. NOTE(review): shared mutable state —
// presumably not safe for concurrent use by multiple threads; confirm.
private CharsetDetector detector;
// Clues accumulated for the current document (cleared via clearClues()).
private List<EncodingClue> clues;
/**
 * Creates a detector configured from the given Nutch configuration.
 *
 * @param conf configuration; {@link #MIN_CONFIDENCE_KEY} sets the minimum
 * detection confidence (percent), defaulting to {@link #NO_THRESHOLD}
 * which disables confidence-based selection.
 */
public EncodingDetector(Configuration conf) {
  // Use the named constant rather than a magic -1 so the default stays in
  // sync with the NO_THRESHOLD semantics used throughout this class.
  minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, NO_THRESHOLD);
  detector = new CharsetDetector();
  clues = new ArrayList<EncodingClue>();
}
/**
 * Runs clue auto-detection on a {@link Content} instance, using its raw
 * bytes, its content type, and the charset declared in its HTTP
 * Content-Type response header.
 *
 * @param content fetched content to examine
 * @param filter whether to enable the ICU input filter (strips markup)
 */
public void autoDetectClues(Content content, boolean filter) {
  String headerEncoding =
      parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE));
  autoDetectClues(content.getContent(), content.getContentType(),
      headerEncoding, filter);
}
/**
 * Runs clue auto-detection on an HBase row, using its raw content, its
 * content type, and the charset declared in its Content-Type header.
 *
 * @param row URL's row
 * @param filter whether to enable the ICU input filter (strips markup)
 */
public void autoDetectClues(ImmutableRowPart row, boolean filter) {
  String headerEncoding =
      parseCharacterEncoding(row.getHeader(Response.CONTENT_TYPE));
  autoDetectClues(row.getContent(), row.getContentType(), headerEncoding,
      filter);
}
/**
 * Core clue gathering: runs ICU4J charset detection on the raw bytes (when
 * enabled and applicable) and always records the charset from the HTTP
 * Content-Type header, if any.
 *
 * @param data raw document bytes
 * @param type content type; detection only runs for types in DETECTABLES
 * @param encoding charset parsed from the Content-Type header (may be null)
 * @param filter whether to enable the ICU input filter (strips markup)
 */
private void autoDetectClues(byte[] data, String type,
    String encoding, boolean filter) {
  if (minConfidence >= 0 && DETECTABLES.contains(type)
      && data.length > MIN_LENGTH) {
    CharsetMatch[] matches = null;
    // setText()/detectAll() will sometimes throw on pathological input,
    // so guard them; detection failure just means fewer clues.
    // (The original re-checked data.length > MIN_LENGTH here, but the
    // enclosing condition already guarantees it.)
    try {
      detector.enableInputFilter(filter);
      detector.setText(data);
      matches = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }
    if (matches != null) {
      for (CharsetMatch match : matches) {
        addClue(match.getName(), "detect", match.getConfidence());
      }
    }
  }
  // add character encoding coming from HTTP response header
  addClue(encoding, "header");
}
/**
 * Records an encoding clue. Null/empty values and names that cannot be
 * resolved to a supported charset are silently dropped.
 *
 * @param value charset name (unnormalized; aliases are resolved here)
 * @param source short label for where the clue came from
 * @param confidence detection confidence in percent, or NO_THRESHOLD
 */
public void addClue(String value, String source, int confidence) {
  if (value == null || value.isEmpty()) {
    return;
  }
  String resolved = resolveEncodingAlias(value);
  if (resolved != null) {
    clues.add(new EncodingClue(resolved, source, confidence));
  }
}
/**
 * Records an encoding clue that carries no confidence value (e.g. one taken
 * from an HTML meta tag), so confidence thresholds do not apply to it.
 * Null/empty or unresolvable values are ignored, as in the three-argument
 * overload.
 *
 * @param value charset name
 * @param source short label for where the clue came from
 */
public void addClue(String value, String source) {
addClue(value, source, NO_THRESHOLD);
}
/**
 * Guess the encoding with the previously specified list of clues.
 *
 * @param content Content instance (its base URL is used for log messages)
 * @param defaultValue Default encoding to return if no encoding can be
 * detected with enough confidence. Note that this will <b>not</b> be
 * normalized with {@link EncodingDetector#resolveEncodingAlias}
 *
 * @return Guessed encoding or defaultValue
 */
public String guessEncoding(Content content, String defaultValue) {
  String baseUrl = content.getBaseUrl();
  return guessEncoding(baseUrl, defaultValue);
}
/**
 * Guess the encoding with the previously specified list of clues.
 *
 * @param row URL's row (its base URL is used for log messages)
 * @param defaultValue Default encoding to return if no encoding can be
 * detected with enough confidence. Note that this will <b>not</b> be
 * normalized with {@link EncodingDetector#resolveEncodingAlias}
 *
 * @return Guessed encoding or defaultValue
 */
public String guessEncoding(ImmutableRowPart row, String defaultValue) {
  String baseUrl = row.getBaseUrl();
  return guessEncoding(baseUrl, defaultValue);
}
/**
 * Guess the encoding with the previously specified list of clues.
 *
 * @param baseUrl Base URL (used only for trace/log messages)
 * @param defaultValue Default encoding to return if no encoding can be
 * detected with enough confidence. Note that this will <b>not</b> be
 * normalized with {@link EncodingDetector#resolveEncodingAlias}
 *
 * @return Guessed encoding or defaultValue
 */
private String guessEncoding(String baseUrl, String defaultValue) {
/*
 * This algorithm could be replaced by something more sophisticated;
 * ideally we would gather a bunch of data on where various clues
 * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
 * the correct answer, and use machine learning/some statistical method
 * to generate a better heuristic.
 */
if (LOG.isTraceEnabled()) {
findDisagreements(baseUrl, clues);
}
/*
 * Go down the list of encoding "clues". Use a clue if:
 * 1. Has a confidence value which meets our confidence threshold, OR
 * 2. Doesn't meet the threshold, but is the best try,
 * since nothing else is available.
 */
EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
EncodingClue bestClue = defaultClue;
// Clues are examined in insertion order: auto-detected clues first, then
// the HTTP-header clue (see autoDetectClues), then any caller-added ones.
for (EncodingClue clue : clues) {
if (LOG.isTraceEnabled()) {
LOG.trace(baseUrl + ": charset " + clue);
}
String charset = clue.value;
// A clue with a confidence value wins outright once it meets the
// configured minimum (only possible when minConfidence is enabled).
if (minConfidence >= 0 && clue.confidence >= minConfidence) {
if (LOG.isTraceEnabled()) {
LOG.trace(baseUrl + ": Choosing encoding: " + charset +
" with confidence " + clue.confidence);
}
// Clues added through addClue() were already alias-resolved, so this
// second resolution is expected to succeed for them.
return resolveEncodingAlias(charset).toLowerCase();
// Otherwise remember the first confidence-less clue as the fallback.
} else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
bestClue = clue;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace(baseUrl + ": Choosing encoding: " + bestClue);
}
// Nothing met the threshold: fall back to the best confidence-less clue,
// or to the caller-supplied default if there was none.
return bestClue.value.toLowerCase();
}
/** Clears all accumulated clues, resetting the detector for reuse on the
 next document. */
public void clearClues() {
clues.clear();
}
/*
 * Strictly for analysis, look for "disagreements." The top guess from
 * each source is examined; if these meet the threshold and disagree, then
 * we log the information -- useful for testing or generating training data
 * for a better heuristic.
 */
private void findDisagreements(String url, List<EncodingClue> newClues) {
  HashSet<String> valsSeen = new HashSet<String>();
  HashSet<String> sourcesSeen = new HashSet<String>();
  boolean disagreement = false;
  // Only the first (top) clue from each source counts.
  for (EncodingClue clue : newClues) {
    if (clue.isEmpty() || sourcesSeen.contains(clue.source)) {
      continue;
    }
    // A disagreement exists once a qualifying clue differs from any value
    // already accepted from another source.
    if (!valsSeen.isEmpty() && !valsSeen.contains(clue.value)
        && clue.meetsThreshold()) {
      disagreement = true;
    }
    if (clue.meetsThreshold()) {
      valsSeen.add(clue.value);
    }
    sourcesSeen.add(clue.source);
  }
  if (disagreement) {
    // dump all values in case of disagreement
    // (StringBuilder instead of the original StringBuffer: no need for
    // synchronization, and append() calls avoid intermediate concatenation)
    StringBuilder sb = new StringBuilder();
    sb.append("Disagreement: ").append(url).append("; ");
    for (int i = 0; i < newClues.size(); i++) {
      if (i > 0) {
        sb.append(", ");
      }
      sb.append(newClues.get(i));
    }
    LOG.trace(sb.toString());
  }
}
/**
 * Resolves a charset name to its canonical name, mapping known
 * commonly-mislabelled encodings (see ALIASES) to the charset that should
 * be used instead.
 *
 * @param encoding charset name as found in a document or header
 * @return resolved canonical name, or null if the name is null, invalid,
 * or not supported by this JVM
 */
public static String resolveEncodingAlias(String encoding) {
  try {
    if (encoding == null || !Charset.isSupported(encoding)) {
      return null;
    }
    String canonicalName = Charset.forName(encoding).name();
    return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
        : canonicalName;
  } catch (Exception e) {
    // Charset.isSupported()/forName() throw IllegalCharsetNameException on
    // syntactically invalid names, which occur in real-world documents;
    // treat those the same as unsupported encodings.
    LOG.warn("Invalid encoding " + encoding + " detected, using default.");
    return null;
  }
}
/**
 * Parse the character encoding from the specified content type header.
 * If the content type is null, or there is no explicit character encoding,
 * <code>null</code> is returned.
 * <br />
 * This method was copied from org.apache.catalina.util.RequestUtil,
 * which is licensed under the Apache License, Version 2.0 (the "License").
 *
 * @param contentType a content type header
 */
public static String parseCharacterEncoding(String contentType) {
  if (contentType == null) {
    return null;
  }
  int start = contentType.indexOf("charset=");
  if (start < 0) {
    return null;
  }
  // Take everything after "charset=" up to the next parameter separator.
  String encoding = contentType.substring(start + "charset=".length());
  int semi = encoding.indexOf(';');
  if (semi >= 0) {
    encoding = encoding.substring(0, semi);
  }
  encoding = encoding.trim();
  // Strip surrounding double quotes, e.g. charset="utf-8".
  if (encoding.length() > 2 && encoding.startsWith("\"")
      && encoding.endsWith("\"")) {
    encoding = encoding.substring(1, encoding.length() - 1);
  }
  return encoding.trim();
}
/**
 * Command-line entry point: reads a file as raw bytes, runs auto-detection,
 * and prints the guessed encoding.
 *
 * @param args exactly one argument, the path of the file to examine
 * @throws IOException if the file cannot be read
 */
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.err.println("Usage: EncodingDetector <file>");
    System.exit(1);
  }
  Configuration conf = NutchConfiguration.create();
  // Reuse the same Configuration for the detector (the original created a
  // second, separate configuration here).
  EncodingDetector detector = new EncodingDetector(conf);
  // do everything as bytes; don't want any conversion
  ByteArrayOutputStream ostr = new ByteArrayOutputStream();
  BufferedInputStream istr =
      new BufferedInputStream(new FileInputStream(args[0]));
  try {
    byte[] bytes = new byte[1000];
    int len;
    while ((len = istr.read(bytes)) > 0) {
      ostr.write(bytes, 0, len);
    }
  } finally {
    // The original leaked this stream; always close it.
    istr.close();
  }
  byte[] data = ostr.toByteArray();
  MimeUtil mimeTypes = new MimeUtil(conf);
  // make a fake Content so the normal detection API can be used
  Content content =
      new Content("", "", data, "text/html", new Metadata(), mimeTypes);
  detector.autoDetectClues(content, true);
  String encoding = detector.guessEncoding(content,
      conf.get("parser.character.encoding.default"));
  System.out.println("Guessed encoding: " + encoding);
}
}