package org.apache.nutch.util;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import junit.framework.TestCase;
/**
 * Tests the encoding that {@link EncodingDetector} guesses for one fixed
 * UTF-8 encoded sample, first with auto detection disabled and then with it
 * enabled, while varying the clues (Content-Type header, explicitly added
 * clue) that are made available to the detector.
 */
public class TestEncodingDetector extends TestCase {

  /** Configuration shared by every {@link Content} and detector built here. */
  private static final Configuration conf = NutchConfiguration.create();

  /** UTF-8 bytes of a short string of accented Latin and Cyrillic letters. */
  private static final byte[] contentInOctets;

  static {
    try {
      contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8");
    } catch (UnsupportedEncodingException e) {
      // Should never happen, but if it does, fail here with the real cause
      // instead of leaving the fixture null and hitting an NPE in the test.
      throw new IllegalStateException("utf-8 encoding is not supported", e);
    }
  }

  public TestEncodingDetector(String name) {
    super(name);
  }

  /**
   * Runs four scenarios against the same sample bytes and checks the guessed
   * encoding in each of them.
   */
  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    Metadata metadata = new Metadata();

    // no information is available, so it should return default encoding
    assertEncoding("windows-1252", guess(metadata, null));

    // the charset declared in the Content-Type header is expected to win
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    assertEncoding("utf-16", guess(metadata, null));

    // without a header, the explicitly added clue is expected to win
    metadata.clear();
    assertEncoding("windows-1254", guess(metadata, "windows-1254"));

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);

    // the header and the added clue both disagree with the actual bytes;
    // utf-8 is expected regardless
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    assertEncoding("utf-8", guess(metadata, "utf-32"));
  }

  /**
   * Builds a {@link Content} around the sample bytes with the given metadata,
   * feeds it to a fresh {@link EncodingDetector} and returns its guess, using
   * "windows-1252" as the default encoding.
   *
   * @param metadata    metadata attached to the content
   * @param sniffedClue encoding name to add as a clue with source "sniffed"
   *                    after auto detection has run, or {@code null} to add
   *                    no extra clue
   * @return the value returned by the detector's {@code guessEncoding}
   */
  private static String guess(Metadata metadata, String sniffedClue) {
    Content content = new Content("http://www.example.com",
        "http://www.example.com/", contentInOctets, "text/plain", metadata,
        conf);
    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    if (sniffedClue != null) {
      detector.addClue(sniffedClue, "sniffed");
    }
    return detector.guessEncoding(content, "windows-1252");
  }

  /**
   * Asserts that {@code actual} names the expected encoding, ignoring case.
   *
   * @param expected expected encoding name, in lower case
   * @param actual   encoding name returned by the detector
   */
  private static void assertEncoding(String expected, String actual) {
    assertNotNull("detector returned no encoding", actual);
    // Locale.ROOT keeps the case folding independent of the default locale
    // (e.g. the Turkish dotless i).
    assertEquals(expected, actual.toLowerCase(Locale.ROOT));
  }
}