package org.archive.wayback.replay.charset;
import java.io.IOException;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.TagMagix;
/**
* {@link EncodingSniffer} that pre-scan byte stream for
* {@code <meta http-equiv="content-type" ... >} tag.
* <p>This is step 6 of WHAT-NG prescription, but decodes pre-scanned
* content as {@code UTF-8} to simplify the code. That should okay
* for the purpose...</p>
* <p>CHANGE: 1.8.1 2014-07-07 override {@code UTF-16} encodings to
* {@code UTF-8}, and x-user-defined encoding to {@code Windows-1252},
* as prescribed by WHAT-NG.</p>
*/
public class PrescanMetadataSniffer extends BaseEncodingSniffer {
// hand off this many bytes to the chardet library
protected final static int MAX_CHARSET_READAHEAD = 65536;
@Override
public String sniff(Resource resource) {
String charsetName = null;
try {
byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
resource.mark(MAX_CHARSET_READAHEAD);
resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
resource.reset();
// convert to UTF-8 String -- which hopefully will not mess up the
// characters we're interested in...
StringBuilder sb = new StringBuilder(new String(bbuffer,
"UTF-8"));
String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
"content", "http-equiv", "Content-Type");
if (metaContentType != null) {
charsetName = contentTypeToCharset(metaContentType);
// override
if (charsetName != null) {
String upped = charsetName.toUpperCase();
if (upped.startsWith("UTF-16"))
charsetName = "UTF-8";
}
}
return charsetName;
} catch (IOException ex) {
// TODO: log at FINE.
return null;
}
}
}