package org.archive.wayback.replay.charset;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Locale;

import org.archive.wayback.core.Resource;

/**
 * Implements common utility methods for EncodingSniffer.
 */
public abstract class BaseEncodingSniffer implements EncodingSniffer {

    /** Parameter prefix that introduces the charset in a content-type value. */
    protected static final String CHARSET_TOKEN = "charset=";

    /** Name of the HTTP header carrying the content-type. */
    protected static final String HTTP_CONTENT_TYPE_HEADER = "Content-Type";

    public abstract String sniff(Resource resource);

    /**
     * Tests if {@code charsetName} is supported by Java.
     *
     * @param charsetName character encoding name, may be {@code null}
     * @return {@code true} if supported; {@code false} if {@code null},
     *         syntactically illegal, or unknown to this JVM.
     */
    protected boolean isCharsetSupported(String charsetName) {
        if (charsetName == null) {
            return false;
        }
        // Charset.isSupported throws IllegalCharsetNameException (a
        // RuntimeException) when the name is syntactically illegal, rather
        // than simply returning false.
        try {
            return Charset.isSupported(charsetName);
        } catch (IllegalCharsetNameException ex) {
            return false;
        }
    }

    /**
     * Returns the character encoding named in a content-type value, if
     * specified and valid.
     * <p>Some encoding names are replaced, see {@link #mapCharset(String)}.
     * {@code "x-user-defined"} is treated as {@code "windows-1252"}.</p>
     * <p>The {@code charset=} token is matched case-insensitively. The value
     * ends at the next {@code ';'} (if any); surrounding whitespace and one
     * pair of surrounding quotes are dropped. If the resulting name is not
     * supported, it is retried with all space characters removed.</p>
     *
     * @param contentType content-type text, ex.
     *        {@code "text/html; charset=shift_jis"}; may be {@code null}
     * @return character encoding, ex. {@code "shift_jis"}, or {@code null} if
     *         the character encoding is unspecified or invalid.
     */
    protected String contentTypeToCharset(final String contentType) {
        if (contentType == null) {
            return null;
        }
        // Search the original string case-insensitively instead of
        // upper-casing a copy: case conversion is locale-sensitive and can
        // change the string length, which would make the offset wrong.
        final int offset = indexOfIgnoreCase(contentType, CHARSET_TOKEN);
        if (offset == -1) {
            return null;
        }
        String cs = contentType.substring(offset + CHARSET_TOKEN.length());
        // Drop any parameters that follow the charset value.
        final int paramEnd = cs.indexOf(';');
        if (paramEnd != -1) {
            cs = cs.substring(0, paramEnd);
        }
        cs = stripQuotes(cs.trim());

        if (cs.equalsIgnoreCase("x-user-defined")) {
            cs = "windows-1252";
        }
        if (isCharsetSupported(cs)) {
            return mapCharset(cs);
        }
        // test for extra spaces... there's at least one page out there
        // that indicates its charset with:
        //   <meta http-equiv="Content-type"
        //         content="text/html; charset=i so-8859-1">
        // bad web page!
        final String compact = cs.replace(" ", "");
        if (isCharsetSupported(compact)) {
            return mapCharset(compact);
        }
        return null;
    }

    /**
     * Replaces certain charset names with the name actually used for decoding.
     * <ul>
     * <li>names containing {@code "iso8859-1"} or {@code "iso-8859-1"}
     * (case-insensitive) become {@code "windows-1252"};</li>
     * <li>names containing {@code "unicode"} (case-insensitive) become
     * {@link CharsetDetector#DEFAULT_CHARSET};</li>
     * <li>anything else is returned unchanged.</li>
     * </ul>
     *
     * @param orig charset name, may be {@code null}
     * @return mapped charset name, or {@code null} if {@code orig} is
     *         {@code null}
     */
    protected String mapCharset(String orig) {
        if (orig == null) {
            return null;
        }
        // Locale.ROOT: under e.g. a Turkish default locale, "ISO" would
        // lower-case to a dotless i and the checks below would never match.
        final String lc = orig.toLowerCase(Locale.ROOT);
        if (lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
            return "windows-1252";
        }
        if (lc.contains("unicode")) {
            return CharsetDetector.DEFAULT_CHARSET;
        }
        return orig;
    }

    /**
     * Finds the first occurrence of {@code token} in {@code text}, ignoring
     * case, without building case-converted copies of either string.
     *
     * @return index of the first match, or {@code -1} if there is none
     */
    private static int indexOfIgnoreCase(String text, String token) {
        final int tokenLength = token.length();
        final int lastStart = text.length() - tokenLength;
        for (int i = 0; i <= lastStart; i++) {
            if (text.regionMatches(true, i, token, 0, tokenLength)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Removes one pair of matching surrounding double or single quotes, and
     * any whitespace directly inside them. Returns {@code value} unchanged if
     * it is not quoted.
     */
    private static String stripQuotes(String value) {
        if (value.length() >= 2) {
            final char first = value.charAt(0);
            final char last = value.charAt(value.length() - 1);
            if (first == last && (first == '"' || first == '\'')) {
                return value.substring(1, value.length() - 1).trim();
            }
        }
        return value;
    }
}