/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.format.text.charset;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.http.HttpHeader;
import org.archive.format.http.HttpHeaders;
import org.mozilla.universalchardet.UniversalDetector;
/**
* Abstract class containing common methods for determining the character
* encoding of a text Resource, most of which should be refactored into a
* Util package.
* @author brad
*
*/
public abstract class CharsetDetector {
	private final static String META_TAGNAME = "META";
	private final static String META_CONTENT_ATTRIBUTE = "content";
	private final static String META_HTTP_EQUIV_ATTRIBUTE = "http-equiv";
	private final static String META_CONTENT_TYPE = "Content-Type";
	// Attribute value alternatives: double-quoted, backslash-escaped-quoted,
	// and single-quoted forms.
	private final static String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")";
	private final static String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")";
	private final static String APOSED_ATTR_VALUE = "(?:'[^'>]*')";
	// private final static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)";
	// NOTE: the trailing "|" (left over from the removed RAW_ATTR_VALUE
	// alternative) makes the capture group able to match the EMPTY string;
	// trimAttrValue() must therefore tolerate empty input.
	private final static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|"
		+ APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|";
	private final static String META_TAG_PATTERN_STRING = "<\\s*" + META_TAGNAME
		+ "((>)|(\\s+[^>]*>))";
	private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
		META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
	private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
		META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
	// Patterns are compiled once and cached; Pattern is thread-safe.
	private final static Pattern META_TAG_PATTERN =
		Pattern.compile(META_TAG_PATTERN_STRING, Pattern.CASE_INSENSITIVE);
	private final static Pattern META_CONTENT_ATTR_PATTERN =
		Pattern.compile(META_CONTENT_ATTR_PATTERN_STRING,
			Pattern.CASE_INSENSITIVE);
	private final static Pattern META_HTTP_EQUIV_ATTR_PATTERN =
		Pattern.compile(META_HTTP_EQUIV_ATTR_PATTERN_STRING,
			Pattern.CASE_INSENSITIVE);
	// hand off this many bytes to the chardet library
	protected final static int MAX_CHARSET_READAHEAD = 65536;
	// ...if it also includes "charset="
	protected final static String CHARSET_TOKEN = "charset=";
	// ...and if the chardet library fails, use the Content-Type header
	protected final static String HTTP_CONTENT_TYPE_HEADER = "CONTENT-TYPE";
	/** the default charset name to use when giving up */
	public final static String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Null-safe, exception-safe wrapper around Charset.isSupported(), which
	 * throws a runtime exception for syntactically illegal names instead of
	 * simply returning false.
	 *
	 * @param charsetName candidate charset name, possibly null
	 * @return true if the name is non-null, legal, and supported by this JVM
	 */
	protected boolean isCharsetSupported(String charsetName) {
		if (charsetName == null) {
			return false;
		}
		try {
			return Charset.isSupported(charsetName);
		} catch (IllegalCharsetNameException e) {
			// illegal name == not supported
			return false;
		}
	}

	/**
	 * Map declared/detected charsets that are frequently mislabeled in the
	 * wild to a more forgiving superset.
	 *
	 * @param orig charset name to map
	 * @return "cp1252" when orig names an ISO-8859-1 variant (cp1252 is a
	 *         superset that is usually what such pages really contain),
	 *         otherwise orig unchanged
	 */
	protected String mapCharset(String orig) {
		// Locale.ROOT: avoid locale-sensitive case folding (Turkish dotless-i)
		String lc = orig.toLowerCase(Locale.ROOT);
		if (lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
			return "cp1252";
		}
		return orig;
	}

	/**
	 * Extract a supported charset name from a Content-Type style value, e.g.
	 * "text/html; charset=utf-8".
	 *
	 * @param contentType Content-Type header value or META content value
	 * @return mapped charset name, or null if none was found or supported
	 */
	protected String contentTypeToCharset(final String contentType) {
		// Locale.ROOT keeps the "charset=" search locale-independent
		int offset = contentType.toUpperCase(Locale.ROOT)
			.indexOf(CHARSET_TOKEN.toUpperCase(Locale.ROOT));
		if (offset == -1) {
			return null;
		}
		String cs = contentType.substring(offset + CHARSET_TOKEN.length());
		// trailing parameters ("utf-8; foo=bar") are not part of the name
		int semi = cs.indexOf(';');
		if (semi != -1) {
			cs = cs.substring(0, semi);
		}
		cs = cs.trim();
		// some pages write charset="utf-8" or charset='utf-8'
		if (cs.length() >= 2
				&& ((cs.startsWith("\"") && cs.endsWith("\""))
					|| (cs.startsWith("'") && cs.endsWith("'")))) {
			cs = cs.substring(1, cs.length() - 1).trim();
		}
		if (isCharsetSupported(cs)) {
			return mapCharset(cs);
		}
		// test for extra spaces... there's at least one page out there that
		// indicates its charset with:
		// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
		// bad web page!
		String alternate = cs.replace(" ", "");
		if (isCharsetSupported(alternate)) {
			return mapCharset(alternate);
		}
		return null;
	}

	/**
	 * Attempt to divine the character encoding of the document from the
	 * Content-Type HTTP header (with a "charset=")
	 *
	 * @param headers HTTP headers of the resource, possibly null
	 * @return String character set found or null if the header was not present
	 * @throws IOException
	 */
	protected String getCharsetFromHeaders(HttpHeaders headers)
			throws IOException {
		if (headers == null) {
			return null;
		}
		for (HttpHeader header : headers) {
			// equalsIgnoreCase avoids locale-sensitive toUpperCase()
			if (header.getName().trim()
					.equalsIgnoreCase(HTTP_CONTENT_TYPE_HEADER)) {
				return contentTypeToCharset(header.getValue());
			}
		}
		return null;
	}

	/**
	 * Attempt to find a META tag in the HTML that hints at the character set
	 * used to write the document.
	 *
	 * @param buffer leading bytes of the document
	 * @param len number of valid bytes in buffer
	 * @return String character set found from META tags in the HTML, or null
	 * @throws IOException
	 */
	protected String getCharsetFromMeta(byte buffer[], int len)
			throws IOException {
		// convert to UTF-8 String -- which hopefully will not mess up the
		// characters we're interested in...
		String sample = new String(buffer, 0, len, DEFAULT_CHARSET);
		String metaContentType = findMetaContentType(sample);
		if (metaContentType == null) {
			return null;
		}
		return contentTypeToCharset(metaContentType);
	}

	/**
	 * Strip one layer of surrounding quotes (double or single) from an
	 * attribute value.
	 *
	 * @param value raw matched attribute value; may be empty because the
	 *        attr-value pattern contains an empty alternative
	 * @return value without surrounding quotes
	 */
	private static String trimAttrValue(String value) {
		// Guard: the regex's empty alternative can yield "" (and a lone quote
		// would be length 1); the unguarded charAt(0)/substring here used to
		// throw StringIndexOutOfBoundsException.
		if (value.length() < 2) {
			return value;
		}
		char first = value.charAt(0);
		if (first == '"' || first == '\'') {
			return value.substring(1, value.length() - 1);
		}
		return value;
	}

	/**
	 * Scan an HTML page sample for a META tag declaring Content-Type and
	 * return the value of its "content" attribute.
	 *
	 * @param pageSample HTML text to scan
	 * @return the content attribute value of the first META tag whose
	 *         http-equiv attribute is "Content-Type", or null if none found
	 */
	public static String findMetaContentType(String pageSample) {
		Matcher tagMatcher = META_TAG_PATTERN.matcher(pageSample);
		while (tagMatcher.find()) {
			String wholeTag = tagMatcher.group();
			Matcher whereAttrMatcher =
				META_HTTP_EQUIV_ATTR_PATTERN.matcher(wholeTag);
			if (whereAttrMatcher.find()) {
				String attrValue = trimAttrValue(whereAttrMatcher.group(1));
				if (attrValue.equalsIgnoreCase(META_CONTENT_TYPE)) {
					// this tag contains the right set, return the value for
					// the "content" attribute:
					Matcher findAttrMatcher =
						META_CONTENT_ATTR_PATTERN.matcher(wholeTag);
					String value = null;
					if (findAttrMatcher.find()) {
						value = trimAttrValue(findAttrMatcher.group(1));
					}
					return value;
				}
				// not the tag we want... maybe there is another: loop
			}
		}
		return null;
	}

	/**
	 * Attempts to figure out the character set of the document using
	 * the excellent juniversalchardet library.
	 *
	 * @param buffer leading bytes of the document
	 * @param len number of valid bytes in buffer
	 * @return String character encoding found, or null if nothing looked good.
	 * @throws IOException
	 */
	protected String getCharsetFromBytes(byte buffer[], int len)
			throws IOException {
		UniversalDetector detector = new UniversalDetector(null);
		detector.handleData(buffer, 0, len);
		detector.dataEnd();
		// getDetectedCharset() may return null; isCharsetSupported handles it
		String charsetName = detector.getDetectedCharset();
		detector.reset();
		if (isCharsetSupported(charsetName)) {
			return mapCharset(charsetName);
		}
		return null;
	}

	/**
	 * Determine the charset of a (presumably text) resource.
	 *
	 * @param buffer leading bytes of the resource
	 * @param len number of valid bytes in buffer
	 * @param headers HTTP headers which may contain additional hints
	 * @return String charset name for the Resource
	 * @throws IOException if there are problems reading the Resource
	 */
	public abstract String getCharset(byte buffer[], int len, HttpHeaders headers)
		throws IOException;
}