/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.format.text.charset;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.format.http.HttpHeader;
import org.archive.format.http.HttpHeaders;
import org.mozilla.universalchardet.UniversalDetector;
/**
* Abstract class containing common methods for determining the character
* encoding of a text Resource, most of which should be refactored into a
* Util package.
* @author brad
*
*/
public abstract class CharsetDetector {
	private final static String META_TAGNAME = "META";
	private final static String META_CONTENT_ATTRIBUTE = "content";
	private final static String META_HTTP_EQUIV_ATTRIBUTE = "http-equiv";
	private final static String META_CONTENT_TYPE = "Content-Type";
	// Attribute value alternatives: double-quoted, backslash-escaped-quoted,
	// and single-quoted forms.
	private final static String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")";
	private final static String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")";
	private final static String APOSED_ATTR_VALUE = "(?:'[^'>]*')";
	// private final static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)";
	// NOTE: the trailing "|" (left over from the removed RAW_ATTR_VALUE
	// alternative) makes the capture group able to match the EMPTY string;
	// trimAttrValue() must therefore tolerate empty input.
	private final static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|"
		+ APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|";
	private final static String META_TAG_PATTERN_STRING = "<\\s*" + META_TAGNAME
		+ "((>)|(\\s+[^>]*>))";
	private final static String META_CONTENT_ATTR_PATTERN_STRING = "\\b" +
		META_CONTENT_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
	private final static String META_HTTP_EQUIV_ATTR_PATTERN_STRING = "\\b" +
		META_HTTP_EQUIV_ATTRIBUTE + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
	// Patterns are compiled once and cached; Pattern is thread-safe.
	private final static Pattern META_TAG_PATTERN =
		Pattern.compile(META_TAG_PATTERN_STRING, Pattern.CASE_INSENSITIVE);
	private final static Pattern META_CONTENT_ATTR_PATTERN =
		Pattern.compile(META_CONTENT_ATTR_PATTERN_STRING,
			Pattern.CASE_INSENSITIVE);
	private final static Pattern META_HTTP_EQUIV_ATTR_PATTERN =
		Pattern.compile(META_HTTP_EQUIV_ATTR_PATTERN_STRING,
			Pattern.CASE_INSENSITIVE);
	// hand off this many bytes to the chardet library
	protected final static int MAX_CHARSET_READAHEAD = 65536;
	// ...if it also includes "charset="
	protected final static String CHARSET_TOKEN = "charset=";
	// ...and if the chardet library fails, use the Content-Type header
	protected final static String HTTP_CONTENT_TYPE_HEADER = "CONTENT-TYPE";
	/** the default charset name to use when giving up */
	public final static String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Null-safe, exception-safe wrapper around Charset.isSupported(), which
	 * throws a runtime exception for syntactically illegal names instead of
	 * simply returning false.
	 *
	 * @param charsetName candidate charset name, possibly null
	 * @return true if the name is non-null, legal, and supported by this JVM
	 */
	protected boolean isCharsetSupported(String charsetName) {
		if (charsetName == null) {
			return false;
		}
		try {
			return Charset.isSupported(charsetName);
		} catch (IllegalCharsetNameException e) {
			// illegal name == not supported
			return false;
		}
	}

	/**
	 * Map declared/detected charsets that are frequently mislabeled in the
	 * wild to a more forgiving superset.
	 *
	 * @param orig charset name to map
	 * @return "cp1252" when orig names an ISO-8859-1 variant (cp1252 is a
	 *         superset that is usually what such pages really contain),
	 *         otherwise orig unchanged
	 */
	protected String mapCharset(String orig) {
		// Locale.ROOT: avoid locale-sensitive case folding (Turkish dotless-i)
		String lc = orig.toLowerCase(Locale.ROOT);
		if (lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
			return "cp1252";
		}
		return orig;
	}

	/**
	 * Extract a supported charset name from a Content-Type style value, e.g.
	 * "text/html; charset=utf-8".
	 *
	 * @param contentType Content-Type header value or META content value
	 * @return mapped charset name, or null if none was found or supported
	 */
	protected String contentTypeToCharset(final String contentType) {
		// Locale.ROOT keeps the "charset=" search locale-independent
		int offset = contentType.toUpperCase(Locale.ROOT)
			.indexOf(CHARSET_TOKEN.toUpperCase(Locale.ROOT));
		if (offset == -1) {
			return null;
		}
		String cs = contentType.substring(offset + CHARSET_TOKEN.length());
		// trailing parameters ("utf-8; foo=bar") are not part of the name
		int semi = cs.indexOf(';');
		if (semi != -1) {
			cs = cs.substring(0, semi);
		}
		cs = cs.trim();
		// some pages write charset="utf-8" or charset='utf-8'
		if (cs.length() >= 2
				&& ((cs.startsWith("\"") && cs.endsWith("\""))
					|| (cs.startsWith("'") && cs.endsWith("'")))) {
			cs = cs.substring(1, cs.length() - 1).trim();
		}
		if (isCharsetSupported(cs)) {
			return mapCharset(cs);
		}
		// test for extra spaces... there's at least one page out there that
		// indicates its charset with:
		// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
		// bad web page!
		String alternate = cs.replace(" ", "");
		if (isCharsetSupported(alternate)) {
			return mapCharset(alternate);
		}
		return null;
	}

	/**
	 * Attempt to divine the character encoding of the document from the
	 * Content-Type HTTP header (with a "charset=")
	 *
	 * @param headers HTTP headers of the resource, possibly null
	 * @return String character set found or null if the header was not present
	 * @throws IOException
	 */
	protected String getCharsetFromHeaders(HttpHeaders headers)
			throws IOException {
		if (headers == null) {
			return null;
		}
		for (HttpHeader header : headers) {
			// equalsIgnoreCase avoids locale-sensitive toUpperCase()
			if (header.getName().trim()
					.equalsIgnoreCase(HTTP_CONTENT_TYPE_HEADER)) {
				return contentTypeToCharset(header.getValue());
			}
		}
		return null;
	}

	/**
	 * Attempt to find a META tag in the HTML that hints at the character set
	 * used to write the document.
	 *
	 * @param buffer leading bytes of the document
	 * @param len number of valid bytes in buffer
	 * @return String character set found from META tags in the HTML, or null
	 * @throws IOException
	 */
	protected String getCharsetFromMeta(byte buffer[], int len)
			throws IOException {
		// convert to UTF-8 String -- which hopefully will not mess up the
		// characters we're interested in...
		String sample = new String(buffer, 0, len, DEFAULT_CHARSET);
		String metaContentType = findMetaContentType(sample);
		if (metaContentType == null) {
			return null;
		}
		return contentTypeToCharset(metaContentType);
	}

	/**
	 * Strip one layer of surrounding quotes (double or single) from an
	 * attribute value.
	 *
	 * @param value raw matched attribute value; may be empty because the
	 *        attr-value pattern contains an empty alternative
	 * @return value without surrounding quotes
	 */
	private static String trimAttrValue(String value) {
		// Guard: the regex's empty alternative can yield "" (and a lone quote
		// would be length 1); the unguarded charAt(0)/substring here used to
		// throw StringIndexOutOfBoundsException.
		if (value.length() < 2) {
			return value;
		}
		char first = value.charAt(0);
		if (first == '"' || first == '\'') {
			return value.substring(1, value.length() - 1);
		}
		return value;
	}

	/**
	 * Scan an HTML page sample for a META tag declaring Content-Type and
	 * return the value of its "content" attribute.
	 *
	 * @param pageSample HTML text to scan
	 * @return the content attribute value of the first META tag whose
	 *         http-equiv attribute is "Content-Type", or null if none found
	 */
	public static String findMetaContentType(String pageSample) {
		Matcher tagMatcher = META_TAG_PATTERN.matcher(pageSample);
		while (tagMatcher.find()) {
			String wholeTag = tagMatcher.group();
			Matcher whereAttrMatcher =
				META_HTTP_EQUIV_ATTR_PATTERN.matcher(wholeTag);
			if (whereAttrMatcher.find()) {
				String attrValue = trimAttrValue(whereAttrMatcher.group(1));
				if (attrValue.equalsIgnoreCase(META_CONTENT_TYPE)) {
					// this tag contains the right set, return the value for
					// the "content" attribute:
					Matcher findAttrMatcher =
						META_CONTENT_ATTR_PATTERN.matcher(wholeTag);
					String value = null;
					if (findAttrMatcher.find()) {
						value = trimAttrValue(findAttrMatcher.group(1));
					}
					return value;
				}
				// not the tag we want... maybe there is another: loop
			}
		}
		return null;
	}

	/**
	 * Attempts to figure out the character set of the document using
	 * the excellent juniversalchardet library.
	 *
	 * @param buffer leading bytes of the document
	 * @param len number of valid bytes in buffer
	 * @return String character encoding found, or null if nothing looked good.
	 * @throws IOException
	 */
	protected String getCharsetFromBytes(byte buffer[], int len)
			throws IOException {
		UniversalDetector detector = new UniversalDetector(null);
		detector.handleData(buffer, 0, len);
		detector.dataEnd();
		// getDetectedCharset() may return null; isCharsetSupported handles it
		String charsetName = detector.getDetectedCharset();
		detector.reset();
		if (isCharsetSupported(charsetName)) {
			return mapCharset(charsetName);
		}
		return null;
	}

	/**
	 * Determine the charset of a (presumably text) resource.
	 *
	 * @param buffer leading bytes of the resource
	 * @param len number of valid bytes in buffer
	 * @param headers HTTP headers which may contain additional hints
	 * @return String charset name for the Resource
	 * @throws IOException if there are problems reading the Resource
	 */
	public abstract String getCharset(byte buffer[], int len, HttpHeaders headers)
		throws IOException;
}