/*
 *  This file is part of the Wayback archival access software
 *  (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.replay.charset;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Iterator;
import java.util.Map;

import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.replay.TagMagix;
import org.mozilla.universalchardet.UniversalDetector;

/**
 * Abstract class containing common methods for determining the character
 * encoding of a text Resource, most of which should be refactored into a
 * Util package.
 *
 * @author brad
 */
public abstract class CharsetDetector {
	// hand off this many bytes to the chardet library
	protected final static int MAX_CHARSET_READAHEAD = 65536;
	// token searched for within a Content-Type value (HTTP header or META tag)
	protected final static String CHARSET_TOKEN = "charset=";
	// name of the HTTP header that may carry a charset declaration
	protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
	/** the default charset name to use when giving up */
	public final static String DEFAULT_CHARSET = "UTF-8";

	protected boolean isCharsetSupported(String charsetName) {
		// Charset.isSupported() throws a runtime exception for an illegal
		// charset name rather than just answering "no", so guard against
		// null and catch the exception here:
		if (charsetName == null) {
			return false;
		}
		try {
			return Charset.isSupported(charsetName);
		} catch (IllegalCharsetNameException e) {
			return false;
		}
	}

	/**
	 * Map a declared charset name to the one that should actually be used.
	 * Pages declaring ISO-8859-1 frequently contain windows-1252 bytes, so
	 * treat them as cp1252.
	 */
	protected String mapCharset(String orig) {
		String lc = orig.toLowerCase();
		if (lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
			return "cp1252";
		}
		return orig;
	}

	/**
	 * Extract the charset name following "charset=" in a Content-Type value.
	 *
	 * @param contentType Content-Type value from an HTTP header or META tag
	 * @return supported (possibly remapped) charset name, or null if none found
	 */
	protected String contentTypeToCharset(final String contentType) {
		int offset =
			contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());

		if (offset != -1) {
			String cs = contentType.substring(offset + CHARSET_TOKEN.length());
			if (isCharsetSupported(cs)) {
				return mapCharset(cs);
			}
			// test for extra spaces... there's at least one page out there
			// that indicates its charset with:
			//   <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
			// bad web page!
			if (isCharsetSupported(cs.replace(" ", ""))) {
				return mapCharset(cs.replace(" ", ""));
			}
		}
		return null;
	}

	/**
	 * Attempt to divine the character encoding of the document from the
	 * Content-Type HTTP header (with a "charset=").
	 *
	 * @param resource Resource whose HTTP headers should be inspected
	 * @return String character set found, or null if no usable charset was
	 *         declared in the header
	 * @throws IOException on error reading the Resource
	 */
	protected String getCharsetFromHeaders(Resource resource)
			throws IOException {
		String charsetName = null;

		Map<String, String> httpHeaders = resource.getHttpHeaders();
		Iterator<String> keys = httpHeaders.keySet().iterator();
		String ctype = null;
		while (keys.hasNext()) {
			String headerKey = keys.next();
			String keyCmp = headerKey.toUpperCase().trim();
			if (keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) {
				ctype = httpHeaders.get(headerKey);
				break;
			}
		}
		if (ctype != null) {
			charsetName = contentTypeToCharset(ctype);
		}
		return charsetName;
	}

	/**
	 * Attempt to find a META tag in the HTML that hints at the character set
	 * used to write the document.
	 *
	 * @param resource InputStream positioned at the start of the document;
	 *                 must support mark()/reset()
	 * @return String character set found from META tags in the HTML, or null
	 * @throws IOException on error reading the stream
	 */
	protected String getCharsetFromMeta(InputStream resource)
			throws IOException {
		String charsetName = null;

		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
		resource.mark(MAX_CHARSET_READAHEAD);
		int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
		resource.reset();
		if (len == -1) {
			// empty document: nothing to inspect
			return null;
		}
		// decode the sample as UTF-8 -- which hopefully will not mangle the
		// (ASCII) characters we're interested in...
		StringBuilder sb =
			new StringBuilder(new String(bbuffer, 0, len, DEFAULT_CHARSET));
		String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
				"content", "http-equiv", "Content-Type");
		if (metaContentType != null) {
			charsetName = contentTypeToCharset(metaContentType);
		}
		return charsetName;
	}

	/**
	 * Attempts to figure out the character set of the document using
	 * the excellent juniversalchardet library.
	 *
	 * @param resource InputStream positioned at the start of the document;
	 *                 must support mark()/reset()
	 * @return String character encoding found, or null if nothing looked good
	 * @throws IOException on error reading the stream
	 */
	protected String getCharsetFromBytes(InputStream resource)
			throws IOException {
		String charsetName = null;

		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
		// (1) create the detector, with no hint about the declared charset
		UniversalDetector detector = new UniversalDetector(null);
		// (2) read up to MAX_CHARSET_READAHEAD bytes, then rewind the stream
		resource.mark(MAX_CHARSET_READAHEAD);
		int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
		resource.reset();
		if (len == -1) {
			// empty document: nothing to detect
			return null;
		}
		// (3) feed the sample to the detector
		detector.handleData(bbuffer, 0, len);
		// (4) signal end of data
		detector.dataEnd();
		// (5) ask for the verdict
		charsetName = detector.getDetectedCharset();
		detector.reset();

		if (isCharsetSupported(charsetName)) {
			return charsetName;
		}
		return null;
	}

	/**
	 * @param resource (presumably text) Resource whose charset should be
	 *                 determined
	 * @param request WaybackRequest which may contain additional hints for
	 *                processing
	 * @return String charset name for the Resource
	 * @throws IOException if there are problems reading the Resource
	 */
	public abstract String getCharset(Resource resource, WaybackRequest request)
			throws IOException;
}
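
/*
 * Illustrative sketch only -- not part of the original class. This is a
 * hypothetical concrete subclass (ExampleChainedCharsetDetector is an
 * invented name) showing one plausible way the protected helpers above
 * could be chained: Content-Type header first, then the META tag, then
 * byte-level sniffing, falling back to DEFAULT_CHARSET. It assumes a
 * Resource can be passed where an InputStream is expected and that the
 * stream supports mark()/reset(), as the helper signatures above imply.
 * Shipping Wayback detectors have their own ordering and names; this
 * class exists purely to document the intended call pattern.
 */
class ExampleChainedCharsetDetector extends CharsetDetector {
	@Override
	public String getCharset(Resource resource, WaybackRequest request)
			throws IOException {
		// Most explicit signal first: the HTTP Content-Type header.
		String charset = getCharsetFromHeaders(resource);
		if (charset == null) {
			// Next, any <meta http-equiv="Content-Type"> declaration.
			charset = getCharsetFromMeta(resource);
		}
		if (charset == null) {
			// Last resort: statistical detection over the raw bytes.
			charset = getCharsetFromBytes(resource);
		}
		// Give up and assume the default.
		return (charset == null) ? DEFAULT_CHARSET : charset;
	}
}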