HTMLResourceEncodingDetector.java example

Explorer
webtools.sourceediting-master
/*******************************************************************************
 * Copyright (c) 2004, 2012 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.wst.html.core.internal.contenttype;

import java.io.IOException;
import java.util.regex.Pattern;

import org.eclipse.core.runtime.content.IContentDescription;
import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;
import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;

public class HTMLResourceEncodingDetector extends AbstractResourceEncodingDetector implements IResourceCharsetDetector {

	private HTMLHeadTokenizer fTokenizer;

	/**
	 * There is no spec defined encoding for HTML (historically), so null is
	 * returned.
	 */
	public String getSpecDefaultEncoding() {
		return null;
	}

	private boolean canHandleAsUnicodeStream(String tokenType) {
		boolean canHandleAsUnicodeStream = false;
		if (tokenType == EncodingParserConstants.UTF83ByteBOM) {
			canHandleAsUnicodeStream = true;
			String enc = "UTF-8"; //$NON-NLS-1$
			createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
			fEncodingMemento.setUTF83ByteBOMUsed(true);
		}
		else if (tokenType == EncodingParserConstants.UTF16BE || tokenType == EncodingParserConstants.UTF16LE) {
			canHandleAsUnicodeStream = true;
			String enc = "UTF-16"; //$NON-NLS-1$
			byte[] bom = (tokenType == EncodingParserConstants.UTF16BE) ? IContentDescription.BOM_UTF_16BE : IContentDescription.BOM_UTF_16LE;
			createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
			fEncodingMemento.setUnicodeStream(true);
			fEncodingMemento.setUnicodeBOM(bom);
		}
		return canHandleAsUnicodeStream;
	}

	/**
	 * @return Returns the tokenizer.
	 */
	private HTMLHeadTokenizer getTokenizer() {
		// TODO: need to work on 'reset' in tokenizer, so new instance isn't
		// always needed
		//if (fTokenizer == null) {
		fTokenizer = new HTMLHeadTokenizer();
		//		}
		return fTokenizer;
	}

	private boolean isLegalString(String valueTokenType) {
		if (valueTokenType == null)
			return false;
		else
			return valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue);
	}

	protected void parseInput() throws IOException {
		checkInContent();
		if (fEncodingMemento == null) {
			checkHeuristics();
		}
	}

	/**
	 *  
	 */
	private void checkHeuristics() throws IOException {
		boolean noHeuristic = false;
		String heuristicEncoding = null;
		try {
			if (EncodingGuesser.canGuess()) {
				fReader.reset();
				fReader.mark(CodedIO.MAX_MARK_SIZE);
				byte[] bytes = new byte[CodedIO.MAX_MARK_SIZE];
				int nRead = 0;
				for (int i = 0; i < bytes.length; i++) {
					int oneByte = fReader.read();
					nRead++;
					if (oneByte == -1) {
						break;
					}
					if (oneByte <= 0xFF) {
						bytes[i] = (byte) oneByte;
					}
					else {
						noHeuristic = true;
						break;
					}
				}
				if (!noHeuristic) {
					heuristicEncoding = EncodingGuesser.guessEncoding(bytes, nRead);
				}
			}
		}
		catch (IOException e) {
			// if any IO exception, then not a heuristic case
		}
		finally {
			fReader.reset();
		}
		if (heuristicEncoding != null) {
			createEncodingMemento(heuristicEncoding, EncodingMemento.GUESSED_ENCODING_FROM_STREAM);
		}

	}

	private void checkInContent() throws IOException {
		HTMLHeadTokenizer tokenizer = getTokenizer();
		tokenizer.reset(fReader);
		HeadParserToken token = null;
		String tokenType = null;
		String contentTypeValue = null;
		String xhtmlEncoding = HTMLHeadTokenizerConstants.UNDEFINED;
		boolean isXHTML = false;
		do {
			token = tokenizer.getNextToken();
			tokenType = token.getType();
			if (tokenizer.isXHTML()) {
				isXHTML = true;
				if (!xhtmlEncoding.equals(HTMLHeadTokenizerConstants.UNDEFINED)) {
					if (xhtmlEncoding.length() > 0) {
						createEncodingMemento(xhtmlEncoding, EncodingMemento.FOUND_ENCODING_IN_CONTENT);
						return ;
					}
				}
			}
			if (canHandleAsUnicodeStream(tokenType)) {
				// side effect of canHandle is to create appropriate
				// memento
			}
			else if (tokenType == HTMLHeadTokenizerConstants.MetaTagContentType) {
				if (tokenizer.hasMoreTokens()) {
					HeadParserToken valueToken = tokenizer.getNextToken();
					String valueTokenType = valueToken.getType();
					if (isLegalString(valueTokenType)) {
						contentTypeValue = valueToken.getText();

					}
				}
			}
			else if (tokenType == XMLHeadTokenizerConstants.XMLDelEncoding ) {
				if (tokenizer.hasMoreTokens()) {
					token = tokenizer.getNextToken();
					tokenType = token.getType();
					if (isLegalString(tokenType)) 
						xhtmlEncoding = token.getText();
				}
			}

		}
		while (tokenizer.hasMoreTokens());
		if (contentTypeValue != null) {
			if (tokenizer.hasCharsetAttr()) {
				contentTypeValue = contentTypeValue.trim();
				if (contentTypeValue.length() > 0) {
					createEncodingMemento(contentTypeValue, EncodingMemento.FOUND_ENCODING_IN_CONTENT);
				}
			}
			else {
				parseContentTypeValue(contentTypeValue);
			}
		}
		//Content type is XHTML and no encoding found(since we did't hit return statement), use UTF-8
		//https://bugs.eclipse.org/bugs/show_bug.cgi?id=318768
		if (fEncodingMemento == null && isXHTML) {
			createEncodingMemento("UTF-8", EncodingMemento.DEFAULTS_ASSUMED_FOR_EMPTY_INPUT); //$NON-NLS-1$		
		}
	}

	private void parseContentTypeValue(String contentType) {
		String charset = null;
		Pattern pattern = Pattern.compile(";\\s*charset\\s*=\\s*"); //$NON-NLS-1$
		String[] parts = pattern.split(contentType);
		if (parts.length > 0) {
			// if only one item, it can still be charset instead of
			// contentType
			if (parts.length == 1) {
				if (parts[0].length() > 6) {
					String checkForCharset = parts[0].substring(0, 7);
					if (checkForCharset.equalsIgnoreCase("charset")) { //$NON-NLS-1$
						int eqpos = parts[0].indexOf('=');
						eqpos = eqpos + 1;
						if (eqpos < parts[0].length()) {
							charset = parts[0].substring(eqpos);
							charset = charset.trim();
						}
					}
				}
			}
			else {
				//fContentType = parts[0];
			}
		}
		if (parts.length > 1) {
			charset = parts[1].trim();
		}
		
		if (charset != null && charset.length() > 0) {
			createEncodingMemento(charset, EncodingMemento.FOUND_ENCODING_IN_CONTENT);
		}
	}

}