JSPResourceEncodingDetector.java example

Explorer
webtools.sourceediting-master
/*******************************************************************************
 * Copyright (c) 2004, 2009 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.jst.jsp.core.internal.contenttype;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;

import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.content.IContentDescription;
import org.eclipse.jst.jsp.core.internal.Logger;
import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;
import org.eclipse.wst.sse.core.internal.encoding.NonContentBasedEncodingRules;
import org.eclipse.wst.sse.core.utils.StringUtils;
import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;

import com.ibm.icu.util.StringTokenizer;

public class JSPResourceEncodingDetector implements IResourceCharsetDetector {

	private String fCharset;

	private String fContentType;

	private String fContentTypeValue;

	private String fLanguage;

	private String fPageEncodingValue;

	private JSPHeadTokenizer fTokenizer;

	private String fXMLDecEncodingName;

	private boolean unicodeCase;

	private EncodingMemento fEncodingMemento;

	private boolean fHeaderParsed;

	private Reader fReader;

	private boolean fXHTML;

	private boolean fWML;


	/**
	 * No Arg constructor.
	 */
	public JSPResourceEncodingDetector() {
		super();
	}

	class NullMemento extends EncodingMemento {
		/**
		 * 
		 */
		public NullMemento() {
			super();
			String defaultCharset = NonContentBasedEncodingRules.useDefaultNameRules(null);
			setJavaCharsetName(defaultCharset);
			setAppropriateDefault(defaultCharset);
			setDetectedCharsetName(null);
		}

	}

	/**
	 * @return Returns the contentType.
	 */
	public String getContentType() throws IOException {
		ensureInputSet();
		if (!fHeaderParsed) {
			parseInput();
			// we keep track of if header's already been parse, so can make
			// multiple 'get' calls, without causing reparsing.
			fHeaderParsed = true;
			// Note: there is a "hidden assumption" here that an empty
			// string in content should be treated same as not present.
		}
		return fContentType;
	}

	public String getEncoding() throws IOException {
		return getEncodingMemento().getDetectedCharsetName();
	}

	// to ensure consist overall rules used, we'll mark as
	// final,
	// and require subclasses to provide certain pieces of
	// the
	// implementation
	public EncodingMemento getEncodingMemento() throws IOException {
		ensureInputSet();
		if (!fHeaderParsed) {
			parseInput();
			// we keep track of if header's already been
			// parse, so can make
			// multiple 'get' calls, without causing
			// reparsing.
			fHeaderParsed = true;
			// Note: there is a "hidden assumption" here
			// that an empty
			// string in content should be treated same as
			// not present.
		}
		if (fEncodingMemento == null) {
			handleSpecDefault();
		}
		if (fEncodingMemento == null) {
			// safty net
			fEncodingMemento = new NullMemento();
		}
		return fEncodingMemento;
	}

	public String getLanguage() throws IOException {
		ensureInputSet();
		if (!fHeaderParsed) {
			parseInput();
			fHeaderParsed = true;
		}
		return fLanguage;
	}

	public String getSpecDefaultEncoding() {
		// by JSP Spec
		final String enc = "ISO-8859-1"; //$NON-NLS-1$
		return enc;
	}

	public EncodingMemento getSpecDefaultEncodingMemento() {
		resetAll();
		EncodingMemento result = null;
		String enc = getSpecDefaultEncoding();
		if (enc != null) {
			createEncodingMemento(enc, EncodingMemento.DEFAULTS_ASSUMED_FOR_EMPTY_INPUT);
			fEncodingMemento.setAppropriateDefault(enc);
			result = fEncodingMemento;
		}
		return result;
	}

	/**
	 * 
	 */
	public void set(InputStream inputStream) {
		resetAll();
		fReader = new ByteReader(inputStream);
		try {
			fReader.mark(CodedIO.MAX_MARK_SIZE);
		}
		catch (IOException e) {
			// impossible, since we know ByteReader
			// supports marking
			throw new Error(e);
		}
	}

	/**
	 * 
	 */
	public void set(IStorage iStorage) throws CoreException {
		resetAll();
		InputStream inputStream = iStorage.getContents();
		InputStream resettableStream = new BufferedInputStream(inputStream, CodedIO.MAX_BUF_SIZE);
		resettableStream.mark(CodedIO.MAX_MARK_SIZE);
		set(resettableStream);
		// TODO we'll need to "remember" IFile, or
		// get its (or its project's) settings, in case
		// those are needed to handle cases when the
		// encoding is not in the file stream.
	}

	/**
	 * Note: this is not part of interface to help avoid confusion ... it
	 * expected this Reader is a well formed character reader ... that is, its
	 * all ready been determined to not be a unicode marked input stream. And,
	 * its assumed to be in the correct position, at position zero, ready to
	 * read first character.
	 */
	public void set(Reader reader) {
		resetAll();
		fReader = reader;
		if (!fReader.markSupported()) {
			fReader = new BufferedReader(fReader);
		}
		try {
			fReader.mark(CodedIO.MAX_MARK_SIZE);
		}
		catch (IOException e) {
			// impossble, since we just checked if markable
			throw new Error(e);
		}
	}

	private boolean canHandleAsUnicodeStream(String tokenType) {
		boolean canHandleAsUnicode = false;
		if (tokenType == EncodingParserConstants.UTF83ByteBOM) {
			canHandleAsUnicode = true;
			String enc = "UTF-8"; //$NON-NLS-1$
			createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
			fEncodingMemento.setUTF83ByteBOMUsed(true);
		}
		else if (tokenType == EncodingParserConstants.UTF16BE || tokenType == EncodingParserConstants.UTF16LE) {
			canHandleAsUnicode = true;
			String enc = "UTF-16"; //$NON-NLS-1$
			byte[] bom = (tokenType == EncodingParserConstants.UTF16BE) ? IContentDescription.BOM_UTF_16BE : IContentDescription.BOM_UTF_16LE;
			createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES);
			fEncodingMemento.setUnicodeStream(true);
			fEncodingMemento.setUnicodeBOM(bom);
		}
		return canHandleAsUnicode;
	}

	/**
	 * Note: once this instance is created, trace info still needs to be
	 * appended by caller, depending on the context its created.
	 */
	private void createEncodingMemento(String detectedCharsetName) {
		fEncodingMemento = new EncodingMemento();
		fEncodingMemento.setJavaCharsetName(getAppropriateJavaCharset(detectedCharsetName));
		fEncodingMemento.setDetectedCharsetName(detectedCharsetName);
		// TODO: if detectedCharset and spec default is
		// null, need to use "work
		// bench based" defaults.
		fEncodingMemento.setAppropriateDefault(getSpecDefaultEncoding());
	}

	/**
	 * There can sometimes be mulitple 'encodings' specified in a file. This
	 * is an attempt to centralize the rules for deciding between them.
	 * Returns encoding according to priority: 1. XML Declaration 2. page
	 * directive pageEncoding name 3. page directive contentType charset name
	 */
	private String getAppropriateEncoding() {
		String result = null;
		if (fXMLDecEncodingName != null)
			result = fXMLDecEncodingName;
		else if (fPageEncodingValue != null)
			result = fPageEncodingValue;
		else if (fCharset != null)
			result = fCharset;
		return result;
	}

	/**
	 * This method can return null, if invalid charset name (in which case
	 * "appropriateDefault" should be used, if a name is really need for some
	 * "save anyway" cases).
	 * 
	 * @param detectedCharsetName
	 * @return
	 */
	private String getAppropriateJavaCharset(String detectedCharsetName) {
		String result = null;
		// 1. Check explicit mapping overrides from
		// property file -- its here we pick up "rules" for cases
		// that are not even in Java
		result = CodedIO.checkMappingOverrides(detectedCharsetName);
		// 2. Use the "canonical" name from JRE mappings
		// Note: see Charset JavaDoc, the name you get one
		// with can be alias,
		// the name you get back is "standard" name.
		Charset javaCharset = null;
		try {
			javaCharset = Charset.forName(detectedCharsetName);
		}
		catch (UnsupportedCharsetException e) {
			// only set invalid, if result is same as detected -- they won't
			// be equal if
			// overridden
			if (result != null && result.equals(detectedCharsetName)) {
				fEncodingMemento.setInvalidEncoding(detectedCharsetName);
			}
		}
		catch (IllegalCharsetNameException e) {
			// only set invalid, if result is same as detected -- they won't
			// be equal if
			// overridden
			if (result != null && result.equals(detectedCharsetName)) {
				fEncodingMemento.setInvalidEncoding(detectedCharsetName);
			}
		}
		// give priority to java cononical name, if present
		if (javaCharset != null) {
			result = javaCharset.name();
			// but still allow overrides
			result = CodedIO.checkMappingOverrides(result);
		}
		return result;
	}

	private JSPHeadTokenizer getTokinizer() {
		if (fTokenizer == null) {
			fTokenizer = new JSPHeadTokenizer();
		}
		return fTokenizer;
	}

	private void handleSpecDefault() {
		String encodingName;
		encodingName = getSpecDefaultEncoding();
		if (encodingName != null) {
			// createEncodingMemento(encodingName,
			// EncodingMemento.USED_CONTENT_TYPE_DEFAULT);
			fEncodingMemento = new EncodingMemento();
			fEncodingMemento.setJavaCharsetName(encodingName);
			fEncodingMemento.setAppropriateDefault(encodingName);
		}
	}

	private boolean isLegalString(String valueTokenType) {
		boolean result = false;
		if (valueTokenType != null) {
			result = valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue);
		}
		return result;
	}


	/**
	 * This method should be exactly the same as what is in 
	 * JSPHeadTokenizerTester
	 * @param contentType
	 */
	private void parseContentTypeValue(String contentType) {
		/*
		 * Based partially on
		 * org.eclipse.jst.jsp.core.internal.document.PageDirectiveAdapterImpl
		 * .getMimeTypeFromContentTypeValue(String) , divides the full value
		 * into segments according to ';', assumes the first specifies the
		 * content type itself if it has no '=', and that the remainder are
		 * parameters which may specify a charset
		 */
		
		String cleanContentTypeValue = StringUtils.stripNonLetterDigits(contentType);
		/* Break the mime header into the main value and its parameters, separated by ';' */
		StringTokenizer tokenizer = new StringTokenizer(cleanContentTypeValue, ";"); //$NON-NLS-1$
		int tLen = tokenizer.countTokens();
		if (tLen == 0)
			return;
		String[] tokens = new String[tLen];
		int j = 0;
		while (tokenizer.hasMoreTokens()) {
			tokens[j] = tokenizer.nextToken();
			j++;
		}
		
		int firstParameter = 0;
		if (tokens[0].indexOf('=') == -1) {
			/*
			 * no equal sign in the first segment, so assume it indicates a
			 * content type properly
			 */
			fContentType = tokens[0].trim();
			firstParameter = 1;
		}
		/*
		 * now handle parameters as name=value pairs, looking for "charset"
		 * specifically
		 */
		Pattern equalPattern = Pattern.compile("\\s*=\\s*"); //$NON-NLS-1$
		for (int i = firstParameter; i < tokens.length; i++) {
			String[] pair = equalPattern.split(tokens[i]);
			if (pair.length < 2)
				continue;
			if (pair[0].trim().equals("charset")) { //$NON-NLS-1$
				fCharset = pair[1].trim();
			}
		}
	}


	/**
	 * Looks for what ever encoding properties the tokenizer returns. Its the
	 * responsibility of the tokenizer to stop when appropriate and not go too
	 * far.
	 */
	private void parseHeader(JSPHeadTokenizer tokenizer) throws Exception {
		fPageEncodingValue = null;
		fCharset = null;

		HeadParserToken token = null;
		do {
			// don't use 'get' here (at least until reset issue fixed)
			token = tokenizer.getNextToken();
			String tokenType = token.getType();
			if (canHandleAsUnicodeStream(tokenType))
				unicodeCase = true;
			else {
				if (tokenType == XMLHeadTokenizerConstants.XMLDelEncoding) {
					if (tokenizer.hasMoreTokens()) {
						HeadParserToken valueToken = tokenizer.getNextToken();
						String valueTokenType = valueToken.getType();
						if (isLegalString(valueTokenType)) {
							fXMLDecEncodingName = valueToken.getText();
						}
					}
				}
				else if (tokenType == JSPHeadTokenizerConstants.PageEncoding) {
					if (tokenizer.hasMoreTokens()) {
						HeadParserToken valueToken = tokenizer.getNextToken();
						String valueTokenType = valueToken.getType();
						if (isLegalString(valueTokenType)) {
							fPageEncodingValue = valueToken.getText();
						}
					}
				}
				else if (tokenType == JSPHeadTokenizerConstants.PageContentType) {
					if (tokenizer.hasMoreTokens()) {
						HeadParserToken valueToken = tokenizer.getNextToken();
						String valueTokenType = valueToken.getType();
						if (isLegalString(valueTokenType)) {
							fContentTypeValue = valueToken.getText();
						}
					}
				}
				else if (tokenType == JSPHeadTokenizerConstants.PageLanguage) {
					if (tokenizer.hasMoreTokens()) {
						HeadParserToken valueToken = tokenizer.getNextToken();
						String valueTokenType = valueToken.getType();
						if (isLegalString(valueTokenType)) {
							fLanguage = valueToken.getText();
						}
					}
				}
			}
		}
		while (tokenizer.hasMoreTokens());
		if (fContentTypeValue != null) {
			parseContentTypeValue(fContentTypeValue);
		}
		if (tokenizer.isXHTML()) {
			fXHTML = true;
		}
		if (tokenizer.isWML() ) {
			fWML = true;
		}
	}

	private void parseInput() throws IOException {
		JSPHeadTokenizer tokenizer = getTokinizer();
		fReader.reset();
		tokenizer.reset(fReader);
		try {
			parseHeader(tokenizer);
			// unicode stream cases are created directly in parseHeader
			if (!unicodeCase) {
				String enc = getAppropriateEncoding();
				if (enc != null && enc.length() > 0) {
					createEncodingMemento(enc, EncodingMemento.FOUND_ENCODING_IN_CONTENT);
				}
			}
		} catch (Exception e) {
			Logger.log(Logger.ERROR_DEBUG, e.getMessage());
		}
	}

	/**
	 * 
	 */
	private void resetAll() {
		fReader = null;
		fHeaderParsed = false;
		fEncodingMemento = null;
		fCharset = null;
		fContentTypeValue = null;
		fPageEncodingValue = null;
		fXMLDecEncodingName = null;
		unicodeCase = false;
		fXHTML=false;
		fWML=false;
	}



	/**
	 * convience method all subclasses can use (but not override)
	 * 
	 * @param detectedCharsetName
	 * @param reason
	 */
	private void createEncodingMemento(String detectedCharsetName, String reason) {
		createEncodingMemento(detectedCharsetName);
	}

	/**
	 * convience method all subclasses can use (but not override)
	 */
	private void ensureInputSet() {
		if (fReader == null) {
			throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$
		}
	}

	public boolean isWML() throws IOException {
		ensureInputSet();
		if (!fHeaderParsed) {
			parseInput();
			// we keep track of if header's already been parse, so can make
			// multiple 'get' calls, without causing reparsing.
			fHeaderParsed = true;
			// Note: there is a "hidden assumption" here that an empty
			// string in content should be treated same as not present.
		}
		return fWML;
	}

	public boolean isXHTML() throws IOException {
		ensureInputSet();
		if (!fHeaderParsed) {
			parseInput();
			// we keep track of if header's already been parse, so can make
			// multiple 'get' calls, without causing reparsing.
			fHeaderParsed = true;
			// Note: there is a "hidden assumption" here that an empty
			// string in content should be treated same as not present.
		}
		return fXHTML;
	}
}