UnicodeBOMEncodingDetector.java example

Explorer
webtools.sourceediting-master
/*******************************************************************************
 * Copyright (c) 2001, 2005 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *     
 *******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding.util;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.Charset;

import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.wst.sse.core.internal.encoding.CodedIO;
import org.eclipse.wst.sse.core.internal.encoding.EncodingMemento;
import org.eclipse.wst.sse.core.internal.encoding.IResourceCharsetDetector;


/**
 * Decides whether an input stream starts with a Unicode byte order mark
 * (BOM) and, if so, reports the charset implied by that BOM.
 * <p>
 * Recognized BOMs are <code>FE FF</code> (UTF-16BE), <code>FF FE</code>
 * (UTF-16LE) and <code>EF BB BF</code> (UTF-8).
 * </p>
 * <p>
 * Usage: supply the input with one of the <code>set</code> methods, then
 * call {@link #getEncodingMemento()} or {@link #getEncoding()}.
 * </p>
 */
public class UnicodeBOMEncodingDetector implements IResourceCharsetDetector {

	/**
	 * Signals that the stream ended (or had no bytes immediately available)
	 * before enough bytes could be read to decide whether a BOM is present.
	 */
	public static class NotEnoughInputForBOMException extends IOException {

		/**
		 * Default <code>serialVersionUID</code>
		 */
		private static final long serialVersionUID = 1L;

		public NotEnoughInputForBOMException() {
			super();
		}

		public NotEnoughInputForBOMException(String s) {
			super(s);
		}

	}

	// Individual bytes that make up the recognized BOMs.
	private final static byte BB = (byte) 0xBB;
	private final static byte BF = (byte) 0xBF;
	private final static byte EF = (byte) 0xEF;
	private final static byte FE = (byte) 0xFE;
	private final static byte FF = (byte) 0xFF;

	private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
	private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
	private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

	/** Stream to inspect; <code>null</code> until one of the set methods supplies it. */
	private InputStream fInputStream = null;
	/** True when the input was a character Reader with no byte stream behind it. */
	private boolean fNoBOMPossible;

	/**
	 * Reads the leading bytes of the stream and compares them with the
	 * recognized BOMs. Two bytes are always requested; a third is requested
	 * whenever the first two are not a UTF-16 BOM. The bytes read are
	 * consumed: this method does not mark or reset the stream.
	 * 
	 * @param inputStream
	 *            the stream to inspect
	 * @return a memento describing the detected charset, or
	 *         <code>null</code> if no BOM was found or the stream did not
	 *         supply enough bytes
	 */
	private EncodingMemento checkForBOM(InputStream inputStream) {
		EncodingMemento result = null;

		try {
			byte b1 = getNextByte(inputStream);
			byte b2 = getNextByte(inputStream);
			if (b1 == FE && b2 == FF) {
				result = createEncodingMemento(UTF_16BE_CHARSET_NAME);
				result.setUnicodeStream(true);
			} else if (b1 == FF && b2 == FE) {
				result = createEncodingMemento(UTF_16LE_CHARSET_NAME);
				result.setUnicodeStream(true);
			} else {
				byte b3 = getNextByte(inputStream);
				if (b1 == EF && b2 == BB && b3 == BF) {
					result = createEncodingMemento(UTF_8_CHARSET_NAME);
					// createEncodingMemento sets this flag for UTF-8 as
					// well; the explicit call is kept from the original.
					result.setUTF83ByteBOMUsed(true);
				}
			}
		} catch (NotEnoughInputForBOMException e) {
			// Expected for empty streams, and for short non-BOM streams
			// (for example two bytes that happen to match the start of the
			// UTF-8 BOM). Either way there is no BOM to report.
			result = null;
		} catch (IOException e) {
			// other errors should be impossible
			throw new Error(e);
		}

		return result;
	}

	/**
	 * Builds a memento for the given charset, recording both the Java
	 * charset name and the canonical name reported by {@link Charset}.
	 * 
	 * @param javaEncodingName
	 *            Java charset name; must be resolvable by
	 *            {@link Charset#forName(String)}
	 * @return the populated memento, flagged as using the 3 byte BOM when
	 *         the charset is UTF-8
	 */
	private EncodingMemento createEncodingMemento(String javaEncodingName) {
		EncodingMemento encodingMemento = new EncodingMemento();
		encodingMemento.setJavaCharsetName(javaEncodingName);
		String ianaName = Charset.forName(javaEncodingName).name();
		encodingMemento.setDetectedCharsetName(ianaName);
		if (javaEncodingName.equals(UTF_8_CHARSET_NAME)) {
			encodingMemento.setUTF83ByteBOMUsed(true);
		}
		return encodingMemento;
	}

	/**
	 * Returns the charset name implied by the BOM, if any.
	 * 
	 * @return the detected charset name, or <code>null</code> when no BOM
	 *         was detected
	 * @throws IOException
	 *             declared for callers; not thrown by this implementation
	 */
	public String getEncoding() throws IOException {
		// getEncodingMemento() returns null when there is no BOM, so guard
		// the dereference instead of failing with a NullPointerException.
		EncodingMemento memento = getEncodingMemento();
		return memento == null ? null : memento.getDetectedCharsetName();
	}

	/**
	 * Inspects the start of the current input for a byte order mark.
	 * 
	 * <pre>
	 * FE FF     UTF-16, big-endian
	 * FF FE     UTF-16, little-endian
	 * EF BB BF  UTF-8 (BOM is optional)
	 * </pre>
	 * 
	 * The input stream must support mark; this is checked here. This method
	 * does not itself call <code>mark</code> or <code>reset</code>: the two
	 * or three bytes examined are consumed whether or not a BOM is found, so
	 * a caller that needs to re-read them has to mark and reset the stream
	 * itself.
	 * 
	 * @return a memento describing the detected charset, or
	 *         <code>null</code> if no BOM was detected or the input was a
	 *         Reader for which no BOM is possible
	 * @throws IllegalStateException
	 *             if no input stream has been set
	 * @throws IllegalArgumentException
	 *             if the input stream does not support mark
	 */
	public EncodingMemento getEncodingMemento() {

		EncodingMemento result = null;
		if (!fNoBOMPossible) {

			if (fInputStream == null)
				throw new IllegalStateException("input must be set before use"); //$NON-NLS-1$

			if (!fInputStream.markSupported()) {
				throw new IllegalArgumentException("inputStream must be resetable"); //$NON-NLS-1$
			}

			result = checkForBOM(fInputStream);
		}

		return result;

	}

	/**
	 * Reads one byte, but only if the stream reports bytes available, so the
	 * call does not block.
	 * 
	 * @param inputStream
	 *            the stream to read from
	 * @return the next byte of the stream
	 * @throws NotEnoughInputForBOMException
	 *             if no byte is available or end of stream was reached
	 * @throws IOException
	 *             if the underlying stream fails
	 */
	private byte getNextByte(InputStream inputStream) throws IOException {

		// be sure we won't block
		if (inputStream.available() <= 0) {
			throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
		}

		int byteCharAsInt = inputStream.read();
		// Test for end of stream BEFORE narrowing to a byte: -1 masked or
		// cast to a byte is 0xFF, which is a legitimate BOM byte and would
		// otherwise hide the EOF condition.
		if (byteCharAsInt == -1) {
			throw new NotEnoughInputForBOMException("typically not an error"); //$NON-NLS-1$
		}
		return (byte) byteCharAsInt;
	}

	/**
	 * BOM detection has no specification-defined default encoding.
	 * 
	 * @return always <code>null</code>
	 */
	public String getSpecDefaultEncoding() {
		// There is no default for this case
		return null;
	}

	/**
	 * Clears all state left over from a previous input.
	 */
	private void resetAll() {
		fNoBOMPossible = false;
		fInputStream = null;

	}

	/**
	 * Sets the stream to inspect, discarding any previous input state.
	 * 
	 * @param inputStream
	 *            the stream to inspect; it must support mark for
	 *            {@link #getEncodingMemento()} to accept it
	 */
	public void set(InputStream inputStream) {
		resetAll();
		fInputStream = inputStream;
	}

	/**
	 * Sets the input to the contents of the given storage, wrapped in a
	 * buffered stream of <code>CodedIO.MAX_BUF_SIZE</code> bytes.
	 * <p>
	 * NOTE(review): this class never closes the stream it opens here;
	 * confirm whether callers are expected to manage that.
	 * </p>
	 * 
	 * @param iStorage
	 *            the storage whose contents are inspected
	 * @throws CoreException
	 *             if the storage contents cannot be obtained
	 */
	public void set(IStorage iStorage) throws CoreException {
		set(new BufferedInputStream(iStorage.getContents(), CodedIO.MAX_BUF_SIZE));

	}

	/**
	 * Sets the input from a reader. Only a <code>ByteReader</code> exposes
	 * an underlying byte stream; for any other reader no BOM can be detected
	 * and {@link #getEncodingMemento()} will return <code>null</code>.
	 * 
	 * @param reader
	 *            the reader supplying the input
	 */
	public void set(Reader reader) {
		// Start from a clean slate, as set(InputStream) does; otherwise a
		// "no BOM possible" flag or a stream from an earlier call would
		// carry over to this input.
		resetAll();
		if (reader instanceof ByteReader) {
			ByteReader byteReader = (ByteReader) reader;
			fInputStream = byteReader.fInputStream;
		} else {
			fNoBOMPossible = true;
		}

	}

}