/*******************************************************************************
 * Copyright (c) 2001, 2005 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.CharArrayReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnmappableCharacterException;

import org.eclipse.core.resources.IFile;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.Platform;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.runtime.content.IContentDescription;
import org.eclipse.core.runtime.content.IContentTypeManager;
import org.eclipse.core.runtime.jobs.Job;
import org.eclipse.wst.sse.core.internal.SSECoreMessages;
import org.eclipse.wst.sse.core.internal.SSECorePlugin;
import org.eclipse.wst.sse.core.internal.encoding.util.Assert;
import org.eclipse.wst.sse.core.internal.encoding.util.Logger;
import org.eclipse.wst.sse.core.internal.exceptions.CharConversionErrorWithDetail;
import org.eclipse.wst.sse.core.internal.exceptions.MalformedOutputExceptionWithDetail;
import org.eclipse.wst.sse.core.internal.exceptions.UnsupportedCharsetExceptionWithDetail;

public class CodedStreamCreator extends CodedIO {

	// the 32 bytes used by default by ByteArrayOutputStream is a little small
	private final static int INITIAL_BUFFER_SIZE = 1024 * 16;

	private static final String PROGRAM_ERROR__FAILED_TO_FIND_ANY_CHARSET_ANYWHERE_ = "Program error: failed to find any charset anywhere!"; //$NON-NLS-1$

	private static final String UTF_16BE_CHARSET_NAME = "UTF-16BE"; //$NON-NLS-1$
	private static final String UTF_16LE_CHARSET_NAME = "UTF-16LE"; //$NON-NLS-1$
	// private static final String UTF_16_CHARSET_NAME = "UTF-16"; //$NON-NLS-1$
	private static final String UTF_8_CHARSET_NAME = "UTF-8"; //$NON-NLS-1$

	private boolean fClientSuppliedReader;

	// future_TODO: this 'checkConversion' can be a little pricey for large
	// files; could be a user preference, or something.
	// private static final boolean checkConversion = true;
	private EncodingMemento fCurrentEncodingMemento;
	private EncodingMemento fEncodingMemento;
	private String fFilename;
	private boolean fHasBeenAnalyzed;
	private IFile fIFile;
	private EncodingMemento fPreviousEncodingMemento;
	private Reader fReader;
	private Reader fResettableReader;
	private byte[] UTF16BEBOM = new byte[]{(byte) 0xFE, (byte) 0xFF};
	private byte[] UTF16LEBOM = new byte[]{(byte) 0xFF, (byte) 0xFE};
	private byte[] UTF3BYTEBOM = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};

	public CodedStreamCreator() {
		super();
	}

	public CodedStreamCreator(String filename, char[] characterArray) {
		super();
		fFilename = filename;
		fReader = new CharArrayReader(characterArray);
	}

	public CodedStreamCreator(String filename, Reader reader) {
		super();
		fFilename = filename;
		fReader = reader;
	}

	public CodedStreamCreator(String filename, String textString) {
		super();
		fFilename = filename;
		fReader = new StringReader(textString);
	}

	/**
	 * The primary method which contains the highest level rules for how to
	 * decide appropriate decoding rules:
	 * 1. first check for a unicode stream
	 * 2. then look for an encoding specified in the content (according to
	 *    the type of content it is ... xml, html, jsp, etc.)
	 * 3. then check various settings: file settings first; if null, check
	 *    project settings; if null, check user preferences
	 * 4. lastly (or, whatever the last user preference is) use the
	 *    "workbench defaults"
	 */
	private void analyze() throws CoreException, IOException {
		Reader resettableReader = getResettableReader();
		try {
			if (fCurrentEncodingMemento == null) {
				resettableReader.reset();
				fCurrentEncodingMemento = checkForEncodingInContents();
			}
			// if the encoding strategy doesn't provide an answer, then try
			// file settings, project settings, user preferences, and finally
			// the workbench default.
			if (fCurrentEncodingMemento == null || fCurrentEncodingMemento.getDetectedCharsetName() == null) {
				resettableReader.reset();
				fCurrentEncodingMemento = getEncodingMementoFromResourceAndPreference();
			}
			// use DefaultNameRules from NonContentBasedEncodingRules as the
			// final default
			if (fEncodingMemento == null) {
				handleNotProvidedFromContentCase();
			}
			fHasBeenAnalyzed = true;
		}
		finally {
			if (resettableReader != null) {
				resettableReader.reset();
			}
		}
	}

	/**
	 * Need to check conversion early on. There's some danger that the old
	 * contents of a file are set to empty if an exception occurs.
	 * 
	 * @param memento
	 * @param encodingRule
	 * @throws java.io.UnsupportedEncodingException
	 * @throws MalformedOutputExceptionWithDetail
	 * @deprecated - we need to find a "cheaper" way to do this
	 *             functionality, so this is likely to go away in the future
	 */
	private void checkConversion(EncodingMemento memento, EncodingRule encodingRule) throws IOException {
		String javaEncoding = memento.getJavaCharsetName();
		String detectedEncoding = memento.getDetectedCharsetName();
		Charset charset = Charset.forName(javaEncoding);
		CharsetEncoder charsetEncoder = charset.newEncoder();
		charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
		charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		Reader reader = getResettableReader();
		reader.reset();
		int currentChar = reader.read();
		int currentPos = 1;
		try {
			while (currentChar != -1) {
				// note: this can probably be made more efficient later to
				// check buffer by buffer, instead of character by character.
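				// Sketch only (not part of the original logic): the
				// buffer-by-buffer idea mentioned above could use
				// CharsetEncoder.encode(CharBuffer), which reports the first
				// bad character via CharacterCodingException in one call.
				// 'buffer' and 'count' are hypothetical local names.
				//
				// char[] buffer = new char[CodedIO.MAX_BUF_SIZE];
				// int count = reader.read(buffer, 0, buffer.length);
				// try {
				//     charsetEncoder.encode(java.nio.CharBuffer.wrap(buffer, 0, count));
				// }
				// catch (java.nio.charset.CharacterCodingException cce) {
				//     // the offending position within the chunk would still need
				//     // to be computed for MalformedOutputExceptionWithDetail
				// }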
				try {
					boolean canConvert = charsetEncoder.canEncode((char) currentChar);
					if (!canConvert) {
						if (encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR) {
							// if we're told to ignore the encoding conversion
							// error, notice that we still want to detect and
							// log it. We simply don't throw the exception,
							// and we do continue with the save.
							Logger.log(Logger.ERROR, "Encoding Conversion Error during save"); //$NON-NLS-1$
						}
						else {
							throw new MalformedOutputExceptionWithDetail(javaEncoding, detectedEncoding, currentPos);
						}
					}
					currentChar = reader.read();
					currentPos++;
				}
				// IBM's JRE seems to throw NPE when a DBCS char is given to
				// an SBCS charsetEncoder
				catch (NullPointerException e) {
					throw new CharConversionErrorWithDetail(javaEncoding);
				}
			}
			// if we get all the way through the loop without throwing an
			// exception, then there must be an error not detectable when
			// going character by character.
			throw new CharConversionErrorWithDetail(javaEncoding);
		}
		finally {
			reader.reset();
		}
	}

	private EncodingMemento checkForEncodingInContents() throws CoreException, IOException {
		EncodingMemento result = null;
		// if the encoding memento is already set, there is no need to get it
		// again.
		if (fEncodingMemento != null) {
			result = fEncodingMemento;
		}
		else {
			if (fClientSuppliedReader) {
				fReader.reset();
				IContentTypeManager contentTypeManager = Platform.getContentTypeManager();
				try {
					IContentDescription contentDescription = contentTypeManager.getDescriptionFor(fReader, fFilename, IContentDescription.ALL);
					if (contentDescription != null) {
						fEncodingMemento = createMemento(contentDescription);
					}
					else {
						fEncodingMemento = CodedIO.createEncodingMemento("UTF-8"); //$NON-NLS-1$
					}
				}
				catch (NullPointerException e) {
					// TODO: workaround for the 5/14 bug in base; should be
					// removed when we move up to 5/21.
					// just create a simple default one
					fEncodingMemento = CodedIO.createEncodingMemento("UTF-8"); //$NON-NLS-1$
				}
				result = fEncodingMemento;
			}
			else {
				throw new IllegalStateException("unexpected state: encodingMemento was null but no input stream supplied"); //$NON-NLS-1$
			}
		}
		// try {
		// result = getEncodingDetector().getEncodingMemento();
		// if (result != null && !result.isValid() && !forceDefault()) {
		// throw new UnsupportedCharsetExceptionWithDetail(result);
		// }
		// }
		// finally {
		// handleStreamClose(fEncodingDetectorStream);
		// }
		return result;
	}

	private void dump(OutputStream outputStream, EncodingRule encodingRule, boolean use3ByteBOMifUTF8) throws CoreException, IOException {
		getCurrentEncodingMemento();
		String javaEncodingName = null;
		if (encodingRule == EncodingRule.CONTENT_BASED) {
			if (fCurrentEncodingMemento.isValid()) {
				javaEncodingName = fCurrentEncodingMemento.getJavaCharsetName();
			}
			else {
				throw new UnsupportedCharsetExceptionWithDetail(fCurrentEncodingMemento);
			}
		}
		else if (encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR)
			javaEncodingName = fCurrentEncodingMemento.getJavaCharsetName();
		else if (encodingRule == EncodingRule.FORCE_DEFAULT)
			javaEncodingName = fCurrentEncodingMemento.getAppropriateDefault();
		// write appropriate "header" unicode BOM bytes
		// Note: Java seems to write an appropriate header for UTF-16, but
		// not for UTF-8 nor UTF-16BE. This may vary by JRE version, so it
		// needs to be tested well.
		// Note: javaEncodingName can be null in invalid cases, so be sure to
		// skip the whole check if that's the case.
		if (javaEncodingName != null) {
			if ((javaEncodingName.equals(UTF_8_CHARSET_NAME) && use3ByteBOMifUTF8) || (javaEncodingName.equals(UTF_8_CHARSET_NAME) && fCurrentEncodingMemento.isUTF83ByteBOMUsed())) {
				outputStream.write(UTF3BYTEBOM);
			}
			else if (javaEncodingName.equals(UTF_16LE_CHARSET_NAME)) {
				outputStream.write(UTF16LEBOM);
			}
			else if (javaEncodingName.equals(UTF_16BE_CHARSET_NAME)) {
				outputStream.write(UTF16BEBOM);
			}
		}
		// TODO add back in line delimiter handling the "right" way (updating
		// markers, not requiring string, etc. ... may need to move to the
		// document level)
		// allTextBuffer = handleLineDelimiter(allTextBuffer, document);
		Reader reader = getResettableReader();
		// be sure to test large "readers" ... we'll need to make sure they
		// all can reset to the initial position (StringReader,
		// CharArrayReader, and DocumentReader should all work ok).
		reader.reset();
		// There must be cleaner logic somehow, but the idea is that
		// javaEncodingName can be null if the originally detected encoding
		// is not valid (and FORCE_DEFAULT was not specified). Hence, we WANT
		// the first Charset.forName to throw the appropriate exception.
		Charset charset = null;
		// this call checks the "override" properties file
		javaEncodingName = CodedIO.getAppropriateJavaCharset(javaEncodingName);
		if (javaEncodingName == null) {
			charset = Charset.forName(fCurrentEncodingMemento.getDetectedCharsetName());
		}
		else {
			charset = Charset.forName(javaEncodingName);
		}
		CharsetEncoder charsetEncoder = charset.newEncoder();
		if (!(encodingRule == EncodingRule.IGNORE_CONVERSION_ERROR)) {
			charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
			charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		}
		else {
			charsetEncoder.onMalformedInput(CodingErrorAction.REPLACE);
			charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
		}
		OutputStreamWriter outputStreamWriter = new OutputStreamWriter(outputStream, charsetEncoder);
		// TODO: this may no longer be needed (and is at least the wrong spot
		// for it).
		// if (checkConversion && (!(encodingRule ==
		// EncodingRule.IGNORE_CONVERSION_ERROR))) {
		// checkConversion(fCurrentEncodingMemento, encodingRule);
		// }
		char[] charbuf = new char[CodedIO.MAX_BUF_SIZE];
		int nRead = 0;
		try {
			while (nRead != -1) {
				nRead = reader.read(charbuf, 0, MAX_BUF_SIZE);
				if (nRead > 0) {
					outputStreamWriter.flush();
					outputStreamWriter.write(charbuf, 0, nRead);
				}
			}
		}
		catch (UnmappableCharacterException e) {
			checkConversion(fCurrentEncodingMemento, encodingRule);
		}
		finally {
			// since we don't own the original output stream, we won't close
			// it here. The caller who passed it to us must close the
			// original one when appropriate.
			// (but we do flush to be sure everything is up-to-date)
			outputStreamWriter.flush();
		}
	}

	private boolean get3ByteBOMPreference() {
		return SSECorePlugin.getDefault().getPluginPreferences().getBoolean(CommonEncodingPreferenceNames.USE_3BYTE_BOM_WITH_UTF8);
	}

	public ByteArrayOutputStream getCodedByteArrayOutputStream() throws CoreException, IOException {
		return getCodedByteArrayOutputStream(EncodingRule.CONTENT_BASED);
	}

	public ByteArrayOutputStream getCodedByteArrayOutputStream(EncodingRule encodingRule) throws CoreException, IOException {
		// Assert.isNotNull(fPreviousEncodingMemento,
		// "previousEncodingMemento needs to be set first");
		ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(INITIAL_BUFFER_SIZE);
		dump(byteArrayOutputStream, encodingRule, get3ByteBOMPreference());
		return byteArrayOutputStream;
	}

	public EncodingMemento getCurrentEncodingMemento() throws CoreException, IOException {
		// Assert.isNotNull(fPreviousEncodingMemento,
		// "previousEncodingMemento needs to be set first");
		if (!fHasBeenAnalyzed) {
			analyze();
		}
		// post condition
		Assert.isNotNull(fCurrentEncodingMemento, "illegal post condition state"); //$NON-NLS-1$
		// be sure to carry over appropriate encoding "state" that may be
		// relevant.
		if (fPreviousEncodingMemento != null) {
			fCurrentEncodingMemento.setUTF83ByteBOMUsed(fPreviousEncodingMemento.isUTF83ByteBOMUsed());
		}
		return fCurrentEncodingMemento;
	}

	/*
	 * This method is called only when the encoding is not detected in the
	 * file.
	 * 
	 * Here is the encoding lookup order we will try:
	 * - try resource content description (Eclipse Text file encoding)
	 * - try resource content properties (for JSP only)
	 * - try content type encoding preferences (for HTML only)
	 * - try resource content description (Eclipse Text file encoding,
	 *   implicit check)
	 * 
	 * Note: This method appears in both CodedReaderCreator and
	 * CodedStreamCreator (with just a minor difference). They should be kept
	 * the same.
	 */
	private EncodingMemento getEncodingMementoFromResourceAndPreference() throws IOException, CoreException {
		EncodingMemento encodingMemento = fEncodingMemento;
		// Follow the Eclipse Platform's direction. Get the charset from the
		// IFile.
		if (fIFile != null) {
			String charset = fIFile.getCharset();
			encodingMemento = CodedIO.createEncodingMemento(charset);
		}
		return encodingMemento;
	}

	private Reader getResettableReader() {
		if (fResettableReader == null) {
			if (fReader.markSupported()) {
				fResettableReader = fReader;
			}
			else {
				fResettableReader = new BufferedReader(fReader);
				try {
					fResettableReader.mark(MAX_MARK_SIZE);
				}
				catch (IOException e) {
					// should be impossible, since a BufferedReader supports
					// mark()
					throw new Error(e);
				}
			}
		}
		return fResettableReader;
	}

	protected void handleNotProvidedFromContentCase() {
		// move to "detectors" if not already
		String specDefault = null;
		// specDefault = getEncodingDetector().getSpecDefaultEncoding();
		String charset = NonContentBasedEncodingRules.useDefaultNameRules(specDefault);
		Assert.isNotNull(charset, PROGRAM_ERROR__FAILED_TO_FIND_ANY_CHARSET_ANYWHERE_);
		fCurrentEncodingMemento = CodedIO.createEncodingMemento(charset);
	}

	// TODO: We just copy the content properties encoding to the current
	// resource's encoding for now. May improve the UI later by setting an
	// informational message and/or disabling the content properties encoding
	// field.
	// TODO make private if needed, else remove
	void migrateContentPropertiesEncoding(String encoding) throws CoreException {
		if (fIFile != null)
			fIFile.setCharset(encoding, null);
		final IFile file = fIFile;
		final String charset = encoding;
		// TODO: externalize string later
		Job migrater = new Job(SSECoreMessages.Migrate_Charset) {
			protected IStatus run(IProgressMonitor monitor) {
				if (file != null) {
					try {
						file.setCharset(charset, null);
					}
					catch (CoreException e) {
						Logger.logException(e);
					}
				}
				return Status.OK_STATUS;
			}
		};
		migrater.setSystem(true);
		migrater.schedule();
	}

	private void resetAll() {
		fFilename = null;
		fReader = null;
		fPreviousEncodingMemento = null;
		fCurrentEncodingMemento = null;
		fHasBeenAnalyzed = false;
		fClientSuppliedReader = false;
	}

	public void set(IFile file, Reader reader) {
		fIFile = file;
		set(file.getName(), reader);
	}

	public void set(String filename, char[] characterArray) {
		resetAll();
		fFilename = filename;
		fReader = new CharArrayReader(characterArray);
	}

	public void set(String filename, Reader reader) {
		resetAll();
		fFilename = filename;
		fReader = reader;
		fClientSuppliedReader = true;
	}

	public void set(String filename, String textString) {
		set(filename, new StringReader(textString));
	}

	public void setPreviousEncodingMemento(EncodingMemento previousEncodingMemento) {
		fPreviousEncodingMemento = previousEncodingMemento;
	}
}
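// A minimal usage sketch (illustrative only; 'file', 'documentText', and
// 'previousMemento' are assumed to be supplied by the caller): set the input,
// optionally carry over the previous encoding memento, then ask for the
// encoded bytes.
//
// CodedStreamCreator codedStreamCreator = new CodedStreamCreator();
// codedStreamCreator.set(file, new java.io.StringReader(documentText));
// codedStreamCreator.setPreviousEncodingMemento(previousMemento);
// java.io.ByteArrayOutputStream bytes =
//         codedStreamCreator.getCodedByteArrayOutputStream(EncodingRule.CONTENT_BASED);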