// GenerateXMLFiles.java example
//
// Explorer
// webtools.sourceediting-master
/*******************************************************************************
 * Copyright (c) 2004, 2005 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.wst.xml.tests.encoding;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.eclipse.core.runtime.content.IContentDescription;

/**
 * Generates small "XML files", one for every charset of the running VM that
 * can encode the sample text, so that later tests can check that the
 * encoding of each file is detected and that its characters are read back
 * intact.
 * <p>
 * Each file consists of an XML declaration naming the charset, followed by a
 * fixed block of US-ASCII text. A summary file containing one test-method
 * stub per written charset is produced in the same directory.
 * </p>
 */
public class GenerateXMLFiles extends GenerateFiles {
	private static final String LF = "\n";
	private static final String CR = "\r";
	private static final String CRLF = CR + LF;
	// sample document bodies; identical except for the line terminator used
	private static final String TEXT_US_ASCII_LF = "abcdefghijklmnopqrstuvwxyz\n1234567890\nABCDEFGHIJKLMNOPQRSTUVWXYZ";
	private static final String TEXT_US_ASCII_CRLF = "abcdefghijklmnopqrstuvwxyz\r\n1234567890\r\nABCDEFGHIJKLMNOPQRSTUVWXYZ";
	// debug switches, left as plain instance fields as before
	private boolean DEBUG = true;
	private boolean DEBUGCRLF = false;
	private boolean DEBUGINFO = true;


	public GenerateXMLFiles() {
		super();
	}

	/**
	 * Command line entry point: generates the files for every charset known
	 * to the current VM and prints the stack trace if that fails as a whole.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		GenerateXMLFiles thisApp = new GenerateXMLFiles();
		try {
			//thisApp.generateOriginalFiles();
			thisApp.generateAllFilesForCurrentVM();
		}
		catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Collects the names of all charsets reported by
	 * {@link Charset#availableCharsets()} and generates the CRLF variant of
	 * the sample file for each of them.
	 * 
	 * @throws IOException
	 *             if the summary file can not be written
	 */
	private void generateAllFilesForCurrentVM() throws IOException {
		Map allCharsetMap = Charset.availableCharsets();
		Set allKeys = allCharsetMap.keySet();
		// the keys of the charset map are the charset names
		String[] allcharsetNames = (String[]) allKeys.toArray(new String[allKeys.size()]);
		// pass false instead to generate the LF-terminated variant
		createFiles(allcharsetNames, true);
	}

	/**
	 * Writes one sample file per charset name and, afterwards, a summary
	 * file listing a test-method stub for every file actually written.
	 * <p>
	 * Generation is best effort: a charset that can not be used is reported
	 * (when DEBUGINFO is on) and skipped, it never stops the remaining
	 * charsets from being processed.
	 * </p>
	 * 
	 * @param charsetnames
	 *            names of the charsets to generate files for
	 * @param useCRLF
	 *            true to use CRLF line terminators in the sample text, false
	 *            to use LF
	 * @throws FileNotFoundException
	 *             if the summary file can not be opened
	 * @throws IOException
	 *             if the summary file can not be written
	 */
	private void createFiles(String[] charsetnames, boolean useCRLF) throws FileNotFoundException, IOException {
		String mainDirectory = getMainDirectoryBasedOnVMNameAndFileExtension();
		List charsetFilesWritten = new ArrayList();
		for (int i = 0; i < charsetnames.length; i++) {
			String charsetName = charsetnames[i];
			try {
				if (writeSampleFile(charsetName, useCRLF, mainDirectory)) {
					// no exception up to here, so the file was really written
					charsetFilesWritten.add(charsetName);
				}
			}
			catch (Exception e) {
				// covers I/O and encoding failures as well as unchecked
				// exceptions such as unsupported charsets or encoders
				reportFailure(charsetName, e);
			}
		}
		writeSummary(mainDirectory, charsetFilesWritten);
	}

	/**
	 * Encodes the sample document for one charset and writes it to
	 * "test-&lt;charsetName&gt;.xml" in the given directory.
	 * <p>
	 * The text is encoded exactly once, into memory, with an encoder that is
	 * not shared with anything else; encoders keep state (for instance
	 * whether a byte-order mark was already emitted), so feeding two writers
	 * from one encoder makes the output of each depend on the other. The
	 * target file is only created once encoding has succeeded, and the
	 * stream to it is always closed.
	 * </p>
	 * 
	 * @param charsetName
	 *            name of the charset to encode with
	 * @param useCRLF
	 *            true to use the CRLF sample text, false for the LF one
	 * @param mainDirectory
	 *            directory handed to FileUtil.makeFileFor
	 * @return true if the file was written, false if the charset was skipped
	 *         (taken to be EBCDIC, or unable to encode the sample text)
	 * @throws IOException
	 *             if encoding or writing fails
	 */
	private boolean writeSampleFile(String charsetName, boolean useCRLF, String mainDirectory) throws IOException {
		Charset charset = Charset.forName(charsetName);
		CharsetEncoder charsetEncoder = newStrictEncoder(charset);

		String header = getHeaderStart() + charsetName + getHeaderEnd();
		String fulltext = header + (useCRLF ? TEXT_US_ASCII_CRLF : TEXT_US_ASCII_LF);

		if (isEbcidic(charsetName, charsetEncoder)) {
			return false;
		}
		if (!charsetEncoder.canEncode(fulltext)) {
			if (DEBUGINFO) {
				System.out.println(" *** could not convert sample ascii text for " + charsetName);
			}
			return false;
		}

		// Closing the writer (rather than only flushing it) lets the encoder
		// emit whatever it produces at end of input, so the byte count
		// reported below matches the file exactly.
		ByteArrayOutputStream encodedText = new ByteArrayOutputStream();
		Writer encodingWriter = new BufferedWriter(new OutputStreamWriter(encodedText, newStrictEncoder(charset)));
		encodingWriter.write(fulltext);
		encodingWriter.close();
		byte[] textBytes = encodedText.toByteArray();

		// complete file content: optional byte-order mark, then the text
		ByteArrayOutputStream bytesout = new ByteArrayOutputStream();
		supplyBOMs(charsetName, textBytes, bytesout);
		bytesout.write(textBytes);

		String outputfilename = "test-" + charsetName + ".xml";
		File outFile = FileUtil.makeFileFor(mainDirectory, outputfilename, null);
		OutputStream outputStream = new FileOutputStream(outFile);
		try {
			bytesout.writeTo(outputStream);
		}
		finally {
			outputStream.close();
		}

		if (DEBUG) {
			printDebugInfo(useCRLF, header, outputfilename, bytesout);
		}
		return true;
	}

	/**
	 * Creates an encoder that reports malformed input and unmappable
	 * characters as errors instead of silently replacing them.
	 * 
	 * @param charset
	 *            the charset to create the encoder for
	 * @return a new encoder, not shared with any other user
	 */
	private CharsetEncoder newStrictEncoder(Charset charset) {
		CharsetEncoder encoder = charset.newEncoder();
		encoder.onMalformedInput(CodingErrorAction.REPORT);
		encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		return encoder;
	}

	/**
	 * Prints, when DEBUGINFO is on, which charset could not be generated and
	 * the exception class and message responsible.
	 */
	private void reportFailure(String charsetName, Exception e) {
		if (DEBUGINFO) {
			System.out.println(" ***** could not generate for " + charsetName);
			String msg = e.getMessage();
			if (msg == null) {
				msg = "";
			}
			System.out.println("          due to " + e.getClass().getName() + "  " + msg);
		}
	}

	/**
	 * Writes "testMethods.text": the number of generated files followed by
	 * one test-method stub per charset, suitable to paste into a test class.
	 * The writer is closed even if writing fails.
	 * 
	 * @param mainDirectory
	 *            directory handed to FileUtil.makeFileFor
	 * @param charsetFilesWritten
	 *            names (Strings) of the charsets a file was written for
	 * @throws IOException
	 *             if the summary can not be written
	 */
	private void writeSummary(String mainDirectory, List charsetFilesWritten) throws IOException {
		File outFile = FileUtil.makeFileFor(mainDirectory, "testMethods.text", null);
		FileWriter outproperties = new FileWriter(outFile);
		try {
			outproperties.write(charsetFilesWritten.size() + CRLF);
			int n = 0;
			for (Iterator items = charsetFilesWritten.iterator(); items.hasNext();) {
				String itemCreated = (String) items.next();
				outproperties.write(createMethod(n++, itemCreated) + CRLF);
			}
		}
		finally {
			outproperties.close();
		}
	}

	/**
	 * Writes the byte-order mark for the UTF-16 and UTF-32 charset names
	 * handled here, unless the encoded text already begins with one. Other
	 * charset names get no mark.
	 * <p>
	 * Some encoders emit a mark on their own (the java.nio.charset.Charset
	 * documentation describes this for the standard "UTF-16" charset); a
	 * second one is never added. For names that state the byte order the
	 * matching mark is used. For plain "UTF-16" and "UTF-32" the byte order
	 * is read off the first encoded byte: every generated document starts
	 * with '&lt;' (U+003C), whose big-endian form starts with 0x00 and whose
	 * little-endian form starts with 0x3C.
	 * </p>
	 * <p>
	 * The UTF-32 marks are 00 00 FE FF (big-endian) and FF FE 00 00
	 * (little-endian). They are assembled from the two UTF-16 mark constants
	 * of IContentDescription, which are assumed to hold FE FF and FF FE as
	 * their names indicate.
	 * </p>
	 * 
	 * @param charsetName
	 *            name of the charset the text was encoded with
	 * @param encodedText
	 *            the encoded document, as produced by the encoder
	 * @param out
	 *            where the mark, if any, is written
	 * @throws IOException
	 *             if writing to out fails
	 */
	private void supplyBOMs(String charsetName, byte[] encodedText, OutputStream out) throws IOException {
		boolean utf16 = charsetName.equals("UTF-16");
		boolean utf16Little = charsetName.equals("UTF-16LE") || charsetName.equals("X-UnicodeLittle");
		boolean utf16Big = charsetName.equals("UTF-16BE") || charsetName.equals("X-UnicodeBig");
		boolean utf32 = charsetName.equals("UTF-32");
		boolean utf32Little = charsetName.equals("UTF-32LE");
		boolean utf32Big = charsetName.equals("UTF-32BE");

		// only consulted for the names that do not state a byte order
		boolean textLooksBigEndian = encodedText.length > 0 && encodedText[0] == 0x00;

		if (utf16 || utf16Little || utf16Big) {
			byte[] littleMark = IContentDescription.BOM_UTF_16LE;
			byte[] bigMark = IContentDescription.BOM_UTF_16BE;
			if (startsWith(encodedText, littleMark) || startsWith(encodedText, bigMark)) {
				// the encoder supplied the mark itself
				return;
			}
			boolean bigEndian = utf16Big || (utf16 && textLooksBigEndian);
			out.write(bigEndian ? bigMark : littleMark);
		}
		else if (utf32 || utf32Little || utf32Big) {
			byte[] nullBytes = new byte[]{0x00, 0x00};
			byte[] littleMark = concat(IContentDescription.BOM_UTF_16LE, nullBytes);
			byte[] bigMark = concat(nullBytes, IContentDescription.BOM_UTF_16BE);
			if (startsWith(encodedText, littleMark) || startsWith(encodedText, bigMark)) {
				// the encoder supplied the mark itself
				return;
			}
			boolean bigEndian = utf32Big || (utf32 && textLooksBigEndian);
			out.write(bigEndian ? bigMark : littleMark);
		}
	}

	/**
	 * @return true if data begins with all bytes of prefix, in order
	 */
	private static boolean startsWith(byte[] data, byte[] prefix) {
		if (data.length < prefix.length) {
			return false;
		}
		for (int i = 0; i < prefix.length; i++) {
			if (data[i] != prefix[i]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * @return a new array holding the bytes of first followed by those of
	 *         second
	 */
	private static byte[] concat(byte[] first, byte[] second) {
		byte[] joined = new byte[first.length + second.length];
		System.arraycopy(first, 0, joined, 0, first.length);
		System.arraycopy(second, 0, joined, first.length, second.length);
		return joined;
	}

	/**
	 * Builds the source text of one test-method stub.
	 * 
	 * @param i
	 *            number appended to the method name "testFile"
	 * @param itemCreated
	 *            charset name assigned to the stub's charsetName variable
	 * @return the method source, using CRLF between its lines
	 */
	private String createMethod(int i, String itemCreated) {
		String template = "	public void testFile" + i + "() throws CoreException, IOException  {\r\n" + "		String charsetName = \"" + itemCreated + "\";\r\n" + "		doGenTest(charsetName);\r\n" + "	}";
		return template;
	}

	/**
	 * Prints how many characters and bytes were written to a file.
	 * 
	 * @param useCRLF
	 *            whether the CRLF sample text was used
	 * @param header
	 *            the XML declaration that preceded the sample text
	 * @param outputfilename
	 *            name of the file, used in the message only
	 * @param bytesout
	 *            holds the bytes that were written to the file
	 */
	private void printDebugInfo(boolean useCRLF, String header, String outputfilename, ByteArrayOutputStream bytesout) {
		int nBytes = bytesout.toByteArray().length;
		String body = useCRLF ? TEXT_US_ASCII_CRLF : TEXT_US_ASCII_LF;
		int nChars = header.length() + body.length();

		System.out.println("Wrote " + nChars + " characters and " + nBytes + " bytes to " + outputfilename);
	}

	/**
	 * Heuristic check added because some encoders do not report that they
	 * can not encode something although, in the normal meaning of the word,
	 * they obviously can not. This seems to apply mostly to some IBM
	 * varieties whose output can not be interpreted at all without knowing
	 * the encoding (so it could not be used for content based detection).
	 * <p>
	 * The string passes if it survives being encoded with the charset and
	 * read back as ASCII. Otherwise single-byte encoders (maximum of one
	 * byte per character) fail, encoders with a maximum of four bytes per
	 * character are excluded for now, and all other multi-byte encoders are
	 * simply assumed to be fine.
	 * </p>
	 * TODO: never used
	 * 
	 * @param charsetName
	 *            name of the charset to encode with
	 * @param charsetEncocder
	 *            encoder of that charset, consulted for its maximum bytes
	 *            per character only
	 * @param simpleString
	 *            the text to check
	 * @return true if the string is considered encodable
	 */
	 boolean canEncodeSimpleString(String charsetName, CharsetEncoder charsetEncocder, String simpleString) {
		if (simpleString.equals(encodeThenReadAsAscii(charsetName, simpleString))) {
			return true;
		}
		float maxBytes = charsetEncocder.maxBytesPerChar();
		return maxBytes != 1 && maxBytes != 4;
	}

	/**
	 * A very heuristic method. Should have table, someday.
	 * <p>
	 * Encodes "&lt;?" with the charset and reads the bytes back as ASCII.
	 * Experimenting/debugging showed that the known EBCDIC charsets always
	 * "mis"-translate this to the characters L and o. A message is printed
	 * for every charset classified this way.
	 * </p>
	 * 
	 * @param charsetName
	 *            name of the charset to examine
	 * @param charsetEncocder
	 *            not used
	 * @return true if the charset is assumed to be EBCDIC
	 */
	private boolean isEbcidic(String charsetName, CharsetEncoder charsetEncocder) {
		boolean result = "Lo".equals(encodeThenReadAsAscii(charsetName, "<?"));
		if (result) {
			System.out.println(charsetName + " assumed to be Edcidic");
		}
		return result;
	}

	/**
	 * Encodes text with the named charset and decodes the resulting bytes as
	 * ASCII.
	 * 
	 * @throws Error
	 *             if either charset name is unsupported; callers only pass
	 *             names that were checked already
	 */
	private String encodeThenReadAsAscii(String charsetName, String text) {
		try {
			byte[] translatedBytes = text.getBytes(charsetName);
			return new String(translatedBytes, "ascii");
		}
		catch (UnsupportedEncodingException e) {
			// impossible, since checked already
			throw new Error(e);
		}
	}

	/**
	 * Checks whether the encoder can encode LF, CR and CRLF. All three are
	 * always checked, so that with DEBUGCRLF on every failing one is
	 * reported.
	 * 
	 * @param charsetName
	 *            charset name, used for the debug conversion dump
	 * @param charsetEncoder
	 *            the encoder to check
	 * @return true only if all three line terminators can be encoded
	 */
	 boolean canEncodeCRLF(String charsetName, CharsetEncoder charsetEncoder) {
		boolean lfOk = canEncodeTerminator(charsetName, charsetEncoder, "LF", LF);
		boolean crOk = canEncodeTerminator(charsetName, charsetEncoder, "CR", CR);
		boolean crlfOk = canEncodeTerminator(charsetName, charsetEncoder, "CRLF", CRLF);
		return lfOk && crOk && crlfOk;
	}

	/**
	 * Checks a single line terminator; when it can not be encoded and
	 * DEBUGCRLF is on, dumps the attempted conversion and prints a message.
	 */
	private boolean canEncodeTerminator(String charsetName, CharsetEncoder charsetEncoder, String stringName, String testString) {
		if (charsetEncoder.canEncode(testString)) {
			return true;
		}
		if (DEBUGCRLF) {
			exploreConversion(charsetName, stringName, testString);
			System.out.println("can not encode " + stringName + " for " + charsetEncoder.charset().name());
		}
		return false;
	}

	/**
	 * Debugging aid: prints the decimal character values of testString
	 * before and after a round trip through the named charset. Note that the
	 * encoded bytes are decoded with the platform default charset, so the
	 * "new" values depend on the machine this runs on.
	 * 
	 * @throws Error
	 *             if the charset name is unsupported, which should never
	 *             happen since it was checked already
	 */
	private void exploreConversion(String charsetName, String stringName, String testString) throws Error {
		try {
			String converted = new String(testString.getBytes(charsetName));
			System.out.print("old " + stringName + " (dec): ");
			dumpString(System.out, testString);
			System.out.println();
			System.out.print("new " + stringName + " (dec): ");
			dumpString(System.out, converted);
			System.out.println();
		}
		catch (UnsupportedEncodingException e) {
			//should never happen, already checked
			throw new Error(e);
		}
	}

	/**
	 * Prints each character of the string as its decimal value followed by a
	 * space.
	 * 
	 * @param out
	 *            stream to print to
	 * @param lf2
	 *            the string to dump
	 */
	private void dumpString(PrintStream out, String lf2) {
		for (int i = 0; i < lf2.length(); i++) {
			out.print((int) lf2.charAt(i) + " ");
		}
	}

	/**
	 * @return the directory returned by getMainDirectoryBasedOnVMName() with
	 *         "/xml" appended
	 */
	public final static String getMainDirectoryBasedOnVMNameAndFileExtension() {
		return getMainDirectoryBasedOnVMName() + "/xml";
	}

	/**
	 * @return the XML declaration up to and including the opening quote of
	 *         the encoding value
	 */
	private String getHeaderStart() {
		return "<?xml version=\"1.0\" encoding=\"";
	}

	/**
	 * @return the closing quote of the encoding value and the end of the XML
	 *         declaration
	 */
	private String getHeaderEnd() {
		return "\"?>";
	}

}