EncodingUtil.java example

Explorer
LEADT-master
 /*
 * EncodingUtil.java
 * TODO methods for booleans
 *
 * Copyright (C) 2005-2006 Tommi Laukkanen
 * http://www.substanceofcode.com
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

// Expand to define test define
//#define DNOTEST
// Expand to define logging define
//#define DNOLOGGING
package com.substanceofcode.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Hashtable;
import java.util.Vector;

import com.substanceofcode.utils.CauseException;

//#ifdef DLOGGING
import net.sf.jlogmicro.util.logging.Logger;
import net.sf.jlogmicro.util.logging.Level;
//#endif

/**
 * Simple encoding handler to allow handling utf-16 and 1252.
 *
 * @author Irving Bunton Jr
 */
public class EncodingUtil {
    
	// True when the "microedition.encoding" system property (lower-cased)
	// starts with "iso-8859" or "iso8859".
	// NOTE(review): System.getProperty() returns null for an undefined
	// property, in which case these initializers would throw a
	// NullPointerException during class initialization -- confirm the
	// property is always defined on the target platforms.
	final static public boolean m_midpIso = (System.getProperty(
			"microedition.encoding").toLowerCase().startsWith("iso-8859") ||
	                                        System.getProperty(
			"microedition.encoding").toLowerCase().startsWith("iso8859"));
	// ISO encoding name as determined by initIsoEncoding().
	final static public String m_isoEncoding = initIsoEncoding();
	// True when the "microedition.encoding" system property (lower-cased)
	// starts with "cp" or "windows".
	final static public boolean m_midpWin = (System.getProperty(
			"microedition.encoding").toLowerCase().startsWith("cp") ||
	                                        System.getProperty(
			"microedition.encoding").toLowerCase().startsWith("windows"));
	// Windows encoding name as determined by initWinEncoding().
	final static public String m_winEncoding = initWinEncoding();
	// True when the "microedition.encoding" system property (lower-cased)
	// starts with "utf-8".
	final static public boolean m_midpUni = System.getProperty(
			"microedition.encoding").toLowerCase().startsWith("utf-8");
	// Entity names (without '&' and ';') for the symbols at 0xA1 - 0xBF.
	// Index-aligned with m_isoCommValues.
	final static String[] m_isoCommonEntities =
		{"iexcl", "cent", "pound", "curren", "yen",
		"brvbar", "sect", "uml", "copy", "ordf",
		"laquo", "not", "shy", "reg", "macr",
		"deg", "plusmn", "sup2", "sup3", "acute",
		"micro", "para", "middot", "cedil", "sup1",
		"ordm", "raquo", "frac14", "frac12", "frac34",
		"iquest"};

	// Entity names for typographic dashes and quotes.  Index-aligned with
	// m_isoSpecialValues.
	final static String[] m_isoSpecialEntities =
			{"ndash", // en dash
			"mdash", // em dash
			"lsquo", // left single quotation mark
			"rsquo", // right single quotation mark
			"sbquo", // single low-9 quotation mark
			"ldquo", // left double quotation mark
			"rdquo", // right double quotation mark
			"bdquo"}; // double low-9 quotation mark

	// Plain ASCII stand-ins for the entries of m_isoSpecialEntities.
	final static char[] m_isoSpecialValues =
			{'-', // en dash
			'-', // em dash
			'\'', // left single quotation mark
			'\'', // right single quotation mark
			'\'', // single low-9 quotation mark
			'\"', // left double quotation mark
			'\"', // right double quotation mark
			'\"'}; // double low-9 quotation mark

	// Character values 0xA1 - 0xBF for the entries of m_isoCommonEntities.
	final static char[] m_isoCommValues =
		{0xA1, 0xA2, 0xA3, 0xA4, 0xA5,
		0xA6, 0xA7, 0xA8, 0xA9, 0xAA,
		0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
		0xB0, 0xB1, 0xB2, 0xB3, 0xB4,
		0xB5, 0xB6, 0xB7, 0xB8, 0xB9,
		0xBA, 0xBB, 0xBC, 0xBD, 0xBE,
		0xBF};

	// Entity names for the accented letters and symbols; paired by index
	// with the 0xC0 - 0xFF value arrays built in initAlphaIso88591() and
	// initAlphaCp1252().
	final static String[] m_isoLatin1Entities =
		{"Agrave", "Aacute", "Acirc", "Atilde", "Auml",
		"Aring", "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml",
		"Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", "Ograve",
		"Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave",
		"Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig", "agrave",
		"aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
		"egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
		"iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
		"ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", "uuml",
		"yacute", "thorn", "yuml"};

	// Replacement table for Windows (cp-1252) characters in the range
	// 0x80 - 0x9f, indexed by (character - 0x80).  An entry of 0x01 means
	// the character has no replacement and is removed; any other entry is
	// the character substituted for it (see initWinIsoConv()).
	final public static char[] m_winIsoConvx80 = initWinIsoConv();

	// Table for the 0x80 - 0x9f range, indexed by (character - 0x80):
	// ASCII stand-ins for the cp-1252 quotes and dashes, 0x01 for
	// everything else (see initUniWinConvx80()).
	final public static char[] m_uniWinConvx80 = initUniWinConvx80();

	// See if windows cp-1252 is supported.
	// NOTE(review): hasWinEncoding() is not shown here -- confirm.
	final public static boolean m_hasWinEncoding = hasWinEncoding();
	// See if ISO8859-1 is supported.
	// NOTE(review): hasIso8859Encoding() is not shown here -- confirm.
	final public static boolean m_hasIso8859Encoding = hasIso8859Encoding();

	final private static String m_xmlEntKeys =
			"<  >   & '"";
	final private static String[] m_xmlEntValues =
			{"<", ">", " ", "&", "'", "\""};

	// Typographic quotes and dashes as coded in cp-1252 (Windows)
	// encoding; the trailing comment gives the decimal character
	// reference.
    public static final char CWSGL_LOW9_QUOTE = 0x82; // &#130;
    public static final char CWDBL_LOW9_QUOTE = 0x84; // &#132;
    public static final char CWLEFT_SGL_QUOTE = 0x91; // &#145;
    public static final char CWRIGHT_SGL_QUOTE = 0x92; // &#146;
    // Right single quote (cp-1252) as a one-element array and as a String.
    public static final char [] CAWRIGHT_SGL_QUOTE = {CWRIGHT_SGL_QUOTE};
    public static final String WRIGHT_SGL_QUOTE = new String(CAWRIGHT_SGL_QUOTE);
    public static final char CWLEFT_DBL_QUOTE = 0x93; // &#147;
    public static final char CWRIGHT_DBL_QUOTE = 0x94; // &#148;
    public static final char CWEN_DASH = 0x96; // &#150;
    public static final char CWEM_DASH = 0x97; // &#151;
	// The same typographic characters as Unicode (utf-16) code units.
	// En dash and em dash.
    public static final char CEN_DASH = 0x2013;
    public static final char CEM_DASH = 0x2014;
    public static final char CLEFT_SGL_QUOTE = 0x2018;
    public static final char CRIGHT_SGL_QUOTE = 0x2019;
    // Right single quote (Unicode) as a one-element array and as a String.
    public static final char [] CARIGHT_SGL_QUOTE = {CRIGHT_SGL_QUOTE};
    public static final String RIGHT_SGL_QUOTE = new String(CARIGHT_SGL_QUOTE);
    public static final char CSGL_LOW9_QUOTE = 0x201A;
    private static final char CLEFT_DBL_QUOTE = 0x201C;
    private static final char CRIGHT_DBL_QUOTE = 0x201D;
    public static final char CDBL_LOW9_QUOTE = 0x201E;
    // Character codes 228, 246 and 160: a-umlaut, o-umlaut and the
    // non-breaking space in ISO 8859-1 / Unicode.
    public static final char CA_UMLAUTE = (char)228;
    private static final char CO_UMLAUTE = (char)246;
    public static final char CNON_BREAKING_SP = (char)160;
    
    // Reader wrapped around the input stream given to the constructor;
    // getEncoding() reads and sets its modEncoding/modUTF16 switches.
    private EncodingStreamReader m_encodingStreamReader;
	// Entity name -> replacement string tables, built once at class load.
	// The "Xml" variants additionally contain the entries added by
	// initHtmlCommEnts() (lt, gt, nbsp, amp, apos, quot).
	final private static Hashtable m_convXmlEntities = initXmlEntities();
	final private static Hashtable m_convIso88591 = initAlphaIso88591(false);
	final private static Hashtable m_convXmlIso88591 = initAlphaIso88591(true);
	final private static Hashtable m_convCp1252 = initAlphaCp1252(false);
	final private static Hashtable m_convXmlCp1252 = initAlphaCp1252(true);
    // Encoding name chosen by getEncoding(); empty when it equals the
    // file encoding, is handled elsewhere, or is unsupported.
    private String m_docEncoding = "";
    private boolean m_utf = false;  // Doc is utf.
    // NOTE(review): not referenced in the visible part of the file.
    private boolean m_getPrologue = true;
    private boolean m_windows = false;  // True if windows code space
	// Result of initConvWinUni(): true if decoding cp-1252 bytes yields
	// the Unicode characters.
	final private static boolean m_convWinUni = initConvWinUni();
	// NOTE(review): never assigned in the visible part of the file.
	static Vector m_statExcs = null; // Exceptions encountered

	// CauseExceptions collected by getEncoding(); null until the first one.
	Vector m_excs = null; // Exceptions encountered

	//#ifdef DTEST
    final private static boolean m_debugTrace = false;  // True if want to trace more
	//#endif
	//#ifdef DLOGGING
    final private Logger logger = Logger.getLogger("EncodingUtil");
    final private boolean fineLoggable = logger.isLoggable(Level.FINE);
    final private boolean finestLoggable = logger.isLoggable(Level.FINEST);
	//#endif
    
    /** Creates a new instance of EncodingUtil */
    public EncodingUtil(InputStream inputStream) {
		m_encodingStreamReader = new EncodingStreamReader(inputStream);
    }

	/**
	 * Determine the document encoding, using the ISO and Windows encoding
	 * support detected for this platform (m_hasIso8859Encoding,
	 * m_isoEncoding, m_hasWinEncoding, m_winEncoding).
	 *
	 * @param fileEncoding encoding the data is already being read with
	 * @param encoding     encoding declared by the document, may be null
	 */
    public void getEncoding(final String fileEncoding, final String encoding) {
		this.getEncoding(
				m_hasIso8859Encoding,
				m_isoEncoding,
				m_hasWinEncoding,
				m_winEncoding,
				fileEncoding,
				encoding);
	}

	/**
	 * Determine the encoding based on what is passed in, as well as
	 * if/when strings are to be further encoded.  Also decides whether
	 * the stream reader has to modify the bytes read.
	 *
	 * Sets m_utf, m_windows and m_docEncoding, and updates the
	 * modEncoding/modUTF16 switches of the stream reader.  Unsupported
	 * encodings are not thrown; they are collected in m_excs and a
	 * fallback (or no conversion at all) is used instead.
	 *
	 * @param hasIso8859Encoding true if the platform supports ISO 8859
	 * @param isoEncoding        the platform's ISO encoding name; a name
	 *                           without '-' selects the ISO8859_1 style
	 * @param hasWinEncoding     true if the platform supports cp-1252
	 * @param winEncoding        the platform's Windows encoding name; a
	 *                           name without '-' selects the CpNNNN style
	 * @param fileEncoding       encoding the data is already read with;
	 *                           may be null
	 * @param encoding           encoding declared by the document; null
	 *                           is treated as UTF-8
	 **/

    public void getEncoding(final boolean hasIso8859Encoding,
			final String isoEncoding, final boolean hasWinEncoding,
			final String winEncoding, final String fileEncoding,
			final String encoding) {
		String cencoding = encoding;
		// No declared encoding means UTF-8.
        if (cencoding == null) {
           cencoding = "UTF-8";
        }
        cencoding = cencoding.toUpperCase();
		boolean modUTF16 = m_encodingStreamReader.isModUTF16();
		boolean modEncoding = m_encodingStreamReader.isModEncoding();
		m_utf = false;
		m_windows = false;
		String docEncoding = fileEncoding;
		// Only need to convert from 2 byte to 1 byte and vice versa.
        if ((cencoding.equals("UTF-8") || cencoding.equals("UTF8"))) {
            docEncoding = "UTF-8";
            modEncoding = false;
			m_utf = true;
        } else if (cencoding.equals("UTF-16") || cencoding.equals("UTF16")) {
			// If utf-16, don't set doc encoding as we are converting the
			// bytes to single chars.
            modUTF16 = true;
			m_utf = true;
			// Don't do doc encoding as the stream reader does it.
            docEncoding = "";
		} else if (cencoding.startsWith("ISO-8859")) {
			if (hasIso8859Encoding) {
				if (isoEncoding.indexOf("-") == -1) {
					// Platform uses the ISO8859_1 spelling.
					docEncoding = StringUtil.replace(cencoding, "ISO-",
							"ISO");
					docEncoding = docEncoding.replace('-', '_');
				} else {
					docEncoding = cencoding;
				}
			} else {
				docEncoding = "";
			}
			modEncoding = false;

		} else if (cencoding.startsWith("ISO8859")) {
			if (hasIso8859Encoding) {
				if (isoEncoding.indexOf("-") >= 0) {
					// Platform uses the ISO-8859-1 spelling.
					docEncoding = StringUtil.replace(cencoding, "ISO",
							"ISO-");
					docEncoding = docEncoding.replace('_', '-');
				} else {
					docEncoding = cencoding;
				}
			} else {
				docEncoding = "";
			}
			modEncoding = false;
		} else if (cencoding.startsWith("WINDOWS-12")) {
			if (hasWinEncoding) {
				if (winEncoding.indexOf("-") == -1) {
					docEncoding = StringUtil.replace(cencoding, "WINDOWS-",
							"Cp");
				} else {
					docEncoding = cencoding;
				}
			} else {
				docEncoding = "";
			}
			modEncoding = false;
			m_windows = true;
		} else if (cencoding.startsWith("CP-")) {
			if (hasWinEncoding) {
				if (winEncoding.indexOf("-") >= 0) {
					docEncoding = StringUtil.replace(cencoding, "CP-",
							"WINDOWS-");
				} else {
					docEncoding = StringUtil.replace(cencoding, "CP-",
							"Cp");
				}
			} else {
				docEncoding = "";
			}
			modEncoding = false;
			m_windows = true;
		} else if (cencoding.startsWith("CP")) {
			if (hasWinEncoding) {
				if (winEncoding.indexOf("-") >= 0) {
					docEncoding = StringUtil.replace(cencoding, "CP",
							"WINDOWS-");
				} else {
					docEncoding = StringUtil.replace(cencoding, "CP", "Cp");
				}
			} else {
				docEncoding = "";
			}
			modEncoding = false;
			m_windows = true;
		}
		// No conversion is needed when the document encoding is the one
		// the file is already read with.  docEncoding is null only when
		// fileEncoding is null and the encoding was not recognized above;
		// that is the same "nothing to convert" case.
		if ((docEncoding == null) || docEncoding.equals(fileEncoding)) {
			m_docEncoding = "";
		} else {
			m_docEncoding = docEncoding;
		}
		if (m_docEncoding.length() != 0) {
			try {
				// Probe only: throws if the encoding name is unsupported.
				new String("a".getBytes(), m_docEncoding);
			} catch (UnsupportedEncodingException e) {
				CauseException ce = new CauseException(
						"UnsupportedEncodingException while trying to " +
						"convert doc encoding: " + m_docEncoding, e);
				if (m_excs == null) {
					m_excs = new Vector();
				}
				m_excs.addElement(ce);
				//#ifdef DLOGGING
				logger.severe(ce.getMessage(), e);
				//#endif
				System.out.println(ce.getMessage());
				// If encoding problem, use the main encoding as it is
				// close enough.
				if (m_windows) {
					if (hasWinEncoding) {
						m_docEncoding = winEncoding;
					} else {
						m_docEncoding = "";
					}
				} else if (m_utf) {
					m_docEncoding = "";
				} else {
					if (hasIso8859Encoding) {
						m_docEncoding = isoEncoding;
					} else {
						m_docEncoding = "";
					}
				}
				// An empty fallback means "no conversion"; probing it
				// would only record a bogus second exception.
				if (m_docEncoding.length() != 0) {
					try {
						new String("a".getBytes(), m_docEncoding);
					} catch (UnsupportedEncodingException e2) {
						CauseException ce2 = new CauseException(
								"Second unsupportedEncodingException while " +
								" trying to convert doc encoding: " +
								m_docEncoding, e2);
						m_excs.addElement(ce2);
						//#ifdef DLOGGING
						logger.severe(ce2.getMessage(), e2);
						//#endif
						System.out.println(ce2.getMessage());
						m_docEncoding = "";
					}
				}
			}
		}
		m_encodingStreamReader.setModEncoding(modEncoding);
		m_encodingStreamReader.setModUTF16(modUTF16);

		//#ifdef DLOGGING
        if (fineLoggable) {logger.fine("hasIso8859Encoding=" + hasIso8859Encoding);}
        if (fineLoggable) {logger.fine("isoEncoding=" + isoEncoding);}
        if (fineLoggable) {logger.fine("hasWinEncoding=" + hasWinEncoding);}
        if (fineLoggable) {logger.fine("winEncoding=" + winEncoding);}
        if (fineLoggable) {logger.fine("encoding=" + encoding);}
        if (fineLoggable) {logger.fine("cencoding=" + cencoding);}
        if (fineLoggable) {logger.fine("docEncoding=" + docEncoding);}
        if (fineLoggable) {logger.fine("m_docEncoding=" + m_docEncoding);}
        if (fineLoggable) {logger.fine("fileEncoding=" + fileEncoding);}
        if (fineLoggable) {logger.fine("m_windows=" + m_windows);}
        if (fineLoggable) {logger.fine("m_utf=" + m_utf);}
        if (fineLoggable) {logger.fine("modEncoding=" + modEncoding);}
        if (fineLoggable) {logger.fine("modUTF16=" + modUTF16);}
		//#endif
    }

	/* Replace special characters with valid ones for the specified
	   encoding, using the platform encoding flags m_midpWin and
	   m_midpUni. */
	public static String replaceSpChars(String text, boolean isWindows,
										boolean isUtf) {
		final String cleaned =
			replaceSpChars(text, isWindows, isUtf, m_midpWin, m_midpUni);
		return cleaned;
	}

	/* Replace special characters with valid ones, using the document
	   flags (m_windows, m_utf) previously set on this instance and the
	   platform encoding flags.  For callers which use an instance of
	   this class.  */
	public String replaceSpChars(String text) {
		final boolean docIsWindows = m_windows;
		final boolean docIsUtf = m_utf;
		return EncodingUtil.replaceSpChars(text, docIsWindows, docIsUtf,
				m_midpWin, m_midpUni);
	}

	/* Replace special characters with valid ones for the specified
	   encoding.
	   - Windows doc on a Windows platform: if decoding already yields
	     Unicode, the Unicode quotes/dashes are changed to ASCII and the
	     result is returned at once.
	   - Windows doc elsewhere: if decoding yields Unicode, the Unicode
	     specials are converted (unless the platform is utf-8);
	     otherwise every char in 0x80 - 0x9f is replaced through
	     m_winIsoConvx80, or dropped when the table holds 0x01.
	   - UTF doc on a non utf-8 platform: Unicode quotes/dashes to ASCII.
	   Finally non-breaking spaces become blanks.  Any Throwable is
	   reported and the text as converted so far is returned. */
	public static String replaceSpChars(String text, final boolean isWindows,
										final boolean isUtf,
										final boolean midpWin,
										final boolean midpUni) {
		try {
			if (!isWindows) {
				if (isUtf && !midpUni) {
					text = replaceSpUniChars(text);
				}
			} else if (midpWin) {
				if (m_convWinUni) {
					return replaceSpUniChars(text);
				}
			} else if (m_convWinUni) {
				if (!midpUni) {
					text = replaceSpUniWinChars(text);
				}
			} else {
				/* If we are converting a windows doc, the windows special
				   characters are control characters in other encodings,
				   so change them through the conversion table. */
				final int len = text.length();
				final StringBuffer kept = new StringBuffer(len);
				for (int at = 0; at < len; at++) {
					final char cur = text.charAt(at);
					if ((cur < 0x80) || (cur > 0x9f)) {
						kept.append(cur);
						//#ifdef DTEST
						if (m_debugTrace) {System.out.println("cchr,conv=" + cur + "," + Integer.toHexString(cur) + "," + cur + "," + Integer.toHexString(cur));}
						//#endif
						continue;
					}
					final char mapped = m_winIsoConvx80[cur - 0x80];
					if (mapped != 0x01) {
						kept.append(mapped);
						//#ifdef DTEST
						if (m_debugTrace) {System.out.println("array cchr,conv=" + cur + "," + Integer.toHexString(cur) + "," + mapped + "," + Integer.toHexString(mapped));}
						//#endif
					}
				}
				text = kept.toString();
				//#ifdef DTEST
				if (m_debugTrace) {System.out.println( "text,len=" + text + "," + text.length());}
				//#endif
			}
			text = text.replace(CNON_BREAKING_SP, ' ');
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("replaceSpChars error ", t);
			//#endif
            System.out.println("replaceSpChars error " + t + "," +
					           t.getMessage());
		}
		return text;
	}

	/* Replace Unicode special characters from the 0x20xx range.
	   Dashes and quotes become their ASCII stand-ins ('-', '\'', '"');
	   dagger, double dagger, bullet, ellipsis, per mille, the single
	   angle quotes and the euro sign become their cp-1252 (Windows)
	   codes in the 0x80 - 0x9f range.  All other characters are kept.
	   Any Throwable is reported and the unchanged text is returned.

	   @param  text  String to convert.
	   @return the converted string.
	*/
	public static String replaceSpUniWinChars(String text) {
		try {
			// toCharArray() returns a copy and every character maps to
			// exactly one character, so the copy is converted in place.
			final char [] ctext = text.toCharArray();
			for (int ic = 0; ic < ctext.length; ic++) {
				final char c = ctext[ic];
				// Cheap filter: only the 0x20xx page has anything to do.
				if ((c & 0xff00) != 0x2000) {
					continue;
				}
				switch(c) {
					case CEN_DASH:
					case CEM_DASH:
						ctext[ic] = '-';
						break;
					case CLEFT_SGL_QUOTE:
					case CRIGHT_SGL_QUOTE:
					case CSGL_LOW9_QUOTE:
						ctext[ic] = '\'';
						break;
					case CLEFT_DBL_QUOTE:
					case CRIGHT_DBL_QUOTE:
					case CDBL_LOW9_QUOTE:
						ctext[ic] = '\"';
						break;
					case 0x2020: // dagger
						ctext[ic] = 0x86;
						break;
					case 0x2021: // double dagger
						ctext[ic] = 0x87;
						break;
					case 0x2022: // bullet
						ctext[ic] = 0x95;
						break;
					case 0x2026: // horizontal ellipsis
						ctext[ic] = 0x85;
						break;
					case 0x2030: // per mille sign
						ctext[ic] = 0x89;
						break;
					case 0x2039: // single left-pointing angle quote
						ctext[ic] = 0x8B;
						break;
					case 0x203A: // single right-pointing angle quote
						ctext[ic] = 0x9B;
						break;
					case 0x20AC: // euro sign
						ctext[ic] = 0x80;
						// Trace output only; it used to be printed
						// unconditionally for every euro sign.
						//#ifdef DTEST
						if (m_debugTrace) {System.out.println("ic,c=" + c + "," + Integer.toHexString(ctext[ic]));}
						//#endif
						break;
					default:
						break;
				}
			}
			text = new String(ctext);
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("replaceSpUniWinChars error ", t);
			//#endif
            System.out.println("replaceSpUniWinChars error " + t + "," +
					           t.getMessage());
		}
		return text;
	}

	/* Replace the Unicode typographic quotes and dashes with their
	   ASCII semi-equivalents: single quotes (low-9, left, right) with
	   an apostrophe, double quotes (left, right, low-9) with a double
	   quote, en and em dash with a regular dash. */
	public static String replaceSpUniChars(String text) {
		final char [] fancy = {
			CSGL_LOW9_QUOTE, CLEFT_SGL_QUOTE, CRIGHT_SGL_QUOTE,
			CLEFT_DBL_QUOTE, CRIGHT_DBL_QUOTE, CDBL_LOW9_QUOTE,
			CEN_DASH, CEM_DASH};
		final char [] plain = {
			'\'', '\'', '\'',
			'\"', '\"', '\"',
			'-', '-'};
		String result = text;
		for (int ix = 0; ix < fancy.length; ix++) {
			result = result.replace(fancy[ix], plain[ix]);
		}
		return result;
	}

	/* Replace the Windows (cp-1252) typographic quotes and dashes with
	   their ASCII semi-equivalents (e.g. en dash to regular dash). */
	public static String replaceSpWinChars(String text) {
		final char [] fancy = {
			CWSGL_LOW9_QUOTE, CWLEFT_SGL_QUOTE, CWRIGHT_SGL_QUOTE,
			CWLEFT_DBL_QUOTE, CWRIGHT_DBL_QUOTE, CWDBL_LOW9_QUOTE,
			CWEN_DASH, CWEM_DASH};
		final char [] plain = {
			'\'', '\'', '\'',
			'\"', '\"', '\"',
			'-', '-'};
		String result = text;
		for (int ix = 0; ix < fancy.length; ix++) {
			result = result.replace(fancy[ix], plain[ix]);
		}
		return result;
	}

    /* Replace all numeric character references, decimal (e.g. &#228;)
     * or hexadecimal (e.g. &#xE4;), with the character they denote.
     * Values above 0xFFFF (up to 0x10FFFF) are written as a UTF-16
     * surrogate pair instead of being truncated to one wrong character.
     * Processing stops, returning what has been converted so far, at the
     * first reference that has no ';', is empty, or is not a number.
     * The search restarts from the beginning after each substitution, so
     * a reference produced by a substitution is replaced as well.
     *   @param  s  String to alter, may be null.
     *   @return the altered string; null if s is null.
     */
    public static String replaceNumEntity( String s) {
        if (s == null)  return s;
		String snum = "";
		try {

			int index01 = s.indexOf( "&#" );
			while (index01 != -1) {
				int index02 = s.indexOf( ';' , index01 );
				if (index02 == -1) {
					return s;
				}
				try {
					snum = s.substring(index01 + 2, index02);
					// TODO redo with StringBuffer?
					if (snum.length() == 0) {
						return s;
					}
					int code;
					final char first = snum.charAt(0);
					if ((first == 'x') || (first == 'X')) {
						code = Integer.parseInt(snum.substring(1), 16);
					} else {
						code = Integer.parseInt(snum);
					}
					String repl;
					if ((code >= 0x10000) && (code <= 0x10FFFF)) {
						// Does not fit into one char: build the UTF-16
						// high/low surrogate pair by hand (no
						// Character.toChars on older platforms).
						final int offset = code - 0x10000;
						final char [] pair = {
							(char)(0xD800 + (offset >> 10)),
							(char)(0xDC00 + (offset & 0x3FF))};
						repl = new String(pair);
					} else {
						repl = String.valueOf((char)code);
					}
					s = s.substring(0, index01) + repl +
							  s.substring(index02 + 1);
				} catch (NumberFormatException e) {
					//#ifdef DLOGGING
					Logger logger = Logger.getLogger("EncodingUtil");
					logger.severe("replaceNumEntity NumberFormatException error  for " + snum, e);
					//#endif
					System.out.println("replaceNumEntity error " + e + "," +
									   e.getMessage());
					return s;
				}
				index01 = s.indexOf( "&#" );
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
            logger.severe("replaceNumEntity error ", t);
			//#endif
            System.out.println("replaceNumEntity error " + t + "," +
					           t.getMessage());
		}
        return s;
    }
    
	/**
	  Replace named (alphabetic) entities such as &amp;eacute; with the
	  string stored for them.  The cp-1252 tables are used when the
	  platform encoding is Windows (m_midpWin), the ISO 8859-1 tables
	  otherwise.

	  @param convXmlEnts  true to use the table that also holds the
	                      XML entities (lt, gt, nbsp, amp, apos, quot)
	  @param text         text to convert
	  @return the text with all known entities replaced
	  */
	public static String replaceAlphaEntities(final boolean convXmlEnts,
			String text) {
		final Hashtable table;
		if (m_midpWin) {
			table = convXmlEnts ? m_convXmlCp1252 : m_convCp1252;
		} else {
			table = convXmlEnts ? m_convXmlIso88591 : m_convIso88591;
		}
		int from = 0;
		for (;;) {
			final int amp = text.indexOf('&', from);
			if (amp < 0) {
				break;
			}
			final int semi = text.indexOf(';', amp);
			if (semi < 0) {
				// No terminator left, so no further entity either.
				break;
			}
			// Another '&' before the ';' means this '&' starts nothing;
			// go on from the later one.
			final int nextAmp = text.indexOf('&', amp + 1);
			if ((nextAmp >= 0) && (nextAmp < semi)) {
				from = nextAmp;
				continue;
			}
			// "&;" has no name to look up.
			if (semi == (amp + 1)) {
				from = semi + 1;
				continue;
			}
			final Object found = table.get(text.substring(amp + 1, semi));
			if (found == null) {
				from = semi + 1;
				continue;
			}
			text = text.substring(0, amp) + (String)found +
				text.substring(semi + 1);
			// Stay at the same position after a substitution: replacing
			// &amp; with & may create another entity
			// (e.g. &amp;quot; becomes &quot;).
			from = amp;
		}
		return text;
	}

	/**
	  Replace the XML entities listed in m_xmlEntKeys with the
	  corresponding string from m_xmlEntValues.  Other entities are left
	  as they are.

	  @param text  text to convert
	  @return the text with the XML entities replaced
	  */
	public static String replaceXmlEntities(String text) {
		int from = 0;
		for (;;) {
			final int amp = text.indexOf('&', from);
			if (amp < 0) {
				break;
			}
			final int semi = text.indexOf(';', amp);
			if (semi < 0) {
				// No terminator left, so no further entity either.
				break;
			}
			// Another '&' before the ';' means this '&' starts nothing;
			// go on from the later one.
			final int nextAmp = text.indexOf('&', amp + 1);
			if ((nextAmp >= 0) && (nextAmp < semi)) {
				from = nextAmp;
				continue;
			}
			// "&;" has no name to look up.
			if (semi == (amp + 1)) {
				from = semi + 1;
				continue;
			}
			// Look up the whole entity, '&' and ';' included.
			final int keyAt = m_xmlEntKeys.indexOf(
					text.substring(amp, semi + 1));
			if (keyAt < 0) {
				from = semi + 1;
				continue;
			}
			// Keys are 6 characters apart.
			text = text.substring(0, amp) + m_xmlEntValues[keyAt / 6] +
				text.substring(semi + 1);
			// Stay at the same position after a substitution: replacing
			// &amp; with & may create another entity
			// (e.g. &amp;quot; becomes &quot;).
			from = amp;
		}
		return text;
	}

	/**
	  Create the table holding only the XML entities
	  (lt, gt, nbsp, amp, apos, quot).

	  @return the table; whatever was filled in if an error occurred
	  */
	public static Hashtable initXmlEntities() {
		final Hashtable xmlEnts = new Hashtable();
		try {
			initHtmlCommEnts(xmlEnts);
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger.getLogger("EncodingUtil").severe("initXmlEntities", t);
			//#endif
		}
		return xmlEnts;
	}

	/**
	  Create the table of alpha entities for iso8859-1: the common
	  symbols (0xA1 - 0xBF), the Latin-1 letters (0xC0 - 0xFF) and ASCII
	  stand-ins for the typographic dashes and quotes.

	  @param convXmlEnts  true to add the XML entities as well
	  @return the table; whatever was filled in if an error occurred
	  */
	public static Hashtable initAlphaIso88591(final boolean convXmlEnts) {

		//#ifdef DTEST
		System.out.println( "m_midpIso=" + m_midpIso);
		//#endif
		// Consecutive values 0xC0 .. 0xFF, one per Latin-1 entity name.
		final char [] latin1 = new char[0xFF - 0xC0 + 1];
		for (int ix = 0; ix < latin1.length; ix++) {
			latin1[ix] = (char)(0xC0 + ix);
		}

		final Hashtable table = new Hashtable();
		try {
			initEntVals(table, m_isoCommonEntities, m_isoCommValues);
			initEntVals(table, m_isoLatin1Entities, latin1);
			initEntVals(table, m_isoSpecialEntities, m_isoSpecialValues);
			if (convXmlEnts) {
				initHtmlCommEnts(table);
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger.getLogger("EncodingUtil").severe("initAlphaIso88591", t);
			//#endif
		}
		return table;
	}

	/**
	  Create the table of alpha entities for windows 1252: the common
	  symbols (0xA1 - 0xBF), the Latin-1 letters (0xC0 - 0xFF) and the
	  cp-1252 codes of the typographic dashes and quotes.

	  @param convXmlEnts  true to add the XML entities as well
	  @return the table; whatever was filled in if an error occurred
	  */
	public static Hashtable initAlphaCp1252(final boolean convXmlEnts) {

		//#ifdef DTEST
		System.out.println( "m_midpWin=" + m_midpWin);
		//#endif
		// Consecutive values 0xC0 .. 0xFF, one per Latin-1 entity name.
		final char [] latin1 = new char[0xFF - 0xC0 + 1];
		for (int ix = 0; ix < latin1.length; ix++) {
			latin1[ix] = (char)(0xC0 + ix);
		}

		final Hashtable table = new Hashtable();
		try {
			/* ISO common entities have same encodings as Cp1252 */
			initEntVals(table, m_isoCommonEntities, m_isoCommValues);
			initEntVals(table, m_isoLatin1Entities, latin1);
			// Same order as m_isoSpecialEntities.
			final char [] winSpecials = {
				CWEN_DASH,           // ndash
				CWEM_DASH,           // mdash
				CWLEFT_SGL_QUOTE,    // lsquo
				CWRIGHT_SGL_QUOTE,   // rsquo
				CWSGL_LOW9_QUOTE,    // sbquo (0x82)
				CWLEFT_DBL_QUOTE,    // ldquo
				CWRIGHT_DBL_QUOTE,   // rdquo
				CWDBL_LOW9_QUOTE};   // bdquo (0x84)
			initEntVals(table, m_isoSpecialEntities, winSpecials);
			if (convXmlEnts) {
				initHtmlCommEnts(table);
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger.getLogger("EncodingUtil").severe("initAlphaCp1252", t);
			//#endif
		}
		return table;
	}

	/* Add one entry per entity name to the table, with the character at
	   the same index turned into a one-character string as the value.
	   Stops at the end of the shorter array.  A failure for one entry
	   is logged and the remaining entries are still processed. */
	public static void initEntVals(Hashtable convEntities, String[] entities, char[] entValues) {
		try {
			//#ifdef DTEST
			System.out.println( "Entities, values len=" + entities.length + "," + entValues.length);
			//#endif
			int ix = 0;
			while ((ix < entities.length) && (ix < entValues.length)) {
				final char raw = entValues[ix];
				// Sometimes, this can produce an error in some default
				// encodings.
				try {
					convEntities.put(entities[ix], String.valueOf(raw));
				} catch (Throwable t) {
					//#ifdef DLOGGING
					Logger.getLogger("EncodingUtil").severe(
							"initEntVals convert error bvalue=" +
							Integer.toHexString(raw), t);
					//#endif
				}
				ix++;
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger.getLogger("EncodingUtil").severe("initEntVals", t);
			//#endif
		}
	}

	/* Build the replacement table for windows (cp-1252) characters
	   0x80 up to and including 0x9f, indexed by (character - 0x80).
	   An entry of 0x01 means there is no equivalent and the character
	   is to be removed from the string being converted.  Any other
	   entry is the character to put in its place: an ASCII stand-in
	   for quotes and dashes, the Unicode value for the rest.
	   */
	private static char [] initWinIsoConv() {
		final char [] convTable = new char[0x9f - 0x80 + 1];
		try {
			//#ifdef DTEST
			System.out.println( "convTable.length=" + convTable.length);
			//#endif
			final char [] byOffset = {
				0x20AC, // 0x80 EURO SIGN
				0x01,   // 0x81 no equivalent
				'\'',   // 0x82 SINGLE LOW-9 QUOTATION MARK
				0x0192, // 0x83 LATIN SMALL LETTER F WITH HOOK
				'\"',   // 0x84 DOUBLE LOW-9 QUOTATION MARK
				0x2026, // 0x85 HORIZONTAL ELLIPSIS
				0x2020, // 0x86 DAGGER
				0x2021, // 0x87 DOUBLE DAGGER
				0x02C6, // 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
				0x2030, // 0x89 PER MILLE SIGN
				0x0160, // 0x8A LATIN CAPITAL LETTER S WITH CARON
				0x2039, // 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
				0x0152, // 0x8C LATIN CAPITAL LIGATURE OE
				0x01,   // 0x8D no equivalent
				0x017D, // 0x8E LATIN CAPITAL LETTER Z WITH CARON
				0x01,   // 0x8F no equivalent
				0x01,   // 0x90 no equivalent
				'\'',   // 0x91 LEFT SINGLE QUOTATION MARK
				'\'',   // 0x92 RIGHT SINGLE QUOTATION MARK
				'\"',   // 0x93 LEFT DOUBLE QUOTATION MARK
				'\"',   // 0x94 RIGHT DOUBLE QUOTATION MARK
				0x2022, // 0x95 BULLET
				'-',    // 0x96 EN DASH
				'-',    // 0x97 EM DASH
				0x02DC, // 0x98 SMALL TILDE
				0x2122, // 0x99 TRADE MARK SIGN
				0x0161, // 0x9A LATIN SMALL LETTER S WITH CARON
				0x203A, // 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
				0x0153, // 0x9C LATIN SMALL LIGATURE OE
				0x01,   // 0x9D no equivalent
				0x017E, // 0x9E LATIN SMALL LETTER Z WITH CARON
				0x0178  // 0x9F LATIN CAPITAL LETTER Y WITH DIAERESIS
			};
			System.arraycopy(byOffset, 0, convTable, 0, byOffset.length);
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger.getLogger("EncodingUtil").severe("initWinIsoConv", t);
			//#endif
		}
		return convTable;
	}

	/* Build the table for the windows (cp-1252) characters 0x80 up to
	   and including 0x9f, indexed by (character - 0x80).  The quotes
	   and dashes get their ASCII stand-in ('\'', '"', '-'); every other
	   entry is 0x01, meaning there is no equivalent and the character
	   is to be removed from the string being converted.
	   The right single quote is handled like the left one; it was
	   missing before and so ended up marked for removal, unlike in
	   initWinIsoConv() and replaceSpWinChars().
	   */
	private static char [] initUniWinConvx80() {
		char [] convTable = new char[0x9f - 0x80 + 1];
		try {
			//#ifdef DTEST
			System.out.println( "convTable.length=" + convTable.length);
			//#endif
			for (int ic = 0; ic < convTable.length; ic++) {
				char cc = (char)(ic + 0x80);
				switch (cc) {
					case CWSGL_LOW9_QUOTE:
					case CWLEFT_SGL_QUOTE:
					case CWRIGHT_SGL_QUOTE:
						convTable[ic] = '\'';
						break;
					case CWDBL_LOW9_QUOTE:
					case CWLEFT_DBL_QUOTE:
					case CWRIGHT_DBL_QUOTE:
						convTable[ic] = '\"';
						break;
					case CWEN_DASH:
					case CWEM_DASH:
						convTable[ic] = '-';
						break;
					default:
						// No equivalent: marked for removal.
						convTable[ic] = 0x01;
						break;
				}
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("initUniWinConvx80", t);
			//#endif
		}
		return convTable;
	}

	/* Add the XML entities lt, gt, nbsp, amp, apos and quot to the
	   table; nbsp is stored as a plain blank. */
	private static void initHtmlCommEnts(Hashtable convEntities) {
		final String [] names = new String[] {
			"lt", "gt", "nbsp", "amp", "apos", "quot"};
		// One character per name, in the same order.
		final char [] chars = "<> &'\"".toCharArray();
		initEntVals(convEntities, names, chars);
	}

	/* Determine if creating a string from windows (cp-1252) bytes
	   converts the windows chars to Unicode: the cp-1252 left single
	   quote (0x91) is decoded and compared with the Unicode left single
	   quote (0x2018).  The encoding is tried under the name "Cp1252"
	   first and, if that name is not supported, under "WINDOWS-1252".

	   @return true if the decoded character is the Unicode one; false
	           if it is not, or if neither encoding name is supported.
	*/
	private static boolean initConvWinUni() {
		boolean rtn = false;
		try {
			byte[] blftSgl = {(byte)CWLEFT_SGL_QUOTE};
			try {
				String convStr = new String(blftSgl, "Cp1252");
				rtn = convStr.charAt(0) == CLEFT_SGL_QUOTE;
			} catch (UnsupportedEncodingException e) {
				//#ifdef DTEST
				System.out.println( "Unsupported encoding Cp1252");
				//#endif
				//#ifdef DLOGGING
				Logger logger = Logger.getLogger("EncodingUtil");
				logger.severe("UnsupportedEncodingException Cp1252", e);
				//#endif
				try {
					// Retry under the alternative name; this used to
					// repeat "Cp1252" and so could never succeed.
					String convStr2 = new String(blftSgl, "WINDOWS-1252");
					rtn = convStr2.charAt(0) == CLEFT_SGL_QUOTE;
				} catch (UnsupportedEncodingException e2) {
					//#ifdef DTEST
					System.out.println( "Unsupported encoding WINDOWS-1252");
					//#endif
					//#ifdef DLOGGING
					logger.severe("UnsupportedEncodingException WINDOWS-1252", e2);
					//#endif
				}
			}
			//#ifdef DTEST
			System.out.println( "initConvWinUni()=" + rtn);
			//#endif
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("initConvWinUni", t);
			//#endif
		}
		return rtn;
	}

	/* Pick the name to use for the ISO 8859-1 encoding.  "ISO8859_1" is
	   probed first and "ISO-8859-1" second; the first name accepted by
	   the String constructor is returned.  "ISO8859_1" is also the
	   result when neither probe succeeds or an unexpected error
	   occurs. */
	private static String initIsoEncoding() {
		String isoName = "ISO8859_1";
		try {
			try {
				// Only the absence of an exception matters here.
				new String("a".getBytes(), "ISO8859_1");
			} catch (UnsupportedEncodingException e) {
				//#ifdef DTEST
				System.out.println( "Unsupported encoding ISO8859_1");
				//#endif
				//#ifdef DLOGGING
				Logger logger = Logger.getLogger("EncodingUtil");
				logger.severe("initIsoEncoding UnsupportedEncodingException ISO8859_1", e);
				//#endif
				try {
					new String("a".getBytes(), "ISO-8859-1");
					// Second name was accepted, so use it.
					isoName = "ISO-8859-1";
				} catch (UnsupportedEncodingException e2) {
					//#ifdef DTEST
					System.out.println("initIsoEncoding Unsupported encoding ISO-8859-1");
					//#endif
					//#ifdef DLOGGING
					logger.severe("initIsoEncoding UnsupportedEncodingException ISO-8859-1", e2);
					//#endif
				}
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("initIsoEncoding initConvWinUni", t);
			//#endif
		}
		return isoName;
	}

	/* Pick the name to use for the Windows 1252 encoding.  "Cp1252" is
	   probed first and "WINDOWS-1252" second; the first name accepted
	   by the String constructor is returned.  Each rejected name is
	   recorded as a CauseException in m_statExcs (created on demand).
	   "Cp1252" is also the result when neither probe succeeds or an
	   unexpected error occurs. */
	private static String initWinEncoding() {
		String winName = "Cp1252";
		try {
			try {
				// Only the absence of an exception matters here.
				new String("a".getBytes(), "Cp1252");
			} catch (UnsupportedEncodingException e) {
				final String firstMsg =
						"initWinEncoding UnsupportedEncodingException " +
						"while trying to convert encoding Cp1252.";
				CauseException ce = new CauseException(firstMsg, e);
				if (m_statExcs == null) {
					m_statExcs = new Vector();
				}
				m_statExcs.addElement(ce);
				//#ifdef DTEST
				System.out.println(ce.getMessage());
				//#endif
				//#ifdef DLOGGING
				Logger logger = Logger.getLogger("EncodingUtil");
				logger.severe(ce.getMessage(), e);
				//#endif
				try {
					new String("a".getBytes(), "WINDOWS-1252");
					// Second name was accepted, so use it.
					winName = "WINDOWS-1252";
				} catch (UnsupportedEncodingException e2) {
					final String secondMsg =
							"initWinEncoding second " +
							"unsupportedEncodingException while " +
							" trying to convert encoding WINDOWS-1252.";
					CauseException ce2 = new CauseException(secondMsg, e2);
					m_statExcs.addElement(ce2);
					//#ifdef DTEST
					System.out.println(ce2.getMessage());
					//#endif
					//#ifdef DLOGGING
					logger.severe(ce2.getMessage(), e2);
					//#endif
				}
			}
		} catch (Throwable t) {
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("initWinEncoding() initConvWinUni", t);
			//#endif
		}
		return winName;
	}

	/* Determine if windows encoding is supported.  "Cp1252" is probed
	   first and "WINDOWS-1252" second; true is returned as soon as one
	   of the names is accepted by the String constructor.  Each
	   rejected name is recorded as a CauseException in m_statExcs
	   (created on demand).  Returns false when both names are rejected
	   or an unexpected error occurs. */
	public static boolean hasWinEncoding() {
		try {
			try {
				// Only the absence of an exception matters here.
				new String("a".getBytes(), "Cp1252");
				return true;
			} catch (UnsupportedEncodingException e) {
				CauseException ce = new CauseException(
						"hasWinEncoding UnsupportedEncodingException " +
						"while trying to convert encoding Cp1252.", e);
				if (m_statExcs == null) {
					m_statExcs = new Vector();
				}
				m_statExcs.addElement(ce);
				//#ifdef DTEST
				System.out.println(ce.getMessage());
				//#endif
				//#ifdef DLOGGING
				Logger logger = Logger.getLogger("EncodingUtil");
				logger.severe(ce.getMessage(), e);
				//#endif
				try {
					new String("a".getBytes(), "WINDOWS-1252");
					return true;
				} catch (UnsupportedEncodingException e2) {
					// The message names this method so that the recorded
					// exception is attributed to the right probe.
					CauseException ce2 = new CauseException(
							"hasWinEncoding second " +
							"unsupportedEncodingException while " +
							" trying to convert encoding WINDOWS-1252.", e2);
					m_statExcs.addElement(ce2);
					//#ifdef DTEST
					System.out.println(ce2.getMessage());
					//#endif
					//#ifdef DLOGGING
					logger.severe(ce2.getMessage(), e2);
					//#endif
				}
			}
		} catch (Throwable t) {
			// Best effort: any other failure is reported as unsupported.
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("hasWinEncoding initConvWinUni", t);
			//#endif
		}
		return false;
	}

	/* Determine if iso-8859-1 encoding is supported.  "ISO8859_1" is
	   probed first and "ISO-8859-1" second; true is returned as soon
	   as one of the names is accepted by the String constructor.
	   Returns false when both names are rejected or an unexpected
	   error occurs. */
	private static boolean hasIso8859Encoding() {
		try {
			try {
				// Only the absence of an exception matters here.
				new String("a".getBytes(), "ISO8859_1");
				return true;
			} catch (UnsupportedEncodingException e) {
				//#ifdef DTEST
				System.out.println( "Unsupported encoding ISO8859_1");
				//#endif
				//#ifdef DLOGGING
				Logger logger = Logger.getLogger("EncodingUtil");
				logger.severe("hasIso8859Encoding UnsupportedEncodingException ISO8859_1", e);
				//#endif
				try {
					new String("a".getBytes(), "ISO-8859-1");
					return true;
				} catch (UnsupportedEncodingException e2) {
					//#ifdef DTEST
					System.out.println("hasIso8859Encoding Unsupported encoding ISO-8859-1");
					//#endif
					//#ifdef DLOGGING
					// Log under this method's own name.
					logger.severe("hasIso8859Encoding UnsupportedEncodingException ISO-8859-1", e2);
					//#endif
				}
			}
		} catch (Throwable t) {
			// Best effort: any other failure is reported as unsupported.
			//#ifdef DLOGGING
			Logger logger = Logger.getLogger("EncodingUtil");
			logger.severe("hasIso8859Encoding initConvWinUni", t);
			//#endif
		}
		return false;
	}

    /**
     * Stores the given value in this instance's m_docEncoding field.
     *
     * @param m_docEncoding value to store
     */
    public void setDocEncoding(String m_docEncoding) {
        // "this." is required because the parameter shadows the field.
        this.m_docEncoding = m_docEncoding;
    }

    /**
     * Returns the current value of this instance's m_docEncoding field.
     *
     * @return the stored document encoding value
     */
    public String getDocEncoding() {
        return this.m_docEncoding;
    }

    /**
     * Stores the given reader in this instance's m_encodingStreamReader
     * field.
     *
     * @param m_encodingStreamReader reader to store
     */
    public void setEncodingStreamReader(EncodingStreamReader m_encodingStreamReader) {
        // "this." is required because the parameter shadows the field.
        this.m_encodingStreamReader = m_encodingStreamReader;
    }

    /**
     * Returns the current value of this instance's
     * m_encodingStreamReader field.
     *
     * @return the stored reader
     */
    public EncodingStreamReader getEncodingStreamReader() {
        return this.m_encodingStreamReader;
    }

    /**
     * Returns the current value of the m_windows flag.
     *
     * @return the m_windows flag
     */
    public boolean isWindows() {
        return m_windows;
    }

    /**
     * Returns the current value of the m_utf flag.
     *
     * @return the m_utf flag
     */
    public boolean isUtf() {
        return m_utf;
    }

	//#ifdef DTEST
    /**
     * Returns the m_isoCommonEntities array itself (no copy is made).
     *
     * @return the m_isoCommonEntities array
     */
    public static String[] getIsoCommonEntities() {
        return m_isoCommonEntities;
    }

    /**
     * Returns the m_convIso88591 table itself (no copy is made).
     *
     * @return the m_convIso88591 table
     */
    public static Hashtable getConvIso88591() {
        return m_convIso88591;
    }

    /**
     * Returns the m_convCp1252 table itself (no copy is made).
     *
     * @return the m_convCp1252 table
     */
    public static Hashtable getConvCp1252() {
        return m_convCp1252;
    }

    /**
     * Returns the m_isoSpecialEntities array itself (no copy is made).
     *
     * @return the m_isoSpecialEntities array
     */
    static public String[] getIsoSpecialEntities() {
        return m_isoSpecialEntities;
    }

    /**
     * Returns the value of the static m_winEncoding field.
     *
     * @return the m_winEncoding value
     */
    static public String getWinEncoding() {
        return m_winEncoding;
    }

    /**
     * Returns the value of the static m_convWinUni flag.
     *
     * @return the m_convWinUni flag
     */
    public static boolean isConvWinUni() {
        return m_convWinUni;
    }

    /**
     * Returns the value of the static m_hasWinEncoding flag.
     *
     * @return the m_hasWinEncoding flag
     */
    public static boolean isHasWinEncoding() {
        return m_hasWinEncoding;
    }

	//#endif

    /**
     * Returns the value of the static m_isoEncoding field.
     *
     * @return the m_isoEncoding value
     */
    static public String getIsoEncoding() {
        return m_isoEncoding;
    }

    /**
     * Returns the m_excs vector, or a new empty Vector when m_excs is
     * null.  The new Vector is not stored in m_excs.
     *
     * @return m_excs if set, otherwise a fresh empty Vector; never null
     */
    public Vector getExcs() {
		if (m_excs != null) {
			return m_excs;
		}
		return new Vector();
    }

    /**
     * Returns the static m_statExcs vector, or a new empty Vector when
     * m_statExcs is null.  The new Vector is not stored in m_statExcs.
     *
     * @return m_statExcs if set, otherwise a fresh empty Vector; never null
     */
    public static Vector getStatExcs() {
		if (m_statExcs != null) {
			return m_statExcs;
		}
		return new Vector();
    }

}