// NumericCharacterReference.java example
//
// Explorer
// uima_prolog-master
// - uima-PrologInterface-Examples
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Represents an HTML <a target="_blank" href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">Numeric Character Reference</a>.
 * <p>
 * A numeric character reference can be one of two types:
 * <dl>
 *  <dt><a name="DecimalCharacterReference">Decimal Character Reference</a>
 *   <dd>A numeric character reference specifying the unicode code point in decimal notation.<br />
 *    This is signified by the absence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#62;</code>").
 *  <dt><a name="HexadecimalCharacterReference">Hexadecimal Character Reference</a>
 *   <dd>A numeric character reference specifying the unicode code point in hexadecimal notation.<br />
 *    This is signified by the presence of an '<code>x</code>' character after the '<code>#</code>', (eg "<code>&amp;#x3e;</code>").
 * </dl>
 * <p>
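 * Static methods to {@linkplain CharacterReference#encode(CharSequence) encode} and {@linkplain CharacterReference#decode(CharSequence) decode} strings
 * and single characters using both character entity references and numeric character references can be found in the {@link CharacterReference} superclass.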
 * <p>
 * <code>NumericCharacterReference</code> instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
 *  <li>{@link Source#getNextCharacterReference(int pos)}
 *  <li>{@link Source#getPreviousCharacterReference(int pos)}
 *  <li>{@link Segment#getAllCharacterReferences()}
 * </ul>
 *
 * @see CharacterReference
 * @see CharacterEntityReference
 */
public class NumericCharacterReference extends CharacterReference {
	/** <code>true</code> if the reference uses hexadecimal notation (<code>&amp;#x...</code>), <code>false</code> if it uses decimal notation. */
	private final boolean hex;

	/**
	 * Creates a numeric character reference spanning the specified region of the source.
	 * Instances are only created by {@link #construct(Source,int,Config.UnterminatedCharacterReferenceSettings)}.
	 *
	 * @param source  the source document containing the reference.
	 * @param begin  the position of the first character of the reference in the source.
	 * @param end  the position immediately after the last character of the reference in the source.
	 * @param codePoint  the unicode code point the reference denotes.
	 * @param hex  <code>true</code> if the reference is written in hexadecimal notation, <code>false</code> if decimal.
	 */
	private NumericCharacterReference(final Source source, final int begin, final int end, final int codePoint, final boolean hex) {
		super(source,begin,end,codePoint);
		this.hex=hex;
	}

	/**
	 * Indicates whether this numeric character reference specifies the unicode code point in decimal format.
	 * <p>
	 * A numeric character reference in decimal format is referred to in this library as a
	 * <a href="#DecimalCharacterReference">decimal character reference</a>.
	 *
	 * @return <code>true</code> if this numeric character reference specifies the unicode code point in decimal format, otherwise <code>false</code>.
	 * @see #isHexadecimal()
	 */
	public boolean isDecimal() {
		return !hex;
	}

	/**
	 * Indicates whether this numeric character reference specifies the unicode code point in hexadecimal format.
	 * <p>
	 * A numeric character reference in hexadecimal format is referred to in this library as a
	 * <a href="#HexadecimalCharacterReference">hexadecimal character reference</a>.
	 *
	 * @return <code>true</code> if this numeric character reference specifies the unicode code point in hexadecimal format, otherwise <code>false</code>.
	 * @see #isDecimal()
	 */
	public boolean isHexadecimal() {
		return hex;
	}

	/**
	 * Encodes the specified text, escaping special characters into numeric character references.
	 * <p>
	 * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
	 * A high surrogate that requires encoding and is immediately followed by a low surrogate is encoded together with it
	 * as a single reference to the supplementary code point, rather than as two separate surrogate references.
	 * <p>
	 * This method encodes all character references in <a href="#DecimalCharacterReference">decimal format</a>, and is exactly the same as calling
	 * {@link #encodeDecimal(CharSequence)}.
	 * <p>
	 * To encode text using both character entity references and numeric character references, use the<br />
	 * {@link CharacterReference#encode(CharSequence)} method instead.
	 * <p>
	 * To encode text using <a href="#HexadecimalCharacterReference">hexadecimal character references</a> only,
	 * use the {@link #encodeHexadecimal(CharSequence)} method instead.
	 *
	 * @param unencodedText  the text to encode, may be <code>null</code>.
	 * @return the encoded string, or <code>null</code> if the argument is <code>null</code>.
	 * @see #decode(CharSequence)
	 */
	public static String encode(final CharSequence unencodedText) {
		return encodeUsingRadix(unencodedText,false);
	}

	/**
	 * Encodes the specified text, escaping special characters into <a href="#DecimalCharacterReference">decimal character references</a>.
	 * <p>
	 * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
	 * <p>
	 * This method is exactly the same as calling {@link #encode(CharSequence)}.
	 * <p>
	 * To encode text using both character entity references and numeric character references, use the<br />
	 * {@link CharacterReference#encode(CharSequence)} method instead.
	 * <p>
	 * To encode text using <a href="#HexadecimalCharacterReference">hexadecimal character references</a> only,
	 * use the {@link #encodeHexadecimal(CharSequence)} method instead.
	 *
	 * @param unencodedText  the text to encode, may be <code>null</code>.
	 * @return the encoded string, or <code>null</code> if the argument is <code>null</code>.
	 * @see #decode(CharSequence)
	 */
	public static String encodeDecimal(final CharSequence unencodedText) {
		return encode(unencodedText);
	}

	/**
	 * Encodes the specified text, escaping special characters into <a href="#HexadecimalCharacterReference">hexadecimal character references</a>.
	 * <p>
	 * Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return <code>true</code> for that character.
	 * A high surrogate that requires encoding and is immediately followed by a low surrogate is encoded together with it
	 * as a single reference to the supplementary code point, rather than as two separate surrogate references.
	 * <p>
	 * To encode text using both character entity references and numeric character references, use the<br />
	 * {@link CharacterReference#encode(CharSequence)} method instead.
	 * <p>
	 * To encode text using <a href="#DecimalCharacterReference">decimal character references</a> only,
	 * use the {@link #encodeDecimal(CharSequence)} method instead.
	 *
	 * @param unencodedText  the text to encode, may be <code>null</code>.
	 * @return the encoded string, or <code>null</code> if the argument is <code>null</code>.
	 * @see #decode(CharSequence)
	 */
	public static String encodeHexadecimal(final CharSequence unencodedText) {
		return encodeUsingRadix(unencodedText,true);
	}

	/**
	 * Shared implementation of {@link #encode(CharSequence)} and {@link #encodeHexadecimal(CharSequence)}.
	 *
	 * @param unencodedText  the text to encode, may be <code>null</code>.
	 * @param hexadecimal  <code>true</code> to emit hexadecimal references, <code>false</code> to emit decimal references.
	 * @return the encoded string, or <code>null</code> if <code>unencodedText</code> is <code>null</code>.
	 */
	private static String encodeUsingRadix(final CharSequence unencodedText, final boolean hexadecimal) {
		if (unencodedText==null) return null;
		final int length=unencodedText.length();
		final StringBuilder sb=new StringBuilder(length*2);
		try {
			for (int i=0; i<length; i++) {
				final char ch=unencodedText.charAt(i);
				if (!requiresEncoding(ch)) {
					sb.append(ch);
					continue;
				}
				int codePointToEncode=ch;
				// A numeric character reference denotes a whole code point, so a valid surrogate pair
				// has to be merged into one reference. Unpaired surrogates are encoded individually as before.
				if (Character.isHighSurrogate(ch) && i+1<length) {
					final char next=unencodedText.charAt(i+1);
					if (Character.isLowSurrogate(next)) {
						codePointToEncode=Character.toCodePoint(ch,next);
						i++; // the low surrogate has been consumed as part of the pair
					}
				}
				if (hexadecimal)
					appendHexadecimalCharacterReferenceString(sb,codePointToEncode);
				else
					appendDecimalCharacterReferenceString(sb,codePointToEncode);
			}
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens as StringBuilder does not throw IOException
		return sb.toString();
	}

	/**
	 * Returns the correct encoded form of this numeric character reference.
	 * <p>
	 * The returned string uses the same radix as the original character reference in the source document,
	 * i.e. decimal format if {@link #isDecimal()} is <code>true</code>, and hexadecimal format if {@link #isHexadecimal()} is <code>true</code>.
	 * <p>
	 * Note that the returned string is not necessarily the same as the original source text used to create this object.
	 * This library recognises certain invalid forms of character references,
	 * as detailed in the {@link #decode(CharSequence) decode(CharSequence)} method.
	 * <p>
	 * To retrieve the original source text, use the {@link #toString() toString()} method instead.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *   <dd><code>CharacterReference.parse("&amp;#62").getCharacterReferenceString()</code> returns "<code>&amp;#62;</code>"</dd>
	 * </dl>
	 *
	 * @return the correct encoded form of this numeric character reference.
	 * @see CharacterReference#getCharacterReferenceString(int codePoint)
	 */
	public String getCharacterReferenceString() {
		return hex ? getHexadecimalCharacterReferenceString(codePoint) : getDecimalCharacterReferenceString(codePoint);
	}

	/**
	 * Returns the numeric character reference encoded form of the specified unicode code point.
	 * <p>
	 * This method returns the character reference in decimal format, and is exactly the same as calling
	 * {@link #getDecimalCharacterReferenceString(int codePoint)}.
	 * <p>
	 * To get either the character entity reference or numeric character reference, use the<br />
	 * {@link CharacterReference#getCharacterReferenceString(int codePoint)} method instead.
	 * <p>
	 * To get the character reference in hexadecimal format, use the {@link #getHexadecimalCharacterReferenceString(int codePoint)} method instead.
	 * <p>
	 * <dl>
	 *  <dt>Examples:</dt>
	 *   <dd><code>NumericCharacterReference.getCharacterReferenceString(62)</code> returns "<code>&amp;#62;</code>"</dd>
	 *   <dd><code>NumericCharacterReference.getCharacterReferenceString('&gt;')</code> returns "<code>&amp;#62;</code>"</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the numeric character reference encoded form of the specified unicode code point.
	 * @see CharacterReference#getCharacterReferenceString(int codePoint)
	 */
	public static String getCharacterReferenceString(final int codePoint) {
		return getDecimalCharacterReferenceString(codePoint);
	}

	/**
	 * Parses the numeric character reference starting at the specified position in the source.
	 * <p>
	 * A reference terminated by a semicolon is always accepted provided it contains at least one digit.
	 * A reference that ends at a non-digit character or at the end of the source is <i>unterminated</i>, and is
	 * accepted only if the relevant maximum code point in <code>unterminatedCharacterReferenceSettings</code>
	 * is not <code>INVALID_CODE_POINT</code> and the parsed code point does not exceed it.
	 *
	 * @param source  the source document.
	 * @param begin  the position of the leading '<code>&amp;</code>' character; the caller guarantees that the text at this position starts with "<code>&amp;#</code>".
	 * @param unterminatedCharacterReferenceSettings  determines which unterminated references are accepted.
	 * @return the parsed reference, or <code>null</code> if the text is rejected as a numeric character reference.
	 */
	static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
		// only called from CharacterReference.construct(), so we can assume that first characters are "&#"
		final ParseText parseText=source.getParseText();
		int codePointStringBegin=begin+2;
		// NOTE(review): only a lower case 'x' is tested here; presumably the parse text is case-normalised - confirm.
		final boolean hex=parseText.charAt(codePointStringBegin)=='x';
		if (hex) codePointStringBegin++;
		final int unterminatedMaxCodePoint=hex ? unterminatedCharacterReferenceSettings.hexadecimalCharacterReferenceMaxCodePoint : unterminatedCharacterReferenceSettings.decimalCharacterReferenceMaxCodePoint;
		final int maxSourcePos=source.end-1;
		int x=codePointStringBegin;
		boolean unterminated=false;
		while (true) {
			final char ch=parseText.charAt(x);
			if (ch==';') break; // properly terminated, x is the position of the semicolon
			if (isCodePointDigit(ch,hex)) {
				if (x!=maxSourcePos) {
					x++;
					continue;
				}
				// We are at the last position in the source text without the terminating semicolon.
				x++; // include this digit
			}
			// Either a non-digit was found or the source ended, meaning the character reference is unterminated.
			// Different browsers react differently to unterminated numeric character references.
			// The behaviour of this method is determined by the settings in the unterminatedCharacterReferenceSettings parameter.
			if (unterminatedMaxCodePoint==INVALID_CODE_POINT) return null; // reject
			unterminated=true; // accept
			break;
		}
		// In both cases x is the position immediately after the last digit.
		final int end=unterminated ? x : x+1; // a terminated reference includes its semicolon
		final String codePointString=source.substring(codePointStringBegin,x);
		if (codePointString.length()==0) return null;
		int codePoint=INVALID_CODE_POINT;
		try {
			codePoint=Integer.parseInt(codePointString,hex?16:10);
			if (unterminated && codePoint>unterminatedMaxCodePoint) return null;
			if (codePoint>Character.MAX_CODE_POINT) codePoint=INVALID_CODE_POINT;
		} catch (NumberFormatException ex) {
			// This should only happen if number is larger than Integer.MAX_VALUE.
			if (unterminated) return null;
			// If it is a terminated reference just ignore the exception as codePoint will remain with its value of INVALID_CODE_POINT.
		}
		return new NumericCharacterReference(source,begin,end,codePoint,hex);
	}

	/**
	 * Indicates whether the specified character is a valid digit of a code point string.
	 *
	 * @param ch  the character to test.
	 * @param hex  <code>true</code> to accept hexadecimal digits (either case), <code>false</code> to accept decimal digits only.
	 * @return <code>true</code> if the character is a valid digit in the given radix, otherwise <code>false</code>.
	 */
	private static boolean isCodePointDigit(final char ch, final boolean hex) {
		if (ch>='0' && ch<='9') return true;
		return hex && ((ch>='a' && ch<='f') || (ch>='A' && ch<='F'));
	}

	/**
	 * Returns a string representation of this object useful for debugging purposes.
	 * <p>
	 * The string consists of the encoded form of this reference in double quotes (in the same radix as the original),
	 * followed by the unicode text of the code point and the debug information of the superclass.
	 *
	 * @return a string representation of this object useful for debugging purposes.
	 */
	public String getDebugInfo() {
		final StringBuilder sb=new StringBuilder();
		sb.append('"');
		try {
			if (hex)
				appendHexadecimalCharacterReferenceString(sb,codePoint);
			else
				appendDecimalCharacterReferenceString(sb,codePoint);
			sb.append("\" ");
			appendUnicodeText(sb,codePoint);
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
		sb.append(' ').append(super.getDebugInfo());
		return sb.toString();
	}
}