// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v $
// $Author: derrickoswald $
// $Date: 2006/04/17 13:53:12 $
// $Revision: 1.48 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.tests.utilTests;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Random;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.CharacterReference;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.Translate;
import org.htmlparser.util.sort.Sort;
public class CharacterTranslationTest
extends
ParserTestCase
{
static
{
System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest");
}
/**
* The list of references.
* Starts out <code>null</code>; <code>getReferences()</code> stores the
* generated list here and returns the stored value on later calls.
*/
protected static CharacterReference[] mReferences;
/**
* Create a test case.
* @param name The test name, handed unchanged to the superclass constructor.
*/
public CharacterTranslationTest (String name)
{
super (name);
}
/**
 * Class loader to access the compiled character references.
 * A class is looked up with the system class loader first; only when that
 * fails is the file <code>root + className + ".class"</code> read from disk
 * and defined by this loader.
 */
class SimpleClassLoader extends ClassLoader
{
    /**
     * The class path for this class loader.
     * Always ends with the platform file separator.
     */
    String mRoot;

    /**
     * Create a loader that reads class files from a single directory.
     * @param root The directory holding the <code>.class</code> files.
     * A trailing file separator is appended when it is missing.
     */
    public SimpleClassLoader (String root)
    {
        if (!root.endsWith (File.separator))
            root += File.separator;
        mRoot = root;
    }

    /**
     * Load and resolve a class.
     * @param className The name of the class to load.
     * @return The loaded class.
     * @exception ClassNotFoundException If neither the system class loader
     * nor the directory of this loader can supply the class.
     */
    public Class loadClass (String className)
        throws
            ClassNotFoundException
    {
        return (loadClass (className, true));
    }

    /**
     * Load a class, optionally resolving it.
     * @param className The name of the class to load.
     * @param resolveIt If <code>true</code>, a class defined from a file
     * is also resolved.
     * @return The loaded class.
     * @exception ClassNotFoundException If the system class loader does not
     * know the class and its class file cannot be read; the I/O failure is
     * attached as the cause.
     */
    public synchronized Class loadClass (String className, boolean resolveIt)
        throws
            ClassNotFoundException
    {
        byte[] data;
        Class ret;

        try
        {
            // try system class loader
            ret = super.findSystemClass (className);
        }
        catch (ClassNotFoundException e)
        {
            try
            {
                data = readClassFile (new File (mRoot + className + ".class"));
                ret = defineClass (className, data, 0, data.length);
                if (null == ret)
                    throw new ClassFormatError ();
                if (resolveIt)
                    resolveClass (ret);
            }
            catch (IOException ioe)
            {
                // keep the class name and the reason it could not be read
                throw new ClassNotFoundException (className, ioe);
            }
        }

        return (ret);
    }

    /**
     * Read a complete class file into memory.
     * A single <code>read()</code> may return fewer bytes than requested,
     * so reading is repeated until the whole file has been consumed.
     * @param file The class file to read.
     * @return The contents of the file.
     * @exception IOException If the file cannot be opened or ends early.
     */
    private byte[] readClassFile (File file)
        throws
            IOException
    {
        FileInputStream in;
        byte[] data;
        int offset;
        int count;

        in = new FileInputStream (file);
        try
        {
            data = new byte[(int)file.length ()];
            offset = 0;
            while (offset < data.length)
            {
                count = in.read (data, offset, data.length - offset);
                if (-1 == count)
                    throw new IOException ("unexpected end of file reading " + file);
                offset += count;
            }
        }
        finally
        {
            // release the file handle even when reading fails
            in.close ();
        }

        return (data);
    }
}
/**
* Create a character reference translation class source file.
* Usage:
* <pre>
* java -classpath .:lib/htmlparser.jar Generate > Translate.java
* </pre>
* Derived from HTMLStringFilter.java provided as an example with the
* htmlparser.jar file available at
* <a href="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>
* written by Somik Raha (
* <a href='mailto:somik@industriallogic.com?subject=htmlparser'>somik@industriallogic.com</a>
* <a href="http://industriallogic.com">http://industriallogic.com</a>).
* @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
*/
public class Generate
{
/**
* The working parser.
*/
protected Parser mParser;
/**
* The line separator used when assembling and splitting text:
* the <code>line.separator</code> system property, or "\n" when that
* property is not set.
*/
protected String nl = System.getProperty ("line.separator", "\n");
/**
* Create a Generate object.
* Sets up the generation by creating a new <code>Parser</code> pointed
* at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
* with the standard scanners registered.
* @exception ParserException Propagated from the <code>Parser</code>
* constructor.
*/
public Generate ()
throws ParserException
{
mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html");
}
/**
* Translate character references.
* After generating the Translate class we could use it
* to do this job, but that would involve a bootstrap
* problem, so this method does the reference conversion
* for a very tiny subset (enough to understand the w3.org
* page).
* @param string The raw string.
* @return The string with character references fixed.
*/
public String translate (String string)
{
int index;
int amp;
StringBuffer ret;
ret = new StringBuffer (4096);
index = 0;
while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))
{
// include the part before the special character
ret.append (string.substring (index, amp));
if (string.startsWith (" ", amp))
{
ret.append (" ");
index = amp + 6;
}
else if (string.startsWith ("<", amp))
{
ret.append ("<");
index = amp + 4;
}
else if (string.startsWith (">", amp))
{
ret.append (">");
index = amp + 4;
}
else if (string.startsWith ("&", amp))
{
ret.append ("&");
index = amp + 5;
}
else if (string.startsWith (""e;", amp))
{
ret.append ("\"");
index = amp + 7;
}
else if (string.startsWith ("÷", amp))
{
//ret.append ('\u00F7');
//index = amp + 8;
ret.append ("&");
index = amp + 1;
}
else if (string.startsWith ("©", amp))
{
//ret.append ('\u00A9');
//index = amp + 6;
ret.append ("&");
index = amp + 1;
}
else
{
System.out.println ("unknown special character starting with " + string.substring (amp, amp + 7));
ret.append ("&");
index = amp + 1;
}
}
ret.append (string.substring (index));
return (ret.toString ());
}
/**
 * Collect the text of a node, recursing into the children of tags.
 * Text nodes contribute their text, links contribute their link text,
 * BR and P tags contribute a line separator and remarks contribute
 * nothing. Any other kind of node is printed on standard output.
 * @param node The node to collect text from.
 * @param buffer The buffer the text is appended to.
 */
public void gather (Node node, StringBuffer buffer)
{
    if (node instanceof Text)
        buffer.append (((Text)node).getText ());
    else if (node instanceof LinkTag)
        // tested before the general Tag case below
        buffer.append (((LinkTag)node).getLinkText ());
    else if (node instanceof Tag)
    {
        Tag tag = (Tag)node;
        String name = tag.getTagName ();
        if (name.equals ("BR") || name.equals ("P"))
            buffer.append (nl);
        else
        {
            NodeList kids = tag.getChildren ();
            if (null != kids)
                for (int i = 0; i < kids.size (); i++)
                    gather (kids.elementAt (i), buffer);
        }
    }
    else if (!(node instanceof Remark))
    {
        // not a node type handled above: show it
        System.out.println ();
        System.out.println(node.toString());
    }
}
/**
 * Find the lowest index of whitespace (space or newline).
 * @param string The string to look in.
 * @param index Where to start looking.
 * @return -1 if there is no whitespace, the minimum index otherwise.
 */
public int indexOfWhitespace (String string, int index)
{
    final int blank = string.indexOf (" ", index);
    final int eol = string.indexOf (nl, index);

    if (-1 == blank)
        return (eol);
    if (-1 == eol)
        return (blank);
    return ((blank < eol) ? blank : eol);
}
/**
 * Rewrite the comment string.
 * In the sgml table, the comments are of the form:
 * <pre>
 * -- latin capital letter I with diaeresis,
 * U+00CF ISOlat1
 * </pre>
 * so we just want to make a one-liner without the spaces and newlines.
 * A leading "-- " is dropped and every run of whitespace becomes a
 * single space.
 * @param string The raw comment.
 * @return The single line comment.
 */
public String pack (String string)
{
    StringBuffer packed = new StringBuffer (string.length ());
    String text = string.startsWith ("-- ") ? string.substring (3) : string;
    int from = 0;
    int gap;

    // newlines count as ordinary blanks
    text = text.replace ('\n', ' ');

    while (from < text.length ())
    {
        gap = indexOfWhitespace (text, from);
        if (-1 == gap)
            break;
        // copy the word, then a single separator ...
        packed.append (text.substring (from, gap));
        packed.append (" ");
        // ... and skip the rest of the whitespace run
        from = gap;
        while ((from < text.length ()) && Character.isWhitespace (text.charAt (from)))
            from++;
    }
    if (from < text.length ())
        packed.append (text.substring (from));

    return (packed.toString ());
}
/**
 * Pretty up a comment string.
 * The text is cut wherever two consecutive spaces occur and every piece
 * is emitted as its own <code>//</code> comment line.
 * @param string The comment to operate on.
 * @return The beautiful comment string.
 */
public String pretty (String string)
{
    int index;
    int spaces;
    String piece;
    StringBuffer ret;

    ret = new StringBuffer (string.length ());

    // newline instead of doublespaces
    index = 0;
    while ((index < string.length ()) && (-1 != (spaces = string.indexOf ("  ", index))))
    {
        piece = string.substring (index, spaces);
        ret.append (" // " + piece);
        // a piece cut right after a line break already ends its line
        if (!piece.endsWith (nl))
            ret.append (nl);
        while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
            spaces++;
        index = spaces;
    }
    if (index < string.length ())
        ret.append (" // " + string.substring (index));

    return (ret.toString ());
}
/**
 * Pad a string on the left with the given character to the length specified.
 * A string that is already long enough is returned unchanged.
 * @param string The string to pad
 * @param character The character to pad with.
 * @param length The size to pad to.
 * @return The padded string.
 */
public String pad (String string, char character, int length)
{
    StringBuffer padded = new StringBuffer (length);

    padded.append (string);
    for (int missing = length - padded.length (); missing > 0; missing--)
        padded.insert (0, character);

    return (padded.toString ());
}
/**
 * Convert the textual representation of the numeric character reference to a character.
 * The input is expected to be a quoted decimal reference, i.e. a double
 * quote, an ampersand, a hash sign, decimal digits, a semicolon and a
 * closing double quote. Anything else, including a reference whose
 * digits cannot be parsed, is returned unchanged.
 * @param string The numeric character reference (in quotes).
 * @return The Java character literal (a single quoted, four hex digit
 * unicode escape) for the reference, or the unchanged input.
 */
public String unicode (String string)
{
    int code;
    String digits;

    // the prefix is three characters long and the suffix two, which is
    // what the substring offsets below rely on
    if (string.startsWith ("\"&#") && string.endsWith (";\""))
    {
        digits = string.substring (3, string.length () - 2);
        try
        {
            code = Integer.parseInt (digits);
            return ("'\\u" + pad (Integer.toHexString (code), '0', 4) + "'");
        }
        catch (NumberFormatException nfe)
        {
            // not a decimal reference: report it, fall through and hand
            // back the input rather than a truncated fragment
            nfe.printStackTrace ();
        }
    }

    return (string);
}
/**
 * Parse the sgml declaration for character entity reference
 * name, equivalent numeric character reference and a comment.
 * Emit a java <code>CharacterReference</code> constructor call with the
 * name and the numeric character as arguments, followed by the comment.
 * Comments are passed through <code>pretty()</code>; anything that does
 * not have the expected shape is printed as far as it was understood.
 * @param string The contents of the sgml declaration.
 * @param out The sink for output.
 */
public void extract (String string, PrintWriter out)
{
    String rest;
    String token;
    String code;
    int gap;

    if (string.startsWith ("<!--"))
    {
        out.println (pretty (string.substring (4, string.length () - 3).trim ()));
        return;
    }
    if (!string.startsWith ("<!ENTITY"))
    {
        out.println (string);
        return;
    }

    // strip the declaration keyword and the closing comment delimiter
    rest = string.substring (8, string.length () - 3).trim ();
    gap = rest.indexOf (" ");
    if (-1 == gap)
    {
        out.println (rest);
        return;
    }

    token = rest.substring (0, gap);
    rest = rest.substring (gap).trim ();
    if (!rest.startsWith ("CDATA"))
    {
        out.println (rest);
        return;
    }

    rest = rest.substring (5).trim ();
    gap = rest.indexOf (" ");
    if (-1 == gap)
    {
        out.println (rest);
        return;
    }

    code = unicode (rest.substring (0, gap).trim ());
    rest = rest.substring (gap).trim ();
    out.println (
        " new CharacterReference (\"" + token + "\","
        // no token is larger than 8 characters - yet
        + pad (code, ' ', code.length () + 9 - token.length ()) + "),"
        + " // "
        + pack (rest));
}
/**
 * Extract special characters.
 * Scan the string looking for substrings of the form:
 * <pre>
 * &lt;!ENTITY nbsp CDATA "&amp;#160;" -- no-break space = non-breaking space, U+00A0 ISOnum --&gt;
 * </pre>
 * and emit a java definition for each.
 * Every span from a less-than sign up to and including the next comment
 * close delimiter is handed to <code>extract()</code>.
 * @param string The raw string from w3.org.
 * @param out The sink for output.
 */
public void sgml (String string, PrintWriter out)
{
    int cursor = 0;
    int open = string.indexOf ("<", cursor);

    while (-1 != open)
    {
        int close = string.indexOf ("-->", open);
        if (-1 == close)
            // no terminator after this bracket: step past it
            cursor = open + 1;
        else
        {
            extract (string.substring (open, close + 3), out);
            cursor = close + 3;
        }
        open = string.indexOf ("<", cursor);
    }
}
/**
 * Pull out text elements from the HTML.
 * The text of every node delivered by the parser is gathered into one
 * buffer, run through <code>translate()</code> and then handed to
 * <code>sgml()</code>.
 * @param out The sink for output.
 * @exception ParserException Propagated from the parser's node iteration.
 */
public void parse (PrintWriter out)
    throws
        ParserException
{
    StringBuffer gathered = new StringBuffer (4096);
    NodeIterator nodes = mParser.elements ();

    while (nodes.hasMoreNodes ())
        gather (nodes.nextNode (), gathered);

    sgml (translate (gathered.toString ()), out);
}
}
/**
 * Get the list of character references, generating it on first use.
 * On the first call a java source file defining the list is generated
 * from the w3.org entities page into the user's home directory, compiled
 * and loaded; the resulting array is sorted and remembered in
 * <code>mReferences</code>. The compiled class file is deleted again,
 * the source file is left in place. Any failure along the way fails the
 * calling test.
 * @return The sorted list of references.
 */
public CharacterReference[] getReferences ()
{
    final String class_name = "CharacterEntityReferenceList";
    String paths;
    String path;
    String source;
    PrintWriter out;
    Generate generate;
    SimpleClassLoader loader;
    Class hello;
    Field field;
    CharacterReference[] ret;

    ret = mReferences;
    if (null == ret)
    {
        paths = System.getProperty ("java.class.path");
        path = System.getProperty ("user.home");
        if (!path.endsWith (File.separator))
            path += File.separator;
        source = path + class_name + ".java";
        try
        {
            // create it
            generate = new Generate ();
            out = new PrintWriter (new FileWriter (source));
            try
            {
                out.println ("import org.htmlparser.util.CharacterReference;");
                out.println ();
                out.println ("/** Generated by " + this.getClass ().getName () + " **/");
                out.println ("public class " + class_name);
                out.println ("{");
                out.println (" /**");
                out.println (" * Table mapping character to entity reference.");
                out.println (" */");
                out.println (" public static final CharacterReference[] mCharacterReferences =");
                out.println (" {");
                generate.parse (out);
                out.println (" };");
                out.println ("}");
            }
            finally
            {
                // release the file even when generation fails part way
                out.close ();
            }
            // compile it
            if (0 == com.sun.tools.javac.Main.compile (new String[] {"-classpath", paths, source}))
            {
                try
                {
                    // load it
                    loader = new SimpleClassLoader (path);
                    hello = loader.loadClass (class_name);
                    try
                    {
                        // get the references
                        field = hello.getField ("mCharacterReferences");
                        ret = (CharacterReference[])field.get (null);
                        Sort.QuickSort (ret);
                    }
                    catch (IllegalAccessException iae)
                    {
                        fail ("references not accessible");
                    }
                    catch (NoSuchFieldException nsfe)
                    {
                        fail ("references not found");
                    }
                }
                catch (ClassNotFoundException cnfe)
                {
                    fail ("couldn't load class");
                }
                finally
                {
                    // the compiled class is only needed while loading
                    File classfile;
                    classfile = new File (path + class_name + ".class");
                    classfile.delete ();
                }
            }
            else
                fail ("couldn't compile class");
            mReferences = ret;
        }
        catch (IOException ioe)
        {
            fail ("couldn't write class");
        }
        catch (ParserException pe)
        {
            fail ("couldn't parse w3.org entities list");
        }
    }

    return (ret);
}
/**
 * Decode the named reference <code>&amp;divide;</code> at the start of a string.
 */
public void testInitialCharacterEntityReference ()
{
    assertEquals (
        "character entity reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&divide; is the division sign."));
}
/**
 * Decode the decimal reference <code>&amp;#247;</code> at the start of a string.
 */
public void testInitialNumericCharacterReference1 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#247; is the division sign."));
}
/**
 * Decode the zero padded decimal reference <code>&amp;#0247;</code> at the start of a string.
 */
public void testInitialNumericCharacterReference2 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#0247; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xf7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference1 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#xf7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xF7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference2 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#xF7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0f7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference3 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#x0f7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0F7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference4 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#x0F7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#Xf7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference5 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#Xf7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#XF7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference6 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#XF7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0f7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference7 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#X0f7; is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0F7;</code> at the start of a string.
 */
public void testInitialHexNumericCharacterReference8 ()
{
    assertEquals (
        "numeric character reference at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#X0F7; is the division sign."));
}
/**
 * Decode the named reference <code>&amp;divide</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialCharacterEntityReferenceWithoutSemi ()
{
    assertEquals (
        "character entity reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&divide is the division sign."));
}
/**
 * Decode the decimal reference <code>&amp;#247</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialNumericCharacterReferenceWithoutSemi ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#247 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xf7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi1 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#xf7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xF7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi2 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#xF7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0f7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi3 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#x0f7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0F7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi4 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#x0F7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#Xf7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi5 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#Xf7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#XF7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi6 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#XF7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0f7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi7 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#X0f7 is the division sign."));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0F7</code>, lacking its
 * semicolon, at the start of a string.
 */
public void testInitialHexNumericCharacterReferenceWithoutSemi8 ()
{
    assertEquals (
        "numeric character reference without a semicolon at start of string doesn't work",
        "\u00f7 is the division sign.",
        Translate.decode ("&#X0F7 is the division sign."));
}
/**
 * Decode the named reference <code>&amp;divide;</code> at the end of a string.
 */
public void testFinalCharacterEntityReference ()
{
    assertEquals (
        "character entity reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &divide;"));
}
/**
 * Decode the decimal reference <code>&amp;#247;</code> at the end of a string.
 */
public void testFinalNumericCharacterReference ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#247;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xf7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference1 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#xf7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xF7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference2 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#xF7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0f7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference3 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#x0f7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0F7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference4 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#x0F7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#Xf7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference5 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#Xf7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#XF7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference6 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#XF7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0f7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference7 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#X0f7;"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0F7;</code> at the end of a string.
 */
public void testFinalHexNumericCharacterReference8 ()
{
    assertEquals (
        "numeric character reference at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#X0F7;"));
}
/**
 * Decode the named reference <code>&amp;divide</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalCharacterEntityReferenceWithoutSemi ()
{
    assertEquals (
        "character entity reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &divide"));
}
/**
 * Decode the decimal reference <code>&amp;#247</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalNumericCharacterReferenceWithoutSemi1 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#247"));
}
/**
 * Decode the zero padded decimal reference <code>&amp;#0247</code>,
 * lacking its semicolon, at the end of a string.
 */
public void testFinalNumericCharacterReferenceWithoutSemi2 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#0247"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xf7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi1 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#xf7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#xF7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi2 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#xF7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0f7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi3 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#x0f7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#x0F7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi4 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#x0F7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#Xf7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi5 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#Xf7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#XF7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi6 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#XF7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0f7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi7 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#X0f7"));
}
/**
 * Decode the hexadecimal reference <code>&amp;#X0F7</code>, lacking its
 * semicolon, at the end of a string.
 */
public void testFinalHexNumericCharacterReferenceWithoutSemi8 ()
{
    assertEquals (
        "numeric character reference without a semicolon at end of string doesn't work",
        "The division sign (\u00f7) is \u00f7",
        Translate.decode ("The division sign (\u00f7) is &#X0F7"));
}
/**
 * Decode a named and a decimal reference embedded in the middle of a
 * string that also holds an already decoded division sign.
 */
public void testReferencesInString ()
{
    assertEquals (
        "character references within a string don't work",
        "Thus, the character entity reference \u00f7 is a more convenient form than \u00f7 for obtaining the division sign (\u00f7)",
        Translate.decode ("Thus, the character entity reference &divide; is a more convenient form than &#247; for obtaining the division sign (\u00f7)"));
}
/**
 * An unknown entity name must be left untouched by decoding.
 */
public void testBogusCharacterEntityReference1 ()
{
    final String text = "The character entity reference &divode; is bogus";

    // input and expected output are the same string
    assertEquals ("bogus character entity reference doesn't work", text, Translate.decode (text));
}
/**
 * An ampersand followed by something that is not an entity name must be
 * left untouched by decoding.
 */
public void testBogusCharacterEntityReference2 ()
{
    final String text = "The character entity reference &(divide) is bogus";

    // input and expected output are the same string
    assertEquals ("bogus character entity reference doesn't work", text, Translate.decode (text));
}
/**
 * A decimal reference whose digits are not decimal must be left
 * untouched by decoding.
 */
public void testBogusNumericCharacterReference ()
{
    assertEquals (
        "bogus numeric character reference doesn't work",
        "The numeric character reference &#BF7; is bogus",
        Translate.decode ("The numeric character reference &#BF7; is bogus"));
}
/**
 * A hexadecimal reference whose digits are not hexadecimal must be left
 * untouched by decoding.
 */
public void testBogusHexNumericCharacterReference ()
{
    assertEquals (
        "bogus numeric character reference doesn't work",
        "The numeric character reference &#xKJ7; is bogus",
        Translate.decode ("The numeric character reference &#xKJ7; is bogus"));
}
/**
 * Decode the named reference in <code>&amp;divided</code>, where a
 * letter follows the entity name directly.
 */
public void testPoorlyTerminatedCharacterEntityReference1 ()
{
    assertEquals (
        "poorly terminated character entity reference doesn't work",
        "The character entity reference \u00f7d should be decoded",
        Translate.decode ("The character entity reference &divided should be decoded"));
}
/**
 * Decode the named reference in <code>&amp;divide&lt;br&gt;</code>,
 * where a tag follows the entity name directly.
 */
public void testPoorlyTerminatedCharacterEntityReference2 ()
{
    assertEquals (
        "poorly terminated character entity reference doesn't work",
        "The character entity reference \u00f7<br> should be decoded",
        Translate.decode ("The character entity reference &divide<br> should be decoded"));
}
/**
 * Decode the decimal reference in <code>&amp;#247pop</code>, where
 * letters follow the digits directly.
 */
public void testPoorlyTerminatedNumericCharacterReference1 ()
{
    assertEquals (
        "poorly terminated numeric character reference doesn't work",
        "The numeric character reference \u00f7pop should be decoded",
        Translate.decode ("The numeric character reference &#247pop should be decoded"));
}
/**
 * Decode the decimal reference in <code>&amp;#247&lt;br&gt;</code>,
 * where a tag follows the digits directly.
 */
public void testPoorlyTerminatedNumericCharacterReference2 ()
{
    assertEquals (
        "poorly terminated numeric character reference doesn't work",
        "The numeric character reference \u00f7<br> should be decoded",
        Translate.decode ("The numeric character reference &#247<br> should be decoded"));
}
/**
 * Decode the hexadecimal reference in <code>&amp;#xf7xpert</code>,
 * where a second x ends the run of hexadecimal digits.
 */
public void testPoorlyTerminatedNumericCharacterReference3 ()
{
    assertEquals (
        "poorly terminated numeric character reference doesn't work",
        "The numeric character reference \u00f7xpert should be decoded",
        Translate.decode ("The numeric character reference &#xf7xpert should be decoded"));
}
/**
 * Encode two characters that have entity names (division sign and
 * no-break space) and one that does not (U+2667, decimal 9831), which
 * is expected as a decimal numeric reference.
 */
public void testEncode ()
{
    assertEquals (
        "encode doesn't work",
        "Character entity reference: &divide;, another: &nbsp;, numeric character reference: &#9831;.",
        Translate.encode ("Character entity reference: \u00f7, another: \u00a0, numeric character reference: \u2667."));
}
public void testEncodeLink ()
{
assertEquals (
"encode link doesn't work",
"<a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>",
Translate.encode ("<a href=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"));
}
/**
 * Run bytes through the stream encoder and then through the stream decoder.
 * Both passes write through an ISO-8859-1 print stream.
 * @param bytes The bytes to process.
 * @return The bytes produced by decoding the encoded input.
 * @exception IOException If a stream operation fails.
 */
public byte[] encodedecode (byte[] bytes)
    throws
        IOException
{
    InputStream source;
    ByteArrayOutputStream sink;
    byte[] encoded;

    // first pass: encode
    source = new ByteArrayInputStream (bytes);
    sink = new ByteArrayOutputStream ();
    Translate.encode (source, new PrintStream (sink, false, "ISO-8859-1"));
    source.close ();
    sink.close ();
    encoded = sink.toByteArray ();

    // second pass: decode what was just encoded
    source = new ByteArrayInputStream (encoded);
    sink = new ByteArrayOutputStream ();
    Translate.decode (source, new PrintStream (sink, false, "ISO-8859-1"));
    source.close ();
    sink.close ();

    return (sink.toByteArray ());
}
/**
 * Compare two byte arrays and fail the test at the first difference.
 * A difference in length is reported as a difference against the end of
 * stream value -1.
 * @param reference The expected bytes.
 * @param result The actual bytes.
 * @exception IOException If a stream operation fails.
 */
public void check (byte[] reference, byte[] result)
    throws
        IOException
{
    InputStream ref = new ByteArrayInputStream (reference);
    InputStream in = new ByteArrayInputStream (result);

    for (int offset = 0; ; offset++)
    {
        int expected = ref.read ();
        int actual = in.read ();
        if (expected != actual)
            fail ("byte difference detected at offset " + offset + " expected " + expected + ", actual " + actual);
        // the reference is exhausted, and so is the result or we had failed
        if (-1 == expected)
            break;
    }
    ref.close ();
    in.close ();
}
/**
 * Encode a character without an entity name while hexadecimal encoding
 * is switched on; a hexadecimal numeric reference is expected.
 * The switch is a static flag, so it is always reset afterwards.
 * @exception IOException Declared but not raised by the visible code.
 */
public void testHexNumericEncoding ()
    throws
        IOException
{
    try
    {
        Translate.ENCODE_HEXADECIMAL = true;
        assertEquals (
            "hex value incorrect",
            "&#x5ab; is a non-existant character.",
            Translate.encode ("\u05AB is a non-existant character."));
    }
    finally
    {
        Translate.ENCODE_HEXADECIMAL = false;
    }
}
/**
 * Decode <code>&amp;zwnj</code> when it lacks a semicolon and is
 * followed directly by letters.
 * @exception IOException Declared but not raised by the visible code.
 */
public void testLastCharacterEntityReference ()
    throws
        IOException
{
    final String decoded = Translate.decode ("The character entity reference&zwnjshould be decoded");

    assertEquals (
        "poorly terminated numeric character reference doesn't work",
        "The character entity reference\u200cshould be decoded",
        decoded);
}
/**
 * Fetch a live web page, encode and decode its bytes and check that the
 * round trip reproduces the original bytes.
 * This test needs network access.
 * @exception IOException If the page cannot be fetched or a stream
 * operation fails.
 */
public void testEncodeDecodePage () throws IOException
{
    URL url;
    URLConnection connection;
    InputStream in;
    ByteArrayOutputStream out;
    byte[] buffer;
    byte[] bytes;
    byte[] result;
    int count;

    // get some bytes
    url = new URL ("http://sourceforge.net/projects/htmlparser");
    connection = url.openConnection ();
    in = connection.getInputStream ();
    out = new ByteArrayOutputStream ();
    try
    {
        buffer = new byte[4096];
        while (-1 != (count = in.read (buffer)))
            out.write (buffer, 0, count);
    }
    finally
    {
        // release the connection even when reading fails
        in.close ();
    }
    out.close ();
    bytes = out.toByteArray ();

    // run it through
    result = encodedecode (bytes);

    // check
    check (bytes, result);
}
/**
 * Check all references read in from the w3.org site.
 * Every character in the list is encoded in one go and the result is
 * compared with the concatenation of the matching entity references;
 * the encoded text is then decoded again and compared with the input.
 * If this test fails but the others pass, suspect that the list of
 * entity references has been augmented. The updated list is in the
 * CharacterEntityReferenceList.java file in your home directory.
 */
public void testEncodeDecodeAll ()
{
    CharacterReference[] references = getReferences ();
    StringBuffer raw = new StringBuffer ();
    StringBuffer cooked = new StringBuffer ();

    for (int i = 0; i < references.length; i++)
    {
        raw.append ((char)references[i].getCharacter ());
        cooked.append ("&");
        cooked.append (references[i].getKernel ());
        cooked.append (";");
    }

    final String plain = raw.toString ();
    final String expected = cooked.toString ();

    final String encoded = Translate.encode (plain);
    if (!encoded.equals (expected))
        fail ("encoding incorrect, expected \n\"" + expected + "\", encoded \n\"" + encoded + "\"");

    final String decoded = Translate.decode (encoded);
    if (!decoded.equals (plain))
        fail ("decoding incorrect, expected \n\"" + plain + "\", decoded \n\"" + decoded + "\", encoded \n\"" + expected + "\"");
}
public void testEncodeDecodeRandom ()
{
Random random;
CharacterReference[] list;
StringBuffer stimulus;
StringBuffer response;
char character;
CharacterReference ref;
String string;
random = new Random ();
list = getReferences ();
stimulus = new StringBuffer ();
response = new StringBuffer ();
for (int i = 0; i < 1000; i++)
{
for (int j = 0; j < 10; j++)
{
// some random characters
for (int k = 0; k < 10; k++)
{
character = (char)random.nextInt (127);
if (character >= ' ')
{
if ('&' == character)
{
stimulus.append (character);
response.append ("&");
}
else if ('"' == character)
{
stimulus.append (character);
response.append (""");
}
else if ('<' == character)
{
stimulus.append (character);
response.append ("<");
}
else if ('>' == character)
{
stimulus.append (character);
response.append (">");
}
else
{
stimulus.append (character);
response.append (character);
}
}
}
ref = list[random.nextInt (list.length)];
stimulus.append ((char)ref.getCharacter ());
response.append ("&");
response.append (ref.getKernel ());
response.append (";");
// some more random characters
for (int k = 0; k < 10; k++)
{
character = (char)random.nextInt (127);
if (character >= ' ')
{
if ('&' == character)
{
stimulus.append (character);
response.append ("&");
}
else if ('"' == character)
{
stimulus.append (character);
response.append (""");
}
else if ('<' == character)
{
stimulus.append (character);
response.append ("<");
}
else if ('>' == character)
{
stimulus.append (character);
response.append (">");
}
else
{
stimulus.append (character);
response.append (character);
}
}
}
}
string = Translate.encode (stimulus.toString ());
if (!string.equals (response.toString ()))
fail ("encoding incorrect, expected \n\"" + response.toString () + "\", encoded \n\"" + string + "\"");
string = Translate.decode (string);
if (!string.equals (stimulus.toString ()))
fail ("decoding incorrect, expected \n\"" + stimulus.toString () + "\", decoded \n\"" + string + "\", encoded \n\"" + response.toString () + "\"");
stimulus.setLength (0);
response.setLength (0);
}
}
public void testEncodeDecodeRandomNoSemi ()
{
Random random;
CharacterReference[] list;
StringBuffer stimulus;
StringBuffer response;
char character;
int index;
CharacterReference ref;
String kernel;
ArrayList forbidden;
String string;
random = new Random ();
list = getReferences ();
stimulus = new StringBuffer ();
response = new StringBuffer ();
for (int i = 0; i < 1000; i++)
{
for (int j = 0; j < 10; j++)
{
// some random characters
for (int k = 0; k < 10; k++)
{
character = (char)random.nextInt (127);
if (character >= ' ')
{
if ('&' == character)
{
stimulus.append (character);
response.append ("&");
}
else if ('"' == character)
{
stimulus.append (character);
response.append (""");
}
else if ('<' == character)
{
stimulus.append (character);
response.append ("<");
}
else if ('>' == character)
{
stimulus.append (character);
response.append (">");
}
else
{
stimulus.append (character);
response.append (character);
}
}
}
index = random.nextInt (list.length);
ref = list[index];
kernel = ref.getKernel ();
stimulus.append ((char)ref.getCharacter ());
response.append ("&");
response.append (kernel);
// to be fair, we ensure that the next character isn't valid
// for a different reference, i.e. &sup shouldn't be followed
// by a 1, 2, 3 or e
forbidden = new ArrayList ();
for (int k = index + 1; k < list.length; k++)
if (list[k].getKernel ().regionMatches (
0,
kernel,
0,
kernel.length ()))
forbidden.add (new Character (list[k].getKernel ().charAt (kernel.length ())));
else
break;
do
{
character = (char)random.nextInt (127);
if ( (' ' <= character)
&& ('&' != character)
&& ('"' != character)
&& ('<' != character)
&& ('>' != character)
&& (';' != character)
&& !(forbidden.contains (new Character (character))))
{
stimulus.append (character);
response.append (character);
character = 0;
}
else
character = ' ';
}
while (0 != character);
// some more random characters
for (int k = 0; k < 10; k++)
{
character = (char)random.nextInt (127);
if (character >= ' ')
{
if ('&' == character)
{
stimulus.append (character);
response.append ("&");
}
else if ('"' == character)
{
stimulus.append (character);
response.append (""");
}
else if ('<' == character)
{
stimulus.append (character);
response.append ("<");
}
else if ('>' == character)
{
stimulus.append (character);
response.append (">");
}
else
{
stimulus.append (character);
response.append (character);
}
}
}
}
string = Translate.decode (response.toString ());
if (!string.equals (stimulus.toString ()))
fail ("decoding incorrect:\nexpected \"" + stimulus.toString () + "\"\n decoded \"" + string + "\"\n encoded \"" + response.toString () + "\"");
stimulus.setLength (0);
response.setLength (0);
}
}
}