Clean.java example

Explorer
tizzit-master
/**
 * Copyright (c) 2009 Juwi MacMillan Group GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * @(#)Clean.java   1.11 2000/08/16
 *
 */

package org.tizzit.util.tidy;

/**
 *
 * Clean up misuse of presentation markup
 *
 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
 * See Tidy.java for the copyright notice.
 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
 * HTML Tidy Release 4 Aug 2000</a>
 *
 * @author  Dave Raggett <dsr@w3.org>
 * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
 * @version 1.0, 1999/05/22
 * @version 1.0.1, 1999/05/29
 * @version 1.1, 1999/06/18 Java Bean
 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
 * @version 1.4, 1999/09/04 DOM support
 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
 */

/*
 Filters from other formats such as Microsoft Word
 often make excessive use of presentation markup such
 as font tags, B, I, and the align attribute. By applying
 a set of production rules, it is straightforward to
 transform this to use CSS.

 Some rules replace some of the children of an element by
 style properties on the element, e.g.

 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>

 Such rules are applied to the element's content and then
 to the element itself until no more rules apply.
 Having applied all the rules to an element, it will have
 a style attribute with one or more properties. 

 Other rules strip the element they apply to, replacing
 it by style properties on the contents, e.g.
 
 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
 
 These rules are applied to an element before processing
 its content and replace the current element by the first
 element in the exposed content.

 After applying both sets of rules, you can replace the
 style attribute by a class value and style rule in the
 document head. To support this, an association of styles
 and class names is built.

 A naive approach is to rely on string matching to test
 when two property lists are the same. A better approach
 would be to first sort the properties before matching.
 */

public class Clean {

	/* counter appended to "c" when a new CSS class name is generated */
	private int classNum = 1;

	/* tag table whose tag definitions the clean-up rules compare node tags against */
	private TagTable tt;

	/**
	 * Creates a cleaner that uses the given tag table.
	 *
	 * @param tt tag table consulted when testing and retagging nodes
	 */
	public Clean(TagTable tt) {
		this.tt = tt;
	}

	/**
	 * Inserts a name/value pair into a property list that is kept sorted by name.
	 * When a property with the same name is already present the list is returned
	 * unchanged and the new value is dropped.
	 *
	 * @param props head of the sorted list, may be null
	 * @param name  property name to insert
	 * @param value property value to insert
	 * @return head of the resulting list
	 */
	private StyleProp insertProperty(StyleProp props, String name, String value) {
		StyleProp before = null;
		StyleProp cursor = props;

		/* advance to the first entry that does not sort before the new name */
		while (cursor != null) {
			int order = cursor.name.compareTo(name);

			if (order == 0) {
				/* already defined: the existing value wins */
				return props;
			}

			if (order > 0) {
				break;
			}

			before = cursor;
			cursor = cursor.next;
		}

		/* link in front of cursor, or append when the end of the list was reached */
		StyleProp added = (cursor != null) ? new StyleProp(name, value, cursor) : new StyleProp(name, value);

		if (before == null) {
			return added;
		}

		before.next = added;
		return props;
	}

	/*
	 Create sorted linked list of properties from style string.
	 The string is scanned for "name: value" pairs separated by ';'
	 and each pair is inserted into the list in name order.
	 (The original C code temporarily wrote nulls over ':' and ';'
	 in a copy of the string; the Java port uses substrings instead.)
	 */
	/**
	 * Parses a CSS declaration string ("name: value; name: value") and inserts
	 * each declaration into the given sorted property list.
	 *
	 * Leading spaces before a name and before a value are skipped. Parsing stops
	 * at the first segment that has no ':'; a value runs up to the next ';' or
	 * the end of the string.
	 *
	 * @param prop  head of the list to add to, may be null
	 * @param style declaration string; null (a style attribute without a value)
	 *              is treated like an empty string
	 * @return head of the resulting list
	 */
	private StyleProp createProps(StyleProp prop, String style) {
		/* a valueless attribute yields a null string: nothing to parse */
		if (style == null) {
			return prop;
		}

		final int length = style.length();
		int nameStart = 0;

		while (nameStart < length) {
			while (nameStart < length && style.charAt(nameStart) == ' ') {
				++nameStart;
			}

			int colon = style.indexOf(':', nameStart);

			if (colon < 0) {
				/* no further "name:" segment */
				break;
			}

			int valueStart = colon + 1;

			while (valueStart < length && style.charAt(valueStart) == ' ') {
				++valueStart;
			}

			int semicolon = style.indexOf(';', valueStart);
			int valueEnd = (semicolon < 0) ? length : semicolon;

			prop = insertProperty(prop, style.substring(nameStart, colon), style.substring(valueStart, valueEnd));

			if (semicolon < 0) {
				/* last declaration had no trailing ';' */
				break;
			}

			nameStart = semicolon + 1;
		}

		return prop;
	}

	/**
	 * Serializes a property list as "name: value; name: value".
	 *
	 * @param props head of the property list, may be null
	 * @return the declaration string, empty when the list is empty
	 */
	private String createPropString(StyleProp props) {
		StringBuilder style = new StringBuilder();

		for (StyleProp prop = props; prop != null; prop = prop.next) {
			style.append(prop.name).append(": ").append(prop.value);

			/* separator only between declarations, never after the last one */
			if (prop.next != null) {
				style.append("; ");
			}
		}

		return style.toString();
	}

	/*
	 create string with merged properties
	 */
	/**
	 * Returns the declaration string obtained by parsing style and then
	 * property into one sorted property list.
	 *
	 * @param style    existing declarations (parsed first)
	 * @param property declarations to add
	 * @return the combined declaration string
	 */
	private String addProperty(String style, String property) {
		StyleProp combined = createProps(createProps(null, style), property);
		return createPropString(combined);
	}

	/**
	 * Generates the next class name ("c" followed by the running counter)
	 * and advances the counter.
	 *
	 * @param tag element name the class is generated for (not used in the name)
	 * @return the generated class name
	 */
	private String gensymClass(String tag) {
		return "c" + classNum++;
	}

	/**
	 * Looks up the class name registered in lexer.styles for a tag/properties
	 * pair, registering a newly generated class at the head of the list when
	 * no entry matches.
	 *
	 * @param lexer      lexer holding the style list
	 * @param tag        element name
	 * @param properties declaration string, compared by exact string equality
	 * @return the class name associated with the pair
	 */
	private String findStyle(Lexer lexer, String tag, String properties) {
		Style known = lexer.styles;

		while (known != null) {
			boolean sameTag = known.tag.equals(tag);

			if (sameTag && known.properties.equals(properties)) {
				return known.tagClass;
			}

			known = known.next;
		}

		Style created = new Style(tag, gensymClass(tag), properties, lexer.styles);
		lexer.styles = created;
		return created.tagClass;
	}

	/*
	 Find style attribute in node, and replace it
	 by corresponding class attribute. Search for
	 class in style dictionary otherwise gensym
	 new class and add to dictionary.

	 If the node already has a class attribute, the new class
	 name is appended to it and the style attribute is removed.
	 */
	/**
	 * Replaces the node's style attribute by a class reference obtained from
	 * findStyle. Nodes without a style attribute are left alone.
	 *
	 * @param lexer lexer holding the style list
	 * @param node  node to convert
	 */
	private void style2Rule(Lexer lexer, Node node) {
		AttVal styleAttr = node.getAttrByName("style");

		if (styleAttr == null) {
			return;
		}

		String className = findStyle(lexer, node.element, styleAttr.value);
		AttVal classAttr = node.getAttrByName("class");

		if (classAttr == null) {
			/* no class yet: turn the style attribute itself into the class attribute */
			styleAttr.attribute = "class";
			styleAttr.value = className;
			return;
		}

		/* existing class: append the new name after a space and drop the style attribute */
		classAttr.value = classAttr.value + " " + className;
		node.removeAttribute(styleAttr);
	}

	/**
	 * Appends the rule "selector { color: color }" to the lexer buffer.
	 * Does nothing when color is null.
	 *
	 * @param lexer    lexer whose buffer receives the text
	 * @param selector CSS selector text
	 * @param color    color value, may be null
	 */
	private void addColorRule(Lexer lexer, String selector, String color) {
		if (color == null) {
			return;
		}

		lexer.addStringLiteral(selector);
		lexer.addStringLiteral(" { color: ");
		lexer.addStringLiteral(color);
		lexer.addStringLiteral(" }\n");
	}

	/*
	 move presentation attribs from body to style element

	 background="foo" ->  body { background-image: url(foo) }
	 bgcolor="foo"    ->  body { background-color: foo }
	 text="foo"       ->  body { color: foo }
	 link="foo"       ->  :link { color: foo }
	 vlink="foo"      ->  :visited { color: foo }
	 alink="foo"      ->  :active { color: foo }
	 */
	/**
	 * Removes the presentational attributes background, bgcolor, text, link,
	 * vlink and alink from the body node and appends equivalent CSS rules to
	 * the lexer buffer.
	 *
	 * @param lexer lexer whose buffer receives the CSS text
	 * @param body  the body node
	 */
	private void cleanBodyAttrs(Lexer lexer, Node body) {
		String imageUrl = null;
		String backColor = null;
		String textColor = null;

		AttVal found = body.getAttrByName("background");

		if (found != null) {
			imageUrl = found.value;
			found.value = null;
			body.removeAttribute(found);
		}

		found = body.getAttrByName("bgcolor");

		if (found != null) {
			backColor = found.value;
			found.value = null;
			body.removeAttribute(found);
		}

		found = body.getAttrByName("text");

		if (found != null) {
			textColor = found.value;
			found.value = null;
			body.removeAttribute(found);
		}

		/* a single body rule collects whichever of the three values were present */
		boolean needsBodyRule = imageUrl != null || backColor != null || textColor != null;

		if (needsBodyRule) {
			lexer.addStringLiteral(" body {\n");

			if (imageUrl != null) {
				lexer.addStringLiteral("  background-image: url(");
				lexer.addStringLiteral(imageUrl);
				lexer.addStringLiteral(");\n");
			}

			if (backColor != null) {
				lexer.addStringLiteral("  background-color: ");
				lexer.addStringLiteral(backColor);
				lexer.addStringLiteral(";\n");
			}

			if (textColor != null) {
				lexer.addStringLiteral("  color: ");
				lexer.addStringLiteral(textColor);
				lexer.addStringLiteral(";\n");
			}

			lexer.addStringLiteral(" }\n");
		}

		/* link colours: attribute name paired with the selector it maps to */
		String[][] linkRules = { { "link", " :link" }, { "vlink", " :visited" }, { "alink", " :active" } };

		for (int i = 0; i < linkRules.length; ++i) {
			found = body.getAttrByName(linkRules[i][0]);

			if (found != null) {
				addColorRule(lexer, linkRules[i][1], found.value);
				body.removeAttribute(found);
			}
		}
	}

	/**
	 * Reports whether the document body is free of the presentational
	 * attributes background, bgcolor, text, link, vlink and alink. When one is
	 * found, Report.USING_BODY is set in lexer.badLayout.
	 *
	 * @param lexer lexer providing the tag table and receiving the layout flag
	 * @param doc   document root
	 * @return false when the body carries one of the attributes, true otherwise
	 *         (including when no body is found)
	 */
	private boolean niceBody(Lexer lexer, Node doc) {
		Node body = doc.findBody(lexer.configuration.tt);

		if (body == null) {
			return true;
		}

		String[] presentational = { "background", "bgcolor", "text", "link", "vlink", "alink" };

		for (int i = 0; i < presentational.length; ++i) {
			if (body.getAttrByName(presentational[i]) != null) {
				lexer.badLayout |= Report.USING_BODY;
				return false;
			}
		}

		return true;
	}

	/* create style element using rules from dictionary */
	/**
	 * Builds a style element (type="text/css") whose text holds the rules
	 * produced from the body attributes and from lexer.styles, and appends it
	 * to the document head.
	 *
	 * @param lexer lexer whose buffer receives the rule text
	 * @param doc   document root
	 */
	private void createStyleElement(Lexer lexer, Node doc) {
		Node node, head, body;
		Style style;
		AttVal av;

		/* nothing to emit: no collected styles and no presentational body attributes */
		if (lexer.styles == null && niceBody(lexer, doc)) return;

		node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
		node.implicit = true;

		/* insert type attribute */
		av = new AttVal(null, null, '"', "type", "text/css");
		av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
		node.attributes = av;

		body = doc.findBody(lexer.configuration.tt);

		/* the rule text starts at the current end of the lexer buffer */
		lexer.txtstart = lexer.lexsize;

		if (body != null) cleanBodyAttrs(lexer, body);

		/* one rule per collected style: " tag.class {properties}\n" */
		for (style = lexer.styles; style != null; style = style.next) {
			lexer.addCharToLexer(' ');
			lexer.addStringLiteral(style.tag);
			lexer.addCharToLexer('.');
			lexer.addStringLiteral(style.tagClass);
			lexer.addCharToLexer(' ');
			lexer.addCharToLexer('{');
			lexer.addStringLiteral(style.properties);
			lexer.addCharToLexer('}');
			lexer.addCharToLexer('\n');
		}

		lexer.txtend = lexer.lexsize;

		/* wrap the text just written into a text node inside the style element */
		Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend));

		/*
		 now insert style element into document head

		 doc is root node. search its children for html node
		 the head node should be first child of html node
		 */

		head = doc.findHEAD(lexer.configuration.tt);

		/* NOTE(review): when no head is found the style element is not inserted anywhere */
		if (head != null) Node.insertNodeAtEnd(head, node);
	}

	/* ensure bidirectional links are consistent */
	/**
	 * Makes the neighbours, parent and children of a node point back at it,
	 * based on the node's own prev, next, parent and content fields.
	 *
	 * @param node node whose surrounding links are repaired
	 */
	private void fixNodeLinks(Node node) {
		Node before = node.prev;
		Node after = node.next;

		/* without a predecessor the node is the parent's first child */
		if (before == null) {
			node.parent.content = node;
		} else {
			before.next = node;
		}

		/* without a successor the node is the parent's last child */
		if (after == null) {
			node.parent.last = node;
		} else {
			after.prev = node;
		}

		Node kid = node.content;

		while (kid != null) {
			kid.parent = node;
			kid = kid.next;
		}
	}

	/*
	 used to strip child of node when
	 the node has one and only one child
	 */
	/**
	 * Replaces the single child of a node by that child's own children.
	 *
	 * @param node node whose content is exactly one child
	 */
	private void stripOnlyChild(Node node) {
		Node only = node.content;
		Node firstGrandchild = only.content;

		/* adopt the grandchildren and detach them from the stripped child */
		node.content = firstGrandchild;
		node.last = only.last;
		only.content = null;

		for (Node adopted = firstGrandchild; adopted != null; adopted = adopted.next) {
			adopted.parent = node;
		}
	}

	/* used to strip font start and end tags */
	/**
	 * Removes an element from the tree while keeping its children, which are
	 * spliced into the parent's child list at the element's position.
	 *
	 * @param element element to remove; its parent must not be null
	 * @param pnode   receives the node to continue with: the element's first
	 *                child when it had content, otherwise the element's next
	 *                sibling (which may be null)
	 */
	private void discardContainer(Node element, MutableObject pnode) {
		Node node;
		Node parent = element.parent;

		if (element.content != null) {
			/* chain the last child to whatever followed the element */
			element.last.next = element.next;

			if (element.next != null) {
				element.next.prev = element.last;
				element.last.next = element.next;
			} else
				parent.last = element.last;

			/* chain the first child to whatever preceded the element */
			if (element.prev != null) {
				element.content.prev = element.prev;
				element.prev.next = element.content;
			} else
				parent.content = element.content;

			/*
			 re-parent the children; because the last child is already linked to
			 the element's former next sibling, this walk also passes over the
			 following siblings and assigns them the parent they already have
			 */
			for (node = element.content; node != null; node = node.next)
				node.parent = parent;

			pnode.setObject(element.content);
		} else {
			/* empty element: simply unlink it from its siblings */
			if (element.next != null)
				element.next.prev = element.prev;
			else
				parent.last = element.prev;

			if (element.prev != null)
				element.prev.next = element.next;
			else
				parent.content = element.next;

			pnode.setObject(element.next);
		}

		/* detach the discarded element from its former siblings and children */
		element.next = null;
		element.content = null;
	}

	/*
	 Add style property to element, creating style
	 attribute as needed and adding ; delimiter
	 */
	/**
	 * Adds a CSS declaration to a node: merged into the existing style
	 * attribute when there is one, otherwise placed in a new style attribute
	 * at the head of the attribute list.
	 *
	 * @param node     node to style
	 * @param property declaration such as "font-weight: bold"
	 */
	private void addStyleProperty(Node node, String property) {
		AttVal existing = node.attributes;

		while (existing != null && !existing.attribute.equals("style")) {
			existing = existing.next;
		}

		if (existing == null) {
			AttVal created = new AttVal(node.attributes, null, '"', "style", property);
			created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
			node.attributes = created;
			return;
		}

		existing.value = addProperty(existing.value, property);
	}

	/*
	 Create new string that consists of the
	 combined style properties in s1 and s2

	 To merge property lists, we build a linked
	 list of property/values and insert properties
	 into the list in name order. When the same property
	 name occurs more than once, the first value seen is kept.
	 */
	/**
	 * Combines two declaration strings into one, parsing s1 before s2.
	 *
	 * @param s1 first declaration string
	 * @param s2 second declaration string
	 * @return the combined declaration string
	 */
	private String mergeProperties(String s1, String s2) {
		StyleProp combined = createProps(createProps(null, s1), s2);
		return createPropString(combined);
	}

	/**
	 * Folds the child's style attribute value into the node: merged with the
	 * node's own style value when it has one, otherwise copied into a new
	 * style attribute on the node. Nothing happens when the child has no
	 * style value.
	 *
	 * @param node  node receiving the style
	 * @param child node whose style is taken
	 */
	private void mergeStyles(Node node, Node child) {
		String childStyle = null;

		for (AttVal a = child.attributes; a != null; a = a.next) {
			if (a.attribute.equals("style")) {
				childStyle = a.value;
				break;
			}
		}

		AttVal nodeStyleAttr = null;

		for (AttVal a = node.attributes; a != null; a = a.next) {
			if (a.attribute.equals("style")) {
				nodeStyleAttr = a;
				break;
			}
		}

		String nodeStyle = (nodeStyleAttr == null) ? null : nodeStyleAttr.value;

		if (childStyle == null) {
			return;
		}

		if (nodeStyle != null) {
			/* merge styles from both */
			nodeStyleAttr.value = mergeProperties(nodeStyle, childStyle);
		} else {
			/* copy style of child */
			AttVal created = new AttVal(node.attributes, null, '"', "style", childStyle);
			created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
			node.attributes = created;
		}
	}

	/**
	 * Maps an HTML font size value to a CSS font-size value.
	 *
	 * A leading digit 0-6 selects an entry from a fixed table (3 maps to null).
	 * A leading '-' followed by a digit 0-6 yields a percentage scaled down by
	 * 0.8 per step; any other string whose second character is a digit 0-6
	 * yields a percentage scaled up by 1.2 per step. Without a usable digit
	 * the result is "smaller" after '-' and "larger" otherwise.
	 *
	 * @param size the size attribute value
	 * @return the CSS value, or null for size 3
	 */
	private String fontSize2Name(String size) {
		final String[] absolute = { "60%", "70%", "80%", null, "120%", "150%", "200%"};
		final int length = size.length();
		final char lead = (length > 0) ? size.charAt(0) : '\0';

		if (lead >= '0' && lead <= '6') {
			return absolute[lead - '0'];
		}

		final boolean shrinking = (lead == '-');
		final boolean hasStep = length > 1 && size.charAt(1) >= '0' && size.charAt(1) <= '6';

		if (!hasStep) {
			return shrinking ? "smaller" : "larger";
		}

		/* relative size: multiply the factor in once per step, then express as a percentage */
		final double factor = shrinking ? 0.8 : 1.2;
		double scale = 1.0;

		for (int steps = size.charAt(1) - '0'; steps > 0; --steps) {
			scale *= factor;
		}

		scale *= 100.0;
		return "" + (int) scale + "%";
	}

	/**
	 * Adds a font-family declaration for the given face to the node's style.
	 *
	 * @param node node to style
	 * @param face value of the face attribute
	 */
	private void addFontFace(Node node, String face) {
		String declaration = "font-family: " + face;
		addStyleProperty(node, declaration);
	}

	/**
	 * Applies a font size to a node. A paragraph with size 6, 5 or 4 is
	 * retagged as h1, h2 or h3; in every other case the size is translated by
	 * fontSize2Name and, unless that yields null, added as a font-size
	 * declaration.
	 *
	 * @param node node to style
	 * @param size value of the size attribute
	 */
	private void addFontSize(Node node, String size) {
		String heading = null;

		if (size.equals("6")) {
			heading = "h1";
		} else if (size.equals("5")) {
			heading = "h2";
		} else if (size.equals("4")) {
			heading = "h3";
		}

		if (heading != null && node.tag == tt.tagP) {
			/* rename the element and let the tag table re-resolve the node's tag */
			node.element = heading;
			tt.findTag(node);
			return;
		}

		String cssSize = fontSize2Name(size);

		if (cssSize != null) {
			addStyleProperty(node, "font-size: " + cssSize);
		}
	}

	/**
	 * Adds a color declaration for the given value to the node's style.
	 *
	 * @param node  node to style
	 * @param color value of the color attribute
	 */
	private void addFontColor(Node node, String color) {
		String declaration = "color: " + color;
		addStyleProperty(node, declaration);
	}

	/**
	 * Adds a text-align declaration to the node's style, with the alignment
	 * value forced to lower case.
	 *
	 * The lower-casing uses a fixed locale: with the default locale a value
	 * such as "RIGHT" would become "r\u0131ght" (dotless i) on a Turkish
	 * system, which is not a valid CSS keyword.
	 *
	 * @param node  node to style
	 * @param align value of the align attribute
	 */
	private void addAlign(Node node, String align) {
		addStyleProperty(node, "text-align: " + align.toLowerCase(java.util.Locale.ENGLISH));
	}

	/*
	 add style properties to node corresponding to
	 the font face, size and color attributes
	 */
	/**
	 * Walks an attribute list and adds the style declarations corresponding to
	 * the face, size and color attributes to the node.
	 *
	 * Attributes without a value are skipped (as textAlign does for align):
	 * a null size would otherwise throw in addFontSize, and a null face or
	 * color would produce the literal text "null" in the style.
	 *
	 * @param node node to style
	 * @param av   head of the attribute list to read
	 */
	private void addFontStyles(Node node, AttVal av) {
		for (; av != null; av = av.next) {
			if (av.value == null) {
				continue;
			}

			if (av.attribute.equals("face")) {
				addFontFace(node, av.value);
			} else if (av.attribute.equals("size")) {
				addFontSize(node, av.value);
			} else if (av.attribute.equals("color")) {
				addFontColor(node, av.value);
			}
		}
	}

	/*
	 Symptom: <p align=center>
	 Action: <p style="text-align: center">
	 */
	/**
	 * Removes the first align attribute from the node and, when it has a
	 * value, adds the matching text-align declaration to the node's style.
	 *
	 * @param lexer not used
	 * @param node  node to convert
	 */
	private void textAlign(Lexer lexer, Node node) {
		AttVal before = null;
		AttVal current = node.attributes;

		while (current != null && !current.attribute.equals("align")) {
			before = current;
			current = current.next;
		}

		if (current == null) {
			return;
		}

		/* unlink the align attribute before the style attribute is touched */
		if (before == null) {
			node.attributes = current.next;
		} else {
			before.next = current.next;
		}

		if (current.value != null) {
			addAlign(node, current.value);
		}
	}

	/*
	 The clean up rules use the pnode argument to return the
	 next node when the original node has been deleted
	 */

	/*
	 Symptom: <dir> <li> where <li> is only child
	 Action: coerce <dir> <li> to <div> with indent.
	 */

	/**
	 * Turns a dir, ul or ol whose only child is an implicit li into a div with
	 * a "margin-left: 2em" style, dropping the li.
	 *
	 * @param lexer not used
	 * @param node  candidate list node
	 * @param pnode not used
	 * @return true when the node was converted
	 */
	private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
		boolean isList = node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl;

		if (!isList) {
			return false;
		}

		Node only = node.content;

		/* need exactly one child, and it must be an li that was not in the source */
		if (only == null || only.next != null || only.tag != tt.tagLi || !only.implicit) {
			return false;
		}

		/* coerce dir to div */
		node.tag = tt.tagDiv;
		node.element = "div";
		addStyleProperty(node, "margin-left: 2em");
		stripOnlyChild(node);
		return true;
	}

	/*
	 Symptom: <center>
	 Action: replace <center> by <div style="text-align: center">
	 */

	/**
	 * Handles a center element. With DropFontTags set the element is discarded
	 * (its children stay in place) and an inferred br is linked in after the
	 * former content, or in the element's position when it was empty. Otherwise
	 * the element is retagged as a div with "text-align: center".
	 *
	 * @param lexer lexer providing the configuration and the inferred br node
	 * @param node  candidate node
	 * @param pnode set by discardContainer when the element is discarded
	 * @return true when the node was a center element
	 */
	private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
		if (node.tag == tt.tagCenter) {
			if (lexer.configuration.DropFontTags) {
				if (node.content != null) {
					/* remember the last child: the br goes right after it */
					Node last = node.last;
					Node parent = node.parent;

					discardContainer(node, pnode);

					node = lexer.inferredTag("br");

					/* splice the br between last and whatever now follows it */
					if (last.next != null) last.next.prev = node;

					node.next = last.next;
					last.next = node;
					node.prev = last;

					if (parent.last == last) parent.last = node;

					node.parent = parent;
				} else {
					/* empty center: capture the neighbours before the element is unlinked */
					Node prev = node.prev;
					Node next = node.next;
					Node parent = node.parent;
					discardContainer(node, pnode);

					/* put the br where the center element used to be */
					node = lexer.inferredTag("br");
					node.next = next;
					node.prev = prev;
					node.parent = parent;

					if (next != null)
						next.prev = node;
					else
						parent.last = node;

					if (prev != null)
						prev.next = node;
					else
						parent.content = node;
				}

				return true;
			}
			node.tag = tt.tagDiv;
			node.element = "div";
			addStyleProperty(node, "text-align: center");
			return true;
		}

		return false;
	}

	/*
	 Symptom <div><div>...</div></div>
	 Action: merge the two divs

	 This is useful after nested <dir>s used by Word
	 for indenting have been converted to <div>s
	 */
	/**
	 * Merges a div with its only child when that child is also a div: the
	 * child's style is folded into the parent and the child is stripped.
	 *
	 * @param lexer not used
	 * @param node  candidate outer div
	 * @param pnode not used
	 * @return true when the divs were merged
	 */
	private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
		if (node.tag != tt.tagDiv) {
			return false;
		}

		Node inner = node.content;

		if (inner == null || inner.tag != tt.tagDiv || inner.next != null) {
			return false;
		}

		mergeStyles(node, inner);
		stripOnlyChild(node);
		return true;
	}

	/*
	 Symptom: <ul><li><ul>...</ul></li></ul>
	 Action: discard outer list
	 */

	/**
	 * Collapses a ul/ol whose only child contains, as its first child, a list
	 * of the same tag: the inner list takes the place of the outer one. When
	 * the node before it is also a ul/ol, the inner list is moved to the end
	 * of that list's last child instead.
	 *
	 * @param lexer lexer passed on to cleanNode
	 * @param node  candidate outer list
	 * @param pnode receives the outer list's next sibling (may be null)
	 * @return true when the outer list was discarded
	 */
	private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
		Node child, list;

		if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
			child = node.content;

			if (child == null) return false;

			/* check child has no peers */

			if (child.next != null) return false;

			list = child.content;

			if (list == null) return false;

			/* the nested list must be of the same kind as the outer one */
			if (list.tag != node.tag) return false;

			pnode.setObject(node.next);

			/* move inner list node into position of outer node */
			list.prev = node.prev;
			list.next = node.next;
			list.parent = node.parent;
			fixNodeLinks(list);

			/* get rid of outer ul and its li */
			child.content = null;
			node.content = null;
			node.next = null;

			/*
			 If prev node was a list the chances are this node
			 should be appended to that list. Word has no way of
			 recognizing nested lists and just uses indents
			 */

			if (list.prev != null) {
				/* from here on node is the inner list and list is its predecessor */
				node = list;
				list = node.prev;

				if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
					/* unlink the inner list from the sibling chain */
					list.next = node.next;

					if (list.next != null) list.next.prev = list;

					/* NOTE(review): assumes the preceding list has at least one child */
					child = list.last; /* <li> */

					/* append the inner list as the last child of that <li> */
					node.parent = child;
					node.next = null;
					node.prev = child.last;
					fixNodeLinks(node);
				}
			}

			/* the value returned by cleanNode is not used here */
			cleanNode(lexer, node);
			return true;
		}

		return false;
	}

	/*
	 Symptom: the only child of a block-level element is a
	 presentation element such as B, I or FONT

	 Action: add style "font-weight: bold" to the block and
	 strip the <b> element, leaving its children.

	 example:

	 <p>
	 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
	 </p>

	 becomes:

	 <p style="font-weight: bold; font-family: Arial; font-size: 6">
	 Draft Recommended Practice
	 </p>

	 This code also replaces the align attribute by a style attribute.
	 However, to avoid CSS problems with Navigator 4, this isn't done
	 for the elements: caption, tr and table
	 */
	/**
	 * For nodes whose tag model is block, list, deflist or table (table, tr
	 * and li excluded): converts the align attribute to a style (not for
	 * caption) and, when the only child is b, i or font, folds that child's
	 * presentation into the node's style and strips the child.
	 *
	 * @param lexer passed on to textAlign
	 * @param node  candidate node
	 * @param pnode not used
	 * @return true when a child was stripped
	 */
	private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
		final int blockModels = Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE;

		if ((node.tag.model & blockModels) == 0) {
			return false;
		}

		if (node.tag == tt.tagTable || node.tag == tt.tagTr || node.tag == tt.tagLi) {
			return false;
		}

		/* check for align attribute */
		if (node.tag != tt.tagCaption) {
			textAlign(lexer, node);
		}

		Node only = node.content;

		/* the rule needs exactly one child */
		if (only == null || only.next != null) {
			return false;
		}

		String declaration = null;

		if (only.tag == tt.tagB) {
			declaration = "font-weight: bold";
		} else if (only.tag == tt.tagI) {
			declaration = "font-style: italic";
		} else if (only.tag != tt.tagFont) {
			return false;
		}

		mergeStyles(node, only);

		if (declaration != null) {
			addStyleProperty(node, declaration);
		} else {
			/* font child: translate its face/size/color attributes */
			addFontStyles(node, only.attributes);
		}

		stripOnlyChild(node);
		return true;
	}

	/* the only child of table cell or an inline element such as em */
	/**
	 * For non-font nodes whose tag model is inline or row: when the only child
	 * is font, or is b/i while LogicalEmphasis is configured, folds that
	 * child's presentation into the node's style and strips the child.
	 *
	 * @param lexer lexer providing the configuration
	 * @param node  candidate node
	 * @param pnode not used
	 * @return true when a child was stripped
	 */
	private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
		if (node.tag == tt.tagFont || (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) == 0) {
			return false;
		}

		Node only = node.content;

		/* the rule needs exactly one child */
		if (only == null || only.next != null) {
			return false;
		}

		String declaration = null;

		if (only.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
			declaration = "font-weight: bold";
		} else if (only.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
			declaration = "font-style: italic";
		} else if (only.tag != tt.tagFont) {
			return false;
		}

		mergeStyles(node, only);

		if (declaration != null) {
			addStyleProperty(node, declaration);
		} else {
			/* font child: translate its face/size/color attributes */
			addFontStyles(node, only.attributes);
		}

		stripOnlyChild(node);
		return true;
	}

	/*
	 Replace font elements by span elements, deleting
	 the font element's attributes and replacing them
	 by a single style attribute.
	 */
	/**
	 * Handles a font element. With DropFontTags set the element is discarded
	 * and false is returned. A font that is the only child of its parent is
	 * left alone. Otherwise its face/size/color attributes are turned into a
	 * style, every attribute except style is dropped, and the element is
	 * retagged as span.
	 *
	 * @param lexer lexer providing the configuration
	 * @param node  candidate node
	 * @param pnode set by discardContainer when the element is discarded
	 * @return true when the font element was turned into a span
	 */
	private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
		if (node.tag != tt.tagFont) {
			return false;
		}

		if (lexer.configuration.DropFontTags) {
			discardContainer(node, pnode);
			return false;
		}

		/* if FONT is only child of parent element then leave alone */
		if (node.parent.content == node && node.next == null) {
			return false;
		}

		addFontStyles(node, node.attributes);

		/* keep only the style attribute, cut loose from the rest of the list */
		AttVal kept = null;
		AttVal current = node.attributes;

		while (current != null) {
			AttVal following = current.next;

			if (current.attribute.equals("style")) {
				current.next = null;
				kept = current;
			}

			current = following;
		}

		node.attributes = kept;
		node.tag = tt.tagSpan;
		node.element = "span";
		return true;
	}

	/*
	 Applies all matching rules to a node.
	 */
	/**
	 * Applies the clean-up rules to a node, repeating on the node each rule
	 * hands back for as long as some rule applies.
	 *
	 * The rules are tried in a fixed order (dir2Div, nestedList, center2Div,
	 * mergeDivs, blockStyle, inlineStyle, font2Span); the first one that
	 * returns true restarts the sequence on the continuation node. A rule may
	 * leave null as the continuation (for example when the node it removed had
	 * no following sibling), in which case the loop stops instead of
	 * dereferencing it.
	 *
	 * @param lexer lexer passed on to the rules
	 * @param node  node to clean, may be null
	 * @return the node to continue with after cleaning, possibly null
	 */
	private Node cleanNode(Lexer lexer, Node node) {
		Node next = node;
		MutableObject o = new MutableObject();

		while (node != null && node.isElement()) {
			/* default continuation when a rule does not set one */
			o.setObject(next);

			boolean applied = dir2Div(lexer, node, o)
					|| nestedList(lexer, node, o)
					|| center2Div(lexer, node, o)
					|| mergeDivs(lexer, node, o)
					|| blockStyle(lexer, node, o)
					|| inlineStyle(lexer, node, o)
					|| font2Span(lexer, node, o);

			next = (Node) o.getObject();

			if (!applied) {
				break;
			}

			node = next;
		}

		return next;
	}

	/**
	 * Cleans a subtree bottom-up: first every child (recursively), then the
	 * node itself.
	 *
	 * Cleaning a child may replace or remove it, so iteration continues from
	 * the node returned for that child. When that node is null (the child was
	 * removed and nothing followed it) the walk over the children ends.
	 *
	 * @param lexer lexer passed on to cleanNode
	 * @param node  root of the subtree
	 * @return the result of cleaning node itself, possibly null
	 */
	private Node createStyleProperties(Lexer lexer, Node node) {
		Node child = node.content;

		while (child != null) {
			child = createStyleProperties(lexer, child);

			if (child != null) {
				child = child.next;
			}
		}

		return cleanNode(lexer, node);
	}

	/**
	 * Converts style attributes to class references throughout a subtree,
	 * children before their parent.
	 *
	 * @param lexer lexer holding the style list
	 * @param node  root of the subtree
	 */
	private void defineStyleRules(Lexer lexer, Node node) {
		Node kid = node.content;

		while (kid != null) {
			defineStyleRules(lexer, kid);
			kid = kid.next;
		}

		style2Rule(lexer, node);
	}

	/**
	 * Applies the presentation clean-up rules to the whole document and, when
	 * the MakeClean option is not set, converts the resulting style attributes
	 * to classes and creates the style element.
	 *
	 * @param lexer lexer providing the configuration
	 * @param doc   document root
	 */
	public void cleanTree(Lexer lexer, Node doc) {
		Node root = createStyleProperties(lexer, doc);

		if (lexer.configuration.MakeClean) {
			return;
		}

		defineStyleRules(lexer, root);
		createStyleElement(lexer, root);
	}

	/* simplifies <b><b> ... </b> ...</b> etc. */
	/**
	 * Walks a sibling list (and, recursively, the children) and discards every
	 * b or i element whose parent has the same tag, keeping its content.
	 *
	 * @param node first node of the sibling list, may be null
	 */
	public void nestedEmphasis(Node node) {
		MutableObject holder = new MutableObject();
		Node current = node;

		while (current != null) {
			Node following = current.next;
			boolean emphasis = current.tag == tt.tagB || current.tag == tt.tagI;
			boolean redundant = emphasis && current.parent != null && current.parent.tag == current.tag;

			if (redundant) {
				/* strip redundant inner element and resume at the node handed back */
				holder.setObject(following);
				discardContainer(current, holder);
				current = (Node) holder.getObject();
				continue;
			}

			if (current.content != null) {
				nestedEmphasis(current.content);
			}

			current = following;
		}
	}

	/* replace i by em and b by strong */
	/**
	 * Retags every i element as em and every b element as strong in a sibling
	 * list and, recursively, in the children.
	 *
	 * @param node first node of the sibling list, may be null
	 */
	public void emFromI(Node node) {
		for (Node current = node; current != null; current = current.next) {
			if (current.tag == tt.tagI) {
				current.element = tt.tagEm.name;
				current.tag = tt.tagEm;
			} else if (current.tag == tt.tagB) {
				current.element = tt.tagStrong.name;
				current.tag = tt.tagStrong;
			}

			if (current.content != null) {
				emFromI(current.content);
			}
		}
	}

	/*
	 Some people use dir or ul without an li
	 to indent the content. The pattern to
	 look for is a list with a single implicit
	 li. This is recursively replaced by an
	 implicit blockquote.
	 */
	/**
	 * Replaces each list that has a single implicit child by an implicit
	 * blockquote holding that child's content. Children are processed before
	 * their parent.
	 *
	 * @param node first node of the sibling list, may be null
	 */
	public void list2BQ(Node node) {
		for (Node current = node; current != null; current = current.next) {
			if (current.content != null) {
				list2BQ(current.content);
			}

			/* a list is recognised by its tag using the list parser */
			boolean isList = current.tag != null && current.tag.parser == ParserImpl.getParseList();

			if (isList && current.hasOneChild() && current.content.implicit) {
				stripOnlyChild(current);
				current.element = tt.tagBlockquote.name;
				current.tag = tt.tagBlockquote;
				current.implicit = true;
			}
		}
	}

	/*
	 Replace implicit blockquote by div with an indent
	 taking care to reduce nested blockquotes to a single
	 div with the indent set to match the nesting depth
	 */
	/**
	 * Replaces each implicit blockquote by a div with a left margin. A chain
	 * of blockquotes nested as only children is collapsed into one div whose
	 * margin is 2em per nesting level.
	 *
	 * @param node first node of the sibling list, may be null
	 */
	public void bQ2Div(Node node) {
		while (node != null) {
			if (node.tag == tt.tagBlockquote && node.implicit) {
				int indent = 1;

				/* absorb blockquotes nested as the only child, counting the depth */
				while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
					++indent;
					stripOnlyChild(node);
				}

				if (node.content != null) {
					bQ2Div(node.content);
				}

				String indentStyle = "margin-left: " + Integer.toString(2 * indent) + "em";

				node.element = tt.tagDiv.name;
				node.tag = tt.tagDiv;
				node.addAttribute("style", indentStyle);
			} else if (node.content != null) {
				bQ2Div(node.content);
			}

			node = node.next;
		}
	}

	/* node is <![if ...]> prune up to <![endif]> */
	/**
	 * Discards a section tag whose text starts with "if" together with the
	 * nodes that follow it, up to and including the section tag whose text
	 * starts with "endif". Nested "if" sections are pruned recursively.
	 *
	 * @param lexer passed on to recursive calls
	 * @param node  the opening section node
	 * @return the node left after the closing tag was discarded, or null when
	 *         the nodes ran out first
	 */
	public Node pruneSection(Lexer lexer, Node node) {
		for (;;) {
			/* discard node and returns next */
			node = Node.discardElement(node);

			if (node == null) return null;

			if (node.type == Node.SectionTag) {
				if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
					/*
					 nested section: prune it, then loop; note that the node handed
					 back by the nested call is itself discarded at the top of the loop
					 */
					node = pruneSection(lexer, node);
					continue;
				}

				if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
					/* drop the closing tag and stop */
					node = Node.discardElement(node);
					break;
				}
			}
		}

		return node;
	}

	/**
	 * Removes section tags from a sibling list and, recursively, from the
	 * children. A section whose text starts with "if" is pruned through its
	 * matching endif; any other section tag is discarded on its own.
	 *
	 * @param lexer passed on to pruneSection
	 * @param node  first node of the sibling list, may be null
	 */
	public void dropSections(Lexer lexer, Node node) {
		Node current = node;

		while (current != null) {
			if (current.type != Node.SectionTag) {
				if (current.content != null) {
					dropSections(lexer, current.content);
				}

				current = current.next;
				continue;
			}

			boolean opensIf = (Lexer.getString(current.textarray, current.start, 2)).equals("if");

			/* prune up to matching endif, or discard other section tags as well */
			current = opensIf ? pruneSection(lexer, current) : Node.discardElement(current);
		}
	}

	/**
	 * Removes class, style, lang and "x:"-prefixed attributes from a node, and
	 * also height/width when the node is a td, tr or th. A class attribute
	 * whose value is exactly "Code" is kept.
	 *
	 * @param node node whose attribute list is filtered
	 */
	public void purgeAttributes(Node node) {
		AttVal kept = null; /* last attribute retained so far */
		AttVal current = node.attributes;

		while (current != null) {
			AttVal following = current.next;
			String name = current.attribute;
			boolean drop = false;

			if (name != null) {
				/* special check for class="Code" denoting pre text */
				boolean codeClass = current.value != null && name.equals("class") && current.value.equals("Code");

				if (!codeClass) {
					drop = name.equals("class") || name.equals("style") || name.equals("lang")
							|| name.startsWith("x:")
							|| ((name.equals("height") || name.equals("width")) && (node.tag == tt.tagTd
									|| node.tag == tt.tagTr || node.tag == tt.tagTh));
				}
			}

			if (!drop) {
				kept = current;
			} else if (kept != null) {
				kept.next = following;
			} else {
				node.attributes = following;
			}

			current = following;
		}
	}

	/* Word2000 uses span excessively, so we strip span out */
	/**
	 * Removes a span element, moving its (already cleaned) content into the
	 * span's position in the parent.
	 *
	 * @param lexer passed on to cleanWord2000
	 * @param span  the span element to strip
	 * @return the node that followed the span
	 */
	public Node stripSpan(Lexer lexer, Node span) {
		Node node;
		Node prev = null;
		Node content;

		/*
		 deal with span elements that have content
		 by splicing the content in place of the span
		 after having processed it
		 */

		cleanWord2000(lexer, span.content);
		content = span.content;

		/* pick the node after which the content will be inserted */
		if (span.prev != null)
			prev = span.prev;
		else if (content != null) {
			/* no predecessor: place the first child before the span and use it as anchor */
			node = content;
			content = content.next;
			Node.removeNode(node);
			Node.insertNodeBeforeElement(span, node);
			prev = node;
		}

		/* move the remaining children one by one, each after the previously moved node */
		while (content != null) {
			node = content;
			content = content.next;
			Node.removeNode(node);
			Node.insertNodeAfterElement(prev, node);
			prev = node;
		}

		if (span.next == null) span.parent.last = prev;

		/* remember the continuation before the span is discarded */
		node = span.next;
		span.content = null;
		Node.discardElement(span);
		return node;
	}

	/* map non-breaking spaces to regular spaces */
	/**
	 * Replaces every non-breaking space (code point 160) by a regular
	 * space in all text nodes reachable from <code>node</code>: the node
	 * itself, its following siblings, and their content.
	 *
	 * The text is rewritten in place inside <code>node.textarray</code>.
	 * Because a non-breaking space needs more bytes in UTF-8 than a plain
	 * space, the rewritten text can be shorter than the original, so
	 * <code>node.end</code> is moved back to the end of the rewritten text.
	 *
	 * @param lexer the lexer; only passed on to the recursive calls
	 * @param node  first node of the sibling list to process; may be null
	 */
	private void normalizeSpaces(Lexer lexer, Node node) {
		while (node != null) {
			if (node.content != null) normalizeSpaces(lexer, node.content);

			if (node.type == Node.TextNode) {
				MutableInteger c = new MutableInteger();

				/* write cursor; it never overtakes the read index i for well-formed UTF-8 */
				int p = node.start;

				for (int i = node.start; i < node.end; ++i) {
					/*
					 * Read the element as an unsigned value. NOTE(review):
					 * textarray looks like a buffer of UTF-8 bytes (it is
					 * handed to PPrint.getUTF8/putUTF8); if its elements are
					 * Java bytes, a plain widening would sign-extend and
					 * neither test below could ever succeed. The mask is
					 * harmless otherwise.
					 */
					c.value = node.textarray[i] & 0xFF;

					/* look for UTF-8 multibyte character */
					if (c.value > 0x7F) i += PPrint.getUTF8(node.textarray, i, c);

					if (c.value == 160) c.value = ' ';

					p = PPrint.putUTF8(node.textarray, p, c.value);
				}

				/* drop the stale tail left behind when the text became shorter */
				node.end = p;
			}

			node = node.next;
		}
	}

	/*
	 This is a major clean up to strip out all the extra stuff you get
	 when you save as web page from Word 2000. It doesn't yet know what
	 to do with VML tags, but these will appear as errors unless you
	 declare them as new tags, such as o:p which needs to be declared
	 as inline.
	 */
	/**
	 * Cleans the sibling list starting at <code>node</code> (and,
	 * recursively, its content) of the markup Word 2000 emits.
	 *
	 * For each node, in this order:
	 * - style and meta elements and comments are discarded;
	 * - span elements are replaced by their content (see stripSpan);
	 * - at the html element the whole clean up is abandoned unless an
	 *   xmlns:o attribute is present;
	 * - link elements with rel="File-List" are discarded;
	 * - empty paragraphs are discarded;
	 * - runs of p class="MsoListBullet" become li elements inside an
	 *   inferred ul, runs of p class="Code" are moved into an inferred pre;
	 * - all remaining start tags get their presentation attributes purged
	 *   and their content cleaned.
	 *
	 * @param lexer the lexer used to create inferred and newline nodes
	 * @param node  first node of the sibling list to clean; may be null
	 */
	public void cleanWord2000(Lexer lexer, Node node) {
		/* used to build a list from a sequence of bulleted p's (also the pre for Code p's) */
		Node list = null;

		while (node != null) {
			/* discard Word's style verbiage */
			if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
				node = Node.discardElement(node);
				continue;
			}

			/* strip out all span tags Word scatters so liberally! */
			if (node.tag == tt.tagSpan) {
				node = stripSpan(lexer, node);
				continue;
			}

			if (node.tag == tt.tagHtml) {
				/* check that it's a Word 2000 document; if not, leave everything untouched */
				if (node.getAttrByName("xmlns:o") == null) return;
			}

			if (node.tag == tt.tagLink) {
				AttVal attr = node.getAttrByName("rel");

				if (attr != null && attr.value != null && attr.value.equals("File-List")) {
					node = Node.discardElement(node);
					continue;
				}
			}

			/* discard empty paragraphs */
			if (node.content == null && node.tag == tt.tagP) {
				node = Node.discardElement(node);
				continue;
			}

			if (node.tag == tt.tagP) {
				AttVal attr = node.getAttrByName("class");

				/* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
				if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
					Node.coerceNode(lexer, node, tt.tagLi);

					/* start a new ul unless the previous paragraph already opened one */
					if (list == null || list.tag != tt.tagUl) {
						list = lexer.inferredTag("ul");
						Node.insertNodeBeforeElement(node, list);
					}

					purgeAttributes(node);

					if (node.content != null) cleanWord2000(lexer, node.content);

					/* remove node and append to contents of list */
					Node.removeNode(node);
					Node.insertNodeAtEnd(list, node);

					/*
					 * Resume with the sibling following the list, from the
					 * top of the loop. The item itself is fully processed;
					 * list.next is null when the paragraph was the last
					 * sibling, and a non-null successor must go through all
					 * of the checks above (it may be the next bullet).
					 */
					node = list.next;
					continue;
				}
				/* map sequence of <p class="Code"> to <pre>...</pre> */
				else if (attr != null && attr.value != null && attr.value.equals("Code")) {
					Node br = lexer.newLineNode();
					normalizeSpaces(lexer, node);

					/* start a new pre unless the previous paragraph already opened one */
					if (list == null || list.tag != tt.tagPre) {
						list = lexer.inferredTag("pre");
						Node.insertNodeBeforeElement(node, list);
					}

					/* remove node and append to contents of list */
					Node.removeNode(node);
					Node.insertNodeAtEnd(list, node);

					/* replace the paragraph by its content, then end the line */
					stripSpan(lexer, node);
					Node.insertNodeAtEnd(list, br);

					/* same as for bullets: restart the loop, list.next may be null */
					node = list.next;
					continue;
				} else
					list = null;
			} else
				list = null;

			/* strip out style and class attributes */
			if (node.type == Node.StartTag || node.type == Node.StartEndTag) purgeAttributes(node);

			if (node.content != null) cleanWord2000(lexer, node.content);

			node = node.next;
		}
	}

	/**
	 * Tells whether the document looks like Word 2000 output, i.e. whether
	 * its html element carries an <code>xmlns:o</code> attribute.
	 *
	 * @param root node on which <code>findHTML</code> is called to locate
	 *             the html element
	 * @param tt   tag table handed to <code>findHTML</code>
	 * @return true if an html element was found and it has an
	 *         <code>xmlns:o</code> attribute; false otherwise
	 */
	public boolean isWord2000(Node root, TagTable tt) {
		Node htmlElement = root.findHTML(tt);

		if (htmlElement == null) {
			return false;
		}

		return htmlElement.getAttrByName("xmlns:o") != null;
	}
}