/** * Copyright (c) 2009 Juwi MacMillan Group GmbH * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * @(#)Clean.java 1.11 2000/08/16 * */ package org.tizzit.util.tidy; /** * * Clean up misuse of presentation markup * * (c) 1998-2000 (W3C) MIT, INRIA, Keio University * See Tidy.java for the copyright notice. * Derived from <a href="http://www.w3.org/People/Raggett/tidy"> * HTML Tidy Release 4 Aug 2000</a> * * @author Dave Raggett <dsr@w3.org> * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java) * @version 1.0, 1999/05/22 * @version 1.0.1, 1999/05/29 * @version 1.1, 1999/06/18 Java Bean * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999 * @version 1.4, 1999/09/04 DOM support * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000 */ /* Filters from other formats such as Microsoft Word often make excessive use of presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is straight forward to transform this to use CSS. Some rules replace some of the children of an element by style properties on the element, e.g. <p><b>...</b></p> -> <p style="font-weight: bold">...</p> Such rules are applied to the element's content and then to the element itself until none of the rules more apply. Having applied all the rules to an element, it will have a style attribute with one or more properties. Other rules strip the element they apply to, replacing it by style properties on the contents, e.g. <dir><li><p>...</li></dir> -> <p style="margin-left 1em">... These rules are applied to an element before processing its content and replace the current element by the first element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class value and style rule in the document head. To support this, an association of styles and class names is built. A naive approach is to rely on string matching to test when two property lists are the same. A better approach would be to first sort the properties before matching. */ public class Clean { private int classNum = 1; private TagTable tt; public Clean(TagTable tt) { this.tt = tt; } private StyleProp insertProperty(StyleProp props, String name, String value) { StyleProp first, prev, prop; int cmp; prev = null; first = props; while (props != null) { cmp = props.name.compareTo(name); if (cmp == 0) { /* this property is already defined, ignore new value */ return first; } if (cmp > 0) // props.name > name { /* insert before this */ prop = new StyleProp(name, value, props); if (prev != null) prev.next = prop; else first = prop; return first; } prev = props; props = props.next; } prop = new StyleProp(name, value); if (prev != null) prev.next = prop; else first = prop; return first; } /* Create sorted linked list of properties from style string It temporarily places nulls in place of ':' and ';' to delimit the strings for the property name and value. Some systems don't allow you to null literal strings, so to avoid this, a copy is made first. */ private StyleProp createProps(StyleProp prop, String style) { int name_end; int value_end; int value_start = 0; int name_start = 0; boolean more; name_start = 0; while (name_start < style.length()) { while (name_start < style.length() && style.charAt(name_start) == ' ') ++name_start; name_end = name_start; while (name_end < style.length()) { if (style.charAt(name_end) == ':') { value_start = name_end + 1; break; } ++name_end; } if (name_end >= style.length() || style.charAt(name_end) != ':') break; while (value_start < style.length() && style.charAt(value_start) == ' ') ++value_start; value_end = value_start; more = false; while (value_end < style.length()) { if (style.charAt(value_end) == ';') { more = true; break; } ++value_end; } prop = insertProperty(prop, style.substring(name_start, name_end), style.substring(value_start, value_end)); if (more) { name_start = value_end + 1; continue; } break; } return prop; } private String createPropString(StyleProp props) { String style = ""; int len; StyleProp prop; /* compute length */ for (len = 0, prop = props; prop != null; prop = prop.next) { len += prop.name.length() + 2; len += prop.value.length() + 2; } for (prop = props; prop != null; prop = prop.next) { style = style.concat(prop.name); style = style.concat(": "); style = style.concat(prop.value); if (prop.next == null) break; style = style.concat("; "); } return style; } /* create string with merged properties */ private String addProperty(String style, String property) { StyleProp prop; prop = createProps(null, style); prop = createProps(prop, property); style = createPropString(prop); return style; } private String gensymClass(String tag) { String str; str = "c" + classNum; classNum++; return str; } private String findStyle(Lexer lexer, String tag, String properties) { Style style; for (style = lexer.styles; style != null; style = style.next) { if (style.tag.equals(tag) && style.properties.equals(properties)) return style.tagClass; } style = new Style(tag, gensymClass(tag), properties, lexer.styles); lexer.styles = style; return style.tagClass; } /* Find style attribute in node, and replace it by corresponding class attribute. Search for class in style dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute */ private void style2Rule(Lexer lexer, Node node) { AttVal styleattr, classattr; String classname; styleattr = node.getAttrByName("style"); if (styleattr != null) { classname = findStyle(lexer, node.element, styleattr.value); classattr = node.getAttrByName("class"); /* if there already is a class attribute then append class name after a space */ if (classattr != null) { classattr.value = classattr.value + " " + classname; node.removeAttribute(styleattr); } else /* reuse style attribute for class attribute */ { styleattr.attribute = "class"; styleattr.value = classname; } } } private void addColorRule(Lexer lexer, String selector, String color) { if (color != null) { lexer.addStringLiteral(selector); lexer.addStringLiteral(" { color: "); lexer.addStringLiteral(color); lexer.addStringLiteral(" }\n"); } } /* move presentation attribs from body to style element background="foo" -> body { background-image: url(foo) } bgcolor="foo" -> body { background-color: foo } text="foo" -> body { color: foo } link="foo" -> :link { color: foo } vlink="foo" -> :visited { color: foo } alink="foo" -> :active { color: foo } */ private void cleanBodyAttrs(Lexer lexer, Node body) { AttVal attr; String bgurl = null; String bgcolor = null; String color = null; attr = body.getAttrByName("background"); if (attr != null) { bgurl = attr.value; attr.value = null; body.removeAttribute(attr); } attr = body.getAttrByName("bgcolor"); if (attr != null) { bgcolor = attr.value; attr.value = null; body.removeAttribute(attr); } attr = body.getAttrByName("text"); if (attr != null) { color = attr.value; attr.value = null; body.removeAttribute(attr); } if (bgurl != null || bgcolor != null || color != null) { lexer.addStringLiteral(" body {\n"); if (bgurl != null) { lexer.addStringLiteral(" background-image: url("); lexer.addStringLiteral(bgurl); lexer.addStringLiteral(");\n"); } if (bgcolor != null) { lexer.addStringLiteral(" background-color: "); lexer.addStringLiteral(bgcolor); lexer.addStringLiteral(";\n"); } if (color != null) { lexer.addStringLiteral(" color: "); lexer.addStringLiteral(color); lexer.addStringLiteral(";\n"); } lexer.addStringLiteral(" }\n"); } attr = body.getAttrByName("link"); if (attr != null) { addColorRule(lexer, " :link", attr.value); body.removeAttribute(attr); } attr = body.getAttrByName("vlink"); if (attr != null) { addColorRule(lexer, " :visited", attr.value); body.removeAttribute(attr); } attr = body.getAttrByName("alink"); if (attr != null) { addColorRule(lexer, " :active", attr.value); body.removeAttribute(attr); } } private boolean niceBody(Lexer lexer, Node doc) { Node body = doc.findBody(lexer.configuration.tt); if (body != null) { if (body.getAttrByName("background") != null || body.getAttrByName("bgcolor") != null || body.getAttrByName("text") != null || body.getAttrByName("link") != null || body.getAttrByName("vlink") != null || body.getAttrByName("alink") != null) { lexer.badLayout |= Report.USING_BODY; return false; } } return true; } /* create style element using rules from dictionary */ private void createStyleElement(Lexer lexer, Node doc) { Node node, head, body; Style style; AttVal av; if (lexer.styles == null && niceBody(lexer, doc)) return; node = lexer.newNode(Node.StartTag, null, 0, 0, "style"); node.implicit = true; /* insert type attribute */ av = new AttVal(null, null, '"', "type", "text/css"); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; body = doc.findBody(lexer.configuration.tt); lexer.txtstart = lexer.lexsize; if (body != null) cleanBodyAttrs(lexer, body); for (style = lexer.styles; style != null; style = style.next) { lexer.addCharToLexer(' '); lexer.addStringLiteral(style.tag); lexer.addCharToLexer('.'); lexer.addStringLiteral(style.tagClass); lexer.addCharToLexer(' '); lexer.addCharToLexer('{'); lexer.addStringLiteral(style.properties); lexer.addCharToLexer('}'); lexer.addCharToLexer('\n'); } lexer.txtend = lexer.lexsize; Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend)); /* now insert style element into document head doc is root node. search its children for html node the head node should be first child of html node */ head = doc.findHEAD(lexer.configuration.tt); if (head != null) Node.insertNodeAtEnd(head, node); } /* ensure bidirectional links are consistent */ private void fixNodeLinks(Node node) { Node child; if (node.prev != null) node.prev.next = node; else node.parent.content = node; if (node.next != null) node.next.prev = node; else node.parent.last = node; for (child = node.content; child != null; child = child.next) child.parent = node; } /* used to strip child of node when the node has one and only one child */ private void stripOnlyChild(Node node) { Node child; child = node.content; node.content = child.content; node.last = child.last; child.content = null; for (child = node.content; child != null; child = child.next) child.parent = node; } /* used to strip font start and end tags */ private void discardContainer(Node element, MutableObject pnode) { Node node; Node parent = element.parent; if (element.content != null) { element.last.next = element.next; if (element.next != null) { element.next.prev = element.last; element.last.next = element.next; } else parent.last = element.last; if (element.prev != null) { element.content.prev = element.prev; element.prev.next = element.content; } else parent.content = element.content; for (node = element.content; node != null; node = node.next) node.parent = parent; pnode.setObject(element.content); } else { if (element.next != null) element.next.prev = element.prev; else parent.last = element.prev; if (element.prev != null) element.prev.next = element.next; else parent.content = element.next; pnode.setObject(element.next); } element.next = null; element.content = null; } /* Add style property to element, creating style attribute as needed and adding ; delimiter */ private void addStyleProperty(Node node, String property) { AttVal av; for (av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) break; } /* if style attribute already exists then insert property */ if (av != null) { String s; s = addProperty(av.value, property); av.value = s; } else /* else create new style attribute */ { av = new AttVal(node.attributes, null, '"', "style", property); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; } } /* Create new string that consists of the combined style properties in s1 and s2 To merge property lists, we build a linked list of property/values and insert properties into the list in order, merging values for the same property name. */ private String mergeProperties(String s1, String s2) { String s; StyleProp prop; prop = createProps(null, s1); prop = createProps(prop, s2); s = createPropString(prop); return s; } private void mergeStyles(Node node, Node child) { AttVal av; String s1, s2, style; for (s2 = null, av = child.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) { s2 = av.value; break; } } for (s1 = null, av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("style")) { s1 = av.value; break; } } if (s1 != null) { if (s2 != null) /* merge styles from both */ { style = mergeProperties(s1, s2); av.value = style; } } else if (s2 != null) /* copy style of child */ { av = new AttVal(node.attributes, null, '"', "style", s2); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); node.attributes = av; } } private String fontSize2Name(String size) { /* String[] sizes = { "50%", "60%", "80%", null, "120%", "150%", "200%" }; */ String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%"}; String buf; if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') { int n = size.charAt(0) - '0'; return sizes[n]; } if (size.length() > 0 && size.charAt(0) == '-') { if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') { int n = size.charAt(1) - '0'; double x; for (x = 1.0; n > 0; --n) x *= 0.8; x *= 100.0; buf = "" + (int) x + "%"; return buf; } return "smaller"; /*"70%"; */ } if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') { int n = size.charAt(1) - '0'; double x; for (x = 1.0; n > 0; --n) x *= 1.2; x *= 100.0; buf = "" + (int) x + "%"; return buf; } return "larger"; /* "140%" */ } private void addFontFace(Node node, String face) { addStyleProperty(node, "font-family: " + face); } private void addFontSize(Node node, String size) { String value; if (size.equals("6") && node.tag == tt.tagP) { node.element = "h1"; tt.findTag(node); return; } if (size.equals("5") && node.tag == tt.tagP) { node.element = "h2"; tt.findTag(node); return; } if (size.equals("4") && node.tag == tt.tagP) { node.element = "h3"; tt.findTag(node); return; } value = fontSize2Name(size); if (value != null) { addStyleProperty(node, "font-size: " + value); } } private void addFontColor(Node node, String color) { addStyleProperty(node, "color: " + color); } private void addAlign(Node node, String align) { /* force alignment value to lower case */ addStyleProperty(node, "text-align: " + align.toLowerCase()); } /* add style properties to node corresponding to the font face, size and color attributes */ private void addFontStyles(Node node, AttVal av) { while (av != null) { if (av.attribute.equals("face")) addFontFace(node, av.value); else if (av.attribute.equals("size")) addFontSize(node, av.value); else if (av.attribute.equals("color")) addFontColor(node, av.value); av = av.next; } } /* Symptom: <p align=center> Action: <p style="text-align: center"> */ private void textAlign(Lexer lexer, Node node) { AttVal av, prev; prev = null; for (av = node.attributes; av != null; av = av.next) { if (av.attribute.equals("align")) { if (prev != null) prev.next = av.next; else node.attributes = av.next; if (av.value != null) { addAlign(node, av.value); } break; } prev = av; } } /* The clean up rules use the pnode argument to return the next node when the orignal node has been deleted */ /* Symptom: <dir> <li> where <li> is only child Action: coerce <dir> <li> to <div> with indent. */ private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) { Node child; if (node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl) { child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; if (child.tag != tt.tagLi) return false; if (!child.implicit) return false; /* coerce dir to div */ node.tag = tt.tagDiv; node.element = "div"; addStyleProperty(node, "margin-left: 2em"); stripOnlyChild(node); return true; //#if 0 //Node content; //Node last; //content = child.content; //last = child.last; //child.content = null; /* adjust parent and set margin on contents of <li> */ //for (child = content; child != null; child = child.next) //{ // child.parent = node.parent; // addStyleProperty(child, "margin-left: 1em"); //} /* hook first/last into sequence */ //if (content != null) //{ // content.prev = node.prev; // last.next = node.next; // fixNodeLinks(content); // fixNodeLinks(last); //} //node.next = null; /* ensure that new node is cleaned */ //pnode.setObject(cleanNode(lexer, content)); //return true; //#endif } return false; } /* Symptom: <center> Action: replace <center> by <div style="text-align: center"> */ private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) { if (node.tag == tt.tagCenter) { if (lexer.configuration.DropFontTags) { if (node.content != null) { Node last = node.last; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); if (last.next != null) last.next.prev = node; node.next = last.next; last.next = node; node.prev = last; if (parent.last == last) parent.last = node; node.parent = parent; } else { Node prev = node.prev; Node next = node.next; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); node.next = next; node.prev = prev; node.parent = parent; if (next != null) next.prev = node; else parent.last = node; if (prev != null) prev.next = node; else parent.content = node; } return true; } node.tag = tt.tagDiv; node.element = "div"; addStyleProperty(node, "text-align: center"); return true; } return false; } /* Symptom <div><div>...</div></div> Action: merge the two divs This is useful after nested <dir>s used by Word for indenting have been converted to <div>s */ private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) { Node child; if (node.tag != tt.tagDiv) return false; child = node.content; if (child == null) return false; if (child.tag != tt.tagDiv) return false; if (child.next != null) return false; mergeStyles(node, child); stripOnlyChild(node); return true; } /* Symptom: <ul><li><ul>...</ul></li></ul> Action: discard outer list */ private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) { Node child, list; if (node.tag == tt.tagUl || node.tag == tt.tagOl) { child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; list = child.content; if (list == null) return false; if (list.tag != node.tag) return false; pnode.setObject(node.next); /* move inner list node into position of outer node */ list.prev = node.prev; list.next = node.next; list.parent = node.parent; fixNodeLinks(list); /* get rid of outer ul and its li */ child.content = null; node.content = null; node.next = null; /* If prev node was a list the chances are this node should be appended to that list. Word has no way of recognizing nested lists and just uses indents */ if (list.prev != null) { node = list; list = node.prev; if (list.tag == tt.tagUl || list.tag == tt.tagOl) { list.next = node.next; if (list.next != null) list.next.prev = list; child = list.last; /* <li> */ node.parent = child; node.next = null; node.prev = child.last; fixNodeLinks(node); } } cleanNode(lexer, node); return true; } return false; } /* Symptom: the only child of a block-level element is a presentation element such as B, I or FONT Action: add style "font-weight: bold" to the block and strip the <b> element, leaving its children. example: <p> <b><font face="Arial" size="6">Draft Recommended Practice</font></b> </p> becomes: <p style="font-weight: bold; font-family: Arial; font-size: 6"> Draft Recommended Practice </p> This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator 4, this isn't done for the elements: caption, tr and table */ private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) { Node child; if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) { if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) { /* check for align attribute */ if (node.tag != tt.tagCaption) textAlign(lexer, node); child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; if (child.tag == tt.tagB) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == tt.tagI) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } } return false; } /* the only child of table cell or an inline element such as em */ private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) { Node child; if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) { child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } return false; } /* Replace font elements by span elements, deleting the font element's attributes and replacing them by a single style attribute. */ private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) { AttVal av, style, next; if (node.tag == tt.tagFont) { if (lexer.configuration.DropFontTags) { discardContainer(node, pnode); return false; } /* if FONT is only child of parent element then leave alone */ if (node.parent.content == node && node.next == null) return false; addFontStyles(node, node.attributes); /* extract style attribute and free the rest */ av = node.attributes; style = null; while (av != null) { next = av.next; if (av.attribute.equals("style")) { av.next = null; style = av; } av = next; } node.attributes = style; node.tag = tt.tagSpan; node.element = "span"; return true; } return false; } /* Applies all matching rules to a node. */ private Node cleanNode(Lexer lexer, Node node) { Node next = null; MutableObject o = new MutableObject(); boolean b = false; for (next = node; node.isElement(); node = next) { o.setObject(next); b = dir2Div(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = nestedList(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = center2Div(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = mergeDivs(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = blockStyle(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = inlineStyle(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = font2Span(lexer, node, o); next = (Node) o.getObject(); if (b) continue; break; } return next; } private Node createStyleProperties(Lexer lexer, Node node) { Node child; if (node.content != null) { for (child = node.content; child != null; child = child.next) { child = createStyleProperties(lexer, child); } } return cleanNode(lexer, node); } private void defineStyleRules(Lexer lexer, Node node) { Node child; if (node.content != null) { for (child = node.content; child != null; child = child.next) { defineStyleRules(lexer, child); } } style2Rule(lexer, node); } public void cleanTree(Lexer lexer, Node doc) { doc = createStyleProperties(lexer, doc); if (!lexer.configuration.MakeClean) { defineStyleRules(lexer, doc); createStyleElement(lexer, doc); } } /* simplifies <b><b> ... </b> ...</b> etc. */ public void nestedEmphasis(Node node) { MutableObject o = new MutableObject(); Node next; while (node != null) { next = node.next; if ((node.tag == tt.tagB || node.tag == tt.tagI) && node.parent != null && node.parent.tag == node.tag) { /* strip redundant inner element */ o.setObject(next); discardContainer(node, o); next = (Node) o.getObject(); node = next; continue; } if (node.content != null) nestedEmphasis(node.content); node = next; } } /* replace i by em and b by strong */ public void emFromI(Node node) { while (node != null) { if (node.tag == tt.tagI) { node.element = tt.tagEm.name; node.tag = tt.tagEm; } else if (node.tag == tt.tagB) { node.element = tt.tagStrong.name; node.tag = tt.tagStrong; } if (node.content != null) emFromI(node.content); node = node.next; } } /* Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single implicit li. This is recursively replaced by an implicit blockquote. */ public void list2BQ(Node node) { while (node != null) { if (node.content != null) list2BQ(node.content); if (node.tag != null && node.tag.parser == ParserImpl.getParseList() && node.hasOneChild() && node.content.implicit) { stripOnlyChild(node); node.element = tt.tagBlockquote.name; node.tag = tt.tagBlockquote; node.implicit = true; } node = node.next; } } /* Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with the indent set to match the nesting depth */ public void bQ2Div(Node node) { int indent; String indent_buf; while (node != null) { if (node.tag == tt.tagBlockquote && node.implicit) { indent = 1; while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) { ++indent; stripOnlyChild(node); } if (node.content != null) bQ2Div(node.content); indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em"; node.element = tt.tagDiv.name; node.tag = tt.tagDiv; node.addAttribute("style", indent_buf); } else if (node.content != null) bQ2Div(node.content); node = node.next; } } /* node is <![if ...]> prune up to <![endif]> */ public Node pruneSection(Lexer lexer, Node node) { for (;;) { /* discard node and returns next */ node = Node.discardElement(node); if (node == null) return null; if (node.type == Node.SectionTag) { if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) { node = pruneSection(lexer, node); continue; } if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) { node = Node.discardElement(node); break; } } } return node; } public void dropSections(Lexer lexer, Node node) { while (node != null) { if (node.type == Node.SectionTag) { /* prune up to matching endif */ if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) { node = pruneSection(lexer, node); continue; } /* discard others as well */ node = Node.discardElement(node); continue; } if (node.content != null) dropSections(lexer, node.content); node = node.next; } } public void purgeAttributes(Node node) { AttVal attr = node.attributes; AttVal next = null; AttVal prev = null; while (attr != null) { next = attr.next; /* special check for class="Code" denoting pre text */ if (attr.attribute != null && attr.value != null && attr.attribute.equals("class") && attr.value.equals("Code")) { prev = attr; } else if (attr.attribute != null && (attr.attribute.equals("class") || attr.attribute.equals("style") || attr.attribute.equals("lang") || attr.attribute.startsWith("x:") || ((attr.attribute .equals("height") || attr.attribute.equals("width")) && (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh)))) { if (prev != null) prev.next = next; else node.attributes = next; } else prev = attr; attr = next; } } /* Word2000 uses span excessively, so we strip span out */ public Node stripSpan(Lexer lexer, Node span) { Node node; Node prev = null; Node content; /* deal with span elements that have content by splicing the content in place of the span after having processed it */ cleanWord2000(lexer, span.content); content = span.content; if (span.prev != null) prev = span.prev; else if (content != null) { node = content; content = content.next; Node.removeNode(node); Node.insertNodeBeforeElement(span, node); prev = node; } while (content != null) { node = content; content = content.next; Node.removeNode(node); Node.insertNodeAfterElement(prev, node); prev = node; } if (span.next == null) span.parent.last = prev; node = span.next; span.content = null; Node.discardElement(span); return node; } /* map non-breaking spaces to regular spaces */ private void normalizeSpaces(Lexer lexer, Node node) { while (node != null) { if (node.content != null) normalizeSpaces(lexer, node.content); if (node.type == Node.TextNode) { int i; MutableInteger c = new MutableInteger(); int p = node.start; for (i = node.start; i < node.end; ++i) { c.value = (int) node.textarray[i]; /* look for UTF-8 multibyte character */ if (c.value > 0x7F) i += PPrint.getUTF8(node.textarray, i, c); if (c.value == 160) c.value = ' '; p = PPrint.putUTF8(node.textarray, p, c.value); } } node = node.next; } } /* This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags, such as o:p which needs to be declared as inline. */ public void cleanWord2000(Lexer lexer, Node node) { /* used to a list from a sequence of bulletted p's */ Node list = null; while (node != null) { /* discard Word's style verbiage */ if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) { node = Node.discardElement(node); continue; } /* strip out all span tags Word scatters so liberally! */ if (node.tag == tt.tagSpan) { node = stripSpan(lexer, node); continue; } /* get rid of Word's xmlns attributes */ if (node.tag == tt.tagHtml) { /* check that it's a Word 2000 document */ if (node.getAttrByName("xmlns:o") == null) return; } if (node.tag == tt.tagLink) { AttVal attr = node.getAttrByName("rel"); if (attr != null && attr.value != null && attr.value.equals("File-List")) { node = Node.discardElement(node); continue; } } /* discard empty paragraphs */ if (node.content == null && node.tag == tt.tagP) { node = Node.discardElement(node); continue; } if (node.tag == tt.tagP) { AttVal attr = node.getAttrByName("class"); /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */ if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) { Node.coerceNode(lexer, node, tt.tagLi); if (list == null || list.tag != tt.tagUl) { list = lexer.inferredTag("ul"); Node.insertNodeBeforeElement(node, list); } purgeAttributes(node); if (node.content != null) cleanWord2000(lexer, node.content); /* remove node and append to contents of list */ Node.removeNode(node); Node.insertNodeAtEnd(list, node); node = list.next; } /* map sequence of <p class="Code"> to <pre>...</pre> */ else if (attr != null && attr.value != null && attr.value.equals("Code")) { Node br = lexer.newLineNode(); normalizeSpaces(lexer, node); if (list == null || list.tag != tt.tagPre) { list = lexer.inferredTag("pre"); Node.insertNodeBeforeElement(node, list); } /* remove node and append to contents of list */ Node.removeNode(node); Node.insertNodeAtEnd(list, node); stripSpan(lexer, node); Node.insertNodeAtEnd(list, br); node = list.next; } else list = null; } else list = null; /* strip out style and class attributes */ if (node.type == Node.StartTag || node.type == Node.StartEndTag) purgeAttributes(node); if (node.content != null) cleanWord2000(lexer, node.content); node = node.next; } } public boolean isWord2000(Node root, TagTable tt) { Node html = root.findHTML(tt); return (html != null && html.getAttrByName("xmlns:o") != null); } }