/* gnu.classpath.tools.doclets.xmldoclet.HtmlRepairer.java Copyright (C) 2003 Free Software Foundation, Inc. This file is part of GNU Classpath. GNU Classpath is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. GNU Classpath is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Classpath; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ package gnu.classpath.tools.doclets.xmldoclet; import java.io.*; import java.util.*; import com.sun.javadoc.DocErrorReporter; import com.sun.javadoc.ClassDoc; import com.sun.javadoc.MemberDoc; /** * Provides methods for tidying up HTML source. * * @author Julian Scheid */ public final class HtmlRepairer { private static class TagInfo { private Set parentTags = new HashSet(); public TagInfo(String parentTag) { this.parentTags.add(parentTag); } public TagInfo(String[] parentTagArr) { for (int i=0; i<parentTagArr.length; ++i) { this.parentTags.add(parentTagArr[i]); } } public boolean isLegalParentTag(String tag) { return this.parentTags.contains(tag); } } private DocErrorReporter warningReporter; private boolean noWarn; private boolean noEmailWarn; private ClassDoc contextClass; private MemberDoc contextMember; private StringBuffer output = new StringBuffer(); private Stack tagStack = new Stack(); private boolean isLeadingTag = true; private boolean throwAwayLeadingPara = false; private static Map tagInfoMap; private static Set noTextParentTags; static { tagInfoMap = new HashMap(); tagInfoMap.put("li", new TagInfo(new String[] { "ul", "ol", "nl", "menu", "dir" })); tagInfoMap.put("td", new TagInfo(new String[] { "tr" })); tagInfoMap.put("th", new TagInfo(new String[] { "tr" })); tagInfoMap.put("tr", new TagInfo(new String[] { "table" })); tagInfoMap.put("dt", new TagInfo(new String[] { "dl" })); tagInfoMap.put("dd", new TagInfo(new String[] { "dl" })); tagInfoMap.put("param", new TagInfo(new String[] { "applet" })); String[] noTextParentTagArr = { "area", "base", "body", "br", "dd", "dt", "head", "hr", "html", "img", "input", "link", "map", "meta", "ol", "optgroup", "param", "select", "table", "tbody", "tfoot", "thead", "tr", "ul", }; noTextParentTags = new HashSet(); for (int i=0; i<noTextParentTagArr.length; ++i) { noTextParentTags.add(noTextParentTagArr[i]); } } public HtmlRepairer(DocErrorReporter warningReporter, boolean noWarn, boolean noEmailWarn, ClassDoc contextClass, MemberDoc contextMember, boolean throwAwayLeadingPara) { this.warningReporter = warningReporter; this.noWarn = noWarn; this.noEmailWarn = noEmailWarn; this.contextClass = contextClass; this.contextMember = contextMember; this.throwAwayLeadingPara = throwAwayLeadingPara; } private static String replaceStr(String haystack, String needle, String replacement) { int ndx=haystack.indexOf(needle); if (ndx<0) return haystack; else return haystack.substring(0, ndx)+replacement + replaceStr(haystack.substring(ndx+needle.length()), needle, replacement); } private void haveText(String text) { if (isLeadingTag && throwAwayLeadingPara) { if (0 != text.trim().length()) { isLeadingTag = false; } } if (tagStack.isEmpty() || !noTextParentTags.contains(tagStack.peek())) { text = replaceStr(text, "<1", "<1"); text = replaceStr(text, "&&", "&&"); text = replaceStr(text, "& ", "& "); text = replaceStr(text, "&\t", "&\t"); text = replaceStr(text, "&\r", "&\r"); text = replaceStr(text, "&\n", "&\n"); for (char c='0'; c<='9'; ++c) text = replaceStr(text, "&"+c, "&"+c); text = replaceStr(text, "\u00a7", "§"); output.append(text); } else { printWarning("Discarded text in <" + tagStack.peek() + "> element"); } } private void haveStartOrEndTag(String tag) { boolean _isLeadingTag = isLeadingTag; isLeadingTag = false; tag = tag.trim(); boolean isEndTag = tag.startsWith("/"); boolean isAtomTag = tag.endsWith("/"); if (isEndTag && isAtomTag) { // got something like '</a/>' which is invalid. // suppose a close tag was intended. tag = tag.substring(0, tag.length()-1); } if (tag.length() < 1) { printWarning("Deleting broken tag"); return; } String tagName = tag.substring(isEndTag?1:0, isAtomTag?tag.length()-1:tag.length()); String tagAttributes = ""; for (int i=0; i<tagName.length(); ++i) { if (" \t\r\n".indexOf(tagName.charAt(i))>=0) { tagAttributes = tagName.substring(i).trim(); tagName = tagName.substring(0, i); break; } } if (!isEndTag && tagName.indexOf('@')>0) { if (!noEmailWarn) { printWarning("Tag looks like email address: <"+tagName+">"); } output.append("<"+tag+">"); return; } tagName = tagName.toLowerCase(); if (_isLeadingTag && "p".equals(tagName) && !isEndTag && throwAwayLeadingPara) { return; } if ("p".equals(tagName) || "br".equals(tagName) || "hr".equals(tagName)) { // throw away </p> and </br> if (isEndTag) { return; } // make sure every <p> is a <p/> and every <br> is a <br/> else if (!isAtomTag) { tag += "/"; isAtomTag = true; } } if (isEndTag) { // check whether this close tag is on the stack // if yes, close all tags up to this tag if (tagStack.contains(tagName)) { String popped; do { popped = (String)tagStack.pop(); if (!popped.equals(tagName)) printWarning("Inserting '</"+popped+">"); output.append("</"+popped+">"); } while (!popped.equals(tagName)); } // if not, just throw it away else { printWarning("Deleting <"+tag+">"); } } else { final int STATE_INITIAL = 1; final int STATE_EXPECT_ATTRIBUTENAME = 2; final int STATE_UNQUOTED_ATTRIBUTEVALUE = 3; final int STATE_SINGLEQUOTE_ATTRIBUTEVALUE = 4; final int STATE_DOUBLEQUOTE_ATTRIBUTEVALUE = 5; final int STATE_EXPECT_ATTRIBUTEVALUE = 6; final int STATE_EXPECT_EQUALSIGN = 7; int state = STATE_INITIAL; String newAttributes = ""; String attributeName = null; StringBuffer buf = new StringBuffer(); char[] attrsAsChars = tagAttributes.toCharArray(); for (int i=0, ilim=attrsAsChars.length+1; i<ilim; ++i) { int c; if (i<attrsAsChars.length) c = (int)attrsAsChars[i]; else c = -1; switch (state) { case STATE_INITIAL: if (" \t\r\n".indexOf(c)>=0){ continue; } else if (-1==c) { continue; } else { state = STATE_EXPECT_ATTRIBUTENAME; buf.append((char)c); } break; case STATE_EXPECT_ATTRIBUTENAME: if ('='==c) { attributeName = buf.toString(); buf.setLength(0); state = STATE_EXPECT_ATTRIBUTEVALUE; } else if (-1==c) { attributeName = buf.toString(); buf.setLength(0); printWarning("In Tag '"+tag+"':\nAttribute name without a value, inserting value =\""+attributeName+"\""); } else if (" \t\r\n".indexOf(c)>=0) { state = STATE_EXPECT_EQUALSIGN; } else { buf.append((char)c); } break; case STATE_EXPECT_EQUALSIGN: if (" \t\r\n".indexOf(c)>=0){ continue; } else if ('='==c) { state = STATE_EXPECT_ATTRIBUTEVALUE; attributeName = buf.toString(); buf.setLength(0); } else { attributeName = buf.toString(); buf.setLength(0); printWarning("In Tag '"+tag+"':\nAttribute name without a value, inserting value =\""+attributeName+"\""); newAttributes += " "+attributeName+"=\""+attributeName+"\""; buf.append((char)c); state = STATE_EXPECT_ATTRIBUTENAME; } break; case STATE_EXPECT_ATTRIBUTEVALUE: if (" \t\r\n".indexOf(c)>=0){ continue; } else if ('\"'==c) { state = STATE_DOUBLEQUOTE_ATTRIBUTEVALUE; } else if ('\''==c) { state = STATE_SINGLEQUOTE_ATTRIBUTEVALUE; } else { state = STATE_UNQUOTED_ATTRIBUTEVALUE; buf.append((char)c); } break; case STATE_UNQUOTED_ATTRIBUTEVALUE: if (-1==c || " \t\r\n".indexOf(c)>=0){ state = STATE_INITIAL; newAttributes += " "+attributeName + "=\"" + buf.toString() + "\""; buf.setLength(0); } else { buf.append((char)c); } break; case STATE_SINGLEQUOTE_ATTRIBUTEVALUE: if ('\''==c) { state = STATE_INITIAL; newAttributes += " "+attributeName + "=\"" + buf.toString() + "\""; buf.setLength(0); } else { buf.append((char)c); } break; case STATE_DOUBLEQUOTE_ATTRIBUTEVALUE: if ('\"'==c) { state = STATE_INITIAL; newAttributes += " "+attributeName + "=\"" + buf.toString() + "\""; buf.setLength(0); } else { buf.append((char)c); } break; } } if (!isAtomTag) { // check whether this open tag is equal to the topmost // entry on the stack; if yes, emit a close tag first // corrects stuff like '<tr><td>...<td>...'); if (!tagStack.isEmpty() && tagStack.peek().equals(tagName)) { printWarning("Inserting </"+tagName+">"); output.append("</"+tagName+">"); tagStack.pop(); } else { processKnownChildTags(tagName, tagStack, output); } // otherwise, we assume there are no close tags required // before this open tag. tagStack.push(tagName); output.append("<"+tagName+newAttributes+">"); } else { output.append("<"+tagName+newAttributes+"/>"); } } } private boolean processKnownChildTags(String tagName, Stack tagStack, StringBuffer output) { TagInfo tagInfo = (TagInfo)tagInfoMap.get(tagName); if (null != tagInfo) { String parentTag = null; for (Enumeration en = tagStack.elements(); en.hasMoreElements(); ) { String tag = (String)en.nextElement(); if (tagInfo.isLegalParentTag(tag)) { parentTag = tag; break; } } if (parentTag != null) { while (((String)tagStack.peek()) != parentTag) { String poppedTagName = (String)tagStack.pop(); output.append("</"+poppedTagName+">"); printWarning("Inserting </"+poppedTagName+">"); } return true; } } return false; } private void flush() { // close all pending tags while (!tagStack.isEmpty()) { String tagName = (String)tagStack.pop(); printWarning("Inserting </"+tagName+">"); output.append("</"+tagName+">"); } } /** * Takes HTML fragment and returns a well-formed XHTML * equivalent. * * In the returned String, all tags are properly closed and * nested. * * Currently, the returned String is not guaranteed to be * well-formed. In particular there are no checks on the tag * names, attribute names and entity names. */ public String getWellformedHTML(String text) { final int STATE_INITIAL = 1; final int STATE_TAG_START = 2; final int STATE_TAG = 3; final int STATE_TAG_DOUBLEQUOTE = 4; final int STATE_TAG_SINGLEQUOTE = 5; final int STATE_AMP = 6; int state = STATE_INITIAL; output.setLength(0); StringBuffer buf = new StringBuffer(); char[] textAsChars = text.toCharArray(); outer_loop: for (int i=0, ilim=textAsChars.length+1; i<ilim; ++i) { int c; if (i<textAsChars.length) { c = textAsChars[i]; } else { c = -1; } switch (state) { case STATE_INITIAL: if ('<'==c) { state = STATE_TAG_START; if (buf.length()>0) { haveText(buf.toString()); buf.setLength(0); } } else if ('>'==c) { // assume this is a greater-than sign buf.append(">"); } else if ('&'==c) { state = STATE_AMP; } else if (-1==c) { if (buf.length()>0) { haveText(buf.toString()); buf.setLength(0); } continue; } else { buf.append((char)c); } break; case STATE_AMP: if ('<'==c) { buf.append("&"); state = STATE_TAG_START; if (buf.length()>0) { haveText(buf.toString()); buf.setLength(0); } } else if ('>'==c) { // assume this is a greater-than sign buf.append("&"); buf.append(">"); state = STATE_INITIAL; } else if ('&'==c) { buf.append("&"); buf.append("&"); state = STATE_INITIAL; } else if (-1==c) { buf.append("&"); haveText(buf.toString()); buf.setLength(0); state = STATE_INITIAL; continue; } else { // peek forward and see whether this is a valid entity. if ('#'==c) { buf.append("&"); buf.append((char)c); state = STATE_INITIAL; continue outer_loop; } else if (Character.isLetter((char)c)) { for (int i2=i+1; i2<ilim-1; i2++) { if (';' == textAsChars[i2]) { buf.append("&"); buf.append((char)c); state = STATE_INITIAL; continue outer_loop; } else if (!Character.isLetter((char)c) && !Character.isDigit((char)c) && ".-_:".indexOf((char)c) < 0 //&& !isCombiningChar(c) // FIXME //&& !isExtender(c) // FIXME ) { break; } } // not a valid entity declaration; assume & } buf.append("&"); buf.append((char)c); state = STATE_INITIAL; } /* else if ('#'==c || Character.isLetter((char)c)) { buf.append("&"); buf.append((char)c); state = STATE_INITIAL; } else { buf.append("&"); buf.append((char)c); state = STATE_INITIAL; } */ break; case STATE_TAG_START: if (" \t\r\n".indexOf(c)>=0) { //continue; // new: assume this is a less-sign haveText("<"+c); state = STATE_INITIAL; } else if ('/'==c) { buf.append((char)c); state = STATE_TAG; } else if ('<'==c) { // assume this is a less-sign haveText("<<"); state = STATE_INITIAL; } else if ('>'==c) { // assume this is a less-sign haveText("<>"); state = STATE_INITIAL; } //else if ('-'==c || '+'==c || '='==c || '\''==c || "0123456789".indexOf(c)>=0) { else if (!Character.isLetter((char)c)) { // assume this is a less-sign haveText("<"+(char)c); state = STATE_INITIAL; } else { buf.append((char)c); state = STATE_TAG; } break; case STATE_TAG: if ('\"'==c) { buf.append((char)c); state = STATE_TAG_DOUBLEQUOTE; } else if ('\''==c) { buf.append((char)c); state = STATE_TAG_SINGLEQUOTE; } else if ('>'==c) { state = STATE_INITIAL; haveStartOrEndTag(buf.toString()); buf.setLength(0); } else if ('<'==c) { // notify user, missing greater-than sign haveStartOrEndTag(buf.toString()); buf.setLength(0); } else if (-1==c) { printWarning("Unclosed tag at end-of-comment: <"+buf); haveStartOrEndTag(buf.toString()); buf.setLength(0); } else { buf.append((char)c); } break; case STATE_TAG_DOUBLEQUOTE: if ('\"'==c) { buf.append((char)c); state = STATE_TAG; } else if (-1==c) { printWarning("Unclosed attribute value at end-of-comment."); haveStartOrEndTag(buf.toString()+"\""); } else { buf.append((char)c); } break; case STATE_TAG_SINGLEQUOTE: if ('\''==c) { buf.append((char)c); state = STATE_TAG; } else if (-1==c) { printWarning("Unclosed attribute value at end-of-comment."); haveStartOrEndTag(buf.toString()+"'"); } else { buf.append((char)c); } break; } } return output.toString(); } private String getContext() { if (null != contextClass) { StringBuffer rc = new StringBuffer(); rc.append(contextClass.qualifiedTypeName()); if (null != contextMember) { rc.append("."+contextMember.toString()); } return rc.toString(); } else { return null; } } private void printWarning(String msg) { if (null != warningReporter && !noWarn) { String context = getContext(); if (null != context) { warningReporter.printWarning("In "+getContext()+": "+msg); } else { warningReporter.printWarning("In overview page: "+msg); } } } public String terminateText() { output.setLength(0); flush(); return output.toString(); } }