package com.jbidwatcher.util.html;
/*
 * Copyright (c) 2000-2007, CyberFOX Software, Inc. All Rights Reserved.
 *
 * Developed by mrs (Morgan Schweers)
 */

import com.jbidwatcher.util.config.JConfig;
import com.jbidwatcher.util.xml.XMLElement;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits an HTML buffer into a flat list of {@link htmlToken}s: tags, end
 * tags, singleton tags, content, and a final EOF marker.  Comments
 * (<code>&lt;!-- ... --&gt;</code>) are skipped and produce no token.  If a
 * {@link JHTMLListener} is supplied it is told about each token just before
 * the token is appended to the list.
 *
 * User: Morgan Schweers
 * Date: Jun 26, 2004
 * Time: 2:34:56 PM
 */
public class JHTMLParser {
  /** Matches a title element on a single line ('.' does not match newlines). */
  private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(.*)</title>");
  /** Matches opening and closing nobr tags (lower case only). */
  private static final Pattern NOBR_PATTERN = Pattern.compile("(<nobr>|</nobr>)");
  /** Matches the non-breaking-space entity, with or without its trailing ';'. */
  private static final Pattern NBSP_ENTITY_PATTERN = Pattern.compile("&nbsp;?");

  private List<htmlToken> m_tokens;
  private JHTMLListener m_notify = null;
  private final static boolean do_uber_debug = false;

  /**
   * Creates a parser and immediately tokenizes the given buffer.
   *
   * @param sb - The HTML text to tokenize.
   * @param notify - Listener told about each token as it is produced; may be null.
   */
  public JHTMLParser(StringBuffer sb, JHTMLListener notify) {
    m_notify = notify;
    setup();
    parse(sb);
  }

  /**
   * Creates a parser with an empty token list; nothing is parsed until
   * {@link #parse(StringBuffer)} is called.
   *
   * @param notify - Listener told about each token as it is produced; may be null.
   */
  public JHTMLParser(JHTMLListener notify) {
    m_notify = notify;
    setup();
  }

  /** Allocates the (synchronized) token list. */
  private void setup() {
    m_tokens = Collections.synchronizedList(new ArrayList<htmlToken>());
  }

  /**
   * Tokenizes the buffer, appending to the token list and always finishing
   * with an empty HTML_EOF token.  The buffer is first passed through
   * {@link #fixupTitle(StringBuffer)} and {@link #stripNoBR(StringBuffer)}.
   *
   * Text that follows the last tag is not emitted as content; content is only
   * flushed when the next '&lt;' is seen.
   *
   * @param trueBuffer - The HTML text to tokenize.
   */
  protected void parse(StringBuffer trueBuffer) {
    boolean inQuote = false, inTag = false, inComment = false;
    char ch, prev, next;

    trueBuffer = fixupTitle(trueBuffer);
    trueBuffer = stripNoBR(trueBuffer);
    int bufLen = trueBuffer.length();
    boolean spitNextTag = false;
    int start = 0;
    int firstClose = 0;
    boolean suspicious = false;

    for (int charStep = 0; charStep < bufLen; charStep++) {
      ch = trueBuffer.charAt(charStep);
      // Neighbours are recomputed on every step so neither can go stale at
      // the edges of the buffer; '\0' stands in for "no such character".
      prev = charBefore(trueBuffer, charStep);
      next = charAfter(trueBuffer, charStep, bufLen);

      if (inTag) {
        //  quoting disabled inside of comment
        if (!inComment) {
          if (inQuote && ch == '>') {
            // A '>' inside a quoted attribute value: remember where it was
            // in case the quote turns out never to have been closed.
            suspicious = true;
            debugParsing(trueBuffer, bufLen, charStep);
            firstClose = charStep;
          }
          if (ch == '"') {
            //  This tries to detect a quote that was never closed.  It only
            //  works if the next open quote is in another tag,
            //  instead of in the middle of some random content.
            if (suspicious && inQuote && prev == '=') {
              charStep = firstClose;
              prev = charBefore(trueBuffer, charStep);
              next = charAfter(trueBuffer, charStep, bufLen);
              // NOTE(review): ch is not re-read after the rewind, so it still
              // holds the '"' that triggered it and the quote handling below
              // runs against the rewound prev/next.  Kept exactly as-is to
              // preserve the token stream existing callers see.
              inQuote = false;
              suspicious = false;
              JConfig.log().logDebug("Potential quote error!");
              spitNextTag = true;
            }
            if (isEndTag(inQuote, prev, next)) {
              spitNextTag = true;
            } else {
              inQuote = !inQuote;
            }
          }
        }

        //  parsing disabled inside of quoted string
        if (!inQuote) {
          //  end Tag and start Content
          if (ch == '>') {
            if (!inComment) {
              //  We've ended a tag, outside a quote.  It's all good.
              if (suspicious) {
                suspicious = false;
              }
              logSubstringWeirdness(trueBuffer, start, charStep);
              spitNextTag = addValidTag(trueBuffer, spitNextTag, start, charStep);
            } else {
              //  Comment ends with "-->"
              inComment = (prev != '-') || (trueBuffer.charAt(charStep - 2) != '-');
            }
            inTag = inComment;
            if (!inTag) {
              start = charStep + 1; //  start of content
            }
          }
        }
      } else {
        //  in Content
        if (ch == '<') {
          addContentBlock(trueBuffer, start, charStep);
          inTag = true;
          //  Comments begin with "<!--"
          inComment = (charStep + 3 < bufLen) &&
                      (next == '!') &&
                      (trueBuffer.charAt(charStep + 2) == '-') &&
                      (trueBuffer.charAt(charStep + 3) == '-');
          start = charStep + 1; //  start of tag
          if (inComment) {
            charStep += 3;
          }
        }
      }
    }
    addToken("", htmlToken.HTML_EOF);
  }

  /** Returns the character before pos, or '\0' when pos is the first index. */
  private static char charBefore(StringBuffer buf, int pos) {
    return pos > 0 ? buf.charAt(pos - 1) : '\0';
  }

  /** Returns the character after pos, or '\0' when pos is the last index. */
  private static char charAfter(StringBuffer buf, int pos, int len) {
    return pos < (len - 1) ? buf.charAt(pos + 1) : '\0';
  }

  /** Returns true when the "show.badhtml" configuration value is "true". */
  private static boolean showBadHtml() {
    return JConfig.queryConfiguration("show.badhtml", "false").equals("true");
  }

  /**
   * Decides whether a '"' seen inside a tag should be ignored rather than
   * treated as opening a quote.
   *
   * @param inQuote - Whether the parser is currently inside a quoted string.
   * @param prev - The character before the quote.
   * @param next - The character after the quote.
   *
   * @return - true if not in a quote, the quote does not follow '=', and
   *           the next character closes the tag.
   */
  private boolean isEndTag(boolean inQuote, char prev, char next) {
    //  This prevents opening a quote at the end of a tag.
    boolean endingTag = !inQuote && prev != '=' && next == '>';
    if (endingTag && showBadHtml()) {
      JConfig.log().logDebug("Quote error!");
    }
    return endingTag;
  }

  /**
   * Reports a '>' found inside a quoted string, but only when
   * JConfig.debugging is set.
   */
  private void debugParsing(StringBuffer trueBuffer, int bufLen, int charStep) {
    if (JConfig.debugging) {
      debugUnusualTags(trueBuffer, bufLen, charStep);
    }
  }

  /**
   * Debug aid for a tag substring whose end precedes its start.  Only logs
   * when the compile-time flag do_uber_debug is enabled.
   */
  private void logSubstringWeirdness(StringBuffer trueBuffer, int start, int charStep) {
    if (charStep < start) {
      if (do_uber_debug) {
        JConfig.log().logDebug("substring(" + start + ", " + charStep + ") of " + trueBuffer.length());
        JConfig.log().logDebug("FAILURE @\n-------------------\n" + trueBuffer.substring(charStep, start));
      }
    }
  }

  /**
   * When "ebay.titleFix" is "true" (the default), replaces every
   * &lt;title&gt;...&lt;/title&gt; match with the XML-encoded form of the FIRST
   * title found.  Otherwise the buffer is returned untouched.
   *
   * @param trueBuffer - The raw HTML.
   *
   * @return - A new buffer with the title encoded, or the original buffer.
   */
  private StringBuffer fixupTitle(StringBuffer trueBuffer) {
    if (!JConfig.queryConfiguration("ebay.titleFix", "true").equals("true")) {
      return trueBuffer;
    }

    StringBuffer sb = new StringBuffer(trueBuffer.length());
    Matcher m = TITLE_PATTERN.matcher(trueBuffer);
    String quotedTitle = null;
    while (m.find()) {
      if (quotedTitle == null) {
        quotedTitle = "<title>" + XMLElement.encodeString(m.group(1)) + "</title>";
      }
      // quoteReplacement keeps '$' and '\' in the title from being read as
      // group references by appendReplacement.
      m.appendReplacement(sb, Matcher.quoteReplacement(quotedTitle));
    }
    m.appendTail(sb);
    return sb;
  }

  /**
   * Emits the text between start and charStep as content, unless that span
   * is empty or contains only whitespace.
   */
  //  end Content and start Tag
  private void addContentBlock(StringBuffer trueBuffer, int start, int charStep) {
    if (start != charStep) {
      String whatToAdd = trueBuffer.substring(start, charStep);
      String trimmed = whatToAdd.trim();

      if (!trimmed.equals("")) {
        addToken(whatToAdd, htmlToken.HTML_CONTENT);
      }
    }
  }

  /**
   * Adds the text between start and charStep as a tag token, and logs it as
   * a 'bad' tag if it was flagged and "show.badhtml" is enabled.
   *
   * @param spitNextTag - Whether the tag was flagged as having a quoting problem.
   *
   * @return - The new value for the caller's flag, which is always false.
   */
  private boolean addValidTag(StringBuffer trueBuffer, boolean spitNextTag, int start, int charStep) {
    addToken(trueBuffer.substring(start, charStep), htmlToken.HTML_TAG);
    if (spitNextTag && showBadHtml()) {
      JConfig.log().logDebug("Added 'bad' tag: <" + trueBuffer.substring(start, charStep) + ">");
    }
    return false;
  }

  /**
   * Logs an excerpt of the text around charStep (back to the previous
   * newline or about 40 characters, forward to the next newline or about 20
   * characters), unless the excerpt looks like a submit button or starts
   * with &lt;META.  Only logs when "show.badhtml" is enabled.
   */
  private void debugUnusualTags(StringBuffer trueBuffer, int bufLen, int charStep) {
    int pre_nl = 0, post_nl = 0, i;

    for (i = charStep - 1; pre_nl == 0 && i > 0 && i > (charStep - 40); i--) {
      if (trueBuffer.charAt(i) == '\n') {
        pre_nl = i + 1;
      }
    }
    if (pre_nl == 0) {
      // i is -1 when charStep is 0; never hand substring a negative index.
      pre_nl = Math.max(i, 0);
    }

    for (i = charStep + 1; post_nl == 0 && i < bufLen && i < (charStep + 20); i++) {
      if (trueBuffer.charAt(i) == '\n') {
        post_nl = i;
      }
    }
    if (post_nl == 0) {
      post_nl = i;
    }

    String oddText = trueBuffer.substring(pre_nl, post_nl);
    if (oddText.indexOf("type=\"submit\"") == -1 &&
        oddText.indexOf("name=\"Submit\"") == -1 &&
        !oddText.startsWith("<META")) {
      if (showBadHtml()) {
        JConfig.log().logMessage("Found an unusual tag @ " + charStep + "... (" + oddText + ")");
      }
    }
  }

  /**
   * Removes every &lt;nobr&gt; and &lt;/nobr&gt; tag from the buffer.
   *
   * @param trueBuffer - The raw HTML.
   *
   * @return - A new buffer without the nobr tags.
   */
  private StringBuffer stripNoBR(StringBuffer trueBuffer) {
    StringBuffer sb = new StringBuffer(trueBuffer.length());
    Matcher m = NOBR_PATTERN.matcher(trueBuffer);
    while (m.find()) {
      m.appendReplacement(sb, "");
    }
    m.appendTail(sb);
    return sb;
  }

  /**
   * Wraps the text in an htmlToken of the right type, notifies the listener
   * (if any) with the index the token is about to occupy, and appends it to
   * the token list.  Content that is empty after whitespace stripping is
   * dropped without notifying anyone.
   *
   * @param newToken - The raw text of the token.
   * @param tokType - One of the htmlToken type constants.
   */
  private void addToken(String newToken, int tokType) {
    htmlToken finalToken;

    switch (tokType) {
      //  Tags are the page-logic.
      case htmlToken.HTML_TAG: {
        int realTokenType = tokType;
        if (isEndTag(newToken)) {
          realTokenType = htmlToken.HTML_ENDTAG;
        }
        // Checked last, so a tag that both starts and ends with '/' is
        // classified as a singleton.
        if (isSingletonTag(newToken)) {
          realTokenType = htmlToken.HTML_SINGLETAG;
        }
        finalToken = new htmlToken(newToken, realTokenType);
        break;
      }
      //  Content is the non-layout portions of the document.
      case htmlToken.HTML_CONTENT: {
        String cleanToken = stripWhitespace(newToken);
        if (cleanToken.length() == 0) {
          return;
        }
        finalToken = new htmlToken(cleanToken, tokType);
        break;
      }
      //  Things like 'HTML_ENDTOKEN', and other arbitrary m_tokens.
      default: {
        finalToken = new htmlToken(newToken, tokType);
      }
    }

    if (m_notify != null) {
      m_notify.addToken(finalToken, m_tokens.size());
    }
    m_tokens.add(finalToken);
  }

  //  Endtags start with '/', i.e. </A>.
  //
  private boolean isEndTag(String checkTag) {
    return (checkTag.length() != 0 && checkTag.charAt(0) == '/');
  }

  /**
   * Right now this 'fakes' it, by checking for the XMLish extension to
   * HTML, which places a '/' at the end of singleton tags.
   *
   * @param checkTag - The tag to check, to see if it's a singleton.
   *
   * @return - true if the tag is considered a singleton, false otherwise.
   */
  private boolean isSingletonTag(String checkTag) {
    return (checkTag.length() != 0 && checkTag.charAt(checkTag.length() - 1) == '/');
  }

  /**
   * Strips whitespace, including 'faked' whitespace (the &amp;nbsp; entity and
   * the literal non-breaking space, character 160), from both sides of the
   * provided string, and turns faked whitespace inside the string into
   * ordinary spaces.
   *
   * @param cleanupString - The content text to clean.
   *
   * @return - The cleaned, trimmed text; possibly empty.
   */
  private String stripWhitespace(String cleanupString) {
    // The pattern must name the entity itself: an optional single character
    // matches the empty string everywhere and would insert a space between
    // every pair of characters.
    String resultString = NBSP_ENTITY_PATTERN.matcher(cleanupString).replaceAll(" ");
    if (resultString.length() != 0) {
      resultString = resultString.replace((char) 160, (char) 32);
    }

    //  Trim, to remove anything left after &nbsp; stripping.
    return resultString.trim();
  }

  /**
   * @return - The live token list (not a copy).
   */
  public List<htmlToken> getTokens() {
    return m_tokens;
  }

  /**
   * @return - The number of tokens produced so far.
   */
  public int getTokenCount() {
    return m_tokens.size();
  }

  /**
   * Looks up a token by position.
   *
   * @param index - Zero-based position in the token list.
   *
   * @return - The token at that position, or null if the index is negative
   *           or past the end of the list.
   */
  public htmlToken getTokenAt(int index) {
    if (index >= 0 && index < getTokenCount()) {
      return m_tokens.get(index);
    }
    return null;
  }
}