// JHTML.java example
//
// Explorer
// jbidwatcher-master
package com.jbidwatcher.util.html;
/*
 * Copyright (c) 2000-2007, CyberFOX Software, Inc. All Rights Reserved.
 *
 * Developed by mrs (Morgan Schweers)
 */

import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import com.jbidwatcher.util.config.JConfig;
import com.jbidwatcher.util.xml.XMLElement;
import com.jbidwatcher.util.xml.XMLInterface;
import com.jbidwatcher.util.http.Http;
import com.jbidwatcher.util.xml.XMLParseException;

public class JHTML implements JHTMLListener {
  //  Set to true by loadParseURL once a page has been received and parsed.
  protected boolean m_loaded = false;
  //  Index of the next token that nextToken() will ask the parser for.
  protected int m_tokenIndex;
  //  Cursor into contentList used by the get*Content() navigation methods.
  protected int m_contentIndex;
  private JHTMLParser m_parser;
  //  Content text -> (token index from the parser, index into contentList).
  private Map<String, intPair> contentMap;
  //  The same pairs as contentMap, keyed by the lower-cased content text.
  private Map<String, intPair> caselessContentMap;
  //  Every content token kept by addToken, in document order.
  private List<String> contentList;
  //  Forms that have been completed so far (see handleForms).
  private List<Form> m_formList;
  //  The form currently being collected, or null when outside a form.
  private Form m_curForm;
  //  Enables extra debug logging in Form when true.
  private static boolean do_uber_debug=false;
  //  Charset named by a meta tag, if one was seen (see checkDocumentType).
  private String mCharset;

  //  Extract just the HREF portion (should look for HREF=\")
  private static Pattern urlMatcher = Pattern.compile("(?i)href=\"([^\"#]*)");

  /**
   * @brief Build a JHTML document from HTML text that is already in memory.
   *
   * The lookup tables are created before the parser is constructed, because
   * this object is handed to the parser as its listener.
   *
   * @param strBuf - The HTML text to hand to the parser.
   */
  public JHTML(StringBuffer strBuf) {
    setup();
    this.m_parser = new JHTMLParser(strBuf, this);
  }

  /**
   * @brief Create empty content/form bookkeeping and rewind both cursors.
   */
  private void setup() {
    m_curForm = null;
    m_formList = new ArrayList<Form>();
    contentList = new ArrayList<String>();
    contentMap = new HashMap<String, intPair>();
    caselessContentMap = new HashMap<String, intPair>();
    reset();
  }

  /**
   * @brief Rewind both the token cursor and the content cursor to the start
   * of the document.
   */
  public void reset() {
    m_contentIndex = 0;
    m_tokenIndex = 0;
  }

  /**
   * @brief Walk the remaining tokens and collect microformat-style properties.
   *
   * Three sources are recognized:
   * <ul>
   *   <li>any tag with an <code>itemprop</code> attribute; the value is its
   *       <code>content</code> attribute if present, otherwise the text found
   *       between the tag and its balancing end tag,</li>
   *   <li><code>meta</code> tags whose <code>property</code> starts with
   *       <code>og:</code> (stored under the name after the prefix),</li>
   *   <li>the <code>meta</code> tag named <code>twitter:text:price</code>
   *       (stored as <code>price</code>).</li>
   * </ul>
   * An existing non-empty value is only replaced by another non-empty value.
   * Tags lacking a <code>content</code> attribute where one is required are
   * skipped instead of causing a NullPointerException.
   *
   * @return A map of property name to value; never null.
   */
  public Map<String, String> extractMicroformat() {
    XMLElement xe = new XMLElement();
    Map<String, String> rval = new HashMap<String, String>();
    String currentProperty = null;
    String currentContent = null;
    int balance = 0;
    htmlToken tok;

    while((tok = nextToken()) != null) {
      int type = tok.getTokenType();

      //  While collecting text for a property, track tag nesting so we know
      //  when the element that declared the property has been closed.
      if(currentProperty != null) {
        if(type == htmlToken.HTML_TAG) balance++;
        if(type == htmlToken.HTML_ENDTAG) {
          balance--;
          if(balance == 0) {
            storeMicroformatValue(rval, currentProperty, currentContent);
            currentProperty = null;
          }
        }
      }

      if(type == htmlToken.HTML_TAG || type == htmlToken.HTML_SINGLETAG) {
        //  Skip <!DOCTYPE ...>, comments and similar declarations.
        if(tok.getToken().startsWith("!")) continue;

        try {
          xe.reset();
          xe.parseString("<" + tok.getToken() + "/>");
        } catch(XMLParseException xpe) {
          JConfig.log().logVerboseDebug("eBay's HTML still sucks.");
          continue;
        }
        String itemprop = xe.getProperty("itemprop");
        if(itemprop != null) {
          String content = xe.getProperty("content");
          if(content != null) {
            storeMicroformatValue(rval, itemprop, content);
          } else {
            //  No inline value; gather the element's text content instead.
            currentProperty = itemprop;
            currentContent = "";
            balance = 1;
          }
        } else if("meta".equals(xe.getTagName())) {
          String property = xe.getProperty("property");
          String content = xe.getProperty("content");
          if(property != null && property.startsWith("og:")) {
            storeMicroformatValue(rval, property.substring(3), content);
          } else {
            String name = xe.getProperty("name");
            //  Never replace a price with null when the tag has no content.
            if(content != null && name != null && name.equals("twitter:text:price")) {
              rval.put("price", content);
            }
          }
        }
      } else if(type == htmlToken.HTML_CONTENT && currentProperty != null) {
        if(currentContent.length() != 0) currentContent += " ";
        currentContent += tok.toString();
      }
    }
    return rval;
  }

  /**
   * @brief Store a property unless that would replace a non-empty value with
   * an empty one; null values are ignored.
   */
  private static void storeMicroformatValue(Map<String, String> props, String key, String value) {
    if(value == null) return;
    String existing = props.get(key);
    if(existing == null || existing.length() == 0 || value.length() != 0) {
      props.put(key, value);
    }
  }

  /**
   * Fixed pair of ints.  addToken stores the parser's token index as the
   * first value and the position within contentList as the second.
   */
  private static class intPair {
    private final int first;
    private final int second;

    public intPair(int f, int s) {
      this.first = f;
      this.second = s;
    }

    /** @return The first value given at construction. */
    public int getFirst() { return this.first; }

    /** @return The second value given at construction. */
    public int getSecond() { return this.second; }
  }

  /**
   * An HTML form gathered during parsing: the parsed opening form tag plus
   * every input (or button) tag registered through {@link #addInput}.
   */
  public static class Form {
    private List<XMLInterface> mAllInputs;
    private XMLElement formTag;
    private static final String FORM_VALUE = "value";
    private static final String FORM_SUBMIT = "submit";
    private static final String FORM_CHECKBOX = "checkbox";
    public static final String FORM_PASSWORD = "password";
    private static final String FORM_HIDDEN = "hidden";
    private static final String FORM_RADIO = "radio";

    /**
     * @param initialTag - The text of the opening form tag, without the
     * surrounding angle brackets.
     */
    public Form(String initialTag) {
      formTag = new XMLElement();
      formTag.parseString('<' + initialTag + "/>");

      mAllInputs = new ArrayList<XMLInterface>();

      if (do_uber_debug) JConfig.log().logDebug("Name: " + formTag.getProperty("name", "(unnamed)"));
    }

    /** @return The form tag's name attribute, as reported by the parsed tag. */
    public String getName() { return formTag.getProperty("name"); }

    /** @return true if any input has the given name (case-insensitive). */
    public boolean hasInput(String srchFor) { return hasInput(srchFor, null); }

    /**
     * @param srchFor - Input name to look for (case-insensitive).
     * @param value - Required value (case-insensitive), or null for any.
     *
     * @return true if a matching input exists.  Inputs without a value
     * attribute never match a non-null value.
     */
    public boolean hasInput(String srchFor, String value) {
      for (XMLInterface curInput : mAllInputs) {
        String name = curInput.getProperty("name");
        if (name == null || !srchFor.equalsIgnoreCase(name)) continue;
        //  Compare from the non-null side: equalsIgnoreCase(null) is simply
        //  false, so an input with no value attribute cannot throw here.
        if (value == null || value.equalsIgnoreCase(curInput.getProperty("value"))) {
          return true;
        }
      }
      return false;
    }

    /**
     * Removes the first input whose name matches (case-insensitive).
     *
     * @return true if an input was removed.
     */
    public boolean delInput(String srchFor) {
      Iterator<XMLInterface> it = mAllInputs.iterator();
      while (it.hasNext()) {
        String name = it.next().getProperty("name");
        if (name != null && srchFor.equalsIgnoreCase(name)) {
          it.remove();
          return true;
        }
      }
      return false;
    }

    /**
     * @return The form data appended to the action with '?' (or '&amp;' when
     * the action already contains a '?'); just the form data when there is
     * no action.
     */
    public String getCGI() throws UnsupportedEncodingException {
      String action = getAction();
      String rval = getFormData();
      if(action != null) {
        char joiner = (action.indexOf('?') == -1) ? '?' : '&';
        rval = action + joiner + rval;
      }

      return rval;
    }

    /**
     * Builds the name=value pairs for this form, joined with '&amp;'.
     * Text, hidden and password inputs are always sent; checkboxes and
     * radios only when they carry a checked attribute; submit inputs only
     * when named and not valued "cancel".  Values are URL-encoded as UTF-8
     * (names are emitted as-is).  Type names are compared without regard to
     * case, matching what {@link #addInput} accepts.
     */
    public String getFormData() throws UnsupportedEncodingException {
      StringBuilder rval = new StringBuilder();
      for (XMLInterface input : mAllInputs) {
        XMLElement curInput = (XMLElement) input;
        String type = curInput.getProperty("type", "text");
        String name = curInput.getProperty("name", "");

        if(do_uber_debug) JConfig.log().logDebug("Type == " + type);

        String value = null;
        if(type.equalsIgnoreCase("text") || type.equalsIgnoreCase(FORM_HIDDEN) || type.equalsIgnoreCase(FORM_PASSWORD)) {
          value = curInput.getProperty(FORM_VALUE, "");
        } else if(type.equalsIgnoreCase(FORM_CHECKBOX) || type.equalsIgnoreCase(FORM_RADIO)) {
          if(curInput.getProperty("checked") != null) {
            value = curInput.getProperty(FORM_VALUE, "on");
          }
        } else if(type.equalsIgnoreCase(FORM_SUBMIT)) {
          if(name.length() != 0) {
            String submitValue = curInput.getProperty(FORM_VALUE, "Submit");
            if (!submitValue.equalsIgnoreCase("cancel")) value = submitValue;
          }
        }

        if(value != null) {
          if(rval.length() != 0) rval.append('&');
          rval.append(name).append('=').append(URLEncoder.encode(value, "UTF-8"));
        }
      }

      return rval.toString();
    }

    /** @return The form tag's action attribute, as reported by the parsed tag. */
    public String getAction() {
      return formTag.getProperty(JHTMLDialog.FORM_ACTION);
    }

    /**
     * @return <code>property="value" </code> (with trailing space) when the
     * tag has the property, otherwise defValue unchanged.
     */
    private String createProperty(String property, XMLInterface tag, String defValue) {
      String value = tag.getProperty(property);
      if(value != null) {
        return property + "=\"" + value + "\" ";
      }
      return defValue;
    }

    /** @return The debug-log prefix for a known input type, or null if unknown. */
    private static String debugLabel(String inputType) {
      if(inputType.equals("text")) return "T";
      if(inputType.equals(FORM_PASSWORD)) return "P";
      if(inputType.equalsIgnoreCase(FORM_HIDDEN) || inputType.equalsIgnoreCase("'hidden'")) return "H";
      if(inputType.equals(FORM_CHECKBOX)) return "CB";
      if(inputType.equals(FORM_RADIO)) return "R";
      if(inputType.equals(FORM_SUBMIT)) return "S";
      if(inputType.equals("image")) return "I";
      if(inputType.equals("button")) return "B";
      if(inputType.equals("reset")) return "RST";
      if(inputType.equals("file")) return "File";
      return null;
    }

    /**
     * Parses an input or button tag and registers it with this form.
     * Button tags are rewritten as input tags carrying the button's type,
     * name and value.  Tags that fail to parse or have an unknown type are
     * logged and ignored (parse failures are rethrown when
     * XMLElement.rejectingBadHTML() says so).
     *
     * @param newTag - The tag text, without the surrounding angle brackets.
     */
    public void addInput(String newTag) {
      XMLElement inputTag = new XMLElement();

      try {
        inputTag.parseString('<' + newTag + "/>");
      } catch (XMLParseException e) {
        if(XMLElement.rejectingBadHTML()) throw e;
        JConfig.log().handleException("Bad input tag", e);
        return;
      }
      String inputType = inputTag.getProperty("type", "text").toLowerCase();
      if(inputTag.getTagName().equals("button")) {
        XMLElement tempTag = new XMLElement();
        String name = createProperty("name", inputTag, "");
        String value= createProperty("value", inputTag, "");
        //  The default must itself be a complete attribute; a bare word here
        //  would fuse with the following name attribute.
        String type = createProperty("type", inputTag, "type=\"button\" ");
        tempTag.parseString("<input " + type + name + value + "/>");
        inputType = tempTag.getProperty("type");
        if(inputType != null) inputType = inputType.toLowerCase();
        inputTag = tempTag;
      }

      boolean showInputs = JConfig.queryConfiguration("debug.showInputs", "false").equals("true");

      if(inputType == null) {
        JConfig.log().logDebug("Bad input tag (ignoring): " + newTag);
        return;
      }

      String label = debugLabel(inputType);
      if(label == null) {
        JConfig.log().logDebug("Unknown input type: " + inputType);
        return;
      }

      if(showInputs) {
        String message = label + ": Name: " + inputTag.getProperty("name");
        //  File inputs are logged without a value.
        if(!inputType.equals("file")) message += ", Value: " + inputTag.getProperty(FORM_VALUE);
        JConfig.log().logDebug(message);
      }

      mAllInputs.add(inputTag);
    }

    /** Sets the value of every input whose name matches key (case-insensitive). */
    public void setText(String key, String val) {
      for (XMLInterface curInput : mAllInputs) {
        String name = curInput.getProperty("name");
        if (name != null && name.equalsIgnoreCase(key)) {
          curInput.setProperty(FORM_VALUE, val);
        }
      }
    }

    /**
     * @return The value of the first input with exactly this name that has a
     * value attribute, or null if there is none.
     */
    public String getInputValue(String inputName) {
      for(XMLInterface input : mAllInputs) {
        String name = input.getProperty("name");
        if(name != null && name.equals(inputName)) {
          String value = input.getProperty("value");
          if (value != null) return value;
        }
      }

      return null;
    }

    /**
     * @return Name to value for every registered input, in registration
     * order.  Names and values are stored exactly as the tags report them.
     */
    public Map<String, Object> getCGIMap() {
      LinkedHashMap<String, Object> rval = new LinkedHashMap<String, Object>();
      for(XMLInterface input : mAllInputs) {
        rval.put(input.getProperty("name"), input.getProperty("value"));
      }
      return rval;
    }
  }

  /** @return The internal list of forms completed so far (not a copy). */
  public List<Form> getForms() {
    return this.m_formList;
  }

  /**
   * @brief Added to work with JHTMLParser, which takes a JHTMLListener (which this implements); this
   * adds each content token into a hash map for later fast lookup.
   *
   * Tag tokens are passed to handleForms, and meta tags are additionally
   * checked for a charset declaration.
   *
   * @param newToken - The token that has been extracted.
   * @param contentIndex - This token's index into the total token list...
   * m_parser.getTokenAt(contentIndex) == newTok.
   */
  public void addToken(htmlToken newToken, int contentIndex) {
    int type = newToken.getTokenType();
    String text = newToken.getToken();

    if(type == htmlToken.HTML_CONTENT) {
      //  Non-numeric single character content tokens suck.
      if(text.length() == 1 && !Character.isDigit(text.charAt(0))) return;

      //  Store the passed content index (the 'real' index), and the internal index,
      //  for quick lookups.
      intPair pair = new intPair(contentIndex, contentList.size());

      //  First entry into each table wins.  The two maps are checked
      //  separately so that a later token differing only in case cannot
      //  replace the earlier entry in the caseless map.
      if(!contentMap.containsKey(text)) {
        contentMap.put(text, pair);
      }
      String lowered = text.toLowerCase();
      if(!caselessContentMap.containsKey(lowered)) {
        caselessContentMap.put(lowered, pair);
      }
      contentList.add(text);
    } else if(type == htmlToken.HTML_TAG ||
              type == htmlToken.HTML_ENDTAG ||
              type == htmlToken.HTML_SINGLETAG) {
      handleForms(newToken);
      //  <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
      if(text.toLowerCase().startsWith("meta")) {
        checkDocumentType(text, "ISO-8859-1");
        checkDocumentType(text, "UTF-8");
      }
    }
  }

  /** @return The charset recorded via setCharset, or null if none was set. */
  public String getCharset() {
    return this.mCharset;
  }

  /**
   * @brief Record a charset if the meta tag text mentions it.
   *
   * The comparison ignores case, so <code>charset=utf-8</code> is recognized
   * as well as <code>charset=UTF-8</code>.
   *
   * @param meta - The text of the meta tag.
   * @param type - The charset name to look for; stored as given when found.
   */
  private void checkDocumentType(String meta, String type) {
    if(meta.toLowerCase().contains(type.toLowerCase())) setCharset(type);
  }

  /**
   * @brief Track form boundaries and feed input/button tags to the open form.
   *
   * A tag starting with "form" closes any form still open and starts a new
   * one; a tag starting with "/form" closes the open form.  While a form is
   * open, tags starting with "input" or "button" are added to it.
   *
   * @param newToken - A tag token (opening, closing or single).
   */
  private void handleForms(htmlToken newToken) {
    String lowered = newToken.getToken().toLowerCase();

    if(lowered.startsWith("form")) {
      //  An unclosed previous form is kept before the new one begins.
      if(m_curForm != null) {
        m_formList.add(m_curForm);
        m_curForm = null;
      }
      try {
        m_curForm = new Form(newToken.getToken());
      } catch (com.jbidwatcher.util.xml.XMLParseException parseException) {
        JConfig.log().logDebug("Form parsing failure: " + parseException);
      }
    } else if(lowered.startsWith("/form")) {
      if(m_curForm != null) m_formList.add(m_curForm);
      m_curForm = null;
    }

    if(m_curForm == null) return;

    boolean isInput = newToken.getToken().regionMatches(true, 0, "input", 0, 5);
    if(isInput || newToken.getToken().regionMatches(true, 0, "button", 0, 6)) {
      m_curForm.addInput(newToken.getToken());
    }
  }

  //------------------------------------------------------------
  // Content operations.
  //------------------------------------------------------------

  /**
   * @brief Helper function to retrieve just the first piece of content from a potentially HTML string.
   *
   * Note that single-character, non-digit content is never recorded (see
   * addToken), so it cannot be returned here.
   *
   * @param toSearch - The string to search for non-tag content.
   *
   * @return The very first block of non-tag content in a potentially HTML
   * string, or null if no content was recorded.
   */
  public static String getFirstContent(String toSearch) {
    //  Presumably the JHTMLParser constructor parses eagerly, filling
    //  contentList before this constructor returns -- confirm in JHTMLParser.
    JHTML parser = new JHTML(new StringBuffer(toSearch));

    if(parser.contentList.isEmpty()) return null;
    return parser.contentList.get(0);
  }

  /**
   * @brief Find the document title.
   *
   * Rewinds to the start, advances to the first tag whose text is exactly
   * "title" (ignoring case), and returns the next content token after it.
   *
   * @return The title text, or null if no such tag or no content follows it.
   */
  public String getTitle() {
    reset();

    String tag;
    do {
      tag = getNextTag();
      if(tag == null) return null;
    } while(!"title".equalsIgnoreCase(tag));

    for(htmlToken t = nextToken(); t != null; t = nextToken()) {
      if(t.getTokenType() == htmlToken.HTML_CONTENT) return t.getToken();
    }
    return null;
  }

  /**
   * @brief Return the content at the cursor and advance the cursor.
   *
   * NOTE(review): the bounds check stops one entry early, so the final
   * content entry is never returned by this method -- confirm whether that
   * is intended before changing it.
   *
   * @return The content at the cursor, or null when the cursor is at or
   * past the last entry.
   */
  public String getNextContent() {
    int upcoming = m_contentIndex + 1;
    if(upcoming >= contentList.size()) return null;

    String current = contentList.get(m_contentIndex);
    m_contentIndex = upcoming;
    return current;
  }

  /**
   * @brief Step the content cursor back one entry and return that entry.
   *
   * @return The entry now under the cursor, or null if the cursor was
   * already at the start.
   */
  public String getPrevContent() {
    if(m_contentIndex == 0) return null;

    m_contentIndex -= 1;
    return contentList.get(m_contentIndex);
  }

  /**
   * @brief Step the content cursor back several entries and return the entry
   * it lands on.
   *
   * @param farBack - How many entries to step back.
   *
   * @return The entry now under the cursor; or null (with the cursor reset
   * to the start) if farBack exceeds the cursor position.
   */
  public String getPrevContent(int farBack) {
    if(farBack <= m_contentIndex) {
      m_contentIndex -= farBack;
      return contentList.get(m_contentIndex);
    }

    m_contentIndex = 0;
    return null;
  }

//  None of these parameter definitions are needed right now.

//  private static final boolean IGNORE_CASE = true;
//  private static final boolean IS_REGEX = true;
//  private static final boolean NOT_REGEX = false;
//  private static final boolean EXACT = true;
//  private static final boolean INEXACT = false;
//  private static final int DOWN = -1;
//  private static final int UP = 1;
  //  Passed as the 'ignoreCase' / 'caseless' argument of find() and
  //  contentLookup(); being false, it requests case-sensitive matching.
  private static final boolean CHECK_CASE = false;

  /**
   * @brief Look up the index pair recorded for an exact content string.
   *
   * @param hunt - The content text to look for.
   * @param caseless - true to compare via the lower-cased table.
   *
   * @return The recorded pair (an internal type, hence Object), or null.
   */
  public Object lookup(String hunt, boolean caseless) {
    return caseless ? caselessContentMap.get(hunt.toLowerCase()) : contentMap.get(hunt);
  }

  /**
   * @brief Position both cursors just after a known content entry and return
   * the entry that follows it.
   *
   * On success the content cursor ends up one past the returned entry, and
   * the token cursor two past the matched token.
   *
   * @param hunt - The exact content text to find.
   * @param caseless - true to match ignoring case.
   *
   * @return The content following the match; null if hunt is unknown or is
   * the last content entry (in which case nothing follows it).
   */
  private String contentLookup(String hunt, boolean caseless) {
    intPair at = (intPair)lookup(hunt, caseless);
    if(at == null) return null;

    m_tokenIndex = at.getFirst() +2;
    m_contentIndex = at.getSecond() +1;
    //  The match may be the final content entry; report "nothing follows"
    //  rather than indexing past the end of the list.
    if(m_contentIndex >= contentList.size()) return null;
    return contentList.get(m_contentIndex++);
  }

  /**
   * @brief Find the first content entry that starts with the given text.
   *
   * @param hunt - The prefix to look for.
   * @param ignoreCase - true to compare without regard to case.
   *
   * @return The first matching content entry, or null.
   */
  public String find(String hunt, boolean ignoreCase) {
    int prefixLength = hunt.length();
    for(int i = 0; i < contentList.size(); i++) {
      String candidate = contentList.get(i);
      if(candidate.regionMatches(ignoreCase, 0, hunt, 0, prefixLength)) return candidate;
    }

    return null;
  }

  /**
   * @brief Find the first content starting with hunt, then return the content
   * that follows it (moving the cursors as contentLookup does).
   *
   * @return The following content, or null if nothing starts with hunt.
   */
  private String contentFind(String hunt, boolean ignoreCase) {
    String found = find(hunt, ignoreCase);
    //  This might not be safe...
    return (found == null) ? null : contentLookup(found, CHECK_CASE);
  }

  /**
   * @brief Return the first content entry that the regex matches in full.
   *
   * @param match - A regular expression.
   *
   * @return The first fully matching content entry, or null.
   */
  public String grep(String match) {
    Pattern matchPat = Pattern.compile(match);
    for(int i = 0; i < contentList.size(); i++) {
      String candidate = contentList.get(i);
      if(matchPat.matcher(candidate).matches()) return candidate;
    }
    return null;
  }

  /**
   * @brief Like grep, but hands back the Matcher so groups can be read.
   *
   * @param match - A regular expression.
   *
   * @return The Matcher for the first content entry matched in full, or null.
   */
  public Matcher realGrep(String match) {
    Pattern matchPat = Pattern.compile(match);
    Iterator<String> walker = contentList.iterator();
    while(walker.hasNext()) {
      Matcher attempt = matchPat.matcher(walker.next());
      if(attempt.matches()) return attempt;
    }
    return null;
  }

  /**
   * @brief Find a content entry fully matching one regex and return the entry
   * right after it, unless that following entry fully matches 'ignore'.
   *
   * When the following entry is ignored, scanning resumes with that very
   * entry, so it can still serve as the next match.  On success the cursors
   * are positioned via contentLookup on the matched entry.
   *
   * @param match - Regex the anchor entry must match in full.
   * @param ignore - Regex of following entries to reject, or null for none.
   *
   * @return The accepted following entry, or null if there is none.
   */
  private String grepAfter(String match, String ignore) {
    Pattern toMatch = Pattern.compile(match);
    Pattern toIgnore = (ignore == null ? null : Pattern.compile(ignore));

    //  The last entry has nothing after it, so it cannot be an anchor.
    int lastAnchor = contentList.size() - 1;
    for(int i = 0; i < lastAnchor; i++) {
      String contentStep = contentList.get(i);
      if(!toMatch.matcher(contentStep).matches()) continue;

      String potential = contentList.get(i + 1);
      if(toIgnore == null || !toIgnore.matcher(potential).matches()) {
        contentLookup(contentStep, false);
        return potential;
      }
    }

    return null;
  }

  /** @brief Thin alias for grepAfter with the same arguments. */
  private String contentGrep(String match, String ignore) {
    String following = grepAfter(match, ignore);
    return following;
  }

  /**
   * @brief Return the content that follows the first entry starting with the
   * given text.
   *
   * CHECK_CASE (false) is passed as the ignore-case flag, so the prefix
   * comparison is case-sensitive.
   *
   * @param previousData - Prefix of the content to search for.
   *
   * @return The content after the match, or null.
   */
  public String getNextContentAfterContent(String previousData) {
    String following = contentFind(previousData, CHECK_CASE);
    return following;
  }

  /**
   * @brief Return the content that precedes the first entry starting with the
   * given text.
   *
   * contentFind leaves the cursor two past the match, so the cursor is
   * stepped back three times: onto the following entry, onto the match, and
   * finally onto the entry before it.
   *
   * @param followingData - Prefix of the content to search for.
   *
   * @return The content before the match, or null if any step fails.
   */
  public String getContentBeforeContent(String followingData) {
    if(contentFind(followingData, CHECK_CASE) == null) return null;
    if(getPrevContent() == null) return null;
    if(getPrevContent() == null) return null;
    return getPrevContent();
  }

  /**
   * @brief Return the content right after the first entry fully matching the
   * regex.
   *
   * @param match - Regex the anchor entry must match in full.
   *
   * @return The following content, or null.
   */
  public String getNextContentAfterRegex(String match) {
    final String nothingIgnored = null;
    return contentGrep(match, nothingIgnored);
  }

  /**
   * @brief Like getNextContentAfterRegex, but following entries that fully
   * match 'ignore' are rejected.
   *
   * @param match - Regex the anchor entry must match in full.
   * @param ignore - Regex of following entries to reject.
   *
   * @return The accepted following content, or null.
   */
  public String getNextContentAfterRegexIgnoring(String match, String ignore) {
    String following = contentGrep(match, ignore);
    return following;
  }

  /**
   * Reports whether the content list contains consecutive entries matching
   * the given regular expressions, in order, as determined by findSequence.
   *
   * The underlying scan is a simple forward pass, not a full backtracking
   * search.  (Essentially this should become a larger scale Boyer-Moore.)
   *
   * @param sequence - The sequence of regular expressions to match
   * @return true if the sequence is non-empty and findSequence found a match.
   */
  public boolean hasSequence(String... sequence) {
    return sequence.length != 0 && findSequence(sequence) != null;
  }

  /**
   * Result of a sequence search: the list holds the matched content entries,
   * and the extra fields carry what findNextSequence needs to resume.
   */
  public class SequenceResult extends LinkedList<String> {
    //  The compiled patterns being searched for, in order.
    Pattern[] sequence;
    //  Index into the content list at which the next search should start.
    int nextStartPoint;

    /** @return The content-list index at which the next search will start. */
    public int getNextStartPoint() {
      return nextStartPoint;
    }
  }

  /**
   * @brief Compile the given regexes and search for them, from the start of
   * the content list, as a run of consecutive entries.
   *
   * @param originalSequence - The regular expressions, in order.
   *
   * @return Whatever findNextSequence returns for a fresh search.
   */
  public SequenceResult findSequence(String... originalSequence) {
    Pattern[] compiled = new Pattern[originalSequence.length];
    for(int i = 0; i < originalSequence.length; i++) {
      compiled[i] = Pattern.compile(originalSequence[i]);
    }

    SequenceResult search = new SequenceResult();
    search.nextStartPoint = 0;
    search.sequence = compiled;
    return findNextSequence(search);
  }

  /**
   * @brief Continue a sequence search from where the previous one stopped.
   *
   * Scans forward from contentSequence.nextStartPoint for consecutive content
   * entries that fully match the patterns in order.  Entries left over from
   * a previous match are discarded first, so the result holds only the new
   * match.  When an entry breaks a partial run, that same entry is retried
   * against the first pattern so it can start a new run.  This is still not
   * a full backtracking search.
   *
   * @param contentSequence - The search state, as produced by findSequence
   * or a previous call.
   *
   * @return contentSequence holding the matched entries, with nextStartPoint
   * moved just past the match; null if no match was found or there are no
   * patterns.
   */
  public SequenceResult findNextSequence(SequenceResult contentSequence) {
    Pattern[] inputPattern = contentSequence.sequence;
    contentSequence.clear();
    if(inputPattern == null || inputPattern.length == 0) return null;

    int index = 0;
    for(int at = contentSequence.nextStartPoint; at < contentList.size(); at++) {
      String contentStep = contentList.get(at);
      boolean matched = inputPattern[index].matcher(contentStep).matches();

      if(!matched && index != 0) {
        //  The partial run is broken; this entry may begin a new one.
        contentSequence.clear();
        index = 0;
        matched = inputPattern[0].matcher(contentStep).matches();
      }

      if(matched) {
        contentSequence.add(contentStep);
        index++;
        if(index == inputPattern.length) {
          contentSequence.nextStartPoint = at + 1;
          return contentSequence;
        }
      }
    }

    //  Don't leave a partial run behind in the caller's object.
    contentSequence.clear();
    return null;
  }

  //------------------------------------------------------------
  // Tag operations.
  //------------------------------------------------------------

  /**
   * @brief Advance past any content tokens and return the next tag's text.
   *
   * @return The text of the next non-content token, or null at the end of
   * the token stream (or on an EOF token).
   */
  public String getNextTag() {
    htmlToken tok = nextToken();
    //  A content token is by definition not the EOF token, so testing for
    //  content alone is enough to decide whether to keep skipping.
    while(tok != null && tok.getTokenType() == htmlToken.HTML_CONTENT) {
      tok = nextToken();
    }

    if(tok == null || tok.getTokenType() == htmlToken.HTML_EOF) return null;
    return tok.getToken();
  }

  /**
   * @brief Collect the text of every anchor tag from the current token
   * position to the end of the document.
   *
   * @return The anchor tags in document order, or null if none were found.
   */
  public List<String> getAllLinks() {
    List<String> anchors = null;

    for(String tag = getNextTag(); tag != null; tag = getNextTag()) {
      boolean isAnchor = tag.startsWith("A ") || tag.startsWith("a ");
      if(!isAnchor) continue;

      if(anchors == null) anchors = new ArrayList<String>();
      anchors.add(tag);
    }
    return anchors;
  }

  /**
   * @brief Find the href of the anchor that encloses the given content.
   *
   * Scans from the start of the document, remembering the most recent
   * opening anchor tag until its closing tag is seen.  Only content tokens
   * are compared against searchContent.
   *
   * @param searchContent - The exact content text to look for inside a link.
   *
   * @return The href value (up to any '#') of the enclosing anchor, or null
   * if the content is not found inside an anchor with a quoted href.
   */
  public String getLinkForContent(String searchContent) {
    reset();
    String lastTag = null;

    for(htmlToken curToken = nextToken(); curToken != null; curToken = nextToken()) {
      switch(curToken.getTokenType()) {
        case htmlToken.HTML_TAG: {
          String tag = curToken.getToken();
          if(tag.regionMatches(true, 0, "a ", 0, 2)) {
            lastTag = tag;
          }
          break;
        }
        case htmlToken.HTML_ENDTAG: {
          String tag = curToken.getToken();
          if(tag.equalsIgnoreCase("/a")) {
            lastTag = null;
          }
          //  Must not fall through: an end tag's text is not content.
          break;
        }
        case htmlToken.HTML_CONTENT: {
          if(lastTag != null && searchContent.equals(curToken.getToken())) {
            Matcher result = urlMatcher.matcher(lastTag);
            if(result.find()) {
              return result.group(1);
            }
          }
          break;
        }
        default:
          break;
      }
    }

    return null;
  }

  //  A double-quoted src attribute; matched regardless of case and across
  //  line breaks inside the tag.  The leading whitespace keeps attributes
  //  such as data-src from being mistaken for src.
  private static final Pattern IMG_SRC =
      Pattern.compile("\\ssrc\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

  /**
   * @brief Collect the distinct image URLs from the current token position
   * to the end of the document.
   *
   * Only img tags with a double-quoted src attribute contribute; tags
   * without one are skipped rather than being added verbatim.
   *
   * @return The distinct src values (in no particular order); never null.
   */
  public List<String> getAllImages() {
    Set<String> imgUrls = new HashSet<String>();

    for(String curTag = getNextTag(); curTag != null; curTag = getNextTag()) {
      if(!curTag.toLowerCase().startsWith("img ")) continue;

      Matcher src = IMG_SRC.matcher(curTag);
      if(src.find()) {
        imgUrls.add(deAmpersand(src.group(1)));
      }
    }

    return new ArrayList<String>(imgUrls);
  }

  //  An item-page URL: scheme, <sub>.ebay.<tld...>, optional "itm/", a title
  //  slug, a numeric id, and an optional query string.  Dots in the host are
  //  literal.
  private static final Pattern VIEW_ITEM_URL =
      Pattern.compile("https?://[a-z]+\\.ebay\\.[a-z.]+/(?:itm/)?[A-Za-z0-9-]+/[0-9]+(\\?.*)?");

  /**
   * @brief Collect the href of every link from the current token position to
   * the end of the document.
   *
   * @param viewOnly - When true, keep only links that look like item pages
   * (matching the item URL pattern, or containing "ViewItem"); those hrefs
   * are passed through deAmpersand.
   *
   * @return The hrefs in document order, or null if there are no links or
   * none qualified.
   */
  public List<String> getAllURLsOnPage(boolean viewOnly) {
    // Add ALL auctions on myEbay bidding/watching page!
    List<String> addressTags = getAllLinks();
    if(addressTags == null) return null;
    List<String> outEntries = null;

    for (String curTag : addressTags) {
      Matcher result = urlMatcher.matcher(curTag);
      if(!result.find()) continue;

      String href = result.group(1);
      if (viewOnly) {
        boolean isView = VIEW_ITEM_URL.matcher(href).matches() || href.indexOf("ViewItem") != -1;
        if (!isView) continue;
        href = deAmpersand(href);
      }

      if (outEntries == null) outEntries = new ArrayList<String>();
      outEntries.add(href);
    }
    return outEntries;
  }

  /**
   * @brief Replace every HTML-escaped ampersand (<code>&amp;amp;</code>) in a
   * URL with a plain <code>&amp;</code>.
   *
   * Plain ampersands and all other text are left untouched.
   *
   * @param href - The text to unescape.
   *
   * @return The text with each escaped ampersand collapsed to '&amp;'.
   */
  public static String deAmpersand(String href) {
    return href.replace("&amp;", "&");
  }

  //------------------------------------------------------------
  // Generic token operations.
  //------------------------------------------------------------

  /**
   * @brief Fetch the token at the token cursor and advance past it.
   *
   * @return The next token, or null (cursor unchanged) when the parser has
   * no token at that position.
   */
  public htmlToken nextToken() {
    htmlToken rval = m_parser.getTokenAt(m_tokenIndex);
    if(rval != null) m_tokenIndex++;
    return rval;
  }

  /** @return true once loadParseURL has received and parsed a page. */
  public boolean isLoaded() {
    return this.m_loaded;
  }

  /**
   * @brief Fetch a page, optionally clean it up, and parse it.
   *
   * m_loaded ends up true only if a page was received and parsed without an
   * IOException; I/O failures are logged, not thrown.
   *
   * @param newURL - The URL to fetch.
   * @param cookie - A cookie to pass along with the request.
   * @param cl - Optional hook to tidy the page text before parsing; may be null.
   */
  private void loadParseURL(String newURL, String cookie, CleanupHandler cl) {
    m_parser = new JHTMLParser(this);
    boolean parsed = false;

    try {
      URLConnection uc = Http.net().getPage(newURL, cookie, null, true);
      StringBuffer page = Http.net().receivePage(uc);
      if(page != null) {
        if(cl != null) cl.cleanup(page);
        m_parser.parse(page);
        parsed = true;
      }
    } catch(IOException e) {
      JConfig.log().handleException("JHTML.loadPage: (" + newURL + ") " + e, e);
    }
    m_loaded = parsed;
  }

  /**
   * @brief Convenience constructor that does the 'usual' work for a web page:
   * fetch it, optionally clean it, and parse it into this object.
   *
   * Pages that need more processing have to do it by hand.  Use isLoaded()
   * to find out whether the fetch and parse succeeded.
   *
   * @param newURL - The URL to get, receive, and pre-parse.
   * @param cookie - A cookie to pass along when getting the page.
   * @param cl - A CleanupHandler to call to clean up the StringBuffer before continuing.
   */
  public JHTML(String newURL, String cookie, CleanupHandler cl) {
    this.setup();
    this.loadParseURL(newURL, cookie, cl);
  }

  /**
   * @brief Find the first completed form containing an input with the given
   * name (case-insensitive).
   *
   * @param input - The input name to look for.
   *
   * @return The first form that has such an input, or null.
   */
  public JHTML.Form getFormWithInput(String input) {
    Iterator<Form> walker = getForms().iterator();
    while(walker.hasNext()) {
      Form candidate = walker.next();
      if(candidate.hasInput(input)) return candidate;
    }

    return null;
  }

  /**
   * @brief Record the document's charset.
   *
   * @param charset - The charset name to remember.
   */
  public void setCharset(String charset) {
    this.mCharset = charset;
  }

  /**
   * @brief Test whether a token has the given type and its text starts with
   * the given tag name (ignoring case).
   *
   * This is a prefix test, so "th" also matches a token whose text is
   * "thead".
   */
  private boolean isToken(htmlToken tok, int tokenType, String tag) {
    if(tok.getTokenType() != tokenType) return false;
    return tok.getToken().regionMatches(true, 0, tag, 0, tag.length());
  }

  /**
   * The text of an HTML table, stored as rows of cell strings.
   */
  public class Table {
    private List<List<String>> data;

    public Table() {
      data = new ArrayList<List<String>>();
    }

    /** Appends a row; the list is stored as given, not copied. */
    public void addRow(List<String> newRow) {
      data.add(newRow);
    }

    /**
     * @param col - Zero-based column index.
     * @param row - Zero-based row index.
     *
     * @return The cell text; throws IndexOutOfBoundsException if either
     * index is out of range.
     */
    public String getCell(int col, int row) {
      return data.get(row).get(col);
    }

    /** @return The stored row; throws IndexOutOfBoundsException if out of range. */
    public List<String> getRow(int row) {
      return data.get(row);
    }

    /**
     * @param row - Zero-based row index.
     * @param regexp - Regular expression a cell must match in full.
     *
     * @return true if any cell of the row matches; false if none does or if
     * the row does not exist.
     */
    public boolean rowCellMatches(int row, String regexp) {
      //  Covers the empty table as well as any out-of-range row.
      if(row < 0 || row >= data.size()) return false;
      for(String cell : data.get(row)) {
        if(cell.matches(regexp)) return true;
      }
      return false;
    }

    /** @return The number of rows stored. */
    public int getRowCount() {
      return data.size();
    }
  }

  /**
   * @brief Extract every table from the current token position to the end of
   * the document.
   *
   * Nested tables are added to the same list by extractTable, ahead of the
   * table that contains them.
   *
   * @return The tables found; never null.
   */
  public List<Table> extractTables() {
    List<Table> found = new ArrayList<Table>(1);

    for(htmlToken tok = nextToken(); tok != null; tok = nextToken()) {
      if(isToken(tok, htmlToken.HTML_TAG, "table")) {
        found.add(extractTable(found));
      }
    }
    return found;
  }

  /**
   * @brief Read one table's rows, starting just after its opening tag and
   * stopping at its closing tag or at the end of the token stream.
   *
   * Each cell's text is its content tokens joined by single spaces; empty
   * cells and empty rows are dropped.  A nested table is extracted
   * recursively and appended to tableContents; its text does not become
   * part of the enclosing cell.
   *
   * @param tableContents - List that receives any nested tables.
   *
   * @return The rows of this table (possibly none).
   */
  public Table extractTable(List<Table> tableContents) {
    Table currentTableContents = new Table();
    List<String> rowCells = new ArrayList<String>();
    String curCell = null;

    //  Stop at end-of-stream too, so an unterminated table cannot cause a
    //  NullPointerException.
    for(htmlToken tok = nextToken();
        tok != null && !isToken(tok, htmlToken.HTML_ENDTAG, "/table");
        tok = nextToken()) {
      if (isToken(tok, htmlToken.HTML_TAG, "table")) {
        tableContents.add(extractTable(tableContents));
        continue;
      }

      if (isToken(tok, htmlToken.HTML_ENDTAG, "/tr")) {
        //  End of row: keep it only if it collected any cells.
        if (!rowCells.isEmpty()) {
          currentTableContents.addRow(rowCells);
        }
        rowCells = new ArrayList<String>();
      } else if (isToken(tok, htmlToken.HTML_TAG, "td") || isToken(tok, htmlToken.HTML_TAG, "th")) {
        curCell = "";
      } else if (isToken(tok, htmlToken.HTML_ENDTAG, "/td") || isToken(tok, htmlToken.HTML_ENDTAG, "/th")) {
        if (curCell != null && curCell.length() != 0) rowCells.add(curCell);
        curCell = null;
      }

      if (tok.getTokenType() == htmlToken.HTML_CONTENT && curCell != null) {
        if (curCell.length() != 0) curCell += ' ';
        curCell += tok.getToken();
      }
    }
    return currentTableContents;
  }
}