package com.jbidwatcher.util.html; /* * Copyright (c) 2000-2007, CyberFOX Software, Inc. All Rights Reserved. * * Developed by mrs (Morgan Schweers) */ import java.net.*; import java.io.*; import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; import com.jbidwatcher.util.config.JConfig; import com.jbidwatcher.util.xml.XMLElement; import com.jbidwatcher.util.xml.XMLInterface; import com.jbidwatcher.util.http.Http; import com.jbidwatcher.util.xml.XMLParseException; public class JHTML implements JHTMLListener { protected boolean m_loaded = false; protected int m_tokenIndex; protected int m_contentIndex; private JHTMLParser m_parser; private Map<String, intPair> contentMap; private Map<String, intPair> caselessContentMap; private List<String> contentList; private List<Form> m_formList; private Form m_curForm; private static boolean do_uber_debug=false; private String mCharset; // Extract just the HREF portion (should look for HREF=\") private static Pattern urlMatcher = Pattern.compile("(?i)href=\"([^\"#]*)"); public JHTML(StringBuffer strBuf) { setup(); m_parser = new JHTMLParser(strBuf, this); } private void setup() { caselessContentMap = new HashMap<String, intPair>(); contentMap = new HashMap<String, intPair>(); contentList = new ArrayList<String>(); m_formList = new ArrayList<Form>(); m_curForm = null; reset(); } /** * @brief Set the 'tag pointer' to the start of the document. */ public void reset() { m_tokenIndex = 0; m_contentIndex = 0; } public Map<String, String> extractMicroformat() { XMLElement xe = new XMLElement(); String currentProperty = null; Map<String, String> rval = new HashMap<String, String>(); htmlToken tok; int balance = 0; String currentContent = null; while((tok = nextToken()) != null) { int type = tok.getTokenType(); if(currentProperty != null) { if(type == htmlToken.HTML_TAG) balance++; if(type == htmlToken.HTML_ENDTAG) { balance--; if(balance == 0) { if(rval.get(currentProperty) == null || rval.get(currentProperty).length() == 0 || currentContent.length() != 0) { rval.put(currentProperty, currentContent); } currentProperty = null; } } } if(type == htmlToken.HTML_TAG || type == htmlToken.HTML_SINGLETAG) { if(tok.getToken().startsWith("!")) continue; try { xe.reset(); xe.parseString("<" + tok.getToken() + "/>"); } catch(XMLParseException xpe) { JConfig.log().logVerboseDebug("eBay's HTML still sucks."); continue; } String itemprop = xe.getProperty("itemprop"); if(itemprop != null) { String content = xe.getProperty("content"); if (content != null) { if(rval.get(itemprop) == null || rval.get(itemprop).length() == 0 || content.length() != 0) { rval.put(itemprop, content); } } else { currentProperty = itemprop; currentContent = ""; balance = 1; } } else if(xe.getTagName().equals("meta")) { String property = xe.getProperty("property"); String content = xe.getProperty("content"); if(property != null && property.startsWith("og:")) { itemprop = property.substring(3); if (rval.get(itemprop) == null || rval.get(itemprop).length() == 0 || content.length() != 0) { rval.put(itemprop, content); } } else { String name = xe.getProperty("name"); if(name != null && name.equals("twitter:text:price")) { rval.put("price", content); } } } } else if(type == htmlToken.HTML_CONTENT && currentProperty != null) { if(currentContent.length() != 0) currentContent += " "; currentContent += tok.toString(); } } return rval; } private static class intPair { private int first; private int second; public intPair(int f, int s) { first = f; second = s; } public int getFirst() { return first; } public int getSecond() { return second; } } public static class Form { private List<XMLInterface> mAllInputs; private XMLElement formTag; private static final String FORM_VALUE = "value"; private static final String FORM_SUBMIT = "submit"; private static final String FORM_CHECKBOX = "checkbox"; public static final String FORM_PASSWORD = "password"; private static final String FORM_HIDDEN = "hidden"; private static final String FORM_RADIO = "radio"; public Form(String initialTag) { formTag = new XMLElement(); formTag.parseString('<' + initialTag + "/>"); mAllInputs = new ArrayList<XMLInterface>(); if (do_uber_debug) JConfig.log().logDebug("Name: " + formTag.getProperty("name", "(unnamed)")); } public String getName() { return formTag.getProperty("name"); } public boolean hasInput(String srchFor) { return hasInput(srchFor, null); } public boolean hasInput(String srchFor, String value) { for (XMLInterface curInput : mAllInputs) { String name = curInput.getProperty("name"); if (name != null) { if (srchFor.equalsIgnoreCase(name) && (value == null || curInput.getProperty("value").equalsIgnoreCase(value))) { return true; } } } return false; } public boolean delInput(String srchFor) { Iterator<XMLInterface> it = mAllInputs.iterator(); while (it.hasNext()) { XMLInterface curInput = it.next(); String name=curInput.getProperty("name"); if(name != null) { if(srchFor.equalsIgnoreCase(name)) { it.remove(); return true; } } } return false; } public String getCGI() throws UnsupportedEncodingException { String action = getAction(); String rval = getFormData(); if(action != null) { if (action.indexOf('?') == -1) { rval = action + '?' + rval; } else { rval = action + '&' + rval; } } return rval; } public String getFormData() throws UnsupportedEncodingException { Iterator<XMLInterface> it = mAllInputs.iterator(); StringBuffer rval = new StringBuffer(""); String seperator = ""; while(it.hasNext()) { XMLElement curInput = (XMLElement)it.next(); if(do_uber_debug) JConfig.log().logDebug("Type == " + curInput.getProperty("type", "text")); if (rval.length() != 0) { seperator = "&"; } String type = curInput.getProperty("type", "text"); String name = curInput.getProperty("name", ""); if(type.equals("text") || type.equalsIgnoreCase(FORM_HIDDEN) || type.equals(FORM_PASSWORD)) { // Need to URL-Encode 'value'... rval.append(seperator).append(name).append('=').append(URLEncoder.encode(curInput.getProperty(FORM_VALUE, ""), "UTF-8")); } else if(type.equals(FORM_CHECKBOX) || type.equals(FORM_RADIO)) { if(curInput.getProperty("checked") != null) { rval.append(seperator).append(name).append('=').append(URLEncoder.encode(curInput.getProperty(FORM_VALUE, "on"), "UTF-8")); } } else if(type.equals(FORM_SUBMIT)) { if(name.length() != 0) { String value = curInput.getProperty(FORM_VALUE, "Submit"); if (!value.equalsIgnoreCase("cancel")) { rval.append(seperator).append(name).append('=').append(URLEncoder.encode(value, "UTF-8")); } } } } return rval.toString(); } public String getAction() { return formTag.getProperty(JHTMLDialog.FORM_ACTION); } private String createProperty(String property, XMLInterface tag, String defValue) { String value = tag.getProperty(property); if(value != null) { return property + "=\"" + value + "\" "; } return defValue; } public void addInput(String newTag) { XMLElement inputTag = new XMLElement(); try { inputTag.parseString('<' + newTag + "/>"); } catch (XMLParseException e) { if(XMLElement.rejectingBadHTML()) throw e; JConfig.log().handleException("Bad input tag", e); return; } String inputType = inputTag.getProperty("type", "text").toLowerCase(); if(inputTag.getTagName().equals("button")) { XMLElement tempTag = new XMLElement(); String name = createProperty("name", inputTag, ""); String value= createProperty("value", inputTag, ""); String type = createProperty("type", inputTag, "button"); tempTag.parseString("<input " + type + name + value + "/>"); inputType = tempTag.getProperty("type"); inputTag = tempTag; } boolean showInputs = JConfig.queryConfiguration("debug.showInputs", "false").equals("true"); boolean isError = inputType == null; if(!isError) { if(inputType.equals("text")) { if (showInputs) JConfig.log().logDebug("T: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals(FORM_PASSWORD)) { if (showInputs) JConfig.log().logDebug("P: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if (inputType.equalsIgnoreCase(FORM_HIDDEN) || inputType.equalsIgnoreCase("'hidden'")) { if (showInputs) JConfig.log().logDebug("H: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals(FORM_CHECKBOX)) { if (showInputs) JConfig.log().logDebug("CB: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals(FORM_RADIO)) { if (showInputs) JConfig.log().logDebug("R: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals(FORM_SUBMIT)) { if (showInputs) JConfig.log().logDebug("S: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals("image")) { if (showInputs) JConfig.log().logDebug("I: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals("button")) { if (showInputs) JConfig.log().logDebug("B: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals("reset")) { if (showInputs) JConfig.log().logDebug("RST: Name: " + inputTag.getProperty("name") + ", Value: " + inputTag.getProperty(FORM_VALUE)); } else if(inputType.equals("file")) { if (showInputs) JConfig.log().logDebug("File: Name: " + inputTag.getProperty("name")); } else { JConfig.log().logDebug("Unknown input type: " + inputType); isError = true; } } else { JConfig.log().logDebug("Bad input tag (ignoring): " + newTag); } if(!isError) { mAllInputs.add(inputTag); } } public void setText(String key, String val) { for (XMLInterface curInput : mAllInputs) { String name = curInput.getProperty("name"); if (name != null) { if (name.equalsIgnoreCase(key)) { curInput.setProperty(FORM_VALUE, val); } } } } public String getInputValue(String inputName) { for(XMLInterface input : mAllInputs) { String name = input.getProperty("name"); if(name != null && name.equals(inputName)) { if (input.getProperty("value") != null) { return input.getProperty("value"); } } } return null; } public Map<String, Object> getCGIMap() { LinkedHashMap<String, Object> rval = new LinkedHashMap<String, Object>(); for(XMLInterface input : mAllInputs) { String name = input.getProperty("name"); String value = input.getProperty("value"); rval.put(name, value); } return rval; } } public List<Form> getForms() { return m_formList; } /** * @brief Added to work with JHTMLParser, which takes a JHTMLListener (which this implements); this * adds each content token into a hash map for later fast lookup. * * @param newToken - The token that has been extracted. * @param contentIndex - This token's index into the total token list... * m_parser.getTokenAt(contentIndex) == newTok. */ public void addToken(htmlToken newToken, int contentIndex) { if(newToken.getTokenType() == htmlToken.HTML_CONTENT) { // Non-numeric single character content tokens suck. if(newToken.getToken().length() == 1 && !Character.isDigit(newToken.getToken().charAt(0))) return; // Keep the content stored by lowercase value, for case-insensitive searching. // Store the passed content index (the 'real' index), and the internal index, // for quick lookups. intPair pair = new intPair(contentIndex, contentList.size()); // First entry into the table wins. if(!contentMap.containsKey(newToken.getToken())) { contentMap.put(newToken.getToken(), pair); caselessContentMap.put(newToken.getToken().toLowerCase(), pair); } contentList.add(newToken.getToken()); } else { if(newToken.getTokenType() == htmlToken.HTML_TAG || newToken.getTokenType() == htmlToken.HTML_ENDTAG || newToken.getTokenType() == htmlToken.HTML_SINGLETAG) { handleForms(newToken); // <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"> if(newToken.getToken().toLowerCase().startsWith("meta")) { checkDocumentType(newToken.getToken(), "ISO-8859-1"); checkDocumentType(newToken.getToken(), "UTF-8"); } } } } public String getCharset() { return mCharset; } private void checkDocumentType(String meta, String type) { if(meta.contains(type)) setCharset(type); } private void handleForms(htmlToken newToken) { if(newToken.getToken().toLowerCase().startsWith("form")) { if (m_curForm != null) { m_formList.add(m_curForm); m_curForm = null; } try { m_curForm = new Form(newToken.getToken()); } catch (com.jbidwatcher.util.xml.XMLParseException parseException) { JConfig.log().logDebug("Form parsing failure: " + parseException); } } else if(newToken.getToken().toLowerCase().startsWith("/form")) { if(m_curForm != null) m_formList.add(m_curForm); m_curForm = null; } if(m_curForm != null) { if(newToken.getToken().regionMatches(true, 0, "input", 0, 5) || newToken.getToken().regionMatches(true, 0, "button", 0, 6)) { m_curForm.addInput(newToken.getToken()); } } } //------------------------------------------------------------ // Content operations. //------------------------------------------------------------ /** * @brief Helper function to retrieve just the first piece of content from a potentially HTML string. * * @param toSearch - The string to search for non-tag content. * * @return The very first block of non-tag content in a potentially HTML string. */ public static String getFirstContent(String toSearch) { JHTML parser = new JHTML(new StringBuffer(toSearch)); return parser.contentList.get(0); } public String getTitle() { reset(); String tagWalk = getNextTag(); while(tagWalk != null && !"title".equalsIgnoreCase(tagWalk)) { tagWalk = getNextTag(); } if(tagWalk == null) return null; htmlToken t = nextToken(); while(t != null && t.getTokenType() != htmlToken.HTML_CONTENT) t = nextToken(); return t == null ? null : t.getToken(); } public String getNextContent() { if( (m_contentIndex+1) >= contentList.size()) return null; return contentList.get(m_contentIndex++); } public String getPrevContent() { if(m_contentIndex == 0) return null; return contentList.get(--m_contentIndex); } public String getPrevContent(int farBack) { if(farBack > m_contentIndex) { m_contentIndex = 0; return null; } m_contentIndex -= farBack; return contentList.get(m_contentIndex); } // None of these parameter definitions are needed right now. // private static final boolean IGNORE_CASE = true; // private static final boolean IS_REGEX = true; // private static final boolean NOT_REGEX = false; // private static final boolean EXACT = true; // private static final boolean INEXACT = false; // private static final int DOWN = -1; // private static final int UP = 1; private static final boolean CHECK_CASE = false; public Object lookup(String hunt, boolean caseless) { intPair at; if (caseless) { at = caselessContentMap.get(hunt.toLowerCase()); } else { at = contentMap.get(hunt); } return at; } private String contentLookup(String hunt, boolean caseless) { intPair at = (intPair)lookup(hunt, caseless); if(at == null) return null; m_tokenIndex = at.getFirst() +2; m_contentIndex = at.getSecond() +1; return contentList.get(m_contentIndex++); } public String find(String hunt, boolean ignoreCase) { for (String nextContent : contentList) { if (nextContent.regionMatches(ignoreCase, 0, hunt, 0, hunt.length())) { return nextContent; } } return null; } private String contentFind(String hunt, boolean ignoreCase) { String nextContent = find(hunt, ignoreCase); if (nextContent != null) { // This might not be safe... nextContent = contentLookup(nextContent, CHECK_CASE); } return nextContent; } public String grep(String match) { Pattern matchPat = Pattern.compile(match); for (String nextContent : contentList) { Matcher m = matchPat.matcher(nextContent); if (m.matches()) { // This might not be safe... return nextContent; } } return null; } public Matcher realGrep(String match) { Pattern matchPat = Pattern.compile(match); for (String nextContent : contentList) { Matcher m = matchPat.matcher(nextContent); if (m.matches()) { return m; } } return null; } private String grepAfter(String match, String ignore) { Pattern toMatch = Pattern.compile(match); Pattern toIgnore = (ignore == null ? null : Pattern.compile(ignore)); for (Iterator<String> it = contentList.iterator(); it.hasNext();) { String contentStep = it.next(); if(toMatch.matcher(contentStep).matches()) { Iterator<String> save = it; if(it.hasNext()) { String potential = it.next(); if(ignore == null || !toIgnore.matcher(potential).matches()) { contentLookup(contentStep, false); return potential; } } it = save; } } return null; } private String contentGrep(String match, String ignore) { return grepAfter(match, ignore); } // Default to caseless lookups. public String getNextContentAfterContent(String previousData) { return contentFind(previousData, CHECK_CASE); } public String getContentBeforeContent(String followingData) { if (contentFind(followingData, CHECK_CASE) != null && getPrevContent() != null && getPrevContent() != null) return getPrevContent(); return null; } public String getNextContentAfterRegex(String match) { return contentGrep(match, null); } public String getNextContentAfterRegexIgnoring(String match, String ignore) { return contentGrep(match, ignore); } /** * Strictly speaking this is not correct; we should reset to the initial * content step plus one, and start again. In practice, this is not needed * yet. (Essentially this should become a larger scale Boyer-Moore.) * * @param sequence - The sequence of regular expressions to match * @return The contents that matched the last regex, or null if no matches. */ public boolean hasSequence(String... sequence) { return sequence.length != 0 && findSequence(sequence) != null; } public class SequenceResult extends LinkedList<String> { Pattern[] sequence; int nextStartPoint; public int getNextStartPoint() { return nextStartPoint; } } public SequenceResult findSequence(String... originalSequence) { SequenceResult contentSequence = new SequenceResult(); contentSequence.nextStartPoint = 0; Pattern[] inputPattern = new Pattern[originalSequence.length]; int currentPattern = 0; for (String step : originalSequence) { inputPattern[currentPattern++] = Pattern.compile(step); } contentSequence.sequence = inputPattern; return findNextSequence(contentSequence); } public SequenceResult findNextSequence(SequenceResult contentSequence) { int stepwise = contentSequence.nextStartPoint; Pattern[] inputPattern = contentSequence.sequence; List<String> toSearch = contentList.subList(stepwise, contentList.size()); int index = 0; for (String contentStep : toSearch) { stepwise++; if(inputPattern[index].matcher(contentStep).matches()) { contentSequence.add(contentStep); index++; if (index == inputPattern.length) { contentSequence.nextStartPoint = stepwise; return contentSequence; } } else { contentSequence.clear(); index = 0; } } return null; } //------------------------------------------------------------ // Tag operations. //------------------------------------------------------------ public String getNextTag() { htmlToken returnToken = nextToken(); if (returnToken != null) { while (returnToken != null && returnToken.getTokenType() == htmlToken.HTML_CONTENT && returnToken.getTokenType() != htmlToken.HTML_EOF) { returnToken = nextToken(); } if (returnToken != null && returnToken.getTokenType() != htmlToken.HTML_EOF) { return returnToken.getToken(); } } return null; } public List<String> getAllLinks() { List<String> linkTags = null; String curTag = getNextTag(); while(curTag != null) { if(curTag.startsWith("A ") || curTag.startsWith("a ")) { if(linkTags == null) { linkTags = new ArrayList<String>(); } linkTags.add(curTag); } curTag = getNextTag(); } return linkTags; } public String getLinkForContent(String searchContent) { reset(); String lastTag = null; htmlToken curToken = nextToken(); while(curToken != null) { switch(curToken.getTokenType()) { case htmlToken.HTML_TAG: { String tag = curToken.getToken(); if(tag.regionMatches(true, 0, "a ", 0, 2)) { lastTag = tag; } break; } case htmlToken.HTML_ENDTAG: { String tag = curToken.getToken(); if(tag.equalsIgnoreCase("/a")) { lastTag = null; } } case htmlToken.HTML_CONTENT: { String content = curToken.getToken(); if(lastTag != null) { if(searchContent.equals(content)) { Matcher result = urlMatcher.matcher(lastTag); if(result.find()) { return result.group(1); } } } } } curToken = nextToken(); } return null; } public List<String> getAllImages() { HashSet<String> imgUrls = new HashSet<String>(); String curTag = getNextTag(); while(curTag != null) { if(curTag.toLowerCase().startsWith("img ")) { imgUrls.add(deAmpersand(curTag).replaceAll(".*img.*src=\"(.*?)\".*", "$1")); } curTag = getNextTag(); } return new ArrayList<String>(imgUrls); } public List<String> getAllURLsOnPage(boolean viewOnly) { // Add ALL auctions on myEbay bidding/watching page! List<String> addressTags = getAllLinks(); if(addressTags == null) return null; List<String> outEntries = null; for (String curTag : addressTags) { Matcher result = urlMatcher.matcher(curTag); if(result.find()) { String href = result.group(1); boolean isView = false; if (viewOnly) { isView = href.matches("^https?://[a-z]+.ebay.[a-z.]+/(?:itm/)?[A-Za-z0-9-]+/[0-9]+(\\?.*)?") || (href.indexOf("ViewItem") != -1); if (isView) { href = deAmpersand(href); } } if (!viewOnly || isView) { if (outEntries == null) outEntries = new ArrayList<String>(); outEntries.add(href); } } } return outEntries; } public static String deAmpersand(String href) { int searchIndex = href.indexOf("&"); while (searchIndex != -1) { href = href.substring(0, searchIndex + 1) + href.substring(searchIndex + 5); searchIndex = href.indexOf("&"); } return href; } //------------------------------------------------------------ // Generic token operations. //------------------------------------------------------------ public htmlToken nextToken() { htmlToken rval = m_parser.getTokenAt(m_tokenIndex++); if (rval == null) --m_tokenIndex; return rval; } public boolean isLoaded() { return m_loaded; } private void loadParseURL(String newURL, String cookie, CleanupHandler cl) { m_parser = new JHTMLParser(this); StringBuffer loadedPage; try { URLConnection uc = Http.net().getPage(newURL, cookie, null, true); loadedPage = Http.net().receivePage(uc); if(loadedPage != null) { if(cl != null) cl.cleanup(loadedPage); m_parser.parse(loadedPage); m_loaded = true; } } catch(IOException e) { loadedPage = null; JConfig.log().handleException("JHTML.loadPage: (" + newURL + ") " + e, e); } if(loadedPage == null) m_loaded = false; } /** * @brief Simple function that does all the 'usual' stuff for a web page, * constructing a JHTML object with the data from the given page. * * For pages that need more processing, they have to do it by hand. * * @param newURL - The URL to get, receive, and pre-parse. * @param cookie - A cookie to pass along when getting the page. * @param cl - A CleanupHandler to call to clean up the StringBuffer before continuing. */ public JHTML(String newURL, String cookie, CleanupHandler cl) { setup(); loadParseURL(newURL, cookie, cl); } public JHTML.Form getFormWithInput(String input) { List<Form> forms = getForms(); for (Form curForm : forms) { if (curForm.hasInput(input)) return curForm; } return null; } public void setCharset(String charset) { mCharset = charset; } private boolean isToken(htmlToken tok, int tokenType, String tag) { return tok.getTokenType() == tokenType && tok.getToken().regionMatches(true, 0, tag, 0, tag.length()); } public class Table { private List<List<String>> data; public Table() { data = new ArrayList<List<String>>(); } public void addRow(List<String> newRow) { data.add(newRow); } public String getCell(int col, int row) { return data.get(row).get(col); } public List<String> getRow(int row) { return data.get(row); } public boolean rowCellMatches(int row, String regexp) { if(data.size() == 0) return false; for(String cell : data.get(row)) { if(cell.matches(regexp)) return true; } return false; } public int getRowCount() { return data.size(); } } public List<Table> extractTables() { List<Table> tableContents = new ArrayList<Table>(1); htmlToken tok = nextToken(); while (tok != null) { if (isToken(tok, htmlToken.HTML_TAG, "table")) { tableContents.add(extractTable(tableContents)); } tok = nextToken(); } return tableContents; } public Table extractTable(List<Table> tableContents) { List<htmlToken> currentTable = new ArrayList<htmlToken>(); Table currentTableContents = new Table(); List<String> headers = new ArrayList<String>(); List<List<htmlToken>> tableList = new ArrayList<List<htmlToken>>(1); tableList.add(currentTable); htmlToken tok = nextToken(); String curHdr = null; while(!isToken(tok, htmlToken.HTML_ENDTAG, "/table")) { if (isToken(tok, htmlToken.HTML_TAG, "table")) { tableContents.add(extractTable(tableContents)); } else { if (isToken(tok, htmlToken.HTML_ENDTAG, "/tr")) { boolean first = true; List<String> curRow = null; for (String hdr : headers) { if (!first) { } else { curRow = new ArrayList<String>(headers.size()); first = false; } curRow.add(hdr); } if (!headers.isEmpty()) { currentTableContents.addRow(curRow); } headers = new ArrayList<String>(); } else { if (isToken(tok, htmlToken.HTML_TAG, "td") || isToken(tok, htmlToken.HTML_TAG, "th")) { curHdr = ""; } else if (isToken(tok, htmlToken.HTML_ENDTAG, "/td") || isToken(tok, htmlToken.HTML_ENDTAG, "/th")) { if (curHdr != null && curHdr.length() != 0) headers.add(curHdr); curHdr = null; } } if (tok.getTokenType() == htmlToken.HTML_CONTENT && curHdr != null) { if (curHdr.length() != 0) curHdr += ' '; curHdr += tok.getToken(); } currentTable.add(tok); } tok = nextToken(); } return currentTableContents; } }