package com.bao.examples.html;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import junit.framework.TestCase;

/**
 * Exploratory JUnit 3 tests that exercise the HTML Parser library: dumping a
 * nested table, listing the inputs of a form, and collecting text nodes.
 *
 * <p>The table and form tests read the classpath resource {@code /content.html},
 * which is loaded into {@link #fileContent} before every test.
 */
public class HtmlParserTest2 extends TestCase {

    private static final Logger logger = Logger.getLogger(HtmlParserTest2.class);

    /** Classpath location of the HTML fixture. */
    private static final String CONTENT_RESOURCE = "/content.html";

    /** Charset used to decode the fixture. */
    private static final String CONTENT_CHARSET = "GB2312";

    /** Decoded text of {@link #CONTENT_RESOURCE}; assigned in {@link #setUp()}. */
    private String fileContent = null;

    /**
     * Loads the HTML fixture from the classpath and decodes it as GB2312.
     *
     * @throws Exception if the resource cannot be read
     */
    @Override
    protected void setUp() throws Exception {
        InputStream is = getClass().getResourceAsStream(CONTENT_RESOURCE);
        // getResourceAsStream returns null for a missing resource; fail with a
        // readable message instead of a NullPointerException further down.
        assertNotNull("Test resource " + CONTENT_RESOURCE + " not found on the classpath", is);
        try {
            fileContent = IOUtils.toString(is, CONTENT_CHARSET);
        } finally {
            is.close();
        }
    }

    /**
     * Finds every table with id {@code Table1}, searches its descendants for a
     * table with id {@code DataGrid1}, and logs that table's rows and cells.
     *
     * @throws ParserException if the fixture cannot be parsed; propagated so
     *         that a parse failure fails the test instead of being swallowed
     */
    public void testTableVisitor() throws ParserException {
        Parser myParser = Parser.createParser(fileContent, CONTENT_CHARSET);
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });

        NodeList nodeList = myParser.parse(lastFilter);
        // Strictly less than size(): the last valid index is size() - 1.
        for (int i = 0; i < nodeList.size(); i++) {
            if (!(nodeList.elementAt(i) instanceof TableTag)) {
                continue;
            }
            TableTag outer = (TableTag) nodeList.elementAt(i);
            if (!"Table1".equals(outer.getAttribute("id"))) {
                continue;
            }
            NodeList children = outer.getChildren();
            if (children == null) {
                continue;
            }
            // Only extract (non-destructive); the parsed tree is left untouched
            // so the nested table still owns its rows when it is logged.
            NodeList nestedTables = children.extractAllNodesThatMatch(tableFilter, true);
            for (int k = 0; k < nestedTables.size(); k++) {
                if (!(nestedTables.elementAt(k) instanceof TableTag)) {
                    continue;
                }
                TableTag inner = (TableTag) nestedTables.elementAt(k);
                if ("DataGrid1".equals(inner.getAttribute("id"))) {
                    logTable(inner);
                }
            }
        }
    }

    /**
     * Logs the table itself, then the trimmed plain text of each row, then the
     * plain text of each cell in that row.
     *
     * @param table the table to dump
     */
    private static void logTable(TableTag table) {
        // NOTE(review): output is logged at FATAL, as in the original code;
        // presumably so it is visible under any log4j threshold - confirm.
        logger.fatal(table);
        TableRow[] rows = table.getRows();
        for (int j = 0; j < rows.length; j++) {
            TableRow row = rows[j];
            logger.fatal("row [" + j + "]: " + row.toPlainTextString().trim());
            TableColumn[] columns = row.getColumns();
            for (int c = 0; c < columns.length; c++) {
                logger.fatal("<td>" + columns[c].toPlainTextString());
            }
        }
    }

    /**
     * Locates the form with id {@code confirmPassenger} and name
     * {@code save_passenger_single} and prints the name and value of each of
     * its inputs to standard output.
     *
     * @throws ParserException if the fixture cannot be parsed
     */
    public void testTemp() throws ParserException {
        Parser myParser = Parser.createParser(fileContent, "UTF-8");

        List<NodeFilter> filters = new ArrayList<NodeFilter>();
        filters.add(new NodeClassFilter(FormTag.class));
        filters.add(new HasAttributeFilter("id", "confirmPassenger"));
        filters.add(new HasAttributeFilter("name", "save_passenger_single"));
        NodeFilter filter = new AndFilter(filters.toArray(new NodeFilter[0]));

        NodeList nodeList = myParser.parse(filter);
        if (nodeList.size() <= 0) {
            // Stop here: continuing would dereference a missing form.
            fail("Can't find any Form!");
        }

        FormTag form = (FormTag) nodeList.elementAt(0);
        NodeList inputs = form.getFormInputs();
        for (int i = 0, size = inputs.size(); i < size; i++) {
            InputTag input = (InputTag) inputs.elementAt(i);
            System.out.printf(Locale.CHINESE, "%02d: name=%s,value=%s\n", i,
                    input.getAttribute("name"), input.getAttribute("value"));
        }
    }

    /**
     * Parses a single {@code <span>} snippet, concatenates the trimmed text of
     * every text node found, and prints the result to standard output.
     *
     * @throws ParserException if the snippet cannot be parsed
     */
    public void testPlainText() throws ParserException {
        String content = "<span id='id_550000K56002' class='base_txtdiv' onmouseover=javascript:onStopHover('550000K56002#SHH#XAY') onmouseout='onStopOut()'>K560</span>";
        Parser myParser = Parser.createParser(content, "UTF-8");
        NodeList nodes = myParser.parse(new NodeClassFilter(TextNode.class));

        StringBuilder sb = new StringBuilder();
        for (int i = 0, size = nodes.size(); i < size; i++) {
            TextNode text = (TextNode) nodes.elementAt(i);
            sb.append(text.getText().trim());
        }
        System.out.println(sb.toString());
    }
}