package ecologylab.bigsemantics.tools; import java.io.InputStream; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import ecologylab.bigsemantics.cyberneko.CybernekoWrapper; import ecologylab.bigsemantics.html.utils.StringBuilderUtils; import ecologylab.net.ParsedURL; public class CybernekoXpathTest { private static final String SLASHDOT = "http://slashdot.org/index2.pl?fhfilter=japan+earthquake"; private static final String SLASHDOT_XPATH = "//div[@id='firehoselist']"; private static final String SLASHDOT_CHILD_XPATH = "."; private static final String FLICKR = "http://www.flickr.com/photos/81124164@N00/4085549266/"; private static final String FLICKR_XPATH = "//html/head/link[2]/@href"; private static final String FLICKR_CHILD_XPATH = "."; private static final String CARTOONS_AC_UK = "http://www.cartoons.ac.uk/record/28011"; private static final String CARTOONS_AC_UK_XPATH = "//*[@id='detailPublish']"; // private static final String CARTOONS_AC_UK_XPATH = "//div[@id='detailPublish']"; //Should be same as above // private static final String CARTOONS_AC_UK_XPATH = "//*[@id='detailPublish']/h4/a[1]"; //Should pick the first link // private static final String CARTOONS_AC_UK_XPATH = "//*[@id='detailPublish']//a[1]"; //Should pick the first link private static final String CARTOONS_AC_UK_CHILD_XPATH = "."; private static final String WIKIPEDIA = "http://en.wikipedia.org/wiki/Modern_art"; private static final String WIKIPEDIA_XPATH = "//div[@id='bodyContent']/p[1]"; private static final String WIKIPEDIA_CHILD_XPATH = "."; private static final String LOCATION = SLASHDOT; private static final String XPATH = SLASHDOT_XPATH; private static final String CHILD_XPATH = SLASHDOT_CHILD_XPATH; private static final ParsedURL PURL = ParsedURL.getAbsolute(LOCATION); private static boolean useUpper = false; public static void main(String[] args) { CybernekoWrapper cyberneko = new CybernekoWrapper(); XPath xpath = XPathFactory.newInstance().newXPath(); try { InputStream inStream = PURL.connect().inputStream(); Document contextNode = cyberneko.parseDOM(inStream, System.out); String parentXPathString; if (useUpper) parentXPathString = cyberneko.xPathTagNamesToLower(XPATH); else parentXPathString = XPATH; NodeList parentNodeList = (NodeList) xpath.evaluate(parentXPathString, contextNode, XPathConstants.NODESET); String childXPath; if (useUpper) childXPath = cyberneko.xPathTagNamesToLower(CHILD_XPATH); else childXPath = CHILD_XPATH; System.out.println("\n\nList Size: " + parentNodeList.getLength()); for (int i = 0; i < parentNodeList.getLength(); i++) { Node node = parentNodeList.item(i); System.out.println(node); String pNode = xpath.evaluate(childXPath, node); System.out.println("Result " + i + " =\t" + pNode.replaceAll("\\s+", " ")); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } private static String getAllTextFromNode(Node node) { StringBuilder buffy = StringBuilderUtils.acquire(); getAllTextFromNode(node, buffy); String result = buffy.toString(); StringBuilderUtils.release(buffy); return result; } /** * This method get all the text from the subtree rooted at a node.The reason for this * implementation is that when we write a xpath we might get node of any type. Now if the node has * some text inside it we would like to get it. And so this method get all the text in the subtree * rooted at that node. Eliminates all formatting tags. * * @param node * @return */ private static void getAllTextFromNode(Node node, StringBuilder buffy) { short nodeType = node.getNodeType(); switch (nodeType) { case Node.TEXT_NODE: case Node.CDATA_SECTION_NODE: buffy.append(node.getNodeValue()); break; case Node.ATTRIBUTE_NODE: case Node.COMMENT_NODE: case Node.PROCESSING_INSTRUCTION_NODE: break; default: NodeList cList = node.getChildNodes(); if (cList != null) { for (int k = 0; k < cList.getLength(); k++) { buffy.append(getAllTextFromNode(cList.item(k))); } } break; } } }