CybernekoXpathTest.java example

Explorer
BigSemanticsJava-master
package ecologylab.bigsemantics.tools;

import java.io.InputStream;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.cyberneko.CybernekoWrapper;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.net.ParsedURL;

/**
 * Manual smoke test for evaluating XPath expressions against a DOM produced by
 * {@link CybernekoWrapper}.
 * <p>
 * {@link #main(String[])} downloads the page at {@link #LOCATION}, parses it, evaluates
 * {@link #XPATH} against the document, and then evaluates {@link #CHILD_XPATH} relative to each
 * matched node, printing the results to standard output. To try a different site, point
 * {@code LOCATION}, {@code XPATH} and {@code CHILD_XPATH} at another group of constants below.
 */
public class CybernekoXpathTest
{

	private static final String			SLASHDOT										= "http://slashdot.org/index2.pl?fhfilter=japan+earthquake";
	private static final String			SLASHDOT_XPATH							= "//div[@id='firehoselist']";
	private static final String			SLASHDOT_CHILD_XPATH				= ".";

	private static final String			FLICKR											= "http://www.flickr.com/photos/81124164@N00/4085549266/";
	private static final String			FLICKR_XPATH								= "//html/head/link[2]/@href";
	private static final String			FLICKR_CHILD_XPATH					= ".";

	private static final String			CARTOONS_AC_UK							= "http://www.cartoons.ac.uk/record/28011";
	private static final String			CARTOONS_AC_UK_XPATH				= "//*[@id='detailPublish']";
	// private static final String CARTOONS_AC_UK_XPATH = "//div[@id='detailPublish']"; //Should be same as above
	// private static final String CARTOONS_AC_UK_XPATH = "//*[@id='detailPublish']/h4/a[1]"; //Should pick the first link
	// private static final String CARTOONS_AC_UK_XPATH = "//*[@id='detailPublish']//a[1]"; //Should pick the first link
	private static final String			CARTOONS_AC_UK_CHILD_XPATH	= ".";

	private static final String			WIKIPEDIA										= "http://en.wikipedia.org/wiki/Modern_art";
	private static final String			WIKIPEDIA_XPATH							= "//div[@id='bodyContent']/p[1]";
	private static final String			WIKIPEDIA_CHILD_XPATH				= ".";

	/** URL of the page that {@link #main(String[])} downloads and parses. */
	private static final String			LOCATION										= SLASHDOT;

	/** XPath evaluated against the whole document to select the parent nodes. */
	private static final String			XPATH												= SLASHDOT_XPATH;

	/** XPath evaluated relative to each parent node selected by {@link #XPATH}. */
	private static final String			CHILD_XPATH									= SLASHDOT_CHILD_XPATH;

	private static final ParsedURL	PURL												= ParsedURL.getAbsolute(LOCATION);

	/**
	 * When true, both XPath expressions are passed through
	 * {@link CybernekoWrapper#xPathTagNamesToLower(String)} before evaluation.
	 * NOTE(review): the flag name says "upper" while the helper it enables says "lower" -- confirm
	 * which case conversion is actually intended.
	 */
	private static boolean					useUpper										= false;

	/**
	 * Downloads {@link #LOCATION}, parses it into a DOM, and prints the string value of
	 * {@link #CHILD_XPATH} for every node matched by {@link #XPATH}.
	 * <p>
	 * Any failure (connection, parsing, XPath evaluation) is reported as a stack trace; the input
	 * stream is closed in all cases.
	 * 
	 * @param args
	 *          ignored
	 */
	public static void main(String[] args)
	{
		CybernekoWrapper cyberneko = new CybernekoWrapper();
		XPath xpath = XPathFactory.newInstance().newXPath();

		// Declared outside the try so the finally block can close it.
		InputStream inStream = null;
		try
		{
			inStream = PURL.connect().inputStream();
			Document contextNode = cyberneko.parseDOM(inStream, System.out);

			String parentXPathString = useUpper ? cyberneko.xPathTagNamesToLower(XPATH) : XPATH;
			String childXPath = useUpper ? cyberneko.xPathTagNamesToLower(CHILD_XPATH) : CHILD_XPATH;

			NodeList parentNodeList = (NodeList) xpath.evaluate(parentXPathString, contextNode, XPathConstants.NODESET);
			int matchCount = parentNodeList.getLength();
			System.out.println("\n\nList Size: " + matchCount);
			for (int i = 0; i < matchCount; i++)
			{
				Node node = parentNodeList.item(i);
				System.out.println(node);
				String pNode = xpath.evaluate(childXPath, node);
				// Collapse runs of whitespace so each result prints on a single line.
				System.out.println("Result " + i + " =\t" + pNode.replaceAll("\\s+", " "));
			}
		}
		catch (Exception e)
		{
			// This is a command-line diagnostic tool: report the failure and exit normally.
			e.printStackTrace();
		}
		finally
		{
			// Closing an already-closed stream has no effect, so this is safe even if the parser
			// closed the stream itself.
			if (inStream != null)
			{
				try
				{
					inStream.close();
				}
				catch (java.io.IOException closeFailure)
				{
					closeFailure.printStackTrace();
				}
			}
		}
	}

	/**
	 * Collects all text found in the subtree rooted at a node into a single string.
	 * <p>
	 * Uses a {@link StringBuilder} obtained from {@link StringBuilderUtils}; the builder is released
	 * again even if the traversal throws. This helper is not called by {@link #main(String[])}.
	 * 
	 * @param node
	 *          root of the subtree to read; a null node yields an empty string
	 * @return the concatenated text of all text and CDATA nodes in the subtree, in document order
	 */
	private static String getAllTextFromNode(Node node)
	{
		StringBuilder buffy = StringBuilderUtils.acquire();
		try
		{
			getAllTextFromNode(node, buffy);
			return buffy.toString();
		}
		finally
		{
			StringBuilderUtils.release(buffy);
		}
	}

	/**
	 * Appends all the text from the subtree rooted at a node to a buffer. An XPath can select a
	 * node of any type; if that node contains text somewhere below it, we want all of it. Text and
	 * CDATA nodes contribute their value, attribute, comment and processing-instruction nodes are
	 * skipped, and every other node type is descended into, which eliminates formatting tags.
	 * 
	 * @param node
	 *          root of the subtree to read; ignored when null
	 * @param buffy
	 *          buffer that receives the text, in document order
	 */
	private static void getAllTextFromNode(Node node, StringBuilder buffy)
	{
		if (node == null)
			return;

		switch (node.getNodeType())
		{
		case Node.TEXT_NODE:
		case Node.CDATA_SECTION_NODE:
			buffy.append(node.getNodeValue());
			break;
		case Node.ATTRIBUTE_NODE:
		case Node.COMMENT_NODE:
		case Node.PROCESSING_INSTRUCTION_NODE:
			// These node types carry no visible text content.
			break;
		default:
			NodeList cList = node.getChildNodes();
			if (cList != null)
			{
				for (int k = 0; k < cList.getLength(); k++)
				{
					// Recurse into the shared buffer instead of building a temporary string per child.
					getAllTextFromNode(cList.item(k), buffy);
				}
			}
			break;
		}
	}

}