// XPathTest.java example
//
// Explorer
// BigSemanticsJava-master
package ecologylab.bigsemantics.tools;

import java.io.InputStream;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.cyberneko.CybernekoWrapper;
import ecologylab.bigsemantics.html.dom.IDOMProvider;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.net.ParsedURL;
//import org.w3c.tidy.Tidy;

/**
 * Small command-line scratch tool for trying out XPath expressions against a live web page.
 * <p>
 * {@link #main(String[])} downloads the page at {@link #LOCATION}, parses it into a DOM with
 * {@link CybernekoWrapper}, evaluates {@link #XPATH} against the document to obtain a node set,
 * and then, for every node in that set, prints the node followed by the string value of
 * {@link #CHILD_XPATH} evaluated relative to that node.
 * <p>
 * Several site-specific presets (URL, parent XPath, child XPath) are kept as constants; to try a
 * different preset, point {@code LOCATION}, {@code XPATH} and {@code CHILD_XPATH} at its constants.
 */
public class XPathTest
{

	// --- ACM citation page preset ---
	private static final String	ACM_CHILD_XPATH	= "./child::text()";
	private static final String	ACM_CITATION	= "http://portal.acm.org/citation.cfm?id=1460563.1460642&coll=GUIDE&dl=GUIDE&CFID=48444641&CFTOKEN=72936343";
	private static final String	CITATION_XPATH	= ".//a[@name='references']/../following-sibling::table//a[@href[starts-with(.,'citation')]]";

	// --- Wikipedia preset ---
	private static final String	WIKIPEDIA	= "http://en.wikipedia.org/wiki/Harbor_Seal";
	private static final String	WIKIPEDIA_XPATH	= "//*[starts-with(@class,'infobox')]//img[1]";
	private static final String	WIKIPEDIA_CHILD_XPATH	= "./@height";

	// --- Google Trends preset ---
	private static final String	TRENDS	= "http://www.google.com/trends";
	private static final String	TRENDS_XPATH	= "//td[@class='hotListTable']";
	private static final String	TRENDS_CHILD_XPATH	= ".";

	// --- Flickr preset ---
	private static final String	FLICKR	= "http://www.flickr.com/photos/81124164@N00/4085549266/";
	private static final String	FLICKR_XPATH	= "//html/head/link[2]/@href";
	private static final String	FLICKR_CHILD_XPATH	= ".";

	// --- IMDB preset ---
	private static final String	IMDB	= "http://www.imdb.com/title/tt1464540/";
	// Alternative parent XPath that was tried here: //div[@id='filmo-head-Actor']/following-sibling::*
	private static final String	IMDB_XPATH	= "//div[@class='mediastrip']//img/@src";
	private static final String	IMDB_CHILD_XPATH	= ".";

	// --- Google Books preset ---
	private static final String GOOGLE_BOOKS = "http://books.google.com/books?id=fu5HtixRje8C&dq=o%27reilly&source=gbs_navlinks_s";
	private static final String GOOGLE_BOOKS_XPATH = "//div[@id='citations_module_v']/div[2]//div";
	private static final String GOOGLE_BOOKS_CHILD_XPATH = "./div/a";

	// --- Slashdot preset ---
	private static final String	SLASHDOT	= "http://slashdot.org/index2.pl?fhfilter=japan+earthquake";
	private static final String	SLASHDOT_XPATH	= "//div[@id='firehoselist']//h2[@class='story'][1]/following-sibling::div[@class='grid_14'][1]//a[@class='popular tag']";
	private static final String	SLASHDOT_CHILD_XPATH	= ".";

	// --- Active preset: change these three together to test a different page ---
	/** URL of the page that is downloaded and parsed. */
	private static final String	LOCATION		= SLASHDOT;
	/** XPath evaluated against the whole document; must select a node set. */
	private static final String	XPATH				= SLASHDOT_XPATH;
	/** XPath evaluated relative to each node selected by {@link #XPATH}. */
	private static final String	CHILD_XPATH	= SLASHDOT_CHILD_XPATH;

	private static final ParsedURL PURL = ParsedURL.getAbsolute(LOCATION);


	/**
	 * Fetches {@link #LOCATION}, evaluates {@link #XPATH} on the parsed document and prints, for
	 * each matching node, the node itself and the string result of {@link #CHILD_XPATH}.
	 * <p>
	 * Any failure (connection, parsing, XPath evaluation) is reported on standard error; the method
	 * then returns normally.
	 * 
	 * @param args
	 *          ignored; the page and expressions are selected through the class constants.
	 */
	public static void main(String[] args)
	{
		// JTidy was previously used as the DOM provider here; Cyberneko is used now.
		IDOMProvider domProvider = new CybernekoWrapper();
		domProvider.setQuiet(true);
		domProvider.setShowWarnings(false);
		XPath xpath = XPathFactory.newInstance().newXPath();

		InputStream inStream = null;
		try
		{
			inStream = PURL.connect().inputStream();
			Document contextNode = domProvider.parseDOM(inStream, null);

			NodeList parentNodeList = (NodeList) xpath.evaluate(XPATH, contextNode, XPathConstants.NODESET);
			int parentCount = parentNodeList.getLength();
			System.out.println("List Size: " + parentCount);
			for (int i = 0; i < parentCount; i++)
			{
				Node node = parentNodeList.item(i);
				System.out.println(node);
				// The two-argument evaluate() returns the string value of the expression,
				// evaluated with this node as the context.
				String pNode = xpath.evaluate(CHILD_XPATH, node);
				System.out.println("Result " + i + " =\t" + pNode);
			}
		}
		catch (Exception e)
		{
			// Top-level boundary of a diagnostic tool: report what was being attempted and the cause.
			System.err.println("XPathTest failed for " + LOCATION + " with XPath " + XPATH);
			e.printStackTrace();
		}
		finally
		{
			// The stream was previously never closed; release it whether or not evaluation succeeded.
			if (inStream != null)
			{
				try
				{
					inStream.close();
				}
				catch (java.io.IOException closeFailure)
				{
					closeFailure.printStackTrace();
				}
			}
		}
	}

	/**
	 * Collects the text found in the subtree rooted at {@code node} into a single string, using a
	 * builder obtained from {@link StringBuilderUtils} that is released again before returning.
	 * 
	 * @param node
	 *          root of the subtree to read; {@code null} yields an empty string.
	 * @return the concatenated values of all text and CDATA nodes in the subtree, in child order.
	 */
	private static String getAllTextFromNode(Node node)
	{
		StringBuilder buffy = StringBuilderUtils.acquire();
		try
		{
			getAllTextFromNode(node, buffy);
			return buffy.toString();
		}
		finally
		{
			// Hand the builder back even if traversal throws.
			StringBuilderUtils.release(buffy);
		}
	}

	/**
	 * Appends all the text in the subtree rooted at a node to {@code buffy}. An XPath may select a
	 * node of any type; if that node has text somewhere inside it we want to get at it, so this
	 * walks the whole subtree. Text and CDATA node values are appended; attribute, comment and
	 * processing-instruction nodes contribute nothing; every other node type is descended into, so
	 * formatting elements are dropped while the text they contain is kept.
	 * 
	 * @param node
	 *          root of the subtree to read; {@code null} is ignored.
	 * @param buffy
	 *          destination that the text is appended to.
	 */
	private static void getAllTextFromNode(Node node, StringBuilder buffy)
	{
		if (node == null)
		{
			return;
		}

		switch (node.getNodeType())
		{
		case Node.TEXT_NODE:
		case Node.CDATA_SECTION_NODE:
			buffy.append(node.getNodeValue());
			break;
		case Node.ATTRIBUTE_NODE:
		case Node.COMMENT_NODE:
		case Node.PROCESSING_INSTRUCTION_NODE:
			break;
		default:
			NodeList cList = node.getChildNodes();
			if (cList != null)
			{
				int childCount = cList.getLength();
				for (int k = 0; k < childCount; k++)
				{
					// Recurse into the same buffer rather than acquiring a fresh builder and
					// building a temporary String for every child.
					getAllTextFromNode(cList.item(k), buffy);
				}
			}
			break;
		}
	}

}