package ecologylab.bigsemantics.tools;
import java.io.InputStream;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ecologylab.bigsemantics.cyberneko.CybernekoWrapper;
import ecologylab.bigsemantics.html.dom.IDOMProvider;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.net.ParsedURL;
//import org.w3c.tidy.Tidy;
public class XPathTest
{
private static final String ACM_CHILD_XPATH = "./child::text()";
private static final String ACM_CITATION = "http://portal.acm.org/citation.cfm?id=1460563.1460642&coll=GUIDE&dl=GUIDE&CFID=48444641&CFTOKEN=72936343";
private static final String CITATION_XPATH = ".//a[@name='references']/../following-sibling::table//a[@href[starts-with(.,'citation')]]";
private static final String WIKIPEDIA = "http://en.wikipedia.org/wiki/Harbor_Seal";
private static final String WIKIPEDIA_XPATH = "//*[starts-with(@class,'infobox')]//img[1]";
private static final String WIKIPEDIA_CHILD_XPATH = "./@height";
// private static final String XPATH = CITATION_XPATH;
// private static final String LOCATION = ACM_CITATION;
// private static final String XPATH = CITATION_XPATH;
private static final String TRENDS = "http://www.google.com/trends";
private static final String TRENDS_XPATH = "//td[@class='hotListTable']";
private static final String TRENDS_CHILD_XPATH = ".";
private static final String FLICKR = "http://www.flickr.com/photos/81124164@N00/4085549266/";
private static final String FLICKR_XPATH = "//html/head/link[2]/@href";
private static final String FLICKR_CHILD_XPATH = ".";
private static final String IMDB = "http://www.imdb.com/title/tt1464540/";
///div[@id='filmo-head-Actor']/following-sibling::*
private static final String IMDB_XPATH = "//div[@class='mediastrip']//img/@src";
private static final String IMDB_CHILD_XPATH = ".";
private static final String GOOGLE_BOOKS = "http://books.google.com/books?id=fu5HtixRje8C&dq=o%27reilly&source=gbs_navlinks_s";
private static final String GOOGLE_BOOKS_XPATH = "//div[@id='citations_module_v']/div[2]//div";
private static final String GOOGLE_BOOKS_CHILD_XPATH = "./div/a";
private static final String SLASHDOT = "http://slashdot.org/index2.pl?fhfilter=japan+earthquake";
private static final String SLASHDOT_XPATH = "//div[@id='firehoselist']//h2[@class='story'][1]/following-sibling::div[@class='grid_14'][1]//a[@class='popular tag']";
private static final String SLASHDOT_CHILD_XPATH = ".";
private static final String LOCATION = SLASHDOT;
private static final String XPATH = SLASHDOT_XPATH;
private static final String CHILD_XPATH = SLASHDOT_CHILD_XPATH;
private static final ParsedURL PURL = ParsedURL.getAbsolute(LOCATION);
/**
* @param args
*/
public static void main(String[] args)
{
// IDOMProvider domProvider = new Tidy(); // we used to use JTidy as the DOM provider.
IDOMProvider domProvider = new CybernekoWrapper();
domProvider.setQuiet(true);
domProvider.setShowWarnings(false);
XPath xpath = XPathFactory.newInstance().newXPath();
try
{
// InputStream inStream = new FileInputStream(new File("C:\\abhinavCode\\ecologylabSemantics\\testcases\\file2.xml"));
InputStream inStream = PURL.connect().inputStream();
Document contextNode = domProvider.parseDOM(inStream, null);
String parentXPathString=XPATH;
NodeList parentNodeList = (NodeList) xpath.evaluate(parentXPathString, contextNode, XPathConstants.NODESET);
String childXPath = CHILD_XPATH;
System.out.println("List Size: " + parentNodeList.getLength());
for(int i=0;i<parentNodeList.getLength();i++)
{
Node node = parentNodeList.item(i);
System.out.println(node);
String pNode = xpath.evaluate(childXPath, node);
System.out.println("Result "+i+" =\t"+pNode);
}
}
catch (Exception e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
private static String getAllTextFromNode(Node node)
{
StringBuilder buffy = StringBuilderUtils.acquire();
getAllTextFromNode(node, buffy);
String result = buffy.toString();
StringBuilderUtils.release(buffy);
return result;
}
/**
* This method get all the text from the subtree rooted at a node.The reason for this
* implementation is that when we write a xpath we might get node of any type. Now if the node has
* some text inside it we would like to get it. And so this method get all the text in the subtree
* rooted at that node. Eliminates all formatting tags.
*
* @param node
* @return
*/
private static void getAllTextFromNode(Node node, StringBuilder buffy)
{
short nodeType = node.getNodeType();
switch (nodeType)
{
case Node.TEXT_NODE:
case Node.CDATA_SECTION_NODE:
buffy.append(node.getNodeValue());
break;
case Node.ATTRIBUTE_NODE:
case Node.COMMENT_NODE:
case Node.PROCESSING_INSTRUCTION_NODE:
break;
default:
NodeList cList = node.getChildNodes();
if (cList != null)
{
for (int k = 0; k < cList.getLength(); k++)
{
buffy.append(getAllTextFromNode(cList.item(k)));
}
}
break;
}
}
}