import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Scanner;
/**
* Nutch client
*
* <p/>
* Copyright 1996-2012 by Mark Watson. All rights reserved.
* <p/>
* This software can be used under either of the following licenses:
* <p/>
* 1. LGPL v3<br/>
* 2. Apache 2
* <p/>
*/
public class NutchClient {

  /**
   * Runs a search and, for every hit that carries a cache URI, also downloads
   * the cached page content.
   *
   * @param opensearch_url base URL of the OpenSearch endpoint; "?query=..." is appended to it
   * @param query search text; it is URL-encoded as UTF-8 before being sent
   * @return one table per returned item, holding any of the keys "title",
   *         "description", "link", "cache_uri" and "cache_content" that were available
   * @throws IOException if the endpoint or a cache URI cannot be read
   * @throws ParserConfigurationException if the XML parser cannot be configured
   * @throws SAXException if the response is not well-formed XML
   */
  static public List<Hashtable<String,String>> searchGetCache(String opensearch_url, String query) throws IOException, ParserConfigurationException, SAXException {
    return search_helper(opensearch_url, query, true);
  }

  /**
   * Runs a search without downloading cached page content.
   *
   * @param opensearch_url base URL of the OpenSearch endpoint; "?query=..." is appended to it
   * @param query search text; it is URL-encoded as UTF-8 before being sent
   * @return one table per returned item, holding any of the keys "title",
   *         "description", "link" and "cache_uri" that were available
   * @throws IOException if the endpoint cannot be read
   * @throws ParserConfigurationException if the XML parser cannot be configured
   * @throws SAXException if the response is not well-formed XML
   */
  static public List<Hashtable<String,String>> search(String opensearch_url, String query) throws IOException, ParserConfigurationException, SAXException {
    return search_helper(opensearch_url, query, false);
  }

  /**
   * Shared implementation of {@link #search} and {@link #searchGetCache}:
   * fetches the result document, parses it and converts every "item"
   * element into a table of strings.
   *
   * @param return_cache when true, the content behind each item's cache URI
   *        is fetched and stored under "cache_content"
   */
  static private List<Hashtable<String,String>> search_helper(String opensearch_url, String query, boolean return_cache) throws IOException, ParserConfigurationException, SAXException {
    List<Hashtable<String,String>> ret = new ArrayList<Hashtable<String,String>>();
    String url_str = opensearch_url + "?query=" + URLEncoder.encode(query, "UTF-8");
    System.out.println(url_str);
    URL url = new URL(url_str);
    URLConnection uc = url.openConnection();
    DocumentBuilder docBuilder = newDocumentBuilder();
    Document doc;
    BufferedInputStream bis = new BufferedInputStream(uc.getInputStream());
    try {
      doc = docBuilder.parse(bis);
    } finally {
      // Release the connection's stream even when parsing fails.
      bis.close();
    }
    doc.getDocumentElement().normalize();
    System.out.println( doc.getDocumentElement().getTagName());
    NodeList listItems = doc.getElementsByTagName("item");
    int numItems = listItems.getLength();
    for (int i=0; i<numItems; i++) {
      Hashtable<String,String> new_item = parseItem(listItems.item(i));
      ret.add(new_item);
      String cache_uri = new_item.get("cache_uri");
      if (return_cache && cache_uri != null) {
        new_item.put("cache_content", getCacheContent(cache_uri));
      }
    }
    return ret;
  }

  /**
   * Creates a DOM parser that does not resolve external entities or
   * external DTDs, because the parsed document comes from the network.
   */
  static private DocumentBuilder newDocumentBuilder() throws ParserConfigurationException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    return factory.newDocumentBuilder();
  }

  /**
   * Copies the interesting children of one "item" element into a table:
   * "title", "description" and "link" keep their element name as key, and
   * "nutch:cache" is stored under "cache_uri". All other children are ignored.
   */
  static private Hashtable<String,String> parseItem(Node item) {
    Hashtable<String,String> new_item = new Hashtable<String,String>();
    NodeList item_data = item.getChildNodes();
    int num = item_data.getLength();
    for (int n=0; n<num; n++) {
      Node data = item_data.item(n);
      String name = data.getNodeName();
      if (name.equals("title") || name.equals("description") ||
          name.equals("link")) {
        new_item.put(name, data.getTextContent());
      }
      if (name.equals("nutch:cache")) {
        new_item.put("cache_uri", data.getTextContent());
      }
    }
    return new_item;
  }

  /**
   * Downloads the content behind a cache URI as a single string.
   *
   * @param cache_uri URL to read
   * @return the text read from the URL, or an empty string when the
   *         response has no content
   * @throws IOException if the URL is malformed or cannot be opened
   */
  static public String getCacheContent(String cache_uri) throws IOException {
    URL url = new URL(cache_uri);
    URLConnection uc = url.openConnection();
    // NOTE(review): the Scanner decodes with the platform default charset,
    // as the original code did -- confirm whether the server's charset
    // should be honoured instead.
    Scanner scanner = new Scanner(uc.getInputStream()).useDelimiter("\\Z");
    try {
      // An empty response yields no token; return "" instead of letting
      // next() throw NoSuchElementException.
      return scanner.hasNext() ? scanner.next() : "";
    } finally {
      // Closing the Scanner also closes the connection's input stream.
      scanner.close();
    }
  }

  /**
   * Demo entry point: queries http://localhost:8080/opensearch for
   * "Java AI", then prints the result list, the elapsed time of the call
   * and the first two hits (fewer if the search returned fewer).
   * Swap in {@link #searchGetCache} to also download cached pages.
   *
   * @param args unused
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   */
  public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException {
    long t1 = System.currentTimeMillis();
    List<Hashtable<String,String>> results =
      NutchClient.search("http://localhost:8080/opensearch", "Java AI");
    long t2 = System.currentTimeMillis();
    System.out.println("results: " + results);
    System.out.println(" Time in milliseconds for web service call: " + (t2 - t1));
    // Print at most the first two hits; a search may return fewer.
    int shown = Math.min(2, results.size());
    for (int i=0; i<shown; i++) {
      System.out.println(results.get(i));
    }
  }
}