package edu.tufts.vue.mbs;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import com.clearforest.calais.common.CalaisJavaIf;
import com.clearforest.calais.common.Property;
import com.clearforest.calais.common.StringUtils;
import com.clearforest.calais.simple.Entity;
import com.google.common.collect.AbstractMapEntry;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import edu.tufts.vue.metadata.MetadataList;
import edu.tufts.vue.metadata.VueMetadataElement;
import tufts.vue.AnalyzerAction;
import tufts.vue.LWComponent;
import tufts.vue.LWNode;
import tufts.vue.MetaMap;
import tufts.vue.Resource;
import tufts.vue.VUE;
import tufts.vue.VueResources;
/**
 * {@link LWComponentAnalyzer} that sends text (or a web resource URL) to the
 * OpenCalais / SemanticProxy services and turns the "simple format" XML
 * response into {@link AnalyzerResult}s.
 *
 * <p>
 * Entities found while walking a response are accumulated in an instance
 * field that every public analysis method resets, so a single instance must
 * not be used for overlapping analyses.
 */
public class OpenCalaisAnalyzer implements LWComponentAnalyzer {
    private static final org.apache.log4j.Logger Log = org.apache.log4j.Logger
            .getLogger(OpenCalaisAnalyzer.class);

    // NOTE(review): the license key is hard-coded in source; consider moving
    // it to configuration.
    private static final String CALAIS_LICENSE_KEY = "xqffs8ggkmebrsehdsbt56j8";
    private static final String CALAIS_REST_URL = "http://api.opencalais.com/enlighten/rest";
    private static final String SEMANTIC_PROXY_URL = "http://service.semanticproxy.com/processurl/"
            + CALAIS_LICENSE_KEY + "/simple/";

    /** Element under which the simple-format response lists its entities. */
    private static final String RESULT_ROOT_TAG = "CalaisSimpleOutputFormat";

    /** Text analyzed when no component is supplied (used by {@link #main}). */
    private static final String SAMPLE_CONTEXT = "Eduardo Manet the 19th century French painter.";

    private static final int DEFAULT_COUNT = 1;
    private static final double DEFAULT_RELEVANCE = 0.0;

    /** Entities collected by {@link #visit(Node, int)} during the current analysis. */
    private ArrayList<CalaisEntity> m_entities = new ArrayList<CalaisEntity>();

    /**
     * Analyzes a component; equivalent to {@code analyze(c, true)}.
     *
     * @param c the component to analyze, or {@code null} to analyze a built-in
     *          sample sentence
     * @return list of {@link AnalyzerResult}; empty if nothing was found or the
     *         response could not be parsed
     */
    public List analyze(LWComponent c) {
        return analyze(c, true);
    }

    /**
     * Downloads the body of a URL as text, normalizing every line ending to
     * {@code "\n"}.
     *
     * @param theURL the URL to fetch
     * @return the response body
     * @throws IOException if the URL is malformed or the transfer fails
     */
    private String downloadURL(String theURL) throws IOException {
        StringBuilder body = new StringBuilder();
        InputStream in = null;
        try {
            in = new URL(theURL).openStream();
            // Decode explicitly: the deprecated DataInputStream.readLine() used
            // previously widens each byte to a char and corrupts non-ASCII text.
            // NOTE(review): assumes the service answers in UTF-8 - confirm.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append('\n');
            }
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException closeFailure) {
                    // Best effort: a failed close must not mask the real outcome.
                    Log.warn("Could not close the download stream", closeFailure);
                }
            }
        }
        return body.toString();
    }

    /**
     * Analyzes the web page a component's resource points to, via the
     * SemanticProxy service.
     *
     * @param c the component whose resource should be analyzed
     * @return results keyed by entity type; empty when {@code c} is
     *         {@code null}, has no resource, the resource spec does not start
     *         with {@code "http"}, or the response cannot be parsed
     * @throws IOException if downloading the service response fails
     */
    public Multimap analyzeResource(LWComponent c) throws IOException {
        m_entities = new ArrayList<CalaisEntity>();
        Multimap<String, AnalyzerResult> results = Multimaps
                .newArrayListMultimap();
        if (c == null) {
            return results;
        }
        Resource resource = c.getResource();
        String spec = (resource == null) ? null : resource.getSpec();
        // "http" is also a prefix of "https", so one check covers both schemes.
        if (spec == null || !spec.startsWith("http")) {
            return results;
        }
        org.w3c.dom.Document doc = parseResponse(downloadURL(SEMANTIC_PROXY_URL
                + spec));
        if (doc == null) {
            return results;
        }
        collectEntities(doc);
        fillMultimap(results);
        return results;
    }

    /**
     * Creates a Calais client configured for the REST endpoint and the
     * {@code text/simple} output format.
     *
     * @return a newly configured client
     */
    public CalaisJavaIf prepCalais() {
        CalaisJavaIf calais = new CalaisJavaIf(CALAIS_LICENSE_KEY);
        calais.setOutputFormat("text/simple");
        calais.setCalaisURL(CALAIS_REST_URL);
        // NOTE(review): presumably disables TLS certificate verification -
        // confirm this is intended.
        calais.setVerifyCert(false);
        return calais;
    }

    /**
     * Analyzes a free-form piece of text.
     *
     * @param tweet the text to analyze
     * @return results keyed by entity type; empty when {@code tweet} is
     *         {@code null} or the response is missing or cannot be parsed
     */
    public Multimap analyzeString(String tweet) {
        m_entities = new ArrayList<CalaisEntity>();
        Multimap<String, AnalyzerResult> results = Multimaps
                .newArrayListMultimap();
        if (tweet == null) {
            return results;
        }
        String response = callCalais(prepCalais(), tweet);
        if (Log.isDebugEnabled()) {
            Log.debug(response);
        }
        org.w3c.dom.Document doc = parseResponse(response);
        if (doc == null) {
            return results;
        }
        collectEntities(doc);
        fillMultimap(results);
        return results;
    }

    /**
     * Analyzes a component's label, notes, metadata values and selected
     * resource properties (keys starting with title, date, creator or
     * description).
     *
     * @param c        the component to analyze, or {@code null} to analyze a
     *                 built-in sample sentence
     * @param fallback not used by this implementation
     * @return list of {@link AnalyzerResult}; empty if there is nothing to
     *         analyze or the response is missing or cannot be parsed
     */
    public List analyze(LWComponent c, boolean fallback) {
        m_entities = new ArrayList<CalaisEntity>();
        List<AnalyzerResult> results = new ArrayList<AnalyzerResult>();
        CalaisJavaIf calais = prepCalais();
        String context = (c == null) ? SAMPLE_CONTEXT : buildContext(c);
        if (context.length() == 0) {
            return results;
        }
        org.w3c.dom.Document doc = parseResponse(callCalais(calais, context));
        if (doc == null) {
            return results;
        }
        collectEntities(doc);
        for (Entity entity : m_entities) {
            results.add(new AnalyzerResult(entity.getType(), entity.getName()));
        }
        return results;
    }

    /**
     * Walks the children of {@code node} recursively and records every
     * non-text child whose first child carries non-blank text as an entity:
     * the child's node name becomes the type, that text the name, and the
     * {@code count} / {@code relevance} attributes the count and relevance
     * (defaulting to 1 and 0.0 when absent or unparsable).
     *
     * @param node  the node whose children are inspected
     * @param level recursion depth, starting at 0 (informational only)
     */
    public void visit(Node node, int level) {
        NodeList children = node.getChildNodes();
        // NOTE(review): kept from the original - a node with exactly one child
        // is treated as a leaf. This also skips a node whose only child is an
        // element (no surrounding whitespace); confirm that is intended.
        if (children.getLength() == 1) {
            return;
        }
        for (int i = 0, total = children.getLength(); i < total; i++) {
            Node child = children.item(i);
            if ("#text".equals(child.getNodeName())) {
                continue;
            }
            Node first = child.getFirstChild();
            String text = (first == null) ? null : first.getNodeValue();
            if (text != null && text.trim().length() > 0) {
                NamedNodeMap attributes = child.getAttributes();
                CalaisEntity entity = new CalaisEntity();
                entity.setType(child.getNodeName());
                entity.setCount(intAttribute(attributes, "count", DEFAULT_COUNT));
                // Each attribute is read independently so a missing relevance
                // can no longer inherit the count value.
                entity.setRelevance(Double.valueOf(doubleAttribute(attributes,
                        "relevance", DEFAULT_RELEVANCE)));
                entity.setName(text);
                m_entities.add(entity);
            }
            visit(child, level + 1);
        }
    }

    /**
     * @return the display name of this analyzer
     */
    public String getAnalyzerName() {
        return "Open Calais Analyzer";
    }

    /**
     * Smoke test: analyzes the built-in sample sentence.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        OpenCalaisAnalyzer oca = new OpenCalaisAnalyzer();
        oca.analyze(null);
    }

    /** Sends text to Calais and HTML-unescapes the answer; null if there is none. */
    private String callCalais(CalaisJavaIf calais, String text) {
        String raw = calais.callEnlighten(text);
        return (raw == null) ? null : StringUtils.unescapeHTML(raw);
    }

    /** Builds the sentence-like text sent to Calais from a component's fields. */
    private String buildContext(LWComponent c) {
        StringBuilder context = new StringBuilder();
        if (c.getLabel() != null) {
            context.append(c.getLabel().trim()).append(". ");
        }
        if (c.getNotes() != null) {
            context.append(c.getNotes().trim()).append(". ");
        }
        MetadataList metadata = c.getMetadataList();
        List<VueMetadataElement> elements = (metadata == null) ? null
                : metadata.getMetadata();
        if (elements != null) {
            for (VueMetadataElement element : elements) {
                Object value = (element == null) ? null : element.getValue();
                if (value != null) {
                    context.append(value).append(". ");
                }
            }
        }
        if (c.getResource() != null) {
            appendResourceProperties(context, c.getResource().getProperties());
        }
        return context.toString();
    }

    /** Appends "The <key> is <value>. " for each descriptive resource property. */
    private void appendResourceProperties(StringBuilder context, MetaMap map) {
        if (map == null) {
            return;
        }
        // Iterate over an array snapshot of the entries, as the original did.
        Object[] entries = map.entries().toArray();
        for (int p = 0; p < entries.length; p++) {
            if (!(entries[p] instanceof java.util.Map.Entry)) {
                continue;
            }
            java.util.Map.Entry<?, ?> entry = (java.util.Map.Entry<?, ?>) entries[p];
            if (entry.getKey() == null || entry.getValue() == null) {
                continue;
            }
            String rawKey = entry.getKey().toString();
            // Fixed locale so matching does not depend on the user's locale
            // (e.g. Turkish dotless i in "TITLE").
            String key = rawKey.toLowerCase(java.util.Locale.ENGLISH);
            if (key.startsWith("title") || key.startsWith("date")
                    || key.startsWith("creator")
                    || key.startsWith("description")) {
                context.append("The ").append(rawKey.trim()).append(" is ")
                        .append(entry.getValue().toString().trim())
                        .append(". ");
            }
        }
    }

    /** Parses a service response into a DOM; returns null (and logs) on any failure. */
    private org.w3c.dom.Document parseResponse(String xml) {
        if (xml == null) {
            return null;
        }
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setIgnoringElementContentWhitespace(true);
        factory.setIgnoringComments(true);
        factory.setValidating(false);
        try {
            // The XML comes from a remote service; ask the parser for its
            // secure-processing limits. Best effort: continue if unsupported.
            factory.setFeature(
                    javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        } catch (ParserConfigurationException unsupported) {
            Log.warn("XML parser does not support secure processing",
                    unsupported);
        }
        try {
            InputStream in = new java.io.ByteArrayInputStream(xml
                    .getBytes("UTF-8"));
            return factory.newDocumentBuilder().parse(in);
        } catch (SAXException e) {
            Log.error("Calais response is not well-formed XML", e);
        } catch (IOException e) {
            Log.error("Could not read the Calais response", e);
        } catch (ParserConfigurationException e) {
            Log.error("Could not create an XML parser", e);
        }
        return null;
    }

    /** Visits every result root element of the document, filling m_entities. */
    private void collectEntities(org.w3c.dom.Document doc) {
        NodeList roots = doc.getElementsByTagName(RESULT_ROOT_TAG);
        for (int i = 0; i < roots.getLength(); i++) {
            visit(roots.item(i), 0);
        }
    }

    /** Copies the collected entities into a multimap keyed by entity type. */
    private void fillMultimap(Multimap<String, AnalyzerResult> results) {
        for (CalaisEntity prop : m_entities) {
            results.put(prop.getType(), new AnalyzerResult(prop.getType(),
                    prop.getName(), prop.getRelevance(), prop.getCount()));
            if (Log.isDebugEnabled()) {
                Log.debug("Analyzer Result : " + prop.getType() + ","
                        + prop.getName() + "," + prop.getRelevance() + ","
                        + prop.getCount());
            }
        }
    }

    /** Returns the named attribute's value, or null if it (or the map) is absent. */
    private static String attributeValue(NamedNodeMap attributes, String name) {
        if (attributes == null) {
            return null;
        }
        Node attribute = attributes.getNamedItem(name);
        return (attribute == null) ? null : attribute.getNodeValue();
    }

    /** Reads an integer attribute, falling back when absent or malformed. */
    private static int intAttribute(NamedNodeMap attributes, String name,
            int fallback) {
        String value = attributeValue(attributes, name);
        if (value == null) {
            return fallback;
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException malformed) {
            Log.warn("Ignoring non-numeric '" + name + "' attribute: " + value);
            return fallback;
        }
    }

    /** Reads a floating-point attribute, falling back when absent or malformed. */
    private static double doubleAttribute(NamedNodeMap attributes, String name,
            double fallback) {
        String value = attributeValue(attributes, name);
        if (value == null) {
            return fallback;
        }
        try {
            return Double.parseDouble(value.trim());
        } catch (NumberFormatException malformed) {
            Log.warn("Ignoring non-numeric '" + name + "' attribute: " + value);
            return fallback;
        }
    }
}