package com.knowledgebooks.info_spiders;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
/**
* This is a web services client class for using the Open Calais web service
* for analyzing text and identifying entities and relationships between
* entities. You will need a free Open Calais developer's key that must
* be set in the OPEN_CALAIS_KEY environment variable, either exported beforehand
* or defined on the command line when starting the JVM. The key is obtained
* using: <code>System.getenv("OPEN_CALAIS_KEY")</code>
* <p/>
* Using Open Calais is optional since KB_bundle includes its own named entity
* extractor in class {@link com.knowledgebooks.nlp.ExtractNames}
* <p/>
*/
public class OpenCalaisClient {
  /** URL of the Open Calais "Enlighten" endpoint that the form is POSTed to. */
  private static final String ENLIGHTEN_URL = "http://api.opencalais.com/enlighten/calais.asmx/Enlighten";

  /**
   * Processing/user directives sent as the <code>paramsXML</code> form field
   * (content type "text/txt", output format "xml/rdf").
   */
  private static final String PARAMS_XML = "<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"><c:processingDirectives c:contentType=\"text/txt\" c:outputFormat=\"xml/rdf\"></c:processingDirectives><c:userDirectives c:allowDistribution=\"true\" c:allowSearch=\"true\" c:externalID=\"17cabs901\" c:submitter=\"ABC\"></c:userDirectives><c:externalMetadata></c:externalMetadata></c:params>";

  /** Charset used to URL-encode the form fields, write the request and read the response. */
  private static final String ENCODING = "UTF-8";

  /** Text that ends the comment preceding the summary comment in the response. */
  private static final String SUMMARY_MARKER = "terms of service.-->";

  /**
   * Sends <code>text</code> to the Open Calais web service and returns the
   * entities it reports. This can be used as an alternative to the
   * KnowledgeBooks.com entity extraction classes.
   * <p/>
   * The license key is read from the environment variable
   * <code>OPEN_CALAIS_KEY</code>; if it is missing or shorter than five
   * characters an error is printed and the JVM is terminated with exit code 1.
   * <p/>
   * Any failure while calling the service or parsing its response is reported
   * on <code>System.err</code> and an empty table is returned.
   *
   * @param text the plain text to analyze
   * @return {@code Hashtable<String, List<String>>} of properties with associated values list
   *         (e.g., for property "Person" the value list would be a list of names occurring in
   *         the input text; other properties can be "City", "State", "Country", etc.)
   * @throws IOException declared for backward compatibility; failures are currently caught and logged
   * @throws MalformedURLException declared for backward compatibility; failures are currently caught and logged
   */
  public Hashtable<String, List<String>> getPropertyNamesAndValues(String text) throws MalformedURLException, IOException {
    String licenseID = System.getenv("OPEN_CALAIS_KEY");
    if (licenseID == null || licenseID.length() < 5) {
      System.out.println("Error: must have environment variable OPEN_CALAIS_KEY set");
      System.exit(1);
    }
    String result = "";
    try {
      System.out.println("\n\n****** content sent to Open Calais:\n\n"+ text + "\n\n");
      result = post(buildPayload(licenseID, text));
      // The markup arrives entity-escaped; restore the angle brackets so the
      // "<!--" / "-->" comment delimiters searched for below can be found.
      result = result.replace("&lt;", "<").replace("&gt;", ">");
      return parseSummary(extractSummary(result));
    } catch (Exception ex) {
      // Deliberately best-effort: callers receive an empty table instead of an exception.
      System.err.println("\nERROR USING OPEN CALAIS: " + ex + "\nresult string: " + result + "\n");
      ex.printStackTrace();
    }
    return new Hashtable<String, List<String>>();
  }

  /** Builds the application/x-www-form-urlencoded request body, URL-encoding every field value. */
  private static String buildPayload(String licenseID, String content) throws UnsupportedEncodingException {
    String encodedContent = URLEncoder.encode(content, ENCODING);
    StringBuilder sb = new StringBuilder(encodedContent.length() + 1024);
    sb.append("licenseID=").append(URLEncoder.encode(licenseID, ENCODING));
    sb.append("&content=").append(encodedContent);
    sb.append("&paramsXML=").append(URLEncoder.encode(PARAMS_XML, ENCODING));
    return sb.toString();
  }

  /** POSTs the form body to the Enlighten endpoint and returns the complete response text. */
  private static String post(String payload) throws IOException {
    byte[] body = payload.getBytes(ENCODING);
    URLConnection connection = new URL(ENLIGHTEN_URL).openConnection();
    connection.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
    // Content-Length counts bytes, not characters.
    connection.addRequestProperty("Content-Length", String.valueOf(body.length));
    connection.setDoOutput(true);
    OutputStream out = connection.getOutputStream();
    try {
      out.write(body);
      out.flush();
    } finally {
      out.close();
    }
    // get response from Open Calais server:
    // NOTE(review): the response is decoded as UTF-8 -- confirm against the service's Content-Type.
    Scanner scanner = new Scanner(connection.getInputStream(), ENCODING);
    try {
      // "\\Z" (end of input) as delimiter makes the scanner return the whole stream as one token.
      scanner.useDelimiter("\\Z");
      String response = scanner.hasNext() ? scanner.next() : "";
      // Scanner hides read failures; surface them instead of returning a truncated response.
      IOException readFailure = scanner.ioException();
      if (readFailure != null) {
        throw readFailure;
      }
      return response;
    } finally {
      scanner.close(); // also closes the connection's input stream
    }
  }

  /**
   * Returns the text of the first comment that follows the "terms of service"
   * comment in the (unescaped) response.
   */
  private static String extractSummary(String response) throws IOException {
    int marker = response.indexOf(SUMMARY_MARKER);
    if (marker < 0) {
      throw new IOException("Open Calais response does not contain the expected \"" + SUMMARY_MARKER + "\" marker");
    }
    int start = response.indexOf("<!--", marker);
    int end = start < 0 ? -1 : response.indexOf("-->", start);
    if (end < 0) {
      throw new IOException("Open Calais response does not contain a summary comment after the terms of service");
    }
    return response.substring(start + 4, end);
  }

  /**
   * Parses summary lines of the form <code>name: value1, value2, ...</code>.
   * Lines without a colon are ignored; names and values are trimmed.
   */
  private static Hashtable<String, List<String>> parseSummary(String summary) {
    Hashtable<String, List<String>> ret = new Hashtable<String, List<String>>();
    for (String line : summary.split("\\n")) {
      int index = line.indexOf(":");
      if (index > -1) {
        String relation = line.substring(0, index).trim();
        String[] entities = line.substring(index + 1).trim().split(",");
        for (int i = 0, size = entities.length; i < size; i++) {
          entities[i] = entities[i].trim();
        }
        ret.put(relation, Arrays.asList(entities));
      }
    }
    return ret;
  }

  /**
   * Creates a client; no state is kept between calls.
   */
  public OpenCalaisClient() {
  }
}