package er.extensions.components.javascript;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.webobjects.foundation.NSArray;
import com.webobjects.foundation.NSMutableArray;
import er.extensions.foundation.ERXMutableURL;
import er.extensions.foundation.ERXStringUtilities;
/**
* Provides an interface to the Yahoo Content Analysis Service.
*
* @author mschrag
*/
public class ERXYahooContentAnalysisService {
	/**
	 * Returns a term extraction of significant words or phrases from the given
	 * content using the Yahoo Term Extraction service:
	 * http://developer.yahoo.com/search/content/V1/termExtraction.html.
	 * <p>
	 * HTML is stripped from both the content and the optional search phrase
	 * before they are posted to the service. If the content is null or blank,
	 * an empty array is returned without contacting the service.
	 *
	 * @param appid
	 *            your Yahoo application ID (see
	 *            https://developer.yahoo.com/wsregapp/ )
	 * @param content
	 *            the content to extract terms from
	 * @param context
	 *            an optional search phrase that can provide context for the
	 *            term extraction (may be null)
	 * @param maxTerms
	 *            the maximum number of terms to return, or null for unlimited
	 * @return an array of extracted terms, never more than maxTerms entries
	 * @throws IOException
	 *             if opening the connection, sending the request or reading
	 *             the response fails
	 * @throws SAXException
	 *             if the response cannot be parsed as XML
	 * @throws ParserConfigurationException
	 *             if no XML document builder can be created
	 * @throws FactoryConfigurationError
	 *             if no XML document builder factory is available
	 */
	public static NSArray<String> termExtraction(String appid, String content, String context, Integer maxTerms) throws IOException, SAXException, ParserConfigurationException, FactoryConfigurationError {
		if (content == null || content.trim().length() == 0) {
			return NSArray.<String> emptyArray();
		}

		ERXMutableURL queryParameters = new ERXMutableURL();
		queryParameters.setQueryParameter("appid", appid);
		// The service calls the text to analyze "context" ...
		queryParameters.setQueryParameter("context", ERXStringUtilities.stripHtml(content, false));
		queryParameters.setQueryParameter("output", "xml");
		if (context != null) {
			// ... and the optional search phrase "query". It must not reuse the
			// "context" key, or it would replace the content set above.
			queryParameters.setQueryParameter("query", ERXStringUtilities.stripHtml(context, false));
		}
		String postData = queryParameters.toExternalForm();

		HttpURLConnection conn = (HttpURLConnection) new URL("http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction").openConnection();
		NSMutableArray<String> terms = new NSMutableArray<>();
		try {
			conn.setRequestMethod("POST");
			conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
			conn.setRequestProperty("Accept", "*/*");
			conn.setRequestProperty("Host", "api.search.yahoo.com");
			conn.setRequestProperty("Connection", "Keep-Alive");
			conn.setRequestProperty("Cache-Control", "no-cache");
			// Content-Length counts bytes, so measure the body in the same
			// encoding it is written with below.
			conn.setRequestProperty("Content-Length", Integer.toString(postData.getBytes("UTF-8").length));
			conn.setUseCaches(false);
			conn.setDoInput(true);
			conn.setDoOutput(true);

			// Write with an explicit charset so the body matches the declared
			// Content-Type instead of depending on the platform default.
			try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(), "UTF-8"), true)) {
				pw.print(postData);
			}

			Document resultsDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(conn.getInputStream());
			resultsDoc.normalize();
			NodeList resultNodes = resultsDoc.getDocumentElement().getElementsByTagName("Result");
			// Stop as soon as maxTerms terms have been collected (strictly
			// less-than, otherwise one extra term would be added).
			for (int i = 0; i < resultNodes.getLength() && (maxTerms == null || terms.count() < maxTerms.intValue()); i++) {
				Element resultElement = (Element) resultNodes.item(i);
				// An empty <Result/> element has no child node; skip it rather
				// than fail with a NullPointerException.
				Node firstChild = resultElement.getFirstChild();
				String result = (firstChild == null) ? null : firstChild.getNodeValue();
				if (result != null && result.length() > 0) {
					terms.addObject(result);
				}
			}
		}
		finally {
			conn.disconnect();
		}
		return terms;
	}
}