/*
* Copyright (c) 2006-2014 by Public Library of Science
*
* http://plos.org
* http://ambraproject.org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ambraproject.service.article;
import org.ambraproject.util.DocumentBuilderFactoryCreator;
import org.ambraproject.util.XPathUtil;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathException;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link ArticleClassifier} implementation that delegates categorization to a remote
 * MAI ("AI") taxonomy server. The relevant text of an article is extracted from its
 * XML, posted to the server's {@code getSuggestedTermsFullPaths} method, and the
 * returned term/weight pairs are parsed into a map.
 *
 * @author Alex Kudlick
 * Date: 7/3/12
 */
public class AIArticleClassifier implements ArticleClassifier {

  private static final Logger log = LoggerFactory.getLogger(AIArticleClassifier.class);

  // Request envelope for the MAI server. The thesaurus name is substituted for %s
  // (via String.format) and the escaped article text goes between BEGIN and END.
  private static final String MESSAGE_BEGIN = "<TMMAI project='%s' location = '.'>\n" +
      " <Method name='getSuggestedTermsFullPaths' returnType='java.util.Vector'/>\n" +
      " <VectorParam>\n" +
      " <VectorElement>";

  private static final String MESSAGE_END = "</VectorElement>\n" +
      " </VectorParam>\n" +
      "</TMMAI>";

  // When the new taxonomy launched, we had a problem where lots of PLOS ONE papers
  // were being tagged with subcategories of "/Earth sciences/Geography/Locations"
  // (see Jira TAX-30). So we're just blacklisting this category for now.
  // TODO: tweak the AI taxonomy server rulebase to make this unnecessary, and
  // remove the hack.
  private static final String BLACKLISTED_CATEGORY_PREFIX = "/Earth sciences/Geography/Locations/";

  private String serviceUrl;
  private String thesaurus;
  private HttpClient httpClient;

  @Required
  public void setServiceUrl(String serviceUrl) {
    this.serviceUrl = serviceUrl;
  }

  @Required
  public void setThesaurus(String thesaurus) {
    this.thesaurus = thesaurus;
  }

  @Required
  public void setHttpClient(HttpClient httpClient) {
    this.httpClient = httpClient;
  }

  /**
   * @inheritDoc
   */
  @Override
  public Map<String, Integer> classifyArticle(Document articleXml) throws Exception {
    List<String> rawTerms = getRawTerms(articleXml);
    Map<String, Integer> results = new LinkedHashMap<String, Integer>(rawTerms.size());

    for (String rawTerm : rawTerms) {
      Map.Entry<String, Integer> entry = parseVectorElement(rawTerm);

      // parseVectorElement returns null for malformed terms (e.g. terms missing the
      // leading slash). Skip them rather than blowing up on a NullPointerException.
      if (entry == null || entry.getKey() == null) {
        log.warn("Ignoring malformed term from taxonomy server: {}", rawTerm);
        continue;
      }
      if (!entry.getKey().startsWith(BLACKLISTED_CATEGORY_PREFIX)) {
        results.put(entry.getKey(), entry.getValue());
      }
    }
    return results;
  }

  /**
   * Queries the MAI server for taxonomic terms for a given article, and returns a list
   * of the raw results.
   *
   * @param articleXml DOM of the article to categorize
   * @return List of results from the server. This will consist of raw XML fragments, and
   *         include things like counts that we don't currently store in mysql.
   * @throws Exception on I/O or XML-parsing failure
   */
  private List<String> getRawTerms(Document articleXml) throws Exception {
    String toCategorize = getCategorizationContent(articleXml);
    String aiMessage = String.format(MESSAGE_BEGIN, thesaurus) + toCategorize + MESSAGE_END;

    PostMethod post = new PostMethod(serviceUrl);
    post.setRequestEntity(new StringRequestEntity(aiMessage, "application/xml", "UTF-8"));

    Document response;
    try {
      httpClient.executeMethod(post);
      response = DocumentBuilderFactoryCreator.createFactory()
          .newDocumentBuilder().parse(post.getResponseBodyAsStream());
    } finally {
      // Return the connection to the pool; without this, connections leak from the
      // MultiThreadedHttpConnectionManager and the pool eventually starves.
      post.releaseConnection();
    }

    NodeList vectorElements = response.getElementsByTagName("VectorElement");
    List<String> results = new ArrayList<String>(vectorElements.getLength());
    // The first and last elements of the vector response are just MAITERMS
    for (int i = 1; i < vectorElements.getLength() - 1; i++) {
      results.add(vectorElements.item(i).getTextContent());
    }
    return results;
  }

  // There appears to be a bug in the AI getSuggestedTermsFullPath method.
  // It's supposed to return a slash-delimited path that starts with a slash,
  // like an absolute Unix file path. However, rarely, it just returns "naked"
  // terms without the leading slash. Discard these, since the calling
  // code won't be able to handle this. (Note the first slash after <TERM> in the regex)
  //
  //Positive (Good term) example response:
  //"<TERM>/Biology and life sciences/Computational biology/Computational neuroscience/Single neuron function|(5) neuron*(5)</TERM>"
  //
  //This regex:
  //Confirms the response is good
  //Finds the term and places in the result
  //Finds first number wrapped in parentheses after the pipe symbol and places it in the result
  static final Pattern TERM_PATTERN = Pattern.compile("<TERM>\\s*(/.*)\\|\\s*\\((\\d+)\\).*</TERM>");

  /**
   * Parses a single line of the XML response from the taxonomy server.
   *
   * @param vectorElement The text body of a line of the response
   * @return the term and weight of the term or null if the line is not valid
   */
  static Map.Entry<String, Integer> parseVectorElement(String vectorElement) {
    Matcher match = TERM_PATTERN.matcher(vectorElement);

    if (match.find()) {
      String text = match.group(1);
      Integer value = Integer.valueOf(match.group(2));
      return new AbstractMap.SimpleImmutableEntry<String, Integer>(text, value);
    } else {
      //Bad term, return null
      return null;
    }
  }

  /**
   * Adds the text content of the given element to the StringBuilder, if it exists.
   * If more than one element exists with the given name, only appends the first one.
   *
   * @param sb          StringBuilder to be modified
   * @param dom         DOM tree of an article
   * @param elementName name of element to search for in the dom
   * @return true if the StringBuilder was modified
   */
  boolean appendElementIfExists(StringBuilder sb, Document dom, String elementName) {
    NodeList list = dom.getElementsByTagName(elementName);
    if (list != null && list.getLength() > 0) {
      sb.append(list.item(0).getTextContent());
      sb.append("\n");
      return true;
    } else {
      return false;
    }
  }

  /**
   * Adds the text content of all found elements to the StringBuilder, if they exist.
   *
   * @param sb          StringBuilder to be modified
   * @param dom         DOM tree of an article
   * @param elementName name of element to search for in the dom
   * @return true if the StringBuilder was modified
   */
  boolean appendAllElementsIfExists(StringBuilder sb, Document dom, String elementName) {
    NodeList list = dom.getElementsByTagName(elementName);
    if (list != null && list.getLength() > 0) {
      for (int a = 0; a < list.getLength(); a++) {
        sb.append(list.item(a).getTextContent());
        sb.append("\n");
      }
      return true;
    } else {
      return false;
    }
  }

  /**
   * Appends a given section of the article, with one of the given titles, to the
   * StringBuilder passed in. (Examples include "Results", "Materials and Methods",
   * "Discussion", etc.)
   *
   * @param sb            StringBuilder to be modified
   * @param dom           DOM tree of an article
   * @param sectionTitles list of titles to look for. The first one found will be
   *                      appended.
   * @return true if the StringBuilder was modified
   * @throws XPathException on a malformed XPath expression
   */
  boolean appendSectionIfExists(StringBuilder sb, Document dom, String... sectionTitles)
      throws XPathException {
    XPathUtil xPathUtil = new XPathUtil();
    for (String title : sectionTitles) {
      Node node = xPathUtil.selectSingleNode(dom,
          String.format("/article/body/sec[title='%s']", title));
      if (node != null) {
        sb.append(node.getTextContent());
        sb.append("\n");
        return true;
      }
    }
    return false;
  }

  /**
   * Returns a string containing only the parts of the article that should be examined
   * by the taxonomy server. This is presently the article title, every abstract, and
   * the full body text, concatenated in that order. (Any part that is not present is
   * simply skipped; that is not a fatal error.)
   *
   * @param dom DOM tree of an article
   * @return raw text content, XML-escaped, of the relevant article sections
   * @throws TransformerException on a transformation failure
   * @throws XPathException on a malformed XPath expression
   */
  String getCategorizationContent(Document dom) throws TransformerException, XPathException {
    StringBuilder sb = new StringBuilder();
    appendElementIfExists(sb, dom, "article-title");
    appendAllElementsIfExists(sb, dom, "abstract");
    appendElementIfExists(sb, dom, "body");
    return StringEscapeUtils.escapeXml(sb.toString().trim());
  }

  // Utility main method and associated code useful for grabbing categories for individual
  // articles.
  // TODO: consider moving this somewhere else.

  private static final Pattern DOI_REGEX = Pattern.compile("(p[a-z]{3}\\.\\d{7})");

  private static final String XML_URL = "http://www.plosone.org/article/fetchObjectAttachment.action"
      + "?uri=info%%3Adoi%%2F10.1371%%2Fjournal.%s&representation=XML";

  private static final String XML_URL_FULLDOI = "http://www.plosone.org/article/fetchObjectAttachment.action"
      + "?uri=%s&representation=XML";

  /**
   * Returns the XML for an article. Note that this fetches the article XML via a
   * web request to the live site, not using a filestore.
   *
   * @param articleUrl URL of the article XML to fetch
   * @return String of the article XML, if found
   * @throws Exception on a connection or read failure
   */
  private String fetchXml(String articleUrl) throws Exception {
    URL url = new URL(articleUrl);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.connect();
    InputStream is = conn.getInputStream();
    try {
      // Decode explicitly as UTF-8; the no-charset overload would silently use the
      // platform default encoding and corrupt non-ASCII article text.
      return IOUtils.toString(is, "UTF-8");
    } finally {
      IOUtils.closeQuietly(is);
      conn.disconnect();
    }
  }

  /**
   * @inheritDoc
   */
  public void testThesaurus(final OutputStream os, final String doi, final String thesaurus) throws Exception {
    String full_doi = String.format(XML_URL_FULLDOI, doi);
    String xml = fetchXml(full_doi);
    PrintStream ps = new PrintStream(os);

    Document dom = DocumentBuilderFactoryCreator.createFactory().newDocumentBuilder().parse(
        new ByteArrayInputStream(xml.getBytes("utf-8")));

    AIArticleClassifier classifier = new AIArticleClassifier();
    ps.println("Content to send to taxonomy server:");
    ps.println("\n\n" + classifier.getCategorizationContent(dom) + "\n\n");

    classifier.setServiceUrl("https://plos.accessinn.com:9136/servlet/dh");
    classifier.setThesaurus(thesaurus);
    classifier.setHttpClient(new HttpClient(new MultiThreadedHttpConnectionManager()));

    List<String> rawOutput = classifier.getRawTerms(dom);
    ps.println("\n\nTerms returned by taxonomy server:");
    for (String s : rawOutput) {
      // Strip out XML wrapping
      s = s.replace("<TERM>", "");
      s = s.replace("</TERM>", "");

      // Replicate the hack in classifyArticle() above, so that this main method only shows what
      // we would actually store in mysql.
      if (!s.startsWith(BLACKLISTED_CATEGORY_PREFIX)) {
        ps.println(s);
      }
    }
    ps.println("\n\n");
  }

  /**
   * Main method that categorizes a single article, based on its DOI as input on
   * the command line.
   *
   * @param args thesaurus name followed by a PLOS article DOI
   * @throws Exception on any failure
   */
  @Deprecated
  public static void main(String... args) throws Exception {
    //TODO: Delete me, not likely in use any longer
    if (args.length != 2) {
      System.err.println("You must specify the thesaurus as the first argument, and the PLOS "
          + "article as the second. You entered: " + Arrays.toString(args));
      System.exit(1);
    }

    Matcher matcher = DOI_REGEX.matcher(args[1]);

    // Check find() before calling group(); the old "doi != null" check was dead code,
    // since group(1) after an unchecked find() throws IllegalStateException instead.
    if (matcher.find()) {
      String doi = String.format(XML_URL, matcher.group(1));
      AIArticleClassifier aiArticleClassifier = new AIArticleClassifier();
      aiArticleClassifier.testThesaurus(System.out, doi, args[0].trim());
    } else {
      System.out.println(args[1] + " is not a valid DOI");
      System.exit(1);
    }
  }
}