/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.patentExtractor;
import com.fasterxml.jackson.annotation.JsonProperty;
import nu.xom.converters.DOMConverter;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistrySentenceParser;
import uk.ac.cam.ch.wwmm.chemicaltagger.POSContainer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
 * Features derived from a {@link PatentDocument} by running a chemistry part-of-speech tagger over its claims and
 * text content: the tagged XML documents, the serialized sentences that mention a molecule, and per-molecule
 * occurrence counts.
 *
 * Instances are built with {@link #extractPatentDocumentFeatures(ChemistryPOSTagger, PatentDocument)}.
 */
public class PatentDocumentFeatures {
  public static final Logger LOGGER = LogManager.getLogger(PatentDocumentFeatures.class);

  /** XPath selecting every Sentence element that is an ancestor of a MOLECULE element or of one of its descendants. */
  public static final String SENTENCE_PATH = "//MOLECULE//ancestor::Sentence";
  /** Name of the root element that wraps each extracted sentence in its own single-sentence document. */
  public static final String SENTENCE_DOC_HEADER = "molecule-sentence";
  /** XPath selecting every MOLECULE element in a tagged document. */
  public static final String MOLECULE_PATH = "//MOLECULE";

  // TODO: nullable/nonnull annotations

  /**
   * Runs the POS tagger and sentence parser over each text fragment and converts the resulting XOM documents into
   * W3C DOM documents.
   *
   * @param docBuilder  Supplies the DOM implementation used for the XOM-to-DOM conversion.
   * @param tagger      The tagger to run over each text fragment.
   * @param textContent The text fragments to tag; one output document is produced per fragment, in order.
   * @return The tagged documents, one per input fragment.
   * @throws ParserConfigurationException
   * @throws XPathExpressionException
   */
  private static List<Document> runTagger(DocumentBuilder docBuilder, ChemistryPOSTagger tagger,
                                          List<String> textContent)
      throws ParserConfigurationException, XPathExpressionException {
    // The DOM implementation does not depend on the text being tagged, so fetch it once rather than per fragment.
    DOMImplementation domImpl = docBuilder.getDOMImplementation();
    List<Document> tagDocs = new ArrayList<>(textContent.size());
    for (String text : textContent) {
      POSContainer container = tagger.runTaggers(text);
      ChemistrySentenceParser parser = new ChemistrySentenceParser(container);
      parser.parseTags();
      nu.xom.Document xomDoc = parser.makeXMLDocument();
      tagDocs.add(DOMConverter.convert(xomDoc, domImpl));
    }
    return tagDocs;
  }

  /**
   * Extracts sentence nodes from a POS-tagger XML document. These sentences are intended to provide some notion of
   * locality for identified chemical entities.
   *
   * @param docBuilder A document builder to use when producing single-sentence XML documents.
   * @param doc        The POS-tagger XML document from which to extract sentences; a null document yields an empty
   *                   list (and a logged warning).
   * @return A list of single-sentence documents, each rooted at a {@link #SENTENCE_DOC_HEADER} element.
   * @throws ParserConfigurationException
   * @throws XPathExpressionException
   */
  private static List<Document> findSentences(DocumentBuilder docBuilder, Document doc)
      throws ParserConfigurationException, XPathExpressionException {
    if (doc == null) {
      LOGGER.warn("Skipping sentence extraction for null tagged document");
      return new ArrayList<>(0);
    }

    // TODO: is there a more efficient yet still safe way to do this?
    XPath xpath = Util.getXPathFactory().newXPath();
    // TODO: get rid of this inline xpath compilation, run during setup.
    NodeList nodes = (NodeList) xpath.evaluate(SENTENCE_PATH, doc, XPathConstants.NODESET);
    int sentenceCount = nodes.getLength();
    List<Document> docList = new ArrayList<>(sentenceCount);
    for (int i = 0; i < sentenceCount; i++) {
      /* With help from:
       * http://examples.javacodegeeks.com/core-java/xml/dom/copy-nodes-subtree-from-one-dom-document-to-another/ */
      Document newDoc = docBuilder.newDocument();
      Element rootElement = newDoc.createElement(SENTENCE_DOC_HEADER);
      // A node belongs to the document that created it, so deep-copy the sentence into the new document first.
      rootElement.appendChild(newDoc.importNode(nodes.item(i), true));
      newDoc.appendChild(rootElement);
      docList.add(newDoc);
    }
    return docList;
  }

  /* Node.getTextContent() returns the concatenation of all text content without any delimitation between nodes.
   * This function recursively traverses the node's subtree in document order, appending (naively) the content of
   * each text node to the supplied list as it goes, so that the caller can join the pieces with a delimiter.
   * The list passed in is mutated and returned. */
  private static List<String> appendTextContent(List<String> textList, Node n) {
    if (n.getNodeType() == Node.TEXT_NODE) {
      textList.add(n.getTextContent());
      return textList;
    }
    NodeList childNodes = n.getChildNodes();
    for (int j = 0; j < childNodes.getLength(); j++) {
      appendTextContent(textList, childNodes.item(j));
    }
    return textList;
  }

  /**
   * Adds the molecule occurrences found in one tagged document to a running tally.
   *
   * @param moleculeCounts The tally to update in place, keyed by molecule name.
   * @param doc            The tagged document to scan; a null document leaves the tally untouched.
   * @return The same map that was passed in.
   * @throws ParserConfigurationException
   * @throws XPathExpressionException
   */
  private static Map<String, Integer> extractMoleculeCounts(Map<String, Integer> moleculeCounts, Document doc)
      throws ParserConfigurationException, XPathExpressionException {
    if (doc == null) {
      return moleculeCounts;
    }

    /* This uses //MOLECULE instead of //MOLECULE//text(), as the latter finds all text for all molecules
     * instead of text for each molecule. We could also do a secondary traversal of each MOLECULE fragment,
     * but running XPath queries over XPath results is a major pain. Instead, we'll grab the MOLECULE nodes
     * and recursively extract the text content one molecule at a time. */
    XPath xpath = Util.getXPathFactory().newXPath();
    NodeList nodes = (NodeList) xpath.evaluate(MOLECULE_PATH, doc, XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
      List<String> nameParts = appendTextContent(new ArrayList<String>(), nodes.item(i));
      String moleculeName = StringUtils.join(nameParts, " ");
      Integer previous = moleculeCounts.get(moleculeName);
      moleculeCounts.put(moleculeName, previous == null ? 1 : previous + 1);
    }
    return moleculeCounts;
  }

  /** Serializes a DOM document to a string using the supplied transformer. */
  private static String docToString(Transformer transformer, Document doc) throws TransformerException {
    StringWriter stringWriter = new StringWriter();
    transformer.transform(new DOMSource(doc), new StreamResult(stringWriter));
    return stringWriter.toString();
  }

  /** Collects the single-sentence documents of every tagged document into one list, preserving order. */
  private static List<Document> findAllSentences(DocumentBuilder docBuilder, List<Document> taggedDocs)
      throws ParserConfigurationException, XPathExpressionException {
    List<Document> sentences = new ArrayList<>(taggedDocs.size());
    for (Document d : taggedDocs) {
      sentences.addAll(findSentences(docBuilder, d));
    }
    return sentences;
  }

  /** Serializes each document to a string, preserving order. */
  private static List<String> docsToStrings(Transformer transformer, List<Document> docs)
      throws TransformerException {
    List<String> serialized = new ArrayList<>(docs.size());
    for (Document doc : docs) {
      serialized.add(docToString(transformer, doc));
    }
    return serialized;
  }

  /** Tallies molecule occurrences across all of the given tagged documents into a fresh map. */
  private static Map<String, Integer> countMolecules(List<Document> taggedDocs)
      throws ParserConfigurationException, XPathExpressionException {
    Map<String, Integer> counts = new HashMap<>();
    for (Document doc : taggedDocs) {
      extractMoleculeCounts(counts, doc);
    }
    return counts;
  }

  // TODO: prolly belongs in a factory.
  /**
   * Extracts features from PatentDocument objects, including counts of terms in the patent text that can be
   * identified as chemical entities.
   *
   * @param posTagger      A ChemTagger POS (part of speech) tagger to use when extracting features from the patent
   *                       text.
   * @param patentDocument The PatentDocument from which to extract features.
   * @return A PatentDocumentFeatures object containing features for the specified patent document.
   * @throws ParserConfigurationException
   * @throws XPathExpressionException
   * @throws TransformerException
   * @throws IOException
   */
  public static PatentDocumentFeatures extractPatentDocumentFeatures(
      ChemistryPOSTagger posTagger, PatentDocument patentDocument)
      throws ParserConfigurationException, XPathExpressionException, TransformerException, IOException {
    DocumentBuilder docBuilder = Util.mkDocBuilderFactory().newDocumentBuilder();

    // Tag the claims and the body text separately so that their features can be reported separately.
    List<Document> claimsDocs = runTagger(docBuilder, posTagger, patentDocument.getClaimsText());
    List<Document> textDocs = runTagger(docBuilder, posTagger, patentDocument.getTextContent());

    List<Document> claimsTags = findAllSentences(docBuilder, claimsDocs);
    List<Document> textTags = findAllSentences(docBuilder, textDocs);

    // One transformer is shared for all of the (sequential) serializations below.
    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    List<String> claimsSentences = docsToStrings(transformer, claimsTags);
    List<String> textSentences = docsToStrings(transformer, textTags);

    // Molecule counts are taken over the full tagged documents, not over the extracted sentences.
    Map<String, Integer> claimsMoleculeCounts = countMolecules(claimsDocs);
    Map<String, Integer> textMoleculeCounts = countMolecules(textDocs);

    return new PatentDocumentFeatures(patentDocument,
        claimsDocs, textDocs,
        claimsSentences, textSentences,
        claimsMoleculeCounts, textMoleculeCounts);
  }

  /** The patent document these features were extracted from. */
  @JsonProperty("patent_document")
  protected PatentDocument patentDocument;

  // TODO: why are JsonSerialize and JsonDeserialize ignored in this situation (hence they're commented out.)?
  // @JsonSerialize(contentUsing = Util.DocumentSerializer.class)
  // @JsonDeserialize(contentUsing = Util.DocumentDeserializer.class)
  /** Tagged XML documents for the claims. */
  @JsonProperty("claims_tags")
  protected List<Document> claimsDocuments;

  // @JsonSerialize(contentUsing = Util.DocumentSerializer.class)
  // @JsonDeserialize(contentUsing = Util.DocumentDeserializer.class)
  /** Tagged XML documents for the text content. */
  @JsonProperty("text_tags")
  protected List<Document> textDocuments;

  /** Serialized sentences from the claims. */
  @JsonProperty("claims_sentences")
  protected List<String> claimsSentences;

  /** Serialized sentences from the text content. */
  @JsonProperty("text_sentences")
  protected List<String> textSentences;

  /** Occurrence count per molecule name in the claims. */
  @JsonProperty("claims_molecule_counts")
  protected Map<String, Integer> claimsMoleculeCounts;

  /** Occurrence count per molecule name in the text content. */
  @JsonProperty("text_molecule_counts")
  protected Map<String, Integer> textMoleculeCounts;

  /**
   * Creates a features object from already-computed values. The supplied collections are stored as-is (no
   * defensive copies are made).
   *
   * @param patentDocument       The source patent document.
   * @param claimsDocuments      Tagged XML documents for the claims.
   * @param textDocuments        Tagged XML documents for the text content.
   * @param claimsSentences      Serialized sentences from the claims.
   * @param textSentences        Serialized sentences from the text content.
   * @param claimsMoleculeCounts Occurrence count per molecule name in the claims.
   * @param textMoleculeCounts   Occurrence count per molecule name in the text content.
   */
  public PatentDocumentFeatures(PatentDocument patentDocument,
                                List<Document> claimsDocuments, List<Document> textDocuments,
                                List<String> claimsSentences, List<String> textSentences,
                                Map<String, Integer> claimsMoleculeCounts, Map<String, Integer> textMoleculeCounts) {
    this.patentDocument = patentDocument;
    this.claimsDocuments = claimsDocuments;
    this.textDocuments = textDocuments;
    this.claimsSentences = claimsSentences;
    this.textSentences = textSentences;
    this.claimsMoleculeCounts = claimsMoleculeCounts;
    this.textMoleculeCounts = textMoleculeCounts;
  }

  /** @return The patent document these features were extracted from. */
  public PatentDocument getPatentDocument() {
    return patentDocument;
  }

  /** @return The tagged XML documents for the claims (the stored list itself, not a copy). */
  public List<Document> getClaimsDocument() {
    return claimsDocuments;
  }

  /** @return The tagged XML documents for the text content (the stored list itself, not a copy). */
  public List<Document> getTextDocument() {
    return textDocuments;
  }

  /** @return The serialized sentences from the claims. */
  public List<String> getClaimsSentences() {
    return claimsSentences;
  }

  /** @return The serialized sentences from the text content. */
  public List<String> getTextSentences() {
    return textSentences;
  }

  /** @return Occurrence count per molecule name in the claims. */
  public Map<String, Integer> getClaimsMoleculeCounts() {
    return claimsMoleculeCounts;
  }

  /** @return Occurrence count per molecule name in the text content. */
  public Map<String, Integer> getTextMoleculeCounts() {
    return textMoleculeCounts;
  }
}