// PatentDocumentFeatures.java example
//
// Explorer
// act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.annotation.JsonProperty;
import nu.xom.converters.DOMConverter;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistrySentenceParser;
import uk.ac.cam.ch.wwmm.chemicaltagger.POSContainer;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Features extracted from a single {@link PatentDocument}: the POS-tagged XML documents for the claims and the
 * body text, the molecule-bearing sentences from each (serialized as XML strings), and per-molecule occurrence
 * counts for each.
 *
 * <p>Instances are normally produced by {@link #extractPatentDocumentFeatures(ChemistryPOSTagger, PatentDocument)}.
 * The collections handed to the constructor are stored and returned by the getters as-is (no defensive copies).
 */
public class PatentDocumentFeatures {
  public static final Logger LOGGER = LogManager.getLogger(PatentDocumentFeatures.class);

  /** XPath selecting Sentence elements that are ancestors of a MOLECULE element (or of a node nested in one). */
  public static final String SENTENCE_PATH = "//MOLECULE//ancestor::Sentence";
  /** Name of the root element wrapped around each sentence in the single-sentence documents. */
  public static final String SENTENCE_DOC_HEADER = "molecule-sentence";
  /** XPath selecting every MOLECULE element in a tagged document. */
  public static final String MOLECULE_PATH = "//MOLECULE";

  // TODO: nullable/nonnull annotations

  /**
   * Runs the POS tagger and sentence parser over each text fragment and converts every resulting XOM document
   * into a W3C DOM document.
   *
   * @param docBuilder  Supplies the DOM implementation used for the XOM-to-DOM conversion.
   * @param tagger      The tagger to run over each text fragment.
   * @param textContent The text fragments to tag; {@code null} is treated the same as an empty list.
   * @return One tagged DOM document per input fragment, in input order.
   * @throws ParserConfigurationException Declared for signature stability; not raised by the code in this method.
   * @throws XPathExpressionException     Declared for signature stability; not raised by the code in this method.
   */
  private static List<Document> runTagger(DocumentBuilder docBuilder, ChemistryPOSTagger tagger,
                                          List<String> textContent)
      throws ParserConfigurationException, XPathExpressionException {
    // A document with no text for this section yields no tagged documents rather than a NullPointerException.
    if (textContent == null || textContent.isEmpty()) {
      return new ArrayList<>(0);
    }

    // The DOM implementation does not depend on the text being tagged, so look it up once.
    DOMImplementation domImpl = docBuilder.getDOMImplementation();

    List<Document> tagDocs = new ArrayList<>(textContent.size());
    for (String text : textContent) {
      POSContainer container = tagger.runTaggers(text);
      ChemistrySentenceParser parser = new ChemistrySentenceParser(container);
      parser.parseTags();
      nu.xom.Document xomDoc = parser.makeXMLDocument();
      tagDocs.add(DOMConverter.convert(xomDoc, domImpl));
    }
    return tagDocs;
  }

  /**
   * Extracts sentence nodes from a POS-tagger XML document.  These sentences are intended to provide some notion of
   * locality for identified chemical entities.
   *
   * @param docBuilder A document builder to use when producing single-sentence XML documents.
   * @param doc        The POS-tagger XML document from which to extract sentences; may be {@code null}.
   * @return A list of single-sentence documents, each rooted at a {@link #SENTENCE_DOC_HEADER} element.  Empty if
   * {@code doc} is {@code null} or no node matches {@link #SENTENCE_PATH}.
   * @throws ParserConfigurationException Declared for signature stability; not raised by the code in this method.
   * @throws XPathExpressionException     If {@link #SENTENCE_PATH} cannot be evaluated against {@code doc}.
   */
  private static List<Document> findSentences(DocumentBuilder docBuilder, Document doc)
      throws ParserConfigurationException, XPathExpressionException {
    if (doc == null) {
      LOGGER.warn("Skipping sentence extraction for null document");
      return new ArrayList<>(0);
    }

    // TODO: is there a more efficient yet still safe way to do this?
    XPath xpath = Util.getXPathFactory().newXPath();
    // TODO: get rid of this inline xpath compilation, run during setup.
    NodeList nodes = (NodeList) xpath.evaluate(SENTENCE_PATH, doc, XPathConstants.NODESET);

    int sentenceCount = nodes.getLength();
    List<Document> docList = new ArrayList<>(sentenceCount);
    for (int i = 0; i < sentenceCount; i++) {
      /* With help from:
       * http://examples.javacodegeeks.com/core-java/xml/dom/copy-nodes-subtree-from-one-dom-document-to-another/ */
      Document newDoc = docBuilder.newDocument();
      Element rootElement = newDoc.createElement(SENTENCE_DOC_HEADER);
      // A node belongs to the document that created it, so deep-copy the sentence into the new document.
      rootElement.appendChild(newDoc.importNode(nodes.item(i), true));
      newDoc.appendChild(rootElement);
      docList.add(newDoc);
    }
    return docList;
  }

  /**
   * Collects the content of every text node at or beneath {@code n}, in document order, as separate list entries.
   *
   * <p>Node.getTextContent() returns the concatenation of all text content without any delimitation between nodes.
   * This function instead recursively traverses the node structure, appending (naively) each text node's content
   * to {@code textList} as it goes, so that the caller can choose the delimiter.
   *
   * @param textList The list to append to; it is modified in place.
   * @param n        The node whose text content should be collected.
   * @return {@code textList}, for convenience.
   */
  private static List<String> appendTextContent(List<String> textList, Node n) {
    if (n.getNodeType() == Node.TEXT_NODE) {
      textList.add(n.getTextContent());
      return textList;
    }

    NodeList childNodes = n.getChildNodes();
    for (int j = 0; j < childNodes.getLength(); j++) {
      appendTextContent(textList, childNodes.item(j));
    }
    return textList;
  }

  /**
   * Adds the number of occurrences of each molecule name found in {@code doc} to {@code moleculeCounts}.  A
   * molecule's name is the text of all text nodes beneath its MOLECULE element, joined with single spaces.
   *
   * @param moleculeCounts The running name-to-count map; it is modified in place.
   * @param doc            The tagged document to scan; {@code null} leaves the map untouched.
   * @return {@code moleculeCounts}, for convenience.
   * @throws ParserConfigurationException Declared for signature stability; not raised by the code in this method.
   * @throws XPathExpressionException     If {@link #MOLECULE_PATH} cannot be evaluated against {@code doc}.
   */
  private static Map<String, Integer> extractMoleculeCounts(Map<String, Integer> moleculeCounts, Document doc)
      throws ParserConfigurationException, XPathExpressionException {
    if (doc == null) {
      return moleculeCounts;
    }

    /* This uses //MOLECULE instead of //MOLECULE//text(), as the latter finds all text for all molecules
     * instead of text for each molecule.  We could also do a secondary traversal of each MOLECULE fragment,
     * but running XPath queries over XPath results is a major pain.  Instead, we'll grab the MOLECULE nodes
     * and recursively extract the text content one molecule at a time. */
    XPath xpath = Util.getXPathFactory().newXPath();
    NodeList nodes = (NodeList) xpath.evaluate(MOLECULE_PATH, doc, XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
      List<String> nameParts = appendTextContent(new ArrayList<String>(), nodes.item(i));
      String moleculeName = StringUtils.join(nameParts, " ");
      Integer previous = moleculeCounts.get(moleculeName);
      moleculeCounts.put(moleculeName, previous == null ? 1 : previous + 1);
    }
    return moleculeCounts;
  }

  /**
   * Serializes a DOM document to a string using the supplied transformer.
   *
   * @param transformer The transformer that performs the serialization.
   * @param doc         The document to serialize.
   * @return The transformer's output for {@code doc}.
   * @throws TransformerException If the transformation fails.
   */
  private static String docToString(Transformer transformer, Document doc) throws TransformerException {
    StringWriter stringWriter = new StringWriter();
    transformer.transform(new DOMSource(doc), new StreamResult(stringWriter));
    return stringWriter.toString();
  }

  /** Runs {@link #findSentences} over every tagged document and concatenates the results in order. */
  private static List<Document> findAllSentences(DocumentBuilder docBuilder, List<Document> taggedDocs)
      throws ParserConfigurationException, XPathExpressionException {
    List<Document> sentences = new ArrayList<>(taggedDocs.size());
    for (Document tagged : taggedDocs) {
      sentences.addAll(findSentences(docBuilder, tagged));
    }
    return sentences;
  }

  /** Serializes every document in {@code docs} with {@link #docToString}, preserving order. */
  private static List<String> docsToStrings(Transformer transformer, List<Document> docs)
      throws TransformerException {
    List<String> serialized = new ArrayList<>(docs.size());
    for (Document doc : docs) {
      serialized.add(docToString(transformer, doc));
    }
    return serialized;
  }

  /** Accumulates molecule counts across every tagged document into a fresh map. */
  private static Map<String, Integer> countMolecules(List<Document> taggedDocs)
      throws ParserConfigurationException, XPathExpressionException {
    Map<String, Integer> counts = new HashMap<>();
    for (Document tagged : taggedDocs) {
      extractMoleculeCounts(counts, tagged);
    }
    return counts;
  }

  // TODO: prolly belongs in a factory.

  /**
   * Extracts features from PatentDocument objects, including counts of terms in the patent text that can be
   * identified as chemical entities.
   *
   * <p>The claims text and the body text are processed independently through the same pipeline: tag each text
   * fragment, pull out the sentences that mention a molecule, serialize those sentences, and count molecule names.
   *
   * @param posTagger      A ChemTagger POS (part of speech) tagger to use when extracting features from the patent text.
   * @param patentDocument The PatentDocument from which to extract features.
   * @return A PatentDocumentFeatures object containing features for the specified patent document.
   * @throws ParserConfigurationException If a document builder cannot be created.
   * @throws XPathExpressionException     If one of the sentence/molecule XPath queries cannot be evaluated.
   * @throws TransformerException         If the transformer cannot be created or a sentence cannot be serialized.
   * @throws IOException                  Retained in the signature for compatibility with existing callers.
   */
  public static PatentDocumentFeatures extractPatentDocumentFeatures(
      ChemistryPOSTagger posTagger, PatentDocument patentDocument)
      throws ParserConfigurationException, XPathExpressionException, TransformerException, IOException {
    DocumentBuilder docBuilder = Util.mkDocBuilderFactory().newDocumentBuilder();

    List<Document> claimsDocs = runTagger(docBuilder, posTagger, patentDocument.getClaimsText());
    List<Document> textDocs = runTagger(docBuilder, posTagger, patentDocument.getTextContent());

    List<Document> claimsTags = findAllSentences(docBuilder, claimsDocs);
    List<Document> textTags = findAllSentences(docBuilder, textDocs);

    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    List<String> claimsSentences = docsToStrings(transformer, claimsTags);
    List<String> textSentences = docsToStrings(transformer, textTags);

    // Counts are taken over the full tagged documents, not only over the extracted sentences.
    Map<String, Integer> claimsMoleculeCounts = countMolecules(claimsDocs);
    Map<String, Integer> textMoleculeCounts = countMolecules(textDocs);

    return new PatentDocumentFeatures(patentDocument,
        claimsDocs, textDocs,
        claimsSentences, textSentences,
        claimsMoleculeCounts, textMoleculeCounts);
  }


  /** The patent document these features were extracted from. */
  @JsonProperty("patent_document")
  protected PatentDocument patentDocument;
  // TODO: why are JsonSerialize and JsonDeserialize ignored in this situation (hence they're commented out.)?
  // @JsonSerialize(contentUsing = Util.DocumentSerializer.class)
  // @JsonDeserialize(contentUsing = Util.DocumentDeserializer.class)
  /** POS-tagged XML documents for the claims. */
  @JsonProperty("claims_tags")
  protected List<Document> claimsDocuments;
  // @JsonSerialize(contentUsing = Util.DocumentSerializer.class)
  // @JsonDeserialize(contentUsing = Util.DocumentDeserializer.class)
  /** POS-tagged XML documents for the body text. */
  @JsonProperty("text_tags")
  protected List<Document> textDocuments;
  /** Serialized single-sentence documents extracted from the claims. */
  @JsonProperty("claims_sentences")
  protected List<String> claimsSentences;
  /** Serialized single-sentence documents extracted from the body text. */
  @JsonProperty("text_sentences")
  protected List<String> textSentences;
  /** Occurrence count per molecule name in the claims. */
  @JsonProperty("claims_molecule_counts")
  protected Map<String, Integer> claimsMoleculeCounts;
  /** Occurrence count per molecule name in the body text. */
  @JsonProperty("text_molecule_counts")
  protected Map<String, Integer> textMoleculeCounts;

  /**
   * Creates a features object from already-computed values.  The arguments are stored as given, without copying.
   *
   * @param patentDocument       The patent document the features describe.
   * @param claimsDocuments      POS-tagged XML documents for the claims.
   * @param textDocuments        POS-tagged XML documents for the body text.
   * @param claimsSentences      Serialized molecule-bearing sentences from the claims.
   * @param textSentences        Serialized molecule-bearing sentences from the body text.
   * @param claimsMoleculeCounts Occurrence count per molecule name in the claims.
   * @param textMoleculeCounts   Occurrence count per molecule name in the body text.
   */
  public PatentDocumentFeatures(PatentDocument patentDocument,
                                List<Document> claimsDocuments, List<Document> textDocuments,
                                List<String> claimsSentences, List<String> textSentences,
                                Map<String, Integer> claimsMoleculeCounts, Map<String, Integer> textMoleculeCounts) {
    this.patentDocument = patentDocument;
    this.claimsDocuments = claimsDocuments;
    this.textDocuments = textDocuments;
    this.claimsSentences = claimsSentences;
    this.textSentences = textSentences;
    this.claimsMoleculeCounts = claimsMoleculeCounts;
    this.textMoleculeCounts = textMoleculeCounts;
  }

  /** @return The patent document these features were extracted from. */
  public PatentDocument getPatentDocument() {
    return patentDocument;
  }

  /**
   * @return The POS-tagged XML documents for the claims (the stored list itself, not a copy).
   */
  // NOTE(review): the getter name (singular "Document") does not match the field name "claimsDocuments"; with
  // default Jackson getter detection this may be serialized as a separate property alongside "claims_tags" --
  // confirm against the ObjectMapper configuration before renaming.
  public List<Document> getClaimsDocument() {
    return claimsDocuments;
  }

  /**
   * @return The POS-tagged XML documents for the body text (the stored list itself, not a copy).
   */
  public List<Document> getTextDocument() {
    return textDocuments;
  }

  /** @return The serialized molecule-bearing sentences from the claims. */
  public List<String> getClaimsSentences() {
    return claimsSentences;
  }

  /** @return The serialized molecule-bearing sentences from the body text. */
  public List<String> getTextSentences() {
    return textSentences;
  }

  /** @return The occurrence count per molecule name in the claims. */
  public Map<String, Integer> getClaimsMoleculeCounts() {
    return claimsMoleculeCounts;
  }

  /** @return The occurrence count per molecule name in the body text. */
  public Map<String, Integer> getTextMoleculeCounts() {
    return textMoleculeCounts;
  }
}