PatentDocument.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * This class represents parts of a USPTO patent document that are relevant to 20's use cases.  It can extract
 * information from the USPTO's XML documents and convert it to a POJO that can then be serialized as JSON.  Use this
 * as the basis for any processing of patent text.
 */
public class PatentDocument {

  public static final Logger LOGGER = LogManager.getLogger(PatentDocument.class);

  // See http://www.uspto.gov/learning-and-resources/xml-resources.
  public static final String DTD2014 = "v4.5 2014-04-03";
  public static final String DTD2013 = "v4.4 2013-05-16";
  public static final String DTD2012 = "v4.3 2012-12-04";
  public static final String DTD2006 = "v4.2 2006-08-23";
  public static final String DTD2005 = "v4.1 2005-08-25";
  public static final String DTD2004 = "v40 2004-12-02";

  public static final String DTD2014_APP = "v4.4 2014-04-03";
  public static final String DTD2012_APP = "v4.3 2012-12-04";
  public static final String DTD2006_APP = "v4.2 2006-08-23";
  public static final String DTD2005_APP = "v4.1 2005-08-25";
  public static final String DTD2004_APP = "v4.0 2004-12-02";

  public static final String PATH_DTD_VERSION = "/us-patent-grant/@dtd-version";
  public static final String PATH_DTD_VERSION_APP = "/us-patent-application/@dtd-version";
  public static final String[] PATHS_TEXT = {
      "//description",
      "//invention-title",
      "//abstract",
  };
  public static final String PATH_CLAIMS = "//claims";

  public static final String
      PATH_KEY_FILE_ID = "fileId",
      PATH_KEY_TITLE = "title",
      PATH_KEY_DATE = "date",
      PATH_KEY_MAIN_CLASSIFICATION = "classification",
      PATH_KEY_FURTHER_CLASSIFICATIONS = "further_classifications",
      PATH_KEY_SEARCHED_CLASSIFICATIONS = "referenced_classifications";

  // TODO: is there a type-safe way of building an object from XPath with a map of functions?
  public static final HashMap<String, String> PATHS_2013 = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-grant/@file");
    put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title");
    put(PATH_KEY_DATE, "/us-patent-grant/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification");
  }};

  public static final HashMap<String, String> PATHS_2004 = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-grant/@file");
    put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title");
    put(PATH_KEY_DATE, "/us-patent-grant/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/field-of-search/classification-national[./country/text()='US']/main-classification");
  }};

  public static final HashMap<String, String> PATHS_2014_APP = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-application/@file");
    put(PATH_KEY_TITLE, "/us-patent-application/us-bibliographic-data-application/invention-title");
    put(PATH_KEY_DATE, "/us-patent-application/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-application/us-bibliographic-data-application/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-application/us-bibliographic-data-application/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS, // Note: doesn't exist, but left for ease of use.
        "/us-patent-application/us-bibliographic-data-application/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification");
  }};

  public static final HashMap<String, HashMap<String, String>> VERSION_MAP =
      new HashMap<String, HashMap<String, String>>() {{
        put(DTD2014, PATHS_2013); // All the 2013 paths work with the 2014 DTD.
        put(DTD2013, PATHS_2013);
        put(DTD2012, PATHS_2013); // All the 2013 paths work with the 2012 DTD.
        put(DTD2006, PATHS_2013); // All the 2013 paths work with the 2006 DTD.
        put(DTD2005, PATHS_2013); // All the 2013 paths work with the 2005 DTD.
        put(DTD2004, PATHS_2004);
        put(DTD2014_APP, PATHS_2014_APP);
        put(DTD2012_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2012 app DTD.
        put(DTD2006_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2006 app DTD.
        put(DTD2005_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD, though the classifications might be different.
        put(DTD2004_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD assuming searched classifications are always empty.
      }};

  private static final Pattern GZIP_PATTERN = Pattern.compile("\\.gz$");

  public static class HtmlVisitor implements NodeVisitor {
    // Based on https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java
    private static final HashSet<String> SEGMENTING_NODES = new HashSet<String>() {{
      addAll(Arrays.asList(
          "p", "h1", "h2", "h3", "h4", "h5", "h6", "dt", "dd", "tr", "li", "body", "div", // HTML entities
          "row", "claim" // patent-specific entities
      ));
    }};
    private static final Pattern SPACE_PATTERN = Pattern.compile("^\\s+$");

    private StringBuilder segmentBuilder = new StringBuilder();
    private List<String> textSegments = new LinkedList<>();

    @Override
    public void head(org.jsoup.nodes.Node node, int i) {
      // This borrows a page from HtmlToPlainText's book.
      if (node instanceof TextNode) {
        String text = ((TextNode) node).text();
        if (text != null && text.length() > 0) {
          segmentBuilder.append(((TextNode) node).text());
        }
      }
    }

    @Override
    public void tail(org.jsoup.nodes.Node node, int i) {
      String nodeName = node.nodeName();
      if (nodeName.equals("a")) {
        // Same as Jsoup's HtmlToPlainText.
        segmentBuilder.append(String.format(" <%s>", node.absUrl("href")));
      } else if (SEGMENTING_NODES.contains(nodeName) && segmentBuilder.length() > 0) {
        String segmentText = segmentBuilder.toString();
        // Ignore blank lines, as we'll be tagging each line separately.
        if (!SPACE_PATTERN.matcher(segmentText).matches()) {
          this.textSegments.add(segmentText);
        }
        // TODO: is it better to drop the old one than clear the existing?
        segmentBuilder.setLength(0);
      }
    }

    public List<String> getTextContent() {
      return this.textSegments;
    }
  }

  private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes)
      throws ParserConfigurationException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    List<String> allTextList = new ArrayList<>(0);
    if (textNodes != null) {
      for (int i = 0; i < textNodes.getLength(); i++) {
        Node n = textNodes.item(i);
                    /* This extremely around-the-horn approach to handling text content is due to the mix of HTML and
                     * XML in the patent body.  We use Jsoup to parse the HTML entities we find in the body, and use
                     * its extremely convenient NodeVisitor API to recursively traverse the document and extract the
                     * text content in reasonable chunks.
                     */
        Document contentsDoc = Util.nodeToDocument(docBuilder, "body", n);
        String docText = Util.documentToString(contentsDoc);
        // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java
        org.jsoup.nodes.Document htmlDoc = Jsoup.parse(docText);
        HtmlVisitor visitor = new HtmlVisitor();
        NodeTraversor traversor = new NodeTraversor(visitor);
        traversor.traverse(htmlDoc);
        List<String> textSegments = visitor.getTextContent();
        allTextList.addAll(textSegments);
      }
    }
    return allTextList;
  }

  /**
   * Extracts the text content from text fields in a patent XML document.
   *
   * @param docBuilder A document builder to use when constructing intermediate XML/HTML documents in the extraction
   *                   process.
   * @param paths      A list of XPath paths from which to exactract text.
   * @param xpath      An XPath instance to use when running XPath queries.
   * @param doc        The XML document from which to extract text.
   * @return A list of strings representing the textual content of the document.  These could be sentences,
   * paragraphs, or larger text units, but should represent some sort of structure in the document's text.
   * @throws ParserConfigurationException
   * @throws TransformerConfigurationException
   * @throws TransformerException
   * @throws XPathExpressionException
   */
  private static List<String> getRelevantDocumentText(DocumentBuilder docBuilder, String[] paths,
                                                      XPath xpath, Document doc)
      throws ParserConfigurationException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    List<String> allTextList = new ArrayList<>(0);
    for (String path : paths) {
      XPathExpression exp = xpath.compile(path);
      NodeList textNodes = (NodeList) exp.evaluate(doc, XPathConstants.NODESET);
      allTextList.addAll(extractTextFromHTML(docBuilder, textNodes));
    }

    return allTextList;
  }

  /**
   * Converts an XML file into a patent document object, extracting relevant fields from the patent XML.
   *
   * @param inputPath A path to the file to be read.
   * @return A patent object if the XML can be read, or null otherwise.
   * @throws IOException                  Thrown on file I/O errors.
   * @throws ParserConfigurationException Thrown when the XML parser cannot be configured correctly.
   * @throws SAXException                 Thrown on XML parser errors.
   * @throws XPathExpressionException     Thrown when XPath fails to handle queries against the specified document.
   */
  // TODO: logging?
  // TODO: are @nullable and @non-null annotations still a thing?
  // TODO: prolly belongs in a factory.
  public static PatentDocument patentDocumentFromXMLFile(File inputPath)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    InputStream iStream = null;

    iStream = new BufferedInputStream(new FileInputStream(inputPath));
    if (GZIP_PATTERN.matcher(inputPath.getName()).find()) {
      iStream = new GZIPInputStream(iStream);
    }
    return patentDocumentFromXMLStream(iStream);
  }

  /**
   * Converts a string of XML into a patent document object, extracting relevant fields from the patent XML.
   *
   * @param text The XML string to parse and extract.
   * @return A patent object if the XML can be read, or null otherwise.
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws TransformerConfigurationException
   * @throws TransformerException
   * @throws XPathExpressionException
   */
  public static PatentDocument patentDocumentFromXMLString(String text)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    StringReader stringReader = new StringReader(text);
    return patentDocumentFromXMLStream(new ReaderInputStream(stringReader));
  }

  public static PatentDocument patentDocumentFromXMLStream(InputStream iStream)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {

    // Create XPath objects for validating that this document is actually a patent.
    XPath xpath = Util.getXPathFactory().newXPath();
    XPathExpression versionXPath = xpath.compile(PATH_DTD_VERSION);
    XPathExpression versionXPathApp = xpath.compile(PATH_DTD_VERSION_APP);

    DocumentBuilderFactory docFactory = Util.mkDocBuilderFactory();
    DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
    Document doc = docBuilder.parse(iStream);

    Util.DocumentType docType = Util.identifyDocType(doc);
    if (docType != Util.DocumentType.PATENT && docType != Util.DocumentType.APPLICATION) {
      LOGGER.warn("Found unexpected document type: " + docType);
      return null;
    }

    boolean isApplication = docType == Util.DocumentType.APPLICATION;
    // Yes this is in fact the way suggested by the XPath API.
    String version;
    if (!isApplication) {
      version = (String) versionXPath.evaluate(doc, XPathConstants.STRING);
    } else {
      version = (String) versionXPathApp.evaluate(doc, XPathConstants.STRING);
    }

    if (version == null || !VERSION_MAP.containsKey(version)) {
      LOGGER.warn(String.format("Unrecognized patent DTD version: %s", version));
      return null;
    }

    HashMap<String, String> paths = VERSION_MAP.get(version);

    /* Create XPath objects for extracting the fields of interest based on the version information.
     * TODO: extract these into some sharable, thread-safe place, maybe via dependency injection.
     */
    XPathExpression idXPath = xpath.compile(paths.get(PATH_KEY_FILE_ID));
    XPathExpression dateXPath = xpath.compile(paths.get(PATH_KEY_DATE));
    XPathExpression titleXPath = xpath.compile(paths.get(PATH_KEY_TITLE));
    XPathExpression classificationXPath = xpath.compile(paths.get(PATH_KEY_MAIN_CLASSIFICATION));
    XPathExpression furtherClassificationsXPath = xpath.compile(paths.get(PATH_KEY_FURTHER_CLASSIFICATIONS));
    XPathExpression searchedClassificationsXPath = xpath.compile(paths.get(PATH_KEY_SEARCHED_CLASSIFICATIONS));

    String fileId = (String) idXPath.evaluate(doc, XPathConstants.STRING);
    String date = (String) dateXPath.evaluate(doc, XPathConstants.STRING);
    NodeList titleNodes = (NodeList) titleXPath.evaluate(doc, XPathConstants.NODESET);
    String title = StringUtils.join(" ", extractTextFromHTML(docBuilder, titleNodes));
    String classification = (String) classificationXPath.evaluate(doc, XPathConstants.STRING);
    NodeList furtherClassificationNodes =
        (NodeList) furtherClassificationsXPath.evaluate(doc, XPathConstants.NODESET);
    ArrayList<String> furtherClassifications = null;
    if (furtherClassificationNodes != null) {
      furtherClassifications = new ArrayList<>(furtherClassificationNodes.getLength());
      for (int i = 0; i < furtherClassificationNodes.getLength(); i++) {
        Node n = furtherClassificationNodes.item(i);
        String txt = n.getTextContent();
        if (txt != null) {
          furtherClassifications.add(i, txt);
        }
      }
    } else {
      furtherClassifications = new ArrayList<>(0);
    }

    NodeList otherClassificationNodes =
        (NodeList) searchedClassificationsXPath.evaluate(doc, XPathConstants.NODESET);
    ArrayList<String> otherClassifications = null;
    if (otherClassificationNodes != null) {
      otherClassifications = new ArrayList<>(otherClassificationNodes.getLength());
      for (int i = 0; i < otherClassificationNodes.getLength(); i++) {
        Node n = otherClassificationNodes.item(i);
        String txt = n.getTextContent();
        if (txt != null) {
          otherClassifications.add(i, txt);
        }
      }
    } else {
      otherClassifications = new ArrayList<>(0);
    }

    // Extract text content for salient document paths.
    List<String> allTextList = getRelevantDocumentText(docBuilder, PATHS_TEXT, xpath, doc);
    List<String> claimsTextList = getRelevantDocumentText(docBuilder, new String[]{PATH_CLAIMS}, xpath, doc);

    return new PatentDocument(fileId, date, title, classification,
        furtherClassifications, otherClassifications, allTextList, claimsTextList, isApplication);
  }

  @JsonProperty("file_id")
  protected String fileId;
  @JsonProperty("grant_date")
  protected String grantDate;
  @JsonProperty("title")
  protected String title;
  @JsonProperty("primary_classification")
  protected String mainClassification;
  @JsonProperty("further_classification")
  protected List<String> furtherClassifications;
  @JsonProperty("searched_classifications")
  protected List<String> searchedClassifications;
  @JsonProperty("text_content")
  protected List<String> textContent;
  @JsonProperty("claims")
  protected List<String> claimsText;
  @JsonProperty("isApplication")
  protected Boolean isApplication;

  // TODO: this could probably use a builder if it gets more complicated.

  protected PatentDocument(String fileId, String grantDate, String title, String mainClassification,
                           List<String> furtherClassifications, List<String> searchedClassifications,
                           List<String> textContent, List<String> claimsText, Boolean isApplication) {
    this.fileId = fileId;
    this.grantDate = grantDate;
    this.title = title;
    this.mainClassification = mainClassification;
    this.furtherClassifications = furtherClassifications;
    this.searchedClassifications = searchedClassifications;
    this.textContent = textContent;
    this.claimsText = claimsText;
    this.isApplication = isApplication;
  }

  public String getFileId() {
    return fileId;
  }

  public String getGrantDate() {
    return grantDate;
  }

  public String getTitle() {
    return title;
  }

  public String getMainClassification() {
    return mainClassification;
  }

  public List<String> getFurtherClassifications() {
    return furtherClassifications;
  }

  public List<String> getSearchedClassifications() {
    return searchedClassifications;
  }

  public List<String> getTextContent() {
    return textContent;
  }

  public List<String> getClaimsText() {
    return claimsText;
  }

  public Boolean getIsApplication() {
    return isApplication;
  }
}