/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.patentExtractor;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
* This class represents parts of a USPTO patent document that are relevant to 20n's use cases. It can extract
* information from the USPTO's XML documents and convert it to a POJO that can then be serialized as JSON. Use this
* as the basis for any processing of patent text.
*/
public class PatentDocument {
  public static final Logger LOGGER = LogManager.getLogger(PatentDocument.class);

  /* DTD version strings identify which USPTO schema produced a given document; grants and applications use
   * separate (but structurally similar) DTDs. See http://www.uspto.gov/learning-and-resources/xml-resources. */
  public static final String DTD2014 = "v4.5 2014-04-03";
  public static final String DTD2013 = "v4.4 2013-05-16";
  public static final String DTD2012 = "v4.3 2012-12-04";
  public static final String DTD2006 = "v4.2 2006-08-23";
  public static final String DTD2005 = "v4.1 2005-08-25";
  public static final String DTD2004 = "v40 2004-12-02";
  public static final String DTD2014_APP = "v4.4 2014-04-03";
  public static final String DTD2012_APP = "v4.3 2012-12-04";
  public static final String DTD2006_APP = "v4.2 2006-08-23";
  public static final String DTD2005_APP = "v4.1 2005-08-25";
  public static final String DTD2004_APP = "v4.0 2004-12-02";

  // XPath expressions that read the DTD version attribute from grants and applications respectively.
  public static final String PATH_DTD_VERSION = "/us-patent-grant/@dtd-version";
  public static final String PATH_DTD_VERSION_APP = "/us-patent-application/@dtd-version";

  // XPath expressions for the free-text regions of the document; claims are extracted separately.
  public static final String[] PATHS_TEXT = {
      "//description",
      "//invention-title",
      "//abstract",
  };
  public static final String PATH_CLAIMS = "//claims";

  // Keys into the per-DTD-version XPath maps below.
  public static final String
      PATH_KEY_FILE_ID = "fileId",
      PATH_KEY_TITLE = "title",
      PATH_KEY_DATE = "date",
      PATH_KEY_MAIN_CLASSIFICATION = "classification",
      PATH_KEY_FURTHER_CLASSIFICATIONS = "further_classifications",
      PATH_KEY_SEARCHED_CLASSIFICATIONS = "referenced_classifications";

  // TODO: is there a type-safe way of building an object from XPath with a map of functions?
  public static final HashMap<String, String> PATHS_2013 = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-grant/@file");
    put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title");
    put(PATH_KEY_DATE, "/us-patent-grant/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification");
  }};

  public static final HashMap<String, String> PATHS_2004 = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-grant/@file");
    put(PATH_KEY_TITLE, "/us-patent-grant/us-bibliographic-data-grant/invention-title");
    put(PATH_KEY_DATE, "/us-patent-grant/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS,
        "/us-patent-grant/us-bibliographic-data-grant/field-of-search/classification-national[./country/text()='US']/main-classification");
  }};

  public static final HashMap<String, String> PATHS_2014_APP = new HashMap<String, String>() {{
    put(PATH_KEY_FILE_ID, "/us-patent-application/@file");
    put(PATH_KEY_TITLE, "/us-patent-application/us-bibliographic-data-application/invention-title");
    put(PATH_KEY_DATE, "/us-patent-application/@date-publ");
    put(PATH_KEY_MAIN_CLASSIFICATION,
        "/us-patent-application/us-bibliographic-data-application/classification-national/main-classification/text()");
    put(PATH_KEY_FURTHER_CLASSIFICATIONS,
        "/us-patent-application/us-bibliographic-data-application/classification-national/further-classification");
    put(PATH_KEY_SEARCHED_CLASSIFICATIONS, // Note: doesn't exist, but left for ease of use.
        "/us-patent-application/us-bibliographic-data-application/us-field-of-classification-search/classification-national[./country/text()='US']/main-classification");
  }};

  // Maps each recognized DTD version string to the XPath set that works for it.
  public static final HashMap<String, HashMap<String, String>> VERSION_MAP =
      new HashMap<String, HashMap<String, String>>() {{
        put(DTD2014, PATHS_2013); // All the 2013 paths work with the 2014 DTD.
        put(DTD2013, PATHS_2013);
        put(DTD2012, PATHS_2013); // All the 2013 paths work with the 2012 DTD.
        put(DTD2006, PATHS_2013); // All the 2013 paths work with the 2006 DTD.
        put(DTD2005, PATHS_2013); // All the 2013 paths work with the 2005 DTD.
        put(DTD2004, PATHS_2004);
        put(DTD2014_APP, PATHS_2014_APP);
        put(DTD2012_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2012 app DTD.
        put(DTD2006_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2006 app DTD.
        put(DTD2005_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD, though the classifications might be different.
        put(DTD2004_APP, PATHS_2014_APP); // All the 2014 app paths work with the 2005 app DTD assuming searched classifications are always empty.
      }};

  // Matches a trailing ".gz" in a filename, signalling gzip-compressed input.
  private static final Pattern GZIP_PATTERN = Pattern.compile("\\.gz$");

  /**
   * A jsoup visitor that accumulates the text of an HTML document into a list of segments, splitting on
   * block-level elements (paragraphs, headers, table rows, claims, etc.).
   */
  public static class HtmlVisitor implements NodeVisitor {
    // Based on https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java
    private static final HashSet<String> SEGMENTING_NODES = new HashSet<String>() {{
      addAll(Arrays.asList(
          "p", "h1", "h2", "h3", "h4", "h5", "h6", "dt", "dd", "tr", "li", "body", "div", // HTML entities
          "row", "claim" // patent-specific entities
      ));
    }};
    private static final Pattern SPACE_PATTERN = Pattern.compile("^\\s+$");

    // Accumulates text until a segmenting node closes; completed segments move to textSegments.
    private StringBuilder segmentBuilder = new StringBuilder();
    private List<String> textSegments = new LinkedList<>();

    @Override
    public void head(org.jsoup.nodes.Node node, int i) {
      // This borrows a page from HtmlToPlainText's book: collect raw text as we descend.
      if (node instanceof TextNode) {
        String text = ((TextNode) node).text();
        if (text != null && text.length() > 0) {
          segmentBuilder.append(text);
        }
      }
    }

    @Override
    public void tail(org.jsoup.nodes.Node node, int i) {
      String nodeName = node.nodeName();
      if (nodeName.equals("a")) {
        // Same as Jsoup's HtmlToPlainText: render links as " <url>".
        segmentBuilder.append(String.format(" <%s>", node.absUrl("href")));
      } else if (SEGMENTING_NODES.contains(nodeName) && segmentBuilder.length() > 0) {
        String segmentText = segmentBuilder.toString();
        // Ignore blank lines, as we'll be tagging each line separately.
        if (!SPACE_PATTERN.matcher(segmentText).matches()) {
          this.textSegments.add(segmentText);
        }
        // TODO: is it better to drop the old one than clear the existing?
        segmentBuilder.setLength(0);
      }
    }

    /** @return The accumulated text segments, in document order. */
    public List<String> getTextContent() {
      return this.textSegments;
    }
  }

  /**
   * Extracts plain-text segments from a list of XML nodes whose contents may mix XML and HTML markup.
   *
   * @param docBuilder A document builder used to wrap each node in an intermediate document.
   * @param textNodes The nodes whose textual content should be extracted; may be null.
   * @return A list of text segments from all nodes, in order; empty if textNodes is null.
   */
  private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes)
      throws ParserConfigurationException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    List<String> allTextList = new ArrayList<>(0);
    if (textNodes != null) {
      for (int i = 0; i < textNodes.getLength(); i++) {
        Node n = textNodes.item(i);
        /* This extremely around-the-horn approach to handling text content is due to the mix of HTML and
         * XML in the patent body. We use Jsoup to parse the HTML entities we find in the body, and use
         * its extremely convenient NodeVisitor API to recursively traverse the document and extract the
         * text content in reasonable chunks.
         */
        Document contentsDoc = Util.nodeToDocument(docBuilder, "body", n);
        String docText = Util.documentToString(contentsDoc);
        // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java
        org.jsoup.nodes.Document htmlDoc = Jsoup.parse(docText);
        HtmlVisitor visitor = new HtmlVisitor();
        NodeTraversor traversor = new NodeTraversor(visitor);
        traversor.traverse(htmlDoc);
        allTextList.addAll(visitor.getTextContent());
      }
    }
    return allTextList;
  }

  /**
   * Extracts the text content from text fields in a patent XML document.
   *
   * @param docBuilder A document builder to use when constructing intermediate XML/HTML documents in the extraction
   *                   process.
   * @param paths A list of XPath paths from which to extract text.
   * @param xpath An XPath instance to use when running XPath queries.
   * @param doc The XML document from which to extract text.
   * @return A list of strings representing the textual content of the document. These could be sentences,
   * paragraphs, or larger text units, but should represent some sort of structure in the document's text.
   * @throws ParserConfigurationException
   * @throws TransformerConfigurationException
   * @throws TransformerException
   * @throws XPathExpressionException
   */
  private static List<String> getRelevantDocumentText(DocumentBuilder docBuilder, String[] paths,
                                                      XPath xpath, Document doc)
      throws ParserConfigurationException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    List<String> allTextList = new ArrayList<>(0);
    for (String path : paths) {
      XPathExpression exp = xpath.compile(path);
      NodeList textNodes = (NodeList) exp.evaluate(doc, XPathConstants.NODESET);
      allTextList.addAll(extractTextFromHTML(docBuilder, textNodes));
    }
    return allTextList;
  }

  /**
   * Collects the non-null text content of every node in a node list.
   *
   * @param nodes The nodes whose text content to collect; may be null.
   * @return The text content of each node that has any, in order; empty if nodes is null.
   */
  private static ArrayList<String> nodeListTextContents(NodeList nodes) {
    if (nodes == null) {
      return new ArrayList<>(0);
    }
    ArrayList<String> texts = new ArrayList<>(nodes.getLength());
    for (int i = 0; i < nodes.getLength(); i++) {
      String txt = nodes.item(i).getTextContent();
      if (txt != null) {
        /* Bug fix: the original called add(i, txt), which throws IndexOutOfBoundsException as soon as a
         * null-text node is skipped (the list index falls behind the node index). Append instead. */
        texts.add(txt);
      }
    }
    return texts;
  }

  /**
   * Converts an XML file into a patent document object, extracting relevant fields from the patent XML.
   * Files whose names end in ".gz" are transparently decompressed.
   *
   * @param inputPath A path to the file to be read.
   * @return A patent object if the XML can be read, or null otherwise.
   * @throws IOException Thrown on file I/O errors.
   * @throws ParserConfigurationException Thrown when the XML parser cannot be configured correctly.
   * @throws SAXException Thrown on XML parser errors.
   * @throws XPathExpressionException Thrown when XPath fails to handle queries against the specified document.
   */
  // TODO: logging?
  // TODO: are @nullable and @non-null annotations still a thing?
  // TODO: prolly belongs in a factory.
  public static PatentDocument patentDocumentFromXMLFile(File inputPath)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    /* try-with-resources ensures the file handle is released even if parsing fails; the original
     * opened the stream and never closed it. Closing iStream also closes the wrapped fileStream. */
    try (InputStream fileStream = new BufferedInputStream(new FileInputStream(inputPath));
         InputStream iStream = GZIP_PATTERN.matcher(inputPath.getName()).find()
             ? new GZIPInputStream(fileStream)
             : fileStream) {
      return patentDocumentFromXMLStream(iStream);
    }
  }

  /**
   * Converts a string of XML into a patent document object, extracting relevant fields from the patent XML.
   *
   * @param text The XML string to parse and extract.
   * @return A patent object if the XML can be read, or null otherwise.
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws TransformerConfigurationException
   * @throws TransformerException
   * @throws XPathExpressionException
   */
  public static PatentDocument patentDocumentFromXMLString(String text)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    StringReader stringReader = new StringReader(text);
    /* Specify the charset explicitly: the single-argument ReaderInputStream constructor uses the platform
     * default encoding, which can garble non-ASCII patent text on misconfigured hosts. */
    return patentDocumentFromXMLStream(new ReaderInputStream(stringReader, StandardCharsets.UTF_8));
  }

  /**
   * Converts a stream of XML into a patent document object, extracting relevant fields from the patent XML.
   * The stream is fully consumed but not closed.
   *
   * @param iStream The stream of XML to parse and extract.
   * @return A patent object if the XML represents a recognized patent grant or application, or null otherwise.
   */
  public static PatentDocument patentDocumentFromXMLStream(InputStream iStream)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    // Create XPath objects for validating that this document is actually a patent.
    XPath xpath = Util.getXPathFactory().newXPath();
    XPathExpression versionXPath = xpath.compile(PATH_DTD_VERSION);
    XPathExpression versionXPathApp = xpath.compile(PATH_DTD_VERSION_APP);

    DocumentBuilderFactory docFactory = Util.mkDocBuilderFactory();
    DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
    Document doc = docBuilder.parse(iStream);

    Util.DocumentType docType = Util.identifyDocType(doc);
    if (docType != Util.DocumentType.PATENT && docType != Util.DocumentType.APPLICATION) {
      LOGGER.warn("Found unexpected document type: " + docType);
      return null;
    }
    boolean isApplication = docType == Util.DocumentType.APPLICATION;

    // Yes this is in fact the way suggested by the XPath API.
    String version;
    if (!isApplication) {
      version = (String) versionXPath.evaluate(doc, XPathConstants.STRING);
    } else {
      version = (String) versionXPathApp.evaluate(doc, XPathConstants.STRING);
    }
    if (version == null || !VERSION_MAP.containsKey(version)) {
      LOGGER.warn(String.format("Unrecognized patent DTD version: %s", version));
      return null;
    }
    HashMap<String, String> paths = VERSION_MAP.get(version);

    /* Create XPath objects for extracting the fields of interest based on the version information.
     * TODO: extract these into some sharable, thread-safe place, maybe via dependency injection.
     */
    XPathExpression idXPath = xpath.compile(paths.get(PATH_KEY_FILE_ID));
    XPathExpression dateXPath = xpath.compile(paths.get(PATH_KEY_DATE));
    XPathExpression titleXPath = xpath.compile(paths.get(PATH_KEY_TITLE));
    XPathExpression classificationXPath = xpath.compile(paths.get(PATH_KEY_MAIN_CLASSIFICATION));
    XPathExpression furtherClassificationsXPath = xpath.compile(paths.get(PATH_KEY_FURTHER_CLASSIFICATIONS));
    XPathExpression searchedClassificationsXPath = xpath.compile(paths.get(PATH_KEY_SEARCHED_CLASSIFICATIONS));

    String fileId = (String) idXPath.evaluate(doc, XPathConstants.STRING);
    String date = (String) dateXPath.evaluate(doc, XPathConstants.STRING);
    NodeList titleNodes = (NodeList) titleXPath.evaluate(doc, XPathConstants.NODESET);
    /* Bug fix: the original called StringUtils.join(" ", segments), which resolves to the varargs
     * join(T...) overload and produces " " + segments.toString() rather than a space-joined title. */
    String title = StringUtils.join(extractTextFromHTML(docBuilder, titleNodes), " ");
    String classification = (String) classificationXPath.evaluate(doc, XPathConstants.STRING);

    NodeList furtherClassificationNodes =
        (NodeList) furtherClassificationsXPath.evaluate(doc, XPathConstants.NODESET);
    ArrayList<String> furtherClassifications = nodeListTextContents(furtherClassificationNodes);

    NodeList otherClassificationNodes =
        (NodeList) searchedClassificationsXPath.evaluate(doc, XPathConstants.NODESET);
    ArrayList<String> otherClassifications = nodeListTextContents(otherClassificationNodes);

    // Extract text content for salient document paths.
    List<String> allTextList = getRelevantDocumentText(docBuilder, PATHS_TEXT, xpath, doc);
    List<String> claimsTextList = getRelevantDocumentText(docBuilder, new String[]{PATH_CLAIMS}, xpath, doc);

    return new PatentDocument(fileId, date, title, classification,
        furtherClassifications, otherClassifications, allTextList, claimsTextList, isApplication);
  }

  @JsonProperty("file_id")
  protected String fileId;

  @JsonProperty("grant_date")
  protected String grantDate;

  @JsonProperty("title")
  protected String title;

  @JsonProperty("primary_classification")
  protected String mainClassification;

  @JsonProperty("further_classification")
  protected List<String> furtherClassifications;

  @JsonProperty("searched_classifications")
  protected List<String> searchedClassifications;

  @JsonProperty("text_content")
  protected List<String> textContent;

  @JsonProperty("claims")
  protected List<String> claimsText;

  @JsonProperty("isApplication")
  protected Boolean isApplication;

  // TODO: this could probably use a builder if it gets more complicated.
  protected PatentDocument(String fileId, String grantDate, String title, String mainClassification,
                           List<String> furtherClassifications, List<String> searchedClassifications,
                           List<String> textContent, List<String> claimsText, Boolean isApplication) {
    this.fileId = fileId;
    this.grantDate = grantDate;
    this.title = title;
    this.mainClassification = mainClassification;
    this.furtherClassifications = furtherClassifications;
    this.searchedClassifications = searchedClassifications;
    this.textContent = textContent;
    this.claimsText = claimsText;
    this.isApplication = isApplication;
  }

  public String getFileId() {
    return fileId;
  }

  public String getGrantDate() {
    return grantDate;
  }

  public String getTitle() {
    return title;
  }

  public String getMainClassification() {
    return mainClassification;
  }

  public List<String> getFurtherClassifications() {
    return furtherClassifications;
  }

  public List<String> getSearchedClassifications() {
    return searchedClassifications;
  }

  public List<String> getTextContent() {
    return textContent;
  }

  public List<String> getClaimsText() {
    return claimsText;
  }

  public Boolean getIsApplication() {
    return isApplication;
  }
}