/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.patentExtractor;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import org.apache.commons.codec.binary.Base64InputStream;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
 * Static helpers for handling patent XML documents: identifying the kind of document from its root
 * element, converting DOM documents to and from a compact (gzip + base64) byte representation, and
 * Jackson (de)serializers built on that representation.
 */
public class Util {
  /** The kinds of document that {@link #identifyDocType(Document)} can report. */
  public enum DocumentType {
    PATENT,
    APPLICATION,
    SEQUENCE,
    UNKNOWN,
  }

  /** Charset name used for every char/byte conversion in this class. */
  private static final String UTF_8 = "UTF-8";

  /** Maps the name of a document's root element to the document type it denotes. */
  private static final Map<String, DocumentType> NODE_NAME_TO_DOC_TYPE = new HashMap<String, DocumentType>();

  static {
    NODE_NAME_TO_DOC_TYPE.put("us-patent-grant", DocumentType.PATENT);
    NODE_NAME_TO_DOC_TYPE.put("us-patent-application", DocumentType.APPLICATION);
    NODE_NAME_TO_DOC_TYPE.put("sequence-cwu", DocumentType.SEQUENCE);
  }

  // One factory instance per thread -- presumably because the JAXP factories are not guaranteed to be thread-safe.
  private static final ThreadLocal<XPathFactory> XPATH_FACTORY = new ThreadLocal<XPathFactory>() {
    @Override
    protected XPathFactory initialValue() {
      return XPathFactory.newInstance();
    }
  };

  private static final ThreadLocal<TransformerFactory> TRANSFORMER_FACTORY = new ThreadLocal<TransformerFactory>() {
    @Override
    protected TransformerFactory initialValue() {
      return TransformerFactory.newInstance();
    }
  };

  /**
   * @return the calling thread's {@link XPathFactory}, created on first use.
   */
  public static XPathFactory getXPathFactory() {
    return XPATH_FACTORY.get();
  }

  /**
   * @return the calling thread's {@link TransformerFactory}, created on first use.
   */
  public static TransformerFactory getTransformerFactory() {
    return TRANSFORMER_FACTORY.get();
  }

  /**
   * Determines the type of a document by probing for each known root element name.
   *
   * @param dom the document to inspect.
   * @return the type associated with the document's root element, or {@link DocumentType#UNKNOWN} if the root
   *         matches none of the known names.
   * @throws XPathExpressionException if a root-element XPath query cannot be evaluated.
   */
  public static DocumentType identifyDocType(Document dom) throws XPathExpressionException {
    XPath xpath = getXPathFactory().newXPath();
    for (Map.Entry<String, DocumentType> entry : NODE_NAME_TO_DOC_TYPE.entrySet()) {
      // An absolute path of "/name" only matches when "name" is the document's root element.
      Node top = (Node) xpath.evaluate("/" + entry.getKey(), dom, XPathConstants.NODE);
      if (top != null) {
        return entry.getValue();
      }
    }
    return DocumentType.UNKNOWN;
  }

  /**
   * Serializes a document to XML text, gzips it, and base64-encodes the result.
   *
   * @param doc the document to compress.
   * @return the base64 encoding of the gzipped, UTF-8 encoded XML; readable by
   *         {@link #decompressXMLDocument(byte[])}.
   * @throws IOException if writing to the compression/encoding streams fails.
   * @throws TransformerConfigurationException if a transformer cannot be created.
   * @throws TransformerException if the document cannot be serialized.
   */
  public static byte[] compressXMLDocument(Document doc) throws
      IOException, TransformerConfigurationException, TransformerException {
    Transformer transformer = getTransformerFactory().newTransformer();
    DOMSource w3DomSource = new DOMSource(doc);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    /* Data flows writer -> gzip -> base64 -> baos.  The charset is given explicitly so that the bytes do not
     * depend on the platform default: decompression hands the raw bytes to the XML parser rather than going
     * through a Reader, so they must be in an encoding the parser can determine on its own. */
    Writer w = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(
        new Base64OutputStream(baos, true, 0, new byte[]{'\n'})), UTF_8));
    try {
      transformer.transform(w3DomSource, new StreamResult(w));
    } finally {
      // Closing also flushes the gzip trailer and final base64 block, so it must precede baos.toByteArray().
      w.close();
    }
    return baos.toByteArray();
  }

  /**
   * Reverses {@link #compressXMLDocument(Document)}: base64-decodes and gunzips the bytes, then parses the
   * resulting XML.
   *
   * @param bytes base64-encoded, gzipped XML.
   * @return the parsed document.
   * @throws IOException if the bytes cannot be decoded or decompressed.
   * @throws ParserConfigurationException if a document builder cannot be created.
   * @throws SAXException if the decompressed content is not parseable XML.
   * @throws TransformerConfigurationException declared for compatibility with existing callers.
   * @throws TransformerException declared for compatibility with existing callers.
   */
  public static Document decompressXMLDocument(byte[] bytes) throws
      IOException, ParserConfigurationException, SAXException,
      TransformerConfigurationException, TransformerException {
    // With help from http://stackoverflow.com/questions/309424/read-convert-an-inputstream-to-a-string
    InputStream s = new GZIPInputStream(new Base64InputStream(new ByteArrayInputStream(bytes)));
    try {
      DocumentBuilder documentBuilder = mkDocBuilderFactory().newDocumentBuilder();
      return documentBuilder.parse(s);
    } finally {
      // Release the stream even when builder creation or parsing fails.
      s.close();
    }
  }

  /**
   * Creates a non-validating {@link DocumentBuilderFactory} that does not load DTD grammars or external DTDs.
   *
   * @return a newly configured factory.
   * @throws ParserConfigurationException if the underlying parser does not support one of the requested features.
   */
  public static DocumentBuilderFactory mkDocBuilderFactory() throws ParserConfigurationException {
    /* Try to load the document.  Note that the factory must be configured within the context of a method call
     * for exception handling.  TODO: can we work around this w/ dependency injection? */
    // With help from http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
    DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
    docFactory.setValidating(false);
    // NOTE(review): namespace awareness is switched on here but the SAX "namespaces" feature is switched off on
    // the next line; which setting takes effect looks implementation-dependent -- confirm before changing either.
    docFactory.setNamespaceAware(true);
    docFactory.setFeature("http://xml.org/sax/features/namespaces", false);
    docFactory.setFeature("http://xml.org/sax/features/validation", false);
    docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    return docFactory;
  }

  /**
   * Renders a document as XML text using a default-configured transformer.
   *
   * @param doc the document to render.
   * @return the document's XML as a string.
   * @throws ParserConfigurationException declared for compatibility with existing callers.
   * @throws TransformerConfigurationException if a transformer cannot be created.
   * @throws TransformerException if the document cannot be serialized.
   */
  public static String documentToString(Document doc)
      throws ParserConfigurationException, TransformerConfigurationException, TransformerException {
    StringWriter stringWriter = new StringWriter();
    DOMSource source = new DOMSource(doc);
    StreamResult result = new StreamResult(stringWriter);
    Transformer transformer = getTransformerFactory().newTransformer();
    transformer.transform(source, result);
    return stringWriter.toString();
  }

  /**
   * Builds a new document whose root element wraps a deep copy of the given node.
   *
   * @param docBuilder the builder used to create the new, empty document.
   * @param documentContainer the name of the root element to create in the new document.
   * @param n the node to copy (with its whole subtree) under the new root.
   * @return the new document; {@code n} and its owner document are left untouched.
   */
  public static Document nodeToDocument(DocumentBuilder docBuilder, String documentContainer, Node n) {
    /* With help from:
     * http://examples.javacodegeeks.com/core-java/xml/dom/copy-nodes-subtree-from-one-dom-document-to-another/ */
    Document newDoc = docBuilder.newDocument();
    Element rootElement = newDoc.createElement(documentContainer);
    // importNode(..., true) deep-copies the subtree into newDoc so that it can be attached there.
    Node newNode = newDoc.importNode(n, true);
    rootElement.appendChild(newNode);
    newDoc.appendChild(rootElement);
    return newDoc;
  }

  /** Jackson serializer that writes a document as a JSON string holding its compressed, base64 form. */
  public static class DocumentSerializer extends JsonSerializer<Document> {
    @Override
    public void serialize(Document document, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
        throws IOException, JsonProcessingException {
      byte[] compressedDoc;
      try {
        compressedDoc = compressXMLDocument(document);
      } catch (TransformerException e) {
        // Also covers TransformerConfigurationException, which is a subclass.
        throw new IOException("Caught TransformerException when compressing document", e);
      }
      jsonGenerator.writeString(new String(compressedDoc, UTF_8));
    }
  }

  /** Jackson deserializer that reads a document back from the string written by {@link DocumentSerializer}. */
  public static class DocumentDeserializer extends JsonDeserializer<Document> {
    @Override
    public Document deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
        throws IOException, JsonProcessingException {
      byte[] compressedDoc = jsonParser.getText().getBytes(UTF_8);
      Document doc;
      // Checked XML exceptions are wrapped in IOException, the only checked type this method may throw.
      try {
        doc = decompressXMLDocument(compressedDoc);
      } catch (ParserConfigurationException e) {
        throw new IOException("Caught ParserConfigurationException when decompressing document", e);
      } catch (SAXException e) {
        throw new IOException("Caught SAXException when decompressing document", e);
      } catch (TransformerConfigurationException e) {
        throw new IOException("Caught TransformerConfigurationException when decompressing document", e);
      } catch (TransformerException e) {
        throw new IOException("Caught TransformerException when decompressing document", e);
      }
      return doc;
    }
  }
}