/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import org.apache.commons.codec.binary.Base64InputStream;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Static helpers for the patent extractor: document-type detection, XML (de)compression, DOM
 * conversion utilities, and Jackson (de)serializers that store a DOM {@link Document} as a
 * Base64-encoded, gzipped string.
 */
public class Util {
  /** Charset name used for every byte/character conversion in this class. */
  private static final String UTF8 = "UTF-8";

  /** Kinds of document that {@link #identifyDocType(Document)} can report. */
  public enum DocumentType {
    PATENT,
    APPLICATION,
    SEQUENCE,
    UNKNOWN,
  }

  /** Maps the name of a document's root element to the document type it denotes. */
  private static final Map<String, DocumentType> NODE_NAME_TO_DOC_TYPE =
      new HashMap<String, DocumentType>();

  static {
    NODE_NAME_TO_DOC_TYPE.put("us-patent-grant", DocumentType.PATENT);
    NODE_NAME_TO_DOC_TYPE.put("us-patent-application", DocumentType.APPLICATION);
    NODE_NAME_TO_DOC_TYPE.put("sequence-cwu", DocumentType.SEQUENCE);
  }

  // One factory instance per thread; presumably because the JAXP factories are not guaranteed to
  // be thread-safe (confirm against the JAXP documentation before sharing them across threads).
  private static final ThreadLocal<XPathFactory> XPATH_FACTORY = new ThreadLocal<XPathFactory>() {
    @Override
    protected XPathFactory initialValue() {
      return XPathFactory.newInstance();
    }
  };

  private static final ThreadLocal<TransformerFactory> TRANSFORMER_FACTORY =
      new ThreadLocal<TransformerFactory>() {
        @Override
        protected TransformerFactory initialValue() {
          return TransformerFactory.newInstance();
        }
      };

  /**
   * Returns the calling thread's {@link XPathFactory}, creating it on first use.
   *
   * @return the XPath factory bound to the current thread
   */
  public static XPathFactory getXPathFactory() {
    return XPATH_FACTORY.get();
  }

  /**
   * Returns the calling thread's {@link TransformerFactory}, creating it on first use.
   *
   * @return the transformer factory bound to the current thread
   */
  public static TransformerFactory getTransformerFactory() {
    return TRANSFORMER_FACTORY.get();
  }

  /**
   * Determines the type of a document by testing its root element against the known root element
   * names.
   *
   * @param dom the document to inspect
   * @return the matching {@link DocumentType}, or {@link DocumentType#UNKNOWN} if no known root
   *         element is present
   * @throws XPathExpressionException if an XPath query cannot be evaluated
   */
  public static DocumentType identifyDocType(Document dom) throws XPathExpressionException {
    XPath xpath = getXPathFactory().newXPath();
    for (Map.Entry<String, DocumentType> entry : NODE_NAME_TO_DOC_TYPE.entrySet()) {
      // "/" + name selects the root element only if it carries that name.
      Node top = (Node) xpath.evaluate("/" + entry.getKey(), dom, XPathConstants.NODE);
      if (top != null) {
        return entry.getValue();
      }
    }
    return DocumentType.UNKNOWN;
  }

  /**
   * Serializes a document to XML, gzips it, and Base64-encodes the result.
   *
   * @param doc the document to compress
   * @return the Base64 text (as bytes) of the gzipped XML serialization of {@code doc}
   * @throws IOException if writing to or closing the compression pipeline fails
   * @throws TransformerConfigurationException if a transformer cannot be created
   * @throws TransformerException if the document cannot be serialized
   */
  public static byte[] compressXMLDocument(Document doc) throws
      IOException, TransformerConfigurationException, TransformerException {
    // No output properties are set, so the transformer's default serialization settings apply.
    Transformer transformer = getTransformerFactory().newTransformer();

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // Characters -> UTF-8 bytes -> gzip -> Base64 (line length 0, i.e. unchunked) -> baos.
    // The charset is named explicitly so the bytes do not depend on the platform default encoding.
    Writer w = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(
        new Base64OutputStream(baos, true, 0, new byte[]{'\n'})), UTF8));
    try {
      transformer.transform(new DOMSource(doc), new StreamResult(w));
    } finally {
      // Closing flushes the gzip trailer and Base64 padding; it must also happen on failure so the
      // compression stream is released.
      w.close();
    }
    return baos.toByteArray();
  }

  /**
   * Reverses {@link #compressXMLDocument(Document)}: Base64-decodes and gunzips the input, then
   * parses the resulting XML.
   *
   * <p>The transformer exceptions in the signature are not raised by the current implementation;
   * they are retained so that existing callers which catch them keep compiling.
   *
   * @param bytes Base64 text (as bytes) of a gzipped XML document
   * @return the parsed document
   * @throws IOException if the input cannot be decoded, decompressed, or read
   * @throws ParserConfigurationException if the XML parser cannot be configured
   * @throws SAXException if the decompressed content is not parseable XML
   * @throws TransformerConfigurationException declared for backward compatibility
   * @throws TransformerException declared for backward compatibility
   */
  public static Document decompressXMLDocument(byte[] bytes) throws
      IOException, ParserConfigurationException, SAXException,
      TransformerConfigurationException, TransformerException {
    // With help from http://stackoverflow.com/questions/309424/read-convert-an-inputstream-to-a-string
    InputStream s = new GZIPInputStream(new Base64InputStream(new ByteArrayInputStream(bytes)));
    try {
      DocumentBuilder documentBuilder = mkDocBuilderFactory().newDocumentBuilder();
      return documentBuilder.parse(s);
    } finally {
      // Release the decompression stream whether or not parsing succeeded.
      s.close();
    }
  }

  /**
   * Creates a non-validating {@link DocumentBuilderFactory} that does not load DTD grammars or
   * external DTDs.
   *
   * @return a newly configured factory
   * @throws ParserConfigurationException if the parser implementation rejects one of the features
   */
  public static DocumentBuilderFactory mkDocBuilderFactory() throws ParserConfigurationException {
    /* Note that the factory must be configured within the context of a method call for exception
     * handling.  TODO: can we work around this w/ dependency injection? */
    // With help from http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
    DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
    docFactory.setValidating(false);
    // NOTE(review): namespace awareness is switched on here and the SAX "namespaces" feature is
    // switched off on the next line; which setting wins appears to be parser-dependent -- confirm
    // the intended behavior before changing either line.
    docFactory.setNamespaceAware(true);
    docFactory.setFeature("http://xml.org/sax/features/namespaces", false);
    docFactory.setFeature("http://xml.org/sax/features/validation", false);
    docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    docFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    return docFactory;
  }

  /**
   * Serializes a document to an XML string using the transformer's default settings.
   *
   * <p>{@code ParserConfigurationException} is not raised by the current implementation; it is
   * retained in the signature so that existing callers which catch it keep compiling.
   *
   * @param doc the document to serialize
   * @return the XML text of {@code doc}
   * @throws ParserConfigurationException declared for backward compatibility
   * @throws TransformerConfigurationException if a transformer cannot be created
   * @throws TransformerException if the document cannot be serialized
   */
  public static String documentToString(Document doc) throws ParserConfigurationException,
      TransformerConfigurationException, TransformerException {
    StringWriter stringWriter = new StringWriter();
    Transformer transformer = getTransformerFactory().newTransformer();
    transformer.transform(new DOMSource(doc), new StreamResult(stringWriter));
    return stringWriter.toString();
  }

  /**
   * Builds a new document whose root element is named {@code documentContainer} and whose only
   * child is a deep copy of {@code n}.
   *
   * @param docBuilder the builder used to create the new, empty document
   * @param documentContainer the name of the root element to create
   * @param n the node to deep-copy into the new document
   * @return the newly created document
   */
  public static Document nodeToDocument(DocumentBuilder docBuilder, String documentContainer, Node n) {
    /* With help from:
     * http://examples.javacodegeeks.com/core-java/xml/dom/copy-nodes-subtree-from-one-dom-document-to-another/ */
    Document newDoc = docBuilder.newDocument();
    Element rootElement = newDoc.createElement(documentContainer);
    // importNode(..., true) copies the whole subtree so it can be attached to the new document.
    Node newNode = newDoc.importNode(n, true);
    rootElement.appendChild(newNode);
    newDoc.appendChild(rootElement);
    return newDoc;
  }

  /** Jackson serializer that writes a {@link Document} as its compressed, Base64-encoded form. */
  public static class DocumentSerializer extends JsonSerializer<Document> {
    /**
     * Writes {@code document} as a single JSON string holding the output of
     * {@link Util#compressXMLDocument(Document)}.
     *
     * @throws IOException if compression or JSON output fails; a {@link TransformerException} is
     *         wrapped as the cause
     */
    @Override
    public void serialize(Document document, JsonGenerator jsonGenerator,
                          SerializerProvider serializerProvider)
        throws IOException, JsonProcessingException {
      byte[] compressedDoc;
      try {
        compressedDoc = compressXMLDocument(document);
      } catch (TransformerException e) {
        throw new IOException("Caught TransformerException when compressing document", e);
      }
      jsonGenerator.writeString(new String(compressedDoc, UTF8));
    }
  }

  /** Jackson deserializer that reads a {@link Document} from its compressed, Base64-encoded form. */
  public static class DocumentDeserializer extends JsonDeserializer<Document> {
    /**
     * Reads the current JSON text value and decodes it with
     * {@link Util#decompressXMLDocument(byte[])}.
     *
     * @return the parsed document
     * @throws IOException if decoding, decompression, or parsing fails; parser and transformer
     *         exceptions are wrapped as the cause
     */
    @Override
    public Document deserialize(JsonParser jsonParser,
                                DeserializationContext deserializationContext)
        throws IOException, JsonProcessingException {
      byte[] compressedDoc = jsonParser.getText().getBytes(UTF8);
      try {
        return decompressXMLDocument(compressedDoc);
      } catch (ParserConfigurationException e) {
        throw new IOException("Caught ParserConfigurationException when decompressing document", e);
      } catch (SAXException e) {
        throw new IOException("Caught SAXException when decompressing document", e);
      } catch (TransformerConfigurationException e) {
        throw new IOException(
            "Caught TransformerConfigurationException when decompressing document", e);
      } catch (TransformerException e) {
        throw new IOException("Caught TransformerException when decompressing document", e);
      }
    }
  }
}