/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ /** * @author Ariel Schwartz * @author Gaurav Bhalotia * * This is the Generic XML Parser, you need to extend this class to parse a * specific file conforming to a given DTD. * */ package org.erasmusmc.dataimport.Medline.xmlparsers; import java.io.File; import java.io.FileOutputStream; import java.io.PrintStream; import java.net.UnknownHostException; import java.sql.Connection; import java.sql.SQLException; import java.util.Stack; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.erasmusmc.dataimport.Medline.util.BioTextDBConnection; import org.xml.sax.ContentHandler; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXParseException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * The generic parser which extends the default handler provided by the SAX * library. This is an event based parsing, that does not require the * construction of a DOM tree in memory. Thus it is good for parsing large XML * files. */ public class GenericXMLParser extends DefaultHandler { /** Constants used for JAXP 1.2 */ static final String JAXP_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; // static final String JAXP_SCHEMA_LANGUAGE = // "http://localhost/xml/jaxp/properties/schemaLanguage"; static final String W3C_XML_SCHEMA = "http://www.w3.org/2001/XMLSchema"; // static final String W3C_XML_SCHEMA = "http://localhost/2001/XMLSchema"; static final String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; // static final String JAXP_SCHEMA_SOURCE = // "http://localhost/xml/jaxp/properties/schemaSource"; static protected Connection dbConnection; static XMLReader xmlReader; static Stack childHandlers = new Stack(); protected String currentElement; protected String xmlFileName = null; static String filename = null; static FileOutputStream outfile = null; /* If the parser needs to be validated */ static boolean parseValidate = false; /* If the output needs to go to a intermediate file */ static boolean toFile = false; /** * The default constructor. Initializes the data base connection * * @throws ClassNotFoundException * If the database driver class is not found * @throws SQLException * If there is a problem in connection to the database */ public GenericXMLParser() { xmlFileName = new File(filename).getName(); try { dbConnection = (new BioTextDBConnection()).getConnection(); } catch (Exception ex) { System.err.println(ex.getMessage()); System.exit(-1); } addChildHandler(this); } /** * Convert from a filename to a file URL. */ protected static String convertToFileURL(String filename) throws Exception { File file = new File(filename); String path = file.toURI().toString(); return path; } /** * Prints the correct usage for the code */ protected static void usage() { System.err.println("Usage: [-options] <file.xml>"); System.err.println("\t-dtd = DTD validation"); System.err.println("\t-validate = Parse validation (Checks that all tags are being handled)"); System.err.println("\t-file = Output to an intermediate file (Will be file.xml-insert.sql)"); System.err.println("\t-host = database hostname"); System.err.println("\t-dbname = database database name"); System.err.println("\t-user = database username"); System.err.println("\t-psswd = database password"); System.err.println("\t-xsd | -xsdss <file.xsd> = W3C XML Schema validation using xsi: hints"); System.err.println("\t\tin instance document or schema source <file.xsd>"); System.err.println("\t-xsdss <file> = W3C XML Schema validation using schema source <file>"); System.err.println("\t-usage or -help = this message"); System.exit(1); } static public void main(String[] args, Class parserClass) throws Exception { boolean dtdValidate = false; boolean xsdValidate = false; String schemaSource = null; /* Parse arguments to get the supplied options */ for (int i = 0; i < args.length; i++) { if (args[i].equals("-dtd")) { dtdValidate = true; } else if (args[i].equals("-xsd")) { xsdValidate = true; } else if (args[i].equals("-validate")) { parseValidate = true; } else if (args[i].equals("-file")) { toFile = true; } else if (args[i].equals("-xsdss")) { if (i == args.length - 1) { usage(); } xsdValidate = true; schemaSource = args[++i]; } else if (args[i].equals("-usage")) { usage(); } else if (args[i].equals("-help")) { usage(); } else { filename = args[i]; /* Must be last arg */ if (i != args.length - 1) { usage(); } } } if (filename == null) { usage(); } else { if (toFile == true) { /* If intermediate file chosen then open it to write */ outfile = new FileOutputStream(filename + "-insert.sql"); } } /* Create a JAXP SAXParserFactory and configure it */ SAXParserFactory spf = SAXParserFactory.newInstance(); /* * Set namespaceAware to true to get a parser that corresponds to the * default SAX2 namespace feature setting. This is necessary because the * default value from JAXP 1.0 was defined to be false. */ spf.setNamespaceAware(false); /* Validation part 1: set whether validation is on */ spf.setValidating(dtdValidate || xsdValidate); /* Create a JAXP SAXParser */ SAXParser saxParser = spf.newSAXParser(); /* Validation part 2a: set the schema language if necessary */ if (xsdValidate) { try { saxParser.setProperty(JAXP_SCHEMA_LANGUAGE, W3C_XML_SCHEMA); } catch (SAXNotRecognizedException x) { /* This can happen if the parser does not support JAXP 1.2 */ System.err.println("Error: JAXP SAXParser property not recognized: " + JAXP_SCHEMA_LANGUAGE); System.err.println("Check to see if parser conforms to JAXP 1.2 spec."); System.exit(1); } } /* * Validation part 2b: Set the schema source, if any. See the JAXP 1.2 * maintenance update specification for more complex usages of this feature. */ if (schemaSource != null) { saxParser.setProperty(JAXP_SCHEMA_SOURCE, new File(schemaSource)); } /* Get the encapsulated SAX XMLReader */ xmlReader = saxParser.getXMLReader(); /* Get an instance of the parser for the specific class */ GenericXMLParser parser = (GenericXMLParser) parserClass.newInstance(); /* Set the ContentHandler of the XMLReader */ xmlReader.setContentHandler(parser); /* Set an ErrorHandler before parsing */ xmlReader.setErrorHandler(new MyErrorHandler(System.err)); System.out.println("Going to parse the File " + filename); /* Tell the XMLReader to parse the XML document */ boolean succes = false; while (!succes) { try { xmlReader.parse(convertToFileURL(filename)); succes = true; } catch (UnknownHostException e){ System.err.println(e.getMessage()); System.err.println("Retrying in 30 seconds"); Thread.sleep(30000); } } } /** * Error handler to report errors and warnings */ private static class MyErrorHandler implements ErrorHandler { /* Error handler output goes here */ private PrintStream out; MyErrorHandler(PrintStream out) { this.out = out; } /** * Returns a string describing parse exception details */ private String getParseExceptionInfo(SAXParseException spe) { String systemId = spe.getSystemId(); if (systemId == null) { systemId = "null"; } String info = "URI=" + systemId + " Line=" + spe.getLineNumber() + ": " + spe.getMessage(); return info; } /** * The following methods are standard SAX ErrorHandler methods. See SAX * documentation for more info. */ @Override public void warning(SAXParseException spe) throws SAXException { out.println("Warning: " + getParseExceptionInfo(spe)); } @Override public void error(SAXParseException spe) throws SAXException { String message = "Error: " + getParseExceptionInfo(spe); throw new SAXException(message); } @Override public void fatalError(SAXParseException spe) throws SAXException { String message = "Fatal Error: " + getParseExceptionInfo(spe); throw new SAXException(message); } } /** * Stores the handler for the current node, it also sets the handler in the * XML reader. */ static public void addChildHandler(ContentHandler childHandler) { childHandlers.push(childHandler); xmlReader.setContentHandler(childHandler); } /** * Removes the current childhandler from the heap and sets the parent handler * as the current handler */ static public void removeChildHandler() { childHandlers.pop(); ContentHandler parentHandler = (ContentHandler) childHandlers.peek(); xmlReader.setContentHandler(parentHandler); } /** * Returns the current database connection */ static public Connection getDbConnection() { return dbConnection; } }