Parser.java example

Explorer
geotools-old-master
/*
 *    GeoTools - The Open Source Java GIS Toolkit
 *    http://geotools.org
 *
 *    (C) 2002-2008, Open Source Geospatial Foundation (OSGeo)
 *
 *    This library is free software; you can redistribute it and/or
 *    modify it under the terms of the GNU Lesser General Public
 *    License as published by the Free Software Foundation;
 *    version 2.1 of the License.
 *
 *    This library is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    Lesser General Public License for more details.
 */
package org.geotools.xml;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.List;

import javax.xml.namespace.QName;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.apache.xerces.parsers.SAXParser;
import org.eclipse.xsd.XSDSchema;
import org.geotools.xml.impl.ParserHandler;
import org.geotools.xs.XS;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.NamespaceSupport;


/**
 * GeoTools XML parser.
 * <p>
 * This parser uses a sax based driver to parse an input stream into a single object. For streaming
 * look at {@link StreamingParser}. If the source document being parsed has already been parsed into
 * a {@link Document} the {@link DOMParser} class may be used.
 * </p>
 * <p>
 * <h3>Schema Resolution</h3>
 * See {@link org.geotools.xml.Configuration} javadocs for instructions on how
 * to customize schema resolution. This is often desirable in the case that
 * the instance document being parsed contains invalid uri's in schema imports
 * and includes.
 * </p>
 * @author Justin Deoliveira, The Open Planning Project
 *
 *
 * @source $URL$
 */
public class Parser {
    /** sax handler which maintains the element stack; assigned once by the primary constructor */
    private final ParserHandler handler;

    /** the sax parser used by the most recent call to {@link #parse(InputSource)} */
    private SAXParser parser;

    /** the instance document being parsed, only set by the deprecated stream constructors */
    private InputStream input;

    /**
     * Creates a new instance of the parser.
     *
     * @param configuration The parser configuration, bindings and context,
     *         must never be <code>null</code>.
     *
     * @throws NullPointerException If <code>configuration</code> is <code>null</code>.
     */
    public Parser(Configuration configuration) {
        handler = new ParserHandler(requireConfiguration(configuration));
        configuration.setupParser(this);
    }

    /**
     * Creates a new instance of the parser.
     *
     * @param configuration Object representing the configuration of the parser.
     * @param input A uri representing the instance document to be parsed. The uri is
     *         converted to a {@link File}, so it must satisfy the preconditions of
     *         {@link File#File(URI)}.
     *
     * @throws IOException If the file denoted by <code>input</code> can not be opened.
     * @throws URISyntaxException If <code>input</code> is not a valid uri.
     * @throws NullPointerException If <code>configuration</code> is <code>null</code>.
     *
     * @deprecated use {@link #Parser(Configuration)} and {@link #parse(InputStream)}.
     */
    public Parser(Configuration configuration, String input)
        throws IOException, URISyntaxException {
        // Arguments are evaluated left to right: a null configuration is rejected before
        // the file is opened, so no stream is left open when construction fails on it.
        this(requireConfiguration(configuration),
            new BufferedInputStream(new FileInputStream(new File(new URI(input)))));
    }

    /**
     * Creates a new instance of the parser.
     *
     * @param configuration Object representing the configuration of the parser.
     * @param input The stream representing the instance document to be parsed.
     *
     * @deprecated use {@link #Parser(Configuration)} and {@link #parse(InputStream)}.
     */
    public Parser(Configuration configuration, InputStream input) {
        this(configuration);
        this.input = input;
    }

    /**
     * Rejects a <code>null</code> configuration.
     *
     * @param configuration The configuration to check.
     *
     * @return <code>configuration</code>, unchanged.
     *
     * @throws NullPointerException If <code>configuration</code> is <code>null</code>.
     */
    private static Configuration requireConfiguration(Configuration configuration) {
        if (configuration == null) {
            throw new NullPointerException("configuration");
        }

        return configuration;
    }

    /**
     * @return The underlying parser handler.
     */
    ParserHandler getParserHandler() {
        return handler;
    }

    /**
     * Signals the parser to parse the entire instance document. The object
     * returned from the parse is the object which has been bound to the root
     * element of the document. This method should only be called once for
     * a single instance document.
     * <p>
     * The document parsed is the stream handed to one of the deprecated constructors;
     * no stream is available when the parser was created with {@link #Parser(Configuration)}.
     * </p>
     *
     * @return The object representation of the root element of the document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     *
     * @deprecated use {@link #parse(InputStream)}
     */
    public Object parse() throws IOException, SAXException, ParserConfigurationException {
        return parse(input);
    }

    /**
     * Parses an instance document defined by an input stream.
     * <p>
     * The object returned from the parse is the object which has been bound to the root
     * element of the document. This method should only be called once for a single instance document.
     * </p>
     *
     * @param input The stream containing the instance document.
     *
     * @return The object representation of the root element of the document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Object parse(InputStream input)
        throws IOException, SAXException, ParserConfigurationException {
        return parse(new InputSource(input));
    }

    /**
     * Parses an instance document defined by a reader.
     * <p>
     * The object returned from the parse is the object which has been bound to the root
     * element of the document. This method should only be called once for a single instance document.
     * </p>
     *
     * @param reader The reader supplying the instance document.
     *
     * @return The object representation of the root element of the document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Object parse(Reader reader)
        throws IOException, SAXException, ParserConfigurationException {
        return parse(new InputSource(reader));
    }

    /**
     * Parses an instance document defined by a transformer source.
     * <p>
     * Note: Currently this method reads the entire source into memory in order to validate
     * it. If large documents must be parsed one of {@link #parse(InputStream)},
     * {@link #parse(Reader)} or {@link #parse(InputSource)} should be used instead.
     * </p>
     * @param source The source of the instance document.
     *
     * @return The object representation of the root element of the document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     * @throws TransformerException If the source can not be serialized.
     *
     * @since 2.6
     */
    public Object parse(Source source) throws IOException, SAXException, ParserConfigurationException, TransformerException {
        //TODO: use SAXResult to stream, need to figure out how to enable
        // validation with transformer api
        //SAXResult result = new SAXResult( handler );

        // serialize the source with an identity transform into an in-memory buffer ...
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        Transformer identity = TransformerFactory.newInstance().newTransformer();
        identity.transform(source, new StreamResult(buffer));

        // ... and run the regular stream based parse over the buffered bytes
        return parse(new ByteArrayInputStream(buffer.toByteArray()));
    }

    /**
     * Parses an instance document defined by a sax input source.
     * <p>
     * The object returned from the parse is the object which has been bound to the root
     * element of the document. This method should only be called once for a single instance document.
     * </p>
     *
     * @param source The sax input source of the instance document.
     *
     * @return The object representation of the root element of the document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public Object parse(InputSource source)
        throws IOException, SAXException, ParserConfigurationException {
        // a fresh sax parser is created for every parse, the handler receives both
        // content and error events
        parser = parser();
        parser.setContentHandler(handler);
        parser.setErrorHandler(handler);

        parser.parse(source);

        return handler.getValue();
    }

    /**
     * Sets the strict parsing flag.
     * <p>
     * When set to <code>true</code>, this will cause the parser to operate in
     * a strict mode, which means that xml being parsed must be exactly correct
     * with respect to the schema it references.
     * </p>
     * <p>
     * Some examples of cases in which the parser will throw an exception while
     * operating in strict mode:
     * <ul>
     *  <li>no 'schemaLocation' specified, or specified incorrectly
     *  <li>element found which is not declared in the schema
     * </ul>
     * </p>
     * @param strict The strict flag.
     */
    public void setStrict(boolean strict) {
        handler.setStrict(strict);
    }

    /**
     * Sets the flag controlling whether the parser should validate or not.
     *
     * @param validating Validation flag, <code>true</code> to validate, otherwise <code>false</code>
     */
    public void setValidating(boolean validating) {
        handler.setValidating(validating);
    }

    /**
     * @return Flag determining if the parser is validating or not.
     */
    public boolean isValidating() {
        return handler.isValidating();
    }

    /**
     * Sets the flag which controls how the parser handles validation errors.
     * <p>
     * When this flag is set, the parser will throw an exception when it encounters
     * a validation error. Otherwise the error will be stored, retrievable from
     * {@link #getValidationErrors()}.
     * </p>
     * <p>
     * The default behavior is to set this flag to <code>false</code>. So client
     * code should explicitly set this flag if it is desired that the exception
     * be thrown when the validation error occurs.
     * </p>
     * @param fail failure flag, <code>true</code> to fail, otherwise <code>false</code>
     */
    public void setFailOnValidationError( boolean fail ) {
        handler.setFailOnValidationError( fail );
    }

    /**
     * @return The flag determining how the parser deals with validation errors.
     */
    public boolean isFailOnValidationError() {
        return handler.isFailOnValidationError();
    }

    /**
     * Sets flag that controls whether the parser will process mixed content in a way
     * that preserves order of child elements and text.
     *
     * @param handleMixedContent <code>true</code> to preserve the order of mixed content,
     *         otherwise <code>false</code>
     *
     * @since 2.7
     */
    public void setHandleMixedContent(boolean handleMixedContent) {
        handler.setHandleMixedContent(handleMixedContent);
    }

    /**
     * Flag that controls whether the parser will process mixed content in a way
     * that preserves order of child elements and text.
     * <p>
     * By default the parser will simply concatenate blindly all child text and not preserve order
     * with respect to other elements within a mixed content type.
     * </p>
     *
     * @return The mixed content flag.
     *
     * @since 2.7
     */
    public boolean isHandleMixedContent() {
        return handler.isHandleMixedContent();
    }

    /**
     * Returns a list of any validation errors that occurred while parsing.
     *
     * @return A list of errors, or an empty list if none.
     */
    public List getValidationErrors() {
        return handler.getValidationErrors();
    }

    /**
     * Validates an instance document defined by a input stream.
     * <p>
     * Clients should call {@link #getValidationErrors()} after this method to
     * retrieve any validation errors that occurred. Clients do not need to call
     * {@link #setValidating(boolean)} when using this method to validate.
     * </p>
     * <p>
     * This method does not do any of the work done by {@link #parse(InputSource)}, it
     * only validates.
     * </p>
     *
     * @param in The stream containing the instance document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public void validate( InputStream in ) throws IOException, SAXException, ParserConfigurationException {
        validate( new InputSource( in ) );
    }

    /**
     * Validates an instance document defined by a reader.
     * <p>
     * Clients should call {@link #getValidationErrors()} after this method to
     * retrieve any validation errors that occurred. Clients do not need to call
     * {@link #setValidating(boolean)} when using this method to validate.
     * </p>
     * <p>
     * This method does not do any of the work done by {@link #parse(InputSource)}, it
     * only validates.
     * </p>
     *
     * @param reader The reader supplying the instance document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public void validate( Reader reader ) throws IOException, SAXException, ParserConfigurationException {
        validate( new InputSource( reader ) );
    }

    /**
     * Validates an instance document defined by a input source.
     * <p>
     * Clients should call {@link #getValidationErrors()} after this method to
     * retrieve any validation errors that occurred. Clients do not need to call
     * {@link #setValidating(boolean)} when using this method to validate.
     * </p>
     * <p>
     * This method does not do any of the work done by {@link #parse(InputSource)}, it
     * only validates.
     * </p>
     *
     * @param source The sax input source of the instance document.
     *
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public void validate( InputSource source ) throws IOException, SAXException, ParserConfigurationException {
        // always a validating parser, independent of the isValidating() flag; events are
        // routed to the handler's validator rather than to the handler itself
        SAXParser validatingParser = parser( true );
        validatingParser.setContentHandler( handler.getValidator() );
        validatingParser.setErrorHandler( handler.getValidator() );
        validatingParser.parse( source );
    }

    /**
     * Returns the schema objects referenced by the instance document being
     * parsed. This method can only be called after a successful parse has
     * begun.
     *
     * @return The schema objects reported by the parser handler; presumably
     * <code>null</code> if parsing has not commenced.
     */
    public XSDSchema[] getSchemas() {
        return handler.getSchemas();
    }

    /**
     * Returns the namespace mappings maintained by the parser.
     * <p>
     * Clients may register additional namespace mappings. This is useful when
     * an application wishes to provide some "default" namespace mappings.
     * </p>
     * <p>
     * Clients should register namespace mappings in the current "context", ie
     * do not call {@link NamespaceSupport#pushContext()}. Example:
     * <pre>
     * Parser parser = new Parser( ... );
     * parser.getNamespaces().declarePrefix( "foo", "http://www.foo.com" );
     * ...
     * </pre>
     * </p>
     *
     * @return The namespace support containing prefix to uri mappings.
     * @since 2.4
     */
    public NamespaceSupport getNamespaces() {
        return handler.getNamespaceSupport();
    }

    /**
     * Creates a sax parser which validates only if {@link #isValidating()} is set.
     *
     * @return A newly configured sax parser.
     *
     * @throws ParserConfigurationException
     * @throws SAXException
     */
    protected SAXParser parser() throws ParserConfigurationException, SAXException {
        return parser( isValidating() );
    }

    /**
     * Creates and configures a new xerces sax parser.
     * <p>
     * The parser is namespace aware and has its external schema locations set to the
     * namespace / schema file pairs of the configuration and all its dependencies.
     * </p>
     *
     * @param validate <code>true</code> to turn on schema validation, otherwise <code>false</code>
     *
     * @return A newly configured sax parser.
     *
     * @throws ParserConfigurationException
     * @throws SAXException If a feature or property can not be set on the parser.
     */
    protected SAXParser parser(boolean validate) throws ParserConfigurationException, SAXException {
        //JD: we use xerces directly here because jaxp does not seem to allow us to
        // override all the namespaces to validate against
        SAXParser saxParser = new SAXParser();

        //NOTE(review): no entity resolver is installed and entity related features are left
        // at their xerces defaults; if documents may come from untrusted sources, consider
        // restricting external entity resolution (XXE) -- confirm against callers

        //set the appropriate features
        saxParser.setFeature("http://xml.org/sax/features/namespaces", true);

        if (validate) {
            saxParser.setFeature("http://xml.org/sax/features/validation", true);
            saxParser.setFeature("http://apache.org/xml/features/validation/schema", true);
            saxParser.setFeature("http://apache.org/xml/features/validation/schema-full-checking", true);
        }

        //set the schema sources of this configuration, and all dependent ones;
        // the buffer never leaves this method so no synchronization is needed
        StringBuilder schemaLocation = new StringBuilder();

        for (Iterator d = handler.getConfiguration().allDependencies().iterator(); d.hasNext();) {
            Configuration dependency = (Configuration) d.next();

            //ignore xs namespace
            if (XS.NAMESPACE.equals(dependency.getNamespaceURI())) {
                continue;
            }

            //separate entries by space
            if (schemaLocation.length() > 0) {
                schemaLocation.append(" ");
            }

            //add the entry, a "namespace location" pair
            schemaLocation.append(dependency.getNamespaceURI());
            schemaLocation.append(" ");
            schemaLocation.append(dependency.getSchemaFileURL());
        }

        //set the property to map namespaces to schema locations
        saxParser.setProperty("http://apache.org/xml/properties/schema/external-schemaLocation",
            schemaLocation.toString());

        return saxParser;
    }

    /**
     * Properties used to control the parser behaviour.
     * <p>
     * Parser properties are set in the configuration of a parser.
     * <pre>
     * Configuration configuration = new ....
     * configuration.getProperties().add( Parser.Properties.PARSE_UNKNOWN_ELEMENTS );
     * configuration.getProperties().add( Parser.Properties.PARSE_UNKNOWN_ATTRIBUTES );
     * </pre>
     * </p>
     * @author Justin Deoliveira, The Open Planning Project
     * @deprecated
     */
    public static interface Properties {
        /**
         * If set, the parser will continue to parse when it finds an element
         * and cannot determine its type.
         *
         * @deprecated use {@link Parser#setStrict(boolean)}
         */
        QName PARSE_UNKNOWN_ELEMENTS = new QName("http://www.geotools.org", "parseUnknownElements");

        /**
         * If set, the parser will continue to parse when it finds an attribute
         * and cannot determine its type.
         *
         * @deprecated use {@link Parser#setStrict(boolean)}
         */
        QName PARSE_UNKNOWN_ATTRIBUTES = new QName("http://www.geotools.org",
                "parseUnknownAttributes");

        /**
         * If set, the parser will ignore the schemaLocation attribute of an
         * instance document.
         *
         * @deprecated use {@link Parser#setStrict(boolean)}
         */
        QName IGNORE_SCHEMA_LOCATION = new QName("http://www.geotools.org", "ignoreSchemaLocation");
    }
}