/** * JHOVE2 - Next-generation architecture for format-aware characterization * <p> * Copyright (c) 2009 by The Regents of the University of California, Ithaka * Harbors, Inc., and The Board of Trustees of the Leland Stanford Junior * University. All rights reserved. * </p> * <p> * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * </p> * <ul> * <li>Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer.</li> * <li>Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.</li> * <li>Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission.</li> * </ul> * <p> * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* </p>
*/
package org.jhove2.module.format.xml;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jhove2.annotation.ReportableProperty;
import org.jhove2.core.io.Input;
import org.jhove2.core.reportable.AbstractReportable;

import com.sleepycat.persist.model.Persistent;

/**
 * This class is used to hold information about an <i>XML declaration</i>
 * discovered during parsing of an XML instance. For example: <br />
 * <code>&lt;?xml version="1.0" encoding="UTF-8" standalone="yes"?&gt;</code>
 * <p>
 * Each of the three pseudo-attributes is tracked twice: once as reported by
 * the SAX2 parser and once as literally written in the declaration (filled in
 * by {@link #parse(Input)}). The reportable getters merge the two views.
 * </p>
 *
 * @author rnanders
 * @see <a href="http://www.w3.org/TR/xml/#NT-XMLDecl">XML 1.0: XMLDecl</a>
 */
@Persistent
public class XmlDeclaration extends AbstractReportable {

    /**
     * Maximum number of characters read from the start of the input while
     * looking for the end of the XML declaration.
     */
    private static final int MAX_DECLARATION_LENGTH = 100;

    /** The regular expression group for the version information. */
    private static final String VERSION_REGEX =
            "(?:\\s*version\\s*=\\s*[\'\"]([^\'\"]+)[\'\"]){0,1}";

    /** The regular expression group for the optional encoding information. */
    private static final String ENCODING_REGEX =
            "(?:\\s*encoding\\s*=\\s*[\'\"]([^\'\"]+)[\'\"]){0,1}";

    /** The regular expression group for the optional standalone information. */
    private static final String STANDALONE_REGEX =
            "(?:\\s*standalone\\s*=\\s*[\'\"]([^\'\"]+)[\'\"]){0,1}";

    /** The regular expression for the XML declaration. */
    private static final String DECLARATION_REGEX =
            "<\\?xml" + VERSION_REGEX + ENCODING_REGEX + STANDALONE_REGEX;

    /** The compiled regex pattern for the XML declaration. */
    private static final Pattern DECLARATION_PATTERN =
            Pattern.compile(DECLARATION_REGEX);

    /** The XML version number as found by SAX2. */
    protected String versionFromSAX2;

    /** The version number actually declared. */
    protected String versionDeclared;

    /** The character encoding as found by SAX2. */
    protected String encodingFromSAX2;

    /** The character encoding actually declared. */
    protected String encodingDeclared;

    /** The standalone status as found by SAX2. */
    protected String standaloneFromSAX2;

    /** The standalone status actually declared. */
    protected String standaloneDeclared;

    /** Creates an empty declaration; all values are initially {@code null}. */
    public XmlDeclaration() {
        super();
    }

    /**
     * Gets the version number of the XML Standard to which this instance
     * conforms.
     *
     * @return the version number
     */
    @ReportableProperty(order = 1, value = "XML Version")
    public String getVersion() {
        return getValue(versionFromSAX2, versionDeclared);
    }

    /**
     * Gets the character encoding used in the XML instance.
     *
     * @return the character encoding
     */
    @ReportableProperty(order = 2, value = "Character Encoding")
    public String getEncoding() {
        return getValue(encodingFromSAX2, encodingDeclared);
    }

    /**
     * Gets the standalone status of the DTD markup declarations.
     *
     * @return the standalone status
     */
    @ReportableProperty(order = 3, value = "Standalone")
    public String getStandalone() {
        return getValue(standaloneFromSAX2, standaloneDeclared);
    }

    /**
     * Compares the value found by SAX2 and the value explicitly specified in
     * the XML Declaration. <br />
     *
     * <ul>
     * <li>If SAX2 reported no value, then the declared value (possibly
     * {@code null}) is returned.</li>
     * <li>If equal (ignoring case), then returns the SAX2 value.</li>
     * <li>If value was omitted from the declaration, then the SAX2 value is
     * returned with the suffix " [default]".</li>
     * <li>If the SAX2 and declared values differ, then both values are
     * reported.</li>
     * </ul>
     *
     * @param valueFromSAX2
     *            the value found by SAX2 parser
     * @param valueDeclared
     *            the value actually specified in the XML Declaration
     * @return the value
     */
    private String getValue(String valueFromSAX2, String valueDeclared) {
        if (valueFromSAX2 == null) {
            return valueDeclared;
        }
        else if (valueFromSAX2.equalsIgnoreCase(valueDeclared)) {
            return valueFromSAX2;
        }
        else if (valueDeclared == null) {
            return valueFromSAX2 + " [default]";
        }
        else {
            return valueFromSAX2 + " [!= (value declared = " + valueDeclared + ")]";
        }
    }

    /**
     * Parses the beginning of the XML document to extract the values declared
     * for version, encoding, and standalone. Reading stops after the first
     * {@code '>'}, at end of input, or once
     * {@link #MAX_DECLARATION_LENGTH} characters have been collected,
     * whichever comes first. If no declaration is recognized the declared
     * values are left unchanged.
     *
     * @param input
     *            the Input from the Source object
     * @throws IOException
     *             if reading from the input fails
     */
    protected void parse(Input input) throws IOException {
        /* Get the text of the XML Declaration */
        StringBuilder sb = new StringBuilder();
        while (sb.length() < MAX_DECLARATION_LENGTH) {
            /*
             * Test for end of input BEFORE narrowing to char: a char is never
             * negative, so a negative EOF sentinel could not be detected
             * after the cast. NOTE(review): assumes Input.EOF is a negative
             * int sentinel returned by readUnsignedByte() -- confirm against
             * Input.
             */
            int b = input.readUnsignedByte();
            if (b == Input.EOF) {
                break;
            }
            /* Each byte is treated as one character (single-byte view). */
            sb.append((char) b);
            if (b == '>') {
                break;
            }
        }
        String xmldecl = sb.toString();

        /* Use regular expression capture groups to extract values (or null if omitted) */
        Matcher m = XmlDeclaration.DECLARATION_PATTERN.matcher(xmldecl);
        if (m.find()) {
            versionDeclared = m.group(1);
            encodingDeclared = m.group(2);
            standaloneDeclared = m.group(3);
        }
    }
}