/** * JHOVE2 - Next-generation architecture for format-aware characterization * <p> * Copyright (c) 2009 by The Regents of the University of California, Ithaka * Harbors, Inc., and The Board of Trustees of the Leland Stanford Junior * University. All rights reserved. * </p> * <p> * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * </p> * <ul> * <li>Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer.</li> * <li>Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.</li> * <li>Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission.</li> * </ul> * <p> * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * </p> */ package org.jhove2.module.format.xml; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jhove2.annotation.ReportableProperty; import org.jhove2.core.JHOVE2; import org.jhove2.core.JHOVE2Exception; import org.jhove2.core.Message; import org.jhove2.core.Message.Context; import org.jhove2.core.Message.Severity; import org.jhove2.core.io.Input; import org.jhove2.core.reportable.AbstractReportable; import com.sleepycat.persist.model.Persistent; /** * A class to hold a sorted set of numeric character references (NCRs) that are * used to represent Unicode characters, and the counts of how many times each * NCR reference was found in the XML document. * * @see http://www.w3.org/International/questions/qa-escapes * @see http://unicode.org/standard/principles.html#Assigning_Codes */ @Persistent public class NumericCharacterReferenceInformation extends AbstractReportable { /** * The regular expression that would match a numeric character reference. * The capture group allows extraction of just the numeric code portion. */ private static final String NCR_REGEX = "&#([0-9]+|[xX][0-9a-fA-F]+);"; /** * The compiled regular expression for the NCR. */ private static final Pattern NCR_PATTERN = Pattern.compile(NCR_REGEX); /** The de-duplicated set of NCRs found in the XML document. */ TreeMap<Integer, NumericCharacterReference> numericCharacterReferenceMap = new TreeMap<Integer, NumericCharacterReference>(); /** Invalid character for encoding message. */ protected Message invalidCharacterForEncodingMessage; protected NumericCharacterReferenceInformation(){ super(); } /** * Get the NCRs found during XML parsing. * * @return the numeric character references */ @ReportableProperty(order = 1, value = "numeric character references found during XML parsing") public ArrayList<NumericCharacterReference> getNumericCharacterReferenceList() { return new ArrayList<NumericCharacterReference>( numericCharacterReferenceMap.values()); } /** * Increment the instance count for this numeric character reference. * * @param code * the string representation of a character's unicode code point * @param jhove2 * the JHOVE2 framework * @throws JHOVE2Exception */ public void tally(String code, JHOVE2 jhove2) throws JHOVE2Exception { try { Integer codePoint; if (code.substring(0,1).toLowerCase().equals("x")) { // Hexadecimal codePoint = Integer.decode(code.toLowerCase().replace("x", "0x")); } else { // Decimal codePoint = new Integer(code); } NumericCharacterReference reference = numericCharacterReferenceMap .get(codePoint); if (reference != null) { reference.count++; } else { numericCharacterReferenceMap.put(codePoint, new NumericCharacterReference(codePoint)); } } catch (NumberFormatException e) { this.invalidCharacterForEncodingMessage = new Message( Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.invalidCharacterForEncodingMessage", jhove2.getConfigInfo()); } } /** * In order to locate numeric character references (like the code for double * dagger = ‡ ), we need to do a separate parse of the source object. * The SAX2 parser does not provide a mechanism for getting at these markup * constructs, which are not considered XML entities. The characters() * method of the ContentHandler interface, translates these codes into * Unicode characters before placing the data in the buffer. * @param jhove2 * the JHOVE2 framework * * @throws IOException * @throws JHOVE2Exception */ protected void parse(Input input, String encodingFromSAX2, JHOVE2 jhove2) throws IOException, JHOVE2Exception { ByteBuffer bbuf = input.getBuffer(); int position = bbuf.position(); try { /* Get a CharSequence object that can be analyzed */ CharBuffer cbuf = Charset.forName(encodingFromSAX2).newDecoder() .decode(bbuf); /* Look for numeric character references */ Matcher ncrMatcher = NCR_PATTERN.matcher(cbuf); while (ncrMatcher.find()) { /* * Found one, record the occurrence of the NCR code (pattern * capture group 1) */ tally(ncrMatcher.group(1), jhove2); } } catch (CharacterCodingException e) { this.invalidCharacterForEncodingMessage = new Message( Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.invalidCharacterForEncodingMessage", jhove2.getConfigInfo()); } finally { bbuf.position(position); input.resetBuffer(); } } }