/** * JHOVE2 - Next-generation architecture for format-aware characterization * * Copyright (c) 2009 by The Regents of the University of California, * Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford * Junior University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * o Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * o Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * o Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package org.jhove2.module.format.utf8; import java.io.EOFException; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Properties; import org.jhove2.annotation.ReportableProperty; import org.jhove2.core.JHOVE2; import org.jhove2.core.JHOVE2Exception; import org.jhove2.core.Message; import org.jhove2.core.Message.Context; import org.jhove2.core.Message.Severity; import org.jhove2.core.io.Input; import org.jhove2.core.reportable.AbstractReportable; import org.jhove2.core.source.MeasurableSource; import org.jhove2.core.source.Source; import org.jhove2.module.format.Parser; import org.jhove2.module.format.Validator; import org.jhove2.module.format.utf8.unicode.C0Control; import org.jhove2.module.format.utf8.unicode.C1Control; import org.jhove2.module.format.utf8.unicode.CodeBlock; import org.jhove2.module.format.utf8.unicode.Unicode; import org.jhove2.module.format.utf8.unicode.Unicode.EOL; import com.sleepycat.persist.model.Persistent; /** * JHOVE2 UTF-8 character modeling class. * * @author mstrong, slabrams */ @Persistent public class UTF8Character extends AbstractReportable implements Parser, Validator { /** Validation coverage. */ public static final Coverage COVERAGE = Coverage.Inclusive; /** Byte Order Mark (BOM). */ public static final int BOM = 0xFEFF; /** Marker that the character code point is uninitialized or unknown. */ public static final int UNINITIALIZED = -1; /** C0 control. Null if character is not a C0 control. */ protected C0Control c0control; /** C1 control. Null if character is not a C1 control. */ protected C1Control c1control; /** Unicode code block. */ protected CodeBlock codeBlock; /** Character code point. */ protected int codePoint; /** Code point out of range message. */ protected Message codePointOutOfRangeMessage; /** Invalid byte value message. */ protected List<Message> invalidByteValueMessages; /** Character Byte Order Mark (BOM) status. */ protected boolean isBOM; /** Character C0 control status. */ protected boolean isC0Control; /** Character C1 control status. */ protected boolean isC1Control; /** Non-character status. */ protected boolean isNonCharacter; /** Character validation status. */ protected Validity isValid; /** Character byte offset. */ protected long offset; /** Character encoded size, in bytes. */ protected int size; protected static Properties codeBlockProps; protected static Properties c0ControlProps; protected static Properties c1ContolProps; /** * Instantiate a new <code>UTF8Character</code> */ public UTF8Character() { super(); this.codePoint = UNINITIALIZED; this.invalidByteValueMessages = new ArrayList<Message>(); this.isBOM = false; this.isC0Control = false; this.isC1Control = false; this.isNonCharacter = false; this.isValid = Validity.Undetermined; this.size = 0; } /** * Parse a source unit input. Implicitly set the start and end elapsed time. * * @param jhove2 * JHOVE2 framework * @param source * UTF-8 source unit * @param input * UTF-8 source input * @return Number of bytes consumed * @throws EOFException * If End-of-File is reached reading the source unit * @throws IOException * If an I/O exception is raised reading the source unit * @throws JHOVE2Exception */ @Override public long parse(JHOVE2 jhove2, Source source, Input input) throws EOFException, IOException, JHOVE2Exception { this.isValid = Validity.True; long offset = ((MeasurableSource) source).getStartingOffset(); /* Read the first byte. */ long consumed = 0L; int[] b = new int[4]; b[0] = input.readUnsignedByte(); if (b[0] == Input.EOF) { this.isValid = Validity.False; throw new EOFException(); } consumed++; /* Determine size of the character [Unicode, D92]. */ if (0x00 <= b[0] && b[0] <= 0x7F) { this.size = 1; } else if (0xC2 <= b[0] && b[0] <= 0xDF) { this.size = 2; } else if (0xe0 <= b[0] && b[0] <= 0xEF) { this.size = 3; } else if (0xF0 <= b[0] && b[0] <= 0xF4) { this.size = 4; } else if ((0x80 <= b[0] && b[0] <= 0xC1) || (0xF5 <= b[0] && b[0] <= 0xFF)) { this.isValid = Validity.False; Object[]messageArgs = new Object[]{0, input.getPosition()-offset, b[0]}; this.invalidByteValueMessages.add(new Message(Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.utf8.UTF8Character.invalidByteValueMessages", messageArgs, jhove2.getConfigInfo())); } /* Read the remaining bytes. */ for (int i = 1; i < this.size; i++) { b[i] = input.readUnsignedByte(); if (b[i] == Input.EOF) { this.isValid = Validity.False; throw new EOFException(); } consumed++; if ((i == 2 && ((this.size == 3 && ((b[0] == 0xE0 && (0x0A > b[i] || b[i] > 0xBF)) || (b[0] == 0xED && (0x80 > b[i] || b[i] > 0x9F)))) || (this.size == 4 && ((b[0] == 0xF0 && (0x90 > b[i] || b[i] > 0xBF)) || (b[0] == 0xF4 && (0x80 > b[i] || b[i] > 0x8F)))))) || (0x80 > b[i] || b[i] > 0xBF)) { this.isValid = Validity.False; Object[]messageArgs = new Object[]{i, input.getPosition()-offset, b[i]}; this.invalidByteValueMessages.add(new Message(Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.utf8.UTF8Character.invalidByteValueMessages", messageArgs, jhove2.getConfigInfo())); } } /* Determine the character's code point. */ if (this.size == 1) { this.codePoint = b[0]; } else if (this.size == 2) { this.codePoint = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f); } else if (this.size == 3) { this.codePoint = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f); } else if (this.size == 4) { this.codePoint = ((b[0] & 0x07) << 18) + ((b[1] & 0x3f) << 12) + ((b[2] & 0x3f) << 6) + (b[3] & 0x3f); } /* Set character properties. */ if (this.codePoint == BOM) { this.isBOM = true; } this.codeBlock = CodeBlock.getBlock(this.codePoint, jhove2); this.c0control = C0Control.getControl(this.codePoint, jhove2); this.c1control = C1Control.getControl(this.codePoint, jhove2); /* Check for code point outside of valid range [Unicode, D76]. */ if (this.codePoint < 0x00 || (0xD7FF < this.codePoint && this.codePoint < 0xE000) || this.codePoint > 0x10FFFF) { this.isValid = Validity.False; Object[] messageArgs = new Object[]{input.getPosition()-consumed-offset, this.codePoint}; this.codePointOutOfRangeMessage = new Message(Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.utf8.UTF8Character.codePointOutOfRangeMessage", messageArgs, jhove2.getConfigInfo()); } /* Check if code point is a non-character [Unicode, D14] */ if ((this.codePoint >= 0xFDD0 && this.codePoint <= 0xFDEF) || this.codePoint == 0x0FFFE || this.codePoint == 0x0FFFF || this.codePoint == 0x1FFFE || this.codePoint == 0x1FFFF || this.codePoint == 0x2FFFE || this.codePoint == 0x2FFFF || this.codePoint == 0x3FFFE || this.codePoint == 0x3FFFF || this.codePoint == 0x4FFFE || this.codePoint == 0x4FFFF || this.codePoint == 0x5FFFE || this.codePoint == 0x5FFFF || this.codePoint == 0x6FFFE || this.codePoint == 0x6FFFF || this.codePoint == 0x7FFFE || this.codePoint == 0x7FFFF || this.codePoint == 0x8FFFE || this.codePoint == 0x8FFFF || this.codePoint == 0x9FFFE || this.codePoint == 0x9FFFF || this.codePoint == 0x10FFFE || this.codePoint == 0x10FFFF) { this.isNonCharacter = true; } return consumed; } /** * Validate a source unit. * * @param jhove2 * JHOVE2 framework * @param source * UTF-8 source unit * @param input * UTF-8 source input * @return Source unit validity */ @Override public Validity validate(JHOVE2 jhove2, Source source, Input input) { return this.isValid; } /** * Get code point. * * @return Code point */ @ReportableProperty(order = 1, value = "Code point.") public int getCodePoint() { return this.codePoint; } /** * Determine line ending markers. * * @param prevCodePoint * Previous character code point * @param codePoint * Current character code point * @return The line ending markers (CR, LF, or CRLF) or null if not at a * line ending */ public static synchronized EOL getEOL(int prevCodePoint, int codePoint) { EOL eol = null; if (codePoint == Unicode.LF) { if (prevCodePoint == Unicode.CR) { eol = EOL.CRLF; } else { eol = EOL.LF; } } else if (prevCodePoint == Unicode.CR) { eol = EOL.CR; } return eol; } /** * Get Unicode code block. * * @return Unicode code block, or null if not in any code block */ @ReportableProperty(order = 3, value = "Code block.") public CodeBlock getCodeBlock() { return this.codeBlock; } /** * Get C0 control. * * @return C0 control or null if not a C0 control */ @ReportableProperty(order = 5, value = "C0 control character.") public C0Control getC0Control() { return this.c0control; } /** * Get C1 control. * * @return C1 control or null if not a c1 control */ @ReportableProperty(order = 7, value = "C1 control character.") public C1Control getC1Control() { return this.c1control; } /** * Get code point out of range message. * * @return Code point our of range message */ @ReportableProperty(order = 12, value = "Code point out of range message.") public Message getCodePointOutOfRange() { return this.codePointOutOfRangeMessage; } /** Get validation coverage. * @return Validation coverage */ @Override public Coverage getCoverage() { return COVERAGE; } /** * Get invalid byte value message. * * @return Invalid byte value message */ @ReportableProperty(order = 11, value = "Invalid byte value message.") public List<Message> getInvalidByteValues() { return this.invalidByteValueMessages; } /** * Get encoded size, in bytes. * * @return Encoded size, in bytes */ @ReportableProperty(order = 2, value = "Encoded size, in bytes.") public int getSize() { return this.size; } /** * Get Byte Order Mark (BOM) status. * * @return True if a BOM */ @ReportableProperty(order = 8, value = "Byte Order Mark (BOM) status: true if a BOM.") public boolean isByteOrderMark() { return this.isBOM; } /** * Get C0 control status. * * @return True if C0 control */ @ReportableProperty(order = 4, value = "C0 control status: true if a C0 control character.") public boolean isC0Control() { return this.isC0Control; } /** * Get C1 control status. * * @return True if C1 control */ @ReportableProperty(order = 6, value = "C1 control status: true if a C1 control character.") public boolean isC1Control() { return this.isC1Control; } /** * Get non-character status. * * @return True if not a character */ @ReportableProperty(order = 9, value = "Non-character status: true if not a character.") public boolean isNonCharacter() { return this.isNonCharacter; } /** * Get validation status. * * @return True if a valid ASCII character stream */ @Override public Validity isValid() { return this.isValid; } }