/**
* JHOVE2 - Next-generation architecture for format-aware characterization
*
* Copyright (c) 2009 by The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* o Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* o Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* o Neither the name of the University of California/California Digital
* Library, Ithaka Harbors/Portico, or Stanford University, nor the names of
* its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.jhove2.module.format.utf8;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.jhove2.annotation.ReportableProperty;
import org.jhove2.core.JHOVE2;
import org.jhove2.core.JHOVE2Exception;
import org.jhove2.core.Message;
import org.jhove2.core.Message.Context;
import org.jhove2.core.Message.Severity;
import org.jhove2.core.format.Format;
import org.jhove2.core.io.Input;
import org.jhove2.core.source.MeasurableSource;
import org.jhove2.core.source.Source;
import org.jhove2.module.format.BaseFormatModule;
import org.jhove2.module.format.Validator;
import org.jhove2.module.format.utf8.unicode.C0Control;
import org.jhove2.module.format.utf8.unicode.C1Control;
import org.jhove2.module.format.utf8.unicode.CodeBlock;
import org.jhove2.module.format.utf8.unicode.Unicode;
import org.jhove2.module.format.utf8.unicode.Unicode.EOL;
import org.jhove2.persist.FormatModuleAccessor;
import com.sleepycat.persist.model.Persistent;
/**
* JHOVE2 UTF-8 module.
*
* @author mstrong, slabrams
*/
@Persistent
public class UTF8Module
    extends BaseFormatModule
    implements Validator
{
    /** UTF-8 module version identifier. */
    public static final String VERSION = "2.0.0";

    /** UTF-8 module release date. */
    public static final String RELEASE = "2010-09-10";

    /** UTF-8 module rights statement. */
    public static final String RIGHTS =
        "Copyright 2010 by The Regents of the University of California. " +
        "Available under the terms of the BSD license.";

    /** UTF-8 module validation coverage. */
    public static final Coverage COVERAGE = Coverage.Inclusive;

    /**
     * Byte Order Mark (BOM) message. Only assigned by
     * {@link #parse(JHOVE2, Source, Input)} when the first character of the
     * source is a BOM.
     */
    protected Message bomMessage;

    /**
     * Non-line ending C0 control characters. CR and LF are therefore
     * <em>not</em> found in this set.
     */
    protected Set<C0Control> c0Characters;

    /** C1 control characters. */
    protected Set<C1Control> c1Characters;

    /** Unicode code blocks. */
    protected Set<CodeBlock> codeBlocks;

    /** End-of-Line (EOL) markers. */
    protected Set<EOL> eolMarkers;

    /**
     * Fail fast message. Only assigned by
     * {@link #parse(JHOVE2, Source, Input)} when parsing stops early because
     * the framework's fail fast check returned <code>true</code>.
     */
    protected Message failFastMessage;

    /** Invalid UTF-8 characters. */
    protected List<UTF8Character> invalidCharacters;

    /** Number of characters. A leading BOM is not counted. */
    protected long numCharacters;

    /** Number of lines. A line is terminated by a CR, CRLF, LF, or EOF. */
    protected long numLines;

    /** Number of non-characters. */
    protected long numNonCharacters;

    /** UTF-8 validity status. */
    protected Validity isValid;

    /**
     * Instantiate a new <code>UTF8Module</code> with empty property sets,
     * zeroed counters, and an undetermined validity status.
     *
     * @param format
     *            UTF-8 format
     * @param formatModuleAccessor
     *            Accessor passed through to the base format module
     */
    public UTF8Module(Format format, FormatModuleAccessor formatModuleAccessor) {
        super(VERSION, RELEASE, RIGHTS, format, formatModuleAccessor);
        this.c0Characters = new TreeSet<C0Control>();
        this.c1Characters = new TreeSet<C1Control>();
        this.codeBlocks = new TreeSet<CodeBlock>();
        this.eolMarkers = new TreeSet<EOL>();
        this.invalidCharacters = new ArrayList<UTF8Character>();
        this.isValid = Validity.Undetermined;
        this.numCharacters = 0L;
        this.numLines = 0L;
        this.numNonCharacters = 0L;
    }

    /**
     * Instantiate a new <code>UTF8Module</code> with a <code>null</code>
     * format and a <code>null</code> accessor.
     * NOTE(review): presumably present because the persistence layer needs a
     * no-argument constructor -- confirm.
     */
    public UTF8Module(){
        this(null, null);
    }

    /**
     * Parse a source unit one UTF-8 character at a time, accumulating the
     * reportable properties of this module (character, line and
     * non-character counts; EOL markers; code blocks; C0/C1 controls;
     * invalid characters) and determining the validity status.
     *
     * The counters and sets are not cleared at the start of the method, so
     * they accumulate if the same module instance parses more than once.
     *
     * @param jhove2
     *            JHOVE2 framework
     * @param source
     *            UTF-8 source unit; it is cast to
     *            <code>MeasurableSource</code> to obtain its starting offset
     *            and size
     * @param input
     *            UTF-8 source input
     * @return Number of bytes consumed
     * @throws EOFException
     *             If End-of-File is reached reading the source unit
     * @throws IOException
     *             If an I/O exception is raised reading the source unit
     * @throws JHOVE2Exception
     * @see org.jhove2.module.format.FormatModule#parse(org.jhove2.core.JHOVE2,
     *      org.jhove2.core.source.Source, org.jhove2.core.io.Input)
     */
    @Override
    public long parse(JHOVE2 jhove2, Source source, Input input)
        throws EOFException, IOException, JHOVE2Exception
    {
        long consumed = 0L;
        /* The status stays undetermined if the set-up below throws. */
        this.isValid = Validity.Undetermined;
        int numErrors = 0;

        MeasurableSource measurable = (MeasurableSource) source;
        long start = measurable.getStartingOffset();
        long end = start + measurable.getSize();
        input.setPosition(start);

        long position = start;
        int prevCodePoint = UTF8Character.UNINITIALIZED;
        this.isValid = Validity.True;

        /*
         * NOTE(review): when end is 0 the loop condition is always true, so
         * the loop can only be left through the EOFException handler (which
         * marks the source invalid) or the fail fast break. Confirm this is
         * the intended outcome for such sources.
         */
        while (end == 0 || position < end) {
            UTF8Character ch = new UTF8Character();
            long n;
            try {
                n = ch.parse(jhove2, source, input);
            } catch (EOFException e) {
                /* End-of-file while reading a character: the source is
                 * treated as invalid and scanning stops. */
                this.isValid = Validity.False;
                break;
            }
            consumed += n;

            if (position == start && ch.isByteOrderMark()) {
                /* A BOM in the first position is reported through a message
                 * and is not counted as a character. */
                Object[] messageParms = new Object[]{position - start};
                this.bomMessage = new Message(Severity.INFO,
                    Context.OBJECT,
                    "org.jhove2.module.format.utf8.UTF8Module.bomMessage",
                    messageParms, jhove2.getConfigInfo());
            }
            else {
                this.numCharacters++;
                int codePoint = ch.getCodePoint();
                if (ch.isValid() == Validity.False) {
                    this.isValid = Validity.False;
                    if (jhove2.failFast(++numErrors)) {
                        /* The character that trips the fail fast check is
                         * not added to the invalid character list. */
                        this.failFastMessage = new Message(Severity.INFO,
                            Context.PROCESS,
                            "org.jhove2.module.format.utf8.UTF8Module.failFastMessage",
                            jhove2.getConfigInfo());
                        break;
                    }
                    this.invalidCharacters.add(ch);
                }

                /* Determine character properties. */
                recordEOL(UTF8Character.getEOL(prevCodePoint, codePoint));
                recordCharacterProperties(ch);
                prevCodePoint = codePoint;
            }
            position += n;
        }

        /*
         * The last code point has not yet been examined as the "previous"
         * half of an EOL pair, so do that now with no following code point.
         * If that yields no marker and the last code point is not LF, one
         * more line is counted (a line may be terminated by EOF).
         * NOTE(review): this also counts one line when no character was read
         * at all (prevCodePoint still UNINITIALIZED) -- confirm intended.
         */
        boolean trailingMarker = recordEOL(
            UTF8Character.getEOL(prevCodePoint, UTF8Character.UNINITIALIZED));
        if (!trailingMarker && prevCodePoint != Unicode.LF) {
            this.numLines++;
        }

        return consumed;
    }

    /**
     * Count a line and remember its marker if an EOL marker was detected.
     *
     * @param eol
     *            Detected EOL marker, or <code>null</code> if none
     * @return <code>true</code> if a marker was recorded
     */
    private boolean recordEOL(EOL eol) {
        if (eol == null) {
            return false;
        }
        this.numLines++;
        this.eolMarkers.add(eol);
        return true;
    }

    /**
     * Fold the code block, C0/C1 control and non-character properties of a
     * parsed character into the module's sets and counters.
     *
     * @param ch
     *            Parsed UTF-8 character
     */
    private void recordCharacterProperties(UTF8Character ch) {
        CodeBlock codeBlock = ch.getCodeBlock();
        if (codeBlock != null) {
            this.codeBlocks.add(codeBlock);
        }

        /* CR and LF are line endings and are kept out of the C0 set; the
         * literal-first comparison tolerates a null mnemonic. */
        C0Control c0 = ch.getC0Control();
        if (c0 != null && !"CR".equals(c0.getMnemonic())
                       && !"LF".equals(c0.getMnemonic())) {
            this.c0Characters.add(c0);
        }

        C1Control c1 = ch.getC1Control();
        if (c1 != null) {
            this.c1Characters.add(c1);
        }

        if (ch.isNonCharacter()) {
            this.numNonCharacters++;
        }
    }

    /**
     * Validate a UTF-8 source unit. This implementation does not read the
     * input; it returns the status currently held by the module, which is
     * set by {@link #parse(JHOVE2, Source, Input)}.
     *
     * @param jhove2
     *            JHOVE2 framework
     * @param source
     *            UTF-8 source unit
     * @param input
     *            UTF-8 source input
     * @return UTF-8 validation status
     * @throws JHOVE2Exception
     *             Declared by the interface; not raised by this
     *             implementation
     * @see org.jhove2.module.format.Validator#validate(org.jhove2.core.JHOVE2,
     *      org.jhove2.core.source.Source, org.jhove2.core.io.Input)
     */
    @Override
    public Validity validate(JHOVE2 jhove2, Source source, Input input)
        throws JHOVE2Exception
    {
        return this.isValid;
    }

    /**
     * Get Byte Order Mark (BOM) message.
     *
     * @return Byte Order Mark (BOM) message
     */
    @ReportableProperty(order = 11, value = "Byte Order Mark (BOM) message.")
    public Message getByteOrderMark() {
        return this.bomMessage;
    }

    /**
     * Get non-line ending C0 control characters. Therefore CR and LF will
     * <em>not</em> be reported in this set.
     *
     * @return Set of non-line ending C0 control characters
     */
    @ReportableProperty(order = 5, value = "Set of unique non-line-ending C0 control "
        + "characters. Thus, CR and LF are not included in this set.")
    public Set<C0Control> getC0Characters() {
        return this.c0Characters;
    }

    /**
     * Get C1 control characters.
     *
     * @return Set of C1 control characters
     */
    @ReportableProperty(order = 6, value = "Set of unique C1 control characters.")
    public Set<C1Control> getC1Characters() {
        return this.c1Characters;
    }

    /**
     * Get code blocks.
     *
     * @return Set of code blocks
     */
    @ReportableProperty(order = 4, value = "Set of unique Unicode code blocks.")
    public Set<CodeBlock> getCodeBlocks() {
        return this.codeBlocks;
    }

    /**
     * Get UTF-8 module validation coverage.
     *
     * @return UTF-8 module validation coverage
     */
    @Override
    public Coverage getCoverage() {
        return COVERAGE;
    }

    /**
     * Get End-of-Line (EOL) markers.
     *
     * @return Set of EOL markers
     */
    @ReportableProperty(order = 3, value = "Set of unique End-of-Line (EOL) markers.")
    public Set<EOL> getEOLMarkers() {
        return this.eolMarkers;
    }

    /**
     * Get fail fast message.
     *
     * @return Fail fast message
     */
    @ReportableProperty(order = 13, value = "Fail fast message.")
    public Message getFailFast() {
        return this.failFastMessage;
    }

    /**
     * Get invalid UTF-8 characters.
     *
     * @return Invalid UTF-8 characters
     */
    @ReportableProperty(order = 12, value = "Invalid UTF-8 characters.")
    public List<UTF8Character> getInvalidCharacters() {
        return this.invalidCharacters;
    }

    /**
     * Get number of characters.
     *
     * @return Number of characters
     */
    @ReportableProperty(order = 1, value = "Number of UTF-8 characters.")
    public long getNumCharacters() {
        return this.numCharacters;
    }

    /**
     * Get number of lines. A line is terminated by a CR, CRLF, LF, or EOF.
     *
     * @return Number of lines
     */
    @ReportableProperty(order = 2, value = "Number of lines. A line is "
        + "terminated by a CR, CRLF, LF, or End-of-File (EOF).")
    public long getNumLines() {
        return this.numLines;
    }

    /**
     * Get number of non-characters.
     *
     * @return Number of non-characters
     */
    @ReportableProperty(order = 7, value = "Number of UTF-8 non-characters.")
    public long getNumNonCharacters() {
        return this.numNonCharacters;
    }

    /**
     * Get UTF-8 validation status.
     *
     * @return UTF-8 validation status
     * @see org.jhove2.module.format.Validator#isValid()
     */
    @Override
    public Validity isValid() {
        return this.isValid;
    }
}