/**
* JHOVE2 - Next-generation architecture for format-aware characterization
*
* Copyright (c) 2009 by The Regents of the University of California,
* Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford
* Junior University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* o Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* o Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* o Neither the name of the University of California/California Digital
* Library, Ithaka Harbors/Portico, or Stanford University, nor the names of
* its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.jhove2.module.format.sgml;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jhove2.core.JHOVE2Exception;
import org.jhove2.core.Message;
import org.jhove2.core.Message.Context;
import org.jhove2.core.Message.Severity;
import org.jhove2.core.source.Source;
import org.jhove2.core.JHOVE2;
import org.jhove2.util.externalprocess.ExternalProcessHandler;
import org.jhove2.util.externalprocess.ExternalProcessUtils;
import org.jhove2.util.externalprocess.FilepathFilter;
import org.jhove2.util.externalprocess.NoSuchShellEnvException;
/**
* Wrapper around the OpenSP SGML parser (onsgmls) and the sgmlnorm utility.
*
* After the SGML file is parsed by OpenSP, the output (in ESIS format)
* is parsed using an ANTLR-generated Java parser class. The grammar
* has been decorated with Java members and methods to accumulate information
* about the features of interest in the SGML file. The grammar accepts
* the OpenSP indication as to whether or not the SGML instance conforms
* to its DTD, and hence is to be considered valid.
*
* No DOCTYPE information is returned in the ESIS file. If the SGML module
* is configured to ask for doctype, then we run the sgmlnorm "normalization"
* utility, and extract the doctype from the first line of its output.
*
* @author smorrissey
*
*/
public class OpenSpWrapper implements SgmlParser {
/** suffix to be used on files generated by onsmgl command */
public final String ESIS_SUFFIX = ".esis";
/** suffix to be used on files generated by sgmlnorm command */
public final String NORM_SUFFIX = ".norm";
/** suffix to be used for std err redirect for shell invocation */
public final String TEMP_STD_ERR = ".std.err";
/** Handler that will invoke external process to run ongsml utilities */
protected ExternalProcessHandler processHandler;
/** filters to be applied to filepaths to enable processing by ExternalProcessHandler on different operating systems*/
protected FilepathFilter filepathFilter = null;
/** full path to onsgmls command */
protected String onsgmlsPath;
/** full path to sgmlnorm utility */
protected String sgmlnormPath;
/** options settings for OpenSp ongsmls command */
protected OnsgmlsOptions onsgmlsOptions;
/** options settings for OpenSp sgmlnorm command */
protected SgmlNormOptions sgmlnormOptions;
/** quote character to use in command string around source file path if it contains space(s)*/
protected String spaceEscapeChar;
/** ESIS file parser */
protected OnsgmlsOutputParser esisFileParser;
/** ERR (Message) file parser*/
protected OpenSpErrMessageParser messageParser;
/** OpenSP norm output file parser */
protected SgmlNormParser doctypeParser;
/** list of paths to any temporary files created by OpenSp to be deleted if JHOVE2 configured to delete temporary files */
protected List<String> tempFilePaths = new ArrayList<String>();
/**
* Invokes onsmgls processor to parse and validate the SGML
* source. Then invokes ANTLR-generated parser to accumulate
* feature information about the instance
* @param sgm SgmlModule instance invoking this method; module will have
* Source member
* @param jhove2 Application framework with configuratin information
* @param source Source to be parsed
* @throws JHOVE2Exception
*/
@Override
public SgmlDocumentProperties parseFile(SgmlModule sgm, JHOVE2 jhove2, Source source)
throws JHOVE2Exception {
SgmlDocumentProperties props;
String esisCommandParms = this.getOnsgmlsOptions().getOptionString();
if (sgm.getDocumentProperties()==null){
sgm.setDocumentProperties(new SgmlDocumentProperties());
}
props = sgm.getDocumentProperties();
if (props.getSgmlParserConfigSettings()==null){
props.setSgmlParserConfigSettings(new ArrayList<String>());
}
props.getSgmlParserConfigSettings().add("OpenSp onsgmls Options: ".concat(esisCommandParms));
String [] onsgmlOutputs =
this.parseSgmlFile(jhove2, source, ESIS_SUFFIX, this.onsgmlsPath, esisCommandParms, sgm);
String esisFilePath = onsgmlOutputs[0];
String esisErrFilePath = onsgmlOutputs[1];
if (esisFilePath != null && esisErrFilePath != null){
do {
// parse the ESIS output file from onsgmls to extract features of SGML file
File esisFile = new File(esisFilePath);
if (!esisFile.exists()){
Object[]messageArgs = new Object[]{esisFilePath};
Message message = new Message(
Severity.WARNING,
Context.OBJECT,
"org.jhove2.module.format.sgml.OpenSpWrapper.NoEsisFileFound",
messageArgs,
jhove2.getConfigInfo());
sgm.getSgmlParserWarningMessages().add(message);
}
else {
try {
props =
esisFileParser.parseEsisFile(esisFilePath, jhove2, source, sgm);
} catch (IOException e) {
continue; // message already attached to module
}
}
/** now parse the Message (.err) file created by OpenSP */
File messageFile = new File(esisErrFilePath);
if (!messageFile.exists()){
Object[]messageArgs = new Object[]{esisFilePath};
Message message = new Message(
Severity.WARNING,
Context.OBJECT,
"org.jhove2.module.format.sgml.OpenSpWrapper.NoErrFileFound",
messageArgs,
jhove2.getConfigInfo());
sgm.getSgmlParserWarningMessages().add(message);
}
else {
try {
messageParser.parseMessageFile(esisErrFilePath, jhove2, source, sgm);
} catch (IOException e) {
continue; // message already attached to module
}
}
} while (false);
}
return props;
}
/**
* Invokes OpenSp sgmlnorm utility to parse the SGML file. If the file can be successfully parsed,
* normalized output will contain a DOCTYPE statement
* @param sgm SgmlModule instance invoking this method; module will have
* Source member
* @param jhove2 JHOVE2 framework with config information
* @param source Source to be processed
* @throws JHOVE2Exception
*/
@Override
public void determineDoctype(SgmlModule sgm, JHOVE2 jhove2, Source source)
throws JHOVE2Exception {;
SgmlDocumentProperties props = sgm.getDocumentProperties();
if (props==null){
props = new SgmlDocumentProperties();
sgm.setDocumentProperties(props);
}
String normOptions = this.getSgmlnormOptions().getOptionString();
if (props.getSgmlParserConfigSettings()==null){
props.setSgmlParserConfigSettings(new ArrayList<String>());
}
props.getSgmlParserConfigSettings().add("OpenSp sgmlnorm Options: ".concat(normOptions));
String [] normOutputs =
this.parseSgmlFile(jhove2, source, NORM_SUFFIX, this.sgmlnormPath, normOptions, sgm);
String normOutPath = normOutputs[0];
String normErrPath =normOutputs[1];
if (normOutPath != null && normErrPath != null){
do {
File normFile = new File(normOutPath);
if (!normFile.exists()){
Object[]messageArgs = new Object[]{normOutPath};
Message message = new Message(
Severity.WARNING,
Context.OBJECT,
"org.jhove2.module.format.sgml.OpenSpWrapper.NoNormFileFound",
messageArgs,
jhove2.getConfigInfo());
sgm.getSgmlParserWarningMessages().add(message);
}
else {
try {
doctypeParser.parseNormFile(normOutPath, jhove2, source, sgm);
} catch (IOException e) {
continue; // message already attached to module
}
}
}while (false);
}
return;
}
@Override
public void cleanUp() throws JHOVE2Exception {
this.getOnsgmlsOptions().setFilepathFilter(null);
this.setOnsgmlsOptions(null);
this.getSgmlnormOptions().setFilepathFilter(null);
this.setSgmlnormOptions(null);
this.setProcessHandler(null);
this.setFilepathFilter(null);
this.setOnsgmlsPath(null);
this.setSgmlnormPath(null);
this.setEsisFileParser(null);
this.setMessageParser(null);
this.setDoctypeParser(null);
if (this.tempFilePaths != null){
for (String tempFilePath:this.tempFilePaths){
File tempFile = new File(tempFilePath);
if (tempFile.exists() && (tempFile.isFile())){
tempFile.delete();
}
}
for (int i=0; i<this.tempFilePaths.size(); i++){
String oldString = this.tempFilePaths.set(i, null);
oldString = null;
}
this.tempFilePaths = null;
}
}
/**
* Apply an OpenSp utility (onsmgls, sgmlnorm) to the SGML file
* @param jhove2 Framework with configuration information
* @param source Source to be parsed
* @param tempFileSuffix base suffix for onsgmls output and error temp files
* @param openSpCommand String containing full path to OpenSp command
* @param commandParms string containing parameters for OpenSp command
* @param sgmlModule sgml Module to which messages can be attached
* @return String[] containing path to .out and .err files Paths will be null if ExternalProcessor resulted in error
* @throws JHOVE2Exception
* @throws IOException
* @throws NoSuchShellEnvException
*/
protected String[] parseSgmlFile(JHOVE2 jhove2, Source source,
String tempFileSuffix, String openSpCommand, String commandParms, SgmlModule sgmlModule)
throws JHOVE2Exception {
String parseOutputFilePath = null;
String parseErrFilePath = null;
String standardErrFilePath = null;
File tempOutFile = null;
File tempErrFile = null;
File tempStdErrFile = null;
String sgmFilePath = null;
String [] onsgmlOutputs = {null, null};
File sgmFile = source.getFile();
do {
try {
sgmFilePath = sgmFile.getCanonicalPath();
if (this.filepathFilter != null){
sgmFilePath = this.filepathFilter.filter( sgmFilePath);
}
} catch (IOException e) {
String eMessage = e.getLocalizedMessage();
if (eMessage==null){
eMessage = "";
}
Object[]messageArgs = new Object[]{sgmFilePath, eMessage};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLSourcePath",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
}
if (sgmFilePath.contains(" ")){
sgmFilePath = spaceEscapeChar.concat(sgmFilePath).concat(spaceEscapeChar);
}
// create path names for the 2 output (output and err messages)
// files generated by OpenSp
try {
tempOutFile = File.createTempFile(
jhove2.getInvocation().getTempPrefix(),
jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix),
new File(jhove2.getInvocation().getTempDirectory()));
} catch (IOException e) {;
String eMessage = e.getLocalizedMessage();
if (eMessage==null){
eMessage = "";
}
Object[]messageArgs = new Object[]{eMessage};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLTempOutFile",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
}
if (jhove2.getInvocation().getDeleteTempFilesOnClose()){
try {
this.tempFilePaths.add(tempOutFile.getCanonicalPath());
}
catch (IOException e){}
}
parseOutputFilePath = tempOutFile.getPath();
if (this.filepathFilter != null){
parseOutputFilePath = this.filepathFilter.filter(parseOutputFilePath);
}
try {
tempErrFile = File.createTempFile(
jhove2.getInvocation().getTempPrefix(),
jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix).concat(".err"),
new File(jhove2.getInvocation().getTempDirectory()));
} catch (IOException e) {
String eMessage = e.getLocalizedMessage();
if (eMessage==null){
eMessage = "";
}
Object[]messageArgs = new Object[]{eMessage};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLTempErrFile",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
}
if (jhove2.getInvocation().getDeleteTempFilesOnClose()){
try {
this.tempFilePaths.add(tempErrFile.getCanonicalPath());
}
catch (IOException e){}
}
parseErrFilePath = tempErrFile.getPath();
if (this.filepathFilter != null){
parseErrFilePath = this.filepathFilter.filter(parseErrFilePath);
}
try{
tempStdErrFile = File.createTempFile(
jhove2.getInvocation().getTempPrefix(),
jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix).concat(TEMP_STD_ERR),
new File(jhove2.getInvocation().getTempDirectory()));
} catch (IOException e) {
String eMessage = e.getLocalizedMessage();
if (eMessage==null){
eMessage = "";
}
Object[]messageArgs = new Object[]{eMessage};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLStdErrFile2",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
}
if (jhove2.getInvocation().getDeleteTempFilesOnClose()){
try {
this.tempFilePaths.add(tempStdErrFile.getCanonicalPath());
}
catch (IOException e){}
}
standardErrFilePath = tempStdErrFile.getPath();
if (this.filepathFilter != null){
standardErrFilePath = this.filepathFilter.filter(standardErrFilePath);
}
StringBuffer sbCommand = new StringBuffer(openSpCommand);
sbCommand.append(" ");
sbCommand.append(commandParms);
sbCommand.append(" ");
sbCommand.append(OpenSpOptions.ERRFILEOPT);
sbCommand.append(parseErrFilePath);
sbCommand.append(" ");
sbCommand.append(sgmFilePath);
sbCommand.append(" > ");
sbCommand.append(parseOutputFilePath);
sbCommand.append(" 2>");
sbCommand.append(standardErrFilePath);
String command = sbCommand.toString();
try {
this.getProcessHandler().executeCommand(command);
} catch (NoSuchShellEnvException en) {
Object[]messageArgs = new Object[]{en.getMessage()};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessNoSuchShellCommand",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
} catch (JHOVE2Exception je){
String eType = je.getCause().getClass().getCanonicalName();
String eMessage = je.getCause().getLocalizedMessage();
Object[]messageArgs = new Object[]{command, eType, eMessage};
Message message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessException",
messageArgs,
jhove2.getConfigInfo());
sgmlModule.getSgmlParserErrorMessages().add(message);
continue;
}
// check for process error messages (OpenSp produces .err file of length > 0)
if (tempStdErrFile.exists() && tempStdErrFile.length()>0){
String standardErrText = null;
Message message = null;
Object[]messageArgs = null;
try {
standardErrText = ExternalProcessUtils.fileContentsToString(tempStdErrFile);
messageArgs = new Object[]{command, standardErrText};
message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessErrMessage",
messageArgs,
jhove2.getConfigInfo());
} catch (IOException e) {
String eMessage = e.getLocalizedMessage();
if (eMessage==null){
eMessage = "";
}
String eType = e.getClass().getCanonicalName();
messageArgs = new Object[]{command, eType, eMessage};
message = new Message(
Severity.ERROR,
Context.PROCESS,
"org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessErrMessageException",
messageArgs,
jhove2.getConfigInfo());
}
sgmlModule.getSgmlParserErrorMessages().add(message);
parseOutputFilePath = null;
parseErrFilePath = null;
}
onsgmlOutputs = new String[]{parseOutputFilePath, parseErrFilePath};
} while (false);
return onsgmlOutputs;
}
/**
* @return the processHandler
*/
public ExternalProcessHandler getProcessHandler() {
return processHandler;
}
/**
* @param processHandler the processHandler to set
*/
public void setProcessHandler(ExternalProcessHandler processHandler) {
this.processHandler = processHandler;
}
/**
* @return the onsgmlsPath
*/
public String getOnsgmlsPath() {
return onsgmlsPath;
}
/**
* @param onsgmlsPath the onsgmlsPath to set
*/
public void setOnsgmlsPath(String onsgmlsPath) {
this.onsgmlsPath = onsgmlsPath;
}
/**
* @return the filepathFilter
*/
public FilepathFilter getFilepathFilter() {
return filepathFilter;
}
/**
* @param filepathFilter the filepathFilter to set
*/
public void setFilepathFilter(FilepathFilter filepathFilter) {
this.filepathFilter = filepathFilter;
}
/**
* @return the sgmlnormPath
*/
public String getSgmlnormPath() {
return sgmlnormPath;
}
/**
* @param sgmlnormPath the sgmlnormPath to set
*/
public void setSgmlnormPath(String sgmlnormPath) {
this.sgmlnormPath = sgmlnormPath;
}
/**
* @return the onsgmlsOptions
*/
public OnsgmlsOptions getOnsgmlsOptions() {
return onsgmlsOptions;
}
/**
* @param onsgmlsOptions the onsgmlsOptions to set
*/
public void setOnsgmlsOptions(OnsgmlsOptions onsgmlsOptions) {
this.onsgmlsOptions = onsgmlsOptions;
}
/**
* @return the sgmlnormOptions
*/
public SgmlNormOptions getSgmlnormOptions() {
return sgmlnormOptions;
}
/**
* @param sgmlnormOptions the sgmlnormOptions to set
*/
public void setSgmlnormOptions(SgmlNormOptions sgmlnormOptions) {
this.sgmlnormOptions = sgmlnormOptions;
}
/**
* @return the spaceEscapeChar
*/
public String getSpaceEscapeChar() {
return spaceEscapeChar;
}
/**
* @param spaceEscapeChar the spaceEscapeChar to set
*/
public void setSpaceEscapeChar(String spaceEscapeChar) {
this.spaceEscapeChar = spaceEscapeChar;
}
/**
* @return the esisFileParser
*/
public OnsgmlsOutputParser getEsisFileParser() {
return esisFileParser;
}
/**
* @param esisFileParser the esisFileParser to set
*/
public void setEsisFileParser(OnsgmlsOutputParser esisFileParser) {
this.esisFileParser = esisFileParser;
}
/**
* @return the messageParser
*/
public OpenSpErrMessageParser getMessageParser() {
return messageParser;
}
/**
* @param messageParser the messageParser to set
*/
public void setMessageParser(OpenSpErrMessageParser messageParser) {
this.messageParser = messageParser;
}
/**
* @return the doctypeParser
*/
public SgmlNormParser getDoctypeParser() {
return doctypeParser;
}
/**
* @param doctypeParser the doctypeParser to set
*/
public void setDoctypeParser(SgmlNormParser doctypeParser) {
this.doctypeParser = doctypeParser;
}
}