/** * JHOVE2 - Next-generation architecture for format-aware characterization * * Copyright (c) 2009 by The Regents of the University of California, * Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford * Junior University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * o Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * o Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * o Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package org.jhove2.module.format.sgml; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jhove2.core.JHOVE2Exception; import org.jhove2.core.Message; import org.jhove2.core.Message.Context; import org.jhove2.core.Message.Severity; import org.jhove2.core.source.Source; import org.jhove2.core.JHOVE2; import org.jhove2.util.externalprocess.ExternalProcessHandler; import org.jhove2.util.externalprocess.ExternalProcessUtils; import org.jhove2.util.externalprocess.FilepathFilter; import org.jhove2.util.externalprocess.NoSuchShellEnvException; /** * Wrapper around OpenSP SGML parser and onsgmlNorm utility. * * After the SGML file is parsed by OpenSp the output (in ESIS format) * is parsed using an ANTLR-generated Java parser class. The grammar * has been decorated with Java members and methods to accumulate information * about the features of interest in the SGML file. The grammar accepts * the OpenSP indication as to whether or not the SGML instance conforms * to its DTD, and hence is to be considered valid. * * No DOCTYPE information is returned in the ESIS file. If the SGML module * is configured to ask for doctype, then we run the onsgml "normalization" * utility, and extract the doctype from the first line of its output. * * @author smorrissey * */ public class OpenSpWrapper implements SgmlParser { /** suffix to be used on files generated by onsmgl command */ public final String ESIS_SUFFIX = ".esis"; /** suffix to be used on files generated by sgmlnorm command */ public final String NORM_SUFFIX = ".norm"; /** suffix to be used for std err redirect for shell invocation */ public final String TEMP_STD_ERR = ".std.err"; /** Handler that will invoke external process to run ongsml utilities */ protected ExternalProcessHandler processHandler; /** filters to be applied to filepaths to enable processing by ExternalProcessHandler on different operating systems*/ protected FilepathFilter filepathFilter = null; /** full path to onsgmls command */ protected String onsgmlsPath; /** full path to sgmlnorm utility */ protected String sgmlnormPath; /** options settings for OpenSp ongsmls command */ protected OnsgmlsOptions onsgmlsOptions; /** options settings for OpenSp sgmlnorm command */ protected SgmlNormOptions sgmlnormOptions; /** quote character to use in command string around source file path if it contains space(s)*/ protected String spaceEscapeChar; /** ESIS file parser */ protected OnsgmlsOutputParser esisFileParser; /** ERR (Message) file parser*/ protected OpenSpErrMessageParser messageParser; /** OpenSP norm output file parser */ protected SgmlNormParser doctypeParser; /** list of paths to any temporary files created by OpenSp to be deleted if JHOVE2 configured to delete temporary files */ protected List<String> tempFilePaths = new ArrayList<String>(); /** * Invokes onsmgls processor to parse and validate the SGML * source. Then invokes ANTLR-generated parser to accumulate * feature information about the instance * @param sgm SgmlModule instance invoking this method; module will have * Source member * @param jhove2 Application framework with configuratin information * @param source Source to be parsed * @throws JHOVE2Exception */ @Override public SgmlDocumentProperties parseFile(SgmlModule sgm, JHOVE2 jhove2, Source source) throws JHOVE2Exception { SgmlDocumentProperties props; String esisCommandParms = this.getOnsgmlsOptions().getOptionString(); if (sgm.getDocumentProperties()==null){ sgm.setDocumentProperties(new SgmlDocumentProperties()); } props = sgm.getDocumentProperties(); if (props.getSgmlParserConfigSettings()==null){ props.setSgmlParserConfigSettings(new ArrayList<String>()); } props.getSgmlParserConfigSettings().add("OpenSp onsgmls Options: ".concat(esisCommandParms)); String [] onsgmlOutputs = this.parseSgmlFile(jhove2, source, ESIS_SUFFIX, this.onsgmlsPath, esisCommandParms, sgm); String esisFilePath = onsgmlOutputs[0]; String esisErrFilePath = onsgmlOutputs[1]; if (esisFilePath != null && esisErrFilePath != null){ do { // parse the ESIS output file from onsgmls to extract features of SGML file File esisFile = new File(esisFilePath); if (!esisFile.exists()){ Object[]messageArgs = new Object[]{esisFilePath}; Message message = new Message( Severity.WARNING, Context.OBJECT, "org.jhove2.module.format.sgml.OpenSpWrapper.NoEsisFileFound", messageArgs, jhove2.getConfigInfo()); sgm.getSgmlParserWarningMessages().add(message); } else { try { props = esisFileParser.parseEsisFile(esisFilePath, jhove2, source, sgm); } catch (IOException e) { continue; // message already attached to module } } /** now parse the Message (.err) file created by OpenSP */ File messageFile = new File(esisErrFilePath); if (!messageFile.exists()){ Object[]messageArgs = new Object[]{esisFilePath}; Message message = new Message( Severity.WARNING, Context.OBJECT, "org.jhove2.module.format.sgml.OpenSpWrapper.NoErrFileFound", messageArgs, jhove2.getConfigInfo()); sgm.getSgmlParserWarningMessages().add(message); } else { try { messageParser.parseMessageFile(esisErrFilePath, jhove2, source, sgm); } catch (IOException e) { continue; // message already attached to module } } } while (false); } return props; } /** * Invokes OpenSp sgmlnorm utility to parse the SGML file. If the file can be successfully parsed, * normalized output will contain a DOCTYPE statement * @param sgm SgmlModule instance invoking this method; module will have * Source member * @param jhove2 JHOVE2 framework with config information * @param source Source to be processed * @throws JHOVE2Exception */ @Override public void determineDoctype(SgmlModule sgm, JHOVE2 jhove2, Source source) throws JHOVE2Exception {; SgmlDocumentProperties props = sgm.getDocumentProperties(); if (props==null){ props = new SgmlDocumentProperties(); sgm.setDocumentProperties(props); } String normOptions = this.getSgmlnormOptions().getOptionString(); if (props.getSgmlParserConfigSettings()==null){ props.setSgmlParserConfigSettings(new ArrayList<String>()); } props.getSgmlParserConfigSettings().add("OpenSp sgmlnorm Options: ".concat(normOptions)); String [] normOutputs = this.parseSgmlFile(jhove2, source, NORM_SUFFIX, this.sgmlnormPath, normOptions, sgm); String normOutPath = normOutputs[0]; String normErrPath =normOutputs[1]; if (normOutPath != null && normErrPath != null){ do { File normFile = new File(normOutPath); if (!normFile.exists()){ Object[]messageArgs = new Object[]{normOutPath}; Message message = new Message( Severity.WARNING, Context.OBJECT, "org.jhove2.module.format.sgml.OpenSpWrapper.NoNormFileFound", messageArgs, jhove2.getConfigInfo()); sgm.getSgmlParserWarningMessages().add(message); } else { try { doctypeParser.parseNormFile(normOutPath, jhove2, source, sgm); } catch (IOException e) { continue; // message already attached to module } } }while (false); } return; } @Override public void cleanUp() throws JHOVE2Exception { this.getOnsgmlsOptions().setFilepathFilter(null); this.setOnsgmlsOptions(null); this.getSgmlnormOptions().setFilepathFilter(null); this.setSgmlnormOptions(null); this.setProcessHandler(null); this.setFilepathFilter(null); this.setOnsgmlsPath(null); this.setSgmlnormPath(null); this.setEsisFileParser(null); this.setMessageParser(null); this.setDoctypeParser(null); if (this.tempFilePaths != null){ for (String tempFilePath:this.tempFilePaths){ File tempFile = new File(tempFilePath); if (tempFile.exists() && (tempFile.isFile())){ tempFile.delete(); } } for (int i=0; i<this.tempFilePaths.size(); i++){ String oldString = this.tempFilePaths.set(i, null); oldString = null; } this.tempFilePaths = null; } } /** * Apply an OpenSp utility (onsmgls, sgmlnorm) to the SGML file * @param jhove2 Framework with configuration information * @param source Source to be parsed * @param tempFileSuffix base suffix for onsgmls output and error temp files * @param openSpCommand String containing full path to OpenSp command * @param commandParms string containing parameters for OpenSp command * @param sgmlModule sgml Module to which messages can be attached * @return String[] containing path to .out and .err files Paths will be null if ExternalProcessor resulted in error * @throws JHOVE2Exception * @throws IOException * @throws NoSuchShellEnvException */ protected String[] parseSgmlFile(JHOVE2 jhove2, Source source, String tempFileSuffix, String openSpCommand, String commandParms, SgmlModule sgmlModule) throws JHOVE2Exception { String parseOutputFilePath = null; String parseErrFilePath = null; String standardErrFilePath = null; File tempOutFile = null; File tempErrFile = null; File tempStdErrFile = null; String sgmFilePath = null; String [] onsgmlOutputs = {null, null}; File sgmFile = source.getFile(); do { try { sgmFilePath = sgmFile.getCanonicalPath(); if (this.filepathFilter != null){ sgmFilePath = this.filepathFilter.filter( sgmFilePath); } } catch (IOException e) { String eMessage = e.getLocalizedMessage(); if (eMessage==null){ eMessage = ""; } Object[]messageArgs = new Object[]{sgmFilePath, eMessage}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLSourcePath", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } if (sgmFilePath.contains(" ")){ sgmFilePath = spaceEscapeChar.concat(sgmFilePath).concat(spaceEscapeChar); } // create path names for the 2 output (output and err messages) // files generated by OpenSp try { tempOutFile = File.createTempFile( jhove2.getInvocation().getTempPrefix(), jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix), new File(jhove2.getInvocation().getTempDirectory())); } catch (IOException e) {; String eMessage = e.getLocalizedMessage(); if (eMessage==null){ eMessage = ""; } Object[]messageArgs = new Object[]{eMessage}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLTempOutFile", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } if (jhove2.getInvocation().getDeleteTempFilesOnClose()){ try { this.tempFilePaths.add(tempOutFile.getCanonicalPath()); } catch (IOException e){} } parseOutputFilePath = tempOutFile.getPath(); if (this.filepathFilter != null){ parseOutputFilePath = this.filepathFilter.filter(parseOutputFilePath); } try { tempErrFile = File.createTempFile( jhove2.getInvocation().getTempPrefix(), jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix).concat(".err"), new File(jhove2.getInvocation().getTempDirectory())); } catch (IOException e) { String eMessage = e.getLocalizedMessage(); if (eMessage==null){ eMessage = ""; } Object[]messageArgs = new Object[]{eMessage}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLTempErrFile", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } if (jhove2.getInvocation().getDeleteTempFilesOnClose()){ try { this.tempFilePaths.add(tempErrFile.getCanonicalPath()); } catch (IOException e){} } parseErrFilePath = tempErrFile.getPath(); if (this.filepathFilter != null){ parseErrFilePath = this.filepathFilter.filter(parseErrFilePath); } try{ tempStdErrFile = File.createTempFile( jhove2.getInvocation().getTempPrefix(), jhove2.getInvocation().getTempSuffix().concat(tempFileSuffix).concat(TEMP_STD_ERR), new File(jhove2.getInvocation().getTempDirectory())); } catch (IOException e) { String eMessage = e.getLocalizedMessage(); if (eMessage==null){ eMessage = ""; } Object[]messageArgs = new Object[]{eMessage}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.IOExceptionForSGMLStdErrFile2", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } if (jhove2.getInvocation().getDeleteTempFilesOnClose()){ try { this.tempFilePaths.add(tempStdErrFile.getCanonicalPath()); } catch (IOException e){} } standardErrFilePath = tempStdErrFile.getPath(); if (this.filepathFilter != null){ standardErrFilePath = this.filepathFilter.filter(standardErrFilePath); } StringBuffer sbCommand = new StringBuffer(openSpCommand); sbCommand.append(" "); sbCommand.append(commandParms); sbCommand.append(" "); sbCommand.append(OpenSpOptions.ERRFILEOPT); sbCommand.append(parseErrFilePath); sbCommand.append(" "); sbCommand.append(sgmFilePath); sbCommand.append(" > "); sbCommand.append(parseOutputFilePath); sbCommand.append(" 2>"); sbCommand.append(standardErrFilePath); String command = sbCommand.toString(); try { this.getProcessHandler().executeCommand(command); } catch (NoSuchShellEnvException en) { Object[]messageArgs = new Object[]{en.getMessage()}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessNoSuchShellCommand", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } catch (JHOVE2Exception je){ String eType = je.getCause().getClass().getCanonicalName(); String eMessage = je.getCause().getLocalizedMessage(); Object[]messageArgs = new Object[]{command, eType, eMessage}; Message message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessException", messageArgs, jhove2.getConfigInfo()); sgmlModule.getSgmlParserErrorMessages().add(message); continue; } // check for process error messages (OpenSp produces .err file of length > 0) if (tempStdErrFile.exists() && tempStdErrFile.length()>0){ String standardErrText = null; Message message = null; Object[]messageArgs = null; try { standardErrText = ExternalProcessUtils.fileContentsToString(tempStdErrFile); messageArgs = new Object[]{command, standardErrText}; message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessErrMessage", messageArgs, jhove2.getConfigInfo()); } catch (IOException e) { String eMessage = e.getLocalizedMessage(); if (eMessage==null){ eMessage = ""; } String eType = e.getClass().getCanonicalName(); messageArgs = new Object[]{command, eType, eMessage}; message = new Message( Severity.ERROR, Context.PROCESS, "org.jhove2.module.format.sgml.OpenSpWrapper.externalProcessErrMessageException", messageArgs, jhove2.getConfigInfo()); } sgmlModule.getSgmlParserErrorMessages().add(message); parseOutputFilePath = null; parseErrFilePath = null; } onsgmlOutputs = new String[]{parseOutputFilePath, parseErrFilePath}; } while (false); return onsgmlOutputs; } /** * @return the processHandler */ public ExternalProcessHandler getProcessHandler() { return processHandler; } /** * @param processHandler the processHandler to set */ public void setProcessHandler(ExternalProcessHandler processHandler) { this.processHandler = processHandler; } /** * @return the onsgmlsPath */ public String getOnsgmlsPath() { return onsgmlsPath; } /** * @param onsgmlsPath the onsgmlsPath to set */ public void setOnsgmlsPath(String onsgmlsPath) { this.onsgmlsPath = onsgmlsPath; } /** * @return the filepathFilter */ public FilepathFilter getFilepathFilter() { return filepathFilter; } /** * @param filepathFilter the filepathFilter to set */ public void setFilepathFilter(FilepathFilter filepathFilter) { this.filepathFilter = filepathFilter; } /** * @return the sgmlnormPath */ public String getSgmlnormPath() { return sgmlnormPath; } /** * @param sgmlnormPath the sgmlnormPath to set */ public void setSgmlnormPath(String sgmlnormPath) { this.sgmlnormPath = sgmlnormPath; } /** * @return the onsgmlsOptions */ public OnsgmlsOptions getOnsgmlsOptions() { return onsgmlsOptions; } /** * @param onsgmlsOptions the onsgmlsOptions to set */ public void setOnsgmlsOptions(OnsgmlsOptions onsgmlsOptions) { this.onsgmlsOptions = onsgmlsOptions; } /** * @return the sgmlnormOptions */ public SgmlNormOptions getSgmlnormOptions() { return sgmlnormOptions; } /** * @param sgmlnormOptions the sgmlnormOptions to set */ public void setSgmlnormOptions(SgmlNormOptions sgmlnormOptions) { this.sgmlnormOptions = sgmlnormOptions; } /** * @return the spaceEscapeChar */ public String getSpaceEscapeChar() { return spaceEscapeChar; } /** * @param spaceEscapeChar the spaceEscapeChar to set */ public void setSpaceEscapeChar(String spaceEscapeChar) { this.spaceEscapeChar = spaceEscapeChar; } /** * @return the esisFileParser */ public OnsgmlsOutputParser getEsisFileParser() { return esisFileParser; } /** * @param esisFileParser the esisFileParser to set */ public void setEsisFileParser(OnsgmlsOutputParser esisFileParser) { this.esisFileParser = esisFileParser; } /** * @return the messageParser */ public OpenSpErrMessageParser getMessageParser() { return messageParser; } /** * @param messageParser the messageParser to set */ public void setMessageParser(OpenSpErrMessageParser messageParser) { this.messageParser = messageParser; } /** * @return the doctypeParser */ public SgmlNormParser getDoctypeParser() { return doctypeParser; } /** * @param doctypeParser the doctypeParser to set */ public void setDoctypeParser(SgmlNormParser doctypeParser) { this.doctypeParser = doctypeParser; } }