/** * JHOVE2 - Next-generation architecture for format-aware characterization * * Copyright (c) 2009 by The Regents of the University of California, * Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford * Junior University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * o Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * o Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * o Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/
package org.jhove2.module.format.sgml;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.jhove2.core.JHOVE2;
import org.jhove2.core.JHOVE2Exception;
import org.jhove2.core.source.Source;
import org.jhove2.util.CopyUtils;

/**
 * Parser for the .out (ESIS) files produced by the OpenSP onsgmls tool.
 * Please see the module specification document for details of the ESIS file format.
 * <p>
 * The parser reads the file line by line, classifies each line by its leading
 * command character, and tallies counts and names that are afterwards copied
 * into a {@link SgmlDocumentProperties} object.
 * <p>
 * All per-parse state is reset at the start of every call to
 * {@link #parseEsisFile(String, JHOVE2, Source, SgmlModule)}, so one instance
 * can be used for several files in sequence. The class keeps its working state
 * in instance fields without any synchronization.
 *
 * @author smorrissey
 */
public class EsisFileParser implements OnsgmlsOutputParser {

    // characteristics of interest to the JHOVE2 SGML module

    /** Is this a conforming SGML file; indicated by presence of "C" command at end of file. */
    protected boolean isSgmlValid = false;
    /** Element name of the first start-element command. */
    protected String rootElementName;
    /** Problems detected while reading the ESIS file. */
    protected List<String> esisParseErrors;

    protected int publicIdCount = 0;
    protected int fileNamesCount = 0;
    protected int sysidsCount = 0;
    protected int extTextEntCount = 0;
    protected int notatDefCount = 0;
    protected int extDataEntCount = 0;
    protected int entrefCount = 0;
    protected int intDataEntCount = 0;
    protected int subDocEntityDefCount = 0;
    protected int subDocCommandCount = 0;
    protected int omitCommandCount = 0;
    protected int elementAttributeCount = 0;
    protected int dataAttrCount = 0;
    protected int linkAttrCount = 0;
    /** Number of start element commands (same element can occur more than once). */
    protected int elementCount = 0;
    protected int dataCount = 0;
    protected int includedSubElementsCount = 0;
    protected int emptyElementsCount = 0;
    protected int commentsCount = 0;
    protected int sDataCount = 0;
    protected int piCount = 0;
    protected int appInfoCount = 0;

    /** Distinct element names seen in start-element commands. */
    protected SortedSet<String> elementNames;
    /** Distinct names seen in entity-reference ("&amp;") commands. */
    protected SortedSet<String> entRefNames;
    /** Distinct SDATA entity texts found in data and attribute commands. */
    protected SortedSet<String> sdataNames;

    /** ESIS Command characters (trailing comments give decimal and hex code points) */
    protected String POUND      = "#";  //35  x23
    protected String AMP        = "&";  //38  x26
    protected String LEFTPAREN  = "(";  //40  x28
    protected String RIGHTPAREN = ")";  //41  x29
    protected String DASH       = "-";  //45  x2D
    protected String PI         = "?";  //63  x3F
    protected String ACMD       = "A";  //65  x41
    protected String CCMD       = "C";  //67  x43
    protected String DCMD       = "D";  //68  x44
    protected String ECMD       = "E";  //69  x45
    protected String ICMD       = "I";  //73  x49
    protected String LCMD       = "L";  //76  x4C
    protected String NCMD       = "N";  //78  x4E
    protected String SCMD       = "S";  //83  x53
    protected String TCMD       = "T";  //84  x54
    protected String UNDER      = "_";  //95  x5F
    protected String LACMD      = "a";  //97  x61
    protected String LECMD      = "e";  //101 x65
    protected String FCMD       = "f";  //102 x66
    protected String LCICMD     = "i";  //105 x69
    protected String OCMD       = "o";  //111 x6f
    protected String PCMD       = "p";  //112 x70
    protected String LSCMD      = "s";  //115 x73
    protected String LEFTBRACE  = "{";  //123 x7B
    protected String RIGHTBRACE = "}";  //125 x7d
    /**
     * Two-character sequence (backslash, vertical bar) that brackets SDATA
     * entity text. It is matched literally with {@code indexOf}, not as a
     * regular expression.
     */
    protected String SDATADELIM = "\\|";

    /**
     * Creates a parser with empty error list and empty name sets.
     */
    public EsisFileParser() {
        super();
        isSgmlValid = false;
        rootElementName = null;
        this.setEsisParseErrors(new LinkedList<String>());
        this.setElementNames(new TreeSet<String>());
        this.setEntRefNames(new TreeSet<String>());
        this.setSdataNames(new TreeSet<String>());
    }

    /**
     * Reads the ESIS file at {@code esisPath} (decoded as UTF-8), tallies its
     * commands, and copies the results into the document properties of
     * {@code sgm}, creating that properties object if the module has none yet.
     * <p>
     * The file is considered valid only if a "C" command was seen and no
     * non-empty line follows it. Every non-empty line after the "C" command
     * adds one entry to the parse errors.
     *
     * @param esisPath path of the ESIS file to read
     * @param jhove2   framework object (not used by this implementation)
     * @param source   source being characterized (not used by this implementation)
     * @param sgm      module whose document properties receive the results
     * @return the document properties object held by {@code sgm}, updated
     * @throws JHOVE2Exception declared by the interface; not thrown directly here
     * @throws IOException if the file cannot be opened or read
     * @see org.jhove2.module.format.sgml.OnsgmlsOutputParser#parseEsisFile(java.lang.String, org.jhove2.core.JHOVE2, org.jhove2.core.source.Source, org.jhove2.module.format.sgml.SgmlModule)
     */
    @Override
    public SgmlDocumentProperties parseEsisFile(String esisPath, JHOVE2 jhove2, Source source, SgmlModule sgm)
            throws JHOVE2Exception, IOException {
        // Start from a clean slate so that a reused parser never reports
        // counts, names, or validity left over from an earlier file.
        resetParseState();

        BufferedReader onsgmlsOutput = null;
        boolean foundCInOutput = false;
        boolean foundDataAfterCInInput = false;
        try {
            onsgmlsOutput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(esisPath), "utf-8"));
            if (onsgmlsOutput.ready()) {
                // read through output to get counts, start element, sdata entities, entity references
                String line;
                while ((line = onsgmlsOutput.readLine()) != null) {
                    if (foundCInOutput) {
                        // readLine() does not return line-termination characters,
                        // so a blank trailing line has length zero and is tolerated.
                        if (line.length() != 0) {
                            esisParseErrors.add(OnsgmlsOutputParser.ESISERR
                                    + "Content found Conforming File Message");
                            foundDataAfterCInInput = true;
                        }
                        continue;
                    }
                    if (line.startsWith(CCMD)) {
                        foundCInOutput = true;
                    } else {
                        tallyCommand(line);
                    }
                }
            }
        } finally {
            if (onsgmlsOutput != null) {
                onsgmlsOutput.close();
            }
        }

        // Assign unconditionally: a previous "true" must not survive a
        // later non-conforming file.
        isSgmlValid = foundCInOutput && !foundDataAfterCInInput;

        if (sgm.getDocumentProperties() == null) {
            sgm.setDocumentProperties(new SgmlDocumentProperties());
        }
        SgmlDocumentProperties props = sgm.getDocumentProperties();
        this.extractDocProperties(props);
        return props;
    }

    /**
     * Resets every per-parse field: zeroes the counters, clears validity and
     * root element name, and makes sure the error list and the name sets
     * exist and are empty. {@link #extractDocProperties(SgmlDocumentProperties)}
     * leaves the error list {@code null}, so it is recreated here.
     */
    private void resetParseState() {
        isSgmlValid = false;
        rootElementName = null;

        publicIdCount = 0;
        fileNamesCount = 0;
        sysidsCount = 0;
        extTextEntCount = 0;
        notatDefCount = 0;
        extDataEntCount = 0;
        entrefCount = 0;
        intDataEntCount = 0;
        subDocEntityDefCount = 0;
        subDocCommandCount = 0;
        omitCommandCount = 0;
        elementAttributeCount = 0;
        dataAttrCount = 0;
        linkAttrCount = 0;
        elementCount = 0;
        dataCount = 0;
        includedSubElementsCount = 0;
        emptyElementsCount = 0;
        commentsCount = 0;
        sDataCount = 0;
        piCount = 0;
        appInfoCount = 0;

        if (esisParseErrors == null) {
            esisParseErrors = new LinkedList<String>();
        } else {
            esisParseErrors.clear();
        }
        if (elementNames == null) {
            elementNames = new TreeSet<String>();
        } else {
            elementNames.clear();
        }
        if (entRefNames == null) {
            entRefNames = new TreeSet<String>();
        } else {
            entRefNames.clear();
        }
        if (sdataNames == null) {
            sdataNames = new TreeSet<String>();
        } else {
            sdataNames.clear();
        }
    }

    /**
     * Updates the counters and name sets for one ESIS line, selected by the
     * line's leading command character. Lines that match no known command
     * (including empty lines) are ignored. The "C" command is handled by the
     * caller.
     *
     * @param line one line of the ESIS file, without line terminator
     */
    private void tallyCommand(String line) {
        if (line.startsWith(ACMD)) {
            elementAttributeCount++;
            this.checkForSdataEntities(line);
        } else if (line.startsWith(DCMD)) {
            dataAttrCount++;
        } else if (line.startsWith(LACMD)) {
            linkAttrCount++;
        } else if (line.startsWith(LEFTPAREN)) {
            String elementName = line.substring(1);
            if (elementCount == 0) {
                // the first start-element command names the root element
                rootElementName = elementName;
            }
            elementCount++;
            this.elementNames.add(elementName);
        } else if (line.startsWith(DASH)) {
            dataCount++;
            this.checkForSdataEntities(line);
        } else if (line.startsWith(RIGHTPAREN)) {
            // end of element: nothing to count
        } else if (line.startsWith(AMP)) {
            entrefCount++;
            entRefNames.add(line.substring(1));
        } else if (line.startsWith(PI)) {
            piCount++;
        } else if (line.startsWith(NCMD)) {
            notatDefCount++;
        } else if (line.startsWith(ECMD)) {
            extDataEntCount++;
        } else if (line.startsWith(ICMD)) {
            intDataEntCount++;
        } else if (line.startsWith(SCMD)) {
            subDocEntityDefCount++;
        } else if (line.startsWith(TCMD)) {
            extTextEntCount++;
        } else if (line.startsWith(LSCMD)) {
            sysidsCount++;
        } else if (line.startsWith(PCMD)) {
            publicIdCount++;
        } else if (line.startsWith(FCMD)) {
            fileNamesCount++;
        } else if (line.startsWith(LEFTBRACE)) {
            subDocCommandCount++;
        } else if (line.startsWith(RIGHTBRACE)) {
            // closing brace command: nothing to count
        } else if (line.startsWith(LCMD)) {
            // "L" command: nothing to count
        } else if (line.startsWith(POUND)) {
            appInfoCount++;
        } else if (line.startsWith(LCICMD)) {
            includedSubElementsCount++;
        } else if (line.startsWith(LECMD)) {
            emptyElementsCount++;
        } else if (line.startsWith(UNDER)) {
            commentsCount++;
        } else if (line.startsWith(OCMD)) {
            omitCommandCount++;
        }
    }

    /**
     * Copies the values gathered by this ESIS parser into the given
     * SgmlDocumentProperties object. The collections are handed over through
     * the {@code CopyUtils.copyAndClear*} helpers (presumably copy-then-empty;
     * see CopyUtils), the error list of this parser is set to {@code null},
     * and the root element name is cleared.
     *
     * @param props SgmlDocumentProperties object to update with the content of this parser's fields
     */
    protected void extractDocProperties(SgmlDocumentProperties props) {
        if (getEsisParseErrors() != null) {
            if (props.getParseErrors() == null) {
                props.setParseErrors(new ArrayList<String>());
            }
            props.getParseErrors().addAll(
                    CopyUtils.copyAndClearList(getEsisParseErrors()));
            setEsisParseErrors(null);
        }
        props.setAppInfoCount(appInfoCount);
        props.setCommentsCount(commentsCount);
        props.setDataAttrCount(dataAttrCount);
        props.setDataCount(dataCount);
        props.setElementAttributeCount(elementAttributeCount);
        props.setElementCount(elementCount);
        props.setElementNames(CopyUtils.copyAndClearSet(elementNames));
        props.setEmptyElementsCount(emptyElementsCount);
        props.setEntRefNames(CopyUtils.copyAndClearSet(entRefNames));
        props.setEntityFileNamesCount(fileNamesCount);
        props.setEntrefCount(entrefCount);
        props.setExtDataEntCount(extDataEntCount);
        props.setExtTextEntCount(extTextEntCount);
        props.setIncludedSubElementsCount(includedSubElementsCount);
        props.setIntDataEntCount(intDataEntCount);
        props.setLinkAttrCount(linkAttrCount);
        props.setNotatDefCount(notatDefCount);
        props.setOmitCommandCount(omitCommandCount);
        props.setProcessingInstructionsCount(piCount);
        props.setPublicIdCount(publicIdCount);
        // String is immutable, so the reference can be handed over directly.
        props.setRootElementName(rootElementName);
        rootElementName = null;
        props.setsDataCount(sDataCount);
        props.setSdataNames(CopyUtils.copyAndClearSet(sdataNames));
        props.setSubDocCommandCount(subDocCommandCount);
        props.setSubDocEntityDefCount(subDocEntityDefCount);
        props.setSysidsCount(sysidsCount);
        props.setSgmlValid(isSgmlValid);
    }

    /**
     * Inspects the contents of a data (element content) or element attribute
     * command for SDATA entities. Each non-empty text found between a pair of
     * {@link #SDATADELIM} sequences increments {@code sDataCount} and is
     * added to {@code sdataNames}.
     * <p>
     * Scanning of the line stops at an opening delimiter that has no closing
     * delimiter, or at a pair that encloses no text.
     * NOTE(review): stopping at an empty pair means later entities on the same
     * line go uncounted; confirm whether empty SDATA text can occur in practice.
     *
     * @param tempMessage the ESIS line to inspect; {@code null} is ignored
     */
    protected void checkForSdataEntities(String tempMessage) {
        if (tempMessage == null) {
            return;
        }
        final int delimLength = SDATADELIM.length();
        int searchFrom = 0;
        while (true) {
            int open = tempMessage.indexOf(SDATADELIM, searchFrom);
            if (open < 0) {
                return;
            }
            int textStart = open + delimLength;
            int close = tempMessage.indexOf(SDATADELIM, textStart);
            // close == -1: no closing delimiter; close == textStart: empty text
            if (close <= textStart) {
                return;
            }
            sDataCount++;
            sdataNames.add(tempMessage.substring(textStart, close));
            searchFrom = close + delimLength;
        }
    }

    /**
     * @return the esisParseErrors; {@code null} once the errors have been
     *         extracted into a properties object
     */
    public List<String> getEsisParseErrors() {
        return esisParseErrors;
    }

    /**
     * @param esisParseErrors the esisParseErrors to set
     */
    public void setEsisParseErrors(List<String> esisParseErrors) {
        this.esisParseErrors = esisParseErrors;
    }

    /**
     * @return the entRefNames
     */
    public SortedSet<String> getEntRefNames() {
        return entRefNames;
    }

    /**
     * @param entRefNames the entRefNames to set
     */
    public void setEntRefNames(SortedSet<String> entRefNames) {
        this.entRefNames = entRefNames;
    }

    /**
     * @return the sdataNames
     */
    public SortedSet<String> getSdataNames() {
        return sdataNames;
    }

    /**
     * @param sdataNames the sdataNames to set
     */
    public void setSdataNames(SortedSet<String> sdataNames) {
        this.sdataNames = sdataNames;
    }

    /**
     * @return the elementNames
     */
    public SortedSet<String> getElementNames() {
        return elementNames;
    }

    /**
     * @param elementNames the elementNames to set
     */
    public void setElementNames(SortedSet<String> elementNames) {
        this.elementNames = elementNames;
    }
}