/** * JHOVE2 - Next-generation architecture for format-aware characterization * * Copyright (c) 2009 by The Regents of the University of California, * Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford * Junior University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * o Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * o Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * o Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

package org.jhove2.module.format.sgml;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.jhove2.core.JHOVE2;
import org.jhove2.core.JHOVE2Exception;
import org.jhove2.core.source.Source;

/**
 * Class for parsing output of the OpenSP sgmlnorm utility. This utility is run in order to
 * extract the DOCTYPE statement and any public or system identifier from an SGML document.
 *
 * Please see the SGML module specification for details about running this utility.
 *
 * Parse results are held in instance fields while a file is being processed, so a single
 * instance must not be used to parse several files concurrently.
 *
 * @author smorrissey
 *
 */
public class SgmlNormFileParser implements SgmlNormParser {

    /** Upper-case prefix that marks a DOCTYPE statement line. */
    protected String DOCTYPE = "<!DOCTYPE";
    /** Lower-case prefix that marks a DOCTYPE statement line. */
    protected String DOCTYPELC = "<!doctype";
    /** Upper-case keyword that introduces a public identifier. */
    protected String PUBLIC = "PUBLIC";
    /** Lower-case keyword that introduces a public identifier. */
    protected String PUBLICLC = "public";
    /** Upper-case keyword that introduces a system identifier. */
    protected String SYSTEM = "SYSTEM";
    /** Lower-case keyword that introduces a system identifier. */
    protected String SYSTEMLC = "system";
    /** Double-quote delimiter for identifier literals. */
    protected String QUOTE = "\"";
    /** Single-quote delimiter for identifier literals. */
    protected String SQUOTE = "'";

    /** True once a line starting with a DOCTYPE prefix has been seen in the current file. */
    protected boolean foundDoctype = false;
    /** True if a quoted public identifier was extracted from the DOCTYPE statement. */
    protected boolean foundPubid = false;
    /** True if a quoted system identifier was extracted from the DOCTYPE statement. */
    protected boolean foundSysid = false;
    /** Public identifier text (without quotes), or null if none was found. */
    protected String pubid = null;
    /** System identifier text (without quotes), or null if none was found. */
    protected String systemId = null;

    /**
     * Creates a parser with no parse results recorded.
     */
    public SgmlNormFileParser() {
        super();
    }

    /**
     * Reads the sgmlnorm output file line by line (decoded as UTF-8), inspects the first line
     * that starts with a DOCTYPE prefix for a public and a system identifier, and stores the
     * results in the {@link SgmlDocumentProperties} of the given module (creating that object
     * if the module has none yet).
     *
     * @param normFilepath path of the file containing the sgmlnorm output
     * @param jhove2 JHOVE2 framework object (not used by this implementation)
     * @param source source unit being characterized (not used by this implementation)
     * @param sgm SGML module whose document properties receive the parse results
     * @throws JHOVE2Exception declared by the interface; not thrown directly by this method
     * @throws IOException if the file cannot be opened or read
     * @see org.jhove2.module.format.sgml.SgmlNormParser#parseNormFile(java.lang.String, org.jhove2.core.JHOVE2, org.jhove2.core.source.Source, org.jhove2.module.format.sgml.SgmlModule)
     */
    @Override
    public void parseNormFile(String normFilepath, JHOVE2 jhove2, Source source, SgmlModule sgm)
            throws JHOVE2Exception, IOException {
        // Start from a clean slate so that results of an earlier call on this
        // instance cannot leak into (or suppress parsing of) the current file.
        this.foundDoctype = false;
        this.foundPubid = false;
        this.foundSysid = false;
        this.pubid = null;
        this.systemId = null;

        BufferedReader sgmlNormOutput = null;
        try {
            sgmlNormOutput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(normFilepath), "utf-8"));
            String line;
            // readLine() returns null at end of stream, so an empty file simply
            // leaves all "found" flags false.
            while ((line = sgmlNormOutput.readLine()) != null) {
                if (line.startsWith(DOCTYPE) || line.startsWith(DOCTYPELC)) {
                    this.foundDoctype = true;
                    int endPubId = this.lookForPubid(line);
                    // When a public identifier was found, only the text after its
                    // closing quote may contain the system identifier; otherwise
                    // the whole statement is searched.
                    String restOfMessage = line;
                    if (endPubId > -1) {
                        restOfMessage = line.substring(endPubId + 1);
                    }
                    this.lookForSystemId(restOfMessage);
                    // Only the first DOCTYPE line is of interest.
                    break;
                }
            }
        } finally {
            if (sgmlNormOutput != null) {
                sgmlNormOutput.close();
            }
        }
        if (sgm.getDocumentProperties() == null) {
            sgm.setDocumentProperties(new SgmlDocumentProperties());
        }
        this.extractDocProperties(sgm.getDocumentProperties());
    }

    /**
     * Copies the properties obtained from the parsed file into the SgmlDocumentProperties
     * object associated with the file, then clears the stored identifier strings.
     *
     * @param props properties object to be updated
     */
    protected void extractDocProperties(SgmlDocumentProperties props) {
        props.setFoundDoctype(this.foundDoctype);
        props.setPublicIdentifier(this.pubid);
        props.setFoundPubid(this.foundPubid);
        props.setSystemIdentifier(this.systemId);
        props.setFoundSysid(this.foundSysid);
        // Strings are immutable, so handing over the references is safe;
        // drop our own references once they have been transferred.
        this.pubid = null;
        this.systemId = null;
    }

    /**
     * Parse String containing DOCTYPE statement to see if there is a public identifier.
     * If one is found, set foundPubid to true and store the identifier (without its
     * quotes) in field "pubid".
     *
     * @param tempMessage String containing DOCTYPE statement
     * @return position of terminal single or double quote of the public identifier,
     *         or -1 if no public identifier found
     */
    protected int lookForPubid(String tempMessage) {
        int keywordAt = this.indexOfKeyword(tempMessage, PUBLIC, PUBLICLC);
        if (keywordAt < 0) {
            return -1;
        }
        int[] literal = this.findQuotedLiteral(tempMessage, keywordAt);
        if (literal == null) {
            return -1;
        }
        this.foundPubid = true;
        this.pubid = tempMessage.substring(literal[0], literal[1]);
        return literal[1];
    }

    /**
     * Parse String containing DOCTYPE statement to see if it contains a system identifier.
     * If one is found, set foundSysid to true and store the identifier (without its
     * quotes) in field "systemId".
     *
     * @param restOfMessage String containing DOCTYPE statement (or the part of it that
     *        follows the public identifier)
     */
    protected void lookForSystemId(String restOfMessage) {
        int keywordAt = this.indexOfKeyword(restOfMessage, SYSTEM, SYSTEMLC);
        if (keywordAt < 0) {
            return;
        }
        int[] literal = this.findQuotedLiteral(restOfMessage, keywordAt);
        if (literal == null) {
            return;
        }
        this.foundSysid = true;
        this.systemId = restOfMessage.substring(literal[0], literal[1]);
    }

    /**
     * Finds a keyword in either of its two spellings; the upper-case spelling is tried first.
     *
     * @param text text to search
     * @param upper upper-case spelling of the keyword
     * @param lower lower-case spelling of the keyword
     * @return index of the keyword in text, or -1 if neither spelling occurs
     */
    private int indexOfKeyword(String text, String upper, String lower) {
        int at = text.indexOf(upper);
        if (at < 0) {
            at = text.indexOf(lower);
        }
        return at;
    }

    /**
     * Locates the first quoted literal at or after the given position. The literal is
     * delimited by whichever of the double or single quote occurs first, and is closed by
     * the next occurrence of that same quote.
     *
     * @param text text to search
     * @param fromIndex position at which to start looking for the opening quote
     * @return two-element array holding the index of the first character of the literal's
     *         content and the index of its closing quote, or null if there is no opening
     *         quote or the literal is not closed
     */
    private int[] findQuotedLiteral(String text, int fromIndex) {
        int doubleAt = text.indexOf(QUOTE, fromIndex);
        int singleAt = text.indexOf(SQUOTE, fromIndex);
        if (doubleAt < 0 && singleAt < 0) {
            return null;
        }
        // The delimiter is the quote that appears first; the other kind of quote
        // may legitimately occur inside or after the literal.
        String delimiter;
        int openAt;
        if (singleAt < 0 || (doubleAt >= 0 && doubleAt < singleAt)) {
            delimiter = QUOTE;
            openAt = doubleAt;
        } else {
            delimiter = SQUOTE;
            openAt = singleAt;
        }
        int contentStart = openAt + delimiter.length();
        int closeAt = text.indexOf(delimiter, contentStart);
        if (closeAt < 0) {
            return null;
        }
        return new int[] { contentStart, closeAt };
    }
}