/** * */ package uk.bl.wa.tika.parser.iso9660; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; import net.didion.loopy.iso9660.ISO9660FileEntry; import net.didion.loopy.iso9660.ISO9660FileSystem; import net.didion.loopy.iso9660.ISO9660VolumeDescriptorSet; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.IOUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * @author Andrew Jackson <Andrew.Jackson@bl.uk> * */ public class ISO9660Extractor { private final ContentHandler handler; private final Metadata metadata; private final EmbeddedDocumentExtractor extractor; public ISO9660Extractor( ContentHandler handler, Metadata metadata, ParseContext context) { this.handler = handler; this.metadata = metadata; EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); if (ex==null) { this.extractor = new ParsingEmbeddedDocumentExtractor(context); } else { this.extractor = ex; } } /** * Extend the ISO9660 class to expose the Volume information. * @author Andrew Jackson <Andrew.Jackson@bl.uk> */ public class ISO9660FS extends ISO9660FileSystem { public ISO9660FS(File file, boolean readOnly) throws IOException { super(file, readOnly); } public ISO9660VolumeDescriptorSet getVolumeDescriptorSet() { if( super.getVolumeDescriptorSet() == null ) { try { super.loadVolumeDescriptors(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } return (ISO9660VolumeDescriptorSet) super.getVolumeDescriptorSet(); } } /* (non-Javadoc) * @see org.apache.tika.parser.AbstractParser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata) */ //@Override public void parse(InputStream stream) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); /* Use the loopy Java ISO9660 classes to retrieve the (possibly) * compressed entries as individual source units. */ TemporaryResources tmp = new TemporaryResources(); File file = tmp.createTemporaryFile(); OutputStream out = new FileOutputStream(file); try { IOUtils.copy(stream, out); } finally { out.close(); } ISO9660FS iso = new ISO9660FS(file, true); ISO9660VolumeDescriptorSet vds = iso.getVolumeDescriptorSet(); metadata.set("iso:volumeIdentifier",vds.getVolumeIdentifier()); metadata.set("iso:standardIdentifier",vds.getStandardIdentifier()); metadata.set("iso:systemIdentifier",vds.getSystemIdentifier()); metadata.set("iso:volumeSetIdentifier",vds.getVolumeSetIdentifier()); metadata.set("iso:publisher",vds.getPublisher()); metadata.set("iso:preparer",vds.getPreparer()); metadata.set("iso:encoding",vds.getEncoding()); if (iso != null) { try { /* ISO9660 entries (files and directories) are not necessarily in * hierarchical order. Also, directories may be implicit, that * is, referred to in the pathnames of files or directories but * not explicitly present in the form of a directory entry. * * Since all files and directories need to be associated with * the correct parent directory in order for aggregate * characterization to work properly, we there are three stages * of processing: * * (1) Identify all explicit directory entries, creating * Directory sources and putting them into a map keyed to * the directory pathname. * * (2) Identify all implicit directories (by extracting * directories from pathnames and checking to see if they * are not already in the map), creating Directory sources * and putting them into the map. Also characterize any * top-level file entries (children of the ISO9660 file) that * are found. * * (3) Directly characterize all top-level directories, that * is, those whose parent is the ISO9660 file. This will * implicitly characterize all child files and directories. */ List<String> map = new ArrayList<String>(); Enumeration<ISO9660FileEntry> en = iso.getEntries(); /* (1) Identify all directories that are explicit entries. */ while (en.hasMoreElements()) { ISO9660FileEntry entry = en.nextElement(); String name = this.getFullPath(entry); System.out.println("ISO9660 - Pre-scan found directory named: "+name); /* Delete trailing slash from path name, if necessary. Although this * always should be a forward slash (/), in practice a backward slash * \) may be found. */ int in = name.lastIndexOf('/'); if (in < 0) { in = name.lastIndexOf('\\'); } if (in == name.length() - 1) { name = name.substring(0, in); } // Source src = // factory.getSource(jhove2, ISO, entry); /* Get the entry-specific properties. */ /* long crc = entry.getCrc(); Digest crc32 = new Digest(AbstractArrayDigester.toHexString(crc), CRC32Digester.ALGORITHM); ISO9660EntryProperties properties = new ISO9660EntryProperties(name, entry.getCompressedSize(), crc32, entry.getComment(), new Date(entry.getTime())); src = src.addExtraProperties(properties); */ String key = getFullPath(entry); /* Remove trailing slash. Although this always * should be a forward slash (/), in practice a * backward slash (\) may be found. */ int len = key.length() - 1; char ch = key.charAt(len); if (ch == '/') { key = key.substring(0, len); } else if (ch == '\\') { key = key.substring(0, len); } // Now parse it... if( entry.isDirectory() ) { // FIXME What to do with directories? System.out.println("ISO9660 - Found directory named: "+name+" "+entry.getPath()); } else { // FIXME Parse the embedded file: System.out.println("ISO9660 - Found file named: "+name+" "+entry.getPath()); InputStream entryStream = iso.getInputStream(entry); /* Get the entry-specific properties. */ /* long crc = entry.getCrc(); Digest crc32 = new Digest(AbstractArrayDigester.toHexString(crc), CRC32Digester.ALGORITHM); ISO9660EntryProperties properties = new ISO9660EntryProperties(name, entry.getCompressedSize(), crc32, entry.getComment(), new Date(entry.getTime())); */ // Setup Metadata entrydata = new Metadata(); entrydata.set(Metadata.RESOURCE_NAME_KEY, key); // Use the delegate parser to parse the compressed document if (extractor.shouldParseEmbedded(entrydata)) { extractor.parseEmbedded(stream, xhtml, entrydata, true); } } } } finally { iso.close(); tmp.close(); } } xhtml.endDocument(); } /** * Helper to patch a consistent path from the ISO9660 Entry: * @param entry * @return */ private String getFullPath(ISO9660FileEntry entry) { String fullPath = entry.getPath(); if( fullPath == null || fullPath.length() == 0 ) fullPath = entry.getName(); if( fullPath.charAt(0) != '.' ) fullPath = "./"+fullPath; return fullPath; } }