/**
*
*/
package uk.bl.wa.tika.parser.ole2;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.Ffn;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Experimental Tika parser that inspects an OLE2 (compound document) file with
 * Apache POI and prints what it finds to standard output.
 *
 * <p>The parser currently writes nothing to the SAX handler or to the Tika
 * metadata object; it only dumps diagnostic information.
 *
 * @author andy
 */
public class OLE2Parser extends AbstractParser {

    /** Size of the chunks used while buffering the incoming stream. */
    private static final int BUFFER_SIZE = 8192;

    /** Maximum number of characters printed for each text piece. */
    private static final int TEXT_PREVIEW_LENGTH = 100;

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (unused)
     * @return an empty set; never {@code null}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        // TODO declare the OLE2 media types once parse() produces real output.
        // An empty set (rather than null) keeps callers that iterate the
        // supported types from failing with a NullPointerException.
        return Collections.<MediaType>emptySet();
    }

    /**
     * Dumps the Word summary information, text pieces, property sections,
     * fonts, CompObj class id and the POIFS directory tree of the document.
     *
     * <p>The stream is read fully into memory first, because each of the POI
     * readers used below consumes the stream it is given. The caller's stream
     * is not closed.
     *
     * @param stream the document to inspect
     * @param handler SAX handler (currently unused)
     * @param metadata Tika metadata (currently unused)
     * @param context parse context (currently unused)
     * @throws IOException if the stream cannot be read or is not a valid
     *         OLE2 container
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        // Every consumer below drains its input, so hand each one a fresh
        // view over the same buffered bytes instead of the shared stream.
        byte[] data = toByteArray(stream);

        HWPFDocument doc = new HWPFDocument(new ByteArrayInputStream(data));
        printSummaryInformation(doc);
        printDocumentSummaryInformation(doc);
        printTextPieces(doc);
        printDocumentSummarySections(doc);
        printFonts(doc);

        parseCompObj(new ByteArrayInputStream(data));

        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(data));
        dump(fs.getRoot());
    }

    /** Reads the remaining content of the stream into a byte array. */
    private static byte[] toByteArray(InputStream stream) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[BUFFER_SIZE];
        int read;
        while ((read = stream.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    }

    /** Prints application name and OS version, if the summary stream exists. */
    private static void printSummaryInformation(HWPFDocument doc) {
        if (doc.getSummaryInformation() == null) {
            // The document carries no summary information; nothing to print.
            return;
        }
        System.out.println("ApplicationName: "+doc.getSummaryInformation().getApplicationName());
        System.out.println("OSVersion: "+doc.getSummaryInformation().getOSVersion());
    }

    /** Prints the counters of the document summary stream, if it exists. */
    private static void printDocumentSummaryInformation(HWPFDocument doc) {
        if (doc.getDocumentSummaryInformation() == null) {
            // The document carries no document summary; nothing to print.
            return;
        }
        System.out.println("# paragraphs: "+doc.getDocumentSummaryInformation().getParCount());
        System.out.println("# bytes: "+doc.getDocumentSummaryInformation().getByteCount());
        System.out.println("# hidden: "+doc.getDocumentSummaryInformation().getHiddenCount());
        System.out.println("# lines: "+doc.getDocumentSummaryInformation().getLineCount());
        System.out.println("# mmclips: "+doc.getDocumentSummaryInformation().getMMClipCount());
        System.out.println("# notes: "+doc.getDocumentSummaryInformation().getNoteCount());
        System.out.println("# sections: "+doc.getDocumentSummaryInformation().getSectionCount());
        System.out.println("# slides: "+doc.getDocumentSummaryInformation().getSlideCount());
        System.out.println("format: "+doc.getDocumentSummaryInformation().getFormat());
    }

    /** Prints a short preview and the unicode flag of every text piece. */
    private static void printTextPieces(HWPFDocument doc) {
        for (TextPiece piece : doc.getTextTable().getTextPieces()) {
            CharSequence text = piece.getStringBuffer();
            // Clamp the preview: pieces shorter than the limit would otherwise
            // trigger a StringIndexOutOfBoundsException.
            int previewLength = Math.min(TEXT_PREVIEW_LENGTH, text.length());
            System.out.println("TP: "+text.subSequence(0, previewLength));
            System.out.println("TP: "+piece.getPieceDescriptor().isUnicode());
        }
    }

    /** Prints every section of the document summary with its properties. */
    private static void printDocumentSummarySections(HWPFDocument doc) {
        if (doc.getDocumentSummaryInformation() == null) {
            return;
        }
        for (Object item : doc.getDocumentSummaryInformation().getSections()) {
            Section section = (Section) item;
            System.out.println("ss# fid: "+section.getFormatID());
            System.out.println("ss# codepage: "+section.getCodepage());
            System.out.println("ss# # properties: "+section.getPropertyCount());
            for (Property property : section.getProperties()) {
                Object value = property.getValue();
                // A property may carry no value; avoid dereferencing null.
                String typeName = (value == null)
                        ? "null" : value.getClass().getCanonicalName();
                System.out.println("ss# property: "+typeName+" "+value);
            }
        }
    }

    /** Prints name, size and weight of every entry of the font table. */
    private static void printFonts(HWPFDocument doc) {
        for (Ffn font : doc.getFontTable().getFontNames()) {
            System.out.println("Font: "+font.getMainFontName()+", "+font.getSize()+", "+font.getWeight());
        }
    }

    /**
     * Looks for the {@code \001CompObj} stream of an OLE2 container and prints
     * the class id found in it.
     *
     * <p>This is best effort: read failures are reported on standard error and
     * not propagated.
     *
     * @param file stream positioned at the start of an OLE2 container; it is
     *        consumed but not closed
     */
    public static void parseCompObj(InputStream file) {
        Collector collector = new Collector();
        POIFSReader poifsReader = new POIFSReader();
        poifsReader.registerListener(collector, "\001CompObj");
        try {
            poifsReader.read(file);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler suffices.
            System.err.println("Could not read OLE2 container while looking for CompObj");
            e.printStackTrace();
        }
        // collector.getClassId() now contains the result (null if not found).
    }

    // http://mail-archives.apache.org/mod_mbox/poi-user/200504.mbox/%3C0IFL00BM77MPGW@mta6.srv.hcvlny.cv.net%3E
    // For CLSIDs:
    // http://anoochit.fedorapeople.org/rpmbuild/BUILD/msttcorefonts/cab-contents/wviewer.stf
    // http://www.msfn.org/board/topic/139093-create-standalone-word-97/
    // For CompObj format, not clear:
    // FlashPix Format Spec!
    /**
     * POIFS listener that extracts the class id stored in a CompObj stream.
     */
    public static class Collector implements POIFSReaderListener {

        /** Magic number for the offset to the CLSID within the stream. */
        private static final int CLSID_OFFSET = 12;

        private ClassID classId;

        /**
         * Reads the class id from the stream of the event and prints it.
         * Streams that are too short leave the class id untouched.
         *
         * @param event the POIFS event carrying the stream to inspect
         */
        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
            InputStream stream = event.getStream();
            try {
                if (skipExactly(stream, CLSID_OFFSET)) {
                    byte[] classIdBytes = new byte[ClassID.LENGTH];
                    if (fillBuffer(stream, classIdBytes)) {
                        classId = new ClassID(classIdBytes, 0);
                    }
                }
            } catch (IOException e) {
                System.err.println("Failed to read ClassID from stream: "+e);
            }
            System.out.println("Found ClassID: "+classId);
        }

        /**
         * Returns the class id collected so far.
         *
         * @return the class id, or {@code null} if none has been read
         */
        public ClassID getClassId() {
            return classId;
        }

        /**
         * Skips exactly {@code count} bytes, retrying on short skips.
         *
         * @return {@code false} if the stream ended before all bytes were skipped
         */
        private static boolean skipExactly(InputStream in, long count) throws IOException {
            long remaining = count;
            while (remaining > 0) {
                long skipped = in.skip(remaining);
                if (skipped > 0) {
                    remaining -= skipped;
                } else if (in.read() == -1) {
                    // skip() made no progress and a read confirms end of stream.
                    return false;
                } else {
                    remaining--;
                }
            }
            return true;
        }

        /**
         * Fills the whole buffer, retrying on short reads.
         *
         * @return {@code false} if the stream ended before the buffer was full
         */
        private static boolean fillBuffer(InputStream in, byte[] buffer) throws IOException {
            int offset = 0;
            while (offset < buffer.length) {
                int read = in.read(buffer, offset, buffer.length - offset);
                if (read < 0) {
                    return false;
                }
                offset += read;
            }
            return true;
        }
    }

    /**
     * Recursively prints the given POIFS directory: its name and storage
     * CLSID, then every document node (with its properties when the node is a
     * property set) and every sub-directory.
     *
     * @param root the directory to print
     * @throws IOException if a document node cannot be read
     */
    public static void dump(DirectoryEntry root) throws IOException {
        System.out.println(root.getName()+" : storage CLSID "+root.getStorageClsid());
        for (Iterator<Entry> it = root.getEntries(); it.hasNext();) {
            Entry entry = it.next();
            if (entry instanceof DocumentNode) {
                dumpDocument((DocumentNode) entry);
            } else if (entry instanceof DirectoryEntry) {
                dump((DirectoryEntry) entry);
            } else {
                System.err.println("Skipping unsupported POIFS entry: " + entry);
            }
        }
    }

    /** Prints one document node and, if it is a property set, its properties. */
    private static void dumpDocument(DocumentNode node) throws IOException {
        System.out.println("Node name: "+node.getName());
        System.out.println("Node desc: "+node.getShortDescription());
        System.out.println("Node size: "+node.getSize());
        DocumentInputStream is = new DocumentInputStream(node);
        try {
            PropertySet ps = new PropertySet(is);
            if (ps.getSectionCount() != 0) {
                for (Property p : ps.getProperties()) {
                    System.out.println("Prop: "+p.getID()+" "+p.getValue());
                }
            }
        } catch (NoPropertySetStreamException ignored) {
            // Not a property set stream: there are no properties to print.
        } catch (MarkUnsupportedException e) {
            System.err.println("Cannot inspect properties of "+node.getName()+": "+e);
        } finally {
            // Release the stream even when property parsing fails.
            is.close();
        }
    }
}