/** * */ package uk.bl.wa.tika; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Map; import org.apache.log4j.Logger; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import uk.bl.wa.tika.parser.iso9660.ISO9660Parser; import uk.bl.wa.tika.parser.warc.ARCParser; import uk.bl.wa.tika.parser.warc.WARCParser; /** * @author Andrew Jackson <Andrew.Jackson@bl.uk> * */ public class PreservationParser extends AutoDetectParser { private static Logger log = Logger.getLogger(PreservationParser.class.getName()); public static final String EXT_MIME_TYPE = "Extended-MIME-Type"; private boolean initialised = false; private NonRecursiveEmbeddedDocumentExtractor embedded = null; /** * */ private static final long serialVersionUID = 6809061887891839162L; /** * Modify the configuration as needed: * @param context */ public void init(ParseContext context) { if( this.initialised ) return; // Add the HighlightJS detector: /* CompositeDetector detect = (CompositeDetector) this.getDetector(); List<Detector> detectors = detect.getDetectors(); detectors.add( new HighlightJSDetector() ); this.setDetector( new CompositeDetector(detectors)); */ // Override the built-in PDF parser (based on PDFBox) with our own (based in iText): MediaType pdf = MediaType.parse("application/pdf"); Map<MediaType, Parser> parsers = getParsers(); parsers.put( pdf, new uk.bl.wa.tika.parser.pdf.pdfbox.PDFParser() ); parsers.put( MediaType.parse("application/x-iso9660-image"), new ISO9660Parser() ); parsers.put( MediaType.parse("application/x-internet-archive"), new ARCParser() ); parsers.put( MediaType.parse("application/warc"), new WARCParser() ); setParsers(parsers); // Override the recursive parsing: embedded = new NonRecursiveEmbeddedDocumentExtractor(context); context.set( EmbeddedDocumentExtractor.class, embedded ); this.initialised = true; } /* (non-Javadoc) * @see org.apache.tika.parser.AutoDetectParser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext) */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Override with custom parsers, etc.: if( !initialised ) init(context); // Pick up the detected MIME Type passed in from above: String providedType = metadata.get( Metadata.CONTENT_TYPE ); log.info("Supplied hint, providedType = " + providedType); /* debug info String[] names = metadata.names(); Arrays.sort(names); for( String name : names ) { System.out.println("PPPPre : "+name+" = "+metadata.get(name)); } */ // Parse: super.parse(stream, handler, metadata, context); /* Debug info names = metadata.names(); Arrays.sort(names); for( String name : names ) { System.out.println("PPPPost : "+name+" = "+metadata.get(name)); } */ // Build the extended MIME Type, incorporating version and creator software etc. MediaType tikaType = null; try { if( providedType == null ) { tikaType = MediaType.parse( metadata.get( Metadata.CONTENT_TYPE ) ); } else { tikaType = MediaType.parse( providedType ); } } catch ( Exception e) { // Stop here and return if this failed: log.error("Could not parse MIME Type: "+metadata.get( Metadata.CONTENT_TYPE )); tikaType = MediaType.OCTET_STREAM; metadata.remove( Metadata.CONTENT_TYPE ); } HashMap<String, String> hm = new HashMap<String,String>(); // Content encoding, if any: String encoding = metadata.get( Metadata.CONTENT_ENCODING ); /* if( encoding != null ) { if ( "text".equals( tikaType.getType() ) ) { tikaType.setParameter( "charset", encoding.toLowerCase() ); } else { tikaType.setParameter( "encoding", encoding ); } } */ // Also look to record the software: String software = null; // Application ID, MS Office only AFAICT, and the VERSION is only doc if( metadata.get( Metadata.APPLICATION_NAME ) != null ) software = metadata.get( Metadata.APPLICATION_NAME ); if( metadata.get( Metadata.APPLICATION_VERSION ) != null ) software += " "+metadata.get( Metadata.APPLICATION_VERSION); // Images, e.g. JPEG and TIFF, can have 'Software', 'tiff:Software', if( metadata.get("pdf:producer") != null ) software = metadata.get("pdf:producer"); if( metadata.get( "Software" ) != null ) software = metadata.get( "Software" ); if( metadata.get( Metadata.SOFTWARE ) != null ) software = metadata.get( Metadata.SOFTWARE ); if( metadata.get( "generator" ) != null ) software = metadata.get( "generator" ); // PNGs have a 'tEXt tEXtEntry: keyword=Software, value=GPL Ghostscript 8.71' String png_textentry = metadata.get("tEXt tEXtEntry"); if( png_textentry != null && png_textentry.contains("keyword=Software, value=") ) software = png_textentry.replace("keyword=Software, value=", ""); /* Some JPEGs have this: Jpeg Comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality comment: CREATOR: gd-jpeg v1.0 (using IJG JPEG v62), default quality */ if( software != null ) { metadata.set(Metadata.SOFTWARE, software); hm.put("software", software); } // Also, if there is any trace of any hardware, record it here: if( metadata.get( Metadata.EQUIPMENT_MODEL ) != null ) metadata.set("hardware", metadata.get( Metadata.EQUIPMENT_MODEL)); // If there is any trace of the source document, add it here: if( metadata.get("pdf:creator") != null ) { hm.put("source",metadata.get("pdf:creator") ); } // Fall back on special type for empty resources: if( "0".equals(metadata.get(Metadata.CONTENT_LENGTH)) ) { metadata.set(Metadata.CONTENT_TYPE, "application/x-empty"); } // Version: if( metadata.get("pdf:version") != null ) { hm.put("version",metadata.get("pdf:version") ); } // Return extended MIME Type: tikaType = new MediaType(tikaType, hm); metadata.set(EXT_MIME_TYPE, tikaType.toString()); // Other sources of modification time? //md.get(Metadata.LAST_MODIFIED); //might be useful, as would any embedded version // e.g. a jpg with 'Date/Time: 2011:10:07 11:35:42'? // e.g. a png with // 'Document ImageModificationTime: year=2011, month=7, day=29, hour=15, minute=33, second=5' // 'tIME: year=2011, month=7, day=29, hour=15, minute=33, second=5' } /** * * @param context * @param recurse */ public void setRecursive( ParseContext context, boolean recurse ) { init(context); embedded.setParseEmbedded(recurse); } }