/**
*
*/
package uk.bl.wa.tika;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Map;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import uk.bl.wa.tika.parser.pdf.itext.PDFParser;
/**
* Based on http://wiki.apache.org/tika/RecursiveMetadata
*
* @author AnJackson
*
*/
public class TikaNanite {

    /**
     * Maximum number of characters of extracted text buffered for a single
     * input file. Parsing of that file is aborted by the handler once the
     * limit is exceeded, which protects against running out of memory.
     */
    private static final int MAX_OUTPUT_CHARS = 1000 * 1024;

    /**
     * Command-line entry point. For every path given in {@code args} the file
     * is opened, its media type is detected, and it is parsed through a
     * {@link RecursiveMetadataParser} so that the metadata of the document and
     * of each embedded document is printed to standard output.
     *
     * @param args paths of the files to examine
     * @throws Exception if the parser cannot be set up, or if opening a file,
     *         detecting its type or closing its stream fails; failures raised
     *         by the parse itself are reported on standard output and the
     *         loop moves on to the next file
     */
    public static void main(String[] args) throws Exception {
        PreservationParser parser = new PreservationParser();
        // Wrap it in a recursive parser, to access the metadata.
        Parser recursiveReportingParser = new RecursiveMetadataParser(parser);
        // Register the wrapper in the context so that embedded documents are
        // routed back through it rather than straight to the inner parser.
        ParseContext context = new ParseContext();
        context.set(Parser.class, recursiveReportingParser);
        parser.init(context);
        // Control recursion:
        //parser.setRecursive(context, false);

        for (String arg : args) {
            File inputFile = new File(arg);
            Metadata metadata = new Metadata();
            metadata.add(Metadata.RESOURCE_NAME_KEY, inputFile.toURI().toString());

            // A fresh size-limited handler per file: the write limit and the
            // buffered text live in the handler, so sharing one instance would
            // make the limit cumulative across all files on the command line.
            ContentHandler handler = new WriteOutContentHandler(MAX_OUTPUT_CHARS);

            InputStream stream = TikaInputStream.get(inputFile);
            try {
                // Detect (inside the try so the stream is closed even if
                // detection fails; the failure itself still propagates).
                MediaType type = parser.getDetector().detect(stream, metadata);
                System.out.println("Detector found: " + type);
                metadata.add(Metadata.CONTENT_TYPE, type.toString());

                // Parse; a failure here is reported and the next file is tried.
                try {
                    recursiveReportingParser.parse(stream, handler, metadata, context);
                } catch (Exception e) {
                    System.out.println("---- Exception: " + e);
                }
            } finally {
                stream.close();
            }

            System.out.println("--EOF-- Top Level Metadata --");
            printMetadata("MD: ", metadata);
            System.out.println("----");
        }
    }

    /**
     * Prints every metadata field, sorted by field name, one per line in the
     * form {@code <prefix><name> = <value>}. Only the value returned by
     * {@link Metadata#get(String)} is shown for each field.
     *
     * @param prefix text written at the start of every line
     * @param metadata the metadata to dump
     */
    private static void printMetadata(String prefix, Metadata metadata) {
        String[] names = metadata.names();
        Arrays.sort(names);
        for (String name : names) {
            System.out.println(prefix + name + " = " + metadata.get(name));
        }
    }

    /**
     * Parser wrapper that reports the metadata of every document passed
     * through it, before delegating to and after returning from the wrapped
     * parser.
     *
     * For this to work reliably, we will need to modify PackageExtractor
     * so that the parent-child relationship is maintained. Otherwise,
     * the identity of files gets confused when there are ZIPs in ZIPs etc.
     *
     * @author AnJackson
     */
    private static class RecursiveMetadataParser extends ParserDecorator {

        /** */
        private static final long serialVersionUID = 5133646719357986442L;

        /**
         * @param parser the parser that performs the actual parsing
         */
        public RecursiveMetadataParser(Parser parser) {
            super(parser);
        }

        /**
         * Delegates to the wrapped parser and prints the resulting metadata.
         * Any exception raised by the wrapped parser is printed (message to
         * standard output, stack trace via {@code printStackTrace()}) and then
         * suppressed, so the metadata gathered up to that point is still
         * reported and the caller never sees the failure.
         *
         * @param stream the document to parse
         * @param ignore handler passed through to the wrapped parser; its
         *        content is not inspected here
         * @param metadata metadata for the document, updated by the parse
         * @param context the parse context
         */
        @Override
        public void parse(InputStream stream, ContentHandler ignore,
                Metadata metadata, ParseContext context) throws IOException,
                SAXException, TikaException {
            System.out.println("----");
            String providedType = metadata.get(Metadata.CONTENT_TYPE);
            System.out.println("Pre-parse Content-Type = " + providedType);
            try {
                super.parse(stream, ignore, metadata, context);
            } catch (Exception e) {
                // Deliberately best-effort: report and carry on to the dump.
                System.out.println("---- Exception: " + e);
                e.printStackTrace();
            }
            System.out.println("----");
            System.out.println("resourceName = " + metadata.get(Metadata.RESOURCE_NAME_KEY));
            System.out.println("----");
            printMetadata("RMD : ", metadata);
            System.out.println("----");
        }
    }
}