/**
*
*/
package uk.bl.wa.tika;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.IOException;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
/**
* @author Andrew Jackson <Andrew.Jackson@bl.uk>
*
*/
public class TikaDeepIdentifier {
private static Logger log = Logger.getLogger(TikaDeepIdentifier.class.getName());
private static int MAX_BUF = 1024*1024;
// Number of milliseconds before timing out. Defaults to 5 mins (5*60*1000 = 300,000 milliseconds).
private final long parseTimeout = 5*60*1000L;
// Abort handler, limiting the output size, to avoid OOM:
private static WriteOutContentHandler ch = null;
// Silent handler:
//ContentHandler ch = new DefaultHandler();
// Set up a parse context:
private static ParseContext ctx = new ParseContext();
// Set up the parser:
private static PreservationParser pika = new PreservationParser();
static {
pika.init(ctx);
}
/**
* @param args
*/
public static void main(String[] args) {
TikaDeepIdentifier tdi = new TikaDeepIdentifier();
tdi.printDetectors();
System.out.println("GOT: "+tdi.identify(new byte[] {'%','P','D','F','-'}));
}
public void printDetectors() {
CompositeDetector ds = (CompositeDetector) pika.getDetector();
for( Detector d : ds.getDetectors()) {
System.out.println("Detector: "+d.getClass().getCanonicalName());
}
for( MediaType type : pika.getParsers().keySet()) {
System.out.println("Parser: "+type+" : "+pika.getParsers().get(type).getClass());
}
}
/**
*
* @param payload
* @return
*/
public String identify(byte[] payload) {
// Fallback
String tikaType = MediaType.OCTET_STREAM.toString();
// Set up metadata object:
Metadata md = new Metadata();
TikaInputStream tis = null;
try {
tis = TikaInputStream.get( payload, md );
// Type according to Tiki:
tikaType = pika.getDetector().detect( tis, md ).toString();
} catch( Throwable e ) {
log.error( "Tika.detect failed:" + e.getMessage() );
//e.printStackTrace();
return MediaType.OCTET_STREAM.toString();
} finally {
if (tis != null) {
try {
tis.close();
} catch (IOException e) {
log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage());
}
}
}
// Now perform full parse, to find a more detailed tikaType
try {
// Default to detected MIME Type:
md.set( Metadata.CONTENT_TYPE, tikaType.toString() );
// Ensure parsing is NOT recursive:
pika.setRecursive(ctx, false);
// Now perform the parsing:
//parser.parse( new ByteArrayInputStream( payload ), ch, md, ctx );
// One could forcibly limit the size if OOM is still causing problems, like this:
//parser.parse( new ByteArrayInputStream( value.getPayload(), 0, BUF_8KB ), ch, md, ctx );
// Every resource gets it's own write-out buffer:
ch = new WriteOutContentHandler(MAX_BUF);
// Run the parser in a separate thread:
InputStream tikainput = TikaInputStream.get( payload, md );
ParseRunner runner = new ParseRunner( pika, tikainput, ch, md, ctx );
Thread parseThread = new Thread( runner, Long.toString( System.currentTimeMillis() ) );
try {
parseThread.start();
parseThread.join( this.parseTimeout );
parseThread.interrupt();
} catch( OutOfMemoryError o ) {
log.error( "TikaExtractor.parse(): " + tikaType + " : " + o.getMessage() );
} catch( RuntimeException r ) {
log.error( "TikaExtractor.parse(): " + tikaType + " : " + r.getMessage() );
} finally {
if (tikainput != null) {
try {
tikainput.close();
} catch (IOException e) {
log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage());
}
}
}
// Use the extended MIME type generated by the PreservationParser:
String extMimeType = md.get(PreservationParser.EXT_MIME_TYPE);
if( runner.complete && extMimeType != null ) tikaType = extMimeType;
} catch( Throwable e ) {
log.debug( "Tika Exception: " + e.getMessage() );
//e.printStackTrace();
}
// Return whichever value works:
return tikaType;
}
/**
*
* @param payload
* @param metadata
* @return
*/
public String identify(InputStream payload, Metadata metadata) {
// Fallback
String tikaType = MediaType.OCTET_STREAM.toString();
// Set up metadata object:
Metadata md = metadata;
TikaInputStream tis = null;
try {
tis = TikaInputStream.get( payload );
// Type according to Tiki:
tikaType = pika.getDetector().detect( tis, md ).toString();
} catch( Throwable e ) {
log.error( "Tika.detect failed:" + e.getMessage() );
//e.printStackTrace();
return MediaType.OCTET_STREAM.toString();
} finally {
if (tis != null) {
try {
tis.close();
} catch (IOException e) {
log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage());
}
}
}
// Now perform full parse, to find a more detailed tikaType
try {
// Default to detected MIME Type:
md.set( Metadata.CONTENT_TYPE, tikaType.toString() );
// Ensure parsing is NOT recursive:
pika.setRecursive(ctx, false);
// Now perform the parsing:
//parser.parse( new ByteArrayInputStream( payload ), ch, md, ctx );
// One could forcibly limit the size if OOM is still causing problems, like this:
//parser.parse( new ByteArrayInputStream( value.getPayload(), 0, BUF_8KB ), ch, md, ctx );
// Every resource gets it's own write-out buffer:
ch = new WriteOutContentHandler(MAX_BUF);
// Run the parser in a separate thread:
InputStream tikainput = TikaInputStream.get( payload );
ParseRunner runner = new ParseRunner( pika, tikainput, ch, md, ctx );
Thread parseThread = new Thread( runner, Long.toString( System.currentTimeMillis() ) );
try {
parseThread.start();
parseThread.join( this.parseTimeout );
parseThread.interrupt();
} catch( OutOfMemoryError o ) {
log.error( "TikaExtractor.parse(): " + tikaType + " : " + o.getMessage() );
} catch( RuntimeException r ) {
log.error( "TikaExtractor.parse(): " + tikaType + " : " + r.getMessage() );
} finally {
if (tikainput != null) {
try {
tikainput.close();
} catch (IOException e) {
log.warn("Exception closing TikaInputStream. This leaves tmp-files: " + e.getMessage());
}
}
}
// Use the extended MIME type generated by the PreservationParser:
String extMimeType = md.get(PreservationParser.EXT_MIME_TYPE);
if( runner.complete && extMimeType != null ) tikaType = extMimeType;
} catch( Throwable e ) {
log.debug( "Tika Exception: " + e.getMessage() );
//e.printStackTrace();
}
// Return whichever value works:
return tikaType;
}
private File copyToTempFile( String name, byte[] content, int max_bytes ) throws Exception {
File tmp = File.createTempFile("FmtTmp-", name);
tmp.deleteOnExit();
FileOutputStream fos = new FileOutputStream(tmp);
IOUtils.copy(new ByteArrayInputStream(content, 0, max_bytes), fos);
fos.flush();
fos.close();
return tmp;
}
private File copyToTempFile( String name, byte[] content ) throws Exception {
//if( content.length < BUF_8KB )
return copyToTempFile(name, content, MAX_BUF);
}
// ----
private class ParseRunner implements Runnable {
private AutoDetectParser parser;
private InputStream tikainput;
private ContentHandler handler;
private Metadata metadata;
private ParseContext context;
public boolean complete;
public ParseRunner( AutoDetectParser parser, InputStream tikainput, ContentHandler handler, Metadata metadata, ParseContext context ) {
this.parser = parser;
this.tikainput = tikainput;
this.handler = handler;
this.metadata = metadata;
this.context = context;
this.complete = false;
}
@Override
public void run() {
try {
this.parser.parse( this.tikainput, this.handler, this.metadata, this.context );
this.complete = true;
} catch( InterruptedIOException i ) {
this.complete = false;
} catch( Exception e ) {
System.err.println( "ParseRunner.run(): " + e.getMessage() );
}
}
}
// --- Thread pool example, running external command ---
private static final ExecutorService THREAD_POOL
= Executors.newCachedThreadPool();
private static <T> T timedCall(Callable<T> c, long timeout, TimeUnit timeUnit)
throws InterruptedException, ExecutionException, TimeoutException
{
FutureTask<T> task = new FutureTask<T>(c);
THREAD_POOL.execute(task);
return task.get(timeout, timeUnit);
}
void then() throws InterruptedException, ExecutionException {
int timeout = 10;
try {
int returnCode = timedCall(new Callable<Integer>() {
public Integer call() throws Exception
{
java.lang.Process process = Runtime.getRuntime().exec("command");
return process.waitFor();
}}, new Integer(timeout), TimeUnit.SECONDS);
} catch (TimeoutException e) {
// Handle timeout here
}
}
}