package com.caseystella.util.common.enrich; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.StringWriter; import java.util.AbstractMap; import java.util.Map; /** * Created by cstella on 9/4/14. */ public enum ExtractContent { INSTANCE; public Map.Entry<String,Metadata> extractTextWithMetadata(String path, byte[] content) throws TikaException, SAXException, IOException { /* Use tika to extract the content into a the first value in the tuple, and the second value is a bag of key/value pairs representing the metadata from the document. */ Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); ByteArrayInputStream bis = new ByteArrayInputStream(content); //set the content type directly as tika has a bear of a time metadata.set(Metadata.CONTENT_TYPE, new Tika().detect(path)); StringWriter writer = new StringWriter(); //this is where Tika parses the document parser.parse(bis, new WriteOutContentHandler(writer), metadata, new ParseContext()); return new AbstractMap.SimpleImmutableEntry<String, Metadata>(writer.toString(), metadata); } }