package focusedCrawler.memex.cdr;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

import de.l3s.boilerpipe.extractors.KeepEverythingExtractor;
/**
 * Extracts plain text and document metadata from raw content using Apache Tika,
 * and detects MIME types via Tika's default {@link Detector}.
 *
 * <p>Extraction uses an {@link AutoDetectParser} feeding a Boilerpipe
 * {@code KeepEverythingExtractor}, so all textual content is kept (no
 * boilerplate removal beyond markup stripping).
 */
public class TikaExtractor {

    /**
     * Immutable holder for a parse result: the extracted plain text plus a
     * snapshot of the Tika metadata flattened into a string map.
     */
    public static class ParsedData {

        private final String plainText;
        private final Map<String, String> metadata;

        public ParsedData(String plainText, Map<String, String> metadata) {
            this.plainText = plainText;
            this.metadata = metadata;
        }

        /** @return the plain text extracted from the document */
        public String getPlainText() {
            return plainText;
        }

        /** @return the document metadata as property-name to value pairs */
        public Map<String, String> getMetadata() {
            return metadata;
        }
    }

    private static final Logger logger = LoggerFactory.getLogger(TikaExtractor.class);

    // StandardCharsets.UTF_8 is guaranteed present on every JVM; no lookup needed.
    private static final Charset UTF8 = StandardCharsets.UTF_8;

    // Upper bound on extracted characters, guarding against pathological inputs.
    private static final int MAX_CHARACTERS = 25 * 1000 * 1000;

    private static final TikaConfig CONFIG = TikaConfig.getDefaultConfig();

    private final Parser parser = new AutoDetectParser(CONFIG);
    private final Detector mimeTypeDetector = CONFIG.getDetector();

    /**
     * Parses the given content (encoded as UTF-8) with no file-name or
     * content-type hints.
     *
     * @param content the raw document content
     * @return the extracted text and metadata, or {@code null} on failure
     */
    public ParsedData parse(String content) {
        return parse(new ByteArrayInputStream(content.getBytes(UTF8)), null, null);
    }

    /**
     * Parses the given stream with no file-name or content-type hints.
     *
     * @param stream the raw document content; not closed by this method
     * @return the extracted text and metadata, or {@code null} on failure
     */
    public ParsedData parse(InputStream stream) {
        return parse(stream, null, null);
    }

    /**
     * Parses the given content (encoded as UTF-8), using the file name and
     * content type as hints for Tika's format detection.
     *
     * @param content     the raw document content
     * @param fileName    resource-name hint, may be {@code null}
     * @param contentType content-type hint, may be {@code null}
     * @return the extracted text and metadata, or {@code null} on failure
     */
    public ParsedData parse(String content, String fileName, String contentType) {
        return parse(new ByteArrayInputStream(content.getBytes(UTF8)), fileName, contentType);
    }

    /**
     * Parses the given stream, using the file name and content type as hints
     * for Tika's format detection.
     *
     * @param stream      the raw document content; not closed by this method
     * @param fileName    resource-name hint, may be {@code null}
     * @param contentType content-type hint, may be {@code null}
     * @return the extracted text and metadata, or {@code null} if extraction
     *         fails (the error is logged, not rethrown)
     */
    public ParsedData parse(InputStream stream, String fileName, String contentType) {
        BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
        // KeepEverythingExtractor retains all text; Boilerpipe is used only as
        // a pass-through content handler here.
        BoilerpipeContentHandler textHandler =
                new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
        Metadata metadata = createMetadata(fileName, contentType);
        ParseContext context = new ParseContext();
        try {
            parser.parse(stream, textHandler, metadata, context);
            // Flatten Tika's Metadata into a plain map for easy serialization.
            Map<String, String> metadataMap = new HashMap<>();
            for (String propertyName : metadata.names()) {
                metadataMap.put(propertyName, metadata.get(propertyName));
            }
            return new ParsedData(handler.toString(), metadataMap);
        } catch (IOException | SAXException | TikaException e) {
            // Best-effort extraction: callers are expected to handle null.
            logger.error("Failed to extract metadata using Tika.", e);
            return null;
        }
    }

    /**
     * Detects the media type of the given content (encoded as UTF-8) with no
     * file-name or content-type hints.
     *
     * @param content the raw document content
     * @return the detected media type
     */
    public MediaType detect(String content) {
        return detect(new ByteArrayInputStream(content.getBytes(UTF8)), null, null);
    }

    /**
     * Detects the media type of the given content (encoded as UTF-8), using
     * the file name and content type as detection hints.
     *
     * @param content     the raw document content
     * @param fileName    resource-name hint, may be {@code null}
     * @param contentType content-type hint, may be {@code null}
     * @return the detected media type
     */
    public MediaType detect(String content, String fileName, String contentType) {
        return detect(new ByteArrayInputStream(content.getBytes(UTF8)), fileName, contentType);
    }

    /**
     * Detects the media type of the given stream, using the file name and
     * content type as detection hints.
     *
     * @param fileStream  the raw document content; not closed by this method
     * @param fileName    resource-name hint, may be {@code null}
     * @param contentType content-type hint, may be {@code null}
     * @return the detected media type
     * @throws RuntimeException if the stream cannot be read
     */
    public MediaType detect(InputStream fileStream, String fileName, String contentType) {
        Metadata metadata = createMetadata(fileName, contentType);
        try {
            return mimeTypeDetector.detect(fileStream, metadata);
        } catch (IOException e) {
            throw new RuntimeException("Failed to read input stream", e);
        }
    }

    /**
     * Builds a {@link Metadata} object carrying the optional resource-name and
     * content-type hints used by Tika's parser and detector.
     */
    private Metadata createMetadata(String fileName, String contentType) {
        Metadata metadata = new Metadata();
        if (fileName != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, fileName);
        }
        if (contentType != null) {
            metadata.add(Metadata.CONTENT_TYPE, contentType);
        }
        return metadata;
    }
}