package uk.ac.shef.dcs.jate.io;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;
import uk.ac.shef.dcs.jate.JATEException;
/**
 * Content extractor that wraps Apache Tika and works around its type
 * detection problem, where some plain-text files are not recognised.
 *
 * See the Tika bug report at https://issues.apache.org/jira/browse/TIKA-879
 */
public class ContentExtractor {

    /**
     * Maximum length of the strings returned by the parseToString methods. Used
     * to prevent out of memory problems with huge input documents. The limit
     * is 100k characters.
     */
    private static final int MAX_STRING_LENGTH = 100 * 1000;

    /** Content type string that routes a file to the dedicated plain-text parser. */
    private static final String PLAIN_TEXT_TYPE = MediaType.TEXT_PLAIN.getBaseType().toString();

    // Quick fix for the auto-detect parser: some txt files are not
    // recognisable by it, so plain text is parsed with a dedicated TXTParser.
    private final Parser txtParser;

    /** Tika facade used for every file that is not probed as plain text. */
    private final Tika tika;

    /**
     * Creates an extractor with a plain-text parser and a Tika facade that
     * both truncate their output at {@link #MAX_STRING_LENGTH} characters.
     */
    public ContentExtractor() {
        txtParser = new TXTParser();
        tika = new Tika();
        // Make the limit explicit so both extraction paths use the same value.
        tika.setMaxStringLength(MAX_STRING_LENGTH);
    }

    /**
     * Extracts the textual content of the file addressed by the given URI.
     *
     * @param fileURI URI of the file to read; it is converted with
     *                {@link File#File(URI)}, which rejects URIs it cannot map
     *                to a file with an {@link IllegalArgumentException}
     * @return the extracted text
     * @throws JATEException if the URI is null, the file does not exist, or
     *                       the content cannot be read or parsed
     */
    public String extractContent(URI fileURI) throws JATEException {
        // Report a null URI the same way the File overload reports a null file,
        // instead of letting new File(null) throw a NullPointerException.
        if (fileURI == null) {
            throw new JATEException("File is not found!");
        }
        return extractContent(new File(fileURI));
    }

    /**
     * Extracts the textual content of the given file. Files probed as
     * {@code text/plain} are parsed with the plain-text parser; everything
     * else is handed to the Tika facade.
     *
     * @param file the file to read
     * @return the extracted text
     * @throws JATEException if the file is null or does not exist, or if
     *                       type detection, reading or parsing fails; the
     *                       underlying exception is attached as the cause
     */
    public String extractContent(File file) throws JATEException {
        if (file == null || !file.exists()) {
            throw new JATEException("File is not found!");
        }
        try {
            // probeContentType may return null; equals() on the constant copes with that.
            String contentType = Files.probeContentType(file.toPath());
            if (PLAIN_TEXT_TYPE.equals(contentType)) {
                return parseTXTToString(file);
            }
            return tika.parseToString(file);
        } catch (IOException ioEx) {
            // The I/O failure may come from type detection or from reading the content.
            throw failure("I/O exception when detecting file type or reading content: " + ioEx, ioEx);
        } catch (TikaException tikaEx) {
            throw failure("Tika Content extraction exception: " + tikaEx.toString(), tikaEx);
        }
    }

    /**
     * Builds a {@link JATEException} that carries the given message and keeps
     * the original exception as its cause so the stack trace is not lost.
     */
    private static JATEException failure(String message, Throwable cause) {
        JATEException failure = new JATEException(message);
        failure.initCause(cause);
        return failure;
    }

    /**
     * Parses the given file as plain text.
     *
     * @param file the plain-text file to parse
     * @return the extracted text
     * @throws IOException   if the file cannot be opened or read
     * @throws TikaException if parsing fails
     */
    private String parseTXTToString(File file) throws IOException, TikaException {
        Metadata metadata = new Metadata();
        // This method owns the stream: it is opened and closed exactly once here.
        try (InputStream stream = TikaInputStream.get(file, metadata)) {
            return parseTXTToString(stream, metadata);
        }
    }

    /**
     * Parses the given stream as plain text. The stream is not closed here;
     * that is left to the caller that opened it.
     *
     * @param stream   the stream to read the text from
     * @param metadata metadata passed to the parser
     * @return the extracted text
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if parsing fails for a reason other than the
     *                       write limit being reached
     */
    private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
        WriteOutContentHandler handler = new WriteOutContentHandler(MAX_STRING_LENGTH);
        try {
            ParseContext context = new ParseContext();
            context.set(Parser.class, txtParser);
            txtParser.parse(stream, new BodyContentHandler(handler), metadata, context);
        } catch (SAXException e) {
            // A SAXException caused by the write limit is expected; the text
            // collected so far is returned. Anything else is a real failure.
            if (!handler.isWriteLimitReached(e)) {
                // This should never happen with BodyContentHandler...
                throw new TikaException("Unexpected SAX processing failure", e);
            }
        }
        return handler.toString();
    }

    /**
     * Manual smoke test: prints the extracted content of a file.
     *
     * @param args optional; {@code args[0]} is the path of the file to read.
     *             Without arguments the original sample path is used.
     * @throws JATEException      if extraction fails
     * @throws URISyntaxException declared for backward compatibility
     */
    public static void main(String[] args) throws JATEException, URISyntaxException {
        String path = (args != null && args.length > 0)
                ? args[0]
                : "C:\\oak-project\\TermRecogniser\\evaluate\\lotus_notes\\ Workshop-QG9JVW.txt";
        ContentExtractor contentExtractor = new ContentExtractor();
        File file = new File(path);
        String content = contentExtractor.extractContent(file.toURI());
        System.out.println(content);
    }
}