package mj.ocraptor.file_handler;
import static mj.ocraptor.file_handler.filter.FileType.APPLE_KEY;
import static mj.ocraptor.file_handler.filter.FileType.APPLE_NUMBERS;
import static mj.ocraptor.file_handler.filter.FileType.APPLE_PAGES;
import static mj.ocraptor.file_handler.filter.FileType.EPUB;
import static mj.ocraptor.file_handler.filter.FileType.LO_CALC;
import static mj.ocraptor.file_handler.filter.FileType.LO_IMPRESS;
import static mj.ocraptor.file_handler.filter.FileType.LO_WRITER;
import static mj.ocraptor.file_handler.filter.FileType.PS;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import mj.ocraptor.configuration.Config;
import mj.ocraptor.configuration.properties.ConfigBool;
import mj.ocraptor.configuration.properties.ConfigInteger;
import mj.ocraptor.database.dao.FileEntry;
import mj.ocraptor.database.dao.ResultError;
import mj.ocraptor.database.search.TextProcessing;
import mj.ocraptor.extraction.tika.parser.epub.EpubParser;
import mj.ocraptor.extraction.tika.parser.html.HtmlParser;
import mj.ocraptor.extraction.tika.parser.image.ImageParser;
import mj.ocraptor.extraction.tika.parser.iwork.IWorkPackageParser;
import mj.ocraptor.extraction.tika.parser.microsoft.OfficeParser;
import mj.ocraptor.extraction.tika.parser.microsoft.XPSParser;
import mj.ocraptor.extraction.tika.parser.microsoft.ooxml.OOXMLParser;
import mj.ocraptor.extraction.tika.parser.odf.OpenDocumentParser;
import mj.ocraptor.extraction.tika.parser.pdf.PDFParser;
import mj.ocraptor.extraction.tika.parser.rtf.RTFParser;
import mj.ocraptor.extraction.tika.parser.txt.TXTParser;
import mj.ocraptor.extraction.tika.parser.xml.XMLParser;
import mj.ocraptor.extraction.tika.parser.xoj.XojParser;
import mj.ocraptor.file_handler.filter.FileType;
import mj.ocraptor.rmi_client.RMIClientImpl;
import mj.ocraptor.rmi_server.RMIServerImpl;
import mj.ocraptor.tools.St;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
/**
 * Extracts the textual content of a single file through Apache Tika and packs it, optionally
 * together with the document metadata, into one XHTML string stored in a {@link FileEntry}.
 */
public class TextExtractorSub {
  /** Metadata keys (compared case-insensitively) that are never copied into the result. */
  private static final String[] IGNORED_METADATA = { "x-parsed-by" };

  /**
   * Namespace attribute that is stripped from the serialized parser output and re-added once on
   * the {@code div} that wraps the whole document.
   */
  private static final String XHTML_NAMESPACE = " xmlns=\"http://www.w3.org/1999/xhtml\"";

  private final Config cfg;
  private final TextExtractorTools tools;

  /**
   * Creates an extractor bound to the configuration returned by {@link Config#inst()}.
   */
  public TextExtractorSub() {
    this.cfg = Config.inst();
    this.tools = new TextExtractorTools();
  }

  /**
   * Parses the given file with Tika and returns its content as preprocessed XHTML.
   *
   * <p>PostScript files are first converted to a temporary PDF, which is removed again once
   * parsing has finished (successfully or not).
   *
   * @param file
   *          the file to extract text from
   * @return a {@link FileEntry} flagged with {@link ResultError#NOT_SUPPORTED} when no parser is
   *         available for the file's media type; a {@link FileEntry} carrying the full text when
   *         extraction produced content; {@code null} when the parser produced only blank output
   * @throws Exception
   *           any failure raised while loading the Tika configuration, opening the file or
   *           parsing it is propagated unchanged
   */
  public FileEntry extractTextTika(final File file) throws Exception {
    final FileType fileType = FileType.get(file);

    // ------------------------------------------------ //
    // -- set up tika and the parsers usable for this file
    // ------------------------------------------------ //
    final TikaConfig config = new TikaConfig(Config.inst().getTikaMimeFile());
    final AutoDetectParser autoDetectParser = new AutoDetectParser(config);
    final Map<MediaType, Parser> availableParsers = tools.getAvailableParsers(file);

    // ------------------------------------------------ //
    // -- bail out early for unsupported file types
    // ------------------------------------------------ //
    if (!availableParsers.containsKey(fileType.getMediaType())) {
      final FileEntry unsupported = new FileEntry(file);
      // not supported filetype --> don't make a db entry
      unsupported.setError(ResultError.NOT_SUPPORTED);
      return unsupported;
    }

    autoDetectParser.setParsers(availableParsers);
    final Metadata metadata = new Metadata();
    this.addStandardMetadata(metadata, file, fileType);

    InputStream inputStream = null;
    File tempFile = null;
    // StringWriter holds no system resource, so it is not part of the cleanup below.
    final StringWriter stringWriter = new StringWriter();
    try {
      if (FileType.is(file, FileType.PS)) {
        tempFile = PDFParser.convertPostScriptToPDF(file);
        inputStream = new FileInputStream(tempFile);
      } else {
        inputStream = new FileInputStream(file);
      }

      // ------------------------------------------------ //
      // parse document and convert content to xhtml
      // ------------------------------------------------ //
      final SAXTransformerFactory factory =
          (SAXTransformerFactory) SAXTransformerFactory.newInstance();
      final TransformerHandler handler = factory.newTransformerHandler();
      handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
      handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
      handler.setResult(new StreamResult(stringWriter));
      autoDetectParser.parse(inputStream, new BodyContentHandler(handler), metadata);

      final String xhtml = stringWriter.toString();
      if (xhtml.trim().isEmpty()) {
        // nothing was extracted --> no entry at all
        return null;
      }

      final String fullText = buildDocument(xhtml, normalizeMetadata(metadata));
      final FileEntry result = new FileEntry(file);
      result.setFullText(fullText);
      return result;
    } finally {
      // Each cleanup step is independent: a failing close() must not keep the
      // temporary PDF from being removed.
      closeQuietly(inputStream);
      if (tempFile != null && tempFile.exists() && !tempFile.delete()) {
        tempFile.deleteOnExit();
      }
    }
  }

  /**
   * Wraps the serialized parser output into a single namespaced {@code div}, collapses whitespace
   * runs and, when {@link ConfigBool#INCLUDE_METADATA} is enabled, appends one paragraph per
   * metadata entry. The assembled string is passed through {@link TextProcessing#preProcess}.
   *
   * @param xhtml
   *          non-blank XML produced by the transformer handler
   * @param metadata
   *          already normalized metadata to append
   * @return the preprocessed document text
   */
  private String buildDocument(final String xhtml, final Metadata metadata) {
    String body = xhtml.replace(XHTML_NAMESPACE, "");
    // open the wrapping div directly behind the xml declaration
    body = body.replaceFirst("\\?>", "\\?><div" + XHTML_NAMESPACE + ">");
    body = body.replaceAll("\\s+", " ");

    final StringBuilder builder = new StringBuilder(body);
    if (this.cfg.getProp(ConfigBool.INCLUDE_METADATA)) {
      builder.append("<div class=\"metadata\">");
      for (String name : metadata.names()) {
        if (name == null || name.trim().isEmpty()) {
          continue;
        }
        final String value = metadata.get(name);
        if (value != null && !value.trim().isEmpty()) {
          builder.append("<p> ").append(name).append("=").append(value).append(" </p>");
        }
      }
      builder.append("</div>");
    }
    // closes the wrapping div opened behind the xml declaration
    builder.append("</div>");
    return TextProcessing.preProcess(builder.toString());
  }

  /**
   * Closes the stream if there is one. A failure while closing is deliberately ignored: at that
   * point the content has already been read (or a more relevant exception is in flight).
   *
   * @param stream
   *          stream to close, may be {@code null}
   */
  private static void closeQuietly(final InputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException ignored) {
      // best effort cleanup, see method comment
    }
  }

  /**
   * Checks a file against a size limit given in kilobytes. Not called from within this class.
   *
   * @param maxSize
   *          upper bound (exclusive) in KB, or {@code null} for "no limit configured"
   * @param currentFile
   *          file whose length is checked
   * @return {@code null} if no limit was given, otherwise whether the file size in whole KB is
   *         below the limit
   */
  private Boolean validSize(Integer maxSize, File currentFile) {
    if (maxSize == null) {
      return null;
    }
    final long fileSizeInKB = currentFile.length() / 1024;
    return Boolean.valueOf(fileSizeInKB < maxSize);
  }

  /**
   * Seeds the metadata with the content type (only if none is set yet), the file name and the
   * parent directory of the file. Does nothing if either argument is {@code null}.
   *
   * @param metadata
   *          metadata object handed to the parser
   * @param file
   *          file that is about to be parsed
   * @param fileType
   *          detected type of {@code file}, supplies the mime string
   */
  private void addStandardMetadata(Metadata metadata, File file, FileType fileType) {
    if (metadata == null || file == null) {
      return;
    }
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
      metadata.set(Metadata.CONTENT_TYPE, fileType.getMimeString());
    }
    metadata.set(Config.META_FILE_NAME, file.getName());
    metadata.set(Config.META_FILE_PATH, file.getParent());
  }

  /**
   * Builds a cleaned copy of the metadata: keys and values are run through
   * {@link St#normalizeDocumentText} and {@link St#stripHtmlTags}, whitespace characters in
   * values become plain spaces, and blank or ignored entries are left out. Only the value
   * returned by {@code Metadata.get} is copied for each key.
   *
   * @param metadata
   *          metadata as filled by the parser
   * @return a new metadata object containing the cleaned entries
   */
  private Metadata normalizeMetadata(Metadata metadata) {
    final Metadata filteredMetadata = new Metadata();
    for (String rawKey : metadata.names()) {
      if (rawKey == null || rawKey.trim().isEmpty()) {
        continue;
      }
      // The value has to be fetched with the untouched key: once the key is
      // normalized it may no longer match the entry stored in the metadata.
      String value = metadata.get(rawKey);
      if (value == null || value.trim().isEmpty()) {
        continue;
      }

      final String key = St.stripHtmlTags(St.normalizeDocumentText(rawKey));
      if (key == null || key.trim().isEmpty() || isIgnoredMetadata(key)) {
        continue;
      }

      value = St.normalizeDocumentText(value);
      value = St.stripHtmlTags(value);
      value = value.replaceAll("\\s", " ");
      filteredMetadata.add(key, value);
    }
    return filteredMetadata;
  }

  /**
   * Tells whether the given (normalized) key is listed in {@link #IGNORED_METADATA}.
   *
   * @param key
   *          metadata key to test
   * @return {@code true} if the key must not be copied
   */
  private static boolean isIgnoredMetadata(final String key) {
    for (String ignored : IGNORED_METADATA) {
      if (ignored.equalsIgnoreCase(key)) {
        return true;
      }
    }
    return false;
  }
}