package mj.ocraptor.extraction.tika.parser.microsoft;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javaaxp.core.service.IXPSAccess;
import javaaxp.core.service.IXPSPageAccess;
import javaaxp.core.service.XPSError;
import javaaxp.core.service.impl.XPSServiceImpl;
import javaaxp.core.service.impl.document.jaxb.CTCanvas;
import javaaxp.core.service.impl.document.jaxb.CTGlyphs;
import javaaxp.core.service.impl.document.jaxb.CTPath;
import javaaxp.core.service.impl.fileaccess.XPSZipFileAccess;
import javaaxp.core.service.model.document.page.IFixedPage;
import mj.ocraptor.configuration.Config;
import mj.ocraptor.configuration.properties.ConfigBool;
import mj.ocraptor.extraction.image_processing.TikaImageHelper;
import mj.ocraptor.file_handler.filter.FileType;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class XPSParser implements Parser {
private double currentXPosition = 0;
/**
*
*/
private static final long serialVersionUID = -3528366722867144747L;
private static final Set<MediaType> SUPPORTED_TYPES = Collections
.singleton(MediaType.application("vnd.ms-xpsdocument"));
private static final String XPS_MIME_TYPE = "application/vnd.ms-xpsdocument";
private XHTMLContentHandler fileXHTML;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
private Metadata metadata;
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
this.metadata = metadata;
metadata.set(Metadata.CONTENT_TYPE, XPS_MIME_TYPE);
fileXHTML = new XHTMLContentHandler(handler, metadata);
try {
parseXPS(stream);
} catch (XPSError e) {
throw new IOException(e);
}
stream.close();
}
private void parseXPS(InputStream inputStream) throws XPSError, SAXException {
TikaInputStream tikaStream = null;
try {
tikaStream = TikaInputStream.get(inputStream);
File xpsFile = tikaStream.getFile();
IXPSAccess xpsAccess = new XPSServiceImpl().getXPSAccess(xpsFile);
xhtmlStartDocument();
int firstDocNum = xpsAccess.getDocumentAccess().getFirstDocNum();
int lastDocNum = xpsAccess.getDocumentAccess().getLastDocNum();
for (int i = firstDocNum; i <= lastDocNum; i++) {
IXPSPageAccess xpsPageAccess = xpsAccess.getPageAccess(i);
XPSZipFileAccess ac = (XPSZipFileAccess) xpsAccess.getFileAccess();
ac.getFixedDocument(xpsPageAccess.getDocumentReference());
ac.getDocumentStructure(xpsPageAccess.getDocumentReference());
int firstPageNum = xpsPageAccess.getFirstPageNum();
int lastPageNum = xpsPageAccess.getLastPageNum();
for (int j = firstPageNum; j <= lastPageNum; j++) {
IFixedPage fixedPage = xpsPageAccess.getPage(j);
parseObjs(fixedPage.getPathOrGlyphsOrCanvas());
}
if (Config.inst().getProp(
ConfigBool.ENABLE_IMAGE_OCR)) {
TikaImageHelper helper = new TikaImageHelper(metadata);
try {
// Process the file in turn
ZipInputStream zip = new ZipInputStream(inputStream);
ZipEntry entry = zip.getNextEntry();
while (entry != null) {
// TODO: images
String entryExtension = null;
try {
entryExtension = FilenameUtils.getExtension(new File(entry
.getName()).getName());
} catch (Exception e) {
e.printStackTrace();
}
if (entryExtension != null
&& FileType.isValidImageFileExtension(entryExtension)) {
File imageFile = null;
try {
imageFile = TikaImageHelper.saveZipEntryToTemp(zip, entry);
helper.addImage(imageFile);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (imageFile != null) {
imageFile.delete();
}
}
}
entry = zip.getNextEntry();
}
helper.addTextToHandler(fileXHTML);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (helper != null) {
helper.close();
}
}
}
}
xhtmlEndDocument();
} catch (Exception e) {
// TODO: logging
e.printStackTrace();
} finally {
try {
if (tikaStream != null) {
tikaStream.close();
}
} catch (IOException e) {
}
}
}
private void parseObjs(List<Object> objs) throws XPSError, SAXException {
for (Object o : objs)
parseObj(o);
}
private void parseObj(Object xpsObj) throws XPSError, SAXException {
if (xpsObj instanceof CTCanvas) {
CTCanvas c = (CTCanvas) xpsObj;
xhtmlStartCanvas();
parseObjs(c.getPathOrGlyphsOrCanvas());
xhtmlEndCanvas();
} else if (xpsObj instanceof CTGlyphs) {
CTGlyphs c = (CTGlyphs) xpsObj;
if (c.getOriginX() < currentXPosition) {
fileXHTML.startElement("div");
fileXHTML.characters(" ");
fileXHTML.endElement("div");
}
String text = c.getUnicodeString();
xhtmlParagraph(text);
currentXPosition = c.getOriginX();
} else if (xpsObj instanceof CTPath) {
} else {
System.out.println("Unhandled type : "
+ xpsObj.getClass().getCanonicalName());
}
}
private void xhtmlStartDocument() throws SAXException {
fileXHTML.startDocument();
}
private void xhtmlEndDocument() throws SAXException {
fileXHTML.endDocument();
}
private void xhtmlStartCanvas() throws SAXException {
fileXHTML.startElement("div");
}
private void xhtmlEndCanvas() throws SAXException {
fileXHTML.endElement("div");
}
private void xhtmlParagraph(String text) throws SAXException {
fileXHTML.startElement("span");
fileXHTML.characters(text);
fileXHTML.endElement("span");
}
/**
* @deprecated This method will be removed in Apache Tika 1.0.
*/
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata) throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
}