package org.icij.extract.parser.wmf;

import org.apache.batik.transcoder.wmf.WMFConstants;
import org.apache.batik.transcoder.wmf.tosvg.GdiObject;
import org.apache.batik.transcoder.wmf.tosvg.MetaRecord;
import org.apache.batik.transcoder.wmf.tosvg.WMFFont;
import org.apache.batik.transcoder.wmf.tosvg.WMFPainter;
import org.apache.batik.transcoder.wmf.tosvg.WMFRecordStore;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import static org.apache.tika.sax.XHTMLContentHandler.XHTML;

/**
 * Tika parser that reads a WMF stream into a Batik {@link WMFRecordStore} and emits the payload
 * of every text-drawing record as an XHTML paragraph.
 *
 * <p>The font most recently selected by a {@code META_SELECTOBJECT} record determines the
 * character set used to decode the text records that follow it.</p>
 */
public class WMFParser extends AbstractParser {

	private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays
			.asList(MediaType.application("x-msmetafile"), MediaType.image("x-wmf"), MediaType.image("wmf"))));

	private static final String WMF_MIME_TYPE = "image/wmf";

	private static final long serialVersionUID = 5516989102471431040L;

	/**
	 * Returns the media types handled by this parser.
	 *
	 * @param context the parse context (not consulted)
	 * @return an unmodifiable set of the supported media types
	 */
	@Override
	public Set<MediaType> getSupportedTypes(final ParseContext context) {
		return SUPPORTED_TYPES;
	}

	/**
	 * Parses the given stream as a WMF file.
	 *
	 * <p>Sets the content type to {@code image/wmf}, records the store's pixel height and width
	 * under the {@code height} and {@code width} metadata keys, and writes one {@code <p>}
	 * element per text record to the handler.</p>
	 *
	 * @param stream the WMF data; it is read but not closed here
	 * @param handler receives the XHTML SAX events
	 * @param metadata receives the content type and dimensions
	 * @param context the parse context (not consulted)
	 * @throws IOException if the record store fails to read the stream
	 * @throws SAXException if the handler rejects an event
	 * @throws TikaException declared by the parser contract; not thrown directly by this method
	 */
	@Override
	public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
	                  final ParseContext context) throws IOException, SAXException, TikaException {
		metadata.set(Metadata.CONTENT_TYPE, WMF_MIME_TYPE);

		final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
		xhtml.startDocument();

		final WMFRecordStore store = new WMFRecordStore();
		store.read(new DataInputStream(stream));

		metadata.set("height", Integer.toString(store.getHeightPixels()));
		metadata.set("width", Integer.toString(store.getWidthPixels()));

		final int numRecords = store.getNumRecords();
		final int numObjects = store.getNumObjects();

		// Font currently in effect; null until a font object has been selected.
		WMFFont wmfFont = null;

		for (int i = 0; i < numRecords; i++) {
			final MetaRecord record = store.getRecord(i);

			// Logic borrowed from Batik's WMFTranscoder and WMFPainter.
			switch (record.functionId) {
				case WMFConstants.META_SELECTOBJECT:
					wmfFont = selectFont(store, record, numObjects, wmfFont);
					break;

				case WMFConstants.META_TEXTOUT:
				case WMFConstants.META_DRAWTEXT:
				case WMFConstants.META_EXTTEXTOUT:
					writeText(xhtml, record, wmfFont);
					break;

				default:
					// Every other record type carries no text and is ignored.
					break;
			}
		}

		xhtml.endDocument();
	}

	/**
	 * Resolves the font in effect after a {@code META_SELECTOBJECT} record.
	 *
	 * @param store the record store holding the GDI objects
	 * @param record the select-object record
	 * @param numObjects the object count reported by the store
	 * @param current the font in effect before this record; may be {@code null}
	 * @return the newly selected font when the record selects a used font object, otherwise
	 *         {@code current}
	 */
	private WMFFont selectFont(final WMFRecordStore store, final MetaRecord record, final int numObjects,
	                           final WMFFont current) {
		int gdiIndex = record.ElementAt(0);

		// Indices with the high bit set are skipped, as in the Batik painter logic.
		if ((gdiIndex & 0x80000000) != 0) {
			return current;
		}

		if (gdiIndex >= numObjects) {
			gdiIndex -= numObjects;
		}

		// The index comes straight from the file: skip anything still outside [0, numObjects)
		// instead of passing it to the store.
		if (gdiIndex < 0 || gdiIndex >= numObjects) {
			return current;
		}

		final GdiObject gdiObj = store.getObject(gdiIndex);
		if (!gdiObj.isUsed()) {
			return current;
		}

		if (gdiObj.getType() == WMFPainter.FONT) {
			return (WMFFont) gdiObj.getObject();
		}

		return current;
	}

	/**
	 * Emits the byte payload of a text record as a single XHTML paragraph.
	 *
	 * <p>Records that are not {@link MetaRecord.ByteRecord} instances, or that carry no byte
	 * array, are skipped rather than failing the whole parse.</p>
	 *
	 * @param xhtml the handler to write to
	 * @param record the text record
	 * @param wmfFont the font in effect, used to pick the charset; may be {@code null}
	 * @throws SAXException if the handler rejects an event
	 */
	private void writeText(final XHTMLContentHandler xhtml, final MetaRecord record, final WMFFont wmfFont)
			throws SAXException {
		if (!(record instanceof MetaRecord.ByteRecord)) {
			return;
		}

		final byte[] bytes = ((MetaRecord.ByteRecord) record).bstr;
		if (null == bytes) {
			return;
		}

		final char[] chr = decodeString(bytes, wmfFont).toCharArray();

		xhtml.startElement(XHTML, "p", "p", new AttributesImpl());
		xhtml.characters(chr, 0, chr.length);
		xhtml.endElement(XHTML, "p", "p");
	}

	/**
	 * Decodes a text record's bytes using the charset implied by the given font.
	 *
	 * @param buffer the raw bytes of the text record
	 * @param wmfFont the font in effect; when {@code null} the ANSI charset is used
	 * @return the decoded string
	 */
	private String decodeString(final byte[] buffer, final WMFFont wmfFont) {
		final String charset = charsetNameFor(wmfFont);

		try {
			return new String(buffer, charset);
		} catch (UnsupportedEncodingException ignored) {
			// Best effort: this runtime lacks the named charset, so decode with the platform default.
			return new String(buffer);
		}
	}

	/**
	 * Maps a font's WMF charset identifier to the charset name constant used for decoding.
	 *
	 * @param wmfFont the font in effect; may be {@code null}
	 * @return the charset name; ANSI for a {@code null} font or an unrecognised identifier
	 */
	private String charsetNameFor(final WMFFont wmfFont) {
		if (null == wmfFont) {
			return WMFConstants.CHARSET_ANSI;
		}

		switch (wmfFont.charset) {
			case WMFConstants.META_CHARSET_DEFAULT:
				return WMFConstants.CHARSET_DEFAULT;
			case WMFConstants.META_CHARSET_GREEK:
				return WMFConstants.CHARSET_GREEK;
			case WMFConstants.META_CHARSET_RUSSIAN:
				return WMFConstants.CHARSET_CYRILLIC;
			case WMFConstants.META_CHARSET_HEBREW:
				return WMFConstants.CHARSET_HEBREW;
			case WMFConstants.META_CHARSET_ARABIC:
				return WMFConstants.CHARSET_ARABIC;
			case WMFConstants.META_CHARSET_ANSI:
			default:
				return WMFConstants.CHARSET_ANSI;
		}
	}
}