package com.enonic.cms.plugin.extractor.xformat; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import org.apache.jackrabbit.extractor.AbstractTextExtractor; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class MsWordDocxTextExtractor extends AbstractTextExtractor { private static final Logger logger = LoggerFactory.getLogger( MsWordDocxTextExtractor.class ); public MsWordDocxTextExtractor() { super( new String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document"} ); } @Override public Reader extractText( final InputStream stream, final String type, final String encoding ) throws IOException { try { final OPCPackage opcPackage = OPCPackage.open( stream ); final XWPFWordExtractor xw = new XWPFWordExtractor( opcPackage ); return new StringReader( xw.getText() ); } catch ( Exception e ) { logger.warn( "Failed to extract Word text content", e ); return new StringReader( "" ); } } }