package com.enonic.cms.plugin.extractor; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringWriter; import java.util.Arrays; import java.util.HashSet; import java.util.Set; import org.apache.jackrabbit.extractor.CompositeTextExtractor; import org.apache.jackrabbit.extractor.MsExcelTextExtractor; import org.apache.jackrabbit.extractor.MsOutlookTextExtractor; import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor; import org.apache.jackrabbit.extractor.MsWordTextExtractor; import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor; import org.apache.jackrabbit.extractor.PlainTextExtractor; import org.apache.jackrabbit.extractor.PngTextExtractor; import org.apache.jackrabbit.extractor.RTFTextExtractor; import org.apache.jackrabbit.extractor.XMLTextExtractor; import com.enonic.cms.api.plugin.ext.TextExtractor; import com.enonic.cms.plugin.extractor.xformat.MsExcelXslxTextExtractor; import com.enonic.cms.plugin.extractor.xformat.MsPowerPointlPptxTextExtractor; import com.enonic.cms.plugin.extractor.xformat.MsWordDocxTextExtractor; public final class ExtractorPack extends TextExtractor { private final CompositeTextExtractor extractor; private final Set<String> allowedMimeTypes; public ExtractorPack() { this.extractor = new CompositeTextExtractor(); // HTML Extractor fails, dont use this //this.extractor.addTextExtractor( new HTMLTextExtractor() ); this.extractor.addTextExtractor( new MsExcelTextExtractor() ); this.extractor.addTextExtractor( new MsOutlookTextExtractor() ); this.extractor.addTextExtractor( new MsPowerPointTextExtractor() ); this.extractor.addTextExtractor( new MsWordTextExtractor() ); this.extractor.addTextExtractor( new MsWordDocxTextExtractor() ); this.extractor.addTextExtractor( new MsExcelXslxTextExtractor() ); this.extractor.addTextExtractor( new MsPowerPointlPptxTextExtractor() ); this.extractor.addTextExtractor( new OpenOfficeTextExtractor() ); this.extractor.addTextExtractor( new PlainTextExtractor() ); this.extractor.addTextExtractor( new PngTextExtractor() ); this.extractor.addTextExtractor( new RTFTextExtractor() ); this.extractor.addTextExtractor( new XMLTextExtractor() ); final String[] contentTypes = this.extractor.getContentTypes(); this.allowedMimeTypes = new HashSet<String>( Arrays.asList( contentTypes ) ); } public boolean canHandle( final String mimeType ) { return this.allowedMimeTypes.contains( mimeType ); } public String extractText( final String mimeType, final InputStream inputStream, final String encoding ) throws IOException { if ( !canHandle( mimeType ) ) { return null; } final Reader reader = this.extractor.extractText( inputStream, mimeType, encoding ); return toString( reader ); } private String toString( final Reader reader ) throws IOException { final StringWriter out = new StringWriter(); final char[] buffer = new char[1024]; while ( true ) { final int num = reader.read( buffer ); if ( num <= 0 ) { break; } out.write( buffer, 0, num ); } reader.close(); out.close(); return out.getBuffer().toString(); } }