/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package com.xpn.xwiki.plugin.lucene;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.xpn.xwiki.plugin.lucene.textextraction.MSExcelTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.MSPowerPointTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.MSWordTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.MimetypeTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.PDFTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.PlainTextExtractor;
import com.xpn.xwiki.plugin.lucene.textextraction.XmlTextExtractor;
/**
* Extraction of text from various binary formats. Extraction itself is done by the textExtractor
* classes in Packages below <code>org.outerj.daisy</code> taken from the <a
* href="http://new.cocoondev.org/daisy">Daisy project </a>.
*
* @version $Id: $
*/
public class TextExtractor
{
private static final Log LOG = LogFactory.getLog(TextExtractor.class);
static final Map<String, MimetypeTextExtractor> textExtractors = new HashMap<String, MimetypeTextExtractor>();
static {
// TODO: make text extractors more pluggable by moving this into a config file.
final XmlTextExtractor xmlTextExtractor = new XmlTextExtractor();
textExtractors.put("application/xhtml+xml", xmlTextExtractor);
textExtractors.put("text/xml", xmlTextExtractor);
textExtractors.put("text/plain", new PlainTextExtractor());
textExtractors.put("application/pdf", new PDFTextExtractor());
// textExtractors.put ("application/vnd.sun.xml.writer", new OpenOfficeTextExtractor ());
textExtractors.put("application/msword", new MSWordTextExtractor());
textExtractors.put("application/ms-word", new MSWordTextExtractor());
textExtractors.put("application/vnd.msword", new MSWordTextExtractor());
textExtractors.put("application/vnd.ms-word", new MSWordTextExtractor());
textExtractors.put("application/vnd.ms-powerpoint", new MSPowerPointTextExtractor());
textExtractors.put("application/ms-powerpoint", new MSPowerPointTextExtractor());
textExtractors.put("application/ms-excel", new MSExcelTextExtractor());
textExtractors.put("application/vnd.ms-excel", new MSExcelTextExtractor());
}
/**
* @param content
* @param mimetype
* @return
*/
public static String getText(byte[] content, String mimetype)
{
final MimetypeTextExtractor extractor =
(MimetypeTextExtractor) textExtractors.get(mimetype);
if (extractor != null) {
try {
return extractor.getText(content);
} catch (Exception e) {
LOG.error("error getting text for mimetype " + mimetype, e);
e.printStackTrace();
}
} else {
LOG.info("no text extractor for mimetype " + mimetype);
}
return null;
}
}