/**
 *  OpenKM, Open Document Management System (http://www.openkm.com)
 *  Copyright (c) 2006-2011  Paco Avila & Josep Llort
 *
 *  No bytes were intentionally harmed during the development of this application.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import javax.jcr.Node;
import javax.jcr.PathNotFoundException;
import javax.jcr.RepositoryException;
import javax.jcr.ValueFormatException;

import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.core.query.lucene.JackrabbitTextExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.openkm.bean.Document;
import com.openkm.core.Config;
import com.openkm.util.ReaderInputStream;
import com.openkm.util.UserActivity;

/**
 * Static facade over a shared {@link JackrabbitTextExtractor} used to obtain
 * the indexable text of a document.
 */
public class RegisteredExtractors {
	private static final Logger log = LoggerFactory.getLogger(RegisteredExtractors.class);

	// Reassigned by init() and read without locking by the other methods, so it
	// is volatile to make the replacement visible to every reader thread.
	private static volatile JackrabbitTextExtractor jte = new JackrabbitTextExtractor("");

	/** Minimum number of characters an extraction must yield to be considered successful. */
	private static final int MIN_EXTRACTION = 16;

	/**
	 * Initialize text extractors from REGISTERED_TEXT_EXTRACTORS.
	 * Replaces the shared extractor with one built from
	 * {@code Config.REGISTERED_TEXT_EXTRACTORS}.
	 */
	public static synchronized void init() {
		log.info("Initializing text extractors");
		jte = new JackrabbitTextExtractor(Config.REGISTERED_TEXT_EXTRACTORS);
	}

	/**
	 * Return registered content types.
	 *
	 * @return the content types reported by the current extractor.
	 */
	public static String[] getContentTypes() {
		return jte.getContentTypes();
	}

	/**
	 * Extract text to be indexed.
	 *
	 * <p>When the extracted reader supports mark/reset, the extraction is flagged
	 * as a failure if it yields fewer than {@link #MIN_EXTRACTION} characters; the
	 * (short) stream is still returned in that case. When the extractor throws,
	 * the failure is logged and {@code null} is returned. Failures are recorded
	 * through {@link UserActivity} only when {@code node} is not {@code null}.</p>
	 *
	 * @param node node the content belongs to, used only for failure reporting; may be {@code null}.
	 * @param mimeType MIME type handed to the extractor.
	 * @param encoding character encoding handed to the extractor.
	 * @param is binary content to extract text from.
	 * @return a stream over the extracted text, or {@code null} if extraction threw.
	 * @throws RepositoryException if reading the node's path, session or UUID fails
	 *         while reporting an extraction failure.
	 * @throws IOException declared for callers; extraction errors themselves are
	 *         caught and reported rather than propagated.
	 */
	public static InputStream getText(Node node, String mimeType, String encoding, InputStream is)
			throws ValueFormatException, PathNotFoundException, RepositoryException, IOException {
		log.info("getText({}, {}, {})", new Object[] { mimeType, encoding, is });
		InputStream ret = null;
		Reader rd = null;
		boolean failure = false;
		String failureMessage = "Unknown error";

		try {
			rd = jte.extractText(is, mimeType, encoding);

			// Check for minimum text extraction size
			if (rd.markSupported()) {
				if (!hasMinimumText(rd)) {
					failure = true;
					failureMessage = "Extracted text shorter than " + MIN_EXTRACTION + " characters";
				}
			} else {
				log.warn("Mark not supported in {}", rd.getClass().getCanonicalName());
			}

			// Convert reader to input stream
			ret = new ReaderInputStream(rd);
		} catch (Exception e) {
			log.warn("Text extraction failure: {}", e.getMessage());
			// Some exceptions (e.g. NullPointerException) carry no message.
			failureMessage = (e.getMessage() != null) ? e.getMessage() : e.toString();
			failure = true;

			// ret is still null whenever we get here, so nobody else owns the
			// reader: release it instead of leaking it.
			IOUtils.closeQuietly(rd);
		}

		if (failure || ret == null) {
			if (node != null) {
				log.warn("There was a problem extracting text from '{}'", node.getPath());
				UserActivity.log(node.getSession().getUserID(), "MISC_TEXT_EXTRACTION_FAILURE",
						node.getUUID(), node.getPath() + ", FailureMessage: " + failureMessage);
			}
		}

		log.info("getText: {}", ret);
		return ret;
	}

	/**
	 * Tell whether the reader can supply at least {@link #MIN_EXTRACTION}
	 * characters, leaving the reader positioned where it was on entry.
	 * The reader must support mark/reset.
	 */
	private static boolean hasMinimumText(Reader rd) throws IOException {
		// The read-ahead limit must exceed the number of characters consumed
		// before reset(); a limit of 0 lets buffered readers drop the mark.
		rd.mark(MIN_EXTRACTION + 1);
		long skipped = 0;

		// skip() may consume fewer characters than requested, so keep going
		// until the minimum is reached or no further progress is made.
		while (skipped < MIN_EXTRACTION) {
			long n = rd.skip(MIN_EXTRACTION - skipped);

			if (n <= 0) {
				break;
			}

			skipped += n;
		}

		rd.reset();
		return skipped >= MIN_EXTRACTION;
	}

	/**
	 * EXPERIMENTAL
	 *
	 * <p>Reads the {@code jcr:data} stream of {@code contNode}, extracts its text
	 * (always requesting UTF-8) and stores the result in the
	 * {@code Document.TEXT} property of {@code contNode}. Both streams are closed
	 * before returning.</p>
	 *
	 * <p>NOTE(review): when extraction fails the value passed to
	 * {@code setProperty} is {@code null}; presumably this removes the property
	 * as per JCR semantics - confirm.</p>
	 *
	 * @param docNode document node, used for failure reporting.
	 * @param contNode content node holding the binary data and receiving the text.
	 * @param mimeType MIME type of the binary data.
	 */
	public static void index(Node docNode, Node contNode, String mimeType) throws ValueFormatException,
			PathNotFoundException, RepositoryException, IOException {
		InputStream in = null;
		InputStream out = null;

		try {
			in = contNode.getProperty(JcrConstants.JCR_DATA).getStream();
			out = RegisteredExtractors.getText(docNode, mimeType, "UTF-8", in);
			contNode.setProperty(Document.TEXT, out);
		} finally {
			IOUtils.closeQuietly(out);
			IOUtils.closeQuietly(in);
		}
	}
}