/*
 * (C) Copyright 2002-2007 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 *
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Map;

import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.impl.blob.FileBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

/**
 * Converter that extracts the plain text of a document through Apache POI's
 * {@link ExtractorFactory} and returns it as a UTF-8 {@code text/plain} blob.
 */
public class MSOffice2TextConverter implements Converter {

    /**
     * Extracts the text of the blob held by {@code blobHolder}.
     * <p>
     * Windows line endings ({@code \r\n}) in the extracted text are replaced
     * by Unix ones ({@code \n}). The text is encoded as UTF-8, staged in a
     * temporary file and handed to a {@link FileBlob}; the temporary file is
     * removed before this method returns.
     *
     * @param blobHolder holder of the source blob to extract text from
     * @param parameters conversion parameters; not used by this converter
     * @return a {@link SimpleCachableBlobHolder} wrapping the extracted text
     * @throws ConversionException if anything fails during extraction or
     *             while building the resulting blob (the original exception
     *             is kept as the cause)
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder,
            Map<String, Serializable> parameters) throws ConversionException {
        File f = null;
        InputStream source = null;
        try {
            source = blobHolder.getBlob().getStream();
            POITextExtractor extractor = ExtractorFactory.createExtractor(source);

            // TODO: find a way to distinguish headings from paragraphs using
            // WordExtractor#getParagraphText()?

            // Get extracted text with Unix end of line characters
            String extractedText = extractor.getText().replace("\r\n", "\n");
            byte[] bytes = extractedText.getBytes("UTF-8");

            f = File.createTempFile("po-msoffice2text", ".txt");
            writeBytes(f, bytes);

            Blob blob = readAsTextBlob(f);
            return new SimpleCachableBlobHolder(blob);
        } catch (Exception e) {
            throw new ConversionException(
                    "Error during MSOffice2Text conversion", e);
        } finally {
            // The text has been fully extracted (or the conversion has
            // failed) by now, so the source stream is no longer needed.
            closeQuietly(source);
            if (f != null && !f.delete()) {
                // Could not remove it right away: at least do not leave it
                // behind once the JVM stops.
                f.deleteOnExit();
            }
        }
    }

    /**
     * Writes {@code bytes} to {@code target}, closing the stream before
     * returning so that the content is complete when the file is read back.
     * A failure to close is reported to the caller instead of being ignored,
     * because it may mean the data was not entirely written.
     */
    private static void writeBytes(File target, byte[] bytes)
            throws IOException {
        OutputStream out = new FileOutputStream(target);
        try {
            out.write(bytes);
        } finally {
            out.close();
        }
    }

    /**
     * Builds a UTF-8 {@code text/plain} {@link FileBlob} from the content of
     * {@code textFile}.
     * <p>
     * The stream opened on {@code textFile} is always closed here, so that no
     * handle is left open on the file when the caller deletes it.
     * NOTE(review): this assumes the {@code FileBlob} constructor consumes
     * the stream before returning; the deletion of the temporary file right
     * after this call already depends on that.
     */
    private static Blob readAsTextBlob(File textFile) throws IOException {
        InputStream in = new FileInputStream(textFile);
        try {
            return new FileBlob(in, "text/plain", "UTF-8");
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Closes {@code resource} if it is not {@code null}, ignoring any
     * {@link IOException}: this is only used on streams that were read from,
     * where a failed close cannot lose data.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // best effort: nothing useful can be done at this point
        }
    }

    /**
     * Does nothing: this converter does not read anything from its
     * descriptor.
     *
     * @param descriptor the converter descriptor; ignored
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }

}