/* * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Contributors: * Nuxeo - initial API and implementation * */ package org.nuxeo.ecm.core.convert.plugins.text.extractors; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; import java.util.Map; import org.apache.commons.io.IOUtils; import org.apache.poi.POITextExtractor; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.xmlbeans.XmlException; import org.nuxeo.ecm.core.api.Blob; import org.nuxeo.ecm.core.api.Blobs; import org.nuxeo.ecm.core.api.blobholder.BlobHolder; import org.nuxeo.ecm.core.convert.api.ConversionException; import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder; import org.nuxeo.ecm.core.convert.extension.Converter; import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor; import org.nuxeo.runtime.api.Framework; public class MSOffice2TextConverter implements Converter { @Override public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { File f = null; OutputStream fas = null; try { POITextExtractor extractor = ExtractorFactory.createExtractor(blobHolder.getBlob().getStream()); // TODO: find a way to distinguish headings from paragraphs using // WordExtractor#getParagraphText()? // Get extracted text with Unix end of line characters String extractedText = extractor.getText().replace("\r\n", "\n"); byte[] bytes = extractedText.getBytes("UTF-8"); f = Framework.createTempFile("po-msoffice2text", ".txt"); fas = new FileOutputStream(f); fas.write(bytes); try (InputStream is = new FileInputStream(f)) { Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8"); return new SimpleCachableBlobHolder(blob); } } catch (IOException | OpenXML4JException | XmlException e) { throw new ConversionException("Error during MSOffice2Text conversion", e); } finally { IOUtils.closeQuietly(fas); if (f != null) { f.delete(); } } } @Override public void init(ConverterDescriptor descriptor) { } }