/*
 * Copyright (c) 2015, WSO2 Inc. (http://www.wso2.org) All Rights Reserved.
 *
 * WSO2 Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.wso2.carbon.registry.indexing.indexer;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.wso2.carbon.registry.indexing.AsyncIndexer.File2Index;
import org.wso2.carbon.registry.indexing.solr.IndexDocument;

/**
 * {@link Indexer} implementation that extracts the plain text of a PDF
 * resource so that it can be handed to the index as an {@link IndexDocument}.
 */
public class PDFIndexer implements Indexer {

    public static final Log log = LogFactory.getLog(PDFIndexer.class);

    /**
     * Parses the bytes in {@code fileData.data} as a PDF, strips the text out
     * of it and wraps that text in an {@link IndexDocument} keyed by
     * {@code fileData.path}.
     *
     * @param fileData the resource to index; its {@code data} field is read as
     *                 the raw PDF content and its {@code path} field is used
     *                 as the document path
     * @return an {@link IndexDocument} built from the path and the extracted
     *         text (the third constructor argument is passed as {@code null})
     * @throws SolrException with {@link ErrorCode#SERVER_ERROR} if parsing the
     *                       PDF or extracting its text raises an
     *                       {@link IOException}; the original exception is
     *                       attached as the cause
     */
    public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
        COSDocument cosDoc = null;
        try {
            PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
            parser.parse();
            cosDoc = parser.getDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));

            return new IndexDocument(fileData.path, docText, null);
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg, e);
            // Chain the IOException so callers can still see the root cause.
            throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
        } finally {
            // Always release the parsed document, on success and on failure.
            if (cosDoc != null) {
                try {
                    cosDoc.close();
                } catch (IOException e) {
                    // Best effort: a failed close is logged but never masks
                    // the result or the exception of the main path.
                    log.error("Failed to close pdf doc stream ", e);
                }
            }
        }
    }
}