/** * Copyright (c) 2009 Juwi MacMillan Group GmbH * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.juwimm.cms.search.res; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.poi.hwpf.extractor.WordExtractor; import org.springframework.beans.factory.annotation.Autowired; import org.tizzit.util.XercesHelper; import de.juwimm.cms.model.DocumentHbm; import de.juwimm.cms.model.DocumentHbmDao; /** * @author <a href="mailto:carsten.schalm@juwimm.com">Carsten Schalm</a> * company Juwi|MacMillan Group Gmbh, Walsrode, Germany * @version $Id$ */ public class WordDocumentLocator { private static Logger log = Logger.getLogger(WordDocumentLocator.class); public static final String MIME_TYPE = "application/msword"; @Autowired private DocumentHbmDao documentHbmDao; public Document getDocument(de.juwimm.cms.model.DocumentHbm document) throws IOException { Document doc = new Document(); InputStream bis = new ByteArrayInputStream(documentHbmDao.getDocumentContent(document.getDocumentId())); WordExtractor extractor = new WordExtractor(bis); String contents = extractor.getText(); doc.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("documentId", document.getDocumentId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("uid", document.getDocumentId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); String docName = document.getDocumentName(); if (docName == null) docName = ""; doc.add(new Field("documentName", docName, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("title", docName, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("unitId", document.getUnit().getUnitId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("unitName", document.getUnit().getName(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("mimeType", document.getMimeType(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("timeStamp", document.getTimeStamp().toString(), Field.Store.YES, Field.Index.NO)); int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); if (summary != null && summary.length() > 0) { try { summary = XercesHelper.html2nodeUTF8(summary); } catch (Exception e) { // ignore } } if (summary == null) summary = ""; doc.add(new Field("summary", summary, Field.Store.YES, Field.Index.NO)); return doc; } public Document getResource(DocumentHbm document) throws IOException { Document resource = new Document(); InputStream bis = new ByteArrayInputStream(documentHbmDao.getDocumentContent(document.getDocumentId())); getContent(resource, bis); resource.add(new Field("siteId", document.getUnit().getSite().getSiteId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); resource.add(new Field("documentId", document.getDocumentId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); resource.add(new Field("uid", document.getDocumentId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); String docName = document.getDocumentName(); if (docName == null) docName = ""; resource.add(new Field("documentName", docName, Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("title", docName, Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("unitId", document.getUnit().getUnitId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); resource.add(new Field("unitName", document.getUnit().getName(), Field.Store.YES, Field.Index.ANALYZED)); resource.add(new Field("mimeType", document.getMimeType(), Field.Store.YES, Field.Index.NOT_ANALYZED)); resource.add(new Field("timeStamp", document.getTimeStamp().toString(), Field.Store.YES, Field.Index.NO)); return resource; } public Document getExternalResource(String url, InputStream in) throws IOException { Document resource = new Document(); resource.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED)); resource.add(new Field("uid", url, Field.Store.YES, Field.Index.NOT_ANALYZED)); resource = getContent(resource, in); return resource; } private Document getContent(Document resource, InputStream in) throws IOException { WordExtractor extractor = new WordExtractor(in); String contents = extractor.getText(); resource.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED)); int summarySize = Math.min(contents.length(), 500); String summary = contents.substring(0, summarySize); if (summary != null && summary.length() > 0) { try { summary = XercesHelper.html2nodeUTF8(summary); } catch (Exception e) { // ignore } } if (summary == null) summary = ""; resource.add(new Field("summary", summary, Field.Store.YES, Field.Index.NO)); in.close(); return resource; } }