WordDocumentLocator.java example

Explorer
tizzit-master
/**
 * Copyright (c) 2009 Juwi MacMillan Group GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.juwimm.cms.search.res;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.springframework.beans.factory.annotation.Autowired;
import org.tizzit.util.XercesHelper;

import de.juwimm.cms.model.DocumentHbm;
import de.juwimm.cms.model.DocumentHbmDao;

/**
 * Creates Lucene {@link Document}s from Microsoft Word content.
 * <p>
 * The text is extracted with Apache POI's {@link WordExtractor} and stored in a
 * <code>contents</code> field; a shortened version of it is stored in a
 * <code>summary</code> field. For documents held in the CMS, additional
 * metadata fields (ids, names, unit, MIME type, time stamp) are added.
 *
 * @author <a href="mailto:carsten.schalm@juwimm.com">Carsten Schalm</a>
 * company Juwi|MacMillan Group Gmbh, Walsrode, Germany
 * @version $Id$
 */
public class WordDocumentLocator {
	private static final Logger log = Logger.getLogger(WordDocumentLocator.class);
	/** MIME type string for Word documents (presumably used by callers to pick this locator - confirm against callers). */
	public static final String MIME_TYPE = "application/msword";
	/** Maximum number of characters of the extracted text that go into the summary field. */
	private static final int SUMMARY_MAX_LENGTH = 500;
	@Autowired
	private DocumentHbmDao documentHbmDao;

	/**
	 * Builds a Lucene document for a Word document held in the CMS.
	 * <p>
	 * Field order: <code>contents</code>, the metadata fields, then <code>summary</code>.
	 * Unlike {@link #getResource(DocumentHbm)} no <code>siteId</code> field is added.
	 *
	 * @param document the CMS document whose binary content is loaded through the DAO
	 * @return a new Lucene document holding the extracted text and the metadata
	 * @throws IOException if the Word content cannot be read
	 */
	public Document getDocument(DocumentHbm document) throws IOException {
		Document doc = new Document();
		String contents = extractText(openContent(document));
		doc.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED));
		addMetadataFields(doc, document);
		doc.add(new Field("summary", buildSummary(contents), Field.Store.YES, Field.Index.NO));
		return doc;
	}

	/**
	 * Builds a Lucene document for a Word document held in the CMS, including the
	 * id of the site the document's unit belongs to.
	 * <p>
	 * Field order: <code>contents</code>, <code>summary</code>, <code>siteId</code>,
	 * then the metadata fields.
	 *
	 * @param document the CMS document whose binary content is loaded through the DAO
	 * @return a new Lucene document holding the extracted text and the metadata
	 * @throws IOException if the Word content cannot be read
	 */
	public Document getResource(DocumentHbm document) throws IOException {
		Document resource = new Document();
		getContent(resource, openContent(document));
		resource.add(new Field("siteId", document.getUnit().getSite().getSiteId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
		addMetadataFields(resource, document);
		return resource;
	}

	/**
	 * Builds a Lucene document for Word content that is not stored in the CMS.
	 * The given url is stored both as <code>url</code> and as <code>uid</code>.
	 *
	 * @param url address of the resource; must not be <code>null</code> because Lucene rejects null field values
	 * @param in stream delivering the Word content; it is closed by this method, also when extraction fails
	 * @return a new Lucene document holding the url, the extracted text and its summary
	 * @throws IOException if the Word content cannot be read
	 */
	public Document getExternalResource(String url, InputStream in) throws IOException {
		Document resource = new Document();
		resource.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
		resource.add(new Field("uid", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
		resource = getContent(resource, in);
		return resource;
	}

	/**
	 * Adds the <code>contents</code> and <code>summary</code> fields for the Word
	 * content read from the given stream. The stream is always closed.
	 *
	 * @param resource the Lucene document to add the fields to
	 * @param in stream delivering the Word content
	 * @return the same Lucene document that was passed in
	 * @throws IOException if the Word content cannot be read
	 */
	private Document getContent(Document resource, InputStream in) throws IOException {
		String contents = extractText(in);
		resource.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED));
		resource.add(new Field("summary", buildSummary(contents), Field.Store.YES, Field.Index.NO));
		return resource;
	}

	/** Loads the binary content of the given CMS document through the DAO and wraps it in a stream. */
	private InputStream openContent(DocumentHbm document) {
		return new ByteArrayInputStream(documentHbmDao.getDocumentContent(document.getDocumentId()));
	}

	/**
	 * Extracts the plain text from the Word content and closes the stream afterwards.
	 * The stream is closed in a finally block so that it is not leaked when POI
	 * rejects the content (e.g. a corrupt or non-Word payload).
	 */
	private static String extractText(InputStream in) throws IOException {
		try {
			return new WordExtractor(in).getText();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					// The text (or the real failure) is already determined at this point;
					// do not let a close problem replace it.
					log.warn("Could not close Word content stream", e);
				}
			}
		}
	}

	/**
	 * Builds the summary from the first {@link #SUMMARY_MAX_LENGTH} characters of the text.
	 * The shortened text is passed through <code>XercesHelper.html2nodeUTF8</code>; if that
	 * conversion fails the unconverted text is used, if it yields <code>null</code> the
	 * summary is empty.
	 */
	private static String buildSummary(String contents) {
		String summary = contents.substring(0, Math.min(contents.length(), SUMMARY_MAX_LENGTH));
		if (summary.length() > 0) {
			try {
				summary = XercesHelper.html2nodeUTF8(summary);
			} catch (Exception e) {
				// Best effort: keep the raw text as summary, but leave a trace of the failure.
				if (log.isDebugEnabled()) {
					log.debug("Could not convert summary, using raw text instead", e);
				}
			}
		}
		return nullToEmpty(summary);
	}

	/**
	 * Adds the metadata fields shared by {@link #getDocument(DocumentHbm)} and
	 * {@link #getResource(DocumentHbm)}: documentId, uid, documentName, title,
	 * unitId, unitName, mimeType and timeStamp (in this order).
	 */
	private static void addMetadataFields(Document target, DocumentHbm document) {
		String documentId = document.getDocumentId().toString();
		// Lucene's Field constructor rejects null values, so optional strings default to "".
		String docName = nullToEmpty(document.getDocumentName());
		target.add(new Field("documentId", documentId, Field.Store.YES, Field.Index.NOT_ANALYZED));
		target.add(new Field("uid", documentId, Field.Store.YES, Field.Index.NOT_ANALYZED));
		target.add(new Field("documentName", docName, Field.Store.YES, Field.Index.ANALYZED));
		target.add(new Field("title", docName, Field.Store.YES, Field.Index.ANALYZED));
		target.add(new Field("unitId", document.getUnit().getUnitId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
		target.add(new Field("unitName", nullToEmpty(document.getUnit().getName()), Field.Store.YES, Field.Index.ANALYZED));
		target.add(new Field("mimeType", nullToEmpty(document.getMimeType()), Field.Store.YES, Field.Index.NOT_ANALYZED));
		target.add(new Field("timeStamp", document.getTimeStamp().toString(), Field.Store.YES, Field.Index.NO));
	}

	/** Returns the given string, or the empty string if it is <code>null</code>. */
	private static String nullToEmpty(String value) {
		return value == null ? "" : value;
	}

}