/**
* Copyright (c) 2009 Juwi MacMillan Group GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.juwimm.cms.search.res;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.springframework.beans.factory.annotation.Autowired;
import org.tizzit.util.XercesHelper;
import de.juwimm.cms.model.DocumentHbm;
import de.juwimm.cms.model.DocumentHbmDao;
/**
 * Builds Lucene {@link Document}s from Microsoft Word files, either from
 * documents stored in the CMS ({@link DocumentHbm}) or from an arbitrary
 * input stream identified by a URL.
 * <p>
 * Every produced document carries the extracted text in the
 * <code>contents</code> field and a short excerpt of it in the
 * <code>summary</code> field.
 *
 * @author <a href="mailto:carsten.schalm@juwimm.com">Carsten Schalm</a>
 * company Juwi|MacMillan Group Gmbh, Walsrode, Germany
 * @version $Id$
 */
public class WordDocumentLocator {
    private static final Logger log = Logger.getLogger(WordDocumentLocator.class);

    /** MIME type of the files handled by this locator. */
    public static final String MIME_TYPE = "application/msword";

    /** Maximum number of characters of extracted text kept in the <code>summary</code> field. */
    private static final int MAX_SUMMARY_LENGTH = 500;

    @Autowired
    private DocumentHbmDao documentHbmDao;

    /**
     * Creates a Lucene document for a Word file stored in the CMS.
     * <p>
     * Fields are added in this order: <code>contents</code>, the shared
     * metadata fields (see {@link #addMetadataFields(Document, DocumentHbm)})
     * and finally <code>summary</code>. Unlike
     * {@link #getResource(DocumentHbm)} no <code>siteId</code> field is added.
     *
     * @param document the CMS document whose binary content is loaded through the DAO
     * @return the populated Lucene document
     * @throws IOException if the Word content cannot be read
     */
    public Document getDocument(DocumentHbm document) throws IOException {
        Document doc = new Document();
        String contents = extractText(openContent(document));
        doc.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED));
        addMetadataFields(doc, document);
        doc.add(new Field("summary", buildSummary(contents), Field.Store.YES, Field.Index.NO));
        return doc;
    }

    /**
     * Creates a Lucene document for a Word file stored in the CMS, including
     * the id of the site the document's unit belongs to.
     * <p>
     * Fields are added in this order: <code>contents</code>,
     * <code>summary</code>, <code>siteId</code> and then the shared metadata
     * fields (see {@link #addMetadataFields(Document, DocumentHbm)}).
     *
     * @param document the CMS document whose binary content is loaded through the DAO
     * @return the populated Lucene document
     * @throws IOException if the Word content cannot be read
     */
    public Document getResource(DocumentHbm document) throws IOException {
        Document resource = new Document();
        getContent(resource, openContent(document));
        resource.add(new Field("siteId", document.getUnit().getSite().getSiteId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        addMetadataFields(resource, document);
        return resource;
    }

    /**
     * Creates a Lucene document for a Word file that is not stored in the CMS.
     * The URL is stored both as <code>url</code> and as <code>uid</code>.
     *
     * @param url the address identifying the resource
     * @param in stream delivering the Word file; it is closed by this method,
     *        whether or not extraction succeeds
     * @return the populated Lucene document
     * @throws IOException if the Word content cannot be read or the stream cannot be closed
     */
    public Document getExternalResource(String url, InputStream in) throws IOException {
        Document resource = new Document();
        resource.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
        resource.add(new Field("uid", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
        resource = getContent(resource, in);
        return resource;
    }

    /**
     * Extracts the text of the Word file and adds the <code>contents</code>
     * and <code>summary</code> fields to the given document.
     *
     * @param resource the Lucene document to extend
     * @param in stream delivering the Word file; always closed by this method
     * @return the same <code>resource</code> instance, for chaining
     * @throws IOException if the Word content cannot be read or the stream cannot be closed
     */
    private Document getContent(Document resource, InputStream in) throws IOException {
        String contents = extractText(in);
        resource.add(new Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED));
        resource.add(new Field("summary", buildSummary(contents), Field.Store.YES, Field.Index.NO));
        return resource;
    }

    /** Opens an in-memory stream over the binary content the DAO returns for the document's id. */
    private InputStream openContent(DocumentHbm document) {
        return new ByteArrayInputStream(documentHbmDao.getDocumentContent(document.getDocumentId()));
    }

    /**
     * Reads the plain text of a Word file and closes the stream afterwards.
     * The stream is closed in a <code>finally</code> block so that it is
     * released even when the extractor rejects the input.
     */
    private static String extractText(InputStream in) throws IOException {
        try {
            return new WordExtractor(in).getText();
        } finally {
            in.close();
        }
    }

    /**
     * Adds the fields shared by {@link #getDocument(DocumentHbm)} and
     * {@link #getResource(DocumentHbm)}: <code>documentId</code>,
     * <code>uid</code>, <code>documentName</code>, <code>title</code>,
     * <code>unitId</code>, <code>unitName</code>, <code>mimeType</code> and
     * <code>timeStamp</code>, in that order.
     */
    private static void addMetadataFields(Document target, DocumentHbm document) {
        String documentId = document.getDocumentId().toString();
        target.add(new Field("documentId", documentId, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The document id doubles as the unique id of the index entry.
        target.add(new Field("uid", documentId, Field.Store.YES, Field.Index.NOT_ANALYZED));

        String docName = document.getDocumentName();
        if (docName == null) {
            docName = "";
        }
        // The document name is stored under two field names.
        target.add(new Field("documentName", docName, Field.Store.YES, Field.Index.ANALYZED));
        target.add(new Field("title", docName, Field.Store.YES, Field.Index.ANALYZED));

        target.add(new Field("unitId", document.getUnit().getUnitId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        target.add(new Field("unitName", document.getUnit().getName(), Field.Store.YES, Field.Index.ANALYZED));
        target.add(new Field("mimeType", document.getMimeType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Stored only, not indexed.
        target.add(new Field("timeStamp", document.getTimeStamp().toString(), Field.Store.YES, Field.Index.NO));
    }

    /**
     * Produces the value of the <code>summary</code> field: the first
     * {@link #MAX_SUMMARY_LENGTH} characters of the text, passed through
     * <code>XercesHelper.html2nodeUTF8</code> when non-empty.
     * <p>
     * The conversion is best effort: if it fails, the failure is logged and
     * the unconverted excerpt is used. A <code>null</code> conversion result
     * is replaced by the empty string.
     * NOTE(review): html2nodeUTF8 presumably makes the excerpt safe for
     * embedding in markup - confirm against XercesHelper.
     */
    private static String buildSummary(String contents) {
        String summary = contents.substring(0, Math.min(contents.length(), MAX_SUMMARY_LENGTH));
        if (summary.length() > 0) {
            try {
                summary = XercesHelper.html2nodeUTF8(summary);
            } catch (Exception e) {
                // Deliberately not rethrown: a missing conversion must not abort indexing.
                log.warn("Could not convert summary with XercesHelper.html2nodeUTF8, using raw text", e);
            }
        }
        return summary == null ? "" : summary;
    }
}