/**
* OLAT - Online Learning and Training<br>
* http://www.olat.org
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br>
* University of Zurich, Switzerland.
* <hr>
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* This file has been modified by the OpenOLAT community. Changes are licensed
* under the Apache 2.0 license as the original file.
*/
package org.olat.search.service.document.file;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.Iterator;
import org.apache.lucene.document.Document;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.olat.core.gui.util.CSSHelper;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.search.service.SearchResourceContext;
/**
 * Lucene document mapper for binary Word files stored in an OLE2 (POIFS)
 * container. The text is extracted with Apache POI; old Word formats that
 * make POI raise an {@link OldWordFileFormatException} are read with the
 * {@link Word6Extractor} instead.
 *
 * @author Christian Guretzki
 */
public class WordDocument extends FileDocument {
	private static final long serialVersionUID = 1827194935338994490L;
	private static final OLog log = Tracing.createLoggerFor(WordDocument.class);

	/** File type identifier set on every document created by this mapper. */
	public final static String FILE_TYPE = "type.file.word";

	/** Name of the OLE2 stream entry that triggers the text extraction. */
	private static final String WORD_DOCUMENT_ENTRY = "WordDocument";

	public WordDocument() {
		//
	}

	/**
	 * Builds the Lucene document for the given Word file.
	 *
	 * @param leafResourceContext the search resource context used to initialize the document
	 * @param leaf the file to index
	 * @return the Lucene document produced by {@link #getLuceneDocument()}
	 * @throws IOException declared by the initialization of the document
	 * @throws DocumentException declared by the initialization of the document
	 * @throws DocumentAccessException declared by the initialization of the document
	 */
	public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf)
			throws IOException, DocumentException, DocumentAccessException {
		WordDocument wordDocument = new WordDocument();
		wordDocument.init(leafResourceContext, leaf);
		wordDocument.setFileType(FILE_TYPE);
		wordDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName()));
		if (log.isDebug()) {
			log.debug(wordDocument.toString());
		}
		return wordDocument.getLuceneDocument();
	}

	/**
	 * Opens the file as a POIFS file system, looks in its root directory for
	 * the stream entry named "WordDocument" and collects the text of the
	 * document into a size-limited writer.
	 *
	 * @param leaf the file to read
	 * @return the collected text wrapped in a {@link FileContent}; the text is
	 *         empty if no "WordDocument" stream entry is found or if the
	 *         extraction itself failed (such failures are only logged)
	 * @throws IOException declared for compatibility with the overridden method;
	 *         failures raised while reading are reported as {@link DocumentException}
	 * @throws DocumentException if the file cannot be opened or parsed as a
	 *         POIFS file system
	 */
	@Override
	protected FileContent readContent(VFSLeaf leaf) throws IOException,
			DocumentException {
		LimitedContentWriter sb = new LimitedContentWriter((int)leaf.getSize(), FileDocumentFactory.getMaxFileSize());
		try (InputStream bis = new BufferedInputStream(leaf.getInputStream())) {
			POIFSFileSystem filesystem = new POIFSFileSystem(bis);
			Iterator<?> entries = filesystem.getRoot().getEntries();
			while (entries.hasNext()) {
				Entry entry = (Entry) entries.next();
				// Only stream entries are of interest; directory entries are skipped.
				if (entry instanceof DocumentEntry && WORD_DOCUMENT_ENTRY.equals(entry.getName())) {
					collectWordDocument(leaf, filesystem, sb);
					// Entry names are unique within a directory, nothing more to find.
					break;
				}
			}
			return new FileContent(sb.toString());
		} catch (Exception e) {
			// Log the exception itself: the DocumentException thrown below only
			// carries the message, the stack trace would otherwise be lost.
			log.warn("could not read in word document: " + leaf
					+ " please check, that this is not an docx/rtf/html file!", e);
			throw new DocumentException(e.getMessage());
		}
	}

	/**
	 * Extracts the text with the {@link WordExtractor} and appends it to the
	 * writer. If POI reports an old Word format, the extraction is retried
	 * with {@link #collectOldWordDocument(VFSLeaf, Writer)}. Any other failure
	 * is logged and otherwise ignored, the document is then indexed without
	 * this text.
	 *
	 * @param leaf the file being read, used for the fallback and for log messages
	 * @param filesystem the already opened POIFS file system of the file
	 * @param sb the writer which receives the extracted text
	 * @throws IOException if the fallback for old Word formats throws it
	 */
	private void collectWordDocument(VFSLeaf leaf, POIFSFileSystem filesystem, Writer sb) throws IOException {
		try (WordExtractor extractor = new WordExtractor(filesystem)) {
			addTextIfAny(sb, extractor.getTextFromPieces());
		} catch (OldWordFileFormatException ex) {
			collectOldWordDocument(leaf, sb);
		} catch (Exception e) {
			log.error("Cannot read word document: " + leaf, e);
		}
	}

	/**
	 * Extracts the text of an old Word document with the {@link Word6Extractor}
	 * and appends it to the writer. The file is opened a second time with its
	 * own input stream. Failures are logged and otherwise ignored.
	 *
	 * @param leaf the file to read
	 * @param sb the writer which receives the extracted text
	 * @throws IOException declared for symmetry with the other collectors;
	 *         exceptions raised while reading are caught and logged here
	 */
	private void collectOldWordDocument(VFSLeaf leaf, Writer sb) throws IOException {
		try (InputStream bis = new BufferedInputStream(leaf.getInputStream());
				Word6Extractor docExtractor = new Word6Extractor(new HWPFOldDocument(new POIFSFileSystem(bis)))) {
			addTextIfAny(sb, docExtractor.getText());
		} catch (Exception e) {
			log.error("Cannot read old word document: " + leaf, e);
		}
	}

	/**
	 * Appends the text followed by a single space to the writer; {@code null}
	 * and empty texts are skipped.
	 *
	 * @param sb the writer which receives the text
	 * @param text the text to append, may be {@code null}
	 * @throws IOException if the writer fails
	 */
	private void addTextIfAny(Writer sb, String text) throws IOException {
		if (text != null && text.length() > 0) {
			sb.append(text).append(' ');
		}
	}
}