/**
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at the
* <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Initial code contributed and copyrighted by<br>
* frentix GmbH, http://www.frentix.com
* <p>
*/
package org.olat.search.service.document.file;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.document.Document;
import org.olat.core.gui.util.CSSHelper;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.core.util.io.ShieldInputStream;
import org.olat.core.util.vfs.JavaIOItem;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.search.service.SearchResourceContext;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * Description:<br>
 * Extracts the text of a Word OOXML document (.docx) for the search index.
 * The file is opened as a ZIP archive; the main document part and every
 * header / footer part are parsed with a SAX parser and their character data
 * is collected into a single text content.
 *
 * <P>
 * Initial Date: 5 nov. 2012<br>
 * @author srosse, stephane.rosse@frentix.com, http://www.frentix.com
 */
public class WordOOXMLDocument extends FileDocument {
	private static final long serialVersionUID = 2322994231200065526L;
	private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class);

	public static final String WORD_FILE_TYPE = "type.file.word";

	/** Prefix of the ZIP entries holding header parts. */
	private static final String HEADER = "word/header";
	/** Prefix of the ZIP entries holding footer parts. */
	private static final String FOOTER = "word/footer";
	/** Name (suffix) of the ZIP entry holding the main document part. */
	private static final String DOCUMENT = "word/document.xml";

	/**
	 * Builds the Lucene document for a Word file.
	 *
	 * @param leafResourceContext The search context of the leaf
	 * @param leaf The Word file
	 * @return The Lucene document produced by {@code getLuceneDocument()}
	 * @throws IOException
	 * @throws DocumentException
	 * @throws DocumentAccessException
	 */
	public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf)
	throws IOException, DocumentException, DocumentAccessException {
		WordOOXMLDocument officeDocument = new WordOOXMLDocument();
		officeDocument.init(leafResourceContext, leaf);
		officeDocument.setFileType(WORD_FILE_TYPE);
		officeDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName()));
		if (log.isDebug()) {
			log.debug(officeDocument.toString());
		}
		return officeDocument.getLuceneDocument();
	}

	/**
	 * Reads the text of the main document part and of all header / footer
	 * parts of the archive. The parts are processed in the order defined by
	 * {@link WordDocumentComparator}; a part is only parsed while the writer
	 * still accepts content.
	 *
	 * @param leaf The Word file, it must be a {@link JavaIOItem}
	 * @return The collected text
	 * @throws DocumentException If the archive or one of its XML parts cannot
	 *         be read; the original exception is preserved as the cause
	 */
	@Override
	public FileContent readContent(VFSLeaf leaf) throws IOException, DocumentException {
		File file = ((JavaIOItem)leaf).getBasefile();
		LimitedContentWriter writer = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize());
		try(ZipFile wordFile = new ZipFile(file)) {
			List<String> contents = new ArrayList<>();
			for(Enumeration<? extends ZipEntry> entriesEnumeration=wordFile.entries(); entriesEnumeration.hasMoreElements(); ) {
				String name = entriesEnumeration.nextElement().getName();
				if(isTextPart(name)) {
					contents.add(name);
				}
			}

			if(contents.size() > 1) {
				Collections.sort(contents, new WordDocumentComparator());
			}

			for(String content:contents) {
				if(writer.accept()) {
					ZipEntry entry = wordFile.getEntry(content);
					// try-with-resources: the entry stream is released even if the parsing fails
					try(InputStream zip = wordFile.getInputStream(entry)) {
						parse(new ShieldInputStream(zip), new OfficeDocumentHandler(writer));
					}
				}
			}
		} catch (DocumentException e) {
			throw e;
		} catch (Exception e) {
			// keep the cause, the message alone loses the stack trace
			throw new DocumentException(e.getMessage(), e);
		}
		return new FileContent(writer.toString());
	}

	/**
	 * @param name The name of a ZIP entry
	 * @return true if the entry is the main document part or a header / footer part
	 */
	private static boolean isTextPart(String name) {
		if(name.endsWith(DOCUMENT)) {
			return true;
		}
		return (name.startsWith(HEADER) || name.startsWith(FOOTER)) && name.endsWith(".xml");
	}

	/**
	 * Parses an XML part with a SAX parser and forwards the events to the handler.
	 * The content comes from user supplied files, therefore validation and the
	 * resolution of external entities are switched off (best effort) before parsing.
	 *
	 * @param stream The XML part
	 * @param handler The handler which receives the content and resolves the entities
	 * @throws DocumentException If the parser cannot be created or the parsing fails
	 */
	private void parse(InputStream stream, DefaultHandler handler) throws DocumentException {
		try {
			XMLReader parser = XMLReaderFactory.createXMLReader();
			parser.setContentHandler(handler);
			parser.setEntityResolver(handler);
			deactivateFeature(parser, "http://xml.org/sax/features/validation", "validation");
			deactivateFeature(parser, "http://xml.org/sax/features/external-general-entities", "external general entities");
			deactivateFeature(parser, "http://xml.org/sax/features/external-parameter-entities", "external parameter entities");
			parser.parse(new InputSource(stream));
		} catch (Exception e) {
			throw new DocumentException("XML parser configuration error", e);
		}
	}

	/**
	 * Switches a parser feature off. A parser which refuses the feature is
	 * not a reason to abort the indexing: the failure is only logged.
	 *
	 * @param parser The parser to configure
	 * @param feature The URI of the feature
	 * @param label A readable name of the feature for the log message
	 */
	private static void deactivateFeature(XMLReader parser, String feature, String label) {
		try {
			parser.setFeature(feature, false);
		} catch(Exception e) {
			log.error("Cannot deactivate " + label, e);
		}
	}

	/**
	 * SAX handler which appends all character data to the content writer.
	 */
	private static class OfficeDocumentHandler extends DefaultHandler {

		private final LimitedContentWriter sb;

		public OfficeDocumentHandler(LimitedContentWriter sb) {
			this.sb = sb;
		}

		/**
		 * Appends the characters to the writer. A single space is inserted first
		 * if the writer is not empty and doesn't already end with a space, so
		 * that the text of adjacent elements is not glued together.
		 */
		@Override
		public void characters(char[] ch, int start, int length) {
			if(sb.length() > 0 && sb.charAt(sb.length() - 1) != ' ') {
				sb.append(' ');
			}
			sb.write(ch, start, length);
		}

		/**
		 * Resolves every external entity (external DTD included) to an empty
		 * source. The default implementation returns null, which lets the
		 * parser open the system identifier itself.
		 */
		@Override
		public InputSource resolveEntity(String publicId, String systemId) {
			return new InputSource(new StringReader(""));
		}
	}

	/**
	 * Orders the parts of a Word document: header parts first, then the main
	 * document part, then footer parts. Two headers (or two footers) are ordered
	 * by the negated result of {@code comparePosition}; if the parts are still
	 * equal, the reversed natural order of their names is used.
	 */
	public static class WordDocumentComparator extends AbstractOfficeDocumentComparator {
		@Override
		public int compare(String f1, String f2) {
			int c = 0;
			// the main part is recognized by its suffix on both sides, exactly as
			// it is collected, to keep compare(a, b) and compare(b, a) consistent
			if(f1.endsWith(DOCUMENT)) {
				if(f2.startsWith(HEADER)) {
					c = -1;
				} else if(f2.startsWith(FOOTER)) {
					c = 1;
				}
			} else if(f1.startsWith(HEADER)) {
				if(f2.endsWith(DOCUMENT) || f2.startsWith(FOOTER)) {
					c = 1;
				} else if(f2.startsWith(HEADER)) {
					c = comparePosition(f1, f2, HEADER);
				}
			} else if(f1.startsWith(FOOTER)) {
				if(f2.endsWith(DOCUMENT) || f2.startsWith(HEADER)) {
					c = -1;
				} else if(f2.startsWith(FOOTER)) {
					c = comparePosition(f1, f2, FOOTER);
				}
			}

			if(c == 0) {
				c = f1.compareTo(f2);
			}
			// the result is negated: a positive c sorts f1 before f2
			return -c;
		}
	}
}