/*
* (c) Copyright 2010-2011 AgileBirds
*
* This file is part of OpenFlexo.
*
* OpenFlexo is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenFlexo is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenFlexo. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.openflexo.docxparser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.openflexo.docxparser.dto.ParsedDocx;
import org.openflexo.docxparser.dto.ParsedHtml;
import org.openflexo.docxparser.dto.api.IParsedDocx;
import org.openflexo.docxparser.dto.api.IParsedFlexoContent;
import org.openflexo.docxparser.dto.api.IParsedFlexoDescription;
import org.openflexo.docxparser.dto.api.IParsedFlexoName;
import org.openflexo.docxparser.dto.api.IParsedFlexoTitle;
import org.openflexo.docxparser.flexotag.FlexoContentTag;
import org.openflexo.docxparser.flexotag.FlexoDescriptionTag;
import org.openflexo.docxparser.flexotag.FlexoEPITag;
import org.openflexo.docxparser.flexotag.FlexoNameTag;
import org.openflexo.docxparser.flexotag.FlexoTitleTag;
public class DocxFileParser {
protected static final Logger logger = Logger.getLogger(DocxFileParser.class.getPackage().toString());
private static final List<String> TAGS = Arrays.asList(FlexoDescriptionTag.FLEXODESCRIPTIONTAG, FlexoNameTag.FLEXONAMETAG,
FlexoTitleTag.FLEXOTITLETAG, FlexoContentTag.FLEXOCONTENTTAG, FlexoEPITag.EPI_TAG);
private static final String XPATH;
static {
StringBuilder sb = new StringBuilder();
for (String tag : TAGS) {
if (sb.length() == 0) {
sb.append("//w:sdt/w:sdtPr[not(w:showingPlcHdr)]/w:tag[");
} else {
sb.append(" or ");
}
sb.append("starts-with(@w:val, '");
sb.append(tag);
sb.append("')");
}
sb.append(']');
XPATH = sb.toString();
}
private Set<String> availableCssClasses;
private String resourcesDirectory;
private OPCPackage filePackage;
private PackagePart documentPart = null;
private Document documentXml = null;
public DocxFileParser(byte[] documentBytes, Set<String> availableCssClasses, String resourcesDirectory) throws InvalidFormatException {
ByteArrayInputStream in = new ByteArrayInputStream(documentBytes);
try {
initialize(in, availableCssClasses, resourcesDirectory);
} finally {
try {
in.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public DocxFileParser(InputStream in, Set<String> availableCssClasses, String resourcesDirectory) throws InvalidFormatException {
initialize(in, availableCssClasses, resourcesDirectory);
}
private void initialize(InputStream in, Set<String> availableCssClasses, String resourcesDirectory) throws InvalidFormatException {
if (availableCssClasses == null) {
this.availableCssClasses = new HashSet<String>();
} else {
this.availableCssClasses = availableCssClasses;
}
this.resourcesDirectory = resourcesDirectory;
try {
filePackage = OPCPackage.open(in);
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
try {
in.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public IParsedDocx getParsedDocx() {
ParsedDocx parsedDocx = new ParsedDocx();
List<?> resultList = getDocumentXml().selectNodes(XPATH);
for (Iterator<?> iterator = resultList.iterator(); iterator.hasNext();) {
Element element = (Element) iterator.next();
String tagValue = element.attributeValue(DocxQName.getQName(OpenXmlTag.w_val));
Element sdtElement = element.getParent().getParent(); // On w:sdt
Element sdtContentElement = sdtElement.element(DocxQName.getQName(OpenXmlTag.w_sdtContent));
try {
if (tagValue.startsWith(FlexoDescriptionTag.FLEXODESCRIPTIONTAG)) {
FlexoDescriptionTag descTag = new FlexoDescriptionTag(tagValue);
ParsedHtml parsedHtml = OpenXml2Html.getHtml(sdtContentElement, getDocumentPart(), availableCssClasses,
resourcesDirectory);
IParsedFlexoDescription parsedFlexoDescription = parsedDocx.getOrCreateParsedDescription(descTag.getFlexoId(),
descTag.getUserId());
parsedFlexoDescription.addHtmlDescription(descTag.getTarget(), parsedHtml);
} else if (tagValue.startsWith(FlexoNameTag.FLEXONAMETAG)) {
String text = extractTextContent(sdtContentElement);
FlexoNameTag nameTag = new FlexoNameTag(tagValue);
if (text.length() > 0) {
IParsedFlexoName parsedFlexoName = parsedDocx.getOrCreateParsedName(nameTag.getFlexoId(), nameTag.getUserId());
parsedFlexoName.setFlexoName(text);
}
} else if (tagValue.startsWith(FlexoTitleTag.FLEXOTITLETAG)) {
FlexoTitleTag titleTag = new FlexoTitleTag(tagValue);
String text = extractTextContent(sdtContentElement);
if (text.length() > 0) {
IParsedFlexoTitle parsedFlexoTitle = parsedDocx.getOrCreateParsedTitle(titleTag.getFlexoId(), titleTag.getUserId());
parsedFlexoTitle.setFlexoTitle(text);
}
} else if (tagValue.startsWith(FlexoEPITag.EPI_TAG)) {
FlexoEPITag epiTag = new FlexoEPITag(tagValue);
// TODO: Here extract the text under 3 forms:
// * one line string
// * multi-line string
// * styled text
// Reinjection shall then choose the appropriate extract according the info
// available in VP.
String text = extractTextContent(sdtContentElement);
if (text.length() > 0) {
parsedDocx.createParsedFlexoEPI(epiTag, text);
}
} else if (tagValue.startsWith(FlexoContentTag.FLEXOCONTENTTAG)) {
FlexoContentTag contentTag = new FlexoContentTag(tagValue);
ParsedHtml parsedHtml = OpenXml2Html.getHtml(sdtContentElement, getDocumentPart(), availableCssClasses,
resourcesDirectory);
IParsedFlexoContent parsedFlexoContent = parsedDocx.getOrCreateParsedContent(contentTag.getFlexoId(),
contentTag.getUserId());
parsedFlexoContent.setFlexoContent(parsedHtml);
} // else if (tagValue.startsWith(FlexoContentTag.FLEXOEPTAG))
} catch (FlexoDescriptionTag.FlexoTagFormatException e) {
logger.log(Level.WARNING, "Cannot parse tag from a building block which seems to be a Flexo Tag", e);
}
}
return parsedDocx;
}
public String extractTextContent(Element sdtContentElement) {
StringBuilder sb = new StringBuilder();
/*Iterator<?> iteratorWp = sdtContentElement.selectNodes("descendant::w:p").iterator();
while (iteratorWp.hasNext()) {
Element wpElement = (Element) iteratorWp.next();*/
Iterator<?> iteratorWt = sdtContentElement.selectNodes("descendant::w:t").iterator();
while (iteratorWt.hasNext()) {
Element textElement = (Element) iteratorWt.next();
sb.append(textElement.getText());
}
/*if (iteratorWp.hasNext()) {
sb.append(StringUtils.LINE_SEPARATOR);
}
}*/
return sb.toString().trim();
}
public void close() {
try {
filePackage.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public OPCPackage getFilePackage() {
return filePackage;
}
public PackagePart getDocumentPart() {
if (documentPart == null) {
PackageRelationship documentRelationship = getFilePackage().getRelationshipsByType(DocxXmlUtil.RELATIONSHIPTYPE_COREDOCUMENT)
.getRelationship(0);
documentPart = getFilePackage().getPart(documentRelationship);
}
return documentPart;
}
public Document getDocumentXml() {
if (documentXml == null) {
try {
documentXml = new SAXReader().read(getDocumentPart().getInputStream());
} catch (Exception e) {
throw new RuntimeException(e);
}
}
return documentXml;
}
}