/* * Copyright 2004 Outerthought bvba and Schaubroeck nv * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.xpn.xwiki.plugin.lucene.textextraction; import java.io.ByteArrayInputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import org.xmlpull.mxp1.MXParser; import org.xmlpull.v1.XmlPullParser; import com.xpn.xwiki.plugin.lucene.textextraction.xmlutil.XmlEncodingDetector; /** * Extracts all text from an OpenOffice document. */ public class OpenOfficeTextExtractor implements MimetypeTextExtractor { private static final String TEXTNAMESPACE = "http://openoffice.org/2000/text"; public String getText(byte[] data) throws Exception { /* * the byte array we receive here is in fact a ZIP containing the content.xml, * styles.xml,meta.xml and META-INF/manifest.xml files. We are only interested in the * content.xml because that's the file containing the actual content (duh) */ ByteArrayInputStream bis = new ByteArrayInputStream(data); ZipInputStream zis = new ZipInputStream(bis); ZipEntry ze = null; String zipEntryName = null; StringBuffer text = new StringBuffer(); while ((ze = zis.getNextEntry()) != null && !(zipEntryName = ze.getName()).equals("content.xml")) { } if (zipEntryName != null && zipEntryName.equals("content.xml")) { /* * we found the correct zip entry. This means the "read pointer" of the zipinputstream * points correctly to the beginning of this zip entry and we can pass it to the xml * parser like this (will return -1 as soon as the end of the zip entry is reached) */ /* * We are using this XmlPullParser because it was impossible to work with a sax parser. * The sax parser always wanted to have access to the openoffice dtd. Even tried to * write our own entityresolver to work around this problem but didnt work out. In order * not to pin ourselves down to a specific sax implementor (where we eg. would be able * to specify that we explicitly don't want any check at all against a dtd) we choose * not to use sax at all and use a very lightweight type of parsing for this specific * goal. */ XmlPullParser parser = new MXParser(); parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true); parser.setInput(zis, XmlEncodingDetector.detectEncoding(data)); boolean inText = false; int eventType = parser.getEventType(); while (eventType != XmlPullParser.END_DOCUMENT) { eventType = parser.next(); if (eventType == XmlPullParser.START_TAG) { if (parser.getName().equals("p") && parser.getNamespace().equals(TEXTNAMESPACE)) { text.append(' '); inText = true; } } else if (eventType == XmlPullParser.END_TAG) { if (parser.getName().equals("p") && parser.getNamespace().equals(TEXTNAMESPACE)) { inText = false; } } else if (eventType == XmlPullParser.TEXT) { if (inText) { String gotText = parser.getText(); text.append(gotText); } } } } else { throw new Exception("Invalid OpenOffice document format (content.xml not found)"); } return text.toString(); } }