/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.oo;
import java.io.*;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.jaxen.*;
import org.jaxen.jdom.JDOMXPath;
import org.jdom.*;
import org.jdom.input.*;
/**
* Parser for OpenOffice and OpenDocument formats. This should handle
* the following formats: Text, Spreadsheet, Presentation, and
* corresponding templates and "master" documents.
*
* @author Andrzej Bialecki
*/
public class OOParser implements Parser {
public static final Log LOG = LogFactory.getLog(OOParser.class);
private Configuration conf;
public OOParser () {
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return conf;
}
public ParseResult getParse(Content content) {
String text = null;
String title = null;
Metadata metadata = new Metadata();
ArrayList outlinks = new ArrayList();
try {
byte[] raw = content.getContent();
String contentLength = content.getMetadata().get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
+" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf);
}
ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
ZipEntry ze = null;
while ((ze = zis.getNextEntry()) != null) {
if (ze.getName().equals("content.xml")) {
text = parseContent(ze, zis, outlinks);
} else if (ze.getName().equals("meta.xml")) {
parseMeta(ze, zis, metadata);
}
}
zis.close();
} catch (Exception e) { // run time exception
e.printStackTrace(LogUtil.getWarnStream(LOG));
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf);
}
title = metadata.get(Metadata.TITLE);
if (text == null)
text = "";
if (title == null)
title = "";
Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, metadata);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
// extract as much plain text as possible.
private String parseContent(ZipEntry ze, ZipInputStream zis, ArrayList outlinks) throws Exception {
StringBuffer res = new StringBuffer();
FilterInputStream fis = new FilterInputStream(zis) {
public void close() {};
};
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(fis);
Element root = doc.getRootElement();
// XXX this is expensive for very large documents. In those cases another
// XXX method (direct processing of SAX events, or XMLPull) should be used.
XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a");
path.addNamespace("text", root.getNamespace("text").getURI());
Namespace xlink = Namespace.getNamespace("xlink", "http://www.w3.org/1999/xlink");
List list = path.selectNodes(doc);
boolean lastp = true;
for (int i = 0; i < list.size(); i++) {
Element el = (Element)list.get(i);
String text = el.getText();
if (el.getName().equals("p")) {
// skip empty paragraphs
if (!text.equals("")) {
if (!lastp) res.append("\n");
res.append(text + "\n");
lastp = true;
}
} else if (el.getName().startsWith("tab")) {
res.append("\t");
lastp = false;
} else if (el.getName().equals("a")) {
List nl = el.getChildren();
String a = null;
for (int k = 0; k < nl.size(); k++) {
Element anchor = (Element)nl.get(k);
String nsName = anchor.getNamespacePrefix() + ":" + anchor.getName();
if (!nsName.equals("text:span")) continue;
a = anchor.getText();
break;
}
String u = el.getAttributeValue("href", xlink);
if (u == null) u = a; // often anchors are URLs
Outlink o = new Outlink(u, a);
outlinks.add(o);
if (a != null && !a.equals("")) {
if (!lastp) res.append(' ');
res.append(a);
lastp = false;
}
} else {
if (!text.equals("")) {
if (!lastp) res.append(' ');
res.append(text);
}
lastp = false;
}
}
return res.toString();
}
// extract metadata and convert them to Nutch format
private void parseMeta(ZipEntry ze, ZipInputStream zis, Metadata metadata) throws Exception {
FilterInputStream fis = new FilterInputStream(zis) {
public void close() {};
};
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(fis);
XPath path = new JDOMXPath("/office:document-meta/office:meta/*");
Element root = doc.getRootElement();
path.addNamespace("office", root.getNamespace("office").getURI());
List list = path.selectNodes(doc);
for (int i = 0; i < list.size(); i++) {
Element n = (Element)list.get(i);
String text = n.getText();
if (text.trim().equals("")) continue;
String name = n.getName();
if (name.equals("title"))
metadata.add(Metadata.TITLE, text);
else if (name.equals("language"))
metadata.add(Metadata.LANGUAGE, text);
else if (name.equals("creation-date"))
metadata.add(Metadata.DATE, text);
else if (name.equals("print-date"))
metadata.add(Metadata.LAST_PRINTED, text);
else if (name.equals("generator"))
metadata.add(Metadata.APPLICATION_NAME, text);
else if (name.equals("creator"))
metadata.add(Metadata.CREATOR, text);
}
}
public static void main(String[] args) throws Exception {
OOParser oo = new OOParser();
Configuration conf = NutchConfiguration.create();
oo.setConf(conf);
FileInputStream fis = new FileInputStream(args[0]);
byte[] bytes = new byte[fis.available()];
fis.read(bytes);
fis.close();
Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
Parse p = oo.getParse(c).get(c.getUrl());
System.out.println(p.getData());
System.out.println("Text: '" + p.getText() + "'");
/*
// create the test output file
OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8");
osw.write(p.getText());
osw.flush();
osw.close();
*/
}
}