/* * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Contributors: * bstefanescu * * $Id$ */ package org.nuxeo.ecm.core.convert.plugins.text.extractors; import java.io.File; import java.io.IOException; import java.io.InputStream; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a> */ public class Xml2TextHandler extends DefaultHandler { protected static final SAXParserFactory factory = SAXParserFactory.newInstance(); static { factory.setValidating(false); factory.setNamespaceAware(false); } protected SAXParser parser; protected StringBuffer buf; protected boolean trim = false; public Xml2TextHandler() throws SAXException, ParserConfigurationException { parser = factory.newSAXParser(); XMLReader reader = parser.getXMLReader(); reader.setFeature("http://xml.org/sax/features/validation", false); reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); } public SAXParser getParser() { return parser; } public String parse(File file) throws SAXException, IOException { parser.parse(file, this); String text = buf.toString(); buf = null; return text; } public String parse(InputStream in) throws SAXException, IOException { parser.parse(in, this); String text = buf.toString(); buf = null; return text; } public String parse(InputSource is) throws SAXException, IOException { parser.parse(is, this); String text = buf.toString(); buf = null; return text; } public String getText() { return buf.toString(); } @Override public void startDocument() throws SAXException { trim = false; buf = new StringBuffer(); } @Override public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { trim = true; } @Override public void endElement(String uri, String localName, String name) throws SAXException { trim = true; } @Override public void characters(char[] ch, int start, int length) throws SAXException { // buf.append(ch, start, length); if (true) return; if (trim) { int i = start; int end = start + length; while (i < end && Character.isWhitespace(ch[i])) { i++; } buf.append(" ").append(ch, i, length - i + start); trim = false; // System.out.println("["+new String(ch, i, length - i + start)+"]"); } else { buf.append(ch, start, length); // System.out.println("{"+new String(ch, start, length)+"}"); } } }