/* * (C) Copyright 2006-2008 Nuxeo SAS (http://nuxeo.com/) and contributors. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Lesser General Public License * (LGPL) version 2.1 which accompanies this distribution, and is available at * http://www.gnu.org/licenses/lgpl.html * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * Contributors: * bstefanescu * * $Id$ */ package org.nuxeo.ecm.core.convert.plugins.text.extractors; import java.io.File; import java.io.IOException; import java.io.InputStream; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a> */ public class Xml2TextHandler extends DefaultHandler { protected static final SAXParserFactory factory = SAXParserFactory.newInstance(); static { factory.setValidating(false); factory.setNamespaceAware(false); } protected SAXParser parser; protected StringBuffer buf; protected boolean trim = false; public Xml2TextHandler() throws SAXException, ParserConfigurationException { parser = factory.newSAXParser(); XMLReader reader = parser.getXMLReader(); reader.setFeature("http://xml.org/sax/features/validation", false); reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); } public SAXParser getParser() { return parser; } public String parse(File file) throws SAXException, IOException { parser.parse(file, this); String text = buf.toString(); buf = null; return text; } public String parse(InputStream in) throws SAXException, IOException { parser.parse(in, this); String text = buf.toString(); buf = null; return text; } public String parse(InputSource is) throws SAXException, IOException { parser.parse(is, this); String text = buf.toString(); buf = null; return text; } public String getText() { return buf.toString(); } @Override public void startDocument() throws SAXException { trim = false; buf = new StringBuffer(); } @Override public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { trim = true; } @Override public void endElement(String uri, String localName, String name) throws SAXException { trim = true; } @Override public void characters(char[] ch, int start, int length) throws SAXException { //buf.append(ch, start, length); if (true) return; if (trim) { int i = start; int end = start + length; while (i < end && Character.isWhitespace(ch[i])) { i++; } buf.append(" ").append(ch, i, length - i + start); trim = false; //System.out.println("["+new String(ch, i, length - i + start)+"]"); } else { buf.append(ch, start, length); //System.out.println("{"+new String(ch, start, length)+"}"); } } }