package com.dteviot.epubviewer;
import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/*
* Extracts the Text that would be shown to a user from a XHTML document
*/
public class XhtmlToText extends DefaultHandler{
/*
* Nodes that need their contents to be followed by white space
*/
private static final String[] ADD_WHITE_SPACE_NODES =
{ "br", "p", "h1", "h2", "h3", "h4", "h5" };
/*
* chop text into strings of a couple of hundred words or so.
*/
private static final int MIN_CHARS_PER_STRING = 6 * 200;
private StringBuilder mBuilder;
private ArrayList<String> mText;
private boolean mInBody = false;
public XhtmlToText(ArrayList<String> text) {
mText = text;
mBuilder = new StringBuilder();
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
super.characters(ch, start, length);
// ignore text in head
if (mInBody) {
mBuilder.append(ch, start, length);
}
}
@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
super.endElement(uri, localName, name);
if (isWhiteSpaceNode(localName)) {
mBuilder.append(" ");
}
if (MIN_CHARS_PER_STRING < mBuilder.length()) {
flushAccumulator();
}
}
@Override
public void endDocument() throws SAXException {
super.endDocument();
// we're done, make sure any remaining text is moved
flushAccumulator();
}
@Override
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
super.startElement(uri, localName, name, attributes);
if (localName.equalsIgnoreCase("li")) {
mBuilder.append(" ");
} else if (localName.equalsIgnoreCase("body")) {
mInBody = true;
}
}
private void flushAccumulator() {
if (0 < mBuilder.length()) {
mText.add(mBuilder.toString());
mBuilder.setLength(0);
}
}
private boolean isWhiteSpaceNode(String nodeName) {
for (String s : ADD_WHITE_SPACE_NODES) {
if (s.equals(nodeName)) {
return true;
}
}
// if get here, not found
return false;
}
}