/* © 2010 Stephan Reichholf <stephan at reichholf dot net> * * Licensed under the Create-Commons Attribution-Noncommercial-Share Alike 3.0 Unported * http://creativecommons.org/licenses/by-nc-sa/3.0/ */ package net.reichholf.dreamdroid.parsers; import android.util.Log; import net.reichholf.dreamdroid.dataProviders.interfaces.DataParser; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; /** * @author sreichholf * */ public class GenericSaxParser implements DataParser { private static String LOG_TAG = GenericSaxParser.class.getSimpleName(); private DefaultHandler mHandler; private boolean mError; private String mErrorText; static Pattern sControlPatternAggressive = Pattern.compile("\\p{C}"); /** * */ public GenericSaxParser() { mError = false; } /** * @param h */ public GenericSaxParser(DefaultHandler h) { mHandler = h; mError = false; } /** * @param h */ public void setHandler(DefaultHandler h) { mHandler = h; } /** * @return */ public DefaultHandler getHandler() { return mHandler; } protected String stripNonValidXMLCharacters(String in, boolean aggressive) { if(aggressive) return sControlPatternAggressive.matcher(in).replaceAll("").replace(" ", " "); else return stripControlCharacters(in).replace("\u008A", "\n").replace(" ", " "); } /* * this is based on https://github.com/GreyCat/java-string-benchmark/blob/master/src/ru/greycat/algorithms/strip/RatchetFreak2EdStaub1GreyCat1.java * and is about a zillion lightyears faster than replaceAll... (defeats noticable lag between load finish and parse finish) */ public String stripControlCharacters(String s) { int length = s.length(); char[] oldChars = new char[length +1]; s.getChars(0, length, oldChars, 0); oldChars[length] = '\0'; // avoiding explicit bound check in while int newLen = 0; // find first non-printable, // if there are none it ends on the null char I appended while (true) { ++newLen; char ch = oldChars[newLen]; if(! (ch > ' ' || Character.isWhitespace(ch)) ) break; } for (int j = newLen; j < length; j++) { char ch = oldChars[j]; if (ch > ' ' || Character.isWhitespace(ch)) { oldChars[newLen] = ch; // the while avoids repeated overwriting here when newLen==j newLen++; } } return new String(oldChars, 0, newLen); } /* * (non-Javadoc) * * @see * net.reichholf.dreamdroid.dataProviders.interfaces.DataParser#parse(java * .lang.String) */ protected boolean parse(String input, boolean isRetry) { try { input = stripNonValidXMLCharacters(input, isRetry); mError = false; mErrorText = null; // InputSource is = new InputSource(input); ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes()); InputSource is = new InputSource(); is.setByteStream(bais); SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); SAXParser sp = spf.newSAXParser(); /* Get the XMLReader of the SAXParser we created. */ XMLReader xr = sp.getXMLReader(); /* Create a new ContentHandler and apply it to the XML-Reader */ xr.setContentHandler(mHandler); xr.parse(is); return true; } catch (ParserConfigurationException | SAXException | IOException e) { // TODO Auto-generated catch block Log.e(LOG_TAG, e.toString()); if(isRetry) { mError = true; mErrorText = e.toString(); } else { Log.w(LOG_TAG, "Retrying with aggressive character filtering!"); return parse(input, true); } } return false; } @Override public boolean parse(String input) { return parse(input, false); } /** * @return */ public boolean hasError(){ return mError; } /** * @return */ public String getErrorText(){ return mErrorText; } }