package edu.stanford.nlp.util; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.ParserConfigurationException; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import edu.stanford.nlp.io.IOUtils; /** * Provides some utilities for dealing with XML files, both by properly * parsing them and by using the methods of a desperate Perl hacker. * * @author Teg Grenager */ public class XMLUtils { private XMLUtils() {} // only static methods /** * Returns a non-validating XML parser. The parser ignores both DTDs and XSDs. * * @return An XML parser in the form of a DocumentBuilder */ public static DocumentBuilder getXmlParser() { DocumentBuilder db = null; try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false); //Disable DTD loading and validation //See http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); db = dbf.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); } catch (ParserConfigurationException e) { System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName()); e.printStackTrace(); } catch(UnsupportedOperationException e) { System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName()); e.printStackTrace(); } return db; } /** * Returns a validating XML parser given an XSD (not DTD!). * * @param schemaFile * @return An XML parser in the form of a DocumentBuilder */ public static DocumentBuilder getValidatingXmlParser(File schemaFile) { DocumentBuilder db = null; try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = factory.newSchema(schemaFile); dbf.setSchema(schema); db = dbf.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); } catch (ParserConfigurationException e) { System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName()); e.printStackTrace(); } catch (SAXException e) { System.err.printf("%s: XML parsing exception while loading schema %s\n", XMLUtils.class.getName(),schemaFile.getPath()); e.printStackTrace(); } catch(UnsupportedOperationException e) { System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName()); e.printStackTrace(); } return db; } /** * Block-level HTML tags that are rendered with surrounding line breaks. */ public static final Set<String> breakingTags = Generics.newHashSet(Arrays.asList(new String[] {"blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "ol", "p", "pre", "ul", "tr", "td"})); /** * @param r the reader to read the XML/HTML from * @param mapBack a List of Integers mapping the positions in the result buffer * to positions in the original Reader, will be cleared on receipt * @return the String containing the resulting text */ public static String stripTags(Reader r, List<Integer> mapBack, boolean markLineBreaks) { if (mapBack != null) { mapBack.clear(); // just in case it has something in it! } StringBuilder result = new StringBuilder(); String text; String tag; int position = 0; try { do { text = XMLUtils.readUntilTag(r); // will do nothing if the next thing is a tag if (text.length() > 0) { // add offsets to the map back for (int i = 0; i < text.length(); i++) { result.append(text.charAt(i)); if (mapBack != null) { mapBack.add(Integer.valueOf(position + i)); } } position += text.length(); } // System.out.println(position + " got text: " + text); tag = XMLUtils.readTag(r); if (tag == null) { break; } if (markLineBreaks && XMLUtils.isBreaking(parseTag(tag))) { result.append("\n"); if (mapBack != null) { mapBack.add(Integer.valueOf(-position)); } } position += tag.length(); // System.out.println(position + " got tag: " + tag); } while (true); } catch (IOException e) { System.err.println("Error reading string"); e.printStackTrace(); } return result.toString(); } public static boolean isBreaking(String tag) { return breakingTags.contains(tag); } public static boolean isBreaking(XMLTag tag) { return breakingTags.contains(tag.name); } /** * Reads all text up to next XML tag and returns it as a String. * * @return the String of the text read, which may be empty. */ public static String readUntilTag(Reader r) throws IOException { if (!r.ready()) { return ""; } StringBuilder b = new StringBuilder(); int c = r.read(); while (c >= 0 && c != '<') { b.append((char) c); c = r.read(); } return b.toString(); } /** * @return the new XMLTag object, or null if couldn't be created */ public static XMLTag readAndParseTag(Reader r) throws IOException { String s = readTag(r); if (s == null) { return null; } XMLTag ret = null; try { ret = new XMLTag(s); } catch (Exception e) { System.err.println("Failed to handle |" + s + "|"); } return ret; } // Pattern is reentrant, going by the statement // "many matchers can share the same pattern" // on the Pattern javadoc. Therefore, this should be // safe as a static final variable. static final Pattern xmlEscapingPattern = Pattern.compile("\\&.+?;"); public static String unescapeStringForXML(String s) { StringBuilder result = new StringBuilder(); Matcher m = xmlEscapingPattern.matcher(s); int end = 0; while (m.find()) { int start = m.start(); result.append(s.substring(end, start)); end = m.end(); result.append(translate(s.substring(start, end))); } result.append(s.substring(end, s.length())); return result.toString(); } private static char translate(String s) { if (s.equals("&")) { return '&'; } else if (s.equals("<") || s.equals("≪")) { return '<'; } else if (s.equals(">") || s.equals("≫")) { return '>'; } else if (s.equals(""")) { return '\"'; } else if (s.equals("'")) { return '\''; } else if (s.equals("*") || s.equals("♯")) { return '-'; } else if (s.equals("=")) { return '='; } else if (s.equals(" ")) { return (char) 0xA0; } else if (s.equals("¡")) { return (char) 0xA1; } else if (s.equals("¢") || s.equals("&shilling;")) { return (char) 0xA2; } else if (s.equals("£")) { return (char) 0xA3; } else if (s.equals("¤")) { return (char) 0xA4; } else if (s.equals("¥")) { return (char) 0xA5; } else if (s.equals("¦")) { return (char) 0xA6; } else if (s.equals("§")) { return (char) 0xA7; } else if (s.equals("¨")) { return (char) 0xA8; } else if (s.equals("©")) { return (char) 0xA9; } else if (s.equals("ª")) { return (char) 0xAA; } else if (s.equals("« ")) { return (char) 0xAB; } else if (s.equals("¬")) { return (char) 0xAC; } else if (s.equals("­ ")) { return (char) 0xAD; } else if (s.equals("®")) { return (char) 0xAE; } else if (s.equals("¯")) { return (char) 0xAF; } else if (s.equals("°")) { return (char) 0xB0; } else if (s.equals("±")) { return (char) 0xB1; } else if (s.equals("²")) { return (char) 0xB2; } else if (s.equals("³")) { return (char) 0xB3; } else if (s.equals("´")) { return (char) 0xB4; } else if (s.equals("µ")) { return (char) 0xB5; } else if (s.equals("·")) { return (char) 0xB7; } else if (s.equals("¸")) { return (char) 0xB8; } else if (s.equals("¹")) { return (char) 0xB9; } else if (s.equals("º")) { return (char) 0xBA; } else if (s.equals("»")) { return (char) 0xBB; } else if (s.equals("¼ ")) { return (char) 0xBC; } else if (s.equals("½")) { return (char) 0xBD; } else if (s.equals("¾ ")) { return (char) 0xBE; } else if (s.equals("¿")) { return (char) 0xBF; } else if (s.equals("À")) { return (char) 0xC0; } else if (s.equals("Á")) { return (char) 0xC1; } else if (s.equals("Â")) { return (char) 0xC2; } else if (s.equals("Ã")) { return (char) 0xC3; } else if (s.equals("Ä")) { return (char) 0xC4; } else if (s.equals("Å")) { return (char) 0xC5; } else if (s.equals("Æ")) { return (char) 0xC6; } else if (s.equals("Ç")) { return (char) 0xC7; } else if (s.equals("È")) { return (char) 0xC8; } else if (s.equals("É")) { return (char) 0xC9; } else if (s.equals("Ê")) { return (char) 0xCA; } else if (s.equals("Ë")) { return (char) 0xCB; } else if (s.equals("Ì")) { return (char) 0xCC; } else if (s.equals("Í")) { return (char) 0xCD; } else if (s.equals("Î")) { return (char) 0xCE; } else if (s.equals("Ï")) { return (char) 0xCF; } else if (s.equals("Ð")) { return (char) 0xD0; } else if (s.equals("Ñ")) { return (char) 0xD1; } else if (s.equals("Ò")) { return (char) 0xD2; } else if (s.equals("Ó")) { return (char) 0xD3; } else if (s.equals("Ô")) { return (char) 0xD4; } else if (s.equals("Õ")) { return (char) 0xD5; } else if (s.equals("Ö")) { return (char) 0xD6; } else if (s.equals("×")) { return (char) 0xD7; } else if (s.equals("Ø")) { return (char) 0xD8; } else if (s.equals("Ù")) { return (char) 0xD9; } else if (s.equals("Ú")) { return (char) 0xDA; } else if (s.equals("Û")) { return (char) 0xDB; } else if (s.equals("Ü")) { return (char) 0xDC; } else if (s.equals("Ý")) { return (char) 0xDD; } else if (s.equals("Þ")) { return (char) 0xDE; } else if (s.equals("ß")) { return (char) 0xDF; } else if (s.equals("à")) { return (char) 0xE0; } else if (s.equals("á")) { return (char) 0xE1; } else if (s.equals("â")) { return (char) 0xE2; } else if (s.equals("ã")) { return (char) 0xE3; } else if (s.equals("ä")) { return (char) 0xE4; } else if (s.equals("å")) { return (char) 0xE5; } else if (s.equals("æ")) { return (char) 0xE6; } else if (s.equals("ç")) { return (char) 0xE7; } else if (s.equals("è")) { return (char) 0xE8; } else if (s.equals("é")) { return (char) 0xE9; } else if (s.equals("ê")) { return (char) 0xEA; } else if (s.equals("ë ")) { return (char) 0xEB; } else if (s.equals("ì")) { return (char) 0xEC; } else if (s.equals("í")) { return (char) 0xED; } else if (s.equals("î")) { return (char) 0xEE; } else if (s.equals("ï")) { return 0xEF; } else if (s.equals("ð")) { return (char) 0xF0; } else if (s.equals("ñ")) { return (char) 0xF1; } else if (s.equals("ò")) { return (char) 0xF2; } else if (s.equals("ó")) { return (char) 0xF3; } else if (s.equals("ô")) { return (char) 0xF4; } else if (s.equals("õ")) { return (char) 0xF5; } else if (s.equals("ö")) { return (char) 0xF6; } else if (s.equals("÷")) { return (char) 0xF7; } else if (s.equals("ø")) { return (char) 0xF8; } else if (s.equals("ù")) { return (char) 0xF9; } else if (s.equals("ú")) { return (char) 0xFA; } else if (s.equals("û")) { return (char) 0xFB; } else if (s.equals("ü")) { return (char) 0xFC; } else if (s.equals("ý")) { return (char) 0xFD; } else if (s.equals("þ")) { return (char) 0xFE; } else if (s.equals("ÿ")) { return (char) 0xFF; } else if (s.equals("Œ")) { return (char) 0x152; } else if (s.equals("œ")) { return (char) 0x153; } else if (s.equals("Š")) { return (char) 0x160; } else if (s.equals("š")) { return (char) 0x161; } else if (s.equals("Ÿ")) { return (char) 0x178; } else if (s.equals("ˆ")) { return (char) 0x2C6; } else if (s.equals("˜")) { return (char) 0x2DC; } else if (s.equals("‎")) { return (char) 0x200E; } else if (s.equals("‏")) { return (char) 0x200F; } else if (s.equals("–")) { return (char) 0x2013; } else if (s.equals("—")) { return (char) 0x2014; } else if (s.equals("‘")) { return (char) 0x2018; } else if (s.equals("’")) { return (char) 0x2019; } else if (s.equals("‚")) { return (char) 0x201A; } else if (s.equals("“") || s.equals("&bquo;") || s.equals("&bq;")) { return (char) 0x201C; } else if (s.equals("”") || s.equals("&equo;")) { return (char) 0X201D; } else if (s.equals("„")) { return (char) 0x201E; } else if (s.equals("∼")) { return (char) 0x223C; } else if (s.equals("√")) { return (char) 0x221A; } else if (s.equals("≤")) { return (char) 0x2264; } else if (s.equals("≥")) { return (char) 0x2265; } else if (s.equals("←")) { return (char) 0x2190; } else if (s.equals("↓")) { return (char) 0x2193; } else if (s.equals("→")) { return (char) 0x2192; } else if (s.equals("…")) { return (char) 0x2026; } else if (s.equals("′")) { return (char) 0x2032; } else if (s.equals("″") || s.equals("&ins;")) { return (char) 0x2033; } else if (s.equals("™")) { return (char) 0x2122; } else if (s.equals("Α") || s.equals("&Agr;")) { return (char) 0x391; } else if (s.equals("Β") || s.equals("&Bgr;")) { return (char) 0x392; } else if (s.equals("Γ") || s.equals("&Ggr;")) { return (char) 0x393; } else if (s.equals("Δ") || s.equals("&Dgr;")) { return (char) 0x394; } else if (s.equals("Ε") || s.equals("&Egr;")) { return (char) 0x395; } else if (s.equals("Ζ") || s.equals("&Zgr;")) { return (char) 0x396; } else if (s.equals("Η")) { return (char) 0x397; } else if (s.equals("Θ") || s.equals("&THgr;")) { return (char) 0x398; } else if (s.equals("Ι") || s.equals("&Igr;")) { return (char) 0x399; } else if (s.equals("Κ") || s.equals("&Kgr;")) { return (char) 0x39A; } else if (s.equals("Λ") || s.equals("&Lgr;")) { return (char) 0x39B; } else if (s.equals("Μ") || s.equals("&Mgr;")) { return (char) 0x39C; } else if (s.equals("Ν") || s.equals("&Ngr;")) { return (char) 0x39D; } else if (s.equals("Ξ") || s.equals("&Xgr;")) { return (char) 0x39E; } else if (s.equals("Ο") || s.equals("&Ogr;")) { return (char) 0x39F; } else if (s.equals("Π") || s.equals("&Pgr;")) { return (char) 0x3A0; } else if (s.equals("Ρ") || s.equals("&Rgr;")) { return (char) 0x3A1; } else if (s.equals("Σ") || s.equals("&Sgr;")) { return (char) 0x3A3; } else if (s.equals("Τ") || s.equals("&Tgr;")) { return (char) 0x3A4; } else if (s.equals("Υ") || s.equals("&Ugr;")) { return (char) 0x3A5; } else if (s.equals("Φ") || s.equals("&PHgr;")) { return (char) 0x3A6; } else if (s.equals("Χ") || s.equals("&KHgr;")) { return (char) 0x3A7; } else if (s.equals("Ψ") || s.equals("&PSgr;")) { return (char) 0x3A8; } else if (s.equals("Ω") || s.equals("&OHgr;")) { return (char) 0x3A9; } else if (s.equals("α") || s.equals("&agr;")) { return (char) 0x3B1; } else if (s.equals("β") || s.equals("&bgr;")) { return (char) 0x3B2; } else if (s.equals("γ") || s.equals("&ggr;")) { return (char) 0x3B3; } else if (s.equals("δ") || s.equals("&dgr;")) { return (char) 0x3B4; } else if (s.equals("ε") || s.equals("&egr;")) { return (char) 0x3B5; } else if (s.equals("ζ") || s.equals("&zgr;")) { return (char) 0x3B6; } else if (s.equals("η") || s.equals("&eegr;")) { return (char) 0x3B7; } else if (s.equals("θ") || s.equals("&thgr;")) { return (char) 0x3B8; } else if (s.equals("ι") || s.equals("&igr;")) { return (char) 0x3B9; } else if (s.equals("κ") || s.equals("&kgr;")) { return (char) 0x3BA; } else if (s.equals("λ") || s.equals("&lgr;")) { return (char) 0x3BB; } else if (s.equals("μ") || s.equals("&mgr;")) { return (char) 0x3BC; } else if (s.equals("ν") || s.equals("&ngr;")) { return (char) 0x3BD; } else if (s.equals("ξ") || s.equals("&xgr;")) { return (char) 0x3BE; } else if (s.equals("ο") || s.equals("&ogr;")) { return (char) 0x3BF; } else if (s.equals("π") || s.equals("&pgr;")) { return (char) 0x3C0; } else if (s.equals("ρ") || s.equals("&rgr;")) { return (char) 0x3C1; } else if (s.equals("σ") || s.equals("&sgr;")) { return (char) 0x3C3; } else if (s.equals("τ") || s.equals("&tgr;")) { return (char) 0x3C4; } else if (s.equals("υ") || s.equals("&ugr;")) { return (char) 0x3C5; } else if (s.equals("φ") || s.equals("&phgr;")) { return (char) 0x3C6; } else if (s.equals("χ") || s.equals("&khgr;")) { return (char) 0x3C7; } else if (s.equals("ψ") || s.equals("&psgr;")) { return (char) 0x3C8; } else if (s.equals("ω") || s.equals("&ohgr;")) { return (char) 0x3C9; } else if (s.equals("•")) { return (char) 0x2022; } else if (s.equals("%")) { return '%'; } else if (s.equals("+")) { return '+'; } else if (s.equals("‐")) { return '-'; } else if (s.equals("ă") || s.equals("ā") || s.equals("≊") || s.equals("ą") || s.equals("å")) { return 'a'; } else if (s.equals("Ā")) { return 'A'; } else if (s.equals("ć") || s.equals("č") || s.equals("ĉ")) { return 'c'; } else if (s.equals("Č")) { return 'C'; } else if (s.equals("ď")) { return 'd'; } else if (s.equals("ě") || s.equals("ē") || s.equals("ę")) { return 'e'; } else if (s.equals("Ē") || s.equals("Ě")) { return 'E'; } else if (s.equals("ĺ")) { return 'l'; } else if (s.equals("Ĺ")) { return 'L'; } else if (s.equals("ń") || s.equals("ň") || s.equals("ņ")) { return 'n'; } else if (s.equals("ř") || s.equals("ŕ")) { return 'r'; } else if (s.equals("Ř")) { return 'R'; } else if (s.equals("ō")) { return 'o'; } else if (s.equals("ī")) { return 'i'; } else if (s.equals("ś") || s.equals("ş") || s.equals("ŝ")) { return 's'; } else if (s.equals("&Sacute") || s.equals("Ş")) { return 'S'; } else if (s.equals("ť") || s.equals("ţ")) { return 't'; } else if (s.equals("ū") || s.equals("ů")) { return 'u'; } else if (s.equals("ŵ")) { return 'w'; } else if (s.equals("Ŷ")) { return 'Y'; } else if (s.equals("ŷ")) { return 'y'; } else if (s.equals("ž") || s.equals("ź")) { return 'z'; } else if (s.equals("Ž")) { return 'Z'; } else if (s.equals("♥")) { return (char) 0x2665; } else if (s.equals("∞")) { return (char) 0x221E; } else if (s.equals("$")) { return '$'; } else if (s.equals("⊂") || s.equals("{")) { return (char) 0x2282; } else if (s.equals("⊃") || s.equals("}")) { return (char) 0x2283; } else if (s.equals("[")) { return '['; } else if (s.equals("]")) { return ']'; } else { return ' '; } } /** Returns a String in which all the XML special characters have been * escaped. The resulting String is valid to print in an XML file as an * attribute or element value in all circumstances. (Note that it may * escape characters that didn't need to be escaped.) * * @param in The String to escape * @return The escaped String */ public static String escapeXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '<') { sb.append("<"); } else if (c == '>') { sb.append(">"); } else if (c == '"') { sb.append("""); } else if (c == '\'') { sb.append("'"); } else { sb.append(c); } } return sb.toString(); } /** Returns a String in which some the XML special characters have been * escaped: just the ones that need escaping in an element content. * * @param in The String to escape * @return The escaped String */ public static String escapeElementXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '<') { sb.append("<"); } else if (c == '>') { sb.append(">"); } else { sb.append(c); } } return sb.toString(); } /** Returns a String in which some XML special characters have been * escaped. This just escapes attribute value ones, assuming that * you're going to quote with double quotes. * That is, only " and & are escaped. * * @param in The String to escape * @return The escaped String */ public static String escapeAttributeXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '"') { sb.append("""); } else { sb.append(c); } } return sb.toString(); } public static String escapeTextAroundXMLTags(String s) { StringBuilder result = new StringBuilder(); Reader r = new StringReader(s); try { do { String text = readUntilTag(r); // System.out.println("got text: " + text); result.append(escapeXML(text)); XMLTag tag = readAndParseTag(r); // System.out.println("got tag: " + tag); if (tag == null) { break; } result.append(tag.toString()); } while (true); } catch (IOException e) { System.err.println("Error reading string"); e.printStackTrace(); } return result.toString(); } /** * return either the first space or the first nbsp */ public static int findSpace(String haystack, int begin) { int space = haystack.indexOf(' ', begin); int nbsp = haystack.indexOf('\u00A0', begin); if (space == -1 && nbsp == -1) { return -1; } else if (space >= 0 && nbsp >= 0) { return Math.min(space, nbsp); } else { // eg one is -1, and the other is >= 0 return Math.max(space, nbsp); } } public static class XMLTag { public String text; public String name; public Map<String,String> attributes; public boolean isEndTag; public boolean isSingleTag; /** * Assumes that String contains an XML tag. * * @param tag String to turn into an XMLTag object */ public XMLTag(String tag) { if (tag == null || tag.length() == 0) { throw new NullPointerException("Attempted to parse empty/null tag"); } if (tag.charAt(0) != '<') { throw new IllegalArgumentException("Tag did not start with <"); } if (tag.charAt(tag.length() - 1) != '>') { throw new IllegalArgumentException("Tag did not end with >"); } text = tag; int begin = 1; if (tag.charAt(1) == '/') { begin = 2; isEndTag = true; } else { isEndTag = false; } int end = tag.length() - 1; if (tag.charAt(tag.length() - 2) == '/') { end = tag.length() - 2; isSingleTag = true; } else { isSingleTag = false; } tag = tag.substring(begin, end); attributes = Generics.newHashMap(); begin = 0; end = findSpace(tag, 0); if (end < 0) { name = tag; } else { name = tag.substring(begin, end); do { begin = end + 1; while (begin < tag.length() && tag.charAt(begin) < 0x21) { begin++; // get rid of leading whitespace } if (begin == tag.length()) { break; } end = tag.indexOf('=', begin); if (end < 0) { String att = tag.substring(begin); attributes.put(att, ""); break; } String att = tag.substring(begin, end).trim(); begin = end + 1; String value = null; if (tag.length() > begin) { while (begin < tag.length() && tag.charAt(begin) < 0x21) { begin++; } if (begin < tag.length() && tag.charAt(begin) == '\"') { // get quoted expression begin++; end = tag.indexOf('\"', begin); if (end < 0) { break; // this is a problem } value = tag.substring(begin, end); end++; } else { // get unquoted expression end = findSpace(tag, begin); if (end < 0) { end = tag.length(); } System.out.println(begin + " " + end); value = tag.substring(begin, end); } } attributes.put(att, value); } while (end < tag.length() - 3); } } public String toString() { return text; } } // end static class XMLTag /** * Reads all text of the XML tag and returns it as a String. * Assumes that a '<' character has already been read. * * @param r The reader to read from * @return The String representing the tag, or null if one couldn't be read * (i.e., EOF). The returned item is a complete tag including angle * brackets, such as <code><TXT></code> */ public static String readTag(Reader r) throws IOException { if ( ! r.ready()) { return null; } StringBuilder b = new StringBuilder("<"); int c = r.read(); while (c >= 0) { b.append((char) c); if (c == '>') { break; } c = r.read(); } if (b.length() == 1) { return null; } return b.toString(); } public static XMLTag parseTag(String tagString) { if (tagString == null || tagString.length() == 0) { return null; } if (tagString.charAt(0) != '<' || tagString.charAt(tagString.length() - 1) != '>') { return null; } return new XMLTag(tagString); } public static Document readDocumentFromFile(String filename) throws Exception { InputSource in = new InputSource(new FileReader(filename)); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(false); DocumentBuilder db = factory.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); return db.parse(in); } private static class SAXErrorHandler implements ErrorHandler { public static String makeBetterErrorString(String msg, SAXParseException ex) { StringBuilder sb = new StringBuilder(msg); sb.append(": "); String str = ex.getMessage(); if (str.lastIndexOf(".") == str.length() - 1) { str = str.substring(0, str.length() - 1); } sb.append(str); sb.append(" at document line ").append(ex.getLineNumber()); sb.append(", column ").append(ex.getColumnNumber()); if (ex.getSystemId() != null) { sb.append(" in entity from systemID ").append(ex.getSystemId()); } else if (ex.getPublicId() != null) { sb.append(" in entity from publicID ").append(ex.getPublicId()); } sb.append("."); return sb.toString(); } public void warning(SAXParseException exception) { System.err.println(makeBetterErrorString("Warning", exception)); } public void error(SAXParseException exception) { System.err.println(makeBetterErrorString("Error", exception)); } public void fatalError(SAXParseException ex) throws SAXParseException { throw new SAXParseException(makeBetterErrorString("Fatal Error", ex), ex.getPublicId(), ex.getSystemId(), ex.getLineNumber(), ex.getColumnNumber()); // throw new RuntimeException(makeBetterErrorString("Fatal Error", ex)); } } // end class SAXErrorHandler public static Document readDocumentFromString(String s) throws Exception { InputSource in = new InputSource(new StringReader(s)); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(false); return factory.newDocumentBuilder().parse(in); } /** Tests a few methods. * If the first arg is -readDoc then this method tests * readDocumentFromFile. * Otherwise, it tests readTag/readUntilTag and slurpFile. */ public static void main(String[] args) throws Exception { if (args[0].equals("-readDoc")) { Document doc = readDocumentFromFile(args[1]); System.out.println(doc); } else { String s = IOUtils.slurpFile(args[0]); Reader r = new StringReader(s); String tag = readTag(r); while (tag.length() > 0) { readUntilTag(r); tag = readTag(r); if (tag.length() == 0) { break; } System.out.println("got tag=" + new XMLTag(tag)); } } } }