package edu.stanford.nlp.util; import java.io.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.ParserConfigurationException; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.util.logging.Redwood; /** * Provides some utilities for dealing with XML files, both by properly * parsing them and by using the methods of a desperate Perl hacker. * * @author Teg Grenager * @author Grace Muzny */ public class XMLUtils { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(XMLUtils.class); private XMLUtils() {} // only static methods /** * Returns the text content of all nodes in the given file with the given tag. * * @return List of String text contents of tags. */ public static List<String> getTextContentFromTagsFromFile(File f, String tag) { List<String> sents = Generics.newArrayList(); try { sents = getTextContentFromTagsFromFileSAXException(f, tag); } catch (SAXException e) { log.info(e); } return sents; } /** * Returns the text content of all nodes in the given file with the given tag. * If the text contents contains embedded tags, strips the embedded tags out * of the returned text. e.g. <s>This is a <s>sentence</s> with embedded tags * </s> would return the list containing ["This is a sentence with embedded * tags", "sentence"]. * * @throws SAXException if tag doesn't exist in the file. * @return List of String text contents of tags. */ private static List<String> getTextContentFromTagsFromFileSAXException( File f, String tag) throws SAXException { List<String> sents = Generics.newArrayList(); try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(f); doc.getDocumentElement().normalize(); NodeList nodeList=doc.getElementsByTagName(tag); for (int i = 0; i < nodeList.getLength(); i++) { // Get element Element element = (Element)nodeList.item(i); String raw = element.getTextContent(); StringBuilder builtUp = new StringBuilder(); boolean inTag = false; for (int j = 0; j < raw.length(); j++) { if (raw.charAt(j) == '<') { inTag = true; } if (!inTag) { builtUp.append(raw.charAt(j)); } if (raw.charAt(j) == '>') { inTag = false; } } sents.add(builtUp.toString()); } } catch (IOException | ParserConfigurationException e) { log.info(e); } return sents; } /** * Returns the text content of all nodes in the given file with the given tag. * * @return List of String text contents of tags. */ public static List<Element> getTagElementsFromFile(File f, String tag) { List<Element> sents = Generics.newArrayList(); try { sents = getTagElementsFromFileSAXException(f, tag); } catch (SAXException e) { log.info(e); } return sents; } /** * Returns the text content of all nodes in the given file with the given tag. * If the text contents contains embedded tags, strips the embedded tags out * of the returned text. e.g. <s>This is a <s>sentence</s> with embedded tags * </s> would return the list containing ["This is a sentence with embedded * tags", "sentence"]. * * @throws SAXException if tag doesn't exist in the file. * @return List of String text contents of tags. */ private static List<Element> getTagElementsFromFileSAXException( File f, String tag) throws SAXException { List<Element> sents = Generics.newArrayList(); try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(f); doc.getDocumentElement().normalize(); NodeList nodeList=doc.getElementsByTagName(tag); for (int i = 0; i < nodeList.getLength(); i++) { // Get element Element element = (Element)nodeList.item(i); sents.add(element); } } catch (IOException | ParserConfigurationException e) { log.info(e); } return sents; } /** * Returns the elements in the given file with the given tag associated with * the text content of the two previous siblings and two next siblings. * * @return List of {@code Triple<String, Element, String>} Targeted elements surrounded * by the text content of the two previous siblings and two next siblings. */ public static List<Triple<String, Element, String>> getTagElementTriplesFromFile(File f, String tag) { List<Triple<String, Element, String>> sents = Generics.newArrayList(); try { sents = getTagElementTriplesFromFileSAXException(f, tag); } catch (SAXException e) { System.err.println(e); } return sents; } /** * Returns the elements in the given file with the given tag associated with * the text content of the previous and next siblings up to max numIncludedSiblings. * * @return List of Triple<String, Element, String> Targeted elements surrounded * by the text content of the two previous siblings and two next siblings. */ public static List<Triple<String, Element, String>> getTagElementTriplesFromFileNumBounded(File f, String tag, int num) { List<Triple<String, Element, String>> sents = Generics.newArrayList(); try { sents = getTagElementTriplesFromFileNumBoundedSAXException(f, tag, num); } catch (SAXException e) { System.err.println(e); } return sents; } /** * Returns the elements in the given file with the given tag associated with * the text content of the two previous siblings and two next siblings. * * @throws SAXException if tag doesn't exist in the file. * @return List of {@code Triple<String, Element, String>} Targeted elements surrounded * by the text content of the two previous siblings and two next siblings. */ public static List<Triple<String, Element, String>> getTagElementTriplesFromFileSAXException( File f, String tag) throws SAXException { return getTagElementTriplesFromFileNumBoundedSAXException(f, tag, 2); } /** * Returns the elements in the given file with the given tag associated with * the text content of the previous and next siblings up to max numIncludedSiblings. * * @throws SAXException if tag doesn't exist in the file. * @return List of Triple<String, Element, String> Targeted elements surrounded * by the text content of the two previous siblings and two next siblings. */ public static List<Triple<String, Element, String>> getTagElementTriplesFromFileNumBoundedSAXException( File f, String tag, int numIncludedSiblings) throws SAXException { List<Triple<String, Element, String>> sents = Generics.newArrayList(); try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(f); doc.getDocumentElement().normalize(); NodeList nodeList=doc.getElementsByTagName(tag); for (int i = 0; i < nodeList.getLength(); i++) { // Get element Node prevNode = nodeList.item(i).getPreviousSibling(); String prev = ""; int count = 0; while (prevNode != null && count <= numIncludedSiblings) { prev = prevNode.getTextContent() + prev; prevNode = prevNode.getPreviousSibling(); count++; } Node nextNode = nodeList.item(i).getNextSibling(); String next = ""; count = 0; while (nextNode != null && count <= numIncludedSiblings) { next = next + nextNode.getTextContent(); nextNode = nextNode.getNextSibling(); count++; } Element element = (Element)nodeList.item(i); Triple<String, Element, String> t = new Triple<>(prev, element, next); sents.add(t); } } catch (IOException | ParserConfigurationException e) { System.err.println(e); } return sents; } /** * Returns a non-validating XML parser. The parser ignores both DTDs and XSDs. * * @return An XML parser in the form of a DocumentBuilder */ public static DocumentBuilder getXmlParser() { DocumentBuilder db = null; try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setValidating(false); //Disable DTD loading and validation //See http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); db = dbf.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); } catch (ParserConfigurationException e) { System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName()); e.printStackTrace(); } catch(UnsupportedOperationException e) { System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName()); e.printStackTrace(); } return db; } /** * Returns a validating XML parser given an XSD (not DTD!). * * @param schemaFile File wit hXML schema * @return An XML parser in the form of a DocumentBuilder */ public static DocumentBuilder getValidatingXmlParser(File schemaFile) { DocumentBuilder db = null; try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = factory.newSchema(schemaFile); dbf.setSchema(schema); db = dbf.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); } catch (ParserConfigurationException e) { System.err.printf("%s: Unable to create XML parser\n", XMLUtils.class.getName()); e.printStackTrace(); } catch (SAXException e) { System.err.printf("%s: XML parsing exception while loading schema %s\n", XMLUtils.class.getName(),schemaFile.getPath()); e.printStackTrace(); } catch(UnsupportedOperationException e) { System.err.printf("%s: API error while setting up XML parser. Check your JAXP version\n", XMLUtils.class.getName()); e.printStackTrace(); } return db; } /** * Block-level HTML tags that are rendered with surrounding line breaks. */ private static final Set<String> breakingTags = Generics.newHashSet(Arrays.asList(new String[] {"blockquote", "br", "div", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "ol", "p", "pre", "ul", "tr", "td"})); /** * @param r the reader to read the XML/HTML from * @param mapBack a List of Integers mapping the positions in the result buffer * to positions in the original Reader, will be cleared on receipt * @return the String containing the resulting text */ public static String stripTags(Reader r, List<Integer> mapBack, boolean markLineBreaks) { if (mapBack != null) { mapBack.clear(); // just in case it has something in it! } StringBuilder result = new StringBuilder(); try { int position = 0; do { String text = XMLUtils.readUntilTag(r); if (text.length() > 0) { // add offsets to the map back for (int i = 0; i < text.length(); i++) { result.append(text.charAt(i)); if (mapBack != null) { mapBack.add(Integer.valueOf(position + i)); } } position += text.length(); } // System.out.println(position + " got text: " + text); String tag = XMLUtils.readTag(r); if (tag == null) { break; } if (markLineBreaks && XMLUtils.isBreaking(parseTag(tag))) { result.append("\n"); if (mapBack != null) { mapBack.add(Integer.valueOf(-position)); } } position += tag.length(); // System.out.println(position + " got tag: " + tag); } while (true); } catch (IOException e) { log.info("Error reading string"); e.printStackTrace(); } return result.toString(); } public static boolean isBreaking(String tag) { return breakingTags.contains(tag); } public static boolean isBreaking(XMLTag tag) { return breakingTags.contains(tag.name); } /** * Reads all text up to next XML tag and returns it as a String. * * @return the String of the text read, which may be empty. */ public static String readUntilTag(Reader r) throws IOException { if (!r.ready()) { return ""; } StringBuilder b = new StringBuilder(); int c = r.read(); while (c >= 0 && c != '<') { b.append((char) c); c = r.read(); } return b.toString(); } /** * @return the new XMLTag object, or null if couldn't be created */ public static XMLTag readAndParseTag(Reader r) throws IOException { String s = readTag(r); if (s == null) { return null; } XMLTag ret = null; try { ret = new XMLTag(s); } catch (Exception e) { log.info("Failed to handle |" + s + "|"); } return ret; } // Pattern is reentrant, going by the statement // "many matchers can share the same pattern" // on the Pattern javadoc. Therefore, this should be // safe as a static final variable. private static final Pattern xmlEscapingPattern = Pattern.compile("\\&.+?;"); public static String unescapeStringForXML(String s) { StringBuilder result = new StringBuilder(); Matcher m = xmlEscapingPattern.matcher(s); int end = 0; while (m.find()) { int start = m.start(); result.append(s.substring(end, start)); end = m.end(); result.append(translate(s.substring(start, end))); } result.append(s.substring(end, s.length())); return result.toString(); } private static char translate(String s) { switch (s) { case "&": return '&'; case "<": case "≪": return '<'; case ">": case "≫": return '>'; case """: return '\"'; case "'": return '\''; case "*": case "♯": return '-'; case "=": return '='; case " ": return (char) 0xA0; case "¡": return (char) 0xA1; case "¢": case "&shilling;": return (char) 0xA2; case "£": return (char) 0xA3; case "¤": return (char) 0xA4; case "¥": return (char) 0xA5; case "¦": return (char) 0xA6; case "§": return (char) 0xA7; case "¨": return (char) 0xA8; case "©": return (char) 0xA9; case "ª": return (char) 0xAA; case "« ": return (char) 0xAB; case "¬": return (char) 0xAC; case "­ ": return (char) 0xAD; case "®": return (char) 0xAE; case "¯": return (char) 0xAF; case "°": return (char) 0xB0; case "±": return (char) 0xB1; case "²": return (char) 0xB2; case "³": return (char) 0xB3; case "´": return (char) 0xB4; case "µ": return (char) 0xB5; case "·": return (char) 0xB7; case "¸": return (char) 0xB8; case "¹": return (char) 0xB9; case "º": return (char) 0xBA; case "»": return (char) 0xBB; case "¼ ": return (char) 0xBC; case "½": return (char) 0xBD; case "¾ ": return (char) 0xBE; case "¿": return (char) 0xBF; case "À": return (char) 0xC0; case "Á": return (char) 0xC1; case "Â": return (char) 0xC2; case "Ã": return (char) 0xC3; case "Ä": return (char) 0xC4; case "Å": return (char) 0xC5; case "Æ": return (char) 0xC6; case "Ç": return (char) 0xC7; case "È": return (char) 0xC8; case "É": return (char) 0xC9; case "Ê": return (char) 0xCA; case "Ë": return (char) 0xCB; case "Ì": return (char) 0xCC; case "Í": return (char) 0xCD; case "Î": return (char) 0xCE; case "Ï": return (char) 0xCF; case "Ð": return (char) 0xD0; case "Ñ": return (char) 0xD1; case "Ò": return (char) 0xD2; case "Ó": return (char) 0xD3; case "Ô": return (char) 0xD4; case "Õ": return (char) 0xD5; case "Ö": return (char) 0xD6; case "×": return (char) 0xD7; case "Ø": return (char) 0xD8; case "Ù": return (char) 0xD9; case "Ú": return (char) 0xDA; case "Û": return (char) 0xDB; case "Ü": return (char) 0xDC; case "Ý": return (char) 0xDD; case "Þ": return (char) 0xDE; case "ß": return (char) 0xDF; case "à": return (char) 0xE0; case "á": return (char) 0xE1; case "â": return (char) 0xE2; case "ã": return (char) 0xE3; case "ä": return (char) 0xE4; case "å": return (char) 0xE5; case "æ": return (char) 0xE6; case "ç": return (char) 0xE7; case "è": return (char) 0xE8; case "é": return (char) 0xE9; case "ê": return (char) 0xEA; case "ë ": return (char) 0xEB; case "ì": return (char) 0xEC; case "í": return (char) 0xED; case "î": return (char) 0xEE; case "ï": return 0xEF; case "ð": return (char) 0xF0; case "ñ": return (char) 0xF1; case "ò": return (char) 0xF2; case "ó": return (char) 0xF3; case "ô": return (char) 0xF4; case "õ": return (char) 0xF5; case "ö": return (char) 0xF6; case "÷": return (char) 0xF7; case "ø": return (char) 0xF8; case "ù": return (char) 0xF9; case "ú": return (char) 0xFA; case "û": return (char) 0xFB; case "ü": return (char) 0xFC; case "ý": return (char) 0xFD; case "þ": return (char) 0xFE; case "ÿ": return (char) 0xFF; case "Œ": return (char) 0x152; case "œ": return (char) 0x153; case "Š": return (char) 0x160; case "š": return (char) 0x161; case "Ÿ": return (char) 0x178; case "ˆ": return (char) 0x2C6; case "˜": return (char) 0x2DC; case "‎": return (char) 0x200E; case "‏": return (char) 0x200F; case "–": return (char) 0x2013; case "—": return (char) 0x2014; case "‘": return (char) 0x2018; case "’": return (char) 0x2019; case "‚": return (char) 0x201A; case "“": case "&bquo;": case "&bq;": return (char) 0x201C; case "”": case "&equo;": return (char) 0X201D; case "„": return (char) 0x201E; case "∼": return (char) 0x223C; case "√": return (char) 0x221A; case "≤": return (char) 0x2264; case "≥": return (char) 0x2265; case "←": return (char) 0x2190; case "↓": return (char) 0x2193; case "→": return (char) 0x2192; case "…": return (char) 0x2026; case "′": return (char) 0x2032; case "″": case "&ins;": return (char) 0x2033; case "™": return (char) 0x2122; case "Α": case "&Agr;": return (char) 0x391; case "Β": case "&Bgr;": return (char) 0x392; case "Γ": case "&Ggr;": return (char) 0x393; case "Δ": case "&Dgr;": return (char) 0x394; case "Ε": case "&Egr;": return (char) 0x395; case "Ζ": case "&Zgr;": return (char) 0x396; case "Η": return (char) 0x397; case "Θ": case "&THgr;": return (char) 0x398; case "Ι": case "&Igr;": return (char) 0x399; case "Κ": case "&Kgr;": return (char) 0x39A; case "Λ": case "&Lgr;": return (char) 0x39B; case "Μ": case "&Mgr;": return (char) 0x39C; case "Ν": case "&Ngr;": return (char) 0x39D; case "Ξ": case "&Xgr;": return (char) 0x39E; case "Ο": case "&Ogr;": return (char) 0x39F; case "Π": case "&Pgr;": return (char) 0x3A0; case "Ρ": case "&Rgr;": return (char) 0x3A1; case "Σ": case "&Sgr;": return (char) 0x3A3; case "Τ": case "&Tgr;": return (char) 0x3A4; case "Υ": case "&Ugr;": return (char) 0x3A5; case "Φ": case "&PHgr;": return (char) 0x3A6; case "Χ": case "&KHgr;": return (char) 0x3A7; case "Ψ": case "&PSgr;": return (char) 0x3A8; case "Ω": case "&OHgr;": return (char) 0x3A9; case "α": case "&agr;": return (char) 0x3B1; case "β": case "&bgr;": return (char) 0x3B2; case "γ": case "&ggr;": return (char) 0x3B3; case "δ": case "&dgr;": return (char) 0x3B4; case "ε": case "&egr;": return (char) 0x3B5; case "ζ": case "&zgr;": return (char) 0x3B6; case "η": case "&eegr;": return (char) 0x3B7; case "θ": case "&thgr;": return (char) 0x3B8; case "ι": case "&igr;": return (char) 0x3B9; case "κ": case "&kgr;": return (char) 0x3BA; case "λ": case "&lgr;": return (char) 0x3BB; case "μ": case "&mgr;": return (char) 0x3BC; case "ν": case "&ngr;": return (char) 0x3BD; case "ξ": case "&xgr;": return (char) 0x3BE; case "ο": case "&ogr;": return (char) 0x3BF; case "π": case "&pgr;": return (char) 0x3C0; case "ρ": case "&rgr;": return (char) 0x3C1; case "σ": case "&sgr;": return (char) 0x3C3; case "τ": case "&tgr;": return (char) 0x3C4; case "υ": case "&ugr;": return (char) 0x3C5; case "φ": case "&phgr;": return (char) 0x3C6; case "χ": case "&khgr;": return (char) 0x3C7; case "ψ": case "&psgr;": return (char) 0x3C8; case "ω": case "&ohgr;": return (char) 0x3C9; case "•": return (char) 0x2022; case "%": return '%'; case "+": return '+'; case "‐": return '-'; case "ă": case "ā": case "≊": case "ą": return 'a'; case "Ā": return 'A'; case "ć": case "č": case "ĉ": return 'c'; case "Č": return 'C'; case "ď": return 'd'; case "ě": case "ē": case "ę": return 'e'; case "Ē": case "Ě": return 'E'; case "ĺ": return 'l'; case "Ĺ": return 'L'; case "ń": case "ň": case "ņ": return 'n'; case "ř": case "ŕ": return 'r'; case "Ř": return 'R'; case "ō": return 'o'; case "ī": return 'i'; case "ś": case "ş": case "ŝ": return 's'; case "&Sacute": case "Ş": return 'S'; case "ť": case "ţ": return 't'; case "ū": case "ů": return 'u'; case "ŵ": return 'w'; case "Ŷ": return 'Y'; case "ŷ": return 'y'; case "ž": case "ź": return 'z'; case "Ž": return 'Z'; case "♥": return (char) 0x2665; case "∞": return (char) 0x221E; case "$": return '$'; case "⊂": case "{": return (char) 0x2282; case "⊃": case "}": return (char) 0x2283; case "[": return '['; case "]": return ']'; default: return ' '; } } /** Returns a String in which all the XML special characters have been * escaped. The resulting String is valid to print in an XML file as an * attribute or element value in all circumstances. (Note that it may * escape characters that didn't need to be escaped.) * * @param in The String to escape * @return The escaped String */ public static String escapeXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '<') { sb.append("<"); } else if (c == '>') { sb.append(">"); } else if (c == '"') { sb.append("""); } else if (c == '\'') { sb.append("'"); } else { sb.append(c); } } return sb.toString(); } /** Returns a String in which some the XML special characters have been * escaped: just the ones that need escaping in an element content. * * @param in The String to escape * @return The escaped String */ public static String escapeElementXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '<') { sb.append("<"); } else if (c == '>') { sb.append(">"); } else { sb.append(c); } } return sb.toString(); } /** Returns a String in which some XML special characters have been * escaped. This just escapes attribute value ones, assuming that * you're going to quote with double quotes. * That is, only " and & are escaped. * * @param in The String to escape * @return The escaped String */ public static String escapeAttributeXML(String in) { int leng = in.length(); StringBuilder sb = new StringBuilder(leng); for (int i = 0; i < leng; i++) { char c = in.charAt(i); if (c == '&') { sb.append("&"); } else if (c == '"') { sb.append("""); } else { sb.append(c); } } return sb.toString(); } public static String escapeTextAroundXMLTags(String s) { StringBuilder result = new StringBuilder(); Reader r = new StringReader(s); try { do { String text = readUntilTag(r); // System.out.println("got text: " + text); result.append(escapeXML(text)); XMLTag tag = readAndParseTag(r); // System.out.println("got tag: " + tag); if (tag == null) { break; } result.append(tag.toString()); } while (true); } catch (IOException e) { log.info("Error reading string"); e.printStackTrace(); } return result.toString(); } /** * return either the first space or the first nbsp */ public static int findSpace(String haystack, int begin) { int space = haystack.indexOf(' ', begin); int nbsp = haystack.indexOf('\u00A0', begin); if (space == -1 && nbsp == -1) { return -1; } else if (space >= 0 && nbsp >= 0) { return Math.min(space, nbsp); } else { // eg one is -1, and the other is >= 0 return Math.max(space, nbsp); } } public static class XMLTag { public String text; public String name; public Map<String,String> attributes; public boolean isEndTag; public boolean isSingleTag; /** * Assumes that String contains an XML tag. * * @param tag String to turn into an XMLTag object */ public XMLTag(String tag) { if (tag == null || tag.length() == 0) { throw new NullPointerException("Attempted to parse empty/null tag"); } if (tag.charAt(0) != '<') { throw new IllegalArgumentException("Tag did not start with <"); } if (tag.charAt(tag.length() - 1) != '>') { throw new IllegalArgumentException("Tag did not end with >"); } text = tag; int begin = 1; if (tag.charAt(1) == '/') { begin = 2; isEndTag = true; } else { isEndTag = false; } int end = tag.length() - 1; if (tag.charAt(tag.length() - 2) == '/') { end = tag.length() - 2; isSingleTag = true; } else { isSingleTag = false; } tag = tag.substring(begin, end); attributes = Generics.newHashMap(); begin = 0; end = findSpace(tag, 0); if (end < 0) { name = tag; } else { name = tag.substring(begin, end); do { begin = end + 1; while (begin < tag.length() && tag.charAt(begin) < 0x21) { begin++; // get rid of leading whitespace } if (begin == tag.length()) { break; } end = tag.indexOf('=', begin); if (end < 0) { String att = tag.substring(begin); attributes.put(att, ""); break; } String att = tag.substring(begin, end).trim(); begin = end + 1; String value = null; if (tag.length() > begin) { while (begin < tag.length() && tag.charAt(begin) < 0x21) { begin++; } if (begin < tag.length() && tag.charAt(begin) == '\"') { // get quoted expression begin++; end = tag.indexOf('\"', begin); if (end < 0) { break; // this is a problem } value = tag.substring(begin, end); end++; } else { // get unquoted expression end = findSpace(tag, begin); if (end < 0) { end = tag.length(); } // System.out.println(begin + " " + end); value = tag.substring(begin, end); } } attributes.put(att, value); } while (end < tag.length() - 3); } } public String toString() { return text; } /** * Given a list of attributes, return the first one that is non-null */ public String getFirstNonNullAttributeFromList(List<String> attributesList) { for (String attribute : attributesList) { if (attributes.get(attribute) != null) { return attributes.get(attribute); } } return null; } } // end static class XMLTag /** * Reads all text of the XML tag and returns it as a String. * Assumes that a '<' character has already been read. * * @param r The reader to read from * @return The String representing the tag, or null if one couldn't be read * (i.e., EOF). The returned item is a complete tag including angle * brackets, such as <code><TXT></code> */ public static String readTag(Reader r) throws IOException { if ( ! r.ready()) { return null; } StringBuilder b = new StringBuilder("<"); int c = r.read(); while (c >= 0) { b.append((char) c); if (c == '>') { break; } c = r.read(); } if (b.length() == 1) { return null; } return b.toString(); } public static XMLTag parseTag(String tagString) { if (tagString == null || tagString.isEmpty()) { return null; } if (tagString.charAt(0) != '<' || tagString.charAt(tagString.length() - 1) != '>') { return null; } return new XMLTag(tagString); } public static Document readDocumentFromFile(String filename) throws Exception { InputSource in = new InputSource(new FileReader(filename)); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(false); DocumentBuilder db = factory.newDocumentBuilder(); db.setErrorHandler(new SAXErrorHandler()); return db.parse(in); } private static class SAXErrorHandler implements ErrorHandler { public static String makeBetterErrorString(String msg, SAXParseException ex) { StringBuilder sb = new StringBuilder(msg); sb.append(": "); String str = ex.getMessage(); if (str.lastIndexOf('.') == str.length() - 1) { str = str.substring(0, str.length() - 1); } sb.append(str); sb.append(" at document line ").append(ex.getLineNumber()); sb.append(", column ").append(ex.getColumnNumber()); if (ex.getSystemId() != null) { sb.append(" in entity from systemID ").append(ex.getSystemId()); } else if (ex.getPublicId() != null) { sb.append(" in entity from publicID ").append(ex.getPublicId()); } sb.append("."); return sb.toString(); } public void warning(SAXParseException exception) { log.info(makeBetterErrorString("Warning", exception)); } public void error(SAXParseException exception) { log.info(makeBetterErrorString("Error", exception)); } public void fatalError(SAXParseException ex) throws SAXParseException { throw new SAXParseException(makeBetterErrorString("Fatal Error", ex), ex.getPublicId(), ex.getSystemId(), ex.getLineNumber(), ex.getColumnNumber()); // throw new RuntimeException(makeBetterErrorString("Fatal Error", ex)); } } // end class SAXErrorHandler public static Document readDocumentFromString(String s) throws Exception { InputSource in = new InputSource(new StringReader(s)); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(false); return factory.newDocumentBuilder().parse(in); } /** Tests a few methods. * If the first arg is -readDoc then this method tests * readDocumentFromFile. * Otherwise, it tests readTag/readUntilTag and slurpFile. */ public static void main(String[] args) throws Exception { if (args[0].equals("-readDoc")) { Document doc = readDocumentFromFile(args[1]); System.out.println(doc); } else { String s = IOUtils.slurpFile(args[0]); Reader r = new StringReader(s); String tag = readTag(r); while (tag != null && tag.length() > 0) { readUntilTag(r); tag = readTag(r); if (tag == null || tag.isEmpty()) { break; } System.out.println("got tag=" + new XMLTag(tag)); } } } }