package folioxml.core; import folioxml.folio.FolioToken; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * This is a base class for all XML-style tokens (SLX, XML). Override inXmlTokenMode() to determine whether attributes are allowed on closing tags. * * @param <T> * @author nathanael */ public class TokenBase<T extends TokenBase> { protected TokenBase() { } public TokenBase(String text) throws InvalidMarkupException { this.markup = text; this.reparse(); } public TokenBase(TokenType type, String text) throws InvalidMarkupException { this.type = type; this.markup = text; if (this.type == TokenType.Tag) parseTag(); } /** * Copies markup, sourceToken, type, tagName, tagType, and attributes * * @param target * @param deepCopyAttrs */ public void copyTo(TokenBase target, boolean deepCopyAttrs) { target.markup = this.markup; target.sourceToken = this.sourceToken; target.type = this.type; target.tagName = this.tagName; target.tagType = this.tagType; if (this.attrs != null) { if (deepCopyAttrs) target.attrs = (TreeMap<String, String>) this.attrs.clone(); else target.attrs = this.attrs; } } public boolean inXmlTokenMode() { return true; } /** * The Folio token this originated from (assuming this token was translated from a FolioToken). Not always present. */ public FolioToken sourceToken = null; /** * The original text the token was created with. Use updateMarkup() to update this to match the attributes and tag name. */ public String markup = null; private String tagName = null; private TreeMap<String, String> attrs = null; public enum TokenType { None, Text, Entity, Comment, Tag } /** * The type of token - Text, Entity, Comment, or Tag */ public TokenType type = TokenType.None; public enum TagType { None, Opening, Closing, SelfClosing } /** * The tag type - can be opening, closing, or selfClosing. * Closing is not a valid value for an XmlNode. */ public TagType tagType = TagType.None; public void setTagName(String tagName, boolean updateMarkup) { this.tagName = tagName; if (updateMarkup) updateMarkup(); } /** * Changes the tag name of the token, and updates the .markup property accordingly. * * @param tagName */ public T setTagName(String tagName) { setTagName(tagName, true); return (T) this; } /** * this.markup = this.toString(); * Rebuilds markup variable from memory structure. */ public void updateMarkup() { this.markup = toTokenString(); } /** * True if this token is a comment. False if it is text, entity, or tag * * @return */ public boolean isComment() { return this.type == TokenType.Comment; } /** * True if a text or entity token * * @return */ public boolean isTextOrEntity() { return (this.type == TokenType.Text || this.type == TokenType.Entity); } /** * Returns null if this in not a TokenType.Comment. Throws an InvalidMarkupException if the markup is not a token. * Returns the text between <!-- and --> * * @return * @throws InvalidMarkupException */ public String getCommentContents() throws InvalidMarkupException { if (!isComment()) return null; if (!markup.startsWith("<!--") || !markup.endsWith("-->")) throw new InvalidMarkupException("Failed to parse comment", this); return markup.substring(4, markup.length() - 3); } /** * Returns true if the token is not whitespace, and is either text or an entity. * * @return */ public boolean isContent() { return (isTextOrEntity() && !TokenUtils.isWhitespace(this.markup)); } /** * Returns true if the token is an SLX tag, not a comment, entity, or text token. * * @return */ public boolean isTag() { return this.type == TokenType.Tag; } public boolean isEntity() { return this.type == TokenType.Entity; } /** * Only defined for isTag() == true * * @return */ public boolean isOpening() { return this.tagType == TagType.Opening; } /** * Only defined for isTag() == true * * @return */ public boolean isClosing() { return this.tagType == TagType.Closing; } /** * Only defined for isTag() == true * * @return */ public boolean isSelfClosing() { return this.tagType == TagType.SelfClosing; } protected static String RegexEntity = "&[^;&< ]++;"; //Aug 21. Added space as banned character. Should help perf. Possessive quantifiers are good here - mutually exclusive groups /** * Needs DOTALL */ protected static String RegexComment = "<!--(.*?)-->"; //Lazy quantifier is what we want for proper comment parsing protected static String RegexText = "((?:[^<&]++|<\\s|&\\s)++)"; //Aug 21. Fixed so it doesn't match empty strings any more... was "((?:[^<&]++|<\\s|&\\s)*+)" protected static String RegexTag = "<(/)?+([\\w\\-\\.:]++)(\\s++[^>]*?)??(/)??>"; /** * Aug 21. Was: <(/)?+([\\w\\-\\.:]++)(\\s++[^>]*?)?(/)?+> * <p> * This regex was flawed, because groups 3 and 4 overlapped with character '/' (found in some tags)... * Since the first was lazy, and the second possessive, it exposed a java bug... * <p> * Added lazy quantifiers after both groups.. Should be correct parsing now. * <p> * Text discovered: * <record class="NormalLevel" fullPath="/" level="root" levelDefOrder="Year,Tape,Chapter,Section,Normal Level" * levels="Year,Tape,Chapter,Section"> */ protected static Pattern pEntity = Pattern.compile("^" + RegexEntity + "$", Pattern.DOTALL); protected static Pattern pComment = Pattern.compile("^" + RegexComment + "$", Pattern.DOTALL); /** * No opening angle brackets or ampersands, unless they are followed by whitespace. */ protected static Pattern pText = Pattern.compile("^" + RegexText + "$"); /** * group(1) closing slash * group(2) tag name * group(3) tag attributes * group(4) self closing slash **/ protected static Pattern pTag = Pattern.compile("^" + RegexTag + "$"); /** * group(1) name * group(2,3,4,5) values */ protected static Pattern attributePair = Pattern.compile("\\G\\s++(\\w[\\w-:]*+)(?:\\s*+=\\s*+\"([^\"]*+)\"|\\s*+=\\s*+'([^']*+)'|\\s*+=\\s*+([^\\s=/>]*+)|(\\s*?))"); /** * Returns true if the tag name (case-insensitive) matches the regex. Returns false unless the token is a tag. Returns false if there is a parse exception * * @param regex * @return */ public boolean matches(String regex) { if (!isTag()) return false; return TokenUtils.fastMatches(regex, this.getTagNameSilent()); } /** * Reparses all cached data from the .markup attribute. Also re-determines token type. * * @throws folioxml.core.InvalidMarkupException */ public void reparse() throws InvalidMarkupException { if (pEntity.matcher(markup).find()) { this.type = TokenType.Entity; } else if (pTag.matcher(markup).find()) { this.type = TokenType.Tag; parseTag(true); } else if (pComment.matcher(markup).find()) { this.type = TokenType.Comment; } else if (pText.matcher(markup).find()) { this.type = TokenType.Text; //TODO: parse whitespace=true/false here and cache for later use. It's always needed. } else { throw new InvalidMarkupException("Invalid use of < or &:" + markup); } //TODO: check for -- in XML comments. //Check for invalid text and entities also. } /** * Parses the 'markup' attribute if needed * * @throws folioxml.core.InvalidMarkupException */ protected void parseTag() throws InvalidMarkupException { parseTag(false); } protected void parseTag(boolean reparse) throws InvalidMarkupException { if (this.type == TokenType.None) { reparse(); return; } if (!isTag()) return; //Only parse tags if (tagName != null && !reparse) return; //Don't parse if it's already done Matcher m = pTag.matcher(markup); if (!m.find()) throw new InvalidMarkupException("Tag syntax is wrong: \"" + markup + "\".", this); parseTagFromMatcher(m); } protected void parseTagFromMatcher(Matcher m) throws InvalidMarkupException { //Tag type boolean closing = (m.group(1) != null && m.group(1).length() > 0); boolean selfClosing = (m.group(4) != null && m.group(4).length() > 0); if (closing) this.tagType = TagType.Closing; else if (selfClosing) this.tagType = TagType.SelfClosing; else this.tagType = TagType.Opening; //Tag name this.tagName = m.group(2); if (attrs != null) attrs.clear(); //Empty if we are doing a reparse //Parse attributes String attrText = m.group(3); if (attrText == null) attrText = ""; Matcher ma = attributePair.matcher(attrText); int index = 0; while (ma.find(index)) { if (attrs == null) attrs = new TreeMap<String, String>(String.CASE_INSENSITIVE_ORDER);//Default Collator case-insensitive (TETERARY) //FIixed bug #80 on Feb 2. String name = ma.group(1); String value = ma.group(2); if (value == null) value = ma.group(3); //Aug 21... fixed typo m->ma if (value == null) value = ma.group(4); if (value == null) value = ma.group(5); assert (name != null && value != null); //Jan 21. 2009. Fixed attribute parsing attrs.put(name, TokenUtils.attributeDecode(value)); index = ma.end(); } //check remainder if (index < attrText.length()) { String remainder = attrText.substring(index); if (!remainder.matches("^\\s*$")) { //Any remaining text after attribute parsing should be whitespace. Invalid syntax. throw new InvalidMarkupException("Failed to parse tag attributes: " + remainder, this); } } if (this.isClosing() && inXmlTokenMode() && attrs != null && !attrs.isEmpty()) throw new InvalidMarkupException("Closing xml tags cannot have attributes!", this); } /** * Returns the markup representation of the token, whether it is is an entity, comment, text, or tag * * @throws InvalidMarkupException */ public String toString() { return toTokenString(); } /** * Returns the markup representation of the token, whether it is is an entity, comment, text, or tag * * @throws InvalidMarkupException */ public String toTokenString() { if (this.isTag()) return writeTokenTo(null).toString(); else { return this.markup; //TODO: We need some way to prevent -- in comments (other than the start and end...) Added fix to SlxTranslator so comments are encoded properly when arriving from Folio } } public StringBuilder writeTokenTo(StringBuilder sb) { return writeTokenTo(sb, false); } public StringBuilder writeTokenTo(StringBuilder sb, boolean decodeEntitiesInText) { //Calculate size int initialCapacity = 20; if (markup != null) initialCapacity = markup.length(); //Grow or create if (sb != null) sb.ensureCapacity(sb.length() + initialCapacity); else sb = new StringBuilder(initialCapacity); if (tagName == null || !isTag()) { sb.append(decodeEntitiesInText ? TokenUtils.entityDecodeString(markup) : markup); } else { if (tagType == TagType.Closing) sb.append("</"); else sb.append("<"); //name sb.append(tagName); if (attrs != null) { Set<Entry<String, String>> pairs = attrs.entrySet(); for (Entry<String, String> entry : pairs) { sb.append(' '); //TODO add wrapping code here sb.append(entry.getKey()); sb.append("=\""); //Jan 21, 2009 - fixed attribute encoding bug. if (entry.getValue() != null) sb.append(TokenUtils.attributeEncode(entry.getValue())); sb.append('"'); } } if (tagType == TagType.SelfClosing) sb.append(" />"); else sb.append('>'); } return sb; } /** * Returns the tag name * * @return * @throws InvalidMarkupException */ public String getTagName() throws InvalidMarkupException { parseTag(); return tagName; } /** * Returns the tag name * * @return * @throws InvalidMarkupException */ public String getTagNameSilent() { try { parseTag(); } catch (InvalidMarkupException e) { } return tagName; } /** * Returns the value of the specified attribute. * * @param attributeName * @return * @throws InvalidMarkupException */ public String get(String attributeName) throws InvalidMarkupException { parseTag(); if (attrs == null) return null; return attrs.get(attributeName); } /** * Call before manipulating .attrs * Makes sure the tag has been parsed, and initializes the attribute collection if it is null. * * @throws folioxml.core.InvalidMarkupException */ protected void prepareAttrs() throws InvalidMarkupException { parseTag(); if (inXmlTokenMode() && !(this.isOpening() || this.isSelfClosing())) throw new InvalidMarkupException("You can only set attributes on opening and self-closing XML tokens", this); if (attrs == null) attrs = new TreeMap<String, String>(String.CASE_INSENSITIVE_ORDER); } /** * Returns a reference to the Map of the attributes. If the map is null, it is initialized. * Calling on a closing token in xmlMode will cause an exception. * * @return * @throws InvalidMarkupException */ public Map<String, String> getAttributes() throws InvalidMarkupException { prepareAttrs(); return attrs; } /** * Deletes the attribute map from this token * * @return */ public T deleteAttributes() { attrs = null; return (T) this; } //public boolean stopsNewContext; /** * Sets the value of the specified attribute. * Returns this; for chaining. * * @param attributeName * @param value * @return */ public T set(String attributeName, String value) throws InvalidMarkupException { prepareAttrs(); if (attributeName == null || value == null) throw new NullPointerException(); attrs.put(attributeName, value); return (T) this; } /** * Removes the specified attribute by name. * Returns this; for chaining. * * @param attributeName * @param value * @return */ public T removeAttr(String attributeName) throws InvalidMarkupException { prepareAttrs(); if (attributeName == null) throw new NullPointerException(); attrs.remove(attributeName); return (T) this; } /** * Appends the specified value to the current value of the attribute. Creates the attribute if it is missing. Returns this; for chaining. * * @param attributeName * @param value */ public T appendToAttribute(String attributeName, String value) throws InvalidMarkupException { prepareAttrs(); if (attributeName == null || value == null) throw new NullPointerException(); if (attrs.containsKey(attributeName)) attrs.put(attributeName, get(attributeName) + value); else attrs.put(attributeName, value); return (T) this; } /** * Appends the specified value to the current value of the attribute. Creates the attribute if it is missing. Returns this; for chaining. * If there is already data in the attribute, it will add a semicolon or comma. If attributename=="style", a semicolon is used. Otherwise a comma is used. For comma-delimted data, commas are html-encoded. * * @param attributeName * @param value */ public T appendToAttributeSmart(String attributeName, String value) throws InvalidMarkupException { prepareAttrs(); if (attributeName == null || value == null) throw new NullPointerException(); if (attrs.containsKey(attributeName)) { String originalValue = get(attributeName); String delimiter = attributeName.equalsIgnoreCase("style") ? ";" : ","; //add appropriate delimiter if (originalValue.length() > 0) { if (!originalValue.endsWith(delimiter)) { //TODO: This is a bug. We can't know if the first commas inserted are delimiters or just commas. //We have to have a way to mark that an attribute's value is a list... a trailing delimiter? //TODO: Analyze use cases and build tests. This is breaking groups at the moment. //if (delimiter.equals(",")) originalValue = originalValue.replace(",", ","); originalValue += delimiter; //Encode delimiters if it doesn't end with one } } //Remove trailing commas/semicolons prior to encoding. String suffix = ""; while (value.endsWith(delimiter)) { suffix += delimiter; value = value.substring(0, value.length() - 1); } //Enode commas *only*. semicolons are a bad idea - sometimes we want to add multiple css pairs at a time. Maybe an overload later? if (delimiter.equals(",")) value = value.replace(",", ","); attrs.put(attributeName, originalValue + value + suffix); } else attrs.put(attributeName, value); //Don't encode until the second item is added. return (T) this; } public T addAttributesTo(T target) throws InvalidMarkupException { prepareAttrs(); for (Entry<String, String> e : attrs.entrySet()) { target.appendToAttributeSmart(e.getKey(), e.getValue()); } return (T) this; } }