package folioxml.xml; import folioxml.core.InvalidMarkupException; import folioxml.core.TokenUtils; import folioxml.slx.*; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.*; /** * Never proccess the same data twice with the same instance, or you will encounter errors. (There is an algorithm checker) * * @author nathanael */ public class SlxToXmlTransformer { public SlxToXmlTransformer() { } public SlxToXmlTransformer(SlxRecordReader srr) { this.srr = srr; } public SlxToXmlTransformer(File f) throws UnsupportedEncodingException, FileNotFoundException, InvalidMarkupException, IOException { this.srr = new SlxRecordReader(f); } public SlxToXmlTransformer(boolean keepEmptyGhostTags, boolean createEmptyGhostsAtStartAndEnd, boolean reuseAttrMaps) { this.createEmptyGhostsAtStartAndEnd = createEmptyGhostsAtStartAndEnd; this.keepEmptyGhostTags = keepEmptyGhostTags; this.reuseAttrMaps = reuseAttrMaps; } /** * If true, ghost tags with no content will be maintained. If false, they will be deleted. */ public boolean keepEmptyGhostTags = false; //If false, ghost tags with no content will not be re-created. /** * True to place self-closing ghost tags at the beginning and end of a ghost tag's area, even if they contain no content. * This can be useful for reversing the splitting process, but makes the markup messier. */ public boolean createEmptyGhostsAtStartAndEnd = false; //If true, empty ghost pairs will be placed at the beginning and end of a ghost tag's area, even if they contain no content. /** * True to recycle the attribute maps from the SlxTokens. If you set this to true, you cannot use the SlxTokens or the SlxRecord again, they will be corrupted. */ public boolean reuseAttrMaps = false; public SlxRecordReader srr = null; public XmlRecord read() throws InvalidMarkupException, IOException { SlxRecord r = srr.read(); if (r == null) return null; return convert(r); } public void close() throws IOException { srr.close(); srr = null; } /** * Converts the SlxRecord. * If reuseAttrMaps, It will recycle data structures from the tokens where possible. Don't attempt to use the SlxRecord afterwards. * Regardless of the setting SlxTokens will have a pairID set on them. * * @param r * @return * @throws InvalidMarkupException * @throws IOException */ public XmlRecord convert(SlxRecord r) throws InvalidMarkupException, IOException { //1) First, tag ghost pairs with a UUID if they aren't already. if (!r.ghostPairsGenerated) tagGhostPairs(r.getTokenReaderForRecord()); //2) Then, build the tree structure and mark it up for splitting TNode tree = buildTree(r.getTokenReaderForRecord()); //3) Follow the splitting instructions splitGhosts(tree); //4) The tree should have one root node, <record>. Find this, and turn it into an XmlRecord instance. if (tree.childrenCount() > 1) throw new InvalidMarkupException("No tags can exist outside the record tag"); else if (tree.childrenCount() == 0) throw new InvalidMarkupException("Tree cannot be empty - record tag should exist"); ///tree.children //assert length is one (the SlxRecord). //Write out the new, XML compliant token stream XmlRecord newR = new XmlRecord(r, false); tree.toXmlNode().children.filterByTagName("record", false).first(true).moveChildrenTo(newR); r.slxXmlRecordTag = newR; //Ok, now future XmlRecords will be able to translate the .parent chain by lookup return newR; } /* * Tags the ghost pairs using SlxContextStack */ public void tagGhostPairs(ISlxTokenReader r) throws IOException, InvalidMarkupException { SlxContextStack s = new SlxContextStack(false, true); while (r.canRead()) { SlxToken t = r.read(); s.process(t); } if (s.size() > 0) throw new InvalidMarkupException("Token stream is not complete - there are orphaned tags"); } public TNode buildTree(ISlxTokenReader r) throws InvalidMarkupException, IOException { //The root of the tree. No token is attached to this node. TNode root = new TNode(); root.isRoot = true; //Use to build a TNode tree... TNodeStack s = new TNodeStack(); while (r.canRead()) { SlxToken token = r.read(); TNode current = new TNode(); current.t = token; //This is how we know how to build the tree. TNodeStack tracks it for us. current.parent = s.top() == null ? root : s.top(); //Process the TNode. At this point, both opening and closing TNodes are proccessed. Should throw an error if there is anything //wrong with the non-ghost tags structure. s.process(current); //But never allow closing nodes to be added into the hierarchy unless they are ghost tags. if (!token.isTag() || token.isGhost || !token.isClosing()) { current.parent.addChild(current); } //Verify ghost tags have been tagged if (token.isGhost && token.ghostPair == null) throw new InvalidMarkupException("buildTree cannot be called until ghost pairs are all tagged. Use tagGhostPairs if the tokens aren't coming from SlxRecordReader", token); //Good time to find the connecting paths between the pairs //Populate the ghostChildren collections. //Find duplicates in ghostChildren and put them in ghostPairs, then remove those entries in parent recursively. //Switch to using a linkedhashmap if (token.isGhost && token.isOpening()) { //Go all the way up the ancestry, adding the UUID and current TNode to the .ghostChildren collection of each ancestor TNode temp = current.parent; while (temp != null) { //Prepare if null if (temp.ghostChildren == null) temp.ghostChildren = new LinkedHashMap<UUID, TNode>(); temp.ghostChildren.put(token.ghostPair, current); //Mark every parent up the chain. temp = temp.parent; } } else if (token.isGhost && token.isClosing()) { //Cycle up the chain looking for the intersecting parent //Remove from ancestors above the intersection point. //Move to ghostPairs for current node TNode temp = current.parent; while (temp != null) { //prepare if null if (temp.ghostChildren == null) temp.ghostChildren = new LinkedHashMap<UUID, TNode>(); if (temp.ghostChildren.containsKey(token.ghostPair)) { //Match found - intersect point a/b\c TNode opener = temp.ghostChildren.get(token.ghostPair); //Remove children entries here and above TNode temp2 = temp; while (temp2 != null) { if (temp2.ghostChildren != null) temp2.ghostChildren.remove(token.ghostPair); temp2 = temp2.parent; } //Add to ghostPairs if (temp.ghostPairs == null) temp.ghostPairs = new LinkedHashMap<UUID, Pair<TNode, TNode>>(); temp.ghostPairs.put(token.ghostPair, new Pair<TNode, TNode>(opener, current)); current.opener = opener; break; } else { temp.ghostChildren.put(token.ghostPair, current); //Mark every parent up the chain. } temp = temp.parent; } } } //Big huge warning here! We are relying on SlxTransformer to ensure that ghost tags exist in matched pairs. //If there is a orphan ghost tag of either opening or closing type, it will encompass the entire remainder of the record. //One way to check for problems is make sure that root.ghostChildren is empty - this means no orphans. if (root.ghostChildren != null && root.ghostChildren.size() > 0) { throw new InvalidMarkupException("There are orphaned ghost tags present in this record. They should have already been cleaned by now"); } return root; } public HashSet<UUID> bannedPairs = new HashSet<UUID>(); public void splitGhosts(TNode b) throws InvalidMarkupException { //Go through the tree recursively, ghostChildren and ghostPairs should be populated. //Remove all ghost tag references from the children collections. We don't need them anymore - we have ghostChildren and ghostPairs. //Ghost tags have no ghostChildren or ghostPairs to work with, and no children either. //Order is important - since we are processing top-down, removing leaf nodes after their intermediate parent replacements already exist should be harmless. //We can still reference these through ghostPairs and ghostChildren until we're done, so the b -> parent link should not be broken. //Skip ghost nodes. What if it is located in a place it can't stay? if (b.t == null && !b.isRoot) throw new InvalidMarkupException("Error converting SLX to XML: Token not attached to tree node."); if (b.t != null && b.t.isGhost) { return; } TNode countDepth = b; int depth = 0; while (countDepth != null) { countDepth = countDepth.parent; depth++; } //Depth assertion...Not needed //if (depth > 35) // assert(depth < 40); //Or we have some kind of problem if (b.t != null && b.parent != null && b.parent.t != null) { if (b.t.ghostPair != null && b.t.ghostPair == b.parent.t.ghostPair) throw new InvalidMarkupException("We have nested ghost pairs - impossible?"); //we should never have two copies of a pair in the hierarchy //if (b.t.matches(b.parent.t.getTagName())){ // && (b.t.get("type") == null || b.t.get("type").equalsIgnoreCase(b.parent.t.get("type"))) //We have matching tags nested. //assert(true); //} } if (b.delayedGhosts != null && b.children != null) { for (SlxToken insert : b.delayedGhosts) { if (b.t != null && !SlxValidator.isAllowedInside(insert, b.t)) { for (TNode tnp : b.children) { tnp.addDelayedGhost(insert); } } else { b.insertParent(b.children, insert.toNonGhostVersion(true), false, -1, this); //Handles adding ghostChildren, moving ghostPairs if needed } } b.delayedGhosts = null; } //Process pairs (a/b\c if (b.ghostPairs != null) { while (b.ghostPairs.size() > 0) { Object key = b.ghostPairs.keySet().toArray()[0]; if (bannedPairs.contains(key)) { throw new InvalidMarkupException("Encountered the same ghost pair twice!", b.t); } TNode a = b.ghostPairs.get(key).a; TNode c = b.ghostPairs.get(key).b; b.ghostPairs.remove(key); //Removed; bannedPairs.add((UUID) key); //Somehow, the ghost pair gets recreated once more causing duplicates... TNode aBranch = a.findChildUnder(b); TNode cBranch = c.findChildUnder(b); if (aBranch == cBranch) throw new InvalidMarkupException("Bug in algorithm - aBranch and cBranch cannot be equal"); //b.children should exist if b.ghostPairs does. if (b.children.indexOf(aBranch) < 0 || b.children.indexOf(cBranch) < 0) throw new InvalidMarkupException("Failed to locate the branches A and C belong to. Ghost tag splitting failed on token", b.t); //Get the middle batch - between aBranch and cBranch - the simple part List<TNode> list = b.children.subList(b.children.indexOf(aBranch) + 1, b.children.indexOf(cBranch)); //There is only one peak for each pair of ghost tags. //Thus, we can steal the attribute collection from the opening ghost tag this once. //The ghost tag is discarded during writing, so the duplicate reference shouldn't cause a problem. //Should save a lot on initialization costs. Simple ghost tags are the majority. //DISCOVERED OVERSIGHT //You can't insert a intermediate parent under a <table> or <tr> tag. //Gotta push those down to the subchildren. SlxToken insert = a.t.toNonGhostVersion(false); //WARNING - the attribute collection on the original is cloned. if (b.t != null && !SlxValidator.isAllowedInside(insert, b.t)) { insert = a.t.toNonGhostVersion(true); //Gotta deep copy... //System.out.println("Failed to place " + insert.toString() + " inside " + b.toString()); for (TNode tnp : list) { tnp.addDelayedGhost(insert); } } else { b.insertParent(list, insert, aBranch == a && cBranch == c && keepEmptyGhostTags, b.children.indexOf(aBranch), this); //Handles adding ghostChildren, moving ghostPairs if needed (which affects this loop, but should be safe) } } } //Process a/b/c and a\b\c if (b.ghostChildren != null) { while (b.ghostChildren.size() > 0) { Object key = b.ghostChildren.keySet().toArray()[0]; TNode lower = b.ghostChildren.get(key); b.ghostChildren.remove(key); //remove TNode branch = lower.findChildUnder(b); if (b.children.indexOf(branch) < 0) throw new InvalidMarkupException("Failed to locate the branches A or C belong to. Ghost tag splitting failed on token", b.t); //b.children should exist if b.ghostChildren does. //Build SlxToken insert = null; List<TNode> list = null; //Find the opening ghost tag and make a non-ghost version. //Populate a list of afected children. if (lower.t.isOpening()) { insert = lower.t.toNonGhostVersion(true); list = b.children.subList(b.children.indexOf(branch) + 1, b.children.size());//Inserts a intermediate parent for children after 'branch' } else if (lower.t.isClosing()) { TNode opening = lower.opener; //findOpeningTag() Won't work - the pair is already gone. Gotta keep a reference. if (opening == null) throw new InvalidMarkupException("Matching opening node wasn't found.", lower.t); insert = opening.t.toNonGhostVersion(true); list = b.children.subList(0, b.children.indexOf(branch));//Before branch - a closing tag } //If we can't add an intermediate here, delay it to the child. if (b.t != null && !SlxValidator.isAllowedInside(insert, b.t)) { for (TNode tnp : list) { tnp.addDelayedGhost(insert); } } else { //Insert b.insertParent(list, insert, branch == lower && createEmptyGhostsAtStartAndEnd, b.children.indexOf(branch), this); } } } //Do all children recursively for (int i = 0; i < b.childrenCount(); i++) { splitGhosts(b.children.get(i)); } } private class TNode { public boolean isRoot = false; /** * The token this object wraps */ public SlxToken t = null; //What about the closing tag..? Drop? public TNode parent = null; public List<TNode> children = null; public UUID pairId = null; public TNode opener = null; //So closing ghost tags can keep a reference to their opeing ghost tag for cloning purposes. public LinkedHashMap<UUID, TNode> ghostChildren = null; public List<SlxToken> delayedGhosts = null; //These must be added around all node contents, since it was invalid XML for them to be added around this node itself. public LinkedHashMap<UUID, Pair<TNode, TNode>> ghostPairs = null; public boolean generated = false; public int childrenCount() { return children == null ? 0 : children.size(); } /** * Converts the tree into an xml Node * * @return */ public Node toXmlNode() { Node n = new Node(t, !reuseAttrMaps); //Original token gets modified! if (t != null) { if (t.isGhost) return null; assert (!(t.isTag() && t.isClosing())) : "Closing nodes should not exist here"; } if (children != null) { if (t != null) assert (t.isTag() && t.isOpening()) : "Huh?"; //Create the children collection if (children.size() > 0) n.children = new NodeList(children.size()); for (TNode tn : children) { Node c = tn.toXmlNode(); if (c != null) { n.children.list().add(c); c.parent = n; } } } return n; } public void fixChildrenParentRefs() { for (int i = 0; i < childrenCount(); i++) { children.get(i).parent = this; } } /* public void write(ISlxTokenWriter w) throws InvalidMarkupException{ if (t == null){ //The root node only writes the children. for (int i = 0; i < childrenCount(); i++){ children.get(i).write(w); } }else{ if (this.t.isGhost) return; //No closing tags should be present assert(!(this.t.isClosing() && this.t.isTag())); w.write(t); if (t.isTag() && t.isOpening()){ for (int i = 0; i < childrenCount(); i++){ children.get(i).write(w); } w.write(t.getClosingTag()); }else{ assert(childrenCount() == 0); } } } */ /** * Looks upwards through the hierarchy and returns the ancestor that is a immediate descendant of grandparent). * This may be itself. * * @param grandparent * @return */ public TNode findChildUnder(TNode grandparent) { TNode temp = this; while (temp.parent != null) { if (temp.parent == grandparent) return temp; temp = temp.parent; } return null; } /** * Inserts a parent between this node and a set of children. * * @param affectedChildren * @param t * @throws InvalidMarkupException */ public void insertParent(List<TNode> affectedChildren, SlxToken t, boolean keepIfEmpty, int insertEmptyTagAt, SlxToXmlTransformer optionalCaller) throws InvalidMarkupException { //You must pass in non-ghost children in for tags to be created. int nonGhosts = 0; for (TNode ch : affectedChildren) { if (!ch.t.isGhost) nonGhosts++; //Ghost tags cannot have children, so we don't have to be recursive. } if (nonGhosts == 0 && !keepIfEmpty) return; if (insertEmptyTagAt < 0 && keepIfEmpty) throw new InvalidMarkupException("Please specify a positive insertEmptyTagAt value if keepIfEmpty=true"); //We can't do it directly, or we will get a coomidifcation error - we have to clone before assigning to n.children ArrayList<TNode> copy = new ArrayList<TNode>(); copy.addAll(affectedChildren); //Here's what's happening. //insertParent is getting called in a peculiar order. Need to try to fix this anyhow... but I think the results should be the same regardless of order //insertParent is being called on B first somehow. (duh!, because b completes before a in ghostPairs....) (also, ghostPairs get processed before ghostChildren, that will always be a problem_ // <a><b></b></a> //The problem is that ghost tags pair peaks are occuring twice. //To avoid this, we need to prevent addGhostChildrenSmart from adding anything that doesn't already exist in the corresponding parent ggostPairs. //If the parent doesn't have a ghost pair, we shouldn't have it, because the parent has already proccessed it. //If we have it, and they do also, remove it from the parent. //ghostChildren aren't affected, since those are desired at every level of the tree. //The new intermediate parent TNode n = new TNode(); n.parent = this; n.children = copy; n.t = t; //The opening ghost tag n.generated = true; //Generated tokens are getting processed twice.. :( //Swap the parent references, insert 'n', and remove children n.fixChildrenParentRefs(); //Get index of first if (copy.size() > 0) { int insertAt = this.children.indexOf(copy.get(0)); this.children.removeAll(copy); this.children.add(insertAt, n); //Add 'n to replace the children. } else { //The empty check is performed at the top of the function. keepIfEmpty=true here. this.children.add(insertEmptyTagAt, n); } //Adding *all* ghost children from descendants of 'n' to 'n' for (TNode c : copy) { //Add direct ghost children as well. if (c.t.isGhost) n.addGhostChildSmart(c); //Add others if (c.ghostChildren != null) { Collection<TNode> vals = c.ghostChildren.values(); for (TNode v : vals) { n.addGhostChildSmart(v); //ghostPairs is updated on the fly. } } } if (n.ghostPairs != null && ghostPairs != null) { //Find the intersection of 'n.ghostPairs' and 'this.ghostPairs'. //Use this intersection as the new collection for n.ghostPairs. //Remove this intersection from 'this.ghostPiars'. Object[] nKeys = n.ghostPairs.keySet().toArray(new UUID[0]); //Can't iterate through a collection we are modifying. Have to use a fixed array of keys //Go through each key in 'n'. If present in 'this.ghostPairs', remove from this. //If not present, remove from 'n'. for (Object o : nKeys) { if (this.ghostPairs.remove(o) == null) { //Null means that 'this' didn't have it. //Soo. remove from n n.ghostPairs.remove(o); } else { //This means that 'this' had it, and removed it. only 'n' has it now. //Check the banned collection if (optionalCaller.bannedPairs.contains(o)) { throw new InvalidMarkupException("Encountered the same ghost pair twice!"); } } } } else { if (n.ghostPairs != null && ghostPairs == null) n.ghostPairs = null; //If the parent has no ghost pairs, we shouldn't have any. } } protected void addGhostChildSmart(TNode c) throws InvalidMarkupException { UUID key = c.t.ghostPair; if (key == null) throw new InvalidMarkupException("Ghost tag is missing pair key UUID"); if (ghostChildren == null) ghostChildren = new LinkedHashMap<UUID, TNode>(); if (ghostChildren.containsKey(key)) { TNode o = ghostChildren.get(key); if (o == c) { throw new InvalidMarkupException(); //Don't remember. But shouldn't happen. } //Add to ghostPairs if (ghostPairs == null) ghostPairs = new LinkedHashMap<UUID, Pair<TNode, TNode>>(); ghostPairs.put(key, new Pair<TNode, TNode>(o, c)); c.opener = o; //Remove from ghostChildren ghostChildren.remove(key); } else { ghostChildren.put(key, c); } } public void addChild(TNode c) { if (this.children == null) children = new ArrayList<TNode>(); children.add(c); } public void addDelayedGhost(SlxToken t) { if (this.delayedGhosts == null) delayedGhosts = new ArrayList<SlxToken>(); delayedGhosts.add(t); } public String toString() { return (t != null) ? t.toString() : "ROOT NODE"; } } private class Pair<A, B> { public final A a; public final B b; public Pair(A first, B second) { this.a = first; this.b = second; } } /** * Based on the SlxContextStack, but with all the ghost logic removed, and designed for the TNode wrapper type instead. * * @author nathanael */ private class TNodeStack extends Stack<TNode> { public TNode top() { if (this.size() > 0) return this.peek(); else return null; } /** * Returns the innermost tag that matches the specified tag name and type value. Tag name and value can be a regex. if typeValue == null, find() will return null * if typeValue is null, then types will not be filtered. * * @param name * @param typeValue * @param bypassContext * @return */ public SlxToken find(String name, String typeValue, boolean bypassContext) throws InvalidMarkupException { SlxToken s; for (int i = this.size() - 1; i >= 0; i--) { s = this.get(i).t; if (s.matches(name) && (typeValue == null || TokenUtils.fastMatches(typeValue, s.get("type")))) return s; if (!bypassContext && s.startsNewContext) return null; //don't cross context bounds } return null; } /** * Performs the appropriate .add() or .pop(), needed for the specified tag. * Compares tag name and the 'type' attribute to determine equivalence. * * @param t * @return * @throws InvalidMarkupException */ public void process(TNode tn) throws InvalidMarkupException { SlxToken t = tn.t; if (!t.isTag()) return; //Only tags are proccessed if (t.isGhost) return; //Ghost tags are considered self-closing. //If it's an opening tag, add it to the stack. Ghosts if (t.isOpening()) this.push(tn); //(Only for non-ghosts): Make sure closing tags match with what's on the top of the stack. Ghost elements span & link aren't counted. if (t.isClosing()) { if (this.size() < 1) throw new InvalidMarkupException("Unexpected closing tag - no matching opening tags", t); //!!Remove tag from stack TNode opener = this.pop(); //Compare tag names. If the closing tag has a type attribute, compare that as well. boolean isMatch = t.matches(opener.t.getTagName()) && (t.get("type") == null || t.get("type").equalsIgnoreCase(opener.t.get("type"))); //Verify that this closing tag matches the topmost open tag (that's not a ghost) if (!isMatch) { boolean useContext = !t.startsNewContext; boolean matchExistsInContext = (this.find(t.getTagName(), t.get("type"), useContext) != null); if (matchExistsInContext) throw new InvalidMarkupException("Closing tag for " + opener.t.markup + " expected first.", t); else throw new InvalidMarkupException("Unexpected closing tag found.", t); } } } } }