package com.tyndalehouse.step.tools.esv.deprecated; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @Deprecated public class EsvCompleteTagging { private static final Logger LOGGER = LoggerFactory.getLogger(EsvCompleteTagging.class); private Map<Node, List<Node>> nodeChanges = new LinkedHashMap<Node, List<Node>>(72000); /** * @param args the args, not used * @throws Exception */ public static void main(final String[] args) throws Exception { new EsvCompleteTagging().process(args[0], args[1]); } private void process(final String path, final String output) throws Exception { final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); final DocumentBuilder newDocumentBuilder = factory.newDocumentBuilder(); final Document i = newDocumentBuilder.parse(new File(path)); final Element root = i.getDocumentElement(); boolean hasChanges = true; int n = 1; while (hasChanges) { walkNode(root); hasChanges = this.nodeChanges.size() > 0; LOGGER.info("Pass #{}, {} changes", n++, this.nodeChanges.size()); processChanges(); } final TransformerFactory tfFactory = TransformerFactory.newInstance(); final Transformer t = tfFactory.newTransformer(); final OutputStream os = new FileOutputStream(new File(output)); t.transform(new DOMSource(root), new StreamResult(os)); os.close(); } private void processChanges() { for (final Entry<Node, List<Node>> change : this.nodeChanges.entrySet()) { final Node source = change.getKey(); final List<Node> mutations = change.getValue(); for (final Node destination : mutations) { final Node firstChild = source.getFirstChild(); source.insertBefore(destination, firstChild); } } this.nodeChanges = new HashMap<Node, List<Node>>(8000); } private void walkNode(final Node theNode) { final NodeList children = theNode.getChildNodes(); // printNode(theNode); for (int ii = 0; ii < children.getLength(); ii++) { final Node currentNode = children.item(ii); final boolean isWNode = "w".equals(currentNode.getNodeName()); if (isWNode) { // put as many changes as are allowed final List<Node> changes = new ArrayList<Node>(); for (int jj = ii - 1; jj >= 0; jj--) { final Node previousNode = children.item(jj); if (isFoldableNode(previousNode)) { LOGGER.trace("Two nodes, sharing the same parent, and current node is w"); changes.add(previousNode); } else { break; } } if (changes.size() > 0) { this.nodeChanges.put(currentNode, changes); } } if (currentNode.hasChildNodes()) { walkNode(currentNode); } else { // do nothing with leaf nodes // printNode(currentNode); } } } private boolean isFoldableNode(final Node previousNode) { // OSIS spec says that a, index, note, seg are allowed in w's, and obviously text final String nodeName = previousNode.getNodeName(); final boolean isTextNode = "#text".equals(nodeName); if (isTextNode) { // is a proper textNode? return isProperText(previousNode); } if ("a".equals(nodeName) || "index".equals(nodeName) || "note".equals(nodeName) || "seg".equals(nodeName)) { // check that it is preceded by a proper text final Node previousSibling = previousNode.getPreviousSibling(); if (previousSibling == null) { // first note and such like don't get wrapped up return false; } // otherwise check that it isn't just punctuation return isProperText(previousSibling); } return false; } private boolean isProperText(final Node previousNode) { if (!"#text".equals(previousNode.getNodeName())) { return false; } final Node nodeBeforePrevious = previousNode.getPreviousSibling(); if (nodeBeforePrevious != null && "#text".equals(nodeBeforePrevious.getNodeName())) { // we have two subsequent text nodes, so alert LOGGER.warn("Several text nodes follow each other"); return true; } final String s = previousNode.getNodeValue(); for (int ii = 0; ii < s.length(); ii++) { if (Character.isLetter(s.charAt(ii))) { return true; } } return false; } }