EsvXmlEnhancer.java example

Explorer
step-master
/*******************************************************************************
 * Copyright (c) 2012, Directors of the Tyndale STEP Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 * Redistributions of source code must retain the above copyright 
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright 
 * notice, this list of conditions and the following disclaimer in 
 * the documentation and/or other materials provided with the 
 * distribution.
 * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)  
 * nor the names of its contributors may be used to endorse or promote 
 * products derived from this software without specific prior written 
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
 * THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************/
package com.tyndalehouse.step.tools.esv;

import com.tyndalehouse.step.tools.MultiMap;
import com.tyndalehouse.step.tools.MultiMapIndexer;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.Books;
import org.crosswire.jsword.book.OSISUtil;
import org.crosswire.jsword.passage.*;
import org.crosswire.jsword.versification.Testament;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.tyndalehouse.step.core.utils.StringUtils.*;
import static org.apache.commons.lang3.StringUtils.join;

/**
 * The Class EsvXmlEnhancer.
 */
@SuppressWarnings("all")
public class EsvXmlEnhancer {
    /** Class-wide SLF4J logger. */
    private static final Logger LOGGER = LoggerFactory.getLogger(EsvXmlEnhancer.class);
    /** Matches runs of characters that are not ASCII letters, digits, colons or spaces. */
    private static final Pattern REF_CLEAN = Pattern.compile("[^a-zA-Z0-9: ]+");
    /** Matches runs of the punctuation characters listed in the character class (dashes, quotes, brackets, ...). */
    static final Pattern PUNCTUATION = Pattern.compile("[\\-—,.;*:'\\[\\]!\"`?’‘()-]+");
    /** Matches {@code <digits>} with an optional trailing lower-case letter; group 1 captures the digits. */
    private static final Pattern STRONGS_SPLITTING = Pattern.compile("<(\\d+)[a-z]?>");
    /** The installed JSword book registered under the name "ESV". */
    private static final Book ESV = Books.installed().getBook("ESV");
    /** Tagging input file, as handed to the constructor. */
    private final File tagging;
    /** ESV text input file, as handed to the constructor. */
    private final File esvText;
    /** osisID of the verse being processed; for a chapter element it is the chapter osisID suffixed with ".0". */
    private String currentVerse;
    /** Tags still to be consumed for {@link #currentVerse}; {@code null} when the index has no entry for it. */
    private Deque<Tagging> verseTagging = null;
    /** Set when matching fails for the current verse/chapter; reset to {@code false} when the next one starts. */
    private boolean error = false;
    /** File the transformed document is written to. */
    private File outputPath;
    /** Book part of the most recently seen chapter osisID; used to log once per book. */
    private String lastBook = "";
    /** Value returned by {@code go()} and used as the process exit code; set to -1 when a problem is recorded. */
    private int runCode;

    /**
     * Creates an enhancer working on the given input files and output location.
     *
     * @param tagging    the tagging input file
     * @param esvText    the ESV text input file
     * @param outputPath the file the resulting document is written to
     */
    public EsvXmlEnhancer(final File tagging, final File esvText, File outputPath) {
        this.outputPath = outputPath;
        this.esvText = esvText;
        this.tagging = tagging;
    }

    /**
     * Command-line entry point: builds an enhancer from the three file arguments, runs it and exits the JVM
     * with the run code it returns.
     *
     * @param args {@code args[0]} is the tagging file, {@code args[1]} the ESV text file and
     *             {@code args[2]} the output file
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     * @throws Exception                any failure raised while running the enhancer
     */
    public static void main(final String[] args) throws Exception {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Expected 3 arguments: <tagging file> <esv text file> <output file>");
        }

        final File tagging = new File(args[0]);
        final File esvText = new File(args[1]);
        final File outputPath = new File(args[2]);
        if (!outputPath.exists()) {
            // getParent() is null for a bare file name, which used to blow up in new File(null);
            // resolving to an absolute path first gives the working directory as the parent instead
            final File parent = outputPath.getAbsoluteFile().getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
        }

        int ret = new EsvXmlEnhancer(tagging, esvText, outputPath).go();
        System.exit(ret);
    }

    /**
     * Parses the tagging, applies it to the text and reports the outcome.
     *
     * @return the run code accumulated during processing
     * @throws Exception any failure raised while parsing or applying the tagging
     */
    private int go() throws Exception {
        final MultiMap<String, Tagging, Deque<Tagging>> taggingByVerse = parseTagging();
        applyToText(taggingByVerse);
        LOGGER.info("Done!");
        return this.runCode;
    }

    /**
     * Reads the raw tagging, cleans it up and indexes it, logging how long the whole phase took.
     *
     * @return the indexed tagging
     * @throws Exception any failure raised while reading, cleaning or indexing
     */
    private MultiMap<String, Tagging, Deque<Tagging>> parseTagging() throws Exception {
        final long startedAt = System.currentTimeMillis();

        final List<Tagging> tags = readTagging();

        LOGGER.info("Cleaning up tagging");
        cleanupTagging(tags);

        LOGGER.info("Indexing tagging");
        final MultiMap<String, Tagging, Deque<Tagging>> indexed = indexTagging(tags);

        final long elapsed = System.currentTimeMillis() - startedAt;
        LOGGER.info("Init phase took [{}]ms", elapsed);
        traceLog(indexed);
        return indexed;
    }

    /**
     * Walks the ESV document applying the indexed tagging, then writes the document out. An
     * {@link AbortTagException} escaping the walk is logged and recorded in the run code; the document is
     * still written in that case.
     *
     * @param indexTagging the indexed tagging to apply
     * @throws Exception any other failure raised while reading, walking or writing the document
     */
    private void applyToText(final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) throws Exception {
        final Document document = readESVDoc();
        final Element root = document.getDocumentElement();

        try {
            traverse(root, indexTagging);
        } catch (final AbortTagException abort) {
            LOGGER.warn("Aborted...");
            this.runCode = -1;
        }

        // persist whatever state the document reached
        writeDoc(document);
    }

    /**
     * Serialises the given document to the configured output path, using a transformer created without a
     * stylesheet.
     *
     * @param esv the document to write
     * @throws Exception if the transformer cannot be created or the transformation fails
     */
    private void writeDoc(final Document esv) throws Exception {
        final Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.transform(new DOMSource(esv), new StreamResult(this.outputPath));
    }

    /**
     * Recursively walks the element tree, switching the current verse/tagging context on {@code verse} and
     * {@code chapter} elements and feeding text nodes to {@code processVerseContent}.
     * <p>
     * A {@code verse} element is handed to {@code processVerse} and not descended into here. A
     * {@code chapter} element is also handed to {@code processVerse} (under the pseudo verse id
     * {@code <chapterOsisID>.0}) and its children are then walked as for any other element.
     *
     * @param esv          the element to walk
     * @param indexTagging the indexed tagging, looked up by osisID
     * @throws Exception any failure raised while processing
     */
    private void traverse(final Element esv, final MultiMap<String, Tagging, Deque<Tagging>> indexTagging)
            throws Exception {
        LOGGER.trace("Tag [{}]", esv.getNodeName());

        if ("verse".equals(esv.getNodeName())) {
            this.currentVerse = esv.getAttribute("osisID");
            this.error = false;
            this.verseTagging = indexTagging.get(this.currentVerse);
            processVerse(esv, indexTagging);
            return;
        }

        if ("chapter".equals(esv.getNodeName())) {
            final String osisID = esv.getAttribute("osisID");
            // the book is whatever precedes the first '.'; fall back to the whole id when there is no '.'
            final int firstDot = osisID.indexOf('.');
            final String bookName = firstDot < 0 ? osisID : osisID.substring(0, firstDot);
            this.currentVerse = osisID + ".0";

            if (!this.lastBook.equalsIgnoreCase(bookName)) {
                LOGGER.info("Processing chapter [{}]", this.currentVerse);
                this.lastBook = bookName;
            }
            this.error = false;
            this.verseTagging = indexTagging.get(this.currentVerse);
            processVerse(esv, indexTagging);
            // no return, since we want to process the children.
        }

        final NodeList childNodes = esv.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            final Node item = childNodes.item(i);
            if (item instanceof Text) {
                final boolean canTag = StringUtils.isNotBlank(this.currentVerse)
                        && this.verseTagging != null && !this.error;
                if (canTag) {
                    try {
                        final int advanceTokens = processVerseContent((Text) item, this.verseTagging);

                        if (advanceTokens != 0) {
                            // skip the child that follows the text node just processed
                            LOGGER.debug("Advancing by [{}] token(s)", 1);
                            i++;
                        }
                    } catch (final AbortTagException ignored) {
                        // already logged where it was raised; carry on with the next child
                    }
                }
            } else if (item instanceof Element) {
                final Element traversableElement = (Element) item;
                if (!isIgnoreable(traversableElement)) {
                    traverse(traversableElement, indexTagging);
                }
            }
        }
    }

    /**
     * Tells whether an element should be skipped during traversal: every {@code note}, and every
     * {@code title} whose {@code canonical} attribute is not "true" (case-insensitive).
     *
     * @param traversableElement the element to test
     * @return true if the element is to be ignored
     */
    private boolean isIgnoreable(final Element traversableElement) {
        final String tag = traversableElement.getNodeName();

        if ("title".equals(tag)) {
            // only canonical titles are kept
            return !"true".equalsIgnoreCase(traversableElement.getAttribute("canonical"));
        }

        return "note".equals(tag);
    }

    /**
     * Matches the words of one DOM text node against the queue of tags for the current verse, consuming
     * tags from the front of the queue as their text is used up.
     *
     * @param item         the text node whose content is to be matched
     * @param verseTagging the tags still pending for the current verse; fully consumed tags are removed
     * @return the {@code advance} value of the last remainder; the caller skips the following child node
     *         when this is non-zero
     * @throws Exception in particular {@link AbortTagException} when there is text but no tag left for it,
     *                   or when a pass over the loop makes no progress
     */
    private int processVerseContent(final Text item, final Deque<Tagging> verseTagging) throws Exception {
        final String textContent = item.getTextContent();
        LOGGER.trace("{}: [{}]", this.currentVerse, textContent);

        final String wordsFromESV = replacePunctuation(textContent);
        Tagging firstTag = verseTagging.peekFirst();
        if (firstTag == null) {
            // no tags left: only a problem if this node still holds real words
            if (isNotBlank(wordsFromESV)) {
                LOGGER.warn("{}: No tagging for [{}]", this.currentVerse, wordsFromESV);
                this.error = true;
                this.runCode = -1;
                throw new AbortTagException();
            }
            return 0;
        }

        // start by matching against the untagged portion of the first pending tag
        Remainder initialRemainder = new Remainder(wordsFromESV, firstTag.getNonTaggedText());

        while (true) {
            // work on a clone so the state before and after the pass can be compared below
            final Remainder remainderAfterProcessingTag = processTag(initialRemainder.clone(), firstTag, item);

            if (isEmpty(remainderAfterProcessingTag.sourceText) || remainderAfterProcessingTag.advance > 0) {
                // remove the tag if empty
                if (isEmpty(firstTag.getNonTaggedText()) && isEmpty(firstTag.getTaggedText())) {
                    verseTagging.removeFirst();
                }

                // all text processed, so return
                return remainderAfterProcessingTag.advance;
            }

            // if both parts of the tag are empty by the end, then we can move on to the next tag
            if (isEmpty(firstTag.getNonTaggedText()) && isEmpty(firstTag.getTaggedText())) {
                verseTagging.removeFirst();
                firstTag = verseTagging.peekFirst();

                if (firstTag == null) {
                    // ESV text remains but the tagging is exhausted: record the problem and stop here
                    LOGGER.warn("{}: Arrived at end of tagging data. Remainder of ESV text is: [{}]",
                            this.currentVerse, remainderAfterProcessingTag.sourceText);
                    this.runCode = -1;
                    return remainderAfterProcessingTag.advance;
                }

                // continue with the untagged portion of the next tag
                remainderAfterProcessingTag.taggingText = firstTag.getNonTaggedText();
            }

            // check we have actually processed something
            if (initialRemainder.sourceText.equalsIgnoreCase(remainderAfterProcessingTag.sourceText)
                    && initialRemainder.taggingText.equalsIgnoreCase(remainderAfterProcessingTag.taggingText)) {
                LOGGER.warn("{}: No processing was made on ESV text between [{}] and [{}]",
                        this.currentVerse, remainderAfterProcessingTag.sourceText,
                        remainderAfterProcessingTag.taggingText);
                this.error = true;
                this.runCode = -1;
                // abort the tag processing
                throw new AbortTagException();
            }

            // set up to go round the loop again
            initialRemainder = remainderAfterProcessingTag;
        }
    }

    /**
     * Runs one tag against the remaining ESV text in two steps: first the tag's untagged text (nothing is
     * wrapped), then its tagged text (matched words are wrapped). The tag is updated with whatever text
     * is left after each step.
     *
     * @param remainder the matching state to work on
     * @param firstTag  the tag currently at the head of the queue
     * @param item      the DOM text node being processed
     * @return the matching state after both steps, or after the first step if it used up all ESV text
     * @throws AbortTagException if matching has to be abandoned
     */
    private Remainder processTag(Remainder remainder, final Tagging firstTag, final Text item)
            throws AbortTagException {
        // step 1: the untagged lead-in, matched without tag data
        final Remainder afterUntagged = matchEsvToTagging(remainder, null, item);
        firstTag.setNonTaggedText(afterUntagged.taggingText);

        if (isEmpty(afterUntagged.sourceText)) {
            return afterUntagged;
        }

        // once the untagged text is used up, the tagged text becomes the thing to match
        if (isEmpty(firstTag.getNonTaggedText())) {
            afterUntagged.taggingText = firstTag.getTaggedText();
        }

        // step 2: the tagged portion
        final Remainder afterTagged = matchEsvToTagging(afterUntagged, firstTag, item);
        firstTag.setTaggedText(afterTagged.taggingText);
        return afterTagged;
    }

    /**
     * Matches the ESV words in {@code remainder.sourceText} against the tagging words in
     * {@code remainder.taggingText}, case-insensitively and from the front, handing every matched word to
     * {@code tagWord}. Both texts are then replaced by their unmatched tails.
     *
     * @param remainder the matching state; it is updated in place and returned
     * @param tagData   the tag to apply to matched words, or {@code null} when matching untagged text
     * @param item      the DOM text node the ESV words come from
     * @return the same {@code remainder} instance, holding what is left of both texts
     * @throws AbortTagException if tagging a word fails
     */
    private Remainder matchEsvToTagging(final Remainder remainder, final Tagging tagData, final Text item)
            throws AbortTagException {
        final String tagWords = remainder.taggingText;
        final String sourceWords = remainder.sourceText;

        // nothing to match against
        if (!isNotBlank(tagWords)) {
            return remainder;
        }

        if (sourceWords.equalsIgnoreCase(tagWords)) {
            // the two texts are the same, so everything is consumed in one go
            LOGGER.debug("{}: Matched words: [{}]", this.currentVerse, sourceWords);
            tagWord(tagWords, tagData, item, remainder);

            remainder.sourceText = "";
            remainder.taggingText = "";
            return remainder;
        }

        // otherwise consume as many leading words as agree
        final String[] tagTokens = tagWords.split(" ");
        final String[] sourceTokens = sourceWords.split(" ");
        final int limit = Math.min(sourceTokens.length, tagTokens.length);

        int matched = 0;
        while (matched < limit && sourceTokens[matched].equalsIgnoreCase(tagTokens[matched])) {
            LOGGER.debug("{}: Partial matching of [{}]", this.currentVerse, sourceTokens[matched]);

            tagWord(tagTokens[matched], tagData, item, remainder);
            remainder.positionInSourceText++;

            if (remainder.advance > 0) {
                // a tag was applied: jump over the words it covered and stop
                matched += remainder.advance;
                break;
            }
            matched++;
        }

        // 'matched' is the index of the first word not consumed, on both sides
        String sourceTail = "";
        if (matched < sourceTokens.length) {
            sourceTail = join(sourceTokens, ' ', matched, sourceTokens.length);
        }
        String tagTail = "";
        if (matched < tagTokens.length) {
            tagTail = join(tagTokens, ' ', matched, tagTokens.length);
        }

        remainder.sourceText = sourceTail;
        remainder.taggingText = tagTail;
        return remainder;
    }

    /**
     * Wraps the DOM text that corresponds to a matched tag in a {@code w} element.
     * <p>
     * Nothing happens when {@code tagData} is {@code null} (untagged text). Tagging is only attempted
     * while the tag's text is still whole; a tag whose text has already been partly consumed is logged
     * and recorded in the run code. When the text node does not hold enough content for the tag, the
     * following sibling nodes are gathered and wrapped together.
     *
     * @param taggedText the word(s) that were matched; only used for logging
     * @param tagData    the tag to apply, or {@code null} for untagged text
     * @param item       the text node containing the start of the matched words; it is split in place
     * @param remainder  the matching state; {@code advance} is increased by the tag's word count when a
     *                   tag is applied
     * @throws AbortTagException if the tag still carries untagged text, the word cannot be located, or
     *                           the nodes spanning the tag cannot be wrapped
     */
    private void tagWord(final String taggedText, final Tagging tagData, final Text item,
                         final Remainder remainder) throws AbortTagException {

        if (tagData == null) {
            return;
        }

        if (tagData.getNonTaggedText().length() > 0) {
            LOGGER.error("{}:Tagging with still unmunched non-tagged data: [{}]", this.currentVerse,
                    tagData.getNonTaggedText());
            this.runCode = -1;
            throw new AbortTagException();
        }

        LOGGER.trace("Tagging [{}] with [{}] in tag [{}]", taggedText, tagData, item);

        if (!tagData.getOriginalTaggedText().equals(tagData.getTaggedText())) {
            LOGGER.warn("{}: Tagging data has been split: [{}], original was [{}]", this.currentVerse,
                    tagData.getTaggedText(), tagData.getOriginalTaggedText());
            this.runCode = -1;
            return;
        }

        LOGGER.debug("{}: Tagging entire tagData item: [{}] for words at position: [{}]",
                this.currentVerse, tagData.getTaggedText(), remainder.positionInSourceText);

        // work out the character offset in the text node at which the tagged words start
        final int finalPosition;
        if (remainder.positionInSourceText != 0) {
            final int position = findWordPosition(item.getTextContent(), remainder.positionInSourceText - 1);
            if (position == -1) {
                LOGGER.error("Couldn't find a matched word to tag.");
                this.runCode = -1;
                throw new AbortTagException();
            }
            finalPosition = position + 1;
        } else {
            finalPosition = fastForwardNonAlphaNumeric(item.getTextContent());
        }

        final Text wordInDoc = item.splitText(finalPosition);

        final String textContent = wordInDoc.getTextContent();
        final int taggedLength = tagData.getTaggedText().length();
        if (textContent.length() == taggedLength) {
            // take the whole tag
            createAndWrapWElement(tagData, item, remainder, wordInDoc);
        } else if (textContent.length() > taggedLength) {
            // need to split further. - we've preserved spaces but not other punctuation marks
            // so we need to figure out how many to fast forward...
            final int lengthInDomElement = getLengthInDomWord(textContent, tagData.getTaggedText());

            wordInDoc.splitText(lengthInDomElement);
            createAndWrapWElement(tagData, item, remainder, wordInDoc);
        } else {
            LOGGER.trace("{}: Cross-tag: Not enough content in Text item [{}] for tagged text of length [{}]",
                    this.currentVerse, textContent, taggedLength);

            // we need to look ahead and store up the nodes that we're going to wrap
            // so, wordInDoc is the last portion of text.
            final List<Node> matchingNodes = new ArrayList<Node>(8);
            matchingNodes.add(wordInDoc);
            grabMatchingNodes(matchingNodes, wordInDoc, tagData,
                    getLeftOverText(wordInDoc.getTextContent(), tagData.getTaggedText()));
            remainder.advance += tagData.getTaggedText().split(" ").length;
        }

        // TODO
        // B- What happens if what we're tagging contains some punctuation - we probably end up with
        // not quite the right word
        // C- We need some way of telling the calling method that we have tagged the whole tag, not just a
        // little bit of it. As a result, we may need to increment bits further
    }

    /**
     * Gets the length that the tagged text occupies in the DOM text, which may be longer than the tagged
     * text because the DOM text can carry extra punctuation and doubled spaces.
     * <p>
     * Starting from the length of the tagged text, one extra character is allowed for every space that
     * directly follows another space, and for every other non-alphanumeric character that is itself
     * followed by a non-alphanumeric character.
     *
     * @param textContent the dom text
     * @param taggedText  the tagged text
     * @return the number of leading characters of {@code textContent} covered by the tagged text; never
     *         more than the length of {@code textContent}
     */
    int getLengthInDomWord(final String textContent, final String taggedText) {
        final int baseLength = taggedText.length();
        final int domLength = textContent.length();

        // now we go through textContent and count the number of extra non alpha-numeric characters
        int nonAlpha = 0;
        char previousChar = 'a';
        // the window grows as extra characters are found, so it also has to be capped by the dom length
        for (int ii = 0; ii < baseLength + nonAlpha && ii < domLength; ii++) {
            final char c = textContent.charAt(ii);
            if (!Character.isLetterOrDigit(c)) {
                if (c == ' ') {
                    // cater for double-spaces, just in case
                    if (previousChar == ' ') {
                        nonAlpha++;
                    }
                } else if (ii + 1 < domLength && !Character.isLetterOrDigit(textContent.charAt(ii + 1))) {
                    // we only add if there is also a space marker/punctuation directly afterwards,
                    // given we're talking about English punctuation
                    nonAlpha++;
                }
            }
            previousChar = c;
        }

        return Math.min(baseLength + nonAlpha, domLength);
    }

    /**
     * Works out which part of the tagged text is not yet covered by the given DOM text. The letters and
     * digits of {@code textContent} are matched, case-insensitively and in order, against those of
     * {@code taggedText}; everything else is skipped on both sides.
     *
     * @param textContent the DOM text that has already been taken for the tag
     * @param taggedText  the complete tagged text
     * @return the tail of {@code taggedText} that follows the matched part, starting at its next letter
     *         or digit; empty if nothing is left
     * @throws AbortTagException if the two texts disagree, or the tagged text runs out first
     */
    private String getLeftOverText(final String textContent, final String taggedText)
            throws AbortTagException {
        final int taggedLength = taggedText.length();

        // jj is the position in taggedText of the next character still to be matched
        int jj = 0;
        for (int ii = 0; ii < textContent.length(); ii++) {
            final char domChar = textContent.charAt(ii);
            if (!Character.isLetterOrDigit(domChar)) {
                continue;
            }

            // skip separators in the tagged text, without running off its end
            while (jj < taggedLength && !Character.isLetterOrDigit(taggedText.charAt(jj))) {
                jj++;
            }

            // advance in sync with both strings
            if (jj < taggedLength
                    && Character.toLowerCase(taggedText.charAt(jj)) == Character.toLowerCase(domChar)) {
                jj++;
            } else {
                LOGGER.error("{}: Somehow we were unable to match the given texts [{}] and [{}]",
                        this.currentVerse, textContent, taggedText);
                this.error = true;
                this.runCode = -1;
                throw new AbortTagException();
            }
        }

        // drop any separators in front of what is left
        while (jj < taggedLength && !Character.isLetterOrDigit(taggedText.charAt(jj))) {
            jj++;
        }
        return taggedText.substring(jj);
    }

    /**
     * Collects the sibling nodes that, together with the nodes already in {@code matchingNodes}, span
     * the tagged text, and wraps them all in a single {@code w} element once the match is complete.
     * <p>
     * Ignoreable elements (see {@code isIgnoreable}) are added to the list as they are; any other element
     * aborts the tag. Text nodes are matched against {@code textLeftOver} through {@code getNodePart}.
     * <p>
     * NOTE(review): when a text node only matches partially, the next round is started with the same
     * {@code textLeftOver}, i.e. the text already matched is not removed from it - confirm whether this
     * is intended.
     *
     * @param matchingNodes the nodes gathered so far, in document order; extended in place
     * @param wordInDoc     the node whose next sibling is examined
     * @param tagData       the tag being applied
     * @param textLeftOver  the part of the tagged text still to be found
     * @throws AbortTagException if the nodes cannot be matched or cannot be wrapped together
     */
    private void grabMatchingNodes(final List<Node> matchingNodes, final Node wordInDoc,
                                   final Tagging tagData, final String textLeftOver) throws AbortTagException {
        final Node nextSibling = getNextSiblingToMatch(matchingNodes, wordInDoc, tagData, textLeftOver);

        // we have a next sibling
        if (nextSibling instanceof Element && isIgnoreable((Element) nextSibling)) {
            // then add to the list - i.e. add a note to the list - we tag the whole note with the lemma
            matchingNodes.add(nextSibling);
        } else if (nextSibling instanceof Element) {
            // non-ignoreable node
            // i.e. we need to traverse it - and hope for the best -i.e. that what we're going to try and
            // match will be a whole tag, rather than a bit.
            // otherwise we can't tag the element.
            if (nextSibling.getNodeName().equalsIgnoreCase("verse")) {
                LOGGER.error("[{}] We've gone too far - something didn't match [{}]", this.currentVerse,
                        tagData.getTaggedText());
                this.error = true;
                this.runCode = -1;
                throw new AbortTagException();
            }

            // traverse children nodes...
            // TODO
            LOGGER.warn("{}: Need to traverse children - scenario not yet catered for. Data was [{}]",
                    this.currentVerse,
                    tagData.getTaggedText());
            this.error = true;
            this.runCode = -1;
            throw new AbortTagException();
        } else if (nextSibling instanceof Text) {
            // we've got some text, so we may want to split it
            final boolean done = getNodePart((Text) nextSibling, textLeftOver);

            if (done) {
                // match is complete
                matchingNodes.add(nextSibling);

                // check all nodes have the same parent
                Node previousParent = null;
                for (final Node n : matchingNodes) {
                    if (previousParent == null) {
                        previousParent = n.getParentNode();
                    } else if (previousParent != n.getParentNode()) {
                        // TODO
                        LOGGER.warn(
                                "{}: Attempting to tag elements with different parents. One case has not yet been catered for"
                                        + ", which is if the child is the only element different parent, then we can roll up. Portion of text was [{}]",
                                this.currentVerse, tagData.getTaggedText()
                        );
                        this.runCode = -1;
                        this.error = true;
                        throw new AbortTagException();
                    }
                }

                // all nodes have the same parent
                final Element createWElement = createWElement(tagData, nextSibling.getOwnerDocument());

                // we insert it before the first element in our list
                final Node firstMatchedNode = matchingNodes.get(0);
                firstMatchedNode.getParentNode().insertBefore(createWElement, firstMatchedNode);
                for (final Node n : matchingNodes) {
                    createWElement.appendChild(n);
                }
                return;
            }
            // otherwise the whole node was taken but more text is needed: continue with the next node
        } else {
            this.error = true;
            // the message has three placeholders; the tagged text was previously missing from the arguments
            LOGGER.error("{}: Attemping to match [{}] but unknown node type found: [{}]", this.currentVerse,
                    tagData.getTaggedText(), nextSibling.getNodeType());
            this.runCode = -1;
            throw new AbortTagException();
        }

        // go round the loop again
        grabMatchingNodes(matchingNodes, nextSibling, tagData, textLeftOver);
    }

    /**
     * Finds the node that follows {@code wordInDoc}. Whenever the node at hand has no next sibling, the
     * collected nodes are rolled up into their parent (see {@code replaceNodesByParent}) and the search
     * carries on from the last node of the list.
     *
     * @param matchingNodes the nodes gathered so far; may be rewritten while rolling up
     * @param wordInDoc     the node to start from
     * @param tagData       the tag being applied
     * @param textLeftOver  the part of the tagged text still to be found
     * @return the next sibling to examine, never {@code null}
     * @throws AbortTagException if rolling up is not possible
     */
    private Node getNextSiblingToMatch(final List<Node> matchingNodes, final Node wordInDoc,
                                       final Tagging tagData, final String textLeftOver) throws AbortTagException {
        Node current = wordInDoc;
        Node following = current.getNextSibling();

        while (following == null) {
            LOGGER.trace(
                    "{}: Attemping to match [{}] to [{}] but no siblings available. Attempting to roll up.",
                    this.currentVerse, current.getTextContent(), tagData.getTaggedText());

            // swap the children for their parent, then look next to the last node in the list
            replaceNodesByParent(matchingNodes, current, tagData, textLeftOver);
            current = matchingNodes.get(matchingNodes.size() - 1);
            following = current.getNextSibling();
        }

        return following;
    }

    /**
     * Rolls the collected nodes up into their parent: every child of {@code refNode}'s parent is removed
     * from {@code matchingNodes} and the parent itself is appended instead. A child that was not in the
     * list is tolerated only if it is rollable (see {@code isRollableNode}).
     *
     * @param matchingNodes the nodes gathered so far; modified in place
     * @param refNode       a node whose parent is to replace its children in the list
     * @param tagData       the tag being applied; used for logging
     * @param remainingText the part of the tagged text still to be found; used for logging
     * @throws AbortTagException if the parent has a child that is neither collected nor rollable
     */
    private void replaceNodesByParent(final List<Node> matchingNodes, final Node refNode,
                                      final Tagging tagData, final String remainingText) throws AbortTagException {
        final Node parent = refNode.getParentNode();
        final NodeList children = parent.getChildNodes();

        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);

            // a child is fine if it was collected, or if it is a note/ignoreable element or nothing but
            // punctuation and whitespace - tagging the latter may look a bit funny but is harmless
            final boolean wasCollected = matchingNodes.remove(child);
            if (wasCollected || isRollableNode(child)) {
                continue;
            }

            LOGGER.warn(
                    "{}: Impossible tag. Not all nodes from parent are present. Tag data [{}]. Impossible portion is [{}]",
                    this.currentVerse, tagData.getTaggedText(), remainingText);
            this.error = true;
            this.runCode = -1;
            throw new AbortTagException();
        }

        // every child is accounted for, so the parent stands in for them
        matchingNodes.add(parent);
    }

    /**
     * Checks whether a node may be rolled up into its parent without having been matched: either an
     * ignoreable element, or a text node made up solely of punctuation and whitespace.
     *
     * @param candidate the candidate
     * @return true if the node is rollable
     */
    private boolean isRollableNode(final Node candidate) {
        return (candidate instanceof Element && isIgnoreable((Element) candidate))
                || (candidate instanceof Text
                && isPunctuationAndWhiteSpace(((Text) candidate).getTextContent()));
    }

    /**
     * Checks that a text holds no letter and no digit at all.
     *
     * @param textContent the text to inspect
     * @return true if the text is empty or contains only non-alphanumeric characters
     */
    private boolean isPunctuationAndWhiteSpace(final String textContent) {
        for (final char current : textContent.toCharArray()) {
            if (Character.isLetterOrDigit(current)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Matches the letters and digits of a sibling text node, case-insensitively and in order, against the
     * text still to be found, skipping every other character on both sides. If the text to be found is
     * used up before the end of the node, the node is split so that only its matching front part remains
     * in {@code nextSibling}.
     *
     * @param nextSibling  the next sibling, may get split during the operation
     * @param textLeftOver the text left over
     * @return true if we're done, false if the whole node matched but more text is still to be found
     * @throws AbortTagException if a letter or digit of the node differs from the expected one
     */
    private boolean getNodePart(final Text nextSibling, final String textLeftOver) throws AbortTagException {
        final String siblingText = nextSibling.getTextContent();
        final int leftOverLength = textLeftOver.length();

        int jj = 0;
        for (int ii = 0; ii < siblingText.length(); ii++) {
            final char siblingChar = siblingText.charAt(ii);
            // if jj has reached the end of textLeftOver - we've got a full match on the tag,
            // and since ii is still inside siblingText we need only part of the node
            if (jj >= leftOverLength) {
                nextSibling.splitText(ii);
                return true;
            }

            if (Character.isLetterOrDigit(siblingChar)) {

                // move jj up to next letter or digit, without running off the end of textLeftOver
                while (jj < leftOverLength && !Character.isLetterOrDigit(textLeftOver.charAt(jj))) {
                    jj++;
                }

                if (jj >= leftOverLength) {
                    // only separators were left to find, so the match ended just before this character
                    nextSibling.splitText(ii);
                    return true;
                }

                // attempt to match against text left over.
                if (Character.toLowerCase(siblingChar) == Character.toLowerCase(textLeftOver.charAt(jj))) {
                    // we've matched
                    jj++;
                } else {
                    // we're not a match, so abort
                    LOGGER.warn("{}: Failed to match [{}] against [{}] in cross-tag", this.currentVerse,
                            nextSibling, textLeftOver);
                    this.error = true;
                    this.runCode = -1;
                    throw new AbortTagException();
                }
            } // else move on to next character
        }

        // the whole sibling node has been matched; we are done only if textLeftOver is used up as well,
        // otherwise there is more matching to do on the following nodes
        return jj >= leftOverLength;
    }

    /**
     * Wraps a text node in a newly created {@code w} element placed where the text node was, and adds
     * the tag's word count to {@code remainder.advance}. A mismatch between the node's text and the
     * tagged text is logged and recorded in the run code, but the node is wrapped regardless.
     *
     * @param tagData   the tag to apply
     * @param item      the text node preceding {@code wordInDoc}; its parent receives the new element
     * @param remainder the matching state whose {@code advance} is increased
     * @param wordInDoc the text node to wrap
     */
    private void createAndWrapWElement(final Tagging tagData, final Text item, final Remainder remainder,
                                       final Text wordInDoc) {
        final String tagged = tagData.getTaggedText();
        final String domText = wordInDoc.getTextContent();

        // sanity check: the text about to be wrapped should be what the tag says
        final boolean sameText = equalsIngorePunctuationAndCase(tagged, domText);
        if (!sameText) {
            LOGGER.warn("{}: The text node content [{}] differs from the tagged data [{}]",
                    this.currentVerse, domText, tagged);
            this.runCode = -1;
        }

        // put the wrapper in front of the text node, then move the text node inside it
        final Element wrapper = createWElement(tagData, wordInDoc.getOwnerDocument());
        final Node parent = item.getParentNode();
        parent.insertBefore(wrapper, wordInDoc);
        wrapper.appendChild(wordInDoc);

        // we move by the number of words in the tag
        final int wordCount = tagged.split(" ").length;
        remainder.advance += wordCount;
    }

    /**
     * Compares the tagged text with the DOM text, ignoring case and tolerating extra punctuation in the
     * DOM text. Only letters and digits of the tagged text are compared; a punctuation character in the
     * DOM text that is directly followed by a space is treated as an extra character and skipped.
     * Characters of the DOM text beyond the tagged text are not looked at.
     *
     * @param taggedText the text from the tagging data
     * @param domText    the text found in the document
     * @return true if every letter and digit of {@code taggedText} lines up with {@code domText};
     *         false on a mismatch or if {@code domText} ends too early
     */
    boolean equalsIngorePunctuationAndCase(final String taggedText, final String domText) {
        // number of extra dom characters skipped so far
        int nonAlpha = 0;

        for (int ii = 0; ii < taggedText.length(); ii++) {
            final int domIndex = ii + nonAlpha;
            if (domIndex >= domText.length()) {
                // the dom text ran out before the tagged text did
                return false;
            }

            final char c1 = taggedText.charAt(ii);
            final char c2 = domText.charAt(domIndex);
            if (Character.isLetterOrDigit(c1) && Character.toLowerCase(c1) != Character.toLowerCase(c2)) {
                return false;
            }

            // for every other case we basically accept the letters, but we do an extra
            // ignore for punctuation in the source text if it is followed by a space; the look-ahead
            // has to start from the dom position, not from the position in the tagged text
            if (!Character.isLetterOrDigit(c2) && c2 != ' ' && domIndex + 1 < domText.length()
                    && domText.charAt(domIndex + 1) == ' ') {
                nonAlpha++;
            }
        }
        return true;
    }

    /**
     * Finds the index of the first letter or digit in the given string.
     *
     * @param str the string to scan
     * @return the index of the first character for which {@link Character#isLetterOrDigit(char)} is
     *         true, or {@code str.length()} if the string contains no such character (including the
     *         empty string)
     */
    private int fastForwardNonAlphaNumeric(final String str) {
        int start = 0;
        // bounded scan: never read past the end when the string holds no letter or digit at all
        while (start < str.length() && !Character.isLetterOrDigit(str.charAt(start))) {
            start++;
        }
        return start;
    }

    /**
     * Creates a {@code w} element carrying the lemma and, when present, the morphology of the tagging.
     * <p>
     * The lemma attribute is always set. The morph attribute is only attached when the tagging yields a
     * non-empty morphology value, so elements without grammar information do not gain an empty
     * attribute.
     *
     * @param tagData       the tagging providing the Strong numbers and grammar
     * @param ownerDocument the document used to create the element and its attributes
     * @return the new, not yet attached, {@code w} element
     */
    private Element createWElement(final Tagging tagData, final Document ownerDocument) {
        final Element w = ownerDocument.createElement("w");
        final Attr lemma = ownerDocument.createAttribute(OSISUtil.ATTRIBUTE_W_LEMMA);
        lemma.setNodeValue(createLemmaAttribute(tagData));
        w.setAttributeNode(lemma);

        // previously the morph attribute was built but never attached to the element
        final String morphValue = createMorphAttribute(tagData);
        if (morphValue.length() > 0) {
            final Attr morph = ownerDocument.createAttribute(OSISUtil.ATTRIBUTE_W_MORPH);
            morph.setNodeValue(morphValue);
            w.setAttributeNode(morph);
        }
        return w;
    }

    /**
     * Builds the value of the morph attribute from the grammar of a tagging.
     * <p>
     * The grammar is split on single spaces and every part is prefixed with {@code morph:}; the parts
     * are joined back together with single spaces. An empty grammar yields an empty string.
     *
     * @param tagData the tagging whose grammar is read
     * @return the attribute value, possibly empty
     */
    private String createMorphAttribute(final Tagging tagData) {
        final String grammar = tagData.getGrammar();
        final StringBuilder attribute = new StringBuilder(grammar.length() + 32);
        if (grammar.length() == 0) {
            return attribute.toString();
        }

        String separator = "";
        for (final String code : grammar.split(" ")) {
            attribute.append(separator);
            attribute.append("morph:");
            attribute.append(code);
            separator = " ";
        }
        return attribute.toString();
    }

    /**
     * Builds the value of the lemma attribute from the Strong numbers of a tagging.
     * <p>
     * The Strong numbers are split on single spaces and every part is prefixed with {@code strong:};
     * the parts are joined back together with single spaces.
     *
     * @param tagData the tagging whose Strong numbers are read
     * @return the attribute value
     */
    private String createLemmaAttribute(final Tagging tagData) {
        final String strongs = tagData.getStrongs();
        final StringBuilder attribute = new StringBuilder(strongs.length() + 32);

        String separator = "";
        for (final String number : strongs.split(" ")) {
            attribute.append(separator);
            attribute.append("strong:");
            attribute.append(number);
            separator = " ";
        }
        return attribute.toString();
    }

    /**
     * Locates the position in {@code str} at which the word with index {@code n} ends, treating any run
     * of non-alphanumeric characters as a single word separator.
     * <p>
     * Leading non-alphanumeric characters are skipped first. Each transition from a letter/digit to a
     * separator uses up one of the {@code n} allowed separators; once one more than that has been seen,
     * the rest of that separator run is consumed and the index of its last character is returned.
     *
     * @param str the text to scan
     * @param n   the number of separator runs that may be crossed before stopping
     * @return the index of the last character of the terminating separator run, or {@code str.length()}
     *         if the text ends first
     */
    int findWordPosition(final String str, final int n) {
        final int length = str.length();
        int pos = fastForwardNonAlphaNumeric(str);
        int separatorsLeft = n;
        boolean previousWasWordChar = true;

        while (pos < length) {
            final boolean wordChar = Character.isLetterOrDigit(str.charAt(pos));
            // only the first character of a separator run counts as a word boundary
            if (!wordChar && previousWasWordChar) {
                separatorsLeft--;
            }
            previousWasWordChar = wordChar;

            if (separatorsLeft < 0) {
                // swallow whatever is left of this separator run
                while (pos + 1 < length && !Character.isLetterOrDigit(str.charAt(pos + 1))) {
                    pos++;
                }
                return pos;
            }
            pos++;
        }

        return pos;
    }

    /**
     * Trace-logs the {@code osisID} attribute of the given element. The tagging index is not used.
     *
     * @param esv           the element whose {@code osisID} attribute is logged
     * @param indexTagging  the tagging index (currently unused)
     */
    private void processVerse(final Element esv, final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) {
        LOGGER.trace("Processing [{}]", esv.getAttribute("osisID"));
    }

    /**
     * Parses the ESV text into a DOM document and logs how long the parsing took.
     *
     * @return the parsed document
     * @throws ParserConfigurationException if no document builder can be created
     * @throws SAXException                 if the text cannot be parsed
     * @throws IOException                  if the text cannot be read
     */
    private Document readESVDoc() throws ParserConfigurationException, SAXException, IOException {
        final long startedAt = System.currentTimeMillis();

        final DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        final Document parsed = builder.parse(this.esvText);

        final long elapsed = System.currentTimeMillis() - startedAt;
        LOGGER.info("Took [{}]ms to read ESV into Document", elapsed);
        return parsed;
    }

    /**
     * Normalises every raw tagging in place: punctuation is stripped, the Strong numbers are extracted,
     * the reference is cleaned, and finally the (already cleaned) tagged text is recorded as the
     * original tagged text.
     *
     * @param rawTagging the taggings to clean up
     * @throws Exception if cleaning a reference fails
     */
    private void cleanupTagging(final List<Tagging> rawTagging) throws Exception {
        for (final Tagging tagging : rawTagging) {
            // order matters: the reference clean-up prefixes the Strong numbers produced by splitStrong
            removePunctuation(tagging);
            splitStrong(tagging);
            cleanRef(tagging);

            final String cleanedText = tagging.getTaggedText();
            tagging.setOriginalTaggedText(cleanedText);
        }
    }

    /**
     * Extracts the Strong numbers from the raw Strong text of a tagging.
     * <p>
     * Every match of {@code STRONGS_SPLITTING} contributes its first capture group; the groups are
     * joined with single spaces and stored as the tagging's Strong numbers. Without raw Strong text,
     * both the Strong numbers and the grammar are set to the empty string. The grammar is never left
     * {@code null}.
     *
     * @param t the tagging to update
     */
    void splitStrong(final Tagging t) {
        final String rawStrongs = t.getRawStrongs();
        if (rawStrongs == null) {
            t.setStrongs("");
            t.setGrammar("");
            return;
        }

        final StringBuilder numbers = new StringBuilder();
        final Matcher matcher = STRONGS_SPLITTING.matcher(rawStrongs);
        while (matcher.find()) {
            // nothing to extract if the pattern declares no capture group
            if (matcher.groupCount() <= 0) {
                continue;
            }
            if (numbers.length() != 0) {
                numbers.append(' ');
            }
            numbers.append(matcher.group(1));
        }
        t.setStrongs(numbers.toString());

        if (t.getGrammar() == null) {
            t.setGrammar("");
        }
    }

    /**
     * Replaces both the non-tagged and the tagged text of a tagging with their punctuation-free forms.
     *
     * @param t the tagging to update
     */
    private void removePunctuation(final Tagging t) {
        final String cleanedNonTagged = replacePunctuation(t.getNonTaggedText());
        t.setNonTaggedText(cleanedNonTagged);

        final String cleanedTagged = replacePunctuation(t.getTaggedText());
        t.setTaggedText(cleanedTagged);
    }

    /**
     * Replaces every match of {@code PUNCTUATION} with a space, collapses runs of whitespace into a
     * single space and trims the result.
     *
     * @param text the text to clean, may be {@code null}
     * @return the cleaned text, or the empty string for {@code null} input
     */
    private String replacePunctuation(final String text) {
        if (text == null) {
            return "";
        }

        final String spaced = PUNCTUATION.matcher(text).replaceAll(" ");
        return spaced == null ? "" : spaced.replaceAll("\\s\\s+", " ").trim();
    }

    /**
     * Normalises the reference of a tagging and prefixes its Strong numbers with the testament letter.
     * <p>
     * Everything matched by {@code REF_CLEAN} is removed from the reference and the result is resolved
     * through {@code ESV.getKey}. On success the tagging's reference becomes the OSIS id of the key and
     * the Strong numbers are prefixed with {@code H} when the (first) verse is in the Old Testament and
     * with {@code G} otherwise. Any reference that cannot be handled is logged and flagged on
     * {@code runCode}; the tagging is then left without a testament prefix.
     *
     * @param t the tagging to update
     * @throws NoSuchKeyException if the key lookup fails for a reason other than an unknown verse
     */
    private void cleanRef(final Tagging t) throws NoSuchKeyException {
        final String rawRef = t.getRef();
        // a missing reference is treated like a blank one rather than failing inside the matcher
        final String reference = rawRef == null ? "" : REF_CLEAN.matcher(rawRef).replaceAll("").trim();
        if (isBlank(reference)) {
            LOGGER.warn("Unable to parse reference [{}]", rawRef);
            this.runCode = -1;
            return;
        }

        try {
            final Key key = ESV.getKey(reference);
            t.setRef(key.getOsisID());

            Verse v = null;
            if (key instanceof Passage) {
                v = (Verse) key.get(0);
            } else if (key instanceof Verse) {
                v = (Verse) key;
            }

            if (v == null) {
                // neither a passage nor a verse (or a passage without a first verse): we cannot
                // determine the testament, so flag it instead of failing on a null verse
                LOGGER.warn("Unable to recognise [{}] as a reference", reference);
                this.runCode = -1;
                return;
            }

            final int ordinal = v.getOrdinal();
            if (v.getVersification().getTestament(ordinal) == Testament.OLD) {
                prefixStrong(t, 'H');
            } else {
                prefixStrong(t, 'G');
            }
        } catch (NoSuchVerseException ex) {
            // deal with 3John: "3John.1.15" is accepted as-is even though the lookup rejects it
            // NOTE(review): presumably the versification in use has no verse 15 there - confirm
            if ("3John.1.15".equals(reference)) {
                t.setRef(reference);
                prefixStrong(t, 'G');
            } else {
                LOGGER.warn("Unable to recognise [{}] as a reference", reference);
                this.runCode = -1;
            }
        }

    }

    /**
     * Prefixes every space-separated Strong number of a tagging with the given letter and stores the
     * space-joined result back on the tagging.
     * <p>
     * Note that an empty Strong string splits into a single empty part, so the result is then just the
     * prefix letter.
     *
     * @param t            the tagging to update
     * @param prefixLetter the letter to put in front of each Strong number
     */
    private void prefixStrong(final Tagging t, final char prefixLetter) {
        final String strongs = t.getStrongs();
        final String[] numbers = strongs.split(" ");
        final StringBuilder prefixed = new StringBuilder(strongs.length() + 16);

        for (int i = 0; i < numbers.length; i++) {
            if (i > 0) {
                prefixed.append(' ');
            }
            prefixed.append(prefixLetter).append(numbers[i]);
        }
        t.setStrongs(prefixed.toString());
    }

    /**
     * Writes every reference of the index, followed by each of its taggings, to the trace log.
     * Does nothing unless trace logging is enabled.
     *
     * @param indexTagging the taggings indexed by reference
     */
    private void traceLog(final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) {
        if (!LOGGER.isTraceEnabled()) {
            return;
        }

        for (final Entry<String, Deque<Tagging>> byReference : indexTagging.entrySet()) {
            LOGGER.trace("Contains ref [{}]", byReference.getKey());
            for (final Tagging tagging : byReference.getValue()) {
                LOGGER.trace("\tTagging is: [{}]", tagging);
            }
        }
    }

    /**
     * Groups the given taggings by their reference.
     *
     * @param rawTagging the taggings to index
     * @return a multi-map from reference to the taggings carrying that reference, backed by
     *         {@link LinkedList} collections
     */
    private MultiMap<String, Tagging, Deque<Tagging>> indexTagging(final List<Tagging> rawTagging) {
        // each tagging is filed under its reference
        final MultiMapIndexer<String, Tagging> byReference = new MultiMapIndexer<String, Tagging>() {

            @Override
            public String getKey(final Tagging t) {
                return t.getRef();
            }
        };

        final MultiMap<String, Tagging, Deque<Tagging>> index = new MultiMap<String, Tagging, Deque<Tagging>>(
                LinkedList.class);
        index.putCollection(rawTagging, byReference);
        return index;
    }

    /**
     * Reads the tab-separated tagging file and turns every line into a {@code Tagging}.
     * <p>
     * The columns are, in order: reference, non-tagged text, tagged text and raw Strong numbers.
     * Columns missing from a line are simply left unset on the resulting tagging.
     *
     * @return one tagging per line of the file, in file order
     * @throws IOException if the file cannot be read
     */
    private List<Tagging> readTagging() throws IOException {

        LOGGER.info("Reading in CSV file...");
        final List<Tagging> parsed = new ArrayList<Tagging>(32000);
        final List<String> rows = FileUtils.readLines(this.tagging);
        for (final String row : rows) {
            final String[] columns = row.split("\\t");
            final int columnCount = columns.length;

            final Tagging entry = new Tagging();
            if (columnCount > 0) {
                entry.setRef(columns[0]);
            }
            if (columnCount > 1) {
                entry.setNonTaggedText(columns[1]);
            }
            if (columnCount > 2) {
                entry.setTaggedText(columns[2]);
            }
            if (columnCount > 3) {
                entry.setRawStrongs(columns[3]);
            }
            parsed.add(entry);
        }
        LOGGER.info("Finished parsing CSV File...");
        return parsed;
    }

    /**
     * Mutable holder for the state of a partially processed match: the source text, the tagging text,
     * the current position within the source text and the number of words to advance by.
     */
    class Remainder {
        String sourceText;
        String taggingText;
        int positionInSourceText;
        int advance;

        /**
         * Creates a remainder positioned at the start of the source text.
         *
         * @param sourceText  the source text
         * @param taggingText the tagging text
         */
        public Remainder(final String sourceText, final String taggingText) {
            this(sourceText, taggingText, 0);
        }

        /**
         * Creates a remainder at the given position of the source text.
         *
         * @param sourceText           the source text
         * @param taggingText          the tagging text
         * @param positionInSourceText the position within the source text
         */
        public Remainder(final String sourceText, final String taggingText, final int positionInSourceText) {
            this.positionInSourceText = positionInSourceText;
            this.taggingText = taggingText;
            this.sourceText = sourceText;
        }

        /**
         * Copies the texts and the position into a new instance; {@code advance} is not carried over
         * and starts at zero in the copy.
         *
         * @return the copy
         */
        @Override
        protected Remainder clone() throws CloneNotSupportedException {
            final Remainder copy = new Remainder(this.sourceText, this.taggingText);
            copy.positionInSourceText = this.positionInSourceText;
            return copy;
        }
    }
}