/*******************************************************************************
* Copyright (c) 2012, Directors of the Tyndale STEP Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package com.tyndalehouse.step.tools.esv;
import com.tyndalehouse.step.tools.MultiMap;
import com.tyndalehouse.step.tools.MultiMapIndexer;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.Books;
import org.crosswire.jsword.book.OSISUtil;
import org.crosswire.jsword.passage.*;
import org.crosswire.jsword.versification.Testament;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.tyndalehouse.step.core.utils.StringUtils.*;
import static org.apache.commons.lang3.StringUtils.join;
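/**
* Command-line tool that merges Strong's-number tagging into an ESV XML text: it reads a tab-separated
* tagging file, matches its words against the text nodes of the ESV document verse by verse, wraps each
* matched phrase in a {@code <w>} element carrying a lemma attribute, and writes the result to the output file.
*/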
@SuppressWarnings("all")
public class EsvXmlEnhancer {
/** Logger used for progress messages, warnings and per-verse diagnostics. */
private static final Logger LOGGER = LoggerFactory.getLogger(EsvXmlEnhancer.class);
/** Matches every run of characters that is not an ASCII letter, digit, colon or space; used to clean raw references. */
private static final Pattern REF_CLEAN = Pattern.compile("[^a-zA-Z0-9: ]+");
/** Matches runs of punctuation; each run is replaced by a space before ESV and tagging words are compared. */
static final Pattern PUNCTUATION = Pattern.compile("[\\-—,.;*:'\\[\\]!\"`?’‘()-]+");
/** Captures the digits of each angle-bracketed Strong's marker, ignoring one optional trailing lower-case letter. */
private static final Pattern STRONGS_SPLITTING = Pattern.compile("<(\\d+)[a-z]?>");
/** The installed book named "ESV", used to resolve references. NOTE(review): presumably null if that module is not installed - confirm. */
private static final Book ESV = Books.installed().getBook("ESV");
/** Tab-separated tagging input file. */
private final File tagging;
/** ESV XML input file. */
private final File esvText;
/** osisID of the verse being traversed, or the chapter osisID suffixed with ".0" right after a chapter element. */
private String currentVerse;
/** Tagging entries looked up for the current verse; checked for null before use. */
private Deque<Tagging> verseTagging = null;
/** Set when matching fails; reset on every verse and chapter element, and while set text nodes are skipped. */
private boolean error = false;
/** Destination file for the enhanced XML. */
private File outputPath;
/** Book name of the most recently logged chapter, so that each book is logged only once. */
private String lastBook = "";
/** Value returned by the run and used as process exit code: stays 0 unless a problem sets it to -1. */
private int runCode;
/**
 * Creates an enhancer for one tagging file / ESV text pair.
 *
 * @param tagging    the tab-separated tagging file to read
 * @param esvText    the ESV XML document to enhance
 * @param outputPath the file the enhanced document is written to
 */
public EsvXmlEnhancer(final File tagging, final File esvText, File outputPath) {
    this.outputPath = outputPath;
    this.esvText = esvText;
    this.tagging = tagging;
}
/**
 * Command-line entry point. Runs the enhancer and exits the JVM with its run code
 * (0 on a clean run, -1 when any problem was recorded).
 *
 * @param args {@code args[0]} tagging file, {@code args[1]} ESV XML file, {@code args[2]} output file
 * @throws IllegalArgumentException if fewer than three arguments are supplied
 * @throws Exception                if reading, processing or writing fails
 */
public static void main(final String[] args) throws Exception {
    if (args == null || args.length < 3) {
        throw new IllegalArgumentException(
                "Usage: EsvXmlEnhancer <tagging file> <ESV XML file> <output file>");
    }
    final File tagging = new File(args[0]);
    final File esvText = new File(args[1]);
    final File outputPath = new File(args[2]);
    if (!outputPath.exists()) {
        // getParent() is null for a bare file name, so resolve against the working directory first
        final File parent = outputPath.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            LOGGER.warn("Unable to create output directory [{}]", parent);
        }
    }
    final int ret = new EsvXmlEnhancer(tagging, esvText, outputPath).go();
    System.exit(ret);
}
/**
 * Runs the whole pipeline: parse the tagging, apply it to the ESV text, report completion.
 *
 * @return the run code accumulated while processing
 * @throws Exception if any stage fails
 */
private int go() throws Exception {
    final MultiMap<String, Tagging, Deque<Tagging>> taggingByVerse = parseTagging();
    applyToText(taggingByVerse);
    LOGGER.info("Done!");
    return this.runCode;
}
/**
 * Reads the tagging file, cleans every entry and indexes the entries by reference.
 *
 * @return the tagging entries keyed by reference
 * @throws Exception if the file cannot be read or a reference cannot be resolved
 */
private MultiMap<String, Tagging, Deque<Tagging>> parseTagging() throws Exception {
    final long startedAt = System.currentTimeMillis();
    final List<Tagging> entries = readTagging();

    LOGGER.info("Cleaning up tagging");
    cleanupTagging(entries);

    LOGGER.info("Indexing tagging");
    final MultiMap<String, Tagging, Deque<Tagging>> byReference = indexTagging(entries);

    final long elapsed = System.currentTimeMillis() - startedAt;
    LOGGER.info("Init phase took [{}]ms", elapsed);
    traceLog(byReference);
    return byReference;
}
/**
 * Loads the ESV document, walks it applying the tagging, and writes the document out.
 * The document is written even when the traversal is aborted; in that case the run code is set to -1.
 *
 * @param indexTagging the tagging entries keyed by reference
 * @throws Exception if reading, traversing or writing fails
 */
private void applyToText(final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) throws Exception {
    final Document document = readESVDoc();
    final Element root = document.getDocumentElement();
    try {
        traverse(root, indexTagging);
    } catch (final AbortTagException aborted) {
        LOGGER.warn("Aborted...");
        this.runCode = -1;
    }
    // persist whatever was tagged so far
    writeDoc(document);
}
/**
 * Serialises the document to the configured output file using a default transformer.
 *
 * @param esv the document to write
 * @throws Exception if the transformer cannot be created or the write fails
 */
private void writeDoc(final Document esv) throws Exception {
    final Transformer identity = TransformerFactory.newInstance().newTransformer();
    identity.transform(new DOMSource(esv), new StreamResult(this.outputPath));
}
/**
 * Walks the given element depth-first and matches every text node against the tagging
 * of the verse that is current at that point.
 * <p>
 * A {@code verse} element switches the current verse and is not descended into. A {@code chapter}
 * element switches to the pseudo-verse {@code <chapter osisID>.0} and is then descended into.
 * Elements rejected by {@link #isIgnoreable(Element)} are skipped entirely.
 *
 * @param esv          the element to process
 * @param indexTagging the tagging entries keyed by reference
 * @throws Exception if processing fails in a way that is not handled per text node
 */
private void traverse(final Element esv, final MultiMap<String, Tagging, Deque<Tagging>> indexTagging)
        throws Exception {
    final String nodeName = esv.getNodeName();
    LOGGER.trace("Tag [{}]", nodeName);

    if ("verse".equals(nodeName)) {
        // a verse marker only switches the tagging queue; its own children are not visited
        this.currentVerse = esv.getAttribute("osisID");
        this.error = false;
        this.verseTagging = indexTagging.get(this.currentVerse);
        processVerse(esv, indexTagging);
        return;
    }

    if ("chapter".equals(nodeName)) {
        final String osisID = esv.getAttribute("osisID");
        // tolerate an osisID without a '.' instead of failing in substring()
        final int firstDot = osisID.indexOf('.');
        final String bookName = firstDot < 0 ? osisID : osisID.substring(0, firstDot);
        this.currentVerse = osisID + ".0";
        if (!this.lastBook.equalsIgnoreCase(bookName)) {
            LOGGER.info("Processing chapter [{}]", this.currentVerse);
            this.lastBook = bookName;
        }
        this.error = false;
        this.verseTagging = indexTagging.get(this.currentVerse);
        processVerse(esv, indexTagging);
        // no return, since we want to process the children
    }

    final NodeList childNodes = esv.getChildNodes();
    for (int i = 0; i < childNodes.getLength(); i++) {
        final Node item = childNodes.item(i);
        if (item instanceof Text) {
            if (StringUtils.isNotBlank(this.currentVerse) && this.verseTagging != null && !this.error) {
                try {
                    final int advanceTokens = processVerseContent((Text) item, this.verseTagging);
                    if (advanceTokens != 0) {
                        // something was tagged in this text node: step over the following child node
                        LOGGER.debug("Advancing by [{}] token(s)", 1);
                        i++;
                    }
                } catch (final AbortTagException ignored) {
                    // already logged; the error flag suppresses the rest of this verse
                }
            }
        } else if (item instanceof Element) {
            final Element traversableElement = (Element) item;
            if (!isIgnoreable(traversableElement)) {
                traverse(traversableElement, indexTagging);
            }
        }
    }
}
/**
 * Tells whether an element must be left out of the traversal: every {@code note}, and every
 * {@code title} whose {@code canonical} attribute is not "true" (case-insensitive).
 *
 * @param traversableElement the element to test
 * @return true if the element should be skipped
 */
private boolean isIgnoreable(final Element traversableElement) {
    final String tagName = traversableElement.getNodeName();
    if ("title".equals(tagName)) {
        // only canonical titles carry text that is tagged
        final String canonical = traversableElement.getAttribute("canonical");
        return !canonical.equalsIgnoreCase("true");
    }
    return "note".equals(tagName);
}
/**
* Matches the words of one DOM text node against the queue of tagging entries of the current verse,
* consuming entries from the head of the queue as they are fully matched.
*
* @param item the text node to process
* @param verseTagging the tagging entries still to be consumed for the current verse
* @return the {@code advance} counter of the last processed remainder; the caller skips the next
* child node when it is non-zero
* @throws Exception an {@link AbortTagException} when text is left without tagging or no progress is made
*/
private int processVerseContent(final Text item, final Deque<Tagging> verseTagging) throws Exception {
final String textContent = item.getTextContent();
LOGGER.trace("{}: [{}]", this.currentVerse, textContent);
// normalise punctuation and whitespace so that ESV words can be compared with tagging words
final String wordsFromESV = replacePunctuation(textContent);
Tagging firstTag = verseTagging.peekFirst();
if (firstTag == null) {
// the queue is exhausted: fine for blank text, an error if real words are left
if (isNotBlank(wordsFromESV)) {
LOGGER.warn("{}: No tagging for [{}]", this.currentVerse, wordsFromESV);
this.error = true;
this.runCode = -1;
throw new AbortTagException();
}
return 0;
}
Remainder initialRemainder = new Remainder(wordsFromESV, firstTag.getNonTaggedText());
while (true) {
// work on a copy so that progress can be detected by comparing with the state before the pass
final Remainder remainderAfterProcessingTag = processTag(initialRemainder.clone(), firstTag, item);
if (isEmpty(remainderAfterProcessingTag.sourceText) || remainderAfterProcessingTag.advance > 0) {
// remove the tag if empty
if (isEmpty(firstTag.getNonTaggedText()) && isEmpty(firstTag.getTaggedText())) {
verseTagging.removeFirst();
}
// all text processed, so return
return remainderAfterProcessingTag.advance;
}
// if both parts of the tag are empty by the end, then we can move on to the next tag
if (isEmpty(firstTag.getNonTaggedText()) && isEmpty(firstTag.getTaggedText())) {
verseTagging.removeFirst();
firstTag = verseTagging.peekFirst();
if (firstTag == null) {
// ESV text is left over but no tagging remains: recorded in the run code, no exception raised
LOGGER.warn("{}: Arrived at end of tagging data. Remainder of ESV text is: [{}]",
this.currentVerse, remainderAfterProcessingTag.sourceText);
this.runCode = -1;
return remainderAfterProcessingTag.advance;
}
remainderAfterProcessingTag.taggingText = firstTag.getNonTaggedText();
}
// check we have actually processed something
if (initialRemainder.sourceText.equalsIgnoreCase(remainderAfterProcessingTag.sourceText)
&& initialRemainder.taggingText.equalsIgnoreCase(remainderAfterProcessingTag.taggingText)) {
LOGGER.warn("{}: No processing was made on ESV text between [{}] and [{}]",
this.currentVerse, remainderAfterProcessingTag.sourceText,
remainderAfterProcessingTag.taggingText);
this.error = true;
this.runCode = -1;
// abort the tag processing
throw new AbortTagException();
}
// set up to go round the loop again
initialRemainder = remainderAfterProcessingTag;
}
}
/**
* Runs one matching pass for a single tagging entry: first its non-tagged lead-in text, then its
* tagged text. The entry's non-tagged and tagged text are overwritten with whatever is left unmatched.
*
* @param remainder the ESV words and tagging words still to be matched
* @param firstTag the tagging entry at the head of the verse queue
* @param item the DOM text node the ESV words come from
* @return the remainder after both passes, or after the first pass if no ESV words are left
* @throws AbortTagException if tagging a word fails
*/
private Remainder processTag(Remainder remainder, final Tagging firstTag, final Text item)
throws AbortTagException {
// first pass: the untagged lead-in; null tag data means nothing is wrapped in the DOM
remainder = matchEsvToTagging(remainder, null, item);
firstTag.setNonTaggedText(remainder.taggingText);
if (isEmpty(remainder.sourceText)) {
return remainder;
}
// now check if we parsed all the non-tagged text. if so, we can do the same for the tagging part
if (isEmpty(firstTag.getNonTaggedText())) {
remainder.taggingText = firstTag.getTaggedText();
}
// second pass: this time matched words are handed to tagWord together with the entry
remainder = matchEsvToTagging(remainder, firstTag, item);
firstTag.setTaggedText(remainder.taggingText);
return remainder;
}
/**
* Matches the leading words of {@code remainder.sourceText} against the leading words of
* {@code remainder.taggingText}, ignoring case, and hands every matched word to
* {@link #tagWord(String, Tagging, Text, Remainder)}.
*
* @param remainder the ESV words and tagging words still to be matched; updated in place
* @param tagData the tagging entry to tag with, or null when the words only need to be consumed
* @param item the DOM text node the ESV words come from
* @return the same remainder, holding the unmatched ESV words and unmatched tagging words
* @throws AbortTagException if tagging a word fails
*/
private Remainder matchEsvToTagging(final Remainder remainder, final Tagging tagData, final Text item)
throws AbortTagException {
final String taggedText = remainder.taggingText;
final String wordsFromESV = remainder.sourceText;
// blank tagging text means there is nothing to match: the remainder is returned untouched
if (isNotBlank(taggedText)) {
// no tag for these words - but need to check they match
if (wordsFromESV.equalsIgnoreCase(taggedText)) {
// full match, so simply set the tagging to nothing
LOGGER.debug("{}: Matched words: [{}]", this.currentVerse, wordsFromESV);
tagWord(taggedText, tagData, item, remainder);
// no need to increment position in source text since there is nothing left
remainder.sourceText = "";
remainder.taggingText = "";
return remainder;
} else {
// partial match
final String[] taggedWords = taggedText.split(" ");
final String[] esvWords = wordsFromESV.split(" ");
// how many words can we match
int ii = 0;
for (; ii < esvWords.length && ii < taggedWords.length; ii++) {
if (esvWords[ii].equalsIgnoreCase(taggedWords[ii])) {
LOGGER.debug("{}: Partial matching of [{}]", this.currentVerse, esvWords[ii]);
// now we can tag a word
tagWord(taggedWords[ii], tagData, item, remainder);
remainder.positionInSourceText++;
// if we've tagged a word, move ii forward to reflect below correctly and break
if (remainder.advance > 0) {
ii += remainder.advance;
break;
}
} else {
break;
}
}
// now look at value of ii, which is equal to last non-match
// if we didn't get to the end of the tagged words
final String esvLeftOver = ii < esvWords.length ? join(esvWords, ' ', ii, esvWords.length)
: "";
final String tagLeftOver = ii < taggedWords.length ? join(taggedWords, ' ', ii,
taggedWords.length) : "";
remainder.sourceText = esvLeftOver;
remainder.taggingText = tagLeftOver;
return remainder;
}
}
return remainder;
}
/**
* Wraps the tagged text of {@code tagData} in a {@code w} element inside the DOM, splitting
* {@code item} as needed. Does nothing when {@code tagData} is null. Only a tagging entry whose
* tagged text is still equal to its original tagged text is wrapped; otherwise a warning is logged
* and the run code is set to -1.
*
* @param taggedText the word(s) just matched (only used for trace logging)
* @param tagData the tagging entry to apply, or null for untagged words
* @param item the DOM text node containing the matched words
* @param remainder matching state; {@code positionInSourceText} locates the word inside {@code item}
* and {@code advance} is increased by the number of words in the tag
* @throws AbortTagException if the entry still has non-tagged text, or the nodes cannot be wrapped
*/
private void tagWord(final String taggedText, final Tagging tagData, final Text item,
final Remainder remainder) throws AbortTagException {
if (tagData == null) {
return;
}
// the lead-in text must have been fully consumed before the tagged part may be wrapped
if (tagData.getNonTaggedText().length() > 0) {
LOGGER.error("{}:Tagging with still unmunched non-tagged data: [{}]", this.currentVerse,
tagData.getNonTaggedText());
this.runCode = -1;
throw new AbortTagException();
}
LOGGER.trace("Tagging [{}] with [{}] in tag [{}]", taggedText, tagData, item);
if (tagData.getOriginalTaggedText().equals(tagData.getTaggedText())) {
LOGGER.debug("{}: Tagging entire tagData item: [{}] for words at position: [{}]",
this.currentVerse, tagData.getTaggedText(), remainder.positionInSourceText);
int finalPosition = 0;
if (remainder.positionInSourceText != 0) {
// NOTE(review): the ternary below is redundant inside this branch (the position is known to be non-zero)
final int position = remainder.positionInSourceText == 0 ? 0 : findWordPosition(
item.getTextContent(), remainder.positionInSourceText - 1);
// NOTE(review): findWordPosition as written in this file returns an index between 0 and the text length, never -1
if (position == -1) {
LOGGER.error("Couldn't find a matched word to tag.");
this.runCode = -1;
throw new AbortTagException();
}
finalPosition = position + 1;
} else {
// first word of the node: skip any leading punctuation or whitespace
finalPosition = fastForwardNonAlphaNumeric(item.getTextContent());
}
// item keeps the text before finalPosition; wordInDoc starts at the word to tag
final Text wordInDoc = item.splitText(finalPosition);
final String textContent = wordInDoc.getTextContent();
if (textContent.length() == tagData.getTaggedText().length()) {
// take the whole tag
createAndWrapWElement(tagData, item, remainder, wordInDoc);
return;
} else if (textContent.length() > tagData.getTaggedText().length()) {
// need to split further. - we've preserved spaces but not other punctuation marks
// so we need to figure out how many to fast forward...
int lengthInDomElement = getLengthInDomWord(textContent, tagData.getTaggedText());
wordInDoc.splitText(lengthInDomElement);
createAndWrapWElement(tagData, item, remainder, wordInDoc);
return;
} else {
// NOTE(review): the message has one placeholder but three arguments, so the last two are not rendered
LOGGER.trace("{}: Cross-tag: Not enough content in Text item", this.currentVerse,
textContent, tagData.getTaggedText().length());
// we need to look ahead and store up the nodes that we're going to wrap
// so, wordInDoc is the last portion of text.
// NOTE(review): n is never read
final Node n = wordInDoc.getNextSibling();
final ArrayList<Node> matchingNodes = new ArrayList<Node>(8);
matchingNodes.add(wordInDoc);
grabMatchingNodes(matchingNodes, wordInDoc, tagData,
getLeftOverText(wordInDoc.getTextContent(), tagData.getTaggedText()));
remainder.advance += tagData.getTaggedText().split(" ").length;
// we're looking for text content
// TODO remove after testing
// this.error = true;
// this.runCode = -1;
// throw new AbortTagException();
}
// TODO TODO TODO
// B- What happens if what we're tagging contains some punctuation - we probably end up with
// not quite the right word
// C- We need some way of telling the calling method that we have tagged the whole tag, not just a
// little bit of it. As a result, we may need to increment bits further
} else {
// the entry was partially consumed earlier, so its tagged text no longer matches the original
LOGGER.warn("{}: Tagging data has been split: [{}], original was [{}]", this.currentVerse,
tagData.getTaggedText(), tagData.getOriginalTaggedText());
this.runCode = -1;
}
}
/**
 * Computes how many characters of the DOM text correspond to the tagged text. The tagged text has
 * had its punctuation replaced by single spaces, so the DOM text may be longer: every doubled space
 * and every punctuation character that is directly followed by another non-alphanumeric character
 * adds one to the length.
 * <p>
 * The result never exceeds {@code textContent.length()}, so it is always a valid split offset.
 *
 * @param textContent the dom text
 * @param taggedText  the tagged text
 * @return the number of leading characters of {@code textContent} covered by {@code taggedText}
 */
int getLengthInDomWord(final String textContent, final String taggedText) {
    final int baseLength = taggedText.length();
    final int domLength = textContent.length();
    // number of extra DOM characters that have no counterpart in the tagged text
    int nonAlpha = 0;
    char previousChar = 'a';
    // the upper bound grows with nonAlpha, so it must also be capped by the DOM text length
    for (int ii = 0; ii < baseLength + nonAlpha && ii < domLength; ii++) {
        final char c = textContent.charAt(ii);
        if (!Character.isLetterOrDigit(c)) {
            if (c == ' ') {
                // cater for double-spaces, just in case
                if (previousChar == ' ') {
                    nonAlpha++;
                }
            } else if (ii + 1 < domLength && !Character.isLetterOrDigit(textContent.charAt(ii + 1))) {
                // punctuation only counts as extra when a space or more punctuation follows it
                nonAlpha++;
            }
        }
        previousChar = c;
    }
    return Math.min(baseLength + nonAlpha, domLength);
}
/**
 * Returns the part of the tagged text that is not covered by the given DOM text. Letters and digits
 * of both strings are compared in order, ignoring case; every other character is skipped.
 *
 * @param textContent the DOM text that has already been claimed for the tag
 * @param taggedText  the complete tagged text
 * @return the tail of {@code taggedText} starting at its first unmatched letter or digit, or an
 *         empty string when the DOM text covers the whole tagged text
 * @throws AbortTagException if the DOM text contains a letter or digit that does not match
 */
private String getLeftOverText(final String textContent, final String taggedText)
        throws AbortTagException {
    final int taggedLength = taggedText.length();
    // jj walks the tagged text, ii walks the DOM text
    int jj = 0;
    for (int ii = 0; ii < textContent.length(); ii++) {
        final char domChar = textContent.charAt(ii);
        if (!Character.isLetterOrDigit(domChar)) {
            continue;
        }
        // skip separators in the tagged text, without running off its end
        while (jj < taggedLength && !Character.isLetterOrDigit(taggedText.charAt(jj))) {
            jj++;
        }
        if (jj < taggedLength
                && Character.toLowerCase(taggedText.charAt(jj)) == Character.toLowerCase(domChar)) {
            jj++;
        } else {
            // either a different character, or the tagged text ran out before the DOM text did
            LOGGER.error("{}: Somehow we were unable to match the given texts [{}] and [{}]",
                    this.currentVerse, textContent, taggedText);
            this.error = true;
            this.runCode = -1;
            throw new AbortTagException();
        }
    }
    while (jj < taggedLength && !Character.isLetterOrDigit(taggedText.charAt(jj))) {
        jj++;
    }
    return taggedText.substring(jj);
}
/**
* Collects the sibling nodes that together hold the tagged text of a tag spanning several DOM nodes,
* then wraps all collected nodes in one {@code w} element. Recurses onto the next sibling until the
* match is complete or an unsupported situation aborts the tag.
*
* @param matchingNodes the nodes collected so far; the first one is where the {@code w} element is inserted
* @param wordInDoc the node whose next sibling is examined
* @param tagData the tagging entry being applied
* @param textLeftOver the part of the tagged text that still has to be found
* @throws AbortTagException if the text does not match or the node layout is not supported
*/
private void grabMatchingNodes(final List<Node> matchingNodes, final Node wordInDoc,
final Tagging tagData, final String textLeftOver) throws AbortTagException {
// NOTE(review): this copy is never reduced, so the recursive call receives the same left-over text
final String remainingTextLeftOver = textLeftOver;
final Node nextSibling = getNextSiblingToMatch(matchingNodes, wordInDoc, tagData, textLeftOver);
// we have a next sibling
if (nextSibling instanceof Element && isIgnoreable((Element) nextSibling)) {
// then add to the list - i.e. add a note to the list - we tag the whole note with the lemma
matchingNodes.add(nextSibling);
} else if (nextSibling instanceof Element) {
// non-ignoreable node
// i.e. we need to traverse it - and hope for the best -i.e. that what we're going to try and
// match will be a whole tag, rather than a bit.
// otherwise we can't tag the element.
if (nextSibling.getNodeName().equalsIgnoreCase("verse")) {
LOGGER.error("[{}] We've gone too far - something didn't match [{}]", this.currentVerse,
tagData.getTaggedText());
this.error = true;
this.runCode = -1;
throw new AbortTagException();
}
// traverse children nodes...
// TODO
LOGGER.warn("{}: Need to traverse children - scenario not yet catered for. Data was [{}]",
this.currentVerse,
tagData.getTaggedText());
this.error = true;
this.runCode = -1;
throw new AbortTagException();
} else if (nextSibling instanceof Text) {
// we've got some text, so we may want to split it
final boolean done = getNodePart((Text) nextSibling, textLeftOver);
if (done) {
// match is complete
matchingNodes.add(nextSibling);
// check all nodes have the same parent
Node previousParent = null;
for (final Node n : matchingNodes) {
if (previousParent == null) {
previousParent = n.getParentNode();
} else {
// check that we have the same parent
if (previousParent != n.getParentNode()) {
// TODO
LOGGER.warn(
"{}: Attempting to tag elements with different parents. One case has not yet been catered for"
+ ", which is if the child is the only element different parent, then we can roll up. Portion of text was [{}]",
this.currentVerse, tagData.getTaggedText()
);
this.runCode = -1;
this.error = true;
throw new AbortTagException();
}
}
}
// all nodes have the same parent
final Element createWElement = createWElement(tagData, nextSibling.getOwnerDocument());
// we insert it before the first element in our list
final Node firstMatchedNode = matchingNodes.get(0);
firstMatchedNode.getParentNode().insertBefore(createWElement, firstMatchedNode);
// appendChild moves each node out of its old position and into the new w element
for (final Node n : matchingNodes) {
createWElement.appendChild(n);
}
return;
} else {
// need to continue on to the next node
// now work out how much text is left over, hopefully none, but you never know...
// NOTE(review): nothing is done here - the partially matched text node is not added to
// matchingNodes and the left-over text is not shortened; looks unfinished, confirm intent
}
} else {
this.error = true;
// NOTE(review): the message has three placeholders but only two arguments are supplied
LOGGER.error("{}: Attemping to match [{}] but unknown node type found: [{}]", this.currentVerse,
nextSibling.getNodeType());
this.runCode = -1;
throw new AbortTagException();
}
// go round the loop again
grabMatchingNodes(matchingNodes, nextSibling, tagData, remainingTextLeftOver);
}
/**
 * Finds the next sibling to examine. While the current node has no next sibling, the collected nodes
 * are rolled up into their parent and the search continues from the last collected node.
 *
 * @param matchingNodes the nodes collected so far; may be rewritten by the roll-up
 * @param wordInDoc     the node to start from
 * @param tagData       the tagging entry being applied
 * @param textLeftOver  the tagged text still to be found (used for diagnostics)
 * @return the first non-null next sibling found
 * @throws AbortTagException if a roll-up is not possible
 */
private Node getNextSiblingToMatch(final List<Node> matchingNodes, final Node wordInDoc,
        final Tagging tagData, final String textLeftOver) throws AbortTagException {
    Node current = wordInDoc;
    Node sibling = current.getNextSibling();
    while (sibling == null) {
        LOGGER.trace(
                "{}: Attemping to match [{}] to [{}] but no siblings available. Attempting to roll up.",
                this.currentVerse, current.getTextContent(), tagData.getTaggedText());
        // swap the collected children for their parent, then look beside that parent
        replaceNodesByParent(matchingNodes, current, tagData, textLeftOver);
        current = matchingNodes.get(matchingNodes.size() - 1);
        sibling = current.getNextSibling();
    }
    return sibling;
}
/**
 * Replaces the children of {@code refNode}'s parent in the collected list by the parent itself.
 * Every child of the parent must either be in the list (it is removed from it) or be rollable,
 * see {@link #isRollableNode(Node)}; otherwise the tag is aborted.
 *
 * @param matchingNodes the nodes collected so far; modified in place
 * @param refNode       a node whose parent should take the place of its children
 * @param tagData       the tagging entry being applied (used for diagnostics)
 * @param remainingText the tagged text still to be found (used for diagnostics)
 * @throws AbortTagException if the parent has a child that is neither collected nor rollable
 */
private void replaceNodesByParent(final List<Node> matchingNodes, final Node refNode,
        final Tagging tagData, final String remainingText) throws AbortTagException {
    final Node parent = refNode.getParentNode();
    final NodeList siblings = parent.getChildNodes();
    for (int index = 0; index < siblings.getLength(); index++) {
        final Node child = siblings.item(index);
        if (matchingNodes.remove(child)) {
            // collected child, now represented by its parent
            continue;
        }
        if (isRollableNode(child)) {
            // notes, ignorable titles and pure punctuation/whitespace may be swallowed by the tag
            continue;
        }
        LOGGER.warn(
                "{}: Impossible tag. Not all nodes from parent are present. Tag data [{}]. Impossible portion is [{}]",
                this.currentVerse, tagData.getTaggedText(), remainingText);
        this.error = true;
        this.runCode = -1;
        throw new AbortTagException();
    }
    // every child is accounted for, so the parent stands in for them
    matchingNodes.add(parent);
}
/**
 * Tells whether a node may be swallowed into a tag without having been matched: an ignorable
 * element, or a text node made only of punctuation and whitespace.
 *
 * @param candidate the candidate
 * @return true if the node can be rolled up into its parent
 */
private boolean isRollableNode(final Node candidate) {
    return (candidate instanceof Element && isIgnoreable((Element) candidate))
            || (candidate instanceof Text
                    && isPunctuationAndWhiteSpace(((Text) candidate).getTextContent()));
}
/**
 * Tells whether the text contains no letter and no digit at all (an empty string qualifies).
 *
 * @param textContent the text to inspect
 * @return true if no character is a letter or digit
 */
private boolean isPunctuationAndWhiteSpace(final String textContent) {
    for (final char c : textContent.toCharArray()) {
        if (Character.isLetterOrDigit(c)) {
            return false;
        }
    }
    return true;
}
/**
 * Matches the letters and digits of a sibling text node against the tagged text still left over,
 * ignoring case and skipping every other character on both sides. When the left-over text is used
 * up before the end of the node, the node is split so that only the matched part stays in it.
 *
 * @param nextSibling  the next sibling, may get split during the operation
 * @param textLeftOver the text left over
 * @return true if the left-over text has been matched completely, false if more nodes are needed
 * @throws AbortTagException if a letter or digit of the node differs from the left-over text
 */
private boolean getNodePart(final Text nextSibling, final String textLeftOver) throws AbortTagException {
    final String siblingText = nextSibling.getTextContent();
    final int leftOverLength = textLeftOver.length();
    // jj walks the left-over text, ii walks the sibling text
    int jj = 0;
    for (int ii = 0; ii < siblingText.length(); ii++) {
        if (jj >= leftOverLength) {
            // full match before the end of the node: keep only the part before ii
            nextSibling.splitText(ii);
            return true;
        }
        final char siblingChar = siblingText.charAt(ii);
        if (!Character.isLetterOrDigit(siblingChar)) {
            continue;
        }
        // move jj up to the next letter or digit, without running off the end
        while (jj < leftOverLength && !Character.isLetterOrDigit(textLeftOver.charAt(jj))) {
            jj++;
        }
        if (jj >= leftOverLength) {
            // only separators were left in the tagged text, so the match is complete here
            nextSibling.splitText(ii);
            return true;
        }
        if (Character.toLowerCase(siblingChar) == Character.toLowerCase(textLeftOver.charAt(jj))) {
            jj++;
        } else {
            // we're not a match, so abort
            LOGGER.warn("{}: Failed to match [{}] against [{}] in cross-tag", this.currentVerse,
                    nextSibling, textLeftOver);
            this.error = true;
            this.runCode = -1;
            throw new AbortTagException();
        }
    }
    // the whole node matched; trailing separators in the left-over text do not need another node
    while (jj < leftOverLength && !Character.isLetterOrDigit(textLeftOver.charAt(jj))) {
        jj++;
    }
    return jj >= leftOverLength;
}
/**
 * Wraps the given text node in a new {@code w} element built from the tagging entry and records
 * how many words the tag covered. A mismatch between the node text and the tagged text is only
 * reported (warning plus run code -1); the node is wrapped regardless.
 *
 * @param tagData   the tagging entry to apply
 * @param item      the text node preceding {@code wordInDoc}; its parent receives the new element
 * @param remainder matching state whose {@code advance} is increased by the word count of the tag
 * @param wordInDoc the text node to wrap
 */
private void createAndWrapWElement(final Tagging tagData, final Text item, final Remainder remainder,
        final Text wordInDoc) {
    final boolean sameText =
            equalsIngorePunctuationAndCase(tagData.getTaggedText(), wordInDoc.getTextContent());
    if (!sameText) {
        LOGGER.warn("{}: The text node content [{}] differs from the tagged data [{}]",
                this.currentVerse, wordInDoc.getTextContent(), tagData.getTaggedText());
        this.runCode = -1;
    }

    final Element wrapper = createWElement(tagData, wordInDoc.getOwnerDocument());
    final Node parent = item.getParentNode();
    parent.insertBefore(wrapper, wordInDoc);
    // appending moves the text node out of the parent and into the wrapper
    wrapper.appendChild(wordInDoc);

    final int wordsInTag = tagData.getTaggedText().split(" ").length;
    remainder.advance += wordsInTag;
}
/**
 * Compares the tagged text with the DOM text, ignoring case. Only letters and digits of the tagged
 * text are checked; a punctuation character in the DOM text that is directly followed by a space is
 * treated as an extra character with no counterpart in the tagged text.
 *
 * @param taggedText the tagged text (punctuation already replaced by spaces)
 * @param domText    the text taken from the DOM
 * @return true if every letter or digit of {@code taggedText} lines up with the same character in
 *         {@code domText}; false on a mismatch or when {@code domText} is too short
 */
boolean equalsIngorePunctuationAndCase(final String taggedText, final String domText) {
    // number of DOM characters skipped so far because they were punctuation followed by a space
    int nonAlpha = 0;
    for (int ii = 0; ii < taggedText.length(); ii++) {
        final int domIndex = ii + nonAlpha;
        if (domIndex >= domText.length()) {
            // the DOM text ran out before the tagged text did
            return false;
        }
        final char c1 = taggedText.charAt(ii);
        final char c2 = domText.charAt(domIndex);
        if (Character.isLetterOrDigit(c1) && Character.toLowerCase(c1) != Character.toLowerCase(c2)) {
            return false;
        }
        // look at the character right after c2 (the shifted index, not ii + 1) so that the second
        // and later punctuation marks are skipped as well
        if (!Character.isLetterOrDigit(c2) && c2 != ' ' && domIndex + 1 < domText.length()
                && domText.charAt(domIndex + 1) == ' ') {
            nonAlpha++;
        }
    }
    return true;
}
/**
 * Finds the index of the first letter or digit in the string.
 *
 * @param str the text to scan
 * @return the index of the first letter or digit, or {@code str.length()} when there is none
 *         (including for an empty string)
 */
private int fastForwardNonAlphaNumeric(final String str) {
    int start = 0;
    // bounded by the length so that empty or punctuation-only text does not throw
    while (start < str.length() && !Character.isLetterOrDigit(str.charAt(start))) {
        start++;
    }
    return start;
}
/**
 * Creates a detached {@code w} element carrying the lemma of the tagging entry and, when the entry
 * has grammar information, its morphology.
 *
 * @param tagData       the tagging entry providing the Strong's numbers and grammar
 * @param ownerDocument the document the element is created in
 * @return the new element, not yet inserted into the tree
 */
private Element createWElement(final Tagging tagData, final Document ownerDocument) {
    final Element w = ownerDocument.createElement("w");
    w.setAttribute(OSISUtil.ATTRIBUTE_W_LEMMA, createLemmaAttribute(tagData));
    // the morph value used to be computed and then dropped; attach it, but only when there is
    // something to say so that entries without grammar produce the same output as before
    final String morph = createMorphAttribute(tagData);
    if (morph.length() > 0) {
        w.setAttribute(OSISUtil.ATTRIBUTE_W_MORPH, morph);
    }
    return w;
}
/**
 * Builds the morph attribute value: every space-separated grammar code prefixed with "morph:",
 * joined by single spaces. Empty grammar gives an empty string.
 *
 * @param tagData the tagging entry providing the grammar
 * @return the attribute value, possibly empty
 */
private String createMorphAttribute(final Tagging tagData) {
    final String grammar = tagData.getGrammar();
    final StringBuilder value = new StringBuilder(grammar.length() + 32);
    if (grammar.length() == 0) {
        return value.toString();
    }
    final String[] codes = grammar.split(" ");
    for (int index = 0; index < codes.length; index++) {
        if (index > 0) {
            value.append(' ');
        }
        value.append("morph:").append(codes[index]);
    }
    return value.toString();
}
/**
 * Builds the lemma attribute value: every space-separated Strong's number prefixed with "strong:",
 * joined by single spaces.
 *
 * @param tagData the tagging entry providing the Strong's numbers
 * @return the attribute value
 */
private String createLemmaAttribute(final Tagging tagData) {
    final String strongs = tagData.getStrongs();
    final String[] numbers = strongs.split(" ");
    final StringBuilder value = new StringBuilder(strongs.length() + 32);
    for (int index = 0; index < numbers.length; index++) {
        if (index > 0) {
            value.append(" ");
        }
        value.append("strong:").append(numbers[index]);
    }
    return value.toString();
}
/**
 * Locates the end of the separator run that follows the word with index {@code n} (0-based),
 * counting from the first letter or digit. Any run of non-alphanumeric characters counts as one separator.
 *
 * @param str the text to scan
 * @param n   the 0-based index of the word whose trailing separator is wanted
 * @return the index of the last character of that separator run, or {@code str.length()} when the
 *         text has no separator after word {@code n}
 */
int findWordPosition(final String str, final int n) {
    final int length = str.length();
    int separatorsLeft = n;
    boolean previousWasWordChar = true;
    int index = fastForwardNonAlphaNumeric(str);
    while (index < length) {
        if (Character.isLetterOrDigit(str.charAt(index))) {
            previousWasWordChar = true;
        } else {
            // only the first character of a separator run is counted
            if (previousWasWordChar) {
                separatorsLeft--;
            }
            previousWasWordChar = false;
        }
        if (separatorsLeft < 0) {
            // swallow the rest of the separator run
            while (index + 1 < length && !Character.isLetterOrDigit(str.charAt(index + 1))) {
                index++;
            }
            break;
        }
        index++;
    }
    return index;
}
/**
 * Trace-logs the osisID of the verse or chapter element being entered.
 *
 * @param esv          the verse or chapter element
 * @param indexTagging the tagging entries keyed by reference (not used here)
 */
private void processVerse(final Element esv, final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) {
    LOGGER.trace("Processing [{}]", esv.getAttribute("osisID"));
}
/**
 * Parses the ESV XML file into a DOM document and logs how long that took.
 *
 * @return the parsed document
 * @throws ParserConfigurationException if no document builder can be created
 * @throws SAXException                 if the file is not well-formed
 * @throws IOException                  if the file cannot be read
 */
private Document readESVDoc() throws ParserConfigurationException, SAXException, IOException {
    final long startedAt = System.currentTimeMillis();
    final DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    final Document parsed = builder.parse(this.esvText);
    final long elapsed = System.currentTimeMillis() - startedAt;
    LOGGER.info("Took [{}]ms to read ESV into Document", elapsed);
    return parsed;
}
/**
 * Normalises every tagging entry in place: punctuation removed, Strong's numbers extracted,
 * reference resolved, and the cleaned tagged text remembered as the original tagged text.
 *
 * @param rawTagging the entries as read from the tagging file
 * @throws Exception if a reference cannot be resolved
 */
private void cleanupTagging(final List<Tagging> rawTagging) throws Exception {
    final Iterator<Tagging> entries = rawTagging.iterator();
    while (entries.hasNext()) {
        final Tagging entry = entries.next();
        removePunctuation(entry);
        splitStrong(entry);
        cleanRef(entry);
        // snapshot taken after cleaning, so later partial consumption can be detected
        entry.setOriginalTaggedText(entry.getTaggedText());
    }
}
/**
 * Extracts the Strong's numbers from the raw angle-bracketed markers of the entry and stores them
 * as a space-separated list. A missing grammar is replaced by an empty string.
 *
 * @param t the tagging entry to update
 */
void splitStrong(final Tagging t) {
    final String raw = t.getRawStrongs();
    if (raw == null) {
        t.setStrongs("");
        t.setGrammar("");
        return;
    }

    final StringBuilder numbers = new StringBuilder();
    final Matcher markers = STRONGS_SPLITTING.matcher(raw);
    while (markers.find()) {
        if (numbers.length() > 0) {
            numbers.append(' ');
        }
        // group 1 holds the digits only, without the optional letter suffix
        numbers.append(markers.group(1));
    }
    t.setStrongs(numbers.toString());

    if (t.getGrammar() == null) {
        t.setGrammar("");
    }
}
/**
 * Replaces the non-tagged and tagged text of the entry by their punctuation-free forms.
 *
 * @param t the tagging entry to update
 */
private void removePunctuation(final Tagging t) {
    final String cleanedNonTagged = replacePunctuation(t.getNonTaggedText());
    t.setNonTaggedText(cleanedNonTagged);
    final String cleanedTagged = replacePunctuation(t.getTaggedText());
    t.setTaggedText(cleanedTagged);
}
/**
 * Replaces every run of punctuation by a space, collapses repeated whitespace to one space and trims.
 *
 * @param text the text to normalise, may be null
 * @return the normalised text; an empty string for null input
 */
private String replacePunctuation(final String text) {
    if (text == null) {
        return "";
    }
    final String spaced = PUNCTUATION.matcher(text).replaceAll(" ");
    return spaced.replaceAll("\\s\\s+", " ").trim();
}
/**
 * Resolves the reference of the entry against the ESV book, stores the resulting osisID as the
 * entry's reference, and prefixes the entry's Strong's numbers with 'H' for Old Testament verses
 * and 'G' otherwise. Problems are logged and recorded in the run code; the entry is then left as is.
 *
 * @param t the tagging entry to update
 * @throws NoSuchKeyException if the reference is rejected for a reason other than an unknown verse
 */
private void cleanRef(final Tagging t) throws NoSuchKeyException {
    final String reference = REF_CLEAN.matcher(t.getRef()).replaceAll("").trim();
    if (isBlank(reference)) {
        LOGGER.warn("Unable to parse reference [{}]", t.getRef());
        this.runCode = -1;
        return;
    }
    try {
        final Key key = ESV.getKey(reference);
        t.setRef(key.getOsisID());

        // the testament is decided by the first verse of the reference
        Verse v = null;
        if (key instanceof Passage) {
            final Key first = key.get(0);
            if (first instanceof Verse) {
                v = (Verse) first;
            }
        } else if (key instanceof Verse) {
            v = (Verse) key;
        }
        if (v == null) {
            // previously this fell through to a NullPointerException
            LOGGER.warn("Unable to resolve [{}] to a verse", reference);
            this.runCode = -1;
            return;
        }

        final int ordinal = v.getOrdinal();
        if (v.getVersification().getTestament(ordinal) == Testament.OLD) {
            prefixStrong(t, 'H');
        } else {
            prefixStrong(t, 'G');
        }
    } catch (NoSuchVerseException ex) {
        // special case for 3 John 1:15
        // NOTE(review): REF_CLEAN strips '.', so the cleaned reference can never contain dots and
        // this comparison looks unreachable - confirm the intended form of the reference
        if ("3John.1.15".equals(reference)) {
            t.setRef(reference);
            prefixStrong(t, 'G');
        } else {
            LOGGER.warn("Unable to recognise [{}] as a reference", reference);
            this.runCode = -1;
        }
    }
}
/**
 * Puts the given letter in front of every space-separated Strong's number of the entry.
 *
 * @param t            the tagging entry to update
 * @param prefixLetter the letter to prepend to each number
 */
private void prefixStrong(final Tagging t, final char prefixLetter) {
    final String current = t.getStrongs();
    final String[] numbers = current.split(" ");
    final StringBuilder prefixed = new StringBuilder(current.length() + 16);
    for (int index = 0; index < numbers.length; index++) {
        if (index > 0) {
            prefixed.append(' ');
        }
        prefixed.append(prefixLetter).append(numbers[index]);
    }
    t.setStrongs(prefixed.toString());
}
/**
 * Dumps every indexed reference and its tagging entries to the trace log. Does nothing unless
 * trace logging is enabled.
 *
 * @param indexTagging the index tagging
 */
private void traceLog(final MultiMap<String, Tagging, Deque<Tagging>> indexTagging) {
    if (!LOGGER.isTraceEnabled()) {
        return;
    }
    for (final Entry<String, Deque<Tagging>> byReference : indexTagging.entrySet()) {
        LOGGER.trace("Contains ref [{}]", byReference.getKey());
        for (final Tagging entry : byReference.getValue()) {
            LOGGER.trace("\tTagging is: [{}]", entry);
        }
    }
}
/**
 * Groups the tagging entries by their reference.
 *
 * @param rawTagging the cleaned tagging entries
 * @return a multi-map from reference to the entries carrying that reference
 */
private MultiMap<String, Tagging, Deque<Tagging>> indexTagging(final List<Tagging> rawTagging) {
    // the reference of each entry is its key
    final MultiMapIndexer<String, Tagging> byReference = new MultiMapIndexer<String, Tagging>() {
        @Override
        public String getKey(final Tagging t) {
            return t.getRef();
        }
    };
    final MultiMap<String, Tagging, Deque<Tagging>> index = new MultiMap<String, Tagging, Deque<Tagging>>(
            LinkedList.class);
    index.putCollection(rawTagging, byReference);
    return index;
}
/**
 * Reads the tab-separated tagging file. Each line becomes one entry; columns are, in order,
 * reference, non-tagged text, tagged text and raw Strong's markers. Missing columns are left unset.
 *
 * @return one entry per line of the file
 * @throws IOException if the file cannot be read
 */
private List<Tagging> readTagging() throws IOException {
    LOGGER.info("Reading in CSV file...");
    final List<Tagging> entries = new ArrayList<Tagging>(32000);
    // NOTE(review): the file is decoded with the platform default charset - confirm that is intended
    final List<String> rows = FileUtils.readLines(this.tagging);
    for (final String row : rows) {
        final String[] columns = row.split("\\t");
        final int columnCount = columns.length;
        final Tagging entry = new Tagging();
        if (columnCount > 0) {
            entry.setRef(columns[0]);
        }
        if (columnCount > 1) {
            entry.setNonTaggedText(columns[1]);
        }
        if (columnCount > 2) {
            entry.setTaggedText(columns[2]);
        }
        if (columnCount > 3) {
            entry.setRawStrongs(columns[3]);
        }
        entries.add(entry);
    }
    LOGGER.info("Finished parsing CSV File...");
    return entries;
}
/**
 * Mutable matching state for one text node: the ESV words and tagging words still to be matched,
 * how many ESV words have been matched so far, and how many words the applied tag covered.
 */
class Remainder {
    String sourceText;
    String taggingText;
    int positionInSourceText;
    int advance;

    /**
     * @param sourceText           the ESV words still to be matched
     * @param taggingText          the tagging words still to be matched
     * @param positionInSourceText the number of ESV words already matched
     */
    public Remainder(final String sourceText, final String taggingText, final int positionInSourceText) {
        this.positionInSourceText = positionInSourceText;
        this.taggingText = taggingText;
        this.sourceText = sourceText;
    }

    /**
     * Creates a remainder positioned at the first word.
     *
     * @param sourceText  the ESV words still to be matched
     * @param taggingText the tagging words still to be matched
     */
    public Remainder(final String sourceText, final String taggingText) {
        this(sourceText, taggingText, 0);
    }

    /**
     * Copies the texts and the position; the copy starts with {@code advance} at 0.
     */
    @Override
    protected Remainder clone() throws CloneNotSupportedException {
        return new Remainder(this.sourceText, this.taggingText, this.positionInSourceText);
    }
}
}