package;

import static com.tyndalehouse.step.core.utils.StringUtils.isBlank;
import static com.tyndalehouse.step.core.utils.StringUtils.isNotBlank;

import java.util.Deque;
import java.util.Map;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tyndalehouse.step.core.utils.StringUtils;

/**
 * Holds the matching state for one verse at a time: the queue of tagged {@link Word}s that belong to the
 * current verse, and the position reached inside the word currently being matched.
 * <p>
 * {@link #match(String)} walks a piece of text, compares each word it finds (case-insensitively) with the next
 * expected token from the tagging data and, when the tagged word carries strong codes, wraps the word in
 * {@code #<#w lemma="..."#>#} ... {@code #<#/w#>#} markers.
 * <p>
 * NOTE(review): the {@code #<#}/{@code #>#} markers are presumably replaced by real angle brackets in a later
 * step; that step is not visible from this class.
 * <p>
 * Instances are stateful and the class does no synchronisation of its own.
 */
public class EnricherState {
    /** Separator between the individual strong codes of a word. */
    private static final Pattern BY_STRONG = Pattern.compile("\\|");
    /** Master switch for emitting the tag markers around matched words. */
    private static final boolean ADD_TAG = true;
    private static final Logger LOGGER = LoggerFactory.getLogger(EnricherState.class);

    /** Tagging data, keyed by verse; each queue is consumed as words are matched. */
    private final Map<String, Deque<Word>> verseContent;
    private String currentVerse;
    /** Queue of the verse most recently passed to {@link #getNextWord(String)}. */
    private Deque<Word> q;
    /** Tagged word currently being matched. */
    private Word w;
    /** Tokens of {@link #w}. */
    private String[] wWords;
    /** Index into {@link #wWords} of the next token expected in the text. */
    private int wWordsII;
    private String previousVerse;
    /**
     * When non-null, only this verse is processed and the logging is more verbose. Change the initialiser to a
     * verse key, e.g. "Gen.3.16", to debug a single verse.
     */
    private final String debugVerse = null;
    private boolean openStrongTag = false;

    /**
     * @param verseContent the tagging data: for each verse key, the queue of tagged words in reading order
     */
    public EnricherState(final Map<String, Deque<Word>> verseContent) {
        this.verseContent = verseContent;
    }

    /**
     * Matches the words of {@code text} against the tagging data of the current verse and returns the text
     * with tag markers placed around every matched word that carries strong codes.
     * <p>
     * The text is returned untouched when no verse is current, when a debug verse is configured and is not the
     * current verse, or when an unexpected exception occurs (the exception is logged).
     *
     * @param text the text to enrich
     * @return the enriched text, or {@code text} itself in the cases described above
     */
    public String match(final String text) {
        if (this.debugVerse != null && !this.debugVerse.equals(this.currentVerse)) {
            return text;
        }

        LOGGER.trace("Tracing: [{}]", text);
        try {
            if (isBlank(this.currentVerse)) {
                return text;
            }

            final StringBuilder output = new StringBuilder(text.length() * 2);
            ensureOpenTagIfRequired(output);

            final StringBuilder word = new StringBuilder(16);
            for (int ii = 0; ii < text.length(); ii++) {
                // get next word if required
                if (!ensureNextWord()) {
                    // No tagging data left: keep what has been tagged so far and pass the rest through as is.
                    // Returning the raw text here would throw away tags whose words were already consumed.
                    final String unmatched = word.toString() + text.substring(ii);
                    output.append(unmatched);
                    // trailing punctuation or whitespace is expected, so only real words are worth a warning
                    if (hasAlphaNumerics(unmatched)) {
                        LOGGER.warn(
                                "{}: Unable to continue with matching. No data left. Was hoping to match [{}]",
                                this.currentVerse, text.substring(ii));
                    }
                    return output.toString();
                }

                final char c = text.charAt(ii);
                switch (c) {
                    case ' ':
                    case ',':
                    case '?':
                    case '[':
                    case ']':
                    case '+':
                    case '!':
                    case '.':
                    case '/':
                    case '-':
                    case ';':
                    case '\\':
                    case '"':
                    case ':':
                    case '\'':
                    case '(':
                    case ')':
                    case '—':
                        // deal with any word already in the buffer, then emit the separator
                        punctuationCharFound(output, word, c);
                        break;
                    default:
                        word.append(c);
                        break;
                }
            }

            // the text may end in the middle of a word: flush whatever is still buffered
            final String wordSoFar = word.toString();
            if (wordSoFar.length() == 0) {
                return output.toString();
            }
            if (isBlank(wordSoFar)) {
                output.append(wordSoFar);
                return output.toString();
            }

            boolean successMatch = false;
            if (ensureNextWord()) {
                if (doWordsMatch(wordSoFar)) {
                    wordsMatch(output, wordSoFar);
                    successMatch = true;
                } else {
                    successMatch = tryNextMatch(output, wordSoFar);
                }
            }

            if (!successMatch) {
                LOGGER.warn("{}:Unable to match, so simply appending word: [{}]", this.currentVerse, wordSoFar);
                output.append(wordSoFar);
            }
            return output.toString();
        } catch (final Exception x) {
            // best effort: a failure while enriching must not lose the text itself
            LOGGER.error(x.getMessage(), x);
            return text;
        }
    }

    /**
     * Currently a no-op. The disabled implementation below would open a tag at the very start of the output
     * when the next queued word carries strong codes.
     *
     * @param output the output being built
     */
    private void ensureOpenTagIfRequired(final StringBuilder output) {
        // // let's assume w is not null, for now
        // if (this.q == null) {
        // return;
        // }
        //
        // final Word peek = this.q.peek();
        // if (ADD_TAG && !this.openStrongTag && peek != null && isNotBlank(peek.getS())) {
        // // add a strong tag opening
        // openStrongTag(output, peek.getS());
        // }
    }

    /**
     * Appends an opening word marker built from {@code codes} and records that a tag is open.
     * <p>
     * {@code codes} is split on '|'. The part of each code before an optional '@' is added to the
     * {@code lemma} attribute with a {@code strong:} prefix. The part after the '@' is added to the
     * {@code morph} attribute, prefixed with {@code strongMorph:} when the code starts with 'H' and with
     * {@code robinson:} otherwise. The {@code morph} attribute is omitted when no code has an '@'.
     *
     * @param output the output being built
     * @param codes the '|'-separated strong codes of the word
     */
    private void openStrongTag(final StringBuilder output, final String codes) {
        final StringBuilder lemma = new StringBuilder(codes.length() + 4);
        final StringBuilder morph = new StringBuilder(codes.length() + 4);

        for (final String s : BY_STRONG.split(codes)) {
            if (lemma.length() != 0) {
                lemma.append(' ');
            }
            lemma.append("strong:");

            final int indexOfAt = s.indexOf('@');
            if (indexOfAt == -1) {
                lemma.append(s);
                continue;
            }

            lemma.append(s, 0, indexOfAt);
            if (morph.length() != 0) {
                morph.append(' ');
            }
            morph.append(s.charAt(0) == 'H' ? "strongMorph:" : "robinson:");
            morph.append(s, indexOfAt + 1, s.length());
        }

        output.append("#<#w lemma=\"").append(lemma).append("\"");
        if (morph.length() > 0) {
            output.append(" morph=\"").append(morph).append("\"");
        }
        output.append("#>#");
        this.openStrongTag = true;
    }

    /**
     * Called when {@code wordSoFar} differs from the expected token: looks one token ahead, either within the
     * current tagged word or, if that one is on its last token, at the first token of the next queued word. On
     * a match the skipped token is dropped and the word is emitted through
     * {@link #wordsMatch(StringBuilder, String)}.
     *
     * @param output the output being built
     * @param wordSoFar the word found in the text
     * @return true if the look-ahead token matched and the word was emitted
     */
    private boolean tryNextMatch(final StringBuilder output, final String wordSoFar) {
        final int lookAhead = this.wWordsII + 1;
        if (lookAhead < this.wWords.length) {
            if (!wordSoFar.equalsIgnoreCase(this.wWords[lookAhead])) {
                return false;
            }
            LOGGER.warn("Advancing to next word as it matches");
            this.wWordsII = lookAhead;
            wordsMatch(output, wordSoFar);
            return true;
        }

        // the current tagged word is on its last token, so look at the next tagged word instead
        final Word nextWord = this.q.peek();
        if (nextWord == null || nextWord.getW() == null) {
            // no words left, so definitely no match
            return false;
        }

        final String[] nextWords = getWordsFromWord(nextWord);
        if (nextWords.length == 0 || !wordSoFar.equalsIgnoreCase(nextWords[0])) {
            return false;
        }

        LOGGER.warn("Advancing to next word as it matches");
        getNextWord(this.currentVerse);
        wordsMatch(output, wordSoFar);
        return true;
    }

    /**
     * @param chars the characters to inspect
     * @return true if at least one character is a letter or a digit
     */
    private boolean hasAlphaNumerics(final CharSequence chars) {
        for (int ii = 0; ii < chars.length(); ii++) {
            if (Character.isLetterOrDigit(chars.charAt(ii))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Makes sure there is a token to compare against, pulling further tagged words off the queue of the
     * current verse as needed. Tagged words that yield no tokens are skipped, so that on a true return
     * {@code wWords[wWordsII]} is always a valid element.
     *
     * @return false if the tagging data of the current verse is exhausted
     */
    private boolean ensureNextWord() {
        while (this.wWords == null || this.wWordsII >= this.wWords.length) {
            if (!getNextWord(this.currentVerse)) {
                // no words left to match
                return false;
            }
        }
        return true;
    }

    /**
     * Handles a separator character: flushes the buffered word (tagging it when it matches the tagging data,
     * appending it untouched otherwise), empties the buffer and then appends the separator itself.
     *
     * @param output the output being built
     * @param word the buffer holding the word read so far; emptied by this method
     * @param c the separator character that was found
     */
    private void punctuationCharFound(final StringBuilder output, final StringBuilder word, final char c) {
        final String wordSoFar = word.toString();
        if (wordSoFar.length() != 0) {
            if (isBlank(wordSoFar)) {
                output.append(wordSoFar);
            } else if (doWordsMatch(wordSoFar)) {
                wordsMatch(output, wordSoFar);
            } else if (!tryNextMatch(output, wordSoFar)) {
                LOGGER.warn("[{}]: Next word [{}] in matching files does not match tagged data [{}]",
                        new Object[] { this.currentVerse, wordSoFar, this.wWords[this.wWordsII] });
                output.append(wordSoFar);
            }
            // matched or not, the buffered word has been dealt with
            word.setLength(0);
        }
        output.append(c);
    }

    /**
     * @param wordSoFar the word found in the text
     * @return true if it equals the currently expected token, ignoring case
     */
    private boolean doWordsMatch(final String wordSoFar) {
        return wordSoFar.equalsIgnoreCase(this.wWords[this.wWordsII]);
    }

    /**
     * Consumes the currently expected token and appends {@code wordSoFar} to the output, wrapped in tag
     * markers when the current tagged word has strong codes.
     *
     * @param output the output being built
     * @param wordSoFar the word found in the text
     */
    private void wordsMatch(final StringBuilder output, final String wordSoFar) {
        // munch both words and continue
        if (this.debugVerse != null) {
            LOGGER.debug("wWord: i={} s={}/ q={}, matched words from both files [{}]", new Object[] {
                    this.wWordsII, this.wWords.length, this.q.size(), this.wWords[this.wWordsII] });
        } else {
            LOGGER.trace("Matching word {}={}", wordSoFar, this.wWords[this.wWordsII]);
        }
        this.wWordsII++;

        final boolean tagged = ADD_TAG && isNotBlank(this.w.getS());
        if (tagged && !this.openStrongTag) {
            openStrongTag(output, this.w.getS());
        }

        output.append(wordSoFar);

        if (tagged) {
            output.append("#<#/w#>#");
            this.openStrongTag = false;
        }
    }

    /**
     * Moves to another verse, or leaves the current one when {@code currentVerse} is blank. A warning is
     * logged when tagging data loaded earlier has not been fully consumed, or when the new verse has no
     * tagging data; in the latter case no verse is current afterwards. Exceptions are logged, not propagated.
     *
     * @param currentVerse the currentVerse to set
     */
    public void setCurrentVerse(final String currentVerse) {
        try {
            if (isBlank(currentVerse)) {
                // end of verse
                this.previousVerse = this.currentVerse;
                this.currentVerse = null;
                return;
            }

            // warn if we're moving to another verse while data is left over (unless debugging a single verse)
            final boolean tokensLeft = this.wWords != null && this.wWordsII < this.wWords.length;
            final boolean wordsLeft = this.q != null && this.q.size() > 0;
            if ((tokensLeft || wordsLeft) && this.debugVerse == null) {
                LOGGER.warn("{}: Left over bits in matching file.", this.previousVerse);
            }

            if (getNextWord(currentVerse)) {
                this.currentVerse = currentVerse;
            } else {
                LOGGER.warn("{}: No data available for verse", currentVerse);
                this.currentVerse = null;
            }
        } catch (final Exception x) {
            LOGGER.error(x.getMessage(), x);
        }
    }

    /**
     * Takes the next tagged word off the queue of {@code currentVerse} and makes its first token the expected
     * one.
     *
     * @param currentVerse the verse whose queue is read
     * @return false if the verse has no queue or the queue is empty
     */
    private boolean getNextWord(final String currentVerse) {
        this.q = this.verseContent.get(currentVerse);
        if (this.q == null) {
            LOGGER.warn("{}: No data available in tagging file", currentVerse);
            return false;
        }

        this.w = this.q.poll();
        if (this.w == null) {
            return false;
        }

        this.wWordsII = 0;
        this.wWords = getWordsFromWord(this.w);
        return true;
    }

    /**
     * @param w a tagged word
     * @return the tokens obtained by passing its text through {@code EsvOsisEnricher.reduce} and splitting it
     */
    private String[] getWordsFromWord(final Word w) {
        return StringUtils.split(EsvOsisEnricher.reduce(w.getW()));
    }

    /**
     * Finds the first space strictly after {@code lastIndex}.
     *
     * @param w the string to search
     * @param lastIndex the index after which the search starts
     * @return the index of that space, or the length of {@code w} (with a warning logged) if there is none
     */
    public int getNextSpace(final String w, final int lastIndex) {
        final int space = w.indexOf(' ', lastIndex + 1);
        if (space == -1) {
            LOGGER.warn("Attempting to retrieve empty word");
            return w.length();
        }
        return space;
    }

    /**
     * @return true if a verse is currently set
     */
    public boolean isVerse() {
        return this.currentVerse != null;
    }
}