/*******************************************************************************
 * Copyright (c) 2012, Directors of the Tyndale STEP Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)
 * nor the names of its contributors may be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************/
package com.tyndalehouse.step.core.xsl.impl;

import com.tyndalehouse.step.core.data.EntityDoc;
import com.tyndalehouse.step.core.exceptions.StepInternalException;
import com.tyndalehouse.step.core.service.VocabularyService;
import com.tyndalehouse.step.core.service.jsword.JSwordVersificationService;
import com.tyndalehouse.step.core.utils.JSwordUtils;
import com.tyndalehouse.step.core.utils.StringConversionUtils;
import com.tyndalehouse.step.core.utils.StringUtils;
import com.tyndalehouse.step.core.xsl.InterlinearProvider;
import org.crosswire.jsword.book.*;
import org.crosswire.jsword.passage.*;
import org.crosswire.jsword.versification.Testament;
import org.crosswire.jsword.versification.Versification;
import org.crosswire.jsword.versification.VersificationsMapper;
import org.crosswire.jsword.versification.system.Versifications;
import org.jdom2.Content;
import org.jdom2.Element;
import org.jdom2.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

import static com.tyndalehouse.step.core.utils.StringConversionUtils.getAnyKey;
import static com.tyndalehouse.step.core.utils.StringUtils.*;
import static java.lang.String.format;

/**
 * This object is not purposed to be used as a singleton. It builds up textual information on initialisation, and is
 * specific to requests. On initialisation, the OSIS XML is retrieved and iterated through to find all strong/morph
 * candidates.
 * <p>
 * Instances keep mutable working state (the verse currently being scanned and the per-strong word queues, which are
 * consumed as words are retrieved), so an instance must not be shared between threads.
 *
 * @author chrisburrell
 */
public class InterlinearProviderImpl implements InterlinearProvider {
    /**
     * Key component used in place of an OSIS id for words that are not attached to a numbered verse.
     */
    public static final String NO_VERSE = "NO_VERSE";

    /**
     * The Constant LOGGER.
     */
    private static final Logger LOGGER = LoggerFactory.getLogger(InterlinearProviderImpl.class);

    /**
     * Contains the set of tags that may contain biblical text. Every entry MUST be lower case, because element
     * names are lower-cased before they are looked up in this set.
     */
    private static final Set<String> VALID_TEXT_ELEMENTS = Collections.unmodifiableSet(
            new HashSet<String>(Arrays.asList(
                    "divinename",
                    "a",
                    "foreign",
                    "hi",
                    "name",
                    "q",
                    "w",
                    "seg",
                    "transchange",
                    "doxology",
                    "colophon",
                    "refrain",
                    "attribution")));

    /**
     * limited accuracy tries to do a location look up by using the verse number as part of the key.
     */
    private final Map<DualKey<String, String>, Deque<Word>> limitedAccuracy =
            new HashMap<DualKey<String, String>, Deque<Word>>();

    private final boolean originalLanguage;
    private boolean disabled = false;
    private Versification versification;

    // a temporary, non-thread-safe, transient, working variable, which keeps track of the verse we're in.
    private Verse currentVerse;
    private Book currentBook;
    private Map<String, String> hebrewDirectMapping;
    private Map<String, String> hebrewIndirectMappings;
    private Testament testament;
    private String masterVersion;
    private Versification masterVersification;
    private VocabularyService vocabularyService;
    private boolean stripAccents = false;
    private boolean stripVowels = false;

    /**
     * sets up the interlinear provider with the correct version and text scope.
     *
     * @param masterVersion          the master version, passed on to the vocabulary service when looking up
     *                               lexicon definitions
     * @param masterVersification    the versification in which verse references given to
     *                               {@link #getWord(String, String, String)} are expressed
     * @param versificationService   versification service
     * @param version                the version to use to set up the interlinear; if blank, nothing is scanned
     * @param versifiedKey           the text scope reference, defining the bounds of the lookup
     * @param hebrewDirectMapping    the hebrew overriding mappings
     * @param hebrewIndirectMappings the mappings used if no other mapping is found
     * @param vocabProvider          the vocabulary service used as a last resort to resolve a strong number
     * @param stripGreekAccents      true to strip accents from the output when the book is an ancient Greek book
     * @param stripHebrewAccents     true to strip accents from the output when the book is an ancient Hebrew book
     * @param stripVowels            true to additionally strip vowels; only applies to ancient Hebrew books for
     *                               which accents are being stripped
     * @throws StepInternalException if the book cannot be looked up or its text cannot be read
     */
    public InterlinearProviderImpl(final String masterVersion,
                                   Versification masterVersification,
                                   JSwordVersificationService versificationService,
                                   final String version,
                                   final Key versifiedKey,
                                   final Map<String, String> hebrewDirectMapping,
                                   final Map<String, String> hebrewIndirectMappings,
                                   final VocabularyService vocabProvider,
                                   boolean stripGreekAccents,
                                   boolean stripHebrewAccents,
                                   boolean stripVowels) {
        this.masterVersion = masterVersion;
        this.masterVersification = masterVersification;
        this.vocabularyService = vocabProvider;

        // first check whether the values passed in are correct
        if (areAnyBlank(version)) {
            this.originalLanguage = false;
            return;
        }

        this.hebrewIndirectMappings = hebrewIndirectMappings;
        this.hebrewDirectMapping = hebrewDirectMapping;
        this.currentBook = versificationService.getBookFromVersion(version);

        // validate the book BEFORE it is handed to any other service, so that a failed lookup
        // is reported with a meaningful message
        if (this.currentBook == null) {
            throw new StepInternalException(format("Couldn't look up book: [%s]", version));
        }
        this.versification = versificationService.getVersificationForVersion(this.currentBook);

        //mark the book as original language
        this.originalLanguage = JSwordUtils.isAncientBook(this.currentBook);
        final boolean ancientHebrewBook = JSwordUtils.isAncientHebrewBook(this.currentBook);
        this.stripAccents = (stripGreekAccents && JSwordUtils.isAncientGreekBook(this.currentBook))
                || (stripHebrewAccents && ancientHebrewBook);
        this.stripVowels = ancientHebrewBook && this.stripAccents && stripVowels;

        try {
            setTestamentType(versifiedKey);
            final BookData bookData = getBookDataWithVerse0(versifiedKey);
            scanForTextualInformation(bookData.getOsisFragment(), null);
        } catch (final BookException e) {
            throw new StepInternalException(e.getMessage(), e);
        }

        // without any tagged words there is nothing this provider can offer
        this.disabled = this.limitedAccuracy.isEmpty();
    }

    /**
     * package private version for testing purposes.
     */
    InterlinearProviderImpl() {
        // exposing package private constructor
        this.originalLanguage = false;
    }

    /**
     * For verse 0, we can't simply lookup verse 0, because that doesn't return the pre-verse content which sits in
     * verse 1 so instead we need to replace the key to have verse 1 instead. For all purposes, such as verses 0-1, or
     * verse 1, etc. then we continue as normal.
     *
     * @param versifiedKey the key from the original versification
     * @return a bookdata with the correct verse
     */
    private BookData getBookDataWithVerse0(final Key versifiedKey) {
        final Iterator<Key> iterator = versifiedKey.iterator();
        if (iterator.hasNext()) {
            final Key first = iterator.next();
            if (first instanceof Verse && !iterator.hasNext()) {
                // then we're basically looking at a single verse... in this special case
                // we need to check whether it maps to verse 0.
                // if it does, we need to return the data for verse 1 instead.
                final VerseKey mappedVerse =
                        VersificationsMapper.instance().mapVerse((Verse) first, this.versification);
                if (mappedVerse.getCardinality() == 1) {
                    final Verse next = (Verse) mappedVerse.iterator().next();
                    if (next.getVerse() == 0) {
                        return new BookData(this.currentBook,
                                new Verse(this.versification, next.getBook(), next.getChapter(),
                                        next.getVerse() + 1));
                    }
                }
            }
        }
        return new BookData(this.currentBook, versifiedKey);
    }

    /**
     * Looks up the word(s) matching the given strong number(s) in the given verse.
     *
     * @param verseNumber the verse reference, expressed in the master versification; may be null, in which case
     *                    only words stored without a verse are considered
     * @param strong      one or more strong numbers; if blank, the empty string is returned
     * @param morph       the morphology, currently only used for logging
     * @return the distinct words found, in order and separated by spaces, or the empty string if the verse
     * reference cannot be parsed
     */
    @Override
    public String getWord(final String verseNumber, final String strong, final String morph) {
        LOGGER.trace("Retrieving word for verse [{}], strong [{}], morph [{}]", verseNumber, strong, morph);

        if (isBlank(strong)) {
            // we might as well return, as we have no information to go on
            return "";
        }

        // we use a linked hashset, because we want the behaviour of a set while we add to it,
        // but at the end, we will want to return the elements in order
        final Set<String> results = new LinkedHashSet<String>();

        // the keys passed in may have multiple references and morphologies, therefore, we need to lookup
        // multiple items.
        final String[] strongs = StringUtils.split(strong);

        //create the versified key, and convert to the bit we want
        Key key = null;
        try {
            if (verseNumber != null) {
                final Verse inputVerse = VerseFactory.fromString(this.masterVersification, verseNumber);
                key = VersificationsMapper.instance().mapVerse(inputVerse, this.versification);
            }
        } catch (final NoSuchVerseException e) {
            LOGGER.error(e.getMessage(), e);
            return "";
        }

        // There are at most strongs.length words, and we might have morphological data to help
        for (final String s : strongs) {
            // find corresponding strong:
            LOGGER.debug("Finding strong key [{}]", s);
            final String strongKey = getAnyKey(s);
            results.add(getWord(key, strongKey, true));
        }

        return convertToString(results);
    }

    /**
     * Takes a set, and outputs the strings concatenated together (and separated by a space). The accent/vowel
     * stripping configured for this provider is applied to the concatenated text.
     *
     * @param results the results that should be converted to a string
     * @return a String containing results to be displayed
     */
    private String convertToString(final Set<String> results) {
        final StringBuilder sb = new StringBuilder(results.size() * 16);
        final Iterator<String> iterator = results.iterator();

        // add the first word without a space
        if (iterator.hasNext()) {
            sb.append(iterator.next());
        }

        // add spaces between each element now
        while (iterator.hasNext()) {
            sb.append(' ');
            sb.append(iterator.next());
        }

        final String actualText = sb.toString();
        if (this.stripVowels) {
            return StringConversionUtils.unAccent(actualText);
        }
        if (this.stripAccents) {
            return StringConversionUtils.unAccentLeavingVowels(actualText);
        }
        return actualText;
    }

    /**
     * returns words based on strong and verse number only.
     *
     * @param equivalentVerses the verse(s) to look in; if null, only words stored without a verse are considered
     * @param strong           the strong reference
     * @param followMapping    true to indicate we should follow the mappings, false to indicate we are already
     *                         following mappings and therefore want to prevent an infinite loop!
     * @return a word that matches or the empty string
     */
    String getWord(final Key equivalentVerses, final String strong, final boolean followMapping) {
        if (strong == null) {
            // it is important to return an empty string here
            return "";
        }

        if (equivalentVerses == null) {
            //then we know we have a null verse, so assume we're in pre-verse mode...
            final Deque<Word> preVerseWords =
                    this.limitedAccuracy.get(new DualKey<String, String>(strong, NO_VERSE));
            if (preVerseWords != null && !preVerseWords.isEmpty()) {
                return retrieveWord(preVerseWords);
            }
            return "";
        }

        //Key may be made up of several keys
        final Iterator<Key> keyIterator = equivalentVerses.iterator();
        while (keyIterator.hasNext()) {
            final Verse v = (Verse) keyIterator.next();
            final String osisID = v.getVerse() == 0 ? NO_VERSE : v.getOsisID();
            final Deque<Word> list = this.limitedAccuracy.get(new DualKey<String, String>(strong, osisID));
            if (list != null && !list.isEmpty()) {
                return retrieveWord(list);
            }
        }

        if (followMapping) {
            return lookupMappings(equivalentVerses, strong);
        }

        // it is important to return an empty string here
        return "";
    }

    /**
     * Lookup mappings, if the strong number is there, then it is used. Otherwise the lexicon definition is
     * consulted: first for alternative taggings that do have a word in this passage, and then for a gloss, which is
     * returned prefixed with '#'.
     *
     * @param osisKey the OSIS key
     * @param strong  the strong
     * @return the string, or the empty string if nothing could be found
     */
    private String lookupMappings(final Key osisKey, final String strong) {
        final boolean isOT = this.testament == Testament.OLD;

        // we ignore mapping lookups for anything greek or hebrew...
        // currently only supporting OLD Testament
        if (!this.originalLanguage && isOT) {
            final String direct =
                    this.hebrewDirectMapping == null ? null : this.hebrewDirectMapping.get(strong);
            if (direct != null) {
                return direct;
            }

            final String indirect =
                    this.hebrewIndirectMappings == null ? null : this.hebrewIndirectMappings.get(strong);
            if (indirect != null) {
                return indirect;
            }
        }

        //else look up from vocab provider
        final String key = (isOT ? "H" : "G") + strong;
        final String reference = osisKey.getOsisID();
        final EntityDoc[] strongDefinition =
                this.vocabularyService.getLexiconDefinitions(key, this.masterVersion, reference);

        //only examine the first one...
        if (strongDefinition == null || strongDefinition.length == 0) {
            return "";
        }

        final String alternativeTagging = strongDefinition[0].get("alternativeTagging");
        if (StringUtils.isNotBlank(alternativeTagging)) {
            // then we look to see if we've perhaps got some more tagging around for the alternatives...
            final String[] alts = StringUtils.split(alternativeTagging, "[, ]+");
            for (final String a : alts) {
                // the first character is dropped before the lookup, so a token needs at least
                // two characters to leave anything to look up
                if (a == null || a.length() < 2) {
                    continue;
                }

                final String alternativeWord = this.getWord(osisKey, a.substring(1), false);
                if (StringUtils.isNotBlank(alternativeWord)) {
                    return alternativeWord;
                }
            }
        }

        final String englishVocab = strongDefinition[0].get("stepGloss");
        if (StringUtils.isNotBlank(englishVocab)) {
            return "#" + englishVocab;
        }
        return "";
    }

    /**
     * Retrieves the first word from the list, and removes from the list. If the word is PARTIAL, then retrieves the
     * next one too, and concatenates. Every partial word is followed by ", " in the output.
     *
     * @param list a deque containing all the items in question; must not be empty
     * @return the string
     */
    private String retrieveWord(final Deque<Word> list) {
        Word word = list.removeFirst();
        if (!word.isPartial()) {
            return word.getUntaggedText() != null ? word.getUntaggedText() + word.getText() : word.getText();
        }

        final StringBuilder text = new StringBuilder(32);
        while (word != null && word.isPartial()) {
            if (word.getUntaggedText() != null) {
                text.append(word.getUntaggedText());
            }
            text.append(word.getText());
            text.append(", ");

            // increment to next word
            word = list.pollFirst();
        }

        // append the last word
        if (word != null) {
            text.append(word.getText());
        }

        return text.toString();
    }

    /**
     * Sets up all the initial textual information for fast retrieval during XSL transformation, by walking the
     * element tree recursively. Note elements are not descended into.
     *
     * @param element      element to start with.
     * @param untaggedText the untagged text that precedes the element at the same level, or null if none
     * @return true if the element was a word element and the untagged text was therefore consumed
     */
    private boolean scanForTextualInformation(final Element element, final String untaggedText) {
        // check to see if we've hit a new verse, if so, we update the verse
        updateVerseRef(element);

        // check to see if we've hit a node of interest
        if (element.getName().equals(OSISUtil.OSIS_ELEMENT_W)) {
            extractTextualInfoFromNode(element, untaggedText);
            return true;
        }

        //small optimization to remove processing of potentially verbose notes
        if (element.getName().equals(OSISUtil.OSIS_ELEMENT_NOTE)) {
            return false;
        }

        // iterate through all children and call recursively
        StringBuilder untaggedContent = null;
        for (final Content data : element.getContent()) {
            if (data instanceof Text) {
                //we capture untagged content at the same level as the elements that we process
                if (untaggedContent == null) {
                    untaggedContent = new StringBuilder(32);
                }
                untaggedContent.append(((Text) data).getText());
            } else if (data instanceof Element) {
                final Element child = (Element) data;
                if (untaggedContent == null) {
                    scanForTextualInformation(child, null);
                } else if (scanForTextualInformation(child, untaggedContent.toString())) {
                    //we've consumed the untagged content, so remove it now
                    untaggedContent = null;
                }
            }
        }
        return false;
    }

    /**
     * Updates the verse we are currently in, if the element is a verse marker carrying an OSIS id that can be
     * parsed. Otherwise the current verse is left untouched.
     *
     * @param element the osis element
     */
    private void updateVerseRef(final Element element) {
        if (!OSISUtil.OSIS_ELEMENT_VERSE.equals(element.getName())) {
            return;
        }

        final String osisId = element.getAttributeValue(OSISUtil.OSIS_ATTR_OSISID);
        if (osisId == null) {
            return;
        }

        try {
            this.currentVerse = VerseFactory.fromString(this.versification, osisId);
        } catch (final NoSuchVerseException ex) {
            LOGGER.trace("Unable to convert ref - probably not a verse reference.", ex);
        }
    }

    /**
     * retrieves textual information and adds it to the provider.
     *
     * @param element         the element to extract information from
     * @param untaggedContent the untagged text preceding the element, or null if none
     */
    private void extractTextualInfoFromNode(final Element element, final String untaggedContent) {
        final String strong = element.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
        final String word = getText(element);

        // do we need to do any manipulation? probably not because we are going to be
        // comparing against other OSIS XML texts which should be formatted in the same way!

        // however, some attributes may contain multiple strongs and morphs tagged to one word.
        // therefore we do need to split the text.
        final String[] strongs = split(strong);
        if (strongs == null) {
            return;
        }

        // there is no way of know which strong goes with which morph, and we only
        // have one phrase anyway
        final List<Word> words = new ArrayList<Word>(2);
        boolean partial = false;
        for (final String rawStrong : strongs) {
            final String strongKey = getAnyKey(rawStrong);
            if (isH00(strongKey) || blacklisted(strongKey)) {
                partial = true;
            } else {
                words.add(addTextualInfo(this.currentVerse, strongKey, word, untaggedContent));
            }
        }

        // a skipped strong number means the words stored for this element are marked as partial
        if (partial) {
            for (final Word w : words) {
                w.setPartial(true);
            }
        }
    }

    /**
     * Gets the text of the element and its children
     *
     * @param element the element
     * @return the text
     */
    private String getText(final Element element) {
        // can contain <a> and <seg>, both of which we need to output
        final StringBuilder sb = new StringBuilder(32);
        getTextRecurively(sb, element);
        return sb.toString();
    }

    /**
     * Gets the text recursively. Elements whose lower-cased name is not in {@link #VALID_TEXT_ELEMENTS} are
     * skipped, together with everything they contain.
     *
     * @param sb      the buffer the text is appended to
     * @param content the content to read the text from
     */
    private void getTextRecurively(final StringBuilder sb, final Content content) {
        if (content instanceof Text) {
            sb.append(((Text) content).getText());
            return;
        }

        if (content instanceof Element) {
            final Element element = (Element) content;

            // we only consider some elements
            if (!VALID_TEXT_ELEMENTS.contains(element.getName().toLowerCase(Locale.ROOT))) {
                return;
            }

            // iterate through all children
            for (final Content c : element.getContent()) {
                getTextRecurively(sb, c);
            }
        }
    }

    /**
     * Blacklisted, if the word is contained in a direct mapping for the relevant testament
     *
     * @param strongKey the strong key
     * @return true, if the passage is in the Old Testament and a direct mapping exists for the key
     */
    private boolean blacklisted(final String strongKey) {
        return this.testament == Testament.OLD
                && this.hebrewDirectMapping != null
                && this.hebrewDirectMapping.containsKey(strongKey);
    }

    /**
     * Checks whether the strong key is made up exclusively of '0' characters.
     *
     * @param currentStrong a strong key
     * @return true, if every character of the key is '0' (which is also the case for an empty key)
     */
    private boolean isH00(final String currentStrong) {
        for (int ii = 0; ii < currentStrong.length(); ii++) {
            if (currentStrong.charAt(ii) != '0') {
                return false;
            }
        }
        return true;
    }

    /**
     * Finally, we have some information to add to this provider. We try and add it in an efficient fashion.
     * <p/>
     * So, how do we store this? The most meaningful piece of data is a STRONG number, since it identifies the word that
     * we want to retrieve. Without the strong number, we don't have any information at all. Therefore, the first level
     * of lookup should be by Strong number.
     * <p/>
     * Made package private for testing purposes only.
     *
     * @param verseReference  the verse reference that specifies locality (least important factor); if null, the
     *                        word is stored under {@link #NO_VERSE}
     * @param strongKey       the strong number (identifies the root/meaning of the word)
     * @param word            the word to be stored
     * @param untaggedContent the untagged text preceding the word, or null if none
     * @return the word that has been added
     */
    Word addTextualInfo(final Verse verseReference, final String strongKey, final String word,
                        final String untaggedContent) {
        final String verseId =
                verseReference == null ? NO_VERSE : verseReference.getOsisIDNoSubIdentifier();
        final DualKey<String, String> strongVerseKey = new DualKey<String, String>(strongKey, verseId);

        Deque<Word> verseKeyedStrongs = this.limitedAccuracy.get(strongVerseKey);
        if (verseKeyedStrongs == null) {
            verseKeyedStrongs = new ArrayDeque<Word>();
            this.limitedAccuracy.put(strongVerseKey, verseKeyedStrongs);
        }

        final Word w = new Word(word, untaggedContent);
        verseKeyedStrongs.add(w);
        return w;
    }

    /**
     * Sets the testament, to be used to determine the indirect/direct mappings to use when generating the interlinear.
     * The testament is that of the first verse of the passage, in the versification named by the current book's
     * metadata.
     *
     * @param key the key to the passage being looked up
     */
    private void setTestamentType(final Key key) {
        final Versification v11n = Versifications.instance().getVersification(
                (String) this.currentBook.getBookMetaData().getProperty(BookMetaData.KEY_VERSIFICATION));

        final Passage passage = KeyUtil.getPassage(key);
        this.testament = v11n.getTestament(v11n.getOrdinal(passage.getVerseAt(0)));
    }

    /**
     * @param currentBook the currentBook to set
     */
    void setCurrentBook(final Book currentBook) {
        this.currentBook = currentBook;
    }

    /**
     * @param vocabService sets the vocab service
     */
    void setVocabProvider(final VocabularyService vocabService) {
        this.vocabularyService = vocabService;
    }

    /**
     * @return true if the provider was set up for a version but found no tagged words in the passage
     */
    @Override
    public boolean isDisabled() {
        return this.disabled;
    }
}