/*******************************************************************************
* Copyright (c) 2012, Directors of the Tyndale STEP Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package com.tyndalehouse.step.core.xsl.impl;
import com.tyndalehouse.step.core.data.EntityDoc;
import com.tyndalehouse.step.core.exceptions.StepInternalException;
import com.tyndalehouse.step.core.service.VocabularyService;
import com.tyndalehouse.step.core.service.jsword.JSwordVersificationService;
import com.tyndalehouse.step.core.utils.JSwordUtils;
import com.tyndalehouse.step.core.utils.StringConversionUtils;
import com.tyndalehouse.step.core.utils.StringUtils;
import com.tyndalehouse.step.core.xsl.InterlinearProvider;
import org.crosswire.jsword.book.*;
import org.crosswire.jsword.passage.*;
import org.crosswire.jsword.versification.Testament;
import org.crosswire.jsword.versification.Versification;
import org.crosswire.jsword.versification.VersificationsMapper;
import org.crosswire.jsword.versification.system.Versifications;
import org.jdom2.Content;
import org.jdom2.Element;
import org.jdom2.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import static com.tyndalehouse.step.core.utils.StringConversionUtils.getAnyKey;
import static com.tyndalehouse.step.core.utils.StringUtils.*;
import static java.lang.String.format;
/**
* This object is not intended to be used as a singleton. It builds up textual information on initialisation, and is
* specific to a single request. On initialisation, the OSIS XML is retrieved and iterated through to find all
* strong/morph candidates.
*
* @author chrisburrell
*/
public class InterlinearProviderImpl implements InterlinearProvider {
/**
 * Placeholder used instead of an OSIS id when a word is not attached to a verse, or when the verse number is 0.
 */
public static final String NO_VERSE = "NO_VERSE";
/**
* The Constant LOGGER.
*/
private static final Logger LOGGER = LoggerFactory.getLogger(InterlinearProviderImpl.class);
/**
* contains the set of tags that may contain biblical text, all lower case
*/
private static final Set<String> VALID_TEXT_ELEMENTS = new HashSet<String>();
/**
* limited accuracy tries to do a location look up by using the verse number as part of the key. Each entry maps a
* (strong number, verse OSIS id or NO_VERSE) pair to the words scanned for that pair, in the order they were found.
*/
private final Map<DualKey<String, String>, Deque<Word>> limitedAccuracy = new HashMap<DualKey<String, String>, Deque<Word>>();
// true when JSwordUtils.isAncientBook reports the scanned book as ancient; false when no version was supplied
private final boolean originalLanguage;
// set at the end of construction: true when scanning the passage produced no entries in limitedAccuracy
private boolean disabled = false;
// versification of the book being scanned, as returned by the versification service
private Versification versification;
// a temporary, non-thread-safe, transient, working variable, which keeps track of the verse we're in.
private Verse currentVerse;
// the book resolved from the version passed to the constructor (or injected through setCurrentBook)
private Book currentBook;
// strong number -> text; consulted first by lookupMappings and used by blacklisted() for Old Testament passages
private Map<String, String> hebrewDirectMapping;
// strong number -> text; consulted by lookupMappings when no direct mapping exists
private Map<String, String> hebrewIndirectMappings;
// testament of the first verse of the requested key, see setTestamentType
private Testament testament;
// version passed to the vocabulary service when looking up lexicon definitions
private String masterVersion;
// versification used to parse the verse references handed to getWord(String, String, String)
private Versification masterVersification;
// source of lexicon definitions used as a fall-back when no word is found in the scanned text
private VocabularyService vocabularyService;
// when true (and stripVowels is false), results are passed through StringConversionUtils.unAccentLeavingVowels
private boolean stripAccents = false;
// when true, results are passed through StringConversionUtils.unAccent; only set for ancient Hebrew books
private boolean stripVowels = false;
// Every entry MUST be lower case: element names are lower-cased before being looked up in this set,
// so a mixed-case entry (previously "transChange") could never match.
static {
    VALID_TEXT_ELEMENTS.add("divinename");
    VALID_TEXT_ELEMENTS.add("a");
    VALID_TEXT_ELEMENTS.add("foreign");
    VALID_TEXT_ELEMENTS.add("hi");
    VALID_TEXT_ELEMENTS.add("name");
    VALID_TEXT_ELEMENTS.add("q");
    VALID_TEXT_ELEMENTS.add("w");
    VALID_TEXT_ELEMENTS.add("seg");
    VALID_TEXT_ELEMENTS.add("transchange");
    VALID_TEXT_ELEMENTS.add("doxology");
    VALID_TEXT_ELEMENTS.add("colophon");
    VALID_TEXT_ELEMENTS.add("refrain");
    VALID_TEXT_ELEMENTS.add("attribution");
}
/**
 * Sets up the interlinear provider with the correct version and text scope, then scans the OSIS fragment of the
 * requested passage to index every tagged word by strong number and verse.
 * <p/>
 * If {@code version} is blank, nothing is scanned and the provider is left without any book, versification or
 * mappings.
 *
 * @param masterVersion          the version passed to the vocabulary service when looking up lexicon definitions
 * @param masterVersification    the versification used to parse verse references given to
 *                               {@link #getWord(String, String, String)}
 * @param versificationService   versification service, used to resolve the book and its versification
 * @param version                the version to use to set up the interlinear
 * @param versifiedKey           the text scope reference, defining the bounds of the lookup
 * @param hebrewDirectMapping    the hebrew overriding mappings
 * @param hebrewIndirectMappings the mappings used if no other mapping is found
 * @param vocabProvider          the vocabulary service used as a fall-back when no word is found in the text
 * @param stripGreekAccents      true to strip accents from the output when the book is an ancient Greek text
 * @param stripHebrewAccents     true to strip accents from the output when the book is an ancient Hebrew text
 * @param stripVowels            true to also strip vowels; only effective for ancient Hebrew texts whose accents
 *                               are being stripped
 * @throws StepInternalException if the book cannot be resolved from {@code version}, or its text cannot be read
 */
public InterlinearProviderImpl(final String masterVersion, Versification masterVersification, JSwordVersificationService versificationService,
                               final String version, final Key versifiedKey, final Map<String, String> hebrewDirectMapping,
                               final Map<String, String> hebrewIndirectMappings, final VocabularyService vocabProvider,
                               boolean stripGreekAccents, boolean stripHebrewAccents, boolean stripVowels) {
    this.masterVersion = masterVersion;
    this.masterVersification = masterVersification;
    this.vocabularyService = vocabProvider;

    // first check whether the values passed in are correct
    if (areAnyBlank(version)) {
        this.originalLanguage = false;
        return;
    }

    this.hebrewIndirectMappings = hebrewIndirectMappings;
    this.hebrewDirectMapping = hebrewDirectMapping;

    this.currentBook = versificationService.getBookFromVersion(version);
    // validate the book BEFORE handing it to any other service, so that an unknown version is reported
    // with a meaningful message rather than failing somewhere inside the versification lookup
    if (this.currentBook == null) {
        throw new StepInternalException(format("Couldn't look up book: [%s]", version));
    }
    this.versification = versificationService.getVersificationForVersion(currentBook);

    //mark the book as original language
    this.originalLanguage = JSwordUtils.isAncientBook(currentBook);
    final boolean ancientHebrewBook = JSwordUtils.isAncientHebrewBook(currentBook);
    this.stripAccents = (stripGreekAccents && JSwordUtils.isAncientGreekBook(currentBook))
            || (stripHebrewAccents && ancientHebrewBook);
    this.stripVowels = ancientHebrewBook && this.stripAccents && stripVowels;

    try {
        setTestamentType(versifiedKey);
        final BookData bookData = getBookDataWithVerse0(versifiedKey);
        scanForTextualInformation(bookData.getOsisFragment(), null);
    } catch (final BookException e) {
        throw new StepInternalException(e.getMessage(), e);
    }

    // nothing was indexed, so there is nothing this provider can contribute
    this.disabled = this.limitedAccuracy.isEmpty();
}
/**
* package private version for testing purposes. No book is loaded and no text is scanned: the provider is marked
* as not being an original language text, and every other field keeps its default value.
*/
InterlinearProviderImpl() {
// exposing package private constructor
this.originalLanguage = false;
}
/**
 * Builds the book data for the requested key, special-casing a request for a single verse that maps to verse 0.
 * Looking up verse 0 directly does not return the pre-verse content, which sits in verse 1, so in that one case the
 * key is swapped for verse 1 of the same chapter. Every other key (verse 1, verses 0-1, ranges...) is used as is.
 *
 * @param versifiedKey the key from the original versification
 * @return a bookdata with the correct verse
 */
private BookData getBookDataWithVerse0(final Key versifiedKey) {
    final Iterator<Key> verses = versifiedKey.iterator();
    final Verse first = (Verse) verses.next();

    // anything other than exactly one verse is looked up unchanged
    if (first == null || verses.hasNext()) {
        return new BookData(this.currentBook, versifiedKey);
    }

    // a single verse: check whether it lands on verse 0 in the target versification
    final VerseKey mapped = VersificationsMapper.instance().mapVerse(first, this.versification);
    if (mapped.getCardinality() != 1) {
        return new BookData(this.currentBook, versifiedKey);
    }

    final Verse target = (Verse) mapped.iterator().next();
    if (target.getVerse() != 0) {
        return new BookData(this.currentBook, versifiedKey);
    }

    final Verse followingVerse = new Verse(this.versification, target.getBook(), target.getChapter(),
            target.getVerse() + 1);
    return new BookData(this.currentBook, followingVerse);
}
/**
 * Looks up the text matching each of the given strong numbers in the given verse and joins the distinct results,
 * in lookup order, with a single space.
 *
 * @param verseNumber the verse reference, expressed in the master versification; may be null
 * @param strong      one or more strong numbers; a blank value yields the empty string
 * @param morph       the morphology, only used for logging
 * @return the matching text, or the empty string if the strong is blank or the verse cannot be parsed
 */
@Override
public String getWord(final String verseNumber, final String strong, final String morph) {
    LOGGER.trace("Retrieving word for verse [{}], strong [{}], morph [{}]", verseNumber,
            strong, morph);

    // without a strong number we have no information to go on
    if (isBlank(strong)) {
        return "";
    }

    // the value passed in may hold several strong numbers, each needing its own lookup
    final String[] strongNumbers = StringUtils.split(strong);

    // parse the verse in the master versification, then convert it to the versification of the scanned book
    final Key mappedKey;
    try {
        if (verseNumber == null) {
            mappedKey = null;
        } else {
            final Verse masterVerse = VerseFactory.fromString(this.masterVersification, verseNumber);
            mappedKey = VersificationsMapper.instance().mapVerse(masterVerse, this.versification);
        }
    } catch (NoSuchVerseException e) {
        LOGGER.error(e.getMessage(), e);
        return "";
    }

    // a linked hash set gives us de-duplication while adding, and insertion order when reading back
    final Set<String> matches = new LinkedHashSet<String>();
    for (final String candidate : strongNumbers) {
        LOGGER.debug("Finding strong key [{}]", candidate);
        matches.add(getWord(mappedKey, getAnyKey(candidate), true));
    }
    return convertToString(matches);
}
/**
 * Joins the given results with a single space, in iteration order, and then strips accents and/or vowels from the
 * joined text according to the flags set on this provider.
 *
 * @param results the results that should be converted to a string
 * @return a String containing results to be displayed
 */
private String convertToString(final Set<String> results) {
    final StringBuilder joined = new StringBuilder(results.size() * 16);

    // nothing goes before the first element, a space goes before every following one
    String separator = "";
    for (final String result : results) {
        joined.append(separator).append(result);
        separator = " ";
    }

    final String actualText = joined.toString();
    if (stripVowels) {
        return StringConversionUtils.unAccent(actualText);
    }
    if (stripAccents) {
        return StringConversionUtils.unAccentLeavingVowels(actualText);
    }
    return actualText;
}
/**
 * Returns words based on strong and verse number only.
 * <p/>
 * Each verse making up {@code equivalentVerses} is tried in turn, and the first one for which a word is still
 * available wins. Verse 0, and a null key, are looked up under {@link #NO_VERSE}. Words are indexed by their OSIS
 * id <em>without</em> sub-identifier, so the lookup key is built the same way.
 *
 * @param equivalentVerses the verse(s) to look in, or null to look only at words that have no verse
 * @param strong           the strong reference
 * @param followMapping    true to indicate we should follow the mappings, false to indicate we are already
 *                         following mappings and therefore want to prevent an infinite loop!
 * @return a word that matches or the empty string
 */
String getWord(final Key equivalentVerses, final String strong, final boolean followMapping) {
    if (strong == null) {
        // it is important to return an empty string here
        return "";
    }

    if (equivalentVerses == null) {
        // then we know we have a null verse, so assume we're in pre-verse mode...
        final String preVerseWord = takeWord(strong, NO_VERSE);
        return preVerseWord != null ? preVerseWord : "";
    }

    // Key may be made up of several keys
    final Iterator<Key> keyIterator = equivalentVerses.iterator();
    while (keyIterator.hasNext()) {
        final Verse v = (Verse) keyIterator.next();
        // must mirror the key built when indexing, which never carries a sub-identifier
        final String osisID = v.getVerse() == 0 ? NO_VERSE : v.getOsisIDNoSubIdentifier();
        final String found = takeWord(strong, osisID);
        if (found != null) {
            return found;
        }
    }

    if (followMapping) {
        return lookupMappings(equivalentVerses, strong);
    }

    // it is important to return an empty string here
    return "";
}

/**
 * Consumes the next word indexed under the given strong number and verse identifier.
 *
 * @param strong  the strong number
 * @param verseId the verse OSIS id, or {@link #NO_VERSE}
 * @return the word, or null if no word is (or remains) indexed under that key
 */
private String takeWord(final String strong, final String verseId) {
    final Deque<Word> candidates = this.limitedAccuracy.get(new DualKey<String, String>(strong, verseId));
    if (candidates == null || candidates.isEmpty()) {
        return null;
    }
    return retrieveWord(candidates);
}
/**
 * Fall-back lookup used when no word was found in the scanned text for the given strong number.
 * <ol>
 * <li>For an Old Testament passage in a book that is not an original language text, the direct and then the
 * indirect hebrew mappings are consulted.</li>
 * <li>Otherwise the lexicon definition is retrieved; each of its alternative taggings is tried against the
 * scanned text (without following mappings again).</li>
 * <li>Finally the definition's gloss is returned, prefixed with '#'.</li>
 * </ol>
 *
 * @param osisKey the OSIS key
 * @param strong  the strong, without its H/G prefix
 * @return the text found, or the empty string
 */
private String lookupMappings(final Key osisKey, final String strong) {
    final boolean isOT = this.testament == Testament.OLD;

    // we ignore mapping lookups for anything greek or hebrew, and currently only support the OLD Testament
    if (!originalLanguage && isOT) {
        final String direct = this.hebrewDirectMapping.get(strong);
        if (direct != null) {
            return direct;
        }

        final String indirect = this.hebrewIndirectMappings.get(strong);
        if (indirect != null) {
            return indirect;
        }
    }

    //else look up from vocab provider
    final String key = isOT ? 'H' + strong : 'G' + strong;
    final String reference = osisKey.getOsisID();
    final EntityDoc[] strongDefinition = this.vocabularyService.getLexiconDefinitions(key, this.masterVersion, reference);
    if (strongDefinition == null || strongDefinition.length == 0) {
        return "";
    }

    //only examine the first one...
    final EntityDoc firstDefinition = strongDefinition[0];
    final String alternativeTagging = firstDefinition.get("alternativeTagging");
    if (StringUtils.isNotBlank(alternativeTagging)) {
        // then we look to see if we've perhaps got some more tagging around for the alternatives...
        final String[] alts = StringUtils.split(alternativeTagging, "[, ]+");
        if (alts != null) {
            for (final String a : alts) {
                // an alternative needs a prefix letter plus at least one character; splitting may also
                // leave empty tokens behind, on which substring(1) would throw
                if (a == null || a.length() < 2) {
                    continue;
                }
                final String alternativeWord = this.getWord(osisKey, a.substring(1), false);
                if (StringUtils.isNotBlank(alternativeWord)) {
                    return alternativeWord;
                }
            }
        }
    }

    final String englishVocab = firstDefinition.get("stepGloss");
    if (StringUtils.isNotBlank(englishVocab)) {
        return "#" + englishVocab;
    }
    return "";
}
/**
 * Retrieves the first word from the list, and removes it from the list. If the word is PARTIAL, then the following
 * words are retrieved (and removed) too, up to and including the first one that is not partial, and everything is
 * concatenated with ", " between the words.
 * <p/>
 * If the list runs out while the words are still partial, the text gathered so far is returned as is, without a
 * dangling separator.
 *
 * @param list a deque containing all the items in question; must not be empty
 * @return the string
 */
private String retrieveWord(final Deque<Word> list) {
    Word word = list.removeFirst();
    if (!word.isPartial()) {
        return word.getUntaggedText() != null ? word.getUntaggedText() + word.getText() : word.getText();
    }

    final StringBuilder text = new StringBuilder(32);
    while (word != null && word.isPartial()) {
        if (word.getUntaggedText() != null) {
            text.append(word.getUntaggedText());
        }
        text.append(word.getText());

        // increment to next word, and only separate if there actually is one
        word = list.pollFirst();
        if (word != null) {
            text.append(", ");
        }
    }

    // append the last word
    // NOTE(review): unlike the other paths, the untagged text of this closing word is not output -
    // confirm whether that is intentional
    if (word != null) {
        text.append(word.getText());
    }
    return text.toString();
}
/**
 * Walks the OSIS tree depth-first, setting up all the initial textual information for fast retrieval during the XSL
 * transformation. Verse markers update the current verse, word elements are indexed, and note elements are skipped
 * altogether. Text found directly between child elements is gathered and handed to the next child, and is dropped
 * once a word element has consumed it.
 *
 * @param element      element to start with.
 * @param untaggedText the untagged text that preceded this element within its parent, or null if there was none
 * @return true if the element was a word element (which consumes the untagged text), false otherwise
 */
@SuppressWarnings("unchecked")
private boolean scanForTextualInformation(final Element element, final String untaggedText) {
    // check to see if we've hit a new verse, if so, we update the verse
    updateVerseRef(element);

    final String elementName = element.getName();

    // a node of interest: index it and stop descending
    if (elementName.equals(OSISUtil.OSIS_ELEMENT_W)) {
        extractTextualInfoFromNode(element, untaggedText);
        return true;
    }

    //small optimization to remove processing of potentially verbose notes
    if (elementName.equals(OSISUtil.OSIS_ELEMENT_NOTE)) {
        return false;
    }

    // untagged text seen at this level since the last word element, if any
    StringBuilder pendingText = null;
    for (final Content child : element.getContent()) {
        if (child instanceof Text) {
            if (pendingText == null) {
                pendingText = new StringBuilder(32);
            }
            pendingText.append(((Text) child).getText());
        } else if (child instanceof Element) {
            final String preceding = pendingText == null ? null : pendingText.toString();
            if (scanForTextualInformation((Element) child, preceding)) {
                //we've consumed the untagged content, so remove it now
                pendingText = null;
            }
        }
    }
    return false;
}
/**
 * Updates the verse currently being scanned if the given element is a verse marker carrying an OSIS id that can be
 * parsed in the book's versification. In every other case the current verse is left untouched.
 *
 * @param element the osis element
 */
private void updateVerseRef(final Element element) {
    if (!OSISUtil.OSIS_ELEMENT_VERSE.equals(element.getName())) {
        return;
    }

    final String osisId = element.getAttributeValue(OSISUtil.OSIS_ATTR_OSISID);
    if (osisId == null) {
        return;
    }

    try {
        this.currentVerse = VerseFactory.fromString(this.versification, osisId);
    } catch (final NoSuchVerseException ex) {
        LOGGER.trace("Unable to convert ref - probably not a verse reference.", ex);
    }
}
/**
 * Indexes the text of a word element under each of the strong numbers found in its lemma attribute, against the
 * verse currently being scanned. Strong numbers made up only of zeros, and blacklisted ones, are not indexed;
 * when at least one was skipped, every word indexed for this element is flagged as partial.
 *
 * @param element         the element to extract information from
 * @param untaggedContent the untagged text that preceded the element, or null
 */
private void extractTextualInfoFromNode(final Element element, final String untaggedContent) {
    // the attribute may hold multiple strongs (and morphs) tagged to the one word, so it needs splitting
    final String[] strongs = split(element.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA));
    if (strongs == null) {
        return;
    }

    // no other manipulation is done on the text, as it is compared against other OSIS texts
    // which should be formatted in the same way
    final String word = getText(element);

    // there is no way of knowing which strong goes with which morph, and we only have one phrase anyway
    final List<Word> indexed = new ArrayList<Word>(2);
    boolean skippedAny = false;
    for (final String rawStrong : strongs) {
        final String strongKey = getAnyKey(rawStrong);
        if (isH00(strongKey) || blacklisted(strongKey)) {
            skippedAny = true;
            continue;
        }
        indexed.add(addTextualInfo(currentVerse, strongKey, word, untaggedContent));
    }

    if (skippedAny) {
        for (final Word partialWord : indexed) {
            partialWord.setPartial(true);
        }
    }
}
/**
 * Collects the text of the element together with the text of those descendants that may carry biblical text
 * (for example &lt;a&gt; and &lt;seg&gt;, both of which need to be output).
 *
 * @param element the element
 * @return the text
 */
private String getText(final Element element) {
    final StringBuilder collected = new StringBuilder(32);
    getTextRecurively(collected, element);
    return collected.toString();
}
/**
 * Appends the text of the given content to the builder. Text nodes are appended as they are; elements are descended
 * into only if their lower-cased name is one of the valid text elements, and are skipped entirely otherwise.
 * <p/>
 * The name is lower-cased with {@link Locale#ROOT} so that the match does not depend on the default locale of the
 * JVM (in a Turkish locale, for instance, 'I' does not lower-case to 'i').
 *
 * @param sb      the builder receiving the text
 * @param content the content to read the text from
 */
private void getTextRecurively(final StringBuilder sb, final Content content) {
    if (content instanceof Text) {
        sb.append(((Text) content).getText());
        return;
    }

    if (content instanceof Element) {
        final Element element = (Element) content;

        // we only consider some elements
        if (!VALID_TEXT_ELEMENTS.contains(element.getName().toLowerCase(Locale.ROOT))) {
            return;
        }

        // iterate through all children
        for (final Content c : element.getContent()) {
            getTextRecurively(sb, c);
        }
    }
}
/**
 * Tells whether a strong number is blacklisted, i.e. whether the passage is in the Old Testament and the strong
 * number has an entry in the direct hebrew mappings.
 *
 * @param strongKey the strong key
 * @return true if the strong number is blacklisted
 */
private boolean blacklisted(final String strongKey) {
    if (this.testament != Testament.OLD) {
        return false;
    }
    return this.hebrewDirectMapping.containsKey(strongKey);
}
/**
 * Checks whether the given strong number consists of nothing but the digit 0. An empty string qualifies too.
 *
 * @param currentStrong a strong number, with any prefix letter already removed
 * @return true if every character is '0', which indicates that the strong numbers go with their next occurrence
 */
private boolean isH00(final String currentStrong) {
    for (final char digit : currentStrong.toCharArray()) {
        if (digit != '0') {
            return false;
        }
    }
    return true;
}
/**
 * Records a word in this provider, so that it can later be retrieved by strong number and verse.
 * <p/>
 * The strong number is the most meaningful piece of data, since it identifies the word to retrieve, so it forms the
 * first half of the lookup key. The second half is the OSIS id of the verse (without sub-identifier), or
 * {@link #NO_VERSE} when there is no verse. Words sharing a key are queued in the order they are added.
 * <p/>
 * Made package private for testing purposes only.
 *
 * @param verseReference  the verse reference that specifies locality (least important factor), may be null
 * @param strongKey       the strong number (identifies the root/meaning of the word)
 * @param word            the word to be stored
 * @param untaggedContent the untagged text that preceded the word, or null
 * @return the word that has been added
 */
Word addTextualInfo(final Verse verseReference, final String strongKey, final String word, final String untaggedContent) {
    final String verseId;
    if (verseReference == null) {
        verseId = NO_VERSE;
    } else {
        verseId = verseReference.getOsisIDNoSubIdentifier();
    }

    final DualKey<String, String> lookupKey = new DualKey<String, String>(strongKey, verseId);
    Deque<Word> bucket = this.limitedAccuracy.get(lookupKey);
    if (bucket == null) {
        bucket = new LinkedList<>();
        this.limitedAccuracy.put(lookupKey, bucket);
    }

    final Word entry = new Word(word, untaggedContent);
    bucket.add(entry);
    return entry;
}
/**
 * Works out the testament of the first verse of the passage, using the versification named in the current book's
 * metadata. The testament determines the indirect/direct mappings to use when generating the interlinear.
 *
 * @param key the key to the passage being looked up
 */
private void setTestamentType(final Key key) {
    final BookMetaData metaData = this.currentBook.getBookMetaData();
    final String versificationName = (String) metaData.getProperty(BookMetaData.KEY_VERSIFICATION);
    final Versification v11n = Versifications.instance().getVersification(versificationName);

    final Verse firstVerse = KeyUtil.getPassage(key).getVerseAt(0);
    final int ordinal = v11n.getOrdinal(firstVerse);
    this.testament = v11n.getTestament(ordinal);
}
/**
* Replaces the book used by this provider. Package private; only the field is updated, nothing is re-scanned.
*
* @param currentBook the currentBook to set
*/
void setCurrentBook(final Book currentBook) {
this.currentBook = currentBook;
}
/**
* Replaces the vocabulary service used for fall-back lexicon lookups. Package private.
*
* @param vocabService sets the vocab service
*/
void setVocabProvider(final VocabularyService vocabService) {
this.vocabularyService = vocabService;
}
/**
* @return true if scanning the passage during construction produced no indexed words
*/
@Override
public boolean isDisabled() {
return disabled;
}
}