//CHECKSTYLE:OFF
package com.tyndalehouse.step.tools.analysis;
import static com.tyndalehouse.step.core.utils.StringUtils.isBlank;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import org.crosswire.jsword.book.Book;
import org.crosswire.jsword.book.BookData;
import org.crosswire.jsword.book.Books;
import org.crosswire.jsword.book.OSISUtil;
import org.crosswire.jsword.passage.Key;
import org.jdom2.Content;
import org.jdom2.Element;
import org.jdom2.Text;
import org.jdom2.filter.AttributeFilter;
import org.jdom2.filter.ElementFilter;
import org.jdom2.filter.Filter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A tool to analyse the frequency of each word in the Bible
*
* @author chrisburrell
*
*/
@SuppressWarnings("unchecked")
public class BibleAnalysis {
/**
 * Characters removed from a token before it is stored as a word. Compiled once for the class rather than
 * once per instance.
 */
static final Pattern punctuation = Pattern.compile("[ *0-9:.<>,!\";]+");
/** Logger shared by every instance of the tool. */
static final Logger LOGGER = LoggerFactory.getLogger(BibleAnalysis.class);
/** Key range handed to the book when a text is loaded. */
private static final String SCOPE = "Gen-Mal";
/** osisID of the verse currently being walked while a text is read. */
private String currentVerse;
/** Running position counter inside the verse currently being walked. */
private int currentPosition;
/** Words of the source text, keyed by the word itself. */
private Map<String, List<Word>> sourceWords;
/** Words of the target text, keyed by the word itself. */
private Map<String, List<Word>> targetWords;
/** Multi-word text nodes of the target text, keyed by the whole phrase. */
private Map<String, List<Word>> targetPhrases;
/** Words of the target text grouped by the osisID of their verse. */
private Map<String, List<Word>> targetVerses;
/** Handed to the reading step for the target text; not filled by the code in this class. */
private Map<String, Integer> targetWordCounts;
/** Per-word statistics of the target text. */
private Analysis targetAnalysis;
/** The solution being built: one tagged verse per verse reference. */
private TaggedVersion taggedVersion;
/** Per-strong-number statistics of the source text. */
private Analysis strongAnalysis;
// private TreeSet<WordCount> sourceKeyOrder;
// private TreeSet<WordCount> targetKeyOrder;
/**
 * Command-line entry point: runs the analysis with "OSMHB" as the source text and "ESV-THE" as the
 * target text.
 *
 * @param args ignored
 * @throws Exception if reading or analysing either text fails
 */
public static void main(final String[] args) throws Exception {
    final BibleAnalysis analysis = new BibleAnalysis();
    analysis.read("OSMHB", "ESV-THE");
}
/**
 * Runs the complete analysis: reads the source and the target text, builds the per-strong analysis of the
 * source, tries to find an exact match for every strong number and finally logs the solution.
 *
 * @param initials initials of the source book (the text carrying the strong numbers)
 * @param targetLanguage initials of the target book whose words are to be matched
 * @throws Exception if either book cannot be read
 */
public void read(final String initials, final String targetLanguage) throws Exception {
    this.sourceWords = new HashMap<String, List<Word>>();
    this.targetWords = new HashMap<String, List<Word>>();
    this.targetPhrases = new HashMap<String, List<Word>>();
    this.targetVerses = new HashMap<String, List<Word>>();
    this.targetWordCounts = new HashMap<String, Integer>();

    // Fix: 'initials' used to be ignored, so sourceWords stayed empty and the strong analysis below
    // never had anything to work on. Only the word map of the source is needed afterwards, hence the
    // throw-away maps for phrases, verses and counts.
    read(initials, this.sourceWords, new HashMap<String, List<Word>>(),
            new HashMap<String, List<Word>>(), new HashMap<String, Integer>());

    this.targetAnalysis = read(targetLanguage, this.targetWords, this.targetPhrases, this.targetVerses,
            this.targetWordCounts);
    this.strongAnalysis = sourceToStrongAnalysis();
    prepareSolution();

    // Strongs for which no candidate was found; collected for later passes, not used yet.
    final List<AnalyzedWord> remainingStrongs = new ArrayList<AnalyzedWord>(
            this.strongAnalysis.analyzedWords.size() / 2);
    // every call removes exactly one entry from the list, so the loop terminates
    while (this.strongAnalysis.analyzedWords.size() != 0) {
        final AnalyzedWord processAnalyzedStrong = processAnalyzedStrong();
        if (processAnalyzedStrong != null) {
            remainingStrongs.add(processAnalyzedStrong);
        }
    }
    printSolution();
}
/**
 * Logs, verse by verse, every exact match that was recorded. The matches of a verse are sorted by strong
 * number (in place) so that all words matched to the same strong number are listed under one heading.
 */
public void printSolution() {
    final Comparator<ExactMatch> byStrongNumber = new Comparator<ExactMatch>() {
        @Override
        public int compare(final ExactMatch left, final ExactMatch right) {
            return left.strongNumber.compareTo(right.strongNumber);
        }
    };

    for (final Entry<String, TaggedVerse> verseEntry : this.taggedVersion.verses.entrySet()) {
        final String verseRef = verseEntry.getKey();
        final List<ExactMatch> matches = verseEntry.getValue().exactMatches;
        Collections.sort(matches, byStrongNumber);

        // strong number of the previous match, used to print each heading only once
        String previousStrong = null;
        for (final ExactMatch match : matches) {
            final boolean sameHeading = match.strongNumber.equals(previousStrong);
            if (!sameHeading) {
                this.LOGGER.info("[{}] - exact match for strong [{}]", verseRef, match.strongNumber);
                previousStrong = match.strongNumber;
            }

            final AnalyzedWord matched = match.word;
            final StringBuilder positions = new StringBuilder(16);
            for (final Integer position : matched.versesToPositions.get(verseRef)) {
                positions.append(position).append(' ');
            }
            this.LOGGER.info("\t Word [{}] matching [{}] in position [{}] of the verse", new Object[] {
                    matched.word, match.sourceWord, positions });
        }
    }
}
/**
 * Creates an empty solution: one tagged verse for every verse in which a strong number occurs, each one
 * listing the strong numbers found in that verse.
 */
private void prepareSolution() {
    final TaggedVersion solution = new TaggedVersion();
    this.taggedVersion = solution;

    for (final AnalyzedWord strong : this.strongAnalysis.analyzedWords) {
        for (final String verseRef : strong.verses) {
            TaggedVerse tagged = solution.verses.get(verseRef);
            if (tagged == null) {
                // first strong number seen for this verse
                tagged = new TaggedVerse();
                solution.verses.put(verseRef, tagged);
            }
            tagged.strongNumbers.add(strong.markedStrongNumber);
        }
    }
}
/**
 * Groups every word of the source text by its strong number and returns the resulting analysis, sorted
 * in ascending order of the number of different verses each strong number occurs in.
 *
 * @return the analysis of the source text, keyed by strong number
 */
private Analysis sourceToStrongAnalysis() {
    final Analysis result = new Analysis();
    result.analyzedWords = new ArrayList<AnalyzedWord>(32000);
    result.directLinks = new HashMap<String, AnalyzedWord>(8000);
    final Map<String, AnalyzedWord> byStrongNumber = result.directLinks;

    for (final List<Word> occurrences : this.sourceWords.values()) {
        for (final Word occurrence : occurrences) {
            AnalyzedWord entry = byStrongNumber.get(occurrence.strongNumber);
            if (entry != null) {
                // strong number already known: just record one more occurrence
                addVerseToAnalysis(occurrence, entry);
                entry.totalCount++;
                continue;
            }

            entry = new AnalyzedWord();
            entry.word = occurrence.word;
            entry.markedStrongNumber = occurrence.strongNumber;
            entry.totalCount = 1;
            addVerseToAnalysis(occurrence, entry);
            // NOTE(review): only the first occurrence ends up in 'words' - confirm this is intended
            entry.words.add(occurrence);
            byStrongNumber.put(occurrence.strongNumber, entry);
        }
    }

    final List<AnalyzedWord> sorted = result.analyzedWords;
    sorted.addAll(byStrongNumber.values());
    Collections.sort(sorted, new Comparator<AnalyzedWord>() {
        @Override
        public int compare(final AnalyzedWord left, final AnalyzedWord right) {
            if (left.occurencesInDifferentVerses < right.occurencesInDifferentVerses) {
                return -1;
            }
            return left.occurencesInDifferentVerses == right.occurencesInDifferentVerses ? 0 : 1;
        }
    });

    if (this.LOGGER.isTraceEnabled()) {
        this.LOGGER.trace("The following strong analysis was performed:");
        for (final AnalyzedWord strong : sorted) {
            this.LOGGER.trace("\t[{}]", strong);
            this.LOGGER.trace("\tFound in:");
            for (final String verseRef : strong.verses) {
                this.LOGGER.trace("\t\t[{}]", verseRef);
            }
        }
    }
    return result;
}
/**
 * Records one occurrence of a word in the given analysis entry: the verse is added to the set of verses,
 * the position is appended to the positions known for that verse, and the count of different verses is
 * refreshed.
 *
 * @param w the occurrence to record
 * @param analyzedWord the entry that accumulates the occurrences
 */
private void addVerseToAnalysis(final Word w, final AnalyzedWord analyzedWord) {
    analyzedWord.verses.add(w.verse);

    List<Integer> positions = analyzedWord.versesToPositions.get(w.verse);
    if (positions == null) {
        // Fix: the list has to exist before it is stored. Previously null was put into the map and the
        // freshly created list was thrown away, so no position was ever retained.
        positions = new ArrayList<Integer>();
        analyzedWord.versesToPositions.put(w.verse, positions);
    }
    positions.add(w.position);

    analyzedWord.occurencesInDifferentVerses = analyzedWord.verses.size();
}
/**
 * Takes the strong number in the middle of the remaining list, removes it from the list and tries to
 * find exact matches for it among the target words of the verses it occurs in.
 * <p>
 * The list of remaining strong numbers must not be empty when this method is called.
 *
 * @return the strong number if no candidate was found for it, so that the caller can work on it further;
 *         {@code null} otherwise
 */
private AnalyzedWord processAnalyzedStrong() {
    // start in the middle
    final List<AnalyzedWord> remaining = this.strongAnalysis.analyzedWords;
    final AnalyzedWord analyzedWord = remaining.remove(remaining.size() / 2);

    this.LOGGER.trace("Will be trying to match [{}]", analyzedWord.markedStrongNumber);
    if (this.LOGGER.isTraceEnabled()) {
        // Fix: the placeholder had no argument, so the literal "[{}]" was logged.
        this.LOGGER.trace("[{}] occurs in: ", analyzedWord.markedStrongNumber);
        for (final String v : analyzedWord.verses) {
            this.LOGGER.trace("\t[{}]", v);
        }
    }

    final Set<AnalyzedWord> wordsInAllTargetVerses = getAnalyzedWordsFromVersesInTarget(analyzedWord.verses);
    final boolean foundWords = computeExactMatch(analyzedWord.verses, wordsInAllTargetVerses,
            analyzedWord);
    if (!foundWords) {
        // hand the strong back, so that we can work on it a bit further
        return analyzedWord;
    }
    // TODO: matching against fewer verses / the most frequent target word is not implemented yet
    return null;
}
/**
 * Looks for target words that occur in exactly the same verses as the given strong number and marks each
 * of them as an exact match.
 *
 * @param verses the verses the strong number occurs in
 * @param wordsInAllTargetVerses the candidate target words
 * @param strong the strong number being matched
 * @return {@code true} if at least one exact match was marked, {@code false} otherwise
 */
private boolean computeExactMatch(final Set<String> verses,
        final Set<AnalyzedWord> wordsInAllTargetVerses, final AnalyzedWord strong) {
    boolean found = false;
    for (final AnalyzedWord w : wordsInAllTargetVerses) {
        // same number of verses AND every verse of the word is one of ours => identical verse sets,
        // which is a match of very high confidence
        if (w.occurencesInDifferentVerses == verses.size() && verses.containsAll(w.verses)) {
            markExactMatch(verses, w, strong);
            // Fix: only a marked match counts. Previously an equal verse count alone reported success,
            // even when the verses differed and nothing had been marked.
            found = true;
        }
    }
    return found;
}
/**
 * Records, in every given verse of the solution, that the target word is an exact match for the strong
 * number.
 *
 * @param verses the verses shared by the target word and the strong number
 * @param w the matching target word
 * @param strong the strong number that was matched
 */
private void markExactMatch(final Set<String> verses, final AnalyzedWord w, final AnalyzedWord strong) {
    final String reason = "Matches" + verses.toString();
    for (final String verseRef : verses) {
        final TaggedVerse tagged = this.taggedVersion.verses.get(verseRef);

        // one independent record per verse
        final ExactMatch match = new ExactMatch();
        match.strongNumber = strong.markedStrongNumber;
        match.sourceWord = strong.word;
        match.word = w;
        match.explanation = reason;
        match.numberVersesMatch = w.occurencesInDifferentVerses;
        match.numVersesForStrong = w.occurencesInDifferentVerses;
        match.numExtraVerses = 0;

        tagged.exactMatches.add(match);
    }
}
/**
 * Collects the analysis entries of all target words that occur in any of the given verses. Verses that
 * do not exist in the target text are skipped.
 *
 * @param verses references of the verses to inspect
 * @return the distinct analysis entries of the target words found in those verses
 */
private Set<AnalyzedWord> getAnalyzedWordsFromVersesInTarget(final Set<String> verses) {
    final Set<AnalyzedWord> candidates = new HashSet<AnalyzedWord>(verses.size() * 16);

    for (final String verseRef : verses) {
        final List<Word> wordsOfVerse = this.targetVerses.get(verseRef);
        this.LOGGER.trace("The following words are found in the target verse [{}]:", verseRef);

        if (wordsOfVerse != null) {
            for (final Word targetWord : wordsOfVerse) {
                this.LOGGER.trace("\t[{}]:", targetWord);
                candidates.add(this.targetAnalysis.directLinks.get(targetWord.word));
            }
        } else {
            // TODO can still try and make an exact match of what's left.
            this.LOGGER.info("Skipping lookup for verse [{}] as not found in target text", verseRef);
        }
    }

    if (this.LOGGER.isTraceEnabled()) {
        this.LOGGER.trace("The following analyzed words have been found: ");
        for (final AnalyzedWord candidate : candidates) {
            this.LOGGER.trace("\t {}", candidate);
        }
    }
    return candidates;
}
/**
 * Reads the book with the given initials, collects its words, phrases and verses into the given maps and
 * returns the analysis of the collected words.
 *
 * @param initials initials of the installed book to read
 * @param currentWords receives every word, keyed by the word
 * @param currentPhrases receives every multi-word text node, keyed by the phrase
 * @param currentVerses receives the words of each verse, keyed by osisID
 * @param currentWordsCounts currently unused
 * @return the analysis of the words that were read
 * @throws IllegalArgumentException if no book with these initials is installed
 * @throws Exception if the text cannot be loaded or contains unexpected content
 */
public Analysis read(final String initials, final Map<String, List<Word>> currentWords,
        final Map<String, List<Word>> currentPhrases, final Map<String, List<Word>> currentVerses,
        final Map<String, Integer> currentWordsCounts) throws Exception {
    final Book b = Books.installed().getBook(initials);
    if (b == null) {
        // fail with a clear message instead of a bare NullPointerException on the next line
        throw new IllegalArgumentException("Book [" + initials + "] is not installed");
    }

    // NOTE(review): this combines an element filter with an attribute filter; presumably the intent is
    // "verse elements" - confirm that the combination matches any element at all.
    final Filter filter = new ElementFilter("verse").and(new AttributeFilter(OSISUtil.ATTRIBUTE_W_LEMMA));
    final Key key = b.getKey(SCOPE);
    final BookData bookData = new BookData(b, key);
    final Element osis = bookData.getOsis();
    final Iterator<Element> descendants = osis.getDescendants(filter);

    this.currentVerse = null;
    this.currentPosition = 0;
    while (descendants.hasNext()) {
        final Element next = descendants.next();
        this.currentVerse = next.getAttributeValue("osisID");

        // the words of this verse, in the order in which they are encountered
        final ArrayList<Word> value = new ArrayList<Word>();
        currentVerses.put(this.currentVerse, value);

        // positions are counted from the start of every verse
        this.currentPosition = 0;
        processVerseChildren(next, currentWords, currentPhrases, value);
    }
    return analyze(currentWords);
}
/**
 * Result of analysing a text: the analysed entries as a list, plus a lookup of the same entries by key.
 */
class Analysis {
    /** The analysed entries, in the order chosen by whoever built the analysis. */
    List<AnalyzedWord> analyzedWords = new ArrayList<AnalyzedWord>();

    /** Direct access to an analysed entry by its key. */
    Map<String, AnalyzedWord> directLinks = new HashMap<String, AnalyzedWord>();
}
/**
 * Builds the per-word statistics for a text: for every word its total count, the verses it occurs in,
 * and per verse the positions and the strong numbers of its occurrences. The resulting entries are sorted
 * in ascending order of the number of different verses they occur in.
 *
 * @param currentWords all occurrences of every word, keyed by the word
 * @return the analysis, with a direct link from every word to its entry
 */
private Analysis analyze(final Map<String, List<Word>> currentWords) {
    final Analysis analysis = new Analysis();
    for (final Entry<String, List<Word>> entry : currentWords.entrySet()) {
        final List<Word> values = entry.getValue();

        final AnalyzedWord a = new AnalyzedWord();
        a.word = entry.getKey();
        a.totalCount = values.size();

        for (final Word w : values) {
            a.verses.add(w.verse);

            List<Integer> positions = a.versesToPositions.get(w.verse);
            if (positions == null) {
                positions = new ArrayList<Integer>();
                a.versesToPositions.put(w.verse, positions);
            }
            positions.add(w.position);

            List<String> strongs = a.versesToStrongNumbers.get(w.verse);
            if (strongs == null) {
                strongs = new ArrayList<String>();
                a.versesToStrongNumbers.put(w.verse, strongs);
            }
            // Fix: this used to add the position a second time, which duplicated every position and
            // left the strong number lists empty.
            strongs.add(w.strongNumber);
        }

        a.occurencesInDifferentVerses = a.verses.size();
        analysis.analyzedWords.add(a);
        analysis.directLinks.put(a.word, a);
        this.LOGGER.trace("Analyzed word {}", a);
    }

    Collections.sort(analysis.analyzedWords, new Comparator<AnalyzedWord>() {
        @Override
        public int compare(final AnalyzedWord o1, final AnalyzedWord o2) {
            return o1.occurencesInDifferentVerses < o2.occurencesInDifferentVerses ? -1
                    : o1.occurencesInDifferentVerses == o2.occurencesInDifferentVerses ? 0 : 1;
        }
    });
    return analysis;
}
/**
 * Walks an element of a verse in document order and records the words and phrases found in its text.
 * Notes, milestones and colophon divisions are skipped entirely.
 *
 * @param next the element to process
 * @param currentWords receives every word, keyed by the word
 * @param currentPhrases receives every multi-word text node, keyed by the phrase
 * @param currentVerses receives the words of the verse being processed
 * @throws Exception if the element contains content that is neither text nor an element
 */
private void processVerseChildren(final Element next, final Map<String, List<Word>> currentWords,
        final Map<String, List<Word>> currentPhrases, final List<Word> currentVerses) throws Exception {
    final String nodeName = next.getName();
    if (nodeName.equals("note") || nodeName.equals("milestone")
            || ((nodeName.equals("div") && "colophon".equals(next.getAttributeValue("type"))))) {
        // not part of the verse text
        return;
    }

    final List<Element> children = next.getChildren();
    if (children.size() == 0) {
        // leaf element: its whole text is one word or phrase
        final String text = next.getText().toLowerCase().trim();
        addPhraseOrWord(next, currentWords, currentPhrases, text, currentVerses);
        this.currentPosition++;
    } else {
        // not leaf node, so iterate through the content to keep the ordering
        final List<Content> content = next.getContent();
        for (final Content c : content) {
            if (c instanceof Text) {
                addPhraseOrWord(next, currentWords, currentPhrases, ((Text) c).getText().toLowerCase()
                        .trim(), currentVerses);
            } else if (c instanceof Element) {
                processVerseChildren((Element) c, currentWords, currentPhrases, currentVerses);
            } else {
                // Fix: report the class of the offending node, not the class of the content list.
                throw new Exception("What is this? " + c.getClass());
            }
            this.currentPosition++;
        }
    }
}
/**
 * Records the given text either as a phrase (when it contains a space) or as a single word, using the
 * current verse and position.
 *
 * @param next the element the text belongs to
 * @param currentWords receives every word, keyed by the word
 * @param currentPhrases receives every multi-word text node, keyed by the phrase
 * @param text the text to record
 * @param currentVerses receives the words of the verse being processed
 */
private void addPhraseOrWord(final Element next, final Map<String, List<Word>> currentWords,
        final Map<String, List<Word>> currentPhrases, final String text, final List<Word> currentVerses) {
    final boolean singleWord = text.indexOf(' ') < 0;
    if (singleWord) {
        addWord(currentWords, text, this.currentVerse, this.currentPosition, next, currentVerses);
        return;
    }
    addPhrase(currentPhrases, currentWords, text, this.currentVerse, this.currentPosition, next,
            currentVerses);
}
public class WordCount {
String key;
int occurence;
/*
* (non-Javadoc)
*
* @see java.lang.Object#hashCode()
*/
@Override
public int hashCode() {
return this.key.hashCode();
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#equals(java.lang.Object)
*/
@Override
public boolean equals(final Object obj) {
return this.key.equals(obj);
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return "WordCount [key=" + this.key + ", occurence=" + this.occurence + "]";
}
}
/**
 * Records a multi-word text as a phrase and then records each of its words individually. Blank text is
 * ignored.
 *
 * @param phrases receives the phrase, keyed by its full text
 * @param words receives the individual words, keyed by the word
 * @param text the phrase to record
 * @param currentVerse reference of the verse the phrase occurs in
 * @param currentPosition position of the phrase in the verse
 * @param next the element the phrase belongs to; its lemma attribute provides the strong number
 * @param currentVerses receives the individual words of the verse
 */
private void addPhrase(final Map<String, List<Word>> phrases, final Map<String, List<Word>> words,
        final String text, final String currentVerse, final int currentPosition, final Element next,
        final List<Word> currentVerses) {
    if (isBlank(text)) {
        return;
    }

    List<Word> occurrences = phrases.get(text);
    if (occurrences == null) {
        occurrences = new ArrayList<Word>();
        phrases.put(text, occurrences);
    }

    final Word phrase = new Word();
    phrase.word = text;
    phrase.verse = currentVerse;
    phrase.position = currentPosition;
    phrase.strongNumber = next.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
    occurrences.add(phrase);

    // every word of the phrase is recorded too, at the position of the phrase
    for (final String part : text.split("[ !.,;:?]+")) {
        addWord(words, part, currentVerse, currentPosition, next, currentVerses);
    }
}
/**
 * Records a single word after stripping the characters matched by the punctuation pattern. Text that is
 * blank before or after the stripping is ignored.
 *
 * @param words receives the word, keyed by the stripped text
 * @param text the raw text of the word
 * @param currentVerse reference of the verse the word occurs in
 * @param currentPosition position of the word in the verse
 * @param next the element the word belongs to; its lemma attribute provides the strong number
 * @param currentVerses receives the word as part of its verse
 */
private void addWord(final Map<String, List<Word>> words, final String text, final String currentVerse,
        final int currentPosition, final Element next, final List<Word> currentVerses) {
    if (isBlank(text)) {
        return;
    }
    final String stripped = this.punctuation.matcher(text).replaceAll("");
    if (isBlank(stripped)) {
        return;
    }

    final Word occurrence = new Word();
    occurrence.word = stripped;
    occurrence.verse = currentVerse;
    occurrence.position = currentPosition;
    occurrence.strongNumber = next.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);

    List<Word> occurrences = words.get(stripped);
    if (occurrences == null) {
        occurrences = new ArrayList<Word>();
        words.put(stripped, occurrences);
    }
    occurrences.add(occurrence);
    currentVerses.add(occurrence);
}
}