/**************************************************************************************************
* Copyright (c) 2013, Directors of the Tyndale STEP Project *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright *
* notice, this list of conditions and the following disclaimer. *
* Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in *
* the documentation and/or other materials provided with the *
* distribution. *
* Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com) *
* nor the names of its contributors may be used to endorse or promote *
* products derived from this software without specific prior written *
* permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS *
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE *
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, *
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, *
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT *
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING *
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF *
* THE POSSIBILITY OF SUCH DAMAGE. *
**************************************************************************************************/
package com.tyndalehouse.step.tools.analysis;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Command-line tool that turns word-alignment output into a positional tagging table.
 * <p>
 * Four whitespace-tokenised text files are read from a working directory, one verse per line:
 * <ul>
 * <li><code>bible.s</code> - the Strong-number side of each verse</li>
 * <li><code>bible.o</code> - the words of the other (translated) text</li>
 * <li><code>training.align</code> - alignment pairs of the form <code>s-o</code>, where <code>s</code> is a
 * word index into the <code>bible.s</code> line and <code>o</code> a word index into the <code>bible.o</code> line</li>
 * <li>a key list - the first token of each line is used as the verse reference</li>
 * </ul>
 * The result is written to <code>positionalTagging-table.txt</code> in the same directory. Glosses are looked
 * up in a Lucene index of definitions (fields <code>strongNumber</code> and <code>stepGloss</code>).
 * <p>
 * NOTE(review): all files are read and written using the platform default charset - confirm that this
 * matches the encoding of the input files.
 *
 * @author chrisburrell
 */
public class BerkeleyOutputToTaggingFormat {
    // Defaults reflect David's setup. Chris' setup used instead:
    //   working directory: C:\\temp\\berkeley\\berkeleyBibles\\output\\
    //   key list:          keyList-nt.txt
    //   definition index:  C:\\Users\\Chris\\AppData\\Roaming\\JSword\\step\\entities\\definition
    /** Working directory used when no first argument is given. */
    private static final String DEFAULT_ROOT = "C:\\Users\\David IB\\Dropbox\\STEP-Tagging\\autoTag\\Bibles\\";
    /** Lucene definition index used when no second argument is given. */
    private static final String DEFAULT_DEFINITION_INDEX = "C:\\Users\\David IB\\AppData\\Roaming\\JSword\\step\\entities\\definition";
    /** Key list file name used when no third argument is given. */
    private static final String DEFAULT_KEY_LIST = "keyList.txt";

    /** Cache of (normalised) Strong number to gloss, so each number hits the index at most once. */
    private static final Map<String, String> entries = new HashMap<String, String>(12000);

    /**
     * Reads the input files, builds the tagging table and writes it next to the inputs.
     *
     * @param args optional overrides: <code>[0]</code> working directory, <code>[1]</code> directory of the
     *             Lucene definition index, <code>[2]</code> key list file name; any missing argument falls
     *             back to the built-in default
     * @throws IOException if a file or the index cannot be read, or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        final File root = new File(argOrDefault(args, 0, DEFAULT_ROOT));
        final File definitionIndex = new File(argOrDefault(args, 1, DEFAULT_DEFINITION_INDEX));
        final String keyListName = argOrDefault(args, 2, DEFAULT_KEY_LIST);

        final String strongs = FileUtils.readFileToString(new File(root, "bible.s"));
        final String other = FileUtils.readFileToString(new File(root, "bible.o"));
        final String results = FileUtils.readFileToString(new File(root, "training.align"));
        final String keyFile = FileUtils.readFileToString(new File(root, keyListName));

        final List<String[]> strongSentences = splitByWord(strongs);
        final List<String[]> otherSentences = splitByWord(other);
        final List<String[]> resultSentences = splitByWord(results);
        final List<String[]> keyList = splitByWord(keyFile);

        // Both the directory and the searcher hold index files open, so release them
        // even when parsing or writing fails.
        final FSDirectory directory = FSDirectory.open(definitionIndex);
        try {
            final IndexSearcher indexSearcher = new IndexSearcher(directory);
            try {
                final String resultTagging = parseResultsAsTable(resultSentences, strongSentences, otherSentences, indexSearcher, keyList);
                FileUtils.writeStringToFile(new File(root, "positionalTagging-table.txt"), resultTagging);
            } finally {
                indexSearcher.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Returns the command-line argument at the given position, or the fallback when it was not supplied.
     */
    private static String argOrDefault(final String[] args, final int position, final String fallback) {
        return args != null && args.length > position ? args[position] : fallback;
    }

    /**
     * Renders every verse of the other text as table rows. Each row starts with
     * <code>ref-NNN</code> and a tab (NNN being the zero-padded index of the row's first word). Untagged
     * words are each followed by a space; a tagged word is preceded by a tab, ends its row with the
     * Strong number and gloss, and the following word starts a new row.
     *
     * @param resultSentences alignment pairs per verse
     * @param strongSentences Strong numbers per verse
     * @param otherSentences  words of the other text per verse
     * @param indexSearcher   searcher over the definition index, used for glosses
     * @param keyList         per verse, an array whose first element is the verse reference
     * @return the complete table
     * @throws IOException declared for callers; lookup failures are reported and do not abort the run
     */
    private static String parseResultsAsTable(final List<String[]> resultSentences, final List<String[]> strongSentences, final List<String[]> otherSentences, final IndexSearcher indexSearcher, final List<String[]> keyList) throws IOException {
        final Map<String, Map<Integer, String>> verseToResults = collectAlignments(resultSentences, strongSentences, keyList);
        final StringBuilder resultingTagging = new StringBuilder(8000000);

        for (int ii = 0; ii < otherSentences.size(); ii++) {
            final String[] words = otherSentences.get(ii);
            final String verseRef = keyList.get(ii)[0];
            // Null when the alignment file has no line for this verse (String.split drops trailing
            // blank lines, so the alignment list can be shorter); such verses are output untagged.
            final Map<Integer, String> sentenceStrongs = verseToResults.get(verseRef);

            outputVerseRef(resultingTagging, verseRef, 0);
            String lastStrongNumber = null;
            for (int jj = 0; jj < words.length; jj++) {
                final String strongNumber = sentenceStrongs == null ? null : sentenceStrongs.get(jj);

                // The previous word was tagged: finish its row and start a new one at this word.
                // Consecutive identical Strong numbers are deliberately NOT merged, because some
                // verses legitimately repeat a number (e.g. Matt.1.2).
                if (lastStrongNumber != null) {
                    outputStrongNumber(resultingTagging, verseRef, lastStrongNumber, indexSearcher);
                    outputVerseRef(resultingTagging, verseRef, jj);
                }

                if (strongNumber == null) {
                    resultingTagging.append(words[jj]);
                    resultingTagging.append(' ');
                } else {
                    resultingTagging.append('\t');
                    resultingTagging.append(words[jj]);
                }
                lastStrongNumber = strongNumber;
            }

            if (lastStrongNumber != null) {
                outputStrongNumber(resultingTagging, verseRef, lastStrongNumber, indexSearcher);
            }
            // outputVerseRef has always appended something, so the builder is never empty here.
            if (resultingTagging.charAt(resultingTagging.length() - 1) != '\n') {
                resultingTagging.append('\n');
            }
        }
        return resultingTagging.toString();
    }

    /**
     * Builds, for every verse reference, a map from word position in the other text to the Strong
     * number aligned with that position. Prints the reference of every 200th verse as progress.
     * Malformed or out-of-range alignment pairs are reported on standard output and skipped.
     */
    private static Map<String, Map<Integer, String>> collectAlignments(final List<String[]> resultSentences, final List<String[]> strongSentences, final List<String[]> keyList) {
        final Map<String, Map<Integer, String>> verseToResults = new HashMap<String, Map<Integer, String>>(32000);
        for (int i = 0; i < resultSentences.size(); i++) {
            final String[] sentence = resultSentences.get(i);
            final String ref = keyList.get(i)[0];
            if (i % 200 == 0) {
                System.out.println(ref);
            }

            final Map<Integer, String> positionToStrong = new HashMap<Integer, String>();
            verseToResults.put(ref, positionToStrong);
            for (String word : sentence) {
                final String[] stringIndexes = word.split("-");
                try {
                    final int strongIndex = Integer.parseInt(stringIndexes[0]);
                    final int otherIndex = Integer.parseInt(stringIndexes[1]);
                    // A lone "0-0" pair is skipped rather than treated as a real alignment.
                    if (strongIndex == 0 && otherIndex == 0 && sentence.length == 1) {
                        continue;
                    }
                    positionToStrong.put(otherIndex, strongSentences.get(i)[strongIndex]);
                } catch (RuntimeException e) {
                    // Only parsing and indexing can fail here; keep going with the remaining pairs.
                    System.out.println("Error in verse " + ref + " for word: " + word);
                    System.out.println(e.getMessage());
                }
            }
        }
        return verseToResults;
    }

    /**
     * Appends the row prefix: the verse reference, a dash, the word number zero-padded to three
     * digits, and a tab.
     */
    private static void outputVerseRef(final StringBuilder resultingTagging, final String verseRef, final int wordNumber) {
        resultingTagging.append(verseRef);
        resultingTagging.append('-');
        resultingTagging.append(String.format("%03d", wordNumber));
        resultingTagging.append('\t');
    }

    /**
     * Appends the row suffix: a tab, the Strong number in angle brackets, its gloss, the end marker
     * and a newline. If the gloss lookup fails the failure is reported on standard error and the row
     * is completed without a gloss.
     */
    private static void outputStrongNumber(final StringBuilder resultingTagging, final String verseRef, final String lastStrongNumber, final IndexSearcher indexSearcher) {
        resultingTagging.append('\t');
        resultingTagging.append("<");
        resultingTagging.append(lastStrongNumber);
        resultingTagging.append("> = ");
        try {
            appendLexicalEntry(indexSearcher, resultingTagging, lastStrongNumber);
        } catch (IOException e) {
            // Best effort: one failed lookup should not lose the whole table.
            System.err.println("Unable to look up gloss for " + lastStrongNumber + " in verse " + verseRef);
            e.printStackTrace();
        }
        resultingTagging.append(" ¬");
        resultingTagging.append('\n');
    }

    /**
     * Alternative, inline rendering: one paragraph per verse, listing every aligned word of the other
     * text followed by <code>(gloss, strong number, alignment pair)</code>. Not called by
     * {@link #main(String[])}.
     *
     * @param resultSentences alignment pairs per verse
     * @param strongSentences Strong numbers per verse
     * @param otherSentences  words of the other text per verse
     * @param indexSearcher   searcher over the definition index, used for glosses
     * @param keyList         per verse, an array whose first element is the verse reference
     * @return the rendered text
     * @throws IOException declared for callers; per-word failures are reported and skipped
     */
    private static String parseResults(final List<String[]> resultSentences, final List<String[]> strongSentences, final List<String[]> otherSentences, final IndexSearcher indexSearcher, final List<String[]> keyList) throws IOException {
        final StringBuilder resultingTagging = new StringBuilder(8000000);
        for (int i = 0; i < resultSentences.size(); i++) {
            final String[] sentence = resultSentences.get(i);
            final String ref = keyList.get(i)[0];
            if (i % 200 == 0) {
                System.out.println(ref);
            }

            resultingTagging.append(ref);
            resultingTagging.append(' ');
            for (String word : sentence) {
                final String[] stringIndexes = word.split("-");
                try {
                    final int strongIndex = Integer.parseInt(stringIndexes[0]);
                    final int otherIndex = Integer.parseInt(stringIndexes[1]);
                    // A lone "0-0" pair is skipped rather than treated as a real alignment.
                    if (strongIndex == 0 && otherIndex == 0 && sentence.length == 1) {
                        continue;
                    }

                    // Resolve both words before appending anything, so a bad index leaves no partial entry.
                    final String strong = strongSentences.get(i)[strongIndex];
                    final String other = otherSentences.get(i)[otherIndex];
                    resultingTagging.append(other);
                    resultingTagging.append(" (");
                    appendLexicalEntry(indexSearcher, resultingTagging, strong);
                    resultingTagging.append(", ");
                    resultingTagging.append(strong);
                    resultingTagging.append(", ");
                    resultingTagging.append(word);
                    resultingTagging.append(") ");
                } catch (Exception e) {
                    // Covers malformed pairs, bad indexes and index I/O failures alike.
                    System.out.println("Error in verse " + ref + " for word: " + word);
                    System.out.println(e.getMessage());
                }
            }
            resultingTagging.append('\n');
            resultingTagging.append('\n');
        }
        return resultingTagging.toString();
    }

    /**
     * Appends the gloss for a Strong number, consulting the cache before the index. Numbers longer
     * than five characters whose second character is '0' have that character removed before lookup.
     * An unknown number, or a definition without a <code>stepGloss</code> field, yields an empty gloss.
     *
     * @param indexSearcher    searcher over the definition index
     * @param resultingTagging builder to append the gloss to
     * @param strong           the Strong number as it appears in the input
     * @throws IOException if the index cannot be searched or the document cannot be loaded
     */
    private static void appendLexicalEntry(final IndexSearcher indexSearcher, final StringBuilder resultingTagging, String strong) throws IOException {
        if (strong.length() > 5 && strong.charAt(1) == '0') {
            strong = strong.substring(0, 1) + strong.substring(2);
        }

        String gloss = entries.get(strong);
        if (gloss == null) {
            // Only the best hit is ever read, so there is no need to collect more than one.
            final TopDocs lexicalEntries = indexSearcher.search(new TermQuery(new Term("strongNumber", strong)), 1);
            if (lexicalEntries.scoreDocs.length > 0) {
                gloss = indexSearcher.doc(lexicalEntries.scoreDocs[0].doc).get("stepGloss");
            }
            if (gloss == null) {
                // No hit, or the field is missing: avoid appending the text "null" and make
                // sure the miss is cached too.
                gloss = "";
            }
            entries.put(strong, gloss);
        }
        resultingTagging.append(gloss);
    }

    /**
     * Splits text into lines (LF or CRLF) and each line into its space-separated tokens.
     *
     * @param strongs the full text of one input file
     * @return one token array per line, in file order
     */
    private static List<String[]> splitByWord(final String strongs) {
        final String[] sentences = strongs.split("\r?\n");
        final List<String[]> tokenised = new ArrayList<String[]>(sentences.length);
        for (String sentence : sentences) {
            tokenised.add(org.apache.commons.lang3.StringUtils.split(sentence, ' '));
        }
        return tokenised;
    }
}