// Chris Bradshaw /************************************************************************************************** * Copyright (c) 2013, Directors of the Tyndale STEP Project * * All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * * are met: * * * * Redistributions of source code must retain the above copyright * * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * * notice, this list of conditions and the following disclaimer in * * the documentation and/or other materials provided with the * * distribution. * * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com) * * nor the names of its contributors may be used to endorse or promote * * products derived from this software without specific prior written * * permission. * * * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * * THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ package com.tyndalehouse.step.tools.analysis; import com.tyndalehouse.step.core.utils.StringConversionUtils; import com.tyndalehouse.step.core.utils.StringUtils; import org.apache.commons.io.FileUtils; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import java.io.File; import java.io.BufferedWriter; import java.io.OutputStreamWriter; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.lang.String; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.Date; /** * @author chrisburrell */ public class BerkeleyOutputConverter { private static final Map<String, String> entries = new HashMap<String, String>(12000); private static final Map<String, String> greekEntries = new HashMap<String, String>(12000); public static String strDebug = ""; //Will store debug information, which will then be dumped into a log file (it requires a successful execution) public static void main(String[] args) throws IOException { boolean bDebug = true; long startTime = System.currentTimeMillis(); if (bDebug) { WriteDebug("Reading data. Elapsed time: (" + (System.currentTimeMillis() - startTime) + ")"); } //This file needs some variables set to work properly on one's system. They follow here. // Stefan's //portionPassage indicates the passage to be processed. It is located (1) in a corresponding subdirectory, (2) having corresponding filenames. So if desired, they can all be distinguished at one point from each-other, just by virtue of their filenames. Having a special directory, allows the easy processing of portions of passages, and allows for later access for backreferencing. final String portionPassage = "OT"; final String root = "C:\\Users\\David IB\\Dropbox\\STEP-Tagging(DIB)\\autoTag\\NIV\\NIV2011A_NT+OT-SimplifiedHebTags\\"; String strongs = FileUtils.readFileToString(new File(root + portionPassage + ".s")); // Original Text in Strong Numbers; strongs # in a file for a section of verses; each verse on a new line String other = FileUtils.readFileToString(new File(root + portionPassage + ".u")); // Target Language in Stems Only; stems only -- Done with Paratext?; each verse on a new line String results = FileUtils.readFileToString(new File(root + portionPassage + ".align.txt")); // Original Language Aligned with Target Language; alignment from Berkeley; each verse on a new line String keyFile = FileUtils.readFileToString(new File(root + portionPassage + ".keyList.txt")); // Book/Chapter/Verse Division as Key; refs only (indicates verses) final String strJSwordPath = "C:\\Users\\David IB\\AppData\\Roaming\\JSword\\step\\entities\\definition"; //path to the JSword directory final String strOutputFileName = root + portionPassage + ".Output.txt"; //file in which the output is written final String strDebugFileName = root + portionPassage + "._DebugLog.txt"; //file in which the output is written /** * David's final String root = "C:\\Users\\chbradsh\\Documents\\GitHub\\dev\\BibleSample\\"; final String strongs = FileUtils.readFileToString(new File(root + "NT.s")); // strongs # final String other = FileUtils.readFileToString(new File(root + "NT.u")); // stems only final String results = FileUtils.readFileToString(new File(root + "NT.training.align")); // alignment from Berkeley final String keyFile = FileUtils.readFileToString(new File(root + "NT.keyList.txt")); // refs only */ /** * Chris' final String strongs = FileUtils.readFileToString(new File("c:\\temp\\bible.s")); final String other = FileUtils.readFileToString(new File("c:\\temp\\bible.o")); final String results = FileUtils.readFileToString(new File("c:\\temp\\training.align")); final String keyFile = FileUtils.readFileToString(new File("c:\\temp\\keyList.txt")); */ //Pre-processing if (bDebug) { WriteDebug("Preprocessing. Elapsed time: (" + (System.currentTimeMillis() - startTime) + ")"); } other = preStringProcessing (other); results = preStringProcessing (results); strongs = preStringProcessing (strongs); keyFile = preStringProcessing (keyFile); if (bDebug) { WriteDebug("Processing. Elapsed time: (" + (System.currentTimeMillis() - startTime) + ")"); } List<String[]> strongSentences = splitByWord(strongs); List<String[]> otherSentences = splitByWord(other); List<String[]> resultSentences = splitByWord(results); List<String[]> keyList = splitByWord(keyFile); final File path = new File(strJSwordPath); // final File path = new File("C:\\Users\\David IB\\AppData\\Roaming\\JSword\\step\\entities\\definition"); // final File path = new File("C:\\Users\\chbradsh\\AppData\\Roaming\\JSword\\step\\entities\\definition"); FSDirectory directory = FSDirectory.open(path); final IndexSearcher indexSearcher = new IndexSearcher(directory); final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(strOutputFileName), "UTF8")); // final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("C:\\Users\\David IB\\Dropbox\\STEP-Tagging\\autoTag\\BibleSample\\ChrisExperiments\\NT.tagging+Gk.txt"), "UTF8")); // final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("C:\\Users\\chbradsh\\Documents\\GitHub\\dev\\BibleSample\\outfilename.txt"), "UTF8")); String resultTagging = parseResults(resultSentences, strongSentences, otherSentences, indexSearcher, keyList, out); out.close(); //Postprocessing if (bDebug) { WriteDebug("Postprocessing. Elapsed time: (" + (System.currentTimeMillis() - startTime) + ")"); } resultTagging = postStringProcessing (strOutputFileName); FileUtils.writeStringToFile(new File(strOutputFileName + "postprocessed.txt"), resultTagging ); if (bDebug) { WriteDebug("Completed. Elapsed time: (" + (System.currentTimeMillis() - startTime) + ")"); } FileUtils.writeStringToFile(new File(strDebugFileName), strDebug); strDebug=""; } private static void WriteDebug (final String strInput) { System.out.println(strInput); strDebug = strDebug + "\n" + strInput; //return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } private static String preStringProcessing (String strInput) { //Process Data strInput = preStringProcessing_replaceCarriageReturnByNewLine(strInput); strInput = preStringProcessing_replaceDoubleNewLinesWithASingleNewLine(strInput); //strInput = preStringProcessing_removeSingleLeftPointingAngleQuotationMark (strInput); //strInput = preStringProcessing_removeSingleRightPointingAngleQuotationMark (strInput); //Return Data return strInput; } private static String preStringProcessing_replaceCarriageReturnByNewLine (final String strInput) { final String strOriginalExpression = "\r"; final String strReplacingExpression = "\n"; return strInput.replace(strOriginalExpression, strReplacingExpression); //return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } private static String preStringProcessing_replaceDoubleNewLinesWithASingleNewLine (final String strInput) { final String strOriginalExpression = "\n\n"; final String strReplacingExpression = "\n"; return strInput.replace(strOriginalExpression, strReplacingExpression); //return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } private static String preStringProcessing_removeSingleLeftPointingAngleQuotationMark (final String strInput) { final String strOriginalExpression = "‹"; final String strReplacingExpression = "<"; return strInput.replace(strOriginalExpression, strReplacingExpression); //return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } private static String preStringProcessing_removeSingleRightPointingAngleQuotationMark (final String strInput) { final String strOriginalExpression = "›"; final String strReplacingExpression = ">"; return strInput.replace(strOriginalExpression, strReplacingExpression); //return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } //Stefan 12/01/2016: PostProcesses the output into a better readable format private static String postStringProcessing (final String strFileName) throws IOException { //Get Data String strOutputFileContents = FileUtils.readFileToString(new File(strFileName)); //read the contents of the output file //Process Data //-removeDoubleQuotes strOutputFileContents = postStringProcessing_removeDoubleQuotes (strOutputFileContents); //-fix, where the initial word in the target language is not tabbed by the following parsing strOutputFileContents = postStringProcessing_FixUntabbed (strOutputFileContents); //-TagDifferentTargetWordsTaggedWithSameSourceWord strOutputFileContents = postStringProcessing_TagDifferentTargetWordsTaggedWithSameSourceWord (strOutputFileContents); //Return Data return strOutputFileContents; } private static String postStringProcessing_FixUntabbed (String strInput) { //Init String strIndicatorDoubleOccurrence = "\\~"; //The tagging of a Double Match boolean bDebug = false; String strPattern = ""; //Pattern for splitting the verse itself: see http://regexr.com/ String strPunctuationSigns = ""; //"\\:?\\:?\\,?\\.?"; strPattern = strPattern + "(\\w{2,}[^\t]*?)"; strPattern = strPattern + "(?:" + strPunctuationSigns + ")"; // Sometimes a word is not separated by a tab. This is a bug that is here resolved. strPattern = strPattern + "((([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]?\\{[^=]{1,}[=][^}]{1,}\\})))"; strPattern = strPattern + ""; if (bDebug) { WriteDebug("Pattern for finding non-tabbed word/parsing in a verse: " + strPattern + ":::"); } //Replacement of the matched String strReplacePattern = "$1\t$3"; if (bDebug) WriteDebug("Pattern (non-tabbed): " + strReplacePattern + ":::"); strInput = strInput.replaceAll(strPattern, strReplacePattern); return strInput; } //Stefan 13/01/2016: TagDifferentTargetWordsTaggedWithSameSourceWord private static String postStringProcessing_TagDifferentTargetWordsTaggedWithSameSourceWord (String strInput) { /* * when a Greek word is used to tag more than one Swahili word, it needs to be marked. eg in v.16 pais is tagged to both "watoto" and "kiume" I think a good way to mark this is to preceded all occurrences with an "~" (we want to try to get each Greek word occurring only once if poss.) eg watoto ~016-G3816{child= pais} wote 014-G3956{all= pas} wa 015-G3588{the/this/who=ho} */ // Init FD // Identify WORD : // Identify MATCH // NOT in same verse AND a DIFFERENT WORD //Init String strOutput = ""; String strIndicatorDoubleOccurrence = "\\~"; //The tagging of a Double Match boolean bDebug = false; String strPattern = ""; // "(?s)";// String strPatternVerse = ""; //Sample entry (FULL): 40_Mat.002.016-002 Herode G2264{Herod=Ἡρώδης} //***Match first occurrence // Sample: Herode 002-G2264{Herod=Ἡρώδης} // <Target Language: Word>/t<position number>-<Strong #>{<English Translation>=<Source Language>} //MASSIVE FRUSTRATION, I cannot backreference more than 9. It should work, but it doesn't! //Because of this, I'll have to do it in two steps!!! First split it into verses and then do the actual matching. //Pattern for splitting the text into verses strPatternVerse = strPatternVerse+ "(\\$[0-9]{2}\\_[0-9A-Z][a-zA-Z]{1,4}\\.[0-9]{1,3}\\.[0-9]{1,3})"; //David adjusted the numbering 18/2/16 --> [0-9]{3}\.[0-9]{3} --> [0-9]{1,3}\.[0-9]{1,3} strPatternVerse = strPatternVerse+ "(([^\n]*[\n])*?)"; strPatternVerse = strPatternVerse+ "(\\~)"; if (bDebug) { WriteDebug("Pattern for versification: " + strPatternVerse + ":::"); } //Pattern for splitting the verse itself: see http://regexr.com/ String strPunctuationSigns = ""; //"\\:?\\:?\\,?\\.?"; strPattern = strPattern + "("; strPattern = strPattern + "(\\w{2,})"; strPattern = strPattern + "(?:" + strPunctuationSigns + "\\t?)"; // Sometimes a word is not separated by a tab. This is a bug that is here resolved. strPattern = strPattern + "(((?!\\~)([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]?\\{[^=]{1,}[=][^}]{1,}\\}\\t)){0,8})"; strPattern = strPattern + "(((?!\\~)([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]?\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){1})"; strPattern = strPattern + "(((?!\\~)([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]?\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8})"; strPattern = strPattern + ")"; strPattern = strPattern + "(([^\n]*[\n])*?)"; strPattern = strPattern + "(\\w{2,})"; strPattern = strPattern + "(?:" + strPunctuationSigns + "\\t?)"; // Sometimes a word is not separated by a tab. This is a bug that is here resolved. strPattern = strPattern + "("; strPattern = strPattern + "(((?!\\~)([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]\\{[^=]{1,}[=][^}]{1,}\\}\\t)){0,8})"; strPattern = strPattern + "\\8"; strPattern = strPattern + "(((?!\\~)([0-9]{3})\\-([A-Z][0-9]{1,5}[a-z]\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8})"; strPattern = strPattern + ")"; /* strPattern = strPattern + "("; strPattern = strPattern + "(\\w{2,})"; //Target Language (Swahili) Word--Storing this for backref, maybe followed by garbage, e.g. another word or punctuation //tab strPattern = strPattern + "(?:\\:?\\,?\\.?\\t?)"; strPattern = strPattern + "(("; //a Single Matching or more, up to nine (I doubt there'll be more) strPattern = strPattern + "(?!" + strIndicatorDoubleOccurrence + ")"; //exclude previously double matches strPattern = strPattern + "([0-9]{3})\\-"; //<position number>- //<Strong #>{<English Translation>=<Source Language>} strPattern = strPattern + "([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t)"; //Match Strong indication--Storing this for backref(\3) (probably could just do [AHG] (Aramaic, Greek, Hebrew) strPattern = strPattern + "){0,8})"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + "(("; //a Single Matching or more, up to nine (I doubt there'll be more) strPattern = strPattern + "(?!" + strIndicatorDoubleOccurrence + ")"; //exclude previously double matches strPattern = strPattern + "([0-9]{3})\\-"; //<position number>- //<Strong #>{<English Translation>=<Source Language>} strPattern = strPattern + "([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)"; //Match Strong indication--Storing this for backref(\3) (probably could just do [AHG] (Aramaic, Greek, Hebrew) strPattern = strPattern + "){1})"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + "(("; //a Single Matching or more, up to nine (I doubt there'll be more) strPattern = strPattern + "(?!" + strIndicatorDoubleOccurrence + ")"; //exclude previously double matches strPattern = strPattern + "([0-9]{3})\\-"; //<position number>- //<Strong #>{<English Translation>=<Source Language>} strPattern = strPattern + "([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)"; //Match Strong indication--Storing this for backref(\3) (probably could just do [AHG] (Aramaic, Greek, Hebrew) strPattern = strPattern + "){0,8})"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + ")"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word //Sample entry section: (Negative Lookahead) & BackRef#6: Possible Garbage at the end of the Original Line //strPattern = strPattern + "([^\n]*)[\n]"; //exclude lines that have already been matched //Sample entry section: BackRef#7 and #8: Other lines (Note $8 is not referenced later on, on purpose, because it is duplicating) strPattern = strPattern + "(([^\n]*[\n])*?)"; //strPattern = strPattern + "(?![0-9]{2}\\_[a-zA-Z]{2,5}\\.[0-9]{3}\\.[0-9]{3})"; //Rematch with a second occurrence, verse wise strPattern = strPattern + "(\\w{2,})"; //1st backref strPattern = strPattern + "(?:\\:?\\,?\\.?\\t?)"; strPattern = strPattern + "("; //All Matchings strPattern = strPattern + "(("; //a Single Matching or more, up to nine (I doubt there'll be more) strPattern = strPattern + "(?!" + strIndicatorDoubleOccurrence + ")"; //exclude previously double matches strPattern = strPattern + "([0-9]{3})\\-"; //<position number>- //<Strong #>{<English Translation>=<Source Language>} strPattern = strPattern + "([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t)"; //Match Strong indication--Storing this for backref(\3) (probably could just do [AHG] (Aramaic, Greek, Hebrew) strPattern = strPattern + "){0,8})"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + "\\7"; //<position number>- strPattern = strPattern + "(("; //a Single Matching or more, up to nine (I doubt there'll be more) strPattern = strPattern + "(?!" + strIndicatorDoubleOccurrence + ")"; //exclude previously double matches strPattern = strPattern + "([0-9]{3})\\-"; //<position number>- //<Strong #>{<English Translation>=<Source Language>} strPattern = strPattern + "([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)"; //Match Strong indication--Storing this for backref(\3) (probably could just do [AHG] (Aramaic, Greek, Hebrew) strPattern = strPattern + "){0,8})"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + ")"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word strPattern = strPattern + ")"; //Line, Book#,BookName,Chap,Verse--Storing this for backref, followed by the matching word */ if (bDebug) { WriteDebug("Pattern for finding duplicates in verse: " + strPattern + ":::"); } //Sample entry section: BackRef#9: Different Position in the verse //strPattern = strPattern + "(\\-[0-9]{3})\t\t"; //Word Number is different //Sample entry section: BackRef#10: Different Target Language Word //strPattern = strPattern + "(?!\\3)(\\w{2,})"; //Exclude places with the exact same word //Word is different too //Sample entry section: BackRef#11: Different Postword Rubbish //strPattern = strPattern + "([^ ]{0,} )"; //Rematch with a second occurrence, Original Language Match-wise //strPattern = strPattern + "\\5"; //The Original stuff is the same. If this is all true, we got a match /* Handy for Debug Pattern pattern = Pattern.compile(strPattern, Pattern.DOTALL); //, Pattern.MULTILINE | Pattern.MULTILINE Matcher matcher = pattern.matcher(strInput); System.out.println("found: " + strPattern + ":::"); System.out.println("Did we have a match?: " + matcher.find()+ ":::"); */ //Replacement of the matched String strReplacePattern = "$2\t$3" + strIndicatorDoubleOccurrence + "$8$11"; //Inbetween Lines strReplacePattern = strReplacePattern + "$15"; //Counterpart Match + Additions strReplacePattern = strReplacePattern + "$17\t$19" + strIndicatorDoubleOccurrence + "$8$23"; // strReplacePattern = strReplacePattern + "$31$33"; if (bDebug) WriteDebug("Pattern: " + strReplacePattern + ":::"); //strPattern = "(\\w{2,})(\\:?\\,?\\.?\\t?)((((?!SCHEET)([0-9]{3})\\-([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8})(((?!SCHEET)([0-9]{3})\\-([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){1})(((?!SCHEET)([0-9]{3})\\-([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8}))(([^\\n]*[\\n])*)(?![0-9]{2}\\_[a-zA-Z]{2,5}\\.[0-9]{3}\\.[0-9]{3})(\\w{2,})(\\:?\\,?\\.?\\t?)((((?!SCHEET)([0-9]{3})\\-([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8})\\8(((?!SCHEET)([0-9]{3})\\-([A-Z][0-9]{1,5}\\{[^=]{1,}[=][^}]{1,}\\}\\t?)){0,8}))"; //Return altered strInput //Now get each verse, and try the individual matching Pattern p = Pattern.compile(strPatternVerse); Matcher m = p.matcher(strInput); int lastMatchPos = 0; int intNumMatch = 0; String strCurrentVerse; while (m.find()) { intNumMatch=intNumMatch + 1; strCurrentVerse = m.group(1) + "" + m.group(2)+ "" + m.group(4); if (bDebug) {WriteDebug("<Match: " + intNumMatch + ">" + strCurrentVerse + "</Match: " + intNumMatch + ">" ); } //Sometimes multiple matches need to iteratively be performed //Match the elements within the verse! while (strCurrentVerse != strCurrentVerse.replaceAll(strPattern, strReplacePattern)) { strCurrentVerse = strCurrentVerse.replaceAll(strPattern, strReplacePattern); } strOutput = strOutput + "\n" + strCurrentVerse ; lastMatchPos = m.end(); } if (lastMatchPos != strInput.length()) { if (bDebug) {WriteDebug("No Matches found!"); } //If nothing is to be matched, which would be odd, then return the input strOutput = strInput; } return strOutput; } //Stefan 13/01/2016: removeDoubleQuotes private static String postStringProcessing_removeDoubleQuotes (final String strInput) { final String strOriginalExpression = "\""; //We need to get rid of double quotes final String strReplacingExpression = "“"; //We do not need anything in its stead return Pattern.compile(strOriginalExpression).matcher(strInput).replaceAll(strReplacingExpression); } private static String parseResults(final List<String[]> resultSentences, final List<String[]> strongSentences, final List<String[]> otherSentences, final IndexSearcher indexSearcher, final List<String[]> keyList, final BufferedWriter out) throws IOException { StringBuilder resultingTagging = new StringBuilder(8000000); int prev; boolean bDebug = false; String strTemp = ""; prev = -1; for (int i = 0; i < resultSentences.size(); i++) { String[] sentence = resultSentences.get(i); if (bDebug) { WriteDebug("Array number --" + i + " of " + resultSentences.size() + "-- of resultSentences. \t\t The sentence starts with: " + keyList.get(i)[0] + "\t(" + sentence[0] + ")"); } if (bDebug) { //Debug attempt 22/2/16 try { strTemp = keyList.get(i)[0]; //keyList.get(i); } catch (IndexOutOfBoundsException e) { WriteDebug("Array falls on the --" + i + " of " + resultSentences.size() + "-- of resultSentences. \t\t The previous sentence started with: " + keyList.get(i-1)[0]); } } String ref = keyList.get(i)[0]; if (i % 200 == 0) { WriteDebug("Every 200th entry (" + i + "): " + ref); } resultingTagging.append('\n'); resultingTagging.append("$"); out.write('\n'); out.write("$"); sentence = reOrder(sentence); int[][] sentence_array = new int[resultSentences.get(i).length +1][2]; int word_counter = 0; //construct an array for this sentence for (String word : sentence) { String[] stringIndexes = word.split("-"); sentence_array[word_counter][0] = Integer.parseInt(stringIndexes[0]); sentence_array[word_counter][1] = Integer.parseInt(stringIndexes[1]); word_counter++; } prev =-1; boolean first = true; int tab_count = 0; word_counter = 0; for (String word : sentence) { String[] stringIndexes = word.split("-"); try { int[] indexes = new int[]{Integer.parseInt(stringIndexes[0]), Integer.parseInt(stringIndexes[1])}; if (indexes[0] == 0 && indexes[1] == 0) { // not sure what this used to be for // continue; } //find word in sentence in each bible. String strong = strongSentences.get(i)[indexes[0]]; String other = otherSentences.get(i)[indexes[1]]; //Add reference before the first word in the sentence if (word_counter == 0) { out.write(ref); } //Add unaligned 'other' words if (indexes[1]-1 != prev) { for (int j = prev+1; j < indexes[1]; j++) { out.write("\n"); out.write(otherSentences.get(i)[j]); } } //Add aligned 'other word' if (indexes[1] != prev) { out.write("\n"); out.write(other); out.write("\t"); } // Add strong out.write(String.format("%03d", indexes[0] + 1)); out.write("-"); out.write(strong); out.write("{"); appendLexicalEntry(indexSearcher, resultingTagging, strong, out); out.write("="); appendGreekEntry(indexSearcher, resultingTagging, strong, out); out.write("}\t"); //add next Greek word(s) if not tagged int testStrong; boolean missingGreek; String checkMissing; if (first){ first = false; for (int l=0; l < indexes[0]; l++){ missingGreek = true; checkMissing = Integer.toString(l) + "-"; for (int m = 0; m < sentence.length; m++) { if (sentence[m].startsWith(checkMissing)) { missingGreek = false; break; } } if (missingGreek) { String missingStrong = strongSentences.get(i)[l]; out.write(String.format("%03d", l+1)); out.write("-"); out.write(missingStrong); out.write("{"); appendLexicalEntry(indexSearcher, resultingTagging, missingStrong, out); out.write("="); appendGreekEntry(indexSearcher, resultingTagging, missingStrong, out); out.write("}\t"); } } } for (int n=indexes[0]+1; n < strongSentences.get(i).length; n++){ missingGreek = true; testStrong = n; checkMissing = Integer.toString(testStrong) + "-"; for (int k = 0; k < sentence.length; k++) { if (sentence[k].startsWith(checkMissing)) { missingGreek = false; break; } } if (!missingGreek) break; String missingStrong = strongSentences.get(i)[testStrong]; out.write(String.format("%03d", testStrong+1)); out.write("-"); out.write(missingStrong); out.write("{"); appendLexicalEntry(indexSearcher, resultingTagging, missingStrong, out); out.write("="); appendGreekEntry(indexSearcher, resultingTagging, missingStrong, out); out.write("}\t"); } int additional_word = 0; //Have we added an extra word? if (indexes[1] != prev) { // add aligned word int n = 1; // If there is more than one trans word for this Greek word, add it now while (sentence_array[word_counter + n][0] == indexes[0]) { out.write("\n"); out.write(otherSentences.get(i)[indexes[1]+n]); additional_word++; n++; } prev = indexes[1] + additional_word; } } catch (Exception e) { WriteDebug("Error in verse " + ref + " for word: " + word); WriteDebug(e.getMessage()); } word_counter++; } // get unaligned end of sentence int otherLength = otherSentences.get(i).length; if (prev < otherLength) { for (int j = prev + 1; j < otherLength; j++) { out.write("\n"); out.write(otherSentences.get(i)[j]); } out.write("\n"); out.write("~"); } } return resultingTagging.toString(); } private static String[] reOrder(final String[] sentence) { List<String> words = Arrays.asList(sentence); Collections.sort(words, new Comparator<String>() { @Override public int compare(final String o1, final String o2) { if (o1 == null || o1.length() == 0) { return 1; } if (o2 == null || o2.length() == 0) { return -1; } return ((Integer) Integer.parseInt(o1.split("-")[1])).compareTo(Integer.parseInt(o2.split("-")[1])); } }); return words.toArray(new String[words.size()]); } private static void appendLexicalEntry(final IndexSearcher indexSearcher, final StringBuilder resultingTagging, String strong, BufferedWriter out) throws IOException { if (strong.length() > 5 && strong.charAt(1) == '0') { strong = strong.substring(0, 1) + strong.substring(2); } String gloss = entries.get(strong); if (gloss == null) { final TopDocs lexicalEntries = indexSearcher.search(new TermQuery(new Term("strongNumber", StringConversionUtils.getStrongPaddedKey(strong))), Integer.MAX_VALUE); if (lexicalEntries.scoreDocs.length > 0) { gloss = indexSearcher.doc(lexicalEntries.scoreDocs[0].doc).get("stepGloss"); } else { gloss = ""; } entries.put(strong, gloss); } resultingTagging.append(gloss); out.write(gloss); } private static void appendGreekEntry(final IndexSearcher indexSearcher, final StringBuilder resultingTagging, String strong, final BufferedWriter out) throws IOException { if (strong.length() > 5 && strong.charAt(1) == '0') { strong = strong.substring(0, 1) + strong.substring(2); } String greek = greekEntries.get(strong); if (greek == null) { final TopDocs lexicalEntries = indexSearcher.search(new TermQuery(new Term("strongNumber", StringConversionUtils.getStrongPaddedKey(strong))), Integer.MAX_VALUE); if (lexicalEntries.scoreDocs.length > 0) { greek = indexSearcher.doc(lexicalEntries.scoreDocs[0].doc).get("accentedUnicode"); } else { greek = ""; } greekEntries.put(strong, greek); } out.write(greek); } private static List<String[]> splitByWord(final String strongs) { final String[] sentences = strongs.split("\r?\n"); List<String[]> sss = new ArrayList<String[]>(64000); for (String sentence : sentences) { final String[] split = org.apache.commons.lang3.StringUtils.split(sentence, ' '); sss.add(split); } return sss; } }