/**************************************************************************************************
 * Copyright (c) 2013, Directors of the Tyndale STEP Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)
 * nor the names of its contributors may be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************************************/
package com.tyndalehouse.step.tools.analysis;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Converts the Berkeley aligner output (training.align), together with the two source bibles
 * (bible.s carrying Strong's numbers, bible.o carrying the translation text), into a
 * tab-separated tagging table, looking up a gloss for each Strong's number in the STEP
 * "definition" Lucene index.
 *
 * @author chrisburrell
 */
public class BerkeleyOutputToTaggingFormat {
    private static final Map<String, String> entries = new HashMap<String, String>(12000);

    public static void main(String[] args) throws IOException {
        // David's paths
        final String root = "C:\\Users\\David IB\\Dropbox\\STEP-Tagging\\autoTag\\Bibles\\";
        final String strongs = FileUtils.readFileToString(new File(root + "bible.s"));
        final String other = FileUtils.readFileToString(new File(root + "bible.o"));
        final String results = FileUtils.readFileToString(new File(root + "training.align"));
        final String keyFile = FileUtils.readFileToString(new File(root + "keyList.txt"));

        /* Chris' paths
        final String root = "C:\\temp\\berkeley\\berkeleyBibles\\output\\";
        final String strongs = FileUtils.readFileToString(new File(root + "bible.s"));
        final String other = FileUtils.readFileToString(new File(root + "bible.o"));
        final String results = FileUtils.readFileToString(new File(root + "training.align"));
        final String keyFile = FileUtils.readFileToString(new File(root + "keyList-nt.txt"));
        */

        List<String[]> strongSentences = splitByWord(strongs);
        List<String[]> otherSentences = splitByWord(other);
        List<String[]> resultSentences = splitByWord(results);
        List<String[]> keyList = splitByWord(keyFile);
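        // Assumed input layout, inferred from the parsing below: bible.s and bible.o hold one verse
        // per line (space-separated tokens, Strong's numbers on the .s side, translation words on
        // the .o side), keyList.txt holds the matching verse reference per line, and training.align
        // holds the aligner's "i-j" index pairs per line, where i points into the .s line and j
        // into the .o line.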
        final File path = new File("C:\\Users\\David IB\\AppData\\Roaming\\JSword\\step\\entities\\definition");
        // final File path = new File("C:\\Users\\Chris\\AppData\\Roaming\\JSword\\step\\entities\\definition");
        FSDirectory directory = FSDirectory.open(path);
        final IndexSearcher indexSearcher = new IndexSearcher(directory);

        String resultTagging = parseResultsAsTable(resultSentences, strongSentences, otherSentences, indexSearcher, keyList);
        FileUtils.writeStringToFile(new File(root + "positionalTagging-table.txt"), resultTagging);
    }

    private static String parseResultsAsTable(final List<String[]> resultSentences, final List<String[]> strongSentences,
                                              final List<String[]> otherSentences, final IndexSearcher indexSearcher,
                                              final List<String[]> keyList) throws IOException {
        StringBuilder resultingTagging = new StringBuilder(8000000);

        // verse => results
        Map<String, Map<Integer, String>> verseToResults = new HashMap<String, Map<Integer, String>>(32000);
        for (int i = 0; i < resultSentences.size(); i++) {
            final String[] sentence = resultSentences.get(i);
            String ref = keyList.get(i)[0];
            if (i % 200 == 0) {
                System.out.println(ref);
            }

            Map<Integer, String> resultTagging = new HashMap<Integer, String>();
            verseToResults.put(ref, resultTagging);

            for (String word : sentence) {
                String[] stringIndexes = word.split("-");
                try {
                    int[] indexes = new int[]{Integer.parseInt(stringIndexes[0]), Integer.parseInt(stringIndexes[1])};
                    if (indexes[0] == 0 && indexes[1] == 0 && sentence.length == 1) {
                        continue;
                    }

                    // find word in sentence in each bible.
                    resultTagging.put(indexes[1], strongSentences.get(i)[indexes[0]]);
                } catch (Exception e) {
                    System.out.println("Error in verse " + ref + " for word: " + word);
                    System.out.println(e.getMessage());
                }
            }
        }

        for (int ii = 0; ii < otherSentences.size(); ii++) {
            // output every word
            String[] words = otherSentences.get(ii);
            int wordNumber = 0;
            final String verseRef = keyList.get(ii)[0];
            // resultingTagging.append("$");
            outputVerseRef(resultingTagging, verseRef, wordNumber);

            final Map<Integer, String> sentenceStrongs = verseToResults.get(verseRef);
            boolean wasStrongNumber = false;
            String lastStrongNumber = null;
            for (int jj = 0; jj < words.length; jj++) {
                wordNumber = jj;
                String strongNumber = sentenceStrongs.get(jj);

                // if (lastStrongNumber != null && !lastStrongNumber.equals(strongNumber)) {
                // to avoid repeating the same Strong's number twice. But sometimes supposed to repeat! e.g. Matt.1.2
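                // lastStrongNumber defers the tag output: the previous word's Strong's number and
                // its gloss are only flushed once the following word is reached, after which a
                // fresh verseRef-wordNumber marker opens the next row.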
                if (lastStrongNumber != null) {
                    outputStrongNumber(resultingTagging, verseRef, lastStrongNumber, indexSearcher);
                    outputVerseRef(resultingTagging, verseRef, wordNumber);
                }

                if (strongNumber == null) {
                    resultingTagging.append(words[jj]);
                    // if (wasStrongNumber) {
                    //     resultingTagging.append('\n');
                    // } else {
                    resultingTagging.append(' ');
                    // }
                    wasStrongNumber = false;
                    lastStrongNumber = null;
                } else {
                    /* if (wasStrongNumber && lastStrongNumber.equals(strongNumber)) {
                        resultingTagging.append(' ');
                    } else { */
                    resultingTagging.append('\t');
                    // }
                    resultingTagging.append(words[jj]);
                    // resultingTagging.append('\t');
                    wasStrongNumber = true;
                    lastStrongNumber = strongNumber;
                }
            }

            if (lastStrongNumber != null) {
                outputStrongNumber(resultingTagging, verseRef, lastStrongNumber, indexSearcher);
            }
            if (resultingTagging.charAt(resultingTagging.length() - 1) != '\n') {
                resultingTagging.append('\n');
            }
        }
        return resultingTagging.toString();
    }

    private static void outputVerseRef(final StringBuilder resultingTagging, final String verseRef, final int wordNumber) {
        // resultingTagging.append("¦");
        resultingTagging.append(verseRef);
        resultingTagging.append('-');
        resultingTagging.append(String.format("%03d", wordNumber));
        resultingTagging.append('\t');
    }

    private static void outputStrongNumber(final StringBuilder resultingTagging, final String verseRef,
                                           final String lastStrongNumber, final IndexSearcher indexSearcher) {
        // output the strong number and a new line
        resultingTagging.append('\t');
        resultingTagging.append("<");
        resultingTagging.append(lastStrongNumber);
        resultingTagging.append("> = ");
        try {
            appendLexicalEntry(indexSearcher, resultingTagging, lastStrongNumber);
        } catch (IOException e) {
            // a failed lookup only loses the gloss, so log and carry on
            e.printStackTrace();
        }
        resultingTagging.append(" ¬");
        resultingTagging.append('\n');
    }

    private static String parseResults(final List<String[]> resultSentences, final List<String[]> strongSentences,
                                       final List<String[]> otherSentences, final IndexSearcher indexSearcher,
                                       final List<String[]> keyList) throws IOException {
        StringBuilder resultingTagging = new StringBuilder(8000000);
        for (int i = 0; i < resultSentences.size(); i++) {
            final String[] sentence = resultSentences.get(i);
            String ref = keyList.get(i)[0];
            if (i % 200 == 0) {
                System.out.println(ref);
            }

            resultingTagging.append(ref);
            resultingTagging.append(' ');
            for (String word : sentence) {
                String[] stringIndexes = word.split("-");
                try {
                    int[] indexes = new int[]{Integer.parseInt(stringIndexes[0]), Integer.parseInt(stringIndexes[1])};
                    if (indexes[0] == 0 && indexes[1] == 0 && sentence.length == 1) {
                        continue;
                    }

                    // find word in sentence in each bible.
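                    // e.g. (hypothetical values) the alignment pair "3-5" picks the fourth token of
                    // the Strong's line and the sixth word of the translation line, both 0-based.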
                    String strong = strongSentences.get(i)[indexes[0]];
                    String other = otherSentences.get(i)[indexes[1]];

                    resultingTagging.append(other);
                    resultingTagging.append(" (");
                    appendLexicalEntry(indexSearcher, resultingTagging, strong);
                    resultingTagging.append(", ");
                    resultingTagging.append(strong);
                    resultingTagging.append(", ");
                    resultingTagging.append(word);
                    resultingTagging.append(") ");
                } catch (Exception e) {
                    System.out.println("Error in verse " + ref + " for word: " + word);
                    System.out.println(e.getMessage());
                }
            }
            resultingTagging.append('\n');
            resultingTagging.append('\n');
        }
        return resultingTagging.toString();
    }

    private static void appendLexicalEntry(final IndexSearcher indexSearcher, final StringBuilder resultingTagging,
                                           String strong) throws IOException {
        // strip the leading zero from padded Strong's numbers (e.g. G01234 becomes G1234) so they match the index
        if (strong.length() > 5 && strong.charAt(1) == '0') {
            strong = strong.substring(0, 1) + strong.substring(2);
        }

        // cache the glosses, since the same Strong's number recurs many times across the text
        String gloss = entries.get(strong);
        if (gloss == null) {
            final TopDocs lexicalEntries = indexSearcher.search(new TermQuery(new Term("strongNumber", strong)), Integer.MAX_VALUE);
            if (lexicalEntries.scoreDocs.length > 0) {
                gloss = indexSearcher.doc(lexicalEntries.scoreDocs[0].doc).get("stepGloss");
            } else {
                gloss = "";
            }
            entries.put(strong, gloss);
        }
        resultingTagging.append(gloss);
    }

    private static List<String[]> splitByWord(final String strongs) {
        final String[] sentences = strongs.split("\r?\n");
        List<String[]> sss = new ArrayList<String[]>(64000);
        for (String sentence : sentences) {
            final String[] split = org.apache.commons.lang3.StringUtils.split(sentence, ' ');
            sss.add(split);
        }
        return sss;
    }
}
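/*
 * Rough shape of one row of positionalTagging-table.txt, as produced by parseResultsAsTable and
 * the two output* helpers above (verse, words and glosses here are hypothetical; <TAB> marks a
 * tab character):
 *
 *   Gen.1.1-000<TAB>In the beginning <TAB>God<TAB><H0430> = God ¬
 *   Gen.1.1-004<TAB><TAB>created<TAB><H1254> = create ¬
 *
 * Each row starts with the verse reference plus the zero-padded index of the word that opened the
 * row, carries the run of translation words up to and including the next tagged word, and ends
 * with the pending Strong's number and its gloss from the lexicon index; a verse that finishes on
 * untagged words simply ends its last row without a tag.
 */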