package yuku.alkitabconverter.unboundbatch;
import yuku.alkitab.util.Ari;
import yuku.alkitabconverter.util.KjvUtils;
import yuku.alkitabconverter.util.TextDb;
import yuku.alkitabconverter.util.TextDb.VerseState;
import yuku.alkitabconverter.yes_common.Yes2Common;
import yuku.alkitabconverter.yet.YetFileOutput;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Scanner;
import java.util.Set;
public class UnboundBatchConverter {
static String DATA_DIR = "/Users/yuku/j/operasi/unbound";
List<String> appConfigEntries = new ArrayList<>();
public static void main(String[] args) throws Exception {
new UnboundBatchConverter().convertAll();
}
void convertAll() throws Exception {
// look for directories starting with "ready-"
File[] superdirs = new File(DATA_DIR).listFiles(new FileFilter() {
@Override public boolean accept(File f) {
return f.isDirectory() && f.getName().startsWith("ready-");
}
});
// look for all directories under them but only those with corresponding ".properties" file
for (final File superdir: superdirs) {
File[] dirs = superdir.listFiles(new FileFilter() {
@Override public boolean accept(File f) {
return f.isDirectory() && new File(superdir, f.getName() + ".properties").exists();
}
});
for (File dir: dirs) {
processVersion(superdir, dir);
}
}
Collections.sort(appConfigEntries);
for (String s: appConfigEntries) {
System.out.println(s);
}
}
void processVersion(File superdir, File dir) throws Exception {
// look for text: ending in _utf8_mapped_to_NRSVA.txt or _utf8.txt
File textFile = null;
boolean mapped = false;
for (File f: dir.listFiles()) {
if (f.getName().endsWith("_utf8_mapped_to_NRSVA.txt")) {
textFile = f;
mapped = true;
break;
}
}
if (textFile == null) {
for (File f: dir.listFiles()) {
if (f.getName().endsWith("_utf8.txt")) {
textFile = f;
mapped = false;
break;
}
}
}
if (textFile == null) {
throw new RuntimeException("text file not found in dir " + dir);
}
// read properties file
Yes2Common.VersionInfo versionInfo = new Yes2Common.VersionInfo();
final InputStreamReader propInput = new InputStreamReader(new FileInputStream(new File(superdir, dir.getName() + ".properties")), "utf-8");
Properties prop = new Properties();
prop.load(propInput);
versionInfo.locale = prop.getProperty("versionInfo.locale");
versionInfo.shortName = prop.getProperty("versionInfo.shortName");
versionInfo.longName = prop.getProperty("versionInfo.longName");
versionInfo.description = prop.getProperty("versionInfo.description");
final String outputName = prop.getProperty("output.name");
propInput.close();
// read booknames file
Scanner sc = new Scanner(new File(superdir, dir.getName() + ".booknames.txt"));
List<String> bookNames = new ArrayList<>();
while (sc.hasNextLine()) {
bookNames.add(sc.nextLine());
}
versionInfo.setBookNamesAndAbbreviations(bookNames, null);
sc.close();
TextDb textDb = processTextFile(superdir.getName(), dir.getName(), textFile, mapped);
// CREATE YET FILE
{
final YetFileOutput yet = new YetFileOutput(new File("/tmp/" + outputName + ".yet"));
yet.setVersionInfo(versionInfo);
yet.setTextDb(textDb);
yet.write();
}
// CREATE DUMP FILE
PrintStream ps = new PrintStream(new File("/tmp/" + outputName + ".txt"));
textDb.dump(ps);
ps.close();
// CREATE YES FILE
Yes2Common.createYesFile(new File("/tmp", outputName + ".yes"), versionInfo, textDb, null, true);
appConfigEntries.add(String.format("<preset locale=%-6s shortName=%-9s longName=%s filename_preset=%s url=%s />", q(versionInfo.locale), q(versionInfo.shortName), q(versionInfo.longName), q(outputName + ".yes"), q("https://alkitab-host.appspot.com/addon/yes2/" + outputName + "--1.yes.gz")));
System.out.println("Processing finished, total verses: " + textDb.size());
}
String q(String s) {
return '"' + s.replace("\"", """) + '"';
}
TextDb processTextFile(String categoryName, String versionName, File textFile, boolean mapped) throws Exception {
List<String> rawLines = new ArrayList<>();
List<String[]> brokenLines = new ArrayList<>();
Map<String, Integer> columns = new LinkedHashMap<>();
BufferedReader sc = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), "utf-8"));
while (true) {
String rawLine = sc.readLine();
if (rawLine == null) break;
if (rawLine.startsWith("#columns")) {
String[] splits = rawLine.split("\t");
for (int i = 1; i < splits.length; i++) {
columns.put(splits[i], i-1);
}
} else if (rawLine.startsWith("#") || rawLine.length() == 0) {
// nop
} else {
rawLines.add(rawLine);
brokenLines.add(rawLine.split("\t", -1));
}
}
System.out.println(versionName + " mapped=" + mapped + " has rawLines " + rawLines.size());
// check required columns
if (!columns.containsKey("orig_book_index")) throw new RuntimeException("column orig_book_index not found");
if (!columns.containsKey("orig_chapter")) throw new RuntimeException("column orig_chapter not found");
if (!columns.containsKey("orig_verse")) throw new RuntimeException("column orig_verse not found");
if (!columns.containsKey("text")) throw new RuntimeException("column text not found");
sc.close();
// check if all verses belong to kjv
return checkVersesAllKjv(categoryName, versionName, rawLines, brokenLines, columns, mapped);
}
TextDb checkVersesAllKjv(String categoryName, String versionName, List<String> rawLines, List<String[]> brokenLines, Map<String, Integer> columns, boolean mapped) throws Exception {
TextDb textDb = new TextDb();
int col_text = columns.get("text");
// These versions has a bug (?) that Ps 66 has only 19 verses according to NRSVA
// but their orig_verses has 1-20 verses which is correct. The NRSVA itself contains 20 verses.
// So it seems like unnecessary verse shift.
boolean versionWithPsalm66bug = Arrays.asList("afrikaans_1953_ucs2", "french_ostervald_1996_ucs2", "norwegian_ucs2", "wlc_consonants_ucs2", "wlc_ucs2", "wlc_vowels_ucs2").contains(versionName);
// PATCH for missing data:
// afrikaans_1953_ucs2
if (!mapped) {
int col_orig_book_index = columns.get("orig_book_index");
int col_orig_chapter = columns.get("orig_chapter");
int col_orig_verse = columns.get("orig_verse");
for (String[] brokenLine : brokenLines) {
try {
int bookId = Integer.parseInt(brokenLine[col_orig_book_index].substring(0, 2)) - 1;
int chapter_1 = Integer.parseInt(brokenLine[col_orig_chapter]);
int verse_1 = Integer.parseInt(brokenLine[col_orig_verse]);
if (bookId > 65) { // Apocrypha
continue;
}
if (!KjvUtils.isValidKjv(bookId, chapter_1, verse_1)) {
System.out.printf("NOT VALID KJV: %s %s %s\n", bookId, chapter_1, verse_1);
} else {
// post-process
String verseText = brokenLine[col_text];
verseText = postProcessText(verseText);
textDb.append(bookId, chapter_1, verse_1, verseText, -1);
}
} catch (Exception e) {
System.out.println("error when processing: " + Arrays.toString(brokenLine));
e.printStackTrace();
}
}
} else {
int col_orig_chapter = columns.get("orig_chapter");
int col_orig_verse = columns.get("orig_verse");
int col_nrsva_book_index = columns.get("nrsva_book_index");
int col_nrsva_chapter = columns.get("nrsva_chapter");
int col_nrsva_verse = columns.get("nrsva_verse");
int bookId = -1;
int chapter_1 = 0;
int verse_1 = 0;
int orig_chapter_1 = 0;
int orig_verse_1 = 0;
// Some of the files contain invalid NRSVA bcvs on the book of psalms,
// we need to register that and make the NRSVA verses 1+2 -> KJV verse 1
// and subsequent NRSVA verses n -> KJV verse (n-1)
Set<Integer> psalmChaptersWithInvalidNrsvaBcv = new LinkedHashSet<>();
int[] psalmChaptersCanHaveInvalidNrsvaBcv = {30,51,52,54,60,84,85};
for (String[] brokenLine : brokenLines) {
if (brokenLine[col_nrsva_book_index].length() == 0) {
// use previous line's info
} else {
bookId = Integer.parseInt(brokenLine[col_nrsva_book_index].substring(0, 2)) - 1;
chapter_1 = Integer.parseInt(brokenLine[col_nrsva_chapter]);
verse_1 = Integer.parseInt(brokenLine[col_nrsva_verse]);
}
if (!KjvUtils.isValidKjv(bookId, chapter_1, verse_1)) {
if (bookId == 18 /* psalms */ && Arrays.binarySearch(psalmChaptersCanHaveInvalidNrsvaBcv, chapter_1) >= 0) {
psalmChaptersWithInvalidNrsvaBcv.add(chapter_1);
}
}
}
if (psalmChaptersWithInvalidNrsvaBcv.size() > 0) {
System.out.println("warning: psalmChaptersWithInvalidNrsvaBcv = " + psalmChaptersWithInvalidNrsvaBcv);
}
for (String[] brokenLine : brokenLines) {
try {
if (brokenLine[col_nrsva_book_index].length() == 0) {
// use previous line's info
} else {
bookId = Integer.parseInt(brokenLine[col_nrsva_book_index].substring(0, 2)) - 1;
chapter_1 = Integer.parseInt(brokenLine[col_nrsva_chapter]);
verse_1 = Integer.parseInt(brokenLine[col_nrsva_verse]);
}
if (brokenLine[col_orig_verse].length() == 0) {
// use previous line's info
} else {
orig_chapter_1 = Integer.parseInt(brokenLine[col_orig_chapter]);
orig_verse_1 = Integer.parseInt(brokenLine[col_orig_verse]);
}
if (bookId > 65) { // Apocrypha
continue;
}
// convert NRSVA to KJV:
// 3John 1 64N 1 NRSVA ayat 14+15 -> KJV ayat 14
if (bookId == 63 && chapter_1 == 1 && verse_1 == 15) {
// should have verse_1 = 14;
verse_1 = 14;
}
// Rev 12 66N 12 NRSVA ayat 17+18 -> KJV ayat 17
else if (bookId == 65 && chapter_1 == 12 && verse_1 == 18) {
// should have verse_1 = 17;
verse_1 = 17;
}
else if (bookId == 18 /* psalms */ && psalmChaptersWithInvalidNrsvaBcv.contains(chapter_1) && verse_1 >= 2) {
verse_1 -= 1;
}
else if (bookId == 46 /* 2Cor */ && chapter_1 == 13 && verse_1 == 12) {
if (orig_verse_1 == 12) {
// first part of NRSVA verse 12, which is KJV verse 12
} else if (orig_verse_1 == 13) {
// second part of NRSVA verse 12, which is KJV verse 13
verse_1 = 13;
} else {
throw new RuntimeException("This should not happen 390");
}
}
else if (bookId == 46 /* 2Cor */ && chapter_1 == 13 && verse_1 == 13) {
if (orig_verse_1 == 13) {
// this version follows NRSVA, no need to adjust
} else if (orig_verse_1 == 14) {
verse_1 = 14;
} else {
throw new RuntimeException("This should not happen 125");
}
}
else if (versionWithPsalm66bug && bookId == 18 /* Ps */ && chapter_1 == 66) {
verse_1 = orig_verse_1; // just follow the original
}
if (!KjvUtils.isValidKjv(bookId, chapter_1, verse_1)) {
throw new RuntimeException("NOT VALID KJV: " + bookId + " " + chapter_1 + " " + verse_1);
}
String prefix;
if (chapter_1 != orig_chapter_1 || verse_1 != orig_verse_1) {
prefix = "(" + orig_chapter_1 + "-" + orig_verse_1 + ") ";
} else {
prefix = "";
}
// post-process
String verseText = brokenLine[col_text];
verseText = postProcessText(verseText);
textDb.append(bookId, chapter_1, verse_1, prefix + verseText, -1, " ");
} catch (Exception e) {
System.out.println("error when processing: " + Arrays.toString(brokenLine));
e.printStackTrace();
}
}
}
{ // check whether ALL KJV verses are stored
final boolean[] lids = new boolean[31102];
textDb.processEach(new TextDb.TextProcessor() {
@Override public void process(int ari, VerseState ayatState) {
lids[KjvUtils.ariToLid(ari) - 1] = true;
}
});
List<Integer> notexists = new ArrayList<>();
for (int i = 0; i < lids.length; i++) {
boolean exist = lids[i];
if (!exist) {
notexists.add(KjvUtils.lidToAri(i + 1));
}
}
// SPECIAL CASE FOR 2Cor 13
// In NRSVA, 2Cor 13 has 13 verses, in KJV has 14 verses
// NRSVA verse 12 -> KJV verse 12+13
// NRSVA verse 13 -> KJV verse 14
// This is for those versions originally following NRSVA (not KJV) so we need to append empty verse 14.
if (notexists.contains(0x2e0d0e) && !notexists.contains(0x2e0d0d)) {
System.out.println("info: adding 2cor 13:14");
textDb.append(0x2e0d0e, "", -1);
notexists.remove((Integer)0x2e0d0e);
}
// SPECIAL CASE FOR Rom 16
// In NRSVA and KJV, Rom 16 has 27 verses, but on WEB (or maybe others) it has only 24 or 25 verses. Verse 25 onwards are empty.
if (!notexists.contains(0x2c1018)) {
if (notexists.contains(0x2c1019)) {
System.out.println("info: adding Rom 16:25");
textDb.append(0x2c1019, "", -1);
notexists.remove((Integer)0x2c1019);
}
if (notexists.contains(0x2c101a)) {
System.out.println("info: adding Rom 16:26");
textDb.append(0x2c101a, "", -1);
notexists.remove((Integer)0x2c101a);
}
if (notexists.contains(0x2c101b)) {
System.out.println("info: adding Rom 16:27");
textDb.append(0x2c101b, "", -1);
notexists.remove((Integer)0x2c101b);
}
}
if (notexists.size() > 0) {
System.out.println("warning: kjv verses not found: " + notexists.size());
System.out.println("warning: such as: ");
for (int i = 0; i < notexists.size() && i < 5; i++) {
int ari = notexists.get(i);
System.out.println("warning: - " + Ari.toBook(ari) + " " + Ari.toChapter(ari) + " " + Ari.toVerse(ari));
}
}
}
return textDb;
}
private static String postProcessText(String verseText) {
{ // look for <I> (italics)
if (verseText.contains("<I>") && verseText.contains("</I>")) {
verseText = verseText.replaceAll("<I>", "@9").replaceAll("</I>", "@7");
} else if ((verseText.contains("<I>") ^ verseText.contains("</I>"))) {
throw new RuntimeException("Verse contains <I> or </I> but no corresponding tag");
}
}
{ // look for start para marker ΒΆ
if (verseText.contains("\u00B6")) {
verseText = verseText.replaceAll("\u00B6 ?", "@^");
}
}
return verseText;
}
}