package yuku.alkitabconverter.usfx_common; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.ext.DefaultHandler2; import yuku.alkitab.util.Ari; import yuku.alkitab.yes2.model.PericopeData; import yuku.alkitabconverter.util.FootnoteDb; import yuku.alkitabconverter.util.TextDb; import yuku.alkitabconverter.util.XrefDb; import yuku.alkitabconverter.yes_common.Yes2Common; import yuku.alkitabconverter.yet.YetFileOutput; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import java.util.Stack; public class UsfxToYet { static final SAXParserFactory factory = SAXParserFactory.newInstance(); TextDb textDb = new TextDb(); StringBuilder mystery = new StringBuilder(); XrefDb xrefDb = new XrefDb(); FootnoteDb footnoteDb = new FootnoteDb(); PericopeData pericopeData = new PericopeData(); { pericopeData.entries = new ArrayList<>(); } public void u(final InputStream[] inputs, final int[] books_0, final String info_locale, final String info_short_name, final String info_long_name, final String info_description, final List<String> book_names, final List<String> book_abbrs, final OutputStream output_yet) throws IOException { for (int i = 0; i < inputs.length; i++) { final InputStream input = inputs[i]; final int book_0 = books_0[i]; System.out.println("input start;"); try { SAXParser parser = factory.newSAXParser(); XMLReader r = parser.getXMLReader(); System.out.println("input buffer size (old) = " + r.getProperty("http://apache.org/xml/properties/input-buffer-size")); r.setProperty("http://apache.org/xml/properties/input-buffer-size", 1048576); System.out.println("input buffer size (new) = " + r.getProperty("http://apache.org/xml/properties/input-buffer-size")); r.setFeature("http://xml.org/sax/features/namespaces", true); parser.parse(input, new Handler(book_0)); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } System.out.println("input done; now total rec: " + textDb.size()); } System.out.println("OUTPUT MYSTERY:"); System.out.println(mystery); System.out.println("OUTPUT XREF:"); xrefDb.processEach(XrefDb.defaultShiftTbProcessor); xrefDb.dump(); System.out.println("OUTPUT FOOTNOTE:"); footnoteDb.dump(); // POST-PROCESS textDb.normalize(); textDb.removeEmptyVerses(); textDb.dump(); ////////// PROSES KE YET final YetFileOutput yet = new YetFileOutput(output_yet); final Yes2Common.VersionInfo versionInfo = new Yes2Common.VersionInfo(); versionInfo.locale = info_locale; versionInfo.shortName = info_short_name; versionInfo.longName = info_long_name; versionInfo.description = info_description; versionInfo.setBookNamesAndAbbreviations(book_names, book_abbrs); yet.setVersionInfo(versionInfo); yet.setTextDb(textDb); yet.setPericopeData(pericopeData); yet.setXrefDb(xrefDb); yet.setFootnoteDb(footnoteDb); yet.write(); } public class Handler extends DefaultHandler2 { private static final int LEVEL_p_r = -2; private static final int LEVEL_p_ms = -3; private static final int LEVEL_p_mr = -4; int book_0 = -1; int chapter_1 = 0; int verse_1 = 0; String[] tree = new String[80]; int depth = 0; Stack<Object> writeTarget = new Stack<>(); Object writeTarget_mystery = new Object(); Object writeTarget_text = new Object(); Object writeTarget_pericopeTitle = new Object(); Object writeTarget_xref = new Object(); Object writeTarget_footnote = new Object(); List<PericopeData.Entry> pericopeBuffer = new ArrayList<>(); boolean afterThisMustStartNewPerikop = true; // if true, we have done with a pericope title, so the next text must become a new pericope title instead of appending to existing one // states int sLevel = 0; int textIndent = -1; // -2 adalah para start; 0 1 2 3 4 adalah q level; int xref_state = -1; // 0 is initial (just encountered xref tag <x>), 1 is source, 2 is target int footnote_state = -1; // 0 is initial (just encountered footnote tag <f>), 1 is fr (reference), 2 is fk (keywords), 3 is ft (text) // for preventing split of characters in text elements StringBuilder charactersBuffer = new StringBuilder(); public Handler(int book_0) { this.book_0 = book_0; writeTarget.push(writeTarget_mystery); } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { flushCharactersBuffer(); tree[depth++] = localName; System.out.print("(start:) "); print(); String alamat = address(); if (alamat.endsWith("/c")) { String id = attributes.getValue("id"); System.out.println("#c:" + id); chapter_1 = Integer.parseInt(id.trim()); verse_1 = 1; // reset ayat tiap ganti pasal } else if (alamat.endsWith("/v")) { String id = attributes.getValue("id"); System.out.println("#v:" + id); try { verse_1 = Integer.parseInt(id); } catch (NumberFormatException e) { System.out.println("// number format exception for: " + id); // get until first non number for (int pos = 0; pos < id.length(); pos++) { if (!Character.isDigit(id.charAt(pos))) { String s = id.substring(0, pos); verse_1 = Integer.parseInt(s); System.out.println("// number format exception simplified to: " + s); break; } } } } else if (alamat.endsWith("/f")) { writeTarget.push(writeTarget_footnote); footnote_state = 0; } else if (alamat.endsWith("/f/fr")) { footnote_state = 1; } else if (alamat.endsWith("/f/fk")) { footnote_state = 2; } else if (alamat.endsWith("/f/ft")) { footnote_state = 3; } else if (alamat.endsWith("/p")) { String sfm = attributes.getValue("sfm"); if (sfm != null) { switch (sfm) { case "r": writeTarget.push(writeTarget_pericopeTitle); sLevel = LEVEL_p_r; break; case "mt": writeTarget.push(writeTarget_mystery); break; case "ms": writeTarget.push(writeTarget_pericopeTitle); sLevel = LEVEL_p_ms; break; case "mr": writeTarget.push(writeTarget_pericopeTitle); sLevel = LEVEL_p_mr; break; case "mi": writeTarget.push(writeTarget_text); textIndent = 2; break; case "pi": // Indented para writeTarget.push(writeTarget_text); textIndent = 1; break; case "pc": // Centered para writeTarget.push(writeTarget_text); textIndent = 2; break; case "m": /* * Flush left (margin) paragraph. * • No first line indent. * • Followed immediately by a space and paragraph text, or by a new line and a verse marker. * • Usually used to resume prose at the margin (without indent) after poetry or OT quotation (i.e. continuation of the previous paragraph). */ writeTarget.push(writeTarget_text); textIndent = 0; // inden 0 break; default: throw new RuntimeException("p@sfm ga dikenal: " + sfm); } } else { writeTarget.push(writeTarget_text); textIndent = -2; } } else if (alamat.endsWith("/q")) { writeTarget.push(writeTarget_text); int level = Integer.parseInt(attributes.getValue("level")); if (level >= 1 && level <= 4) { textIndent = level; } else { throw new RuntimeException("q level = " + level); } } else if (alamat.endsWith("/s")) { writeTarget.push(writeTarget_pericopeTitle); sLevel = Integer.parseInt(attributes.getValue("level")); } else if (alamat.endsWith("/x")) { writeTarget.push(writeTarget_xref); xref_state = 0; } else if (alamat.endsWith("/x/milestone")) { // after milestone, we will have xref source xref_state = 1; } else if (alamat.endsWith("/x/xt")) { // after xt, we will have xref target xref_state = 2; } else if (alamat.endsWith("/wj")) { writeTarget.push(writeTarget_text); write("@6"); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { flushCharactersBuffer(); System.out.print("(end:) "); print(); String alamat = address(); if (alamat.endsWith("/p")) { writeTarget.pop(); } else if (alamat.endsWith("/f")) { writeTarget.pop(); } else if (alamat.endsWith("/s")) { afterThisMustStartNewPerikop = true; writeTarget.pop(); } else if (alamat.endsWith("/x")) { writeTarget.pop(); } else if (alamat.endsWith("/wj")) { write("@5"); writeTarget.pop(); } tree[--depth] = null; } private void flushCharactersBuffer() { if (charactersBuffer.length() > 0) { charactersCompleted(charactersBuffer.toString()); charactersBuffer.setLength(0); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { charactersBuffer.append(ch, start, length); } void charactersCompleted(String text) { System.out.println("#text:" + text); if (text.trim().length() == 0) { // when processing footnotes, we still continue even though the text is whitespace only. if (writeTarget.peek() != writeTarget_footnote) { return; } } write(text); } private void write(String chars) { Object target = writeTarget.peek(); if (target == writeTarget_mystery) { System.out.println("$tulis ke mystery " + book_0 + " " + chapter_1 + " " + verse_1 + ":" + chars); mystery.append(chars).append('\n'); } else if (target == writeTarget_text) { System.out.println("$tulis ke teks[jenis=" + textIndent + "] " + book_0 + " " + chapter_1 + " " + verse_1 + ":" + chars); textDb.append(book_0, chapter_1, verse_1, chars.replace("\n", " ").replaceAll("\\s+", " "), textIndent); textIndent = -1; // reset if (pericopeBuffer.size() > 0) { for (PericopeData.Entry pe: pericopeBuffer) { pe.block.title = pe.block.title.replace("\n", " ").replace(" ", " ").trim(); System.out.println("(commit to perikopData " + book_0 + " " + chapter_1 + " " + verse_1 + ":) " + pe.block.title); pe.ari = Ari.encode(book_0, chapter_1, verse_1); pericopeData.entries.add(pe); } pericopeBuffer.clear(); } } else if (target == writeTarget_pericopeTitle) { // masukin ke data perikop final String title = chars; if (sLevel == 0 || sLevel == 1 || sLevel == LEVEL_p_mr || sLevel == LEVEL_p_ms) { if (afterThisMustStartNewPerikop || pericopeBuffer.size() == 0) { PericopeData.Entry entry = new PericopeData.Entry(); entry.ari = 0; // done later when writing teks so we know which verse this pericope starts from entry.block = new PericopeData.Block(); entry.block.title = title; pericopeBuffer.add(entry); afterThisMustStartNewPerikop = false; System.out.println("$tulis ke pericopeBuffer (new entry) (size now: " + pericopeBuffer.size() + "): " + title); } else { pericopeBuffer.get(pericopeBuffer.size() - 1).block.title += title; System.out.println("$tulis ke pericopeBuffer (append to existing) (size now: " + pericopeBuffer.size() + "): " + title); } } else if (sLevel == LEVEL_p_r) { // paralel if (pericopeBuffer.size() == 0) { throw new RuntimeException("paralel found but no perikop on buffer: " + title); } PericopeData.Entry entry = pericopeBuffer.get(pericopeBuffer.size() - 1); entry.block.parallels = parseParallel(title); } else if (sLevel == 2) { System.out.println("$tulis ke tempat sampah (perikop level 2): " + title); } else { throw new RuntimeException("sLevel = " + sLevel + " not understood: " + title); } } else if (target == writeTarget_xref) { System.out.println("$tulis ke xref (state=" + xref_state + ") " + book_0 + " " + chapter_1 + " " + verse_1 + ":" + chars); int ari = Ari.encode(book_0, chapter_1, verse_1); if (xref_state == 0) { // compatibility when \x and \x* are written without any \xo or \xt markers. // Check chars, if it contains more than just spaces, -, +, or a character, it means it looks like a complete xref entry. final String content = chars; final int xrefIndex; if (content.replaceFirst("[-+a-zA-Z]", "").replaceAll("\\s", "").length() > 0) { xrefIndex = xrefDb.addComplete(ari, chars); } else { xrefIndex = xrefDb.addBegin(ari); } textDb.append(ari, "@<x" + (xrefIndex + 1) + "@>@/", -1); } else if (xref_state == 1) { xrefDb.appendText(ari, chars); } else if (xref_state == 2) { xrefDb.appendText(ari, chars); } else { throw new RuntimeException("xref_state not supported"); } } else if (target == writeTarget_footnote) { System.out.println("$tulis ke footnote (state=" + footnote_state + ") " + book_0 + " " + chapter_1 + " " + verse_1 + ":" + chars); final int ari = Ari.encode(book_0, chapter_1, verse_1); if (footnote_state == 0) { final String content; // remove caller at the beginning if (chars.matches("[a-zA-Z+-]\\s.*")) { // remove that first 2 characters content = chars.substring(2); } else { content = chars; } final int footnoteIndex = footnoteDb.addBegin(ari); if (content.trim().length() != 0) { footnoteDb.appendText(ari, content.replace("\n", " ")); } textDb.append(ari, "@<f" + (footnoteIndex + 1) + "@>@/", -1); } else if (footnote_state == 2) { footnoteDb.appendText(ari, "@9" + chars.replace("\n", " ") + "@7"); } else if (footnote_state == 1 || footnote_state == 3) { footnoteDb.appendText(ari, chars.replace("\n", " ")); } else { throw new RuntimeException("footnote_state not supported"); } } } void print() { for (int i = 0; i < depth; i++) { System.out.print('/'); System.out.print(tree[i]); } System.out.println(); } private StringBuilder a = new StringBuilder(); private String address() { a.setLength(0); for (int i = 0; i < depth; i++) { a.append('/').append(tree[i]); } return a.toString(); } } /** * (Mat. 23:1-36; Mrk. 12:38-40; Luk. 20:45-47) -> [Mat. 23:1-36, Mrk. 12:38-40, Luk. 20:45-47] * (Mat. 10:26-33, 19-20) -> [Mat. 10:26-33, Mat. 10:19-20] * (Mat. 6:25-34, 19-21) -> [Mat. 6:25-34, Mat. 6:19-21] * (Mat. 10:34-36) -> [Mat. 10:34-36] * (Mat. 26:57-58, 69-75; Mrk. 14:53-54, 66-72; Yoh. 18:12-18, 25-27) -> [Mat. 26:57-58, Mat. 26:69-75, Mrk. 14:53-54, Mrk. 14:66-72, Yoh. 18:12-18, Yoh. 18:25-27] * (2Taw. 13:1--14:1) -> [2Taw. 13:1--14:1] * (2Taw. 14:1-5, 15:16--16:13) -> [2Taw. 14:1-5, 2Taw. 15:16--16:13] * (2Taw. 2:13-14, 3:15--5:1) -> [2Taw. 2:13-14, 2Taw. 3:15--5:1] * (2Taw. 34:3-7, 35:1-27) -> [2Taw. 34:3-7, 2Taw. 35:1-27] */ static List<String> parseParallel(String judul) { List<String> res = new ArrayList<>(); judul = judul.trim(); if (judul.startsWith("(")) judul = judul.substring(1); if (judul.endsWith(")")) judul = judul.substring(0, judul.length() - 1); String kitab = null; String pasal = null; String ayat; String[] alamats = judul.split("[;,]"); for (String alamat : alamats) { alamat = alamat.trim(); String[] bagians = alamat.split(" +", 2); String pa; if (bagians.length == 1) { // no kitab; if (kitab == null) throw new RuntimeException("no existing kitab"); pa = bagians[0]; } else { kitab = bagians[0]; pa = bagians[1]; } String[] parts = pa.split(":", 2); if (parts.length == 1) { // no pasal if (pasal == null) throw new RuntimeException("no existing pasal"); ayat = parts[0]; } else { pasal = parts[0]; ayat = parts[1]; } res.add(kitab + " " + pasal + ":" + ayat); } return res; } }