package yuku.alkitabconverter.ja_kougo; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.ext.DefaultHandler2; import yuku.alkitab.yes2.model.PericopeData; import yuku.alkitabconverter.OsisBookNames; import yuku.alkitabconverter.util.Rec; import yuku.alkitabconverter.util.TextDb; import yuku.alkitabconverter.yes_common.Yes2Common; import yuku.alkitabconverter.yet.YetFileOutput; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; public class Proses1 { public static final String INFO_SHORT_NAME = "KOUGO"; public static final String INFO_LONG_NAME = "口語訳"; public static final String INFO_DESCRIPTION = "口語訳新約聖書(1954年版) Colloquial Japanese (1954)"; static String INPUT_TEKS_1 = "./ja-kougo/xml/"; public static String INPUT_TEKS_ENCODING = "utf-8"; public static int INPUT_TEKS_ENCODING_YES = 2; // 1: ascii; 2: utf-8; public static String INPUT_KITAB = "./ja-kougo/ja-kougo-kitab.txt"; static String OUTPUT_YET = "./ja-kougo/ja-kougo.yet"; public static int OUTPUT_ADA_PERIKOP = 1; final SAXParserFactory factory = SAXParserFactory.newInstance(); Handler handler; List<Rec> xrec = new ArrayList<>(); PericopeData pericopeData = new PericopeData(); { pericopeData.entries = new ArrayList<>(); } public static void main(String[] args) throws Exception { new Proses1().u(); } public void u() throws Exception { handler = new Handler(); String[] files = new File(INPUT_TEKS_1).list(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.endsWith(".xml"); } }); Arrays.sort(files); for (String file : files) { //System.out.println(file); FileInputStream in = new FileInputStream(new File(INPUT_TEKS_1, file)); SAXParser parser = factory.newSAXParser(); parser.getXMLReader().setFeature("http://xml.org/sax/features/namespaces", true); parser.parse(in, handler); //System.out.println("file " + file + " done; now total rec: " + xrec.size()); } // POST-PROCESS for (Rec rec: xrec) { // tambah @@ kalo perlu if (rec.text.contains("@") && !rec.text.startsWith("@@")) { rec.text = "@@" + rec.text; } // betulin 〔セラ yang ga ada kurung tutupnya rec.text = rec.text.replaceAll("(\u3014(ヒガヨン、)?セラ)(($|[^\u3015]))", "$1\u3015$3"); System.out.println(rec.book_1 + "\t" + rec.chapter_1 + "\t" + rec.verse_1 + "\t" + rec.text); } //System.out.println("Total rec: " + xrec.size()); ////////// PROSES KE YET final YetFileOutput yet = new YetFileOutput(new File(OUTPUT_YET)); final Yes2Common.VersionInfo versionInfo = new Yes2Common.VersionInfo(); versionInfo.locale = "ja"; versionInfo.shortName = INFO_SHORT_NAME; versionInfo.longName = INFO_LONG_NAME; versionInfo.description = INFO_DESCRIPTION; versionInfo.setBookNamesFromFile(INPUT_KITAB); yet.setVersionInfo(versionInfo); yet.setPericopeData(pericopeData); yet.setTextDb(new TextDb(xrec)); yet.write(); } public class Handler extends DefaultHandler2 { int kitab_1 = 0; int pasal_1 = 0; int ayat_1 = 0; int lastAri = 0; boolean ayatSid = false; String[] tree = new String[80]; int depth = 0; StringBuilder b = new StringBuilder(); StringBuilder b_comment = new StringBuilder(); boolean simpan = false; boolean simpan_comment = false; int indenDiXml = 0; int indenDiAyat = 0; @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { tree[depth++] = localName; //System.out.print("(start:) "); //cetak(); String alamat = alamat(); if (alamat.equals("/book")) { String bookName = attributes.getValue("id"); kitab_1 = OsisBookNames.abbrToKitab0(bookName) + 1; } else if (alamat.endsWith("/chapter")) { String pasal_s = attributes.getValue("id"); pasal_1 = Integer.parseInt(pasal_s); ayat_1 = 1; // reset to 1 } else if (alamat.endsWith("/verse")) { String ayat_s = attributes.getValue("id"); if (ayat_s != null) { String[] splits = ayat_s.split(":", 2); if (splits.length < 2) throw new RuntimeException("ayat ngaco: " + ayat_s); try { ayat_1 = Integer.parseInt(splits[1]); } catch (NumberFormatException e) { b.append("(" + ayat_s + ") "); ayat_1 = parseIntSecukupnya(splits[1]); } ayatSid = false; simpan = true; } else { String ayat_s2 = attributes.getValue("sid"); if (ayat_s2 != null) { String[] splits = ayat_s2.split(":", 2); if (splits.length < 2) throw new RuntimeException("ayat ngaco: " + ayat_s); try { ayat_1 = Integer.parseInt(splits[1]); } catch (NumberFormatException e) { b.append("(" + ayat_s2 + ") "); ayat_1 = parseIntSecukupnya(splits[1]); } ayatSid = true; simpan = true; } else { String ayat_s3 = attributes.getValue("eid"); if (ayat_s3 != null && ayatSid) { newVerse(kitab_1, pasal_1, ayat_1, b.toString()); b.setLength(0); } simpan = false; } } } else if (alamat.endsWith("/l")) { indenDiXml++; if (indenDiXml == 1) { b.append("@1"); indenDiAyat = 1; } else if (indenDiXml > 1) { throw new RuntimeException("inden di xml: " + indenDiXml); } } else if (alamat.endsWith("/comment")) { simpan_comment = true; } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { //System.out.print("(end:) "); //cetak(); String alamat = alamat(); if (alamat.endsWith("/verse")) { if (!ayatSid) { newVerse(kitab_1, pasal_1, ayat_1, b.toString()); b.setLength(0); simpan = false; } } else if (alamat.endsWith("/l")) { indenDiXml--; } else if (alamat.endsWith("/p")) { // tambah @8 di ayat terakhir ATAU buffer b (belum jadi ayat) if (b.length() > 0) { b.append("@8"); } else { Rec lastRec = xrec.get(xrec.size() - 1); lastRec.text = lastRec.text + "@8"; } } else if (alamat.endsWith("/comment")) { simpan_comment = false; String comment = b_comment.toString(); b_comment.setLength(0); // masukin ke data perikop PericopeData.Entry entry = new PericopeData.Entry(); entry.ari = (kitab_1 - 1) << 16 | pasal_1 << 8 | ayat_1; entry.block = new PericopeData.Block(); entry.block.title = comment; pericopeData.entries.add(entry); System.out.println("Perikop: " + kitab_1 + " " + pasal_1 + " " + ayat_1 + " " + comment); } tree[--depth] = null; } private int parseIntSecukupnya(String s) { for (int i = 0; i < s.length(); i++) { if (!Character.isDigit(s.charAt(i))) { s = s.substring(0, i); break; } } return Integer.parseInt(s); } private void newVerse(int kitab_1, int pasal_1, int ayat_1, String isi) { int ari = (kitab_1 - 1) << 16 | pasal_1 << 8 | ayat_1; if (ari > lastAri) { if (pasal_1 == ((lastAri & 0x00ff00) >> 8)) { // isi dengan kekosongan for (int a_1 = ((lastAri+1) & 0xff); a_1 < ayat_1; a_1++) { Rec rec = new Rec(); rec.book_1 = kitab_1; rec.chapter_1 = pasal_1; rec.verse_1 = a_1; rec.text = ""; xrec.add(rec); } } Rec rec = new Rec(); rec.book_1 = kitab_1; rec.chapter_1 = pasal_1; rec.verse_1 = ayat_1; rec.text = isi; xrec.add(rec); // reset inden ke 0 lagi indenDiAyat = 0; lastAri = ari; // bukan di bawah. } else { // ari sama lagi, ato malah mundur. Maka append ke rec terakhir Rec lastRec = xrec.get(xrec.size()-1); lastRec.text += " "; lastRec.text += " (" + pasal_1 + ":" + ayat_1 + ") "; lastRec.text += isi; } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (simpan) { if (indenDiXml == 0 && indenDiAyat != 0) { b.append("@0"); indenDiAyat = 0; } b.append(ch, start, length); } else if (simpan_comment) { b_comment.append(ch, start, length); } } void cetak() { for (int i = 0; i < depth; i++) { System.out.print('/'); System.out.print(tree[i]); } System.out.println(); } private StringBuilder a = new StringBuilder(); private String alamat() { a.setLength(0); for (int i = 0; i < depth; i++) { a.append('/').append(tree[i]); } return a.toString(); } } }