package yuku.alkitab.base.util; import android.util.Log; import yuku.alkitab.debug.BuildConfig; import yuku.alkitab.model.Book; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.WeakHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Jumper { public static final String TAG = Jumper.class.getSimpleName(); public interface Logger { void d(String msg); } // default logger private Logger logger = msg -> Log.d(TAG, msg); private String p_book; private int p_chapter; private int p_verse; /** The reference string is a verse range, with dash as delimiter */ private boolean p_hasRange = false; /** If bookId found from OSIS book names, set this to other than -1 and this will be returned */ private int p_bookIdFromOsis = -1; private boolean parseSucceeded = false; public static class BookRef { public String condensed; public int bookId; @Override public String toString() { return condensed + ":" + bookId; } } private static WeakHashMap<Book[], List<BookRef>> condensedCache = new WeakHashMap<>(); /** * Parse with default logger. */ public Jumper(String referenceToParse) { parseSucceeded = parse(referenceToParse); } /** * Override default logger and parse, e.g. for unit testing, to prevent calling android.util.Log methods. */ public void setLogger(final Logger logger) { this.logger = logger; } public Jumper(String referenceToParse, Logger logger) { this.logger = logger; parseSucceeded = parse(referenceToParse); } /** * Can't be parsed as a pure number. "4-5": true. "Hello": true. "123": false. "12b": false. * This is not the opposite of isNumber. */ private static boolean isWord(String s) { char c = s.charAt(0); if (c < '0' || c > '9') return true; try { Integer.parseInt(s); return false; } catch (NumberFormatException e) { return true; } } /** * @return true if s is a number, or s is a number followed by a single lowercase character 'a'-'z' inclusive. * That is for handling verse parts like 12a or 15b. */ private static boolean isNumber(String s) { try { Integer.parseInt(s); return true; } catch (NumberFormatException e) { // try special case if (s.length() > 1 && s.charAt(s.length() - 1) >= 'a' && s.charAt(s.length() - 1) <= 'z') { try { Integer.parseInt(s.substring(0, s.length() - 1)); return true; } catch (NumberFormatException e2) { return false; } } return false; } } /** * @return true if s is a number. */ private static boolean isPureNumber(String s) { try { Integer.parseInt(s); return true; } catch (NumberFormatException e) { return false; } } /** * @return integer value of s if s is a number, or s is a number followed by a single lowercase character 'a'-'z' inclusive. * That is for handling verse parts like 12a or 15b. * Returns 0 when it's unable to parse. */ private static int numberize(String s) { try { return Integer.parseInt(s); } catch (NumberFormatException e) { // try special case if (s.length() > 1 && s.charAt(s.length() - 1) >= 'a' && s.charAt(s.length() - 1) <= 'z') { try { return Integer.parseInt(s.substring(0, s.length() - 1)); } catch (NumberFormatException e2) { return 0; } } return 0; } } private boolean parse0(String reference) { reference = reference.trim(); if (reference.length() == 0) { return false; } if (BuildConfig.DEBUG) logger.d("jumper stage 0: " + reference); //# STAGE 4: replace en-dash and em-dash to normal dash if (reference.contains("\u2013") || reference.contains("\u2014")) { reference = reference.replaceAll("[\u2013\u2014]", "-"); if (BuildConfig.DEBUG) logger.d("jumper stage 4: " + reference); } //# STAGE 5: Remove spaces on the left and right of "-" if (reference.indexOf('-') >= 0) { reference = reference.replaceAll("\\s+-\\s+|\\s+-|-\\s+", "-"); if (BuildConfig.DEBUG) logger.d("jumper stage 5: " + reference); } //# STAGE 7: Check whether this is in strict osis ID format. // This can be BookName.Chapter.Verse or BookName.Chapter // Or, either of the above separated by a '-' notosis: { if (reference.indexOf('.') < 0) { break notosis; // must contain a dot } String osisId; if (reference.indexOf('-') >= 0) { // optionally a '-' final String[] osisIds = reference.split("-"); if (osisIds.length != 2) { break notosis; // wrong format } osisId = osisIds[0]; p_hasRange = true; } else { osisId = reference; } Pattern p = OsisBookNames.getBookNameWithChapterAndOptionalVersePattern(); Matcher m = p.matcher(osisId); if (m.matches()) { if (BuildConfig.DEBUG) logger.d("jumper stage 7: ref matching osis pattern found: " + osisId); String osisBookName = m.group(1); String chapter_s = m.group(2); String verse_s = m.group(3); try { p_bookIdFromOsis = OsisBookNames.osisBookNameToBookId(osisBookName); p_chapter = Integer.parseInt(chapter_s); p_verse = (verse_s == null || verse_s.length() == 0)? 0: Integer.parseInt(verse_s); } catch (Exception e) { throw new RuntimeException("Should not happen. In jumper stage 7", e); } if (BuildConfig.DEBUG) logger.d("jumper stage 7: successfully parsed osis id: " + p_bookIdFromOsis + ' ' + p_chapter + ' ' + p_verse); return true; } } //# STAGE 10: Split based on SPACE, :, PERIOD, and whitespaces between -'s and numbers. //# Sample of wrong output: [Kisah, rasul34, 6-7, 8] //# Sample of right output: [Kisah, rasul34, 6, -, 7, 8] String[] parts = reference.split("((\\s|:|\\.)+|(?=[0-9])(?<=-)|(?=-)(?<=[0-9][a-z]?))"); if (BuildConfig.DEBUG) logger.d("jumper stage 10: " + Arrays.toString(parts)); //# STAGE 12: Remove string from empty parts { int hasEmpty = 0; for (String b: parts) { if (b.length() == 0) { hasEmpty++; break; } } if (hasEmpty > 0) { String[] partsWithoutEmpties = new String[parts.length - hasEmpty]; int c = 0; for (String b: parts) { if (b.length() != 0) { partsWithoutEmpties[c++] = b; } } parts = partsWithoutEmpties; } } if (BuildConfig.DEBUG) logger.d("jumper stage 12: " + Arrays.toString(parts)); if (parts.length == 0) { return false; } //# STAGE 20: Expand cases like Joh3 to Joh 3 //# Sample output: [Kisah, rasul, 34, 6, -, 7, 8] { ArrayList<String> bel = new ArrayList<>(); for (String b: parts) { if (isWord(b)) { String number = ""; for (int i = b.length() - 1; i >= 0; i--) { char c = b.charAt(i); if (c >= '0' && c <= '9') { // found a digit number = c + number; } else { break; } } if (number.length() > 0) { // a number found behind a word bel.add(b.substring(0, b.length() - number.length())); bel.add(number); } else { bel.add(b); } } else { bel.add(b); } } parts = bel.toArray(parts); } if (BuildConfig.DEBUG) logger.d("jumper stage 20: " + Arrays.toString(parts)); //# STAGE 25: Look for part that is "-", then remove from it to the end. { boolean hasDash = false; int at = -1; for (int i = 0; i < parts.length; i++) { if ("-".equals(parts[i]) || "--".equals(parts[i])) { hasDash = true; at = i; break; } } if (hasDash) { String[] bel = new String[at]; System.arraycopy(parts, 0, bel, 0, at); parts = bel; p_hasRange = true; if (BuildConfig.DEBUG) logger.d("jumper stage 25: " + Arrays.toString(parts)); } } //# STAGE 30: Morph something like "3" "john" to "3 john" { ArrayList<String> bel = new ArrayList<>(); int startWord = 0; // see from the right which one is not a number. That is the start of book. for (int i = parts.length - 1; i >= 0; i--) { final String part = parts[i]; if (!isNumber(part)) { // this and all earlier than this is the book. startWord = i; break; } if (i == 0) { // special case, probably the first part is something like "1j" or "1y" for 1 John. if (isWord(part)) { startWord = i; break; } if (BuildConfig.DEBUG) { logger.d("jumper stage 30: too much, how come there are more than 2 numbers: returning false"); } return false; } } String s = null; for (int j = 0; j <= startWord; j++) { s = (s == null)? parts[j]: s + " " + parts[j]; } bel.add(s); bel.addAll(Arrays.asList(parts).subList(startWord + 1, parts.length)); parts = bel.toArray(new String[bel.size()]); } if (BuildConfig.DEBUG) logger.d("jumper stage 30: " + Arrays.toString(parts)); if (parts.length == 1) { // 1 part only // It means it can be CHAPTER or BOOK only if (isWord(parts[0])) { // it's a BOOK p_book = parts[0]; return true; } else { // it's a CHAPTER p_chapter = numberize(parts[0]); return true; } } if (parts.length == 2) { // 2 parts // means it could be CHAPTER VERSE (in the same book) if (isPureNumber(parts[0]) && isNumber(parts[1])) { p_chapter = numberize(parts[0]); p_verse = numberize(parts[1]); return true; } // or BOOK CHAPTER if (isPureNumber(parts[1])) { p_book = parts[0]; p_chapter = numberize(parts[1]); return true; } return false; } if (parts.length == 3) { // 3 parts // it means it must be BOOK CHAPTER VERSE. Could not be otherwise. p_book = parts[0]; p_chapter = numberize(parts[1]); p_verse = numberize(parts[2]); return true; } return false; } private boolean parse(String alamat) { boolean res = parse0(alamat); if (BuildConfig.DEBUG) { logger.d("jumper after parse0: p_book=" + p_book + " p_chapter=" + p_chapter + " p_verse=" + p_verse); } return res; } public static List<BookRef> createBookCandidates(Book[] books) { String[] bookNames = new String[books.length]; int[] bookIds = new int[books.length]; for (int i = 0, booksLength = books.length; i < booksLength; i++) { final Book book = books[i]; bookNames[i] = book.shortName; bookIds[i] = book.bookId; } return createBookCandidates(bookNames, bookIds); } static List<BookRef> createBookCandidates(String[] bookNames, int[] bookIds) { // create cache of condensed book titles where all spaces are stripped and lowercased and "1" becomes "I", "2" becomes "II" etc. final List<BookRef> res = new ArrayList<>(); for (int i = 0, len = bookNames.length; i < len; i++) { String condensed = bookNames[i].replaceAll("(\\s|-|_)+", "").toLowerCase(Locale.getDefault()); { BookRef ref = new BookRef(); ref.condensed = condensed; ref.bookId = bookIds[i]; res.add(ref); } if (condensed.contains("1") || condensed.contains("2") || condensed.contains("3")) { condensed = condensed.replace("1", "i").replace("2", "ii").replace("3", "iii"); BookRef ref = new BookRef(); ref.condensed = condensed; ref.bookId = bookIds[i]; res.add(ref); } } return res; } private int guessBook(List<BookRef> refs) { if (p_book == null) { return -1; } int res = -1; // 0. clean up p_book p_book = p_book.replaceAll("(\\s|-|_)", "").toLowerCase(Locale.getDefault()); if (BuildConfig.DEBUG) logger.d("guessBook phase 0: p_book = " + p_book); // 1. try to match wholly (e.g.: "genesis", "john") for (BookRef ref: refs) { if (ref.condensed.equals(p_book)) { if (BuildConfig.DEBUG) logger.d("guessBook phase 1 success: " + p_book); return ref.bookId; } } // 2. try to match by prefix. If there is only one match, success int pos_forLater = -1; { int passed = 0; for (BookRef ref: refs) { if (ref.condensed.startsWith(p_book)) { passed++; if (passed == 1) pos_forLater = ref.bookId; } } if (passed == 1) { if (BuildConfig.DEBUG) logger.d("guessBook phase 2 success: " + pos_forLater + " for " + p_book); return pos_forLater; } else { if (BuildConfig.DEBUG) logger.d("guessBook phase 2: passed=" + passed); } } // 3. String matching only when p_book is 2 letters or more if (p_book.length() >= 2) { int minScore = 99999999; int pos = -1; for (BookRef ref: refs) { int score = Levenshtein.distance(p_book, ref.condensed); if (p_book.charAt(0) != ref.condensed.charAt(0)) { score += 150; // approximately 1.5 times insertion cost } if (BuildConfig.DEBUG) { logger.d("guessBook phase 3: with " + ref + ", score " + score); } if (score < minScore) { minScore = score; pos = ref.bookId; } } if (pos != -1) { if (BuildConfig.DEBUG) logger.d("guessBook phase 3 success: " + pos + " with score " + minScore); return pos; } } // 7. Return the earlier match if there is more than one that passed phase 2. if (pos_forLater != -1) { if (BuildConfig.DEBUG) logger.d("guessBook phase 7 success: " + pos_forLater + " for " + p_book); return pos_forLater; } return res; } /** * @return whether the parsing succeeded */ public boolean getParseSucceeded() { return parseSucceeded; } public String getUnparsedBook() { return p_book; } /** * @param books list of books from which the looked for book is searched * @return bookId of one of the books (or -1). */ public int getBookId(Book[] books) { if (p_bookIdFromOsis != -1) return p_bookIdFromOsis; List<BookRef> refs = condensedCache.get(books); if (refs == null) { String[] bookNames = new String[books.length]; int[] bookIds = new int[books.length]; for (int i = 0; i < books.length; i++) { bookNames[i] = books[i].shortName; bookIds[i] = books[i].bookId; } refs = createBookCandidates(bookNames, bookIds); condensedCache.put(books, refs); if (BuildConfig.DEBUG) logger.d("New condensedCache entry: " + refs); } return guessBook(refs); } /** * Give list of (bookName, bookId) from which the looked for book is searched. * @return bookId of one of the books (or -1). */ public int getBookId(String[] bookNames, int[] bookIds) { if (p_bookIdFromOsis != -1) return p_bookIdFromOsis; return guessBook(createBookCandidates(bookNames, bookIds)); } public int getChapter() { return p_chapter; } public int getVerse() { return p_verse; } /** The reference string is a verse range, with dash as delimiter */ public boolean getHasRange() { return p_hasRange; } }