/**
* Copyright (c) 2015 unfoldingWord
* http://creativecommons.org/licenses/MIT/
* See LICENSE file for details.
* Contributors:
* PJ Fechner <pj@actsmedia.com>
*/
package model.parsers;
import junit.framework.Assert;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts USFM (Unified Standard Format Markers) text into HTML fragments.
 *
 * <p>{@link #getChaptersFromUsfm(byte[])} splits a whole USFM document into chapters and
 * {@link #parseUsfmChapter(String)} turns one chapter into HTML styled by the rules returned
 * from {@link #getTextCss(int, String)}.
 *
 * <p>An instance keeps a running footnote counter while a chapter is being parsed, so a single
 * instance must not be used to parse chapters concurrently.
 *
 * <p>Created by PJ Fechner on 2/26/15.
 */
public class USFMParser {

    /** A verse marker: {@code \v}, one whitespace character, digits/dashes, one whitespace character. */
    private static final Pattern VERSE_REGEX = Pattern.compile("\\\\v\\s([0-9-])*\\s", Pattern.DOTALL);
    /** Optional leading whitespace followed by the (possibly empty) chapter number in group 1. */
    private static final Pattern NUMBER_REGEX = Pattern.compile("\\s*(\\d*)");
    /** A {@code \q} marker that carries a level digit. */
    private static final Pattern Q_NUMBER_REGEX = Pattern.compile("\\\\q\\d");
    /** A {@code \q} marker, an optional level digit, a space and the rest of the line. */
    private static final Pattern Q_REGEX = Pattern.compile("\\\\(q)\\d?\\ .*");
    /** A {@code \d} marker and the rest of its line. */
    private static final Pattern D_REGEX = Pattern.compile("\\\\d.*");
    /** A {@code \qs ... \qs*} span on a single line. */
    private static final Pattern QS_REGEX = Pattern.compile("\\\\(qs)\\d?\\ .*\\\\qs\\*");
    /** A {@code \sp} marker and the rest of its line. */
    private static final Pattern SP_REGEX = Pattern.compile("\\\\sp.*");
    /** Everything from the first {@code \add} to the last {@code \add*} of the text. */
    private static final Pattern ADD_REGEX = Pattern.compile("\\\\add.*\\\\add\\*", Pattern.DOTALL);
    /**
     * One footnote, from {@code \f} to the nearest closing marker. The body is matched reluctantly so
     * that two footnotes on the same line stay separate and the text between them stays in the verse.
     */
    private static final Pattern FOOTNOTE_REGEX = Pattern.compile("\\\\f\\s+.*?\\n*\\\\f[*]");
    /** The footnote span inside an already isolated footnote. */
    private static final Pattern FOOTNOTE_TEXT_REGEX = Pattern.compile("\\\\f.*\\\\f\\*", Pattern.DOTALL);
    /** Footnote markers (and the caller/origin characters glued to them) that are stripped from a note. */
    private static final Pattern FOOTNOTE_MARKER_REGEX = Pattern.compile("\\\\f(\\w|\\+|\\*|\\s\\W)*");
    /** A {@code \cl} line; group 1 is everything after the separator character. */
    private static final Pattern SINGLE_CHAPTER_BOOK_NAME_REGEX = Pattern.compile("\\\\cl.(.*)");
    /** The chapter marker that separates chapters in a document. */
    private static final Pattern CHAPTER_SPLIT_REGEX = Pattern.compile("\\\\c ");
    /** A {@code \pi} marker with an optional level digit. */
    private static final Pattern INDENTED_PARAGRAPH_REGEX = Pattern.compile("\\\\pi\\d*");
    /** Any remaining backslash marker together with the whitespace that follows it. */
    private static final Pattern MARKER_REGEX = Pattern.compile("\\\\\\S*\\s*");

    /** Text inserted after the line break that replaces a {@code \pi} marker. */
    private static final String TAB = " ";

    /** Number given to the next footnote; reset at the start of every {@link #parseUsfmChapter(String)} call. */
    private int footnoteNumber = 1;

    /**
     * Splits a USFM document into its chapters.
     *
     * <p>Text before the first {@code \c } marker is ignored. A chapter that contains no backslash
     * marker after its number is left out of the result.
     *
     * @param usfmBytes the UTF-8 encoded USFM document
     * @return a map from chapter number (digits only) to the raw USFM of that chapter, starting at its
     *         first backslash marker
     * @throws CharacterCodingException if {@code usfmBytes} is not valid UTF-8
     * @throws IllegalArgumentException if the text after a {@code \c } marker does not start with a number
     */
    public Map<String, String> getChaptersFromUsfm(byte[] usfmBytes) throws CharacterCodingException {
        String text = getStringFromBytes(usfmBytes);
        Map<String, String> chapters = new HashMap<String, String>();

        for (String chapter : getChapters(text)) {
            String chapterNumber = findChapterNumber(chapter);
            if (chapterNumber.isEmpty()) {
                String preview = chapter.substring(0, Math.min(chapter.length(), 20));
                throw new IllegalArgumentException(
                        "USFM chapter does not start with a chapter number: \"" + preview + "\"");
            }

            int chapterStartIndex = chapter.indexOf('\\');
            if (chapterStartIndex > -1) {
                chapters.put(chapterNumber, chapter.substring(chapterStartIndex));
            }
        }
        return chapters;
    }

    /**
     * Finds the book name declared by the first {@code \cl} marker.
     *
     * @param text the USFM text to search
     * @return the trimmed text that follows the first {@code \cl} marker on its line, or an empty string
     *         when there is no such marker
     */
    public static String getSingleChapterBookName(String text) {
        Matcher nameMatcher = SINGLE_CHAPTER_BOOK_NAME_REGEX.matcher(text);
        // The capture group already excludes the marker and its separator, whatever that separator is.
        return nameMatcher.find() ? nameMatcher.group(1).trim() : "";
    }

    /**
     * Decodes UTF-8 bytes into a string.
     *
     * @param bytes the UTF-8 encoded bytes
     * @return the decoded text
     * @throws CharacterCodingException if {@code bytes} is not valid UTF-8
     */
    static public String getStringFromBytes(byte[] bytes) throws CharacterCodingException {
        // toString() covers only the decoded characters; the buffer's backing array can be longer than
        // the decoded text and would add trailing NUL characters.
        return Charset.forName("UTF-8").newDecoder().decode(ByteBuffer.wrap(bytes)).toString();
    }

    /**
     * Converts the USFM of one chapter into HTML.
     *
     * <p>Footnotes are numbered from 1 within the chapter and their text is appended after the
     * chapter text.
     *
     * @param chapter the raw USFM of a single chapter
     * @return the chapter as HTML wrapped in {@code <div class="chapter-div"><p>...</p></div>}
     */
    public String parseUsfmChapter(String chapter) {
        footnoteNumber = 1;

        // The order matters: \qs must be handled before \q, and cleanUp strips every marker left over.
        chapter = handleDs(chapter);
        chapter = handleSPs(chapter);
        chapter = handleAdds(chapter);
        chapter = handleQSelahs(chapter);
        chapter = replaceQs(chapter);
        chapter = replaceVerseTags(chapter);
        chapter = findFootnotes(chapter);
        chapter = addLineBreaks(chapter);
        chapter = cleanUp(chapter);

        return "<div class=\"chapter-div\"><p>" + chapter + "</p></div>";
    }

    /** Returns the text following each {@code \c } marker; the text before the first marker is dropped. */
    private ArrayList<String> getChapters(String chapters) {
        String[] chapterArray = CHAPTER_SPLIT_REGEX.split(chapters);
        ArrayList<String> chapterList = new ArrayList<String>();
        // Starting at index 1 skips the preamble and is safe even when the split result is empty.
        for (int i = 1; i < chapterArray.length; i++) {
            chapterList.add(chapterArray[i]);
        }
        return chapterList;
    }

    /** Returns the digits at the start of a chapter (after optional whitespace), or an empty string. */
    private static String findChapterNumber(String chapter) {
        Matcher numberMatcher = NUMBER_REGEX.matcher(chapter);
        return numberMatcher.lookingAt() ? numberMatcher.group(1) : "";
    }

    /** Appends {@code replacement} for the current match, treating it as literal text. */
    private static void appendLiteral(Matcher matcher, StringBuffer out, String replacement) {
        matcher.appendReplacement(out, Matcher.quoteReplacement(replacement));
    }

    /** Wraps every verse number in {@code <span class="verse">}. */
    private String replaceVerseTags(String text) {
        Matcher verseMatcher = VERSE_REGEX.matcher(text);
        StringBuffer out = new StringBuffer();
        while (verseMatcher.find()) {
            String verseNumber = verseMatcher.group().replace("\\v ", "");
            appendLiteral(verseMatcher, out, "<span class=\"verse\"> " + verseNumber + "</span>");
        }
        verseMatcher.appendTail(out);
        return out.toString();
    }

    /** Replaces every {@code \qs ... \qs*} span with a right-aligned "Selah". */
    private String handleQSelahs(String text) {
        return QS_REGEX.matcher(text).replaceAll("<span class=\"selah\">Selah<br/></span><br/>");
    }

    /** Turns every {@code \sp} line into its own {@code <p class="sp">} paragraph. */
    private String handleSPs(String text) {
        Matcher spMatcher = SP_REGEX.matcher(text);
        StringBuffer out = new StringBuffer();
        while (spMatcher.find()) {
            String line = spMatcher.group();
            appendLiteral(spMatcher, out, line.replace("\\sp ", "<br/><p class=\"sp\">") + "</p><br/>");
        }
        spMatcher.appendTail(out);
        return out.toString();
    }

    /** Puts added text ({@code \add ... \add*}) in square brackets. */
    private String handleAdds(String text) {
        Matcher addMatcher = ADD_REGEX.matcher(text);
        StringBuffer out = new StringBuffer();
        while (addMatcher.find()) {
            String span = addMatcher.group();
            appendLiteral(addMatcher, out, span.replace("\\add ", "[").replace(" \\add*", "]"));
        }
        addMatcher.appendTail(out);
        return out.toString();
    }

    /** Turns every {@code \d} line into its own {@code <p class="d">} paragraph. */
    private String handleDs(String text) {
        Matcher dMatcher = D_REGEX.matcher(text);
        StringBuffer out = new StringBuffer();
        while (dMatcher.find()) {
            String line = dMatcher.group();
            appendLiteral(dMatcher, out, line.replace("\\d", "</p><p class=\"d\">") + "</p><p>");
        }
        dMatcher.appendTail(out);
        return out.toString();
    }

    /**
     * Wraps every poetry line ({@code \q}, {@code \q1}, ...) in a span whose class is the marker name.
     * Lines whose remaining content is four characters or shorter are left untouched.
     */
    private String replaceQs(String text) {
        Matcher qMatcher = Q_REGEX.matcher(text);
        StringBuffer out = new StringBuffer();
        while (qMatcher.find()) {
            String line = qMatcher.group();
            String level = findQLevel(line);
            String marker = "\\q" + level;

            // NOTE(review): "\\s" is removed as the literal two characters backslash-s, not as
            // whitespace; it is unclear whether a regex was intended, so the check is kept as it was.
            String content = line.replace(marker, "").replace("\\s", "").trim();
            if (content.length() > 4) {
                appendLiteral(qMatcher, out,
                        line.replace(marker, "<span class=\"q" + level + "\">") + "</span>");
            }
            // A match that is not replaced is copied unchanged by the next append.
        }
        qMatcher.appendTail(out);
        return out.toString();
    }

    /** Returns the level digit of the first numbered {@code \q} marker in the line, or an empty string. */
    private static String findQLevel(String line) {
        Matcher numberMatcher = Q_NUMBER_REGEX.matcher(line);
        return numberMatcher.find() ? numberMatcher.group().substring(2) : "";
    }

    /**
     * Replaces every footnote with a superscript number and appends the footnote texts, each in its
     * own {@code <p class="footnote">}, after the chapter text.
     */
    private String findFootnotes(String text) {
        Matcher footnoteMatcher = FOOTNOTE_REGEX.matcher(text);
        StringBuffer body = new StringBuffer();
        StringBuilder notes = new StringBuilder();

        while (footnoteMatcher.find()) {
            String footnote = footnoteMatcher.group();
            String footnoteNumberText = "<sup class=\"footnote-number\">" + footnoteNumber + "</sup>";

            appendLiteral(footnoteMatcher, body, footnoteNumberText);
            notes.append("<p class=\"footnote\">")
                    .append(footnoteNumberText)
                    .append(findFootnoteText(footnote))
                    .append("</p>");
            footnoteNumber++;
        }
        footnoteMatcher.appendTail(body);
        return body.append(notes.toString()).toString();
    }

    /** Returns the footnote with its markers stripped, or {@code text} unchanged if it holds no footnote. */
    private String findFootnoteText(String text) {
        Matcher textMatcher = FOOTNOTE_TEXT_REGEX.matcher(text);
        if (!textMatcher.find()) {
            return text;
        }
        return FOOTNOTE_MARKER_REGEX.matcher(textMatcher.group()).replaceAll("");
    }

    /** Replaces {@code \b}, {@code \pi} and {@code \p} markers with line breaks. */
    private String addLineBreaks(String text) {
        // A leading paragraph marker would only add an empty line at the top of the chapter.
        // regionMatches is used because it is safe for text shorter than the marker.
        if (text.regionMatches(true, 0, "\\p", 0, 2)) {
            text = text.substring(2);
        }
        text = text.replace("\\b", "<br/>");
        // \pi must be handled before \p, which is a prefix of it.
        text = INDENTED_PARAGRAPH_REGEX.matcher(text).replaceAll("<br/>" + TAB);
        return text.replace("\\p", "<br/>");
    }

    /** Removes every remaining backslash marker and turns newlines into spaces. */
    private String cleanUp(String text) {
        String withoutMarkers = MARKER_REGEX.matcher(text).replaceAll("");
        return withoutMarkers.replace("\n", " ");
    }

    /**
     * Builds the style sheet for the HTML produced by {@link #parseUsfmChapter(String)}.
     *
     * @param textSize the body font size in points; {@code \d} paragraphs use two points less
     * @param textDirection the value written into the CSS {@code direction} property of paragraphs
     * @return a complete {@code <style>} element
     */
    public static String getTextCss(int textSize, String textDirection) {
        return "<style type=\"text/css\">\n"
                + ".selah {text-align: right; font-style: italic; float: right; padding-right: 1em;}\n"
                + ".verse { font-size: 9pt}\n"
                + ".q, .q1, .q2 { margin:0; display: block; padding:0;}\n"
                + ".q, .q1 { padding-left: 1em; }\n"
                + ".q2 { padding-left: 2em; }\n"
                + ".q3 { padding-left: 3em; }\n"
                + ".d {font-style: italic; text-align: center; padding: 0px; line-height: 0.9; font-size: "
                + (textSize - 2) + "pt; width: 90%; padding: 0 5% 0 5%;}\n"
                + "p { width:96%; font-size: " + textSize
                + "pt; text-align: justify; line-height: 1.3; padding:5px; unicode-bidi:bidi-override; direction:"
                + textDirection + ";}\n"
                + ".footnote {font-size: 11pt;}\n"
                + "sup {font-size: 9pt;}\n"
                // Class selector: speaker lines are emitted as <p class="sp">.
                + ".sp {font-style: italic;}\n"
                + "</style>\n";
    }
}