/*
* EmphasisResolver.java
* Copyright (C) 2009 David Milne, d.n.milne@gmail.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.wikipedia.miner.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Replaces MediaWiki emphasis markup ('''bold''' and ''italic'') with the equivalent
 * html tags ({@code <b>} and {@code <i>}).
 * <p>
 * Emphasis never spans a line break, so the text is processed one line at a time. The
 * per-line algorithm follows the php function doAllQuotes/doQuotes used by MediaWiki.
 *
 * @author David Milne
 */
public class EmphasisResolver {

	/** Matches a run of two or more apostrophes, i.e. a candidate emphasis marker. */
	private static final Pattern APOSTROPHE_RUN = Pattern.compile("'{2,}") ;

	/**
	 * Resolves the emphasis markup in every line of the given text.
	 *
	 * @param text the MediaWiki markup to process; must not be null
	 * @return the text with all emphasis markup replaced by html tags. Line breaks,
	 *         including trailing ones, are preserved.
	 */
	public String resolveEmphasis(String text) {

		// A limit of -1 keeps trailing empty lines, so input that ends with (or consists
		// only of) newlines is reproduced faithfully instead of being truncated.
		String[] lines = text.split("\n", -1) ;

		StringBuilder sb = new StringBuilder() ;
		for (int i=0 ; i<lines.length ; i++) {
			if (i > 0)
				sb.append("\n") ;
			sb.append(resolveLine(lines[i])) ;
		}
		return sb.toString() ;
	}

	/**
	 * Resolves the emphasis markup within a single line.
	 *
	 * @param line the line to resolve emphasis within
	 * @return the line, with all emphasis markup resolved to html tags
	 */
	private String resolveLine(String line) {

		// The "$" sentinel guarantees that a (non-empty) text piece precedes the first
		// marker, so text always sits at even indexes and markers at odd indexes.
		String[] arr = getSplits("$" + line) ;
		if (arr.length <= 1)
			return line ;

		// First, do some preliminary work. This may shift some apostrophes from
		// being mark-up to being text. It also counts the number of occurrences
		// of bold and italics mark-ups.
		int numBold = 0 ;
		int numItalics = 0 ;
		for (int i=1 ; i<arr.length ; i+=2) {
			int runLength = arr[i].length() ;
			if (runLength == 4) {
				// Four apostrophes: the first is text, the remaining three mean bold.
				arr[i-1] = arr[i-1] + "'" ;
				arr[i] = getFilledString(3) ;
			} else if (runLength > 5) {
				// More than five apostrophes: all are text except for the last five.
				arr[i-1] = arr[i-1] + getFilledString(runLength-5) ;
				arr[i] = getFilledString(5) ;
			}

			switch (arr[i].length()) {
			case 2:
				numItalics++ ;
				break ;
			case 3:
				numBold++ ;
				break ;
			case 5:
				numItalics++ ;
				numBold++ ;
				break ;
			default:
				break ;
			}
		}

		// If there is an odd number of both bold and italics, it is likely
		// that one of the bold ones was meant to be an apostrophe followed
		// by italics.
		if ((numBold%2 == 1) && (numItalics%2 == 1))
			demoteOneBoldMarker(arr) ;

		return renderPieces(arr) ;
	}

	/**
	 * Turns one bold marker into a literal apostrophe followed by an italic marker.
	 * Which marker was really intended cannot be known for certain, so the first one
	 * preceded by a single-letter word is preferred, then the first one preceded by a
	 * longer word, then the first one preceded by a space.
	 *
	 * @param arr the alternating text/marker pieces of a line; modified in place
	 */
	private void demoteOneBoldMarker(String[] arr) {

		int firstSingleLetterWord = -1 ;
		int firstMultiLetterWord = -1 ;
		int firstSpace = -1 ;

		for (int i=1 ; i<arr.length ; i+=2) {
			if (arr[i].length() != 3)
				continue ;

			// Look at the last two characters of the text before the marker. A missing
			// character is treated as "not a space", which is how the php substr() calls
			// behave on short strings.
			String before = arr[i-1] ;
			char x1 = before.length() >= 1 ? before.charAt(before.length()-1) : '\0' ;
			char x2 = before.length() >= 2 ? before.charAt(before.length()-2) : '\0' ;

			if (x1 == ' ') {
				if (firstSpace == -1)
					firstSpace = i ;
			} else if (x2 == ' ') {
				if (firstSingleLetterWord == -1)
					firstSingleLetterWord = i ;
			} else {
				if (firstMultiLetterWord == -1)
					firstMultiLetterWord = i ;
			}
		}

		int chosen ;
		if (firstSingleLetterWord > -1)
			chosen = firstSingleLetterWord ;
		else if (firstMultiLetterWord > -1)
			chosen = firstMultiLetterWord ;
		else
			chosen = firstSpace ;

		// All three can be -1 if, for example, the only bold marker in the line is part
		// of a quintuple-apostrophe; in that case there is nothing to change.
		if (chosen > -1) {
			arr[chosen] = "''" ;
			// append the apostrophe to the preceding text rather than replacing that text
			arr[chosen-1] = arr[chosen-1] + "'" ;
		}
	}

	/**
	 * Converts the normalized pieces of a line to html by running the bold/italic state
	 * machine over them.
	 * <p>
	 * The state is one of "" (no emphasis open), "b", "i", "bi" (bold opened first, then
	 * italic), "ib" (italic opened first, then bold) or "both" (a quintuple-apostrophe was
	 * seen and text is buffered until the next marker shows which tag must be outermost).
	 *
	 * @param arr the alternating text/marker pieces of a line; the first piece starts with
	 *            the "$" sentinel
	 * @return the rendered line, with the sentinel removed
	 */
	private String renderPieces(String[] arr) {

		StringBuilder output = new StringBuilder() ;
		StringBuilder buffer = new StringBuilder() ;
		String state = "" ;

		for (int i=0 ; i<arr.length ; i++) {
			String r = arr[i] ;

			if (i%2 == 0) {
				// plain text
				if (state.equals("both"))
					buffer.append(r) ;
				else
					output.append(r) ;
				continue ;
			}

			if (r.length() == 2) {
				if (state.equals("i")) {
					output.append("</i>") ;
					state = "" ;
				} else if (state.equals("bi")) {
					output.append("</i>") ;
					state = "b" ;
				} else if (state.equals("ib")) {
					output.append("</b></i><b>") ;
					state = "b" ;
				} else if (state.equals("both")) {
					output.append("<b><i>") ;
					output.append(buffer) ;
					output.append("</i>") ;
					state = "b" ;
				} else {
					// state can be "b" or ""
					output.append("<i>") ;
					state = state + "i" ;
				}
			} else if (r.length() == 3) {
				if (state.equals("b")) {
					output.append("</b>") ;
					state = "" ;
				} else if (state.equals("bi")) {
					output.append("</i></b><i>") ;
					state = "i" ;
				} else if (state.equals("ib")) {
					output.append("</b>") ;
					state = "i" ;
				} else if (state.equals("both")) {
					output.append("<i><b>") ;
					output.append(buffer) ;
					output.append("</b>") ;
					state = "i" ;
				} else {
					// state can be "i" or ""
					output.append("<b>") ;
					state = state + "b" ;
				}
			} else if (r.length() == 5) {
				if (state.equals("b")) {
					output.append("</b><i>") ;
					state = "i" ;
				} else if (state.equals("i")) {
					output.append("</i><b>") ;
					state = "b" ;
				} else if (state.equals("bi")) {
					output.append("</i></b>") ;
					state = "" ;
				} else if (state.equals("ib")) {
					output.append("</b></i>") ;
					state = "" ;
				} else if (state.equals("both")) {
					output.append("<i><b>") ;
					output.append(buffer) ;
					output.append("</b></i>") ;
					state = "" ;
				} else {
					// state is ""
					buffer = new StringBuilder() ;
					state = "both" ;
				}
			}
		}

		// Now close all remaining tags. Notice that the order is important.
		if (state.equals("b") || state.equals("ib"))
			output.append("</b>") ;

		if (state.equals("i") || state.equals("bi") || state.equals("ib"))
			output.append("</i>") ;

		if (state.equals("bi"))
			output.append("</b>") ;

		// There might be a lonely ''''', so only emit tags if something was buffered.
		if (state.equals("both") && buffer.length() > 0) {
			output.append("<b><i>") ;
			output.append(buffer) ;
			output.append("</i></b>") ;
		}

		// remove the leading "$" sentinel
		output.deleteCharAt(0) ;
		return output.toString() ;
	}

	/**
	 * Splits text around runs of two or more apostrophes, keeping the runs themselves
	 * (the job done by php's preg_split with delimiter capture). Empty text pieces are
	 * omitted, so the caller must make sure the text does not start with a run if it
	 * relies on text and runs alternating.
	 *
	 * @param text the text to split
	 * @return the text pieces and apostrophe runs, in order of occurrence
	 */
	private String[] getSplits(String text) {

		ArrayList<String> splits = new ArrayList<String>() ;
		Matcher m = APOSTROPHE_RUN.matcher(text) ;

		int lastCopyIndex = 0 ;
		while (m.find()) {
			if (m.start() > lastCopyIndex)
				splits.add(text.substring(lastCopyIndex, m.start())) ;

			splits.add(m.group()) ;
			lastCopyIndex = m.end() ;
		}

		// Keep whatever follows the last run, even if it is only a single character.
		if (lastCopyIndex < text.length())
			splits.add(text.substring(lastCopyIndex)) ;

		return splits.toArray(new String[splits.size()]) ;
	}

	/**
	 * Builds a string consisting solely of apostrophes.
	 *
	 * @param length the number of apostrophes required
	 * @return a string of the given length, filled with apostrophes
	 */
	private String getFilledString(int length) {

		StringBuilder sb = new StringBuilder(Math.max(length, 0)) ;
		for (int i=0 ; i<length ; i++)
			sb.append('\'') ;
		return sb.toString() ;
	}

	/**
	 * Small demonstration: resolves the emphasis in a sample paragraph and prints the result.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		EmphasisResolver er = new EmphasisResolver() ;

		String markup = "'''War''' is an openly declared state of organized [[violent]] [[Group conflict|conflict]], typified by extreme [[aggression]], [[societal]] disruption, and high [[Mortality rate|mortality]]. As a behavior pattern, warlike tendencies are found in many [[primate]] species, including [[humans]], and also found in many [[ant]] species. The set of techniques used by a group to carry out war is known as '''warfare'''." ;
		//String markup = "Parsing '''MediaWiki''''s syntax for '''bold''' and ''italic'' markup is a '''''deceptively''' difficult'' task. Whoever came up with the markup scheme should be '''shot'''." ;

		System.out.println(er.resolveEmphasis(markup)) ;
	}
}