package org.juxtasoftware.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.eclipse.mylyn.wikitext.core.parser.MarkupParser;
import org.eclipse.mylyn.wikitext.core.parser.builder.HtmlDocumentBuilder;
import org.eclipse.mylyn.wikitext.mediawiki.core.MediaWikiLanguage;
/**
 * Helpers for converting MediaWiki markup into plain text.
 *
 * The conversion is a three step pipeline: strip wiki markup that does not
 * translate well, render the remainder to HTML with the WikiText MediaWiki
 * parser, then reduce the HTML to text with {@link HtmlUtils}.
 */
public class WikiTextUtils {

    /** Encoding used for every intermediate file written or read by this class. */
    private static final String ENCODING = "UTF-8";

    /**
     * Tags removed wholesale before HTML conversion, as
     * { opening marker, tag name, closing marker }. Checked in this order.
     */
    private static final String[][] STRIPPED_TAGS = {
        { "{{", "cite", "}}" },
        { "[[", "File:", "]]" },
        { "[[", "Image:", "]]" }
    };

    /**
     * Converts a stream of UTF-8 MediaWiki markup into a plain text file.
     *
     * @param wikiStream the wiki markup, read as UTF-8. The stream is NOT closed
     *        by this method; the caller keeps ownership of it.
     * @return a newly created temporary file holding the UTF-8 plain text
     * @throws IOException if reading the stream or writing any intermediate file fails
     */
    public static final File toTxt( InputStream wikiStream ) throws IOException {
        File stripped = File.createTempFile("stripped", "dat");
        File html = null;
        try {
            // First strip markup that does not translate correctly into html or plain text
            writeStrippedWiki(wikiStream, stripped);

            // Next, turn this to html using textile-j (this one does the best job of those I tried out)
            html = File.createTempFile("html", "dat");
            convertToHtml(stripped, html);

            // Next, turn the html into plain text
            HtmlUtils.strip(html);
            File txtFile;
            InputStream htmlStream = new FileInputStream(html);
            try {
                txtFile = HtmlUtils.toTxt(htmlStream);
            } finally {
                // harmless if HtmlUtils already closed it
                IOUtils.closeQuietly(htmlStream);
            }

            // Last, strip junk
            // NOTE(review): txtFile is an intermediate produced by HtmlUtils.toTxt and is
            // left on disk here, as before - confirm whether it should be deleted.
            return stripStrayJunk(txtFile);
        } finally {
            // intermediate files are no longer needed, on success or failure
            stripped.delete();
            if ( html != null ) {
                html.delete();
            }
        }
    }

    /**
     * Reads wiki markup from the stream and writes a cleaned-up copy to the target file.
     */
    private static void writeStrippedWiki(InputStream wikiStream, File stripped) throws IOException {
        // The reader is deliberately not closed: that would close the caller's stream.
        BufferedReader reader = new BufferedReader( new InputStreamReader(wikiStream, ENCODING) );
        OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(stripped), ENCODING);
        try {
            stripUnsupportedMarkup(reader, out);
            // explicit close so a failed final flush is reported instead of swallowed
            out.close();
        } finally {
            IOUtils.closeQuietly(out);
        }
    }

    /**
     * Renders the wiki markup file to an HTML fragment using the MediaWiki parser.
     */
    private static void convertToHtml(File wiki, File html) throws IOException {
        FileWriterWithEncoding fw = new FileWriterWithEncoding(html, ENCODING);
        InputStream fis = null;
        try {
            HtmlDocumentBuilder builder = new HtmlDocumentBuilder(fw);
            builder.setEmitAsDocument(false);
            MarkupParser parser = new MarkupParser(new MediaWikiLanguage());
            parser.setBuilder(builder);
            fis = new FileInputStream( wiki );
            // the stripped file was written as UTF-8, so it must be read back as UTF-8
            parser.parse( new InputStreamReader(fis, ENCODING) );
            fw.close();
        } finally {
            IOUtils.closeQuietly(fis);
            IOUtils.closeQuietly(fw);
        }
    }

    /**
     * Copies wiki markup line by line, dropping markup that does not translate
     * correctly into html or plain text and makes the output hard to read:
     * <pre>
     *    &lt;ref&gt;&lt;/ref&gt;, &lt;ref/&gt;
     *    [[Image: ... ]]
     *    [[File: ... ]]
     *    {{cite ... }}
     *    {{Citation needed}}
     *    {{cquote}}, {{rquote}}, {{quote}} wrappers (the quoted text is kept)
     * </pre>
     * Copying stops at the first line containing a category link, an
     * <code>[[ar:</code> link or <code>{{DEFAULTSORT:</code>.
     */
    private static void stripUnsupportedMarkup(BufferedReader reader, OutputStreamWriter out) throws IOException {
        boolean strippingRef = false;      // inside a <ref> that spans lines
        boolean strippingTag = false;      // inside a stripped tag that spans lines
        String startMarker = "";
        String endMarker = "";
        int depth = 0;                     // nesting depth of the tag being stripped
        boolean extractingQuote = false;   // inside quote text that spans lines
        boolean discardQuoteData = false;  // skipping quote attribution up to "}}"

        String line;
        while ( (line = reader.readLine()) != null ) {
            line = line.trim();

            // Still inside a multi-line tag: skip content until it closes
            if ( strippingTag ) {
                TagScan scan = scanTag(line, 0, startMarker, endMarker, depth);
                depth = scan.depth;
                if ( scan.end < 0 ) {
                    continue;
                }
                line = line.substring(scan.end);
                strippingTag = false;
            }

            if ( line.length() == 0 ) {
                continue;
            }

            line = stripCitationNeeded(line);

            for ( int i = 0; i < STRIPPED_TAGS.length; i++ ) {
                String[] tag = STRIPPED_TAGS[i];
                StripResult sr = stripTag(tag[0], tag[1], tag[2], line);
                line = sr.strippedLine;
                if ( sr.depth != 0 ) {
                    // The tag continues on the following lines. Keep processing the
                    // text that preceded it rather than throwing the whole line away.
                    depth = sr.depth;
                    startMarker = tag[0];
                    endMarker = tag[2];
                    strippingTag = true;
                }
            }

            // cquotes
            if ( discardQuoteData ) {
                int close = line.indexOf("}}");
                if ( close > -1 ) {
                    discardQuoteData = false;
                    line = line.substring(close + 2);
                }
            } else if ( extractingQuote ) {
                int pipe = line.indexOf("|");
                int close = line.indexOf("}}");
                if ( pipe > -1 ) {
                    // a pipe ends the quote text; what follows is attribution
                    extractingQuote = false;
                    String before = line.substring(0, pipe);
                    String after = line.substring(pipe + 1);
                    int afterClose = after.indexOf("}}");
                    if ( afterClose > -1 ) {
                        // keep the remaining quote text as well as the text after the template
                        line = before + after.substring(afterClose + 2);
                    } else {
                        discardQuoteData = true;
                        line = before;
                    }
                } else if ( close > -1 ) {
                    extractingQuote = false;
                    line = line.substring(0, close) + line.substring(close + 2);
                }
            } else {
                String quoteType = hasQuote(line);
                if ( quoteType != null ) {
                    int pos = line.indexOf(quoteType);
                    int pipe = line.indexOf("|", pos);
                    String front = line.substring(0, pos);
                    // With no pipe on this line, the content starts right after the
                    // template name; never re-emit the text in front of the template.
                    String back = ( pipe > -1 )
                        ? line.substring(pipe + 1)
                        : line.substring(pos + quoteType.length());
                    if ( quoteType.equals("{{rquote") ) {
                        // rquote has an extra leading parameter before the quote text
                        int next = back.indexOf("|");
                        if ( next > -1 ) {
                            back = back.substring(next + 1);
                        }
                    }
                    if ( back.indexOf("}}") > -1 ) {
                        line = front + "\n\n" + cleanQuoteText(back) + "\n\n";
                    } else {
                        extractingQuote = true;
                        line = front + "\n\n" + back;
                    }
                }
            }

            if ( strippingRef ) {
                int refClose = line.indexOf("</ref>");
                if ( refClose > -1 ) {
                    line = line.substring(refClose + 6);
                    strippingRef = false;
                }
            }

            if ( strippingRef ) {
                continue;
            }

            // From [[Category: on, the file just sets up links at the page footer.
            // These don't translate correctly to text, so stop here.
            if ( line.contains("[[Category:") || line.contains("[[ar:") || line.contains("{{DEFAULTSORT:") ) {
                return;
            }

            while ( line.contains("<ref") ) {
                int start = line.indexOf("<ref");
                int end = line.indexOf("</ref>", start);
                int singleEnd = line.indexOf("/>", start);
                int tagEnd = -1;
                int endOffset = 0;
                if ( singleEnd > -1 ) {
                    tagEnd = singleEnd;
                    endOffset = 2;
                    if ( end > -1 && end < singleEnd ) {
                        tagEnd = end;
                        endOffset = 6;
                    }
                } else if ( end > -1 ) {
                    tagEnd = end;
                    endOffset = 6;
                }

                if ( tagEnd > -1 ) {
                    line = line.substring(0, start) + line.substring(tagEnd + endOffset);
                } else {
                    // the reference continues on the following lines
                    line = line.substring(0, start);
                    strippingRef = true;
                    break;
                }
            }

            if ( line.trim().length() > 0 ) {
                line = line.replace("<br/>", "\n");
                line = line.replace("<br />", "\n");
                line = line.replaceAll("\\{\\{.*\\}\\}","");
                out.write(line + "\n");
            }
        }
    }

    /**
     * Returns the quote text from the remainder of a quote template: everything
     * before the closing "}}", cut short at the first "|" if there is one.
     *
     * @param back template content; must contain "}}"
     */
    private static String cleanQuoteText(String back) {
        String clean = back.substring(0, back.indexOf("}}"));
        int pipe = clean.indexOf("|");
        return ( pipe > -1 ) ? clean.substring(0, pipe) : clean;
    }

    /**
     * Returns the opening marker of the first known quote template type found
     * in the line, or null if the line contains none of them.
     */
    private static String hasQuote(String line) {
        String[] quotes = {"{{cquote", "{{rquote", "{{quote"};
        for ( int i = 0; i < quotes.length; i++ ) {
            if ( line.contains(quotes[i]) ) {
                return quotes[i];
            }
        }
        return null;
    }

    /**
     * Copies the UTF-8 text file into a new temporary file, removing wiki markup
     * remnants (link brackets, stray closing ref tags, stray "}}") from every line.
     *
     * @param txtFile the text file to clean; it is read but not modified or deleted
     * @return the newly created, cleaned file
     * @throws IOException if reading or writing fails
     */
    private static File stripStrayJunk(File txtFile) throws IOException {
        File out = File.createTempFile("cleaned", "dat");
        OutputStreamWriter osw = null;
        BufferedReader r = null;
        try {
            osw = new OutputStreamWriter(new FileOutputStream(out), ENCODING);
            r = new BufferedReader( new InputStreamReader(new FileInputStream(txtFile), ENCODING) );
            String line;
            while ( (line = r.readLine()) != null ) {
                line = line.replaceAll("\\[.*\\|\\[", "");
                line = line.replace("[[", "");
                line = line.replace("]]", "");
                line = line.replace("</ref>", "");
                line = line.replace("}}", "");
                osw.write(line + "\n");
            }
            // explicit close so a failed final flush is reported instead of swallowed
            osw.close();
            return out;
        } finally {
            IOUtils.closeQuietly( r );
            IOUtils.closeQuietly( osw );
        }
    }

    /**
     * Removes every complete occurrence of a tag (including nested marker pairs)
     * from the line.
     *
     * @param tagStart opening marker, e.g. "[["
     * @param tag tag name that must directly follow the opening marker, e.g. "File:"
     * @param tagEnd closing marker, e.g. "]]"
     * @param line the line to process
     * @return the line without the tag and a depth of 0; or, when a tag is still
     *         open at the end of the line, the text before that tag and the number
     *         of closing markers still needed to finish it
     */
    private static StripResult stripTag(final String tagStart, final String tag, final String tagEnd, String line) {
        final String fullStart = tagStart+tag;
        while ( true ) {
            int start = line.indexOf(fullStart);
            if ( start < 0 ) {
                return new StripResult(line, 0);
            }
            // begin scanning right after the opening marker and tag name, whatever their length
            TagScan scan = scanTag(line, start + fullStart.length(), tagStart, tagEnd, 1);
            if ( scan.end < 0 ) {
                return new StripResult(line.substring(0, start), scan.depth);
            }
            line = line.substring(0, start) + line.substring(scan.end);
        }
    }

    /**
     * Scans the line from the given index, tracking marker nesting, until the
     * depth drops to zero or the line ends. Markers never overlap: characters
     * consumed by one marker are not reused for the next.
     */
    private static TagScan scanTag(String line, int from, String open, String close, int depth) {
        int segmentStart = from;
        for ( int i = from; i < line.length(); i++ ) {
            if ( endsAt(line, segmentStart, i, open) ) {
                depth++;
                segmentStart = i + 1;
            } else if ( endsAt(line, segmentStart, i, close) ) {
                depth--;
                if ( depth == 0 ) {
                    return new TagScan(i + 1, 0);
                }
                segmentStart = i + 1;
            }
        }
        return new TagScan(-1, depth);
    }

    /**
     * Tells whether the marker ends exactly at index last without reaching back
     * before segmentStart.
     */
    private static boolean endsAt(String line, int segmentStart, int last, String marker) {
        int begin = last + 1 - marker.length();
        return begin >= segmentStart && line.startsWith(marker, begin);
    }

    /** Outcome of a marker scan. */
    private static final class TagScan {
        /** Index just past the final closing marker, or -1 if the tag is still open. */
        final int end;
        /** Closing markers still outstanding; 0 when the tag was closed. */
        final int depth;

        TagScan(int end, int depth) {
            this.end = end;
            this.depth = depth;
        }
    }

    /** Result of stripping a tag from a single line. */
    static final class StripResult {
        /** The line content that remains after stripping. */
        public final String strippedLine;
        /** 0 if all tags were closed on the line; otherwise the open nesting depth. */
        public final int depth;

        public StripResult(String line, int depth) {
            this.strippedLine = line;
            this.depth = depth;
        }
    }

    /**
     * Removes every complete "{{Citation needed ... }}" template from the line.
     * A template with no closing "}}" on the line is left in place.
     */
    private static String stripCitationNeeded(String line) {
        int start = line.indexOf("{{Citation needed");
        while ( start > -1 ) {
            int end = line.indexOf("}}", start);
            if ( end == -1 ) {
                break;
            }
            line = line.substring(0, start) + line.substring(end+2);
            start = line.indexOf("{{Citation needed");
        }
        return line;
    }
}