// WikiTextUtils.java example
//
// Explorer
// juxta-service-master
package org.juxtasoftware.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.eclipse.mylyn.wikitext.core.parser.MarkupParser;
import org.eclipse.mylyn.wikitext.core.parser.builder.HtmlDocumentBuilder;
import org.eclipse.mylyn.wikitext.mediawiki.core.MediaWikiLanguage;

/**
 * Helpers for turning MediaWiki markup into plain text.
 * <p>
 * The conversion in {@link #toTxt(InputStream)} runs in four passes:
 * <ol>
 *   <li>strip wiki markup that does not survive the html/text conversion,</li>
 *   <li>render the remaining markup to html with the WikiText MediaWiki parser,</li>
 *   <li>reduce the html to plain text with {@code HtmlUtils},</li>
 *   <li>remove stray markup fragments left over from the earlier passes.</li>
 * </ol>
 */
public class WikiTextUtils {

    /** Encoding used for every intermediate file and for the incoming wiki stream. */
    private static final String ENCODING = "UTF-8";

    /**
     * Tags removed wholesale by the first pass, as { opening marker, tag name, closing marker }.
     * The order matches the order in which they are checked on each line.
     */
    private static final String[][] STRIPPED_TAGS = {
        { "{{", "cite", "}}" },
        { "[[", "File:", "]]" },
        { "[[", "Image:", "]]" }
    };

    /**
     * Converts MediaWiki markup into plain text.
     *
     * @param wikiStream UTF-8 encoded wiki markup. The stream is read to the end (or up to the
     *                   page footer) but is not closed by this method.
     * @return a newly created temporary file holding the plain text. This method does not
     *         delete it.
     * @throws IOException if reading the input or writing any intermediate file fails
     */
    public static final File toTxt( InputStream wikiStream ) throws IOException {
        File stripped = File.createTempFile("stripped", "dat");
        File html = null;
        try {
            // first strip markup that does not translate correctly into html or plain text
            stripUnsupportedMarkup(wikiStream, stripped);

            // Next, turn this to html using textile-j (this one does the best job of those I tried out)
            html = File.createTempFile("html", "dat");
            convertToHtml(stripped, html);

            // Next, turn the html into plain text
            HtmlUtils.strip(html);
            File txtFile;
            InputStream htmlStream = new FileInputStream(html);
            try {
                txtFile = HtmlUtils.toTxt( htmlStream );
            } finally {
                IOUtils.closeQuietly(htmlStream);
            }

            // Last, strip junk
            // NOTE(review): txtFile is an intermediate file that is never deleted here; confirm
            // whether HtmlUtils expects the caller to clean it up.
            return stripStrayJunk(txtFile);
        } finally {
            // both files were created by this method and are only intermediates, so they are
            // removed on success and on failure alike
            stripped.delete();
            if ( html != null ) {
                html.delete();
            }
        }
    }

    /**
     * First pass: copies the wiki markup line by line into {@code stripped}, dropping markup
     * that makes the final text difficult to read/understand. Bad markup:
     * <pre>
     *    &lt;ref&gt;&lt;/ref&gt;, &lt;ref/&gt;
     *    [[Image: ... ]]
     *    [[File: ... ]]
     *    {{cite ... }}
     *    {{Citation needed}}
     * </pre>
     * Quote templates ({{cquote, {{rquote, {{quote) are reduced to their quoted text. Copying
     * stops at the first line containing {@code [[Category:}, {@code [[ar:} or
     * {@code {{DEFAULTSORT:}.
     *
     * @param wikiStream UTF-8 encoded wiki markup; left open for the caller
     * @param stripped   file that receives the cleaned markup, written as UTF-8
     * @throws IOException if reading or writing fails
     */
    private static void stripUnsupportedMarkup(InputStream wikiStream, File stripped) throws IOException {
        // the caller owns wikiStream, so the reader wrapped around it is deliberately not closed
        BufferedReader r = new BufferedReader( new InputStreamReader(wikiStream, ENCODING) );
        FileOutputStream fos = new FileOutputStream(stripped);
        try {
            OutputStreamWriter osw = new OutputStreamWriter(fos, ENCODING);

            boolean strippingRef = false;       // inside a <ref> that has not been closed yet
            boolean strippingTag = false;       // inside a multi-line tag from STRIPPED_TAGS
            String endMarker = "";
            String startMarker = "";
            int depth = 0;                      // nesting depth of the multi-line tag
            boolean extractingQuote = false;    // inside the text part of a multi-line quote
            boolean discardQuoteData = false;   // inside the trailing parameters of a quote

            String line;
            while ( (line = r.readLine()) != null ) {
                line = line.trim();

                if ( strippingTag ) {
                    // Look for the end of a tag opened on an earlier line. buf only holds the
                    // characters seen since the last marker, so each marker is counted once.
                    StringBuilder buf = new StringBuilder();
                    for (int i=0; i<line.length(); i++) {
                        buf.append( line.charAt(i) );
                        if ( buf.indexOf(startMarker) > -1 ) {
                            depth++;
                            buf = new StringBuilder();
                        } else if ( buf.indexOf(endMarker) > -1 ) {
                            depth--;
                            if ( depth == 0 ) {
                                // tag closed; keep whatever follows it on this line
                                line = line.substring(i+1);
                                strippingTag = false;
                                break;
                            }
                            buf = new StringBuilder();
                        }
                    }
                    if ( strippingTag ) {
                        continue;
                    }
                }

                if ( line.length() == 0 ) {
                    continue;
                }

                line = stripCitationNeeded(line);

                for ( int t=0; t<STRIPPED_TAGS.length; t++ ) {
                    String[] tag = STRIPPED_TAGS[t];
                    StripResult sr = stripTag(tag[0], tag[1], tag[2], line);
                    // Always keep the text that survived. When the tag runs past the end of
                    // this line that is the text in front of it, which must still be processed
                    // and written rather than thrown away with the tag.
                    line = sr.strippedLine;
                    if ( sr.depth != 0 ) {
                        depth = sr.depth;
                        startMarker = tag[0];
                        endMarker = tag[2];
                        strippingTag = true;
                    }
                }

                // cquotes
                if ( discardQuoteData ) {
                    // NOTE(review): a line without "}}" is passed through unchanged here even
                    // though it is still part of the quote parameters; confirm this is intended.
                    int pos = line.indexOf("}}");
                    if ( pos > -1 ) {
                        discardQuoteData = false;
                        line = line.substring(pos+2);
                    }
                } else if ( extractingQuote ) {
                    int p = line.indexOf("|");
                    if ( p > -1 ) {
                        extractingQuote = false;
                        String before = line.substring(0, p);
                        String after = line.substring(p+1);
                        int close = after.indexOf("}}");
                        if ( close > -1 ) {
                            line = after.substring(close+2);
                        } else {
                            discardQuoteData = true;
                            line = before;
                        }
                    } else {
                        int pos = line.indexOf("}}");
                        if ( pos > -1 ) {
                            extractingQuote = false;
                            line = line.substring(0,pos)+line.substring(pos+2);
                        }
                    }
                } else {
                    String quoteType = hasQuote(line);
                    if ( quoteType != null ) {
                        int pos = line.indexOf(quoteType);
                        // NOTE(review): when no "|" follows the template name p2 is -1 and
                        // back becomes the whole line; left as-is pending confirmation.
                        int p2 = line.indexOf("|", pos);
                        String front = line.substring(0, pos);
                        String back = line.substring(p2+1);
                        if ( quoteType.equals("{{rquote") ) {
                            // rquote carries one extra parameter in front of the quoted text
                            p2 = back.indexOf("|");
                            if ( p2 > -1 ) {
                                back = back.substring(p2+1);
                            }
                        }

                        if ( back.indexOf("}}") > -1 ) {
                            line = front+"\n\n"+ cleanQuoteText(back)+"\n\n";
                        } else {
                            extractingQuote = true;
                            line = front+"\n\n"+back;
                        }
                    }
                }

                if ( strippingRef ) {
                    int end = line.indexOf("</ref>");
                    if ( end > -1 ) {
                        line = line.substring(end+6);
                        strippingRef = false;
                    }
                }
                if ( strippingRef ) {
                    // still inside a multi-line <ref>; nothing on this line is kept
                    continue;
                }

                // From [[Category:: on the file just sets up
                // links at the page footer. Doesn't translate correctly to text
                // so stop here
                if ( line.contains("[[Category:") || line.contains("[[ar:") || line.contains("{{DEFAULTSORT:") ) {
                    break;
                }

                while ( line.contains("<ref") ) {
                    int start = line.indexOf("<ref");
                    int end = line.indexOf("</ref>", start);
                    int singleEnd = line.indexOf("/>", start);
                    int tagEnd = -1;
                    int endOffset = 0;
                    if ( singleEnd > -1 ) {
                        // self-closing form, unless a </ref> shows up before the "/>"
                        tagEnd = singleEnd;
                        endOffset = 2;
                        if ( end > -1 && end < singleEnd ) {
                            tagEnd = end;
                            endOffset = 6;
                        }
                    } else if ( end > -1 ) {
                        tagEnd = end;
                        endOffset = 6;
                    }

                    if ( tagEnd > -1 ) {
                        line = line.substring(0,start) + line.substring(tagEnd+endOffset);
                    } else {
                        // reference continues on the following lines
                        line = line.substring(0,start);
                        strippingRef = true;
                        break;
                    }
                }

                if ( line.trim().length() > 0 ) {
                    line = line.replace("<br/>", "\n");
                    line = line.replace("<br />", "\n");
                    line = line.replaceAll("\\{\\{.*\\}\\}","");
                    osw.write(line);
                    osw.write("\n");
                }
            }

            // close explicitly so that a failing final flush is reported instead of swallowed
            osw.close();
        } finally {
            IOUtils.closeQuietly(fos);
        }
    }

    /**
     * Second pass: renders the cleaned wiki markup as an html fragment.
     *
     * @param wikiFile UTF-8 encoded wiki markup
     * @param html     file that receives the html, written as UTF-8
     * @throws IOException if reading or writing fails
     */
    private static void convertToHtml(File wikiFile, File html) throws IOException {
        FileWriterWithEncoding fw = new FileWriterWithEncoding(html, ENCODING);
        InputStream fis = null;
        try {
            HtmlDocumentBuilder builder = new HtmlDocumentBuilder(fw);
            builder.setEmitAsDocument(false);
            MarkupParser parser = new MarkupParser(new MediaWikiLanguage());
            parser.setBuilder(builder);
            fis = new FileInputStream( wikiFile );
            // the file was written as UTF-8, so it must be decoded as UTF-8 as well instead of
            // with the platform default charset
            parser.parse( new InputStreamReader(fis, ENCODING) );
            fw.close();
        } finally {
            IOUtils.closeQuietly(fis);
            IOUtils.closeQuietly(fw);
        }
    }

    /**
     * Returns the quoted text at the start of {@code back}: everything before the first "}}",
     * cut short at the first "|" if there is one.
     *
     * @param back text following the quote template's opening; must contain "}}"
     * @return the quote text without trailing template parameters
     */
    private static String cleanQuoteText(String back) {
        String clean = back.substring(0, back.indexOf("}}"));
        int pipe = clean.indexOf("|");
        if ( pipe > -1 ) {
            return clean.substring(0, pipe);
        }
        return clean;
    }

    /**
     * Finds out which quote template, if any, a line contains.
     *
     * @param line line of wiki markup
     * @return "{{cquote", "{{rquote" or "{{quote" (checked in that order), or null if the line
     *         contains none of them
     */
    private static String hasQuote(String line) {
        String[] quotes = {"{{cquote", "{{rquote", "{{quote"};
        for ( int i=0; i<quotes.length; i++ ) {
            if ( line.contains(quotes[i]) ) {
                return quotes[i];
            }
        }
        return null;
    }

    /**
     * Last pass: copies {@code txtFile} into a new temporary file, removing leftover markup
     * fragments ("[...|[", "[[", "]]", "&lt;/ref&gt;" and "}}") from every line.
     *
     * @param txtFile UTF-8 encoded plain text
     * @return a newly created temporary file holding the cleaned text
     * @throws IOException if reading or writing fails
     */
    private static File stripStrayJunk(File txtFile) throws IOException {
        FileInputStream fis = new FileInputStream(txtFile);
        try {
            File out = File.createTempFile("cleaned", "dat");
            FileOutputStream fos = new FileOutputStream(out);
            try {
                BufferedReader r = new BufferedReader( new InputStreamReader(fis, ENCODING) );
                OutputStreamWriter osw = new OutputStreamWriter(fos, ENCODING);
                String line;
                while ( (line = r.readLine()) != null ) {
                    line = line.replaceAll("\\[.*\\|\\[", "");
                    line = line.replace("[[", "");
                    line = line.replace("]]", "");
                    line = line.replace("</ref>", "");
                    line = line.replace("}}", "");
                    osw.write(line);
                    osw.write("\n");
                }
                // close explicitly so that a failing final flush is reported instead of swallowed
                osw.close();
                return out;
            } finally {
                IOUtils.closeQuietly(fos);
            }
        } finally {
            IOUtils.closeQuietly(fis);
        }
    }

    /**
     * Removes every occurrence of a tag, including nested markers, from a line.
     *
     * @param tagStart opening marker, e.g. "{{"
     * @param tag      tag name that directly follows the opening marker, e.g. "cite"
     * @param tagEnd   closing marker, e.g. "}}"
     * @param line     line to clean
     * @return the cleaned line with depth 0 when every tag was closed on this line; otherwise
     *         the text in front of the unclosed tag together with the nesting depth that is
     *         still open at the end of the line
     */
    private static StripResult stripTag(final String tagStart, final String tag, final String tagEnd, String line) {
        final String fullStart = tagStart+tag;
        while ( line.contains(fullStart) ) {
            int start = line.indexOf(fullStart);
            int depth = 1;
            StringBuilder buf = new StringBuilder();
            // Start right behind the opening marker and tag name. The offset has to follow the
            // actual length: a fixed offset skips real content for short tags like "{{cite".
            for (int i=start+fullStart.length(); i<line.length(); i++) {
                buf.append(line.charAt(i));
                if ( buf.indexOf(tagStart) > -1 ) {
                    depth++;
                    buf = new StringBuilder();
                } else if ( buf.indexOf(tagEnd) > -1 ) {
                    depth--;
                    if ( depth == 0 ) {
                        line = line.substring(0, start) + line.substring(i+1);
                        break;
                    }
                    buf = new StringBuilder();
                }
            }
            if ( depth > 0 ) {
                return new StripResult(line.substring(0, start), depth);
            }
        }
        return new StripResult(line,0);
    }

    /** Outcome of {@link #stripTag}: the remaining text and the nesting depth left open. */
    static final class StripResult {
        public final String strippedLine;
        public final int depth;
        public StripResult(String line, int depth) {
            this.strippedLine = line;
            this.depth = depth;
        }
    }

    /**
     * Removes every complete "{{Citation needed ... }}" template from a line. A template that
     * is not closed on the same line is left untouched.
     *
     * @param line line of wiki markup
     * @return the line without complete citation-needed templates
     */
    private static String stripCitationNeeded(String line) {
        while ( line.contains("{{Citation needed") ) {
            int start = line.indexOf("{{Citation needed");
            int end = line.indexOf("}}", start);
            if ( end == -1 ) {
                break;
            }
            line = line.substring(0, start) + line.substring(end+2);
        }
        return line;
    }

}