package com.darkrockstudios.apps.tminus.dataupdate.wikipedia;
import android.util.Log;
import org.json.JSONException;
import org.json.JSONObject;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by Adam on 2/9/14.
*/
// This Wiki API call will get the summary section for a given article:
// /w/api.php?action=parse&format=json&page=Falcon_9&prop=text&section=0
// The parse tree option might be what we want: it would let us traverse the content as XML rather than stripping the markup with regexes
/**
 * Static helpers for working with Wikipedia articles: turns the wikitext found in a
 * parse-API JSON response into mostly plain text with a little HTML formatting, and
 * pulls the article title out of a Wikipedia article URL.
 */
public class WikiArticleHandler
{
	private static final String TAG = WikiArticleHandler.class.getSimpleName();

	// Matches http(s)://xx.wikipedia.org/wiki/<title>, optionally followed by a slash and/or a query string.
	// Group 1 is the title. The dots in the host name are escaped so they only match a literal '.'.
	private static final Pattern WIKI_ARTICLE_PATTERN =
			Pattern.compile( "^http[s]?://[a-z]{2}\\.wikipedia\\.org/wiki/([a-zA-Z0-9\\-_\\(\\)]+)/?(?:\\?.*)?$" );

	// HTML comments; DOTALL so that comments spanning several lines are stripped as well
	private static final Pattern COMMENT_PATTERN = Pattern.compile( "\\<\\!--(.*?)--\\>", Pattern.DOTALL );

	// Piped links such as [[target|label]]; group 2 is the label
	private static final Pattern GENERAL_LINK_PATTERN = Pattern.compile( "\\[?\\[((?:.*?))[\\|](?:(.*?))?\\]\\]?" );

	// Links without a pipe such as [[target]]; group 1 is the target
	private static final Pattern SIMPLE_LINK_PATTERN = Pattern.compile( "\\[?\\[([^|]*?)\\]\\]?" );

	// <ref>...</ref> pairs without attributes
	private static final Pattern SIMPLE_REF_PATTERN =
			Pattern.compile( "<\\s*ref\\s*>.*?<\\s*/\\s*ref\\s*>", Pattern.CASE_INSENSITIVE );

	// <ref ...>...</ref> pairs whose opening tag carries attributes and is not self closing
	private static final Pattern REF_PATTERN =
			Pattern.compile( "<\\s*ref(?:[^<]*?)[^/]>.*?<\\s*/\\s*ref\\s*>", Pattern.CASE_INSENSITIVE );

	// Self closing <ref ... /> tags
	private static final Pattern REF_TAG_PATTERN =
			Pattern.compile( "<\\s*ref(?:.*?)/>", Pattern.CASE_INSENSITIVE );

	// {{lang-xx|text}} or {{lang|xx|text}}; group 1 is the text
	private static final Pattern LANG_PATTERN =
			Pattern.compile( "\\{\\{lang(?:-|\\|)[a-z]+[|](.*?)\\}\\}", Pattern.CASE_INSENSITIVE );

	// {{convert|<digits>|<unit>}}; group 1 is the number, group 2 the unit
	private static final Pattern CONVERT_PATTERN =
			Pattern.compile( "\\{\\{convert\\|([0-9]+)\\|([a-zA-Z]+)\\}\\}", Pattern.CASE_INSENSITIVE );

	private static final Pattern BOLD_PATTERN = Pattern.compile( "'''(.+?)'''" );
	private static final Pattern ITALICS_PATTERN = Pattern.compile( "''(.+?)''" );

	/**
	 * Reads the wikitext stored under {@code parse.wikitext.*} in the given response and cleans it up.
	 *
	 * @param response the JSON response to read the wikitext from
	 * @return the cleaned up article text, or {@code null} if the response is {@code null}
	 * or does not contain the expected fields
	 */
	public static String processWikiArticle( final JSONObject response )
	{
		Log.d( TAG, "Received wiki ARTICLE data, parsing..." );

		if( response == null )
		{
			Log.w( TAG, "No wiki ARTICLE response to parse" );
			return null;
		}

		try
		{
			final JSONObject parse = response.getJSONObject( "parse" );
			final JSONObject text = parse.getJSONObject( "wikitext" );
			return cleanUpWikiText( text.getString( "*" ) );
		}
		catch( final JSONException e )
		{
			// Route the failure through the Android log, with the cause attached, instead of stderr
			Log.e( TAG, "Failed to parse wiki ARTICLE response", e );
			return null;
		}
	}

	/**
	 * Strips wiki syntax from the given wikitext, replacing it with mostly plain text
	 * or simple HTML for formatting.
	 *
	 * @param wikiText the raw wikitext
	 * @return the cleaned text, trimmed, with newlines replaced by {@code <br/>}
	 */
	private static String cleanUpWikiText( final String wikiText )
	{
		// Unescape the wikitext
		String articleText = wikiText.replace( "\\\"", "\"" );
		articleText = articleText.replace( "\\/", "/" );

		// This thing is often huge, lets remove it first to cut down the size of the text
		articleText = removeWikiElement( "Infobox", articleText );

		// The order matters: comments and references go first, then File assets (which use the
		// same [[ ]] brackets as links), and only then are the remaining links unwrapped.
		articleText = COMMENT_PATTERN.matcher( articleText ).replaceAll( "" );
		articleText = SIMPLE_REF_PATTERN.matcher( articleText ).replaceAll( "" );
		articleText = REF_PATTERN.matcher( articleText ).replaceAll( "" );
		articleText = REF_TAG_PATTERN.matcher( articleText ).replaceAll( "" );

		articleText = removeWikiAssetElement( "File", articleText );

		articleText = SIMPLE_LINK_PATTERN.matcher( articleText ).replaceAll( "$1" );
		articleText = GENERAL_LINK_PATTERN.matcher( articleText ).replaceAll( "$2" );
		articleText = LANG_PATTERN.matcher( articleText ).replaceAll( "$1" );
		articleText = CONVERT_PATTERN.matcher( articleText ).replaceAll( "$1 $2" );

		// Bold (''') has to be handled before italics ('') because it contains the italics marker
		articleText = BOLD_PATTERN.matcher( articleText ).replaceAll( "<strong>$1</strong>" );
		articleText = ITALICS_PATTERN.matcher( articleText ).replaceAll( "<em>$1</em>" );

		// Remove any remaining Wiki elements
		articleText = removeWikiElement( "", articleText );

		// Lastly trim any whitespace, and then space things out
		articleText = articleText.trim();
		articleText = articleText.replace( "\n", "<br/>" );

		return articleText;
	}

	/**
	 * Removes every {@code [[tag ... ]]} element, including anything nested inside it.
	 */
	private static String removeWikiAssetElement( final String tag, final String articleText )
	{
		return removeWikiElement( tag, articleText, "[[", "]]" );
	}

	/**
	 * Removes every <code>{{tag ... }}</code> element, including anything nested inside it.
	 */
	private static String removeWikiElement( final String tag, final String articleText )
	{
		return removeWikiElement( tag, articleText, "{{", "}}" );
	}

	/**
	 * Removes every element that starts with {@code openBracket + tag} (compared case-insensitively)
	 * up to and including its balancing {@code closeBracket}, taking nested brackets into account.
	 * If an element is never closed, only the part scanned up to the last bracket that was found
	 * is removed.
	 *
	 * @param tag          the text expected directly after the opening bracket, may be empty
	 * @param articleText  the text to clean
	 * @param openBracket  the opening bracket sequence
	 * @param closeBracket the closing bracket sequence
	 * @return the text with all matching elements removed
	 */
	private static String removeWikiElement( final String tag, final String articleText, final String openBracket, final String closeBracket )
	{
		final String tagStart = openBracket + tag;

		// All indices are computed on the very text being edited. Searching a separately lower-cased
		// copy is unsafe because String.toLowerCase can change the length of the text, which makes
		// the indices of the copy drift away from those of the original.
		final StringBuilder cleanedArticle = new StringBuilder( articleText );

		int searchFrom = 0;
		int startPos;
		while( (startPos = indexOfIgnoreCase( cleanedArticle, tagStart, searchFrom )) > -1 )
		{
			int endPos = startPos + tagStart.length();
			int openBrackets = 1;
			while( openBrackets > 0 )
			{
				final int nextOpen = cleanedArticle.indexOf( openBracket, endPos );
				final int nextClose = cleanedArticle.indexOf( closeBracket, endPos );

				if( nextOpen > -1 && nextOpen < nextClose )
				{
					endPos = nextOpen + openBracket.length();
					++openBrackets;
				}
				else if( nextClose > -1 )
				{
					endPos = nextClose + closeBracket.length();
					--openBrackets;
				}
				else
				{
					Log.d( TAG, "Broken WIki tag" );
					break;
				}
			}

			cleanedArticle.delete( startPos, endPos );

			// Nothing before startPos matched, but joining the text around the removed range may
			// have formed a new match that begins shortly before it, so back up far enough to see it.
			searchFrom = Math.max( 0, startPos - tagStart.length() + 1 );
		}

		return cleanedArticle.toString();
	}

	/**
	 * Finds the first position at or after {@code fromIndex} where {@code needle} occurs in
	 * {@code text}, comparing character by character and ignoring case.
	 *
	 * @return the index of the match, or -1 if there is none
	 */
	private static int indexOfIgnoreCase( final CharSequence text, final String needle, final int fromIndex )
	{
		final int lastStart = text.length() - needle.length();
		for( int start = Math.max( 0, fromIndex ); start <= lastStart; ++start )
		{
			if( matchesIgnoreCaseAt( text, start, needle ) )
			{
				return start;
			}
		}

		return -1;
	}

	/**
	 * Checks whether {@code needle} occurs in {@code text} at {@code offset}, ignoring case.
	 * The caller must make sure that the needle fits into the text at that offset.
	 */
	private static boolean matchesIgnoreCaseAt( final CharSequence text, final int offset, final String needle )
	{
		for( int i = 0; i < needle.length(); ++i )
		{
			final char actual = text.charAt( offset + i );
			final char expected = needle.charAt( i );
			if( actual != expected && Character.toLowerCase( actual ) != Character.toLowerCase( expected ) )
			{
				return false;
			}
		}

		return true;
	}

	/**
	 * Extracts the article title from a Wikipedia article URL such as
	 * {@code https://en.wikipedia.org/wiki/Falcon_9}.
	 *
	 * @param wikiUrl the URL to inspect, may be {@code null}
	 * @return the title part of the URL, or {@code null} if the URL is {@code null}, empty
	 * or not a Wikipedia article URL in the expected format
	 */
	public static String extractArticleTitle( final String wikiUrl )
	{
		if( wikiUrl == null || wikiUrl.isEmpty() )
		{
			return null;
		}

		final Matcher matcher = WIKI_ARTICLE_PATTERN.matcher( wikiUrl );
		return matcher.matches() ? matcher.group( 1 ) : null;
	}
}