package org.wikipedia.miner.web.util; import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.servlet.http.HttpServletRequest; import org.wikipedia.miner.model.Article; import org.wikipedia.miner.model.Wikipedia; import org.wikipedia.miner.util.EmphasisResolver; import org.wikipedia.miner.util.MarkupStripper; import org.dmilne.xjsf.param.EnumParameter; import com.sleepycat.je.DatabaseException; public class MarkupFormatter { public enum EmphasisFormat{PLAIN,WIKI,HTML} ; public enum LinkFormat{PLAIN,HTML,WIKI,WIKI_ID} ; private EmphasisResolver emphasisResolver = new EmphasisResolver() ; private MarkupStripper stripper = new MarkupStripper() ; private EnumParameter<EmphasisFormat> prmEmphasisFormat ; private EnumParameter<LinkFormat> prmLinkFormat ; protected Pattern linkPattern = Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.DOTALL) ; public MarkupFormatter() { String[] descEmphasisFormat = {"all emphasis discarded","as mediawiki markup", "as html markup"} ; prmEmphasisFormat = new EnumParameter<EmphasisFormat>("emphasisFormat", "The format of bold and italic markup within returned snippets", EmphasisFormat.HTML, EmphasisFormat.values(), descEmphasisFormat) ; String[] descLinkFormat = {"all links discarded", "as html links to wikipedia", "as mediawiki markup", "as modified mediawiki markup [[id|anchor]]"} ; prmLinkFormat = new EnumParameter<LinkFormat>("linkFormat", "The format of link markup within returned snippets", LinkFormat.HTML, LinkFormat.values(), descLinkFormat) ; } public EnumParameter<EmphasisFormat> getEmphasisFormatParam() { return prmEmphasisFormat ; } public EnumParameter<LinkFormat> getLinkFormatParam() { return prmLinkFormat ; } /** * Returns a copy of the given snippet of media-wiki markup, where links to the given topics * have been replaced with bold emphasis, and all other links have been discarded. * * @param markup the mediawiki markup to be modified * @param topicIds a HashSet of topic ids that are to be highlighted * @param wikipedia an instance of wikipedia that can be used to resolve links * @return the modified snippet of mediawiki markup */ public String highlightTopics(String markup, HashSet<Integer> topicIds, Wikipedia wikipedia) { Matcher m = linkPattern.matcher(markup) ; int lastPos = 0 ; StringBuffer sb = new StringBuffer() ; while(m.find()) { String link = m.group(1) ; String anchor ; String dest ; int pos = link.lastIndexOf("|") ; if (pos >1) { dest = link.substring(0,pos) ; anchor = link.substring(pos+1) ; } else { dest = link ; anchor = link ; } Article art = wikipedia.getArticleByTitle(dest) ; sb.append(markup.substring(lastPos, m.start())) ; if (art != null && topicIds.contains(art.getId())) { sb.append("'''") ; sb.append(anchor) ; sb.append("'''") ; } else { sb.append(anchor) ; } lastPos = m.end() ; } sb.append(markup.substring(lastPos)) ; return sb.toString() ; } /** * * * @param markup * @param emphasisFormat * @param linkFormat * @param wikipedia * @return * @throws DatabaseException */ public String format(String markup, HttpServletRequest request, Wikipedia wikipedia) throws DatabaseException { markup = stripper.stripAllButInternalLinksAndEmphasis(markup, null) ; //deal with emphasis formatting EmphasisFormat emphasisFormat = prmEmphasisFormat.getDefaultValue() ; if (request != null) emphasisFormat = prmEmphasisFormat.getValue(request) ; switch(emphasisFormat) { case PLAIN : markup = stripper.stripEmphasis(markup, null) ; case HTML : markup = emphasisResolver.resolveEmphasis(markup) ; break ; } // deal with links LinkFormat linkFormat = prmLinkFormat.getDefaultValue() ; if (request != null) linkFormat = prmLinkFormat.getValue(request) ; if (linkFormat == LinkFormat.WIKI) return markup ; if (linkFormat == LinkFormat.PLAIN) { markup = stripper.stripInternalLinks(markup, null) ; return markup ; } Matcher m = linkPattern.matcher(markup) ; int lastPos = 0 ; StringBuffer sb = new StringBuffer() ; while(m.find()) { sb.append(markup.substring(lastPos, m.start())) ; String link = m.group(1) ; String anchor ; String dest ; int pos = link.lastIndexOf("|") ; if (pos >1) { dest = link.substring(0,pos) ; anchor = link.substring(pos+1) ; } else { dest = link ; anchor = link ; } Article art = wikipedia.getArticleByTitle(dest) ; if (art == null) { sb.append(anchor) ; } else { switch(linkFormat) { case HTML: sb.append("<a href=\"http://www." + wikipedia.getConfig().getLangCode() + ".wikipedia.org/wiki/" + art.getTitle() + "\">") ; sb.append(anchor) ; sb.append("</a>") ; break ; case WIKI_ID: sb.append("[[" + art.getId() + "|" + anchor + "]]") ; break ; } } lastPos = m.end() ; } sb.append(markup.substring(lastPos)) ; markup = sb.toString() ; if (linkFormat != LinkFormat.WIKI && linkFormat != LinkFormat.WIKI_ID) { markup = markup.replaceAll("\\[\\[", "") ; markup = markup.replaceAll("\\]\\]", "") ; } return markup ; } }