package org.wikipedia.miner.extract.model;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.util.Languages.Language;
import org.wikipedia.miner.extract.util.Languages.NamespaceAlias;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.extract.util.SiteInfo.Namespace;
import org.wikipedia.miner.extract.util.Util;

/**
 * Splits the markup of a single link into its components: an optional
 * language prefix, a namespace, a target title, an optional section and an
 * anchor text, and packages them as a {@link DumpLink}.
 *
 * <p>NOTE(review): the markup is presumably the text found between the double
 * square brackets of a wiki link; confirm against the callers.
 */
public class DumpLinkParser {

	private static final Logger logger = Logger.getLogger(DumpLinkParser.class) ;

	private final Language language ;
	private final SiteInfo siteInfo ;

	// Left package-private and non-final so that existing same-package users keep working.
	Pattern langPattern ;
	Pattern namespacePattern ;
	Pattern filePattern ;

	/**
	 * Builds the patterns used by {@link #parseLink(String, String)}.
	 *
	 * @param lc supplies the namespace aliases that are recognised as namespace prefixes
	 * @param si supplies the namespaces that are recognised as namespace prefixes
	 */
	public DumpLinkParser(Language lc, SiteInfo si) {

		this.language = lc ;
		this.siteInfo = si ;

		// "<lowercase letters or dashes>:<rest>"
		langPattern = Pattern.compile("([a-z\\-]+)\\:(.*)", Pattern.DOTALL) ;

		// Every name is quoted so that regex metacharacters inside a namespace
		// name or alias are matched literally instead of corrupting the pattern.
		List<String> namespaces = new ArrayList<String>() ;
		for (Namespace namespace : siteInfo.getNamespaces()) {
			namespaces.add(quoteName(namespace.getName())) ;
		}
		for (NamespaceAlias alias : language.getNamespaceAliases()) {
			namespaces.add(quoteName(alias.getFrom())) ;
		}

		namespacePattern = Pattern.compile(
				"(" + StringUtils.join(namespaces, "|") + ")\\:(.*)",
				Pattern.CASE_INSENSITIVE | Pattern.DOTALL) ;

		//TODO: this should really be loaded from an external file that can be modified easily
		filePattern = Pattern.compile("(.*)\\.(gif|png|jpg|jpeg|ogg|ogv|svg)", Pattern.CASE_INSENSITIVE) ;
	}

	/**
	 * Parses one link.
	 *
	 * <ul>
	 * <li>A leading {@code xx:} prefix made of lowercase letters and dashes is taken as the language.</li>
	 * <li>A leading known namespace name or alias (case-insensitive) selects the namespace;
	 *     otherwise the main namespace is used.</li>
	 * <li>The text before the first pipe is the target, the text after the last pipe is the anchor.</li>
	 * <li>Anything after the first {@code #} in the target is the section.</li>
	 * <li>An empty target is replaced by {@code sourceTitle}; an empty anchor is replaced by the target.</li>
	 * </ul>
	 *
	 * @param markup the link markup; must not be null
	 * @param sourceTitle title used as the target when the link has none (e.g. a section-only link)
	 * @return the parsed link, with its target passed through {@link Util#normaliseTitle}
	 * @throws Exception declared by the signature; nothing in this method throws it directly,
	 *         it can only originate from the helpers that are called
	 */
	public DumpLink parseLink(String markup, String sourceTitle) throws Exception {

		markup = markup.trim() ;

		String lang = null ;
		Namespace namespace ;
		String section = null ;

		// get language code, if any
		// NOTE(review): this also matches an all-lowercase namespace prefix such as
		// "category:Foo", which then ends up as the language rather than the namespace.
		Matcher m = langPattern.matcher(markup) ;
		if (m.matches()) {
			lang = m.group(1) ;
			markup = m.group(2).trim() ;
		}

		// get namespace, if any
		m = namespacePattern.matcher(markup) ;
		if (m.matches()) {
			namespace = getNamespace(m.group(1)) ;
			markup = m.group(2).trim() ;
		} else {
			namespace = siteInfo.getMainNamespace() ;
		}

		// String.split drops trailing empty strings, so "Foo|" gives a single chunk
		// and markup made only of pipes gives no chunks at all.
		String[] chunks = markup.split("\\|") ;

		String target ;
		String anchor ;
		if (chunks.length == 0) {
			// pipe-only markup: treat it exactly like empty markup instead of
			// failing with an ArrayIndexOutOfBoundsException
			target = "" ;
			anchor = "" ;
		} else {
			// first chunk is the target, last chunk is the anchor (they coincide when there is no pipe)
			target = chunks[0].trim() ;
			anchor = chunks[chunks.length - 1].trim() ;
		}

		// handle sections
		int poundIndex = target.indexOf('#') ;
		if (poundIndex >= 0) {
			section = target.substring(poundIndex + 1) ;
			target = target.substring(0, poundIndex) ;
		}

		// handle files that weren't properly put in the File namespace
		m = filePattern.matcher(target) ;
		if (m.matches()) {
			namespace = siteInfo.getNamespace(SiteInfo.FILE_KEY) ;
		}

		// just put up a warning about any links with multiple pipes that were not to files (they are weird)
		// NOTE(review): if getKey() and FILE_KEY are both boxed types, != compares references; confirm the types.
		if (namespace.getKey() != SiteInfo.FILE_KEY && chunks.length > 2) {
			logger.warn("Too many pipes: " + markup) ;

			//TODO: this is hacky, we should have a more graceful way of getting rid of these weird links
			namespace = siteInfo.getNamespace(SiteInfo.SPECIAL_KEY) ;
		}

		// handle internal links
		if (target.length() == 0) {
			target = sourceTitle ;
		}

		// handle pipe trick
		if (anchor.length() == 0) {
			anchor = target ;
		}

		target = Util.normaliseTitle(target) ;

		return new DumpLink(lang, namespace, target, section, anchor) ;
	}

	/**
	 * Resolves a namespace prefix. The name is first looked up as an alias; when
	 * an alias exists its destination name is used for the lookup instead. Unknown
	 * names are logged and resolved to the main namespace.
	 */
	private Namespace getNamespace(String name) {

		NamespaceAlias alias = language.getAlias(name) ;

		String lookupName = (alias == null) ? name : alias.getTo() ;
		Namespace namespace = siteInfo.getNamespace(lookupName) ;

		if (namespace == null) {
			logger.warn("Unknown namespace: " + name);
			namespace = siteInfo.getMainNamespace() ;
		}

		return namespace ;
	}

	/**
	 * Quotes a namespace name for literal use inside a regular expression.
	 * A null name becomes an empty alternative, which is how StringUtils.join
	 * rendered null elements before quoting was introduced.
	 */
	private static String quoteName(String name) {
		return Pattern.quote(name == null ? "" : name) ;
	}
}