package org.wikipedia.miner.extract.util; import java.io.File; import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.simpleframework.xml.Attribute; import org.simpleframework.xml.Element; import org.simpleframework.xml.ElementList; import org.simpleframework.xml.Root; import org.simpleframework.xml.Serializer; import org.simpleframework.xml.Transient; import org.simpleframework.xml.core.Persister; @Root public class Languages { @ElementList(inline=true, entry="Language") private List<Language> languages; @Transient private Map<String,Integer> languageIndexesByCode ; private Map<String,Integer> getLanguageIndexesByCode() { if (languageIndexesByCode != null) return languageIndexesByCode ; languageIndexesByCode = new HashMap<String,Integer>() ; int index = 0 ; for (Language lang:languages) { languageIndexesByCode.put(lang.getCode(), index) ; index++ ; } return languageIndexesByCode ; } public Language get(String code) { Integer index = getLanguageIndexesByCode().get(code) ; if (index == null) return null ; return languages.get(index) ; } public static Languages load(File file) throws Exception { Serializer serializer = new Persister(); return serializer.read(Languages.class, file); } public static Languages load(InputStream input) throws Exception { Serializer serializer = new Persister(); return serializer.read(Languages.class, input) ; } public static class Language { @Attribute private String code ; @Attribute private String name ; @Attribute private String localName ; @Element(name="RootCategory") private String rootCategory ; @ElementList(inline=true, entry="DisambiguationCategory") private List<String> disambigCategories ; @ElementList(inline=true, entry="DisambiguationTemplate") private List<String> disambigTemplates ; @ElementList(inline=true, entry="RedirectIdentifier") private List<String> redirectIdentifiers ; @ElementList(inline=true, required=false, entry="NamespaceAlias") private List<NamespaceAlias> namespaceAliases ; @Transient private Pattern disambigPattern ; @Transient private Pattern redirectPattern ; @Transient private Map<String,Integer> aliasMap ; public String getCode() { return code; } public String getName() { return name; } public String getLocalName() { return localName; } public String getRootCategory() { return rootCategory; } public List<String> getDisambigCategories() { return disambigCategories; } public List<String> getDisambigTemplates() { return disambigTemplates; } public List<String> getRedirectIdentifiers() { return redirectIdentifiers; } public List<NamespaceAlias> getNamespaceAliases() { return namespaceAliases; } public Pattern getDisambigPattern() { if (disambigPattern != null) return disambigPattern ; String disambigCategoryRegex = null ; if (!disambigCategories.isEmpty()) { StringBuffer tmp = new StringBuffer() ; tmp.append("\\[\\[\\s*") ; if (disambigCategories.size() == 1) { tmp.append(disambigCategories.get(0)) ; } else { tmp.append("(") ; for (String dc:disambigCategories) { tmp.append(dc) ; tmp.append("|") ; } tmp.deleteCharAt(tmp.length()-1) ; tmp.append(")") ; } tmp.append("\\s*\\]\\]") ; disambigCategoryRegex = tmp.toString() ; } String disambigTemplateRegex = null ; if (!disambigTemplates.isEmpty()) { StringBuffer tmp = new StringBuffer() ; tmp.append("\\{\\{\\s*") ; if (disambigTemplates.size() == 1) { tmp.append(disambigTemplates.get(0)) ; } else { tmp.append("(") ; for (String dt:disambigTemplates) { tmp.append(dt) ; tmp.append("|") ; } tmp.deleteCharAt(tmp.length()-1) ; tmp.append(")") ; } tmp.append("\\s*\\}\\}") ; disambigTemplateRegex = tmp.toString() ; } if (disambigCategoryRegex == null && disambigTemplateRegex == null) { throw new NullPointerException("language configuration does not specify any categories or templates for identifying disambiguation pages") ; } if (disambigCategoryRegex != null && disambigTemplateRegex != null) { disambigPattern = Pattern.compile("(" + disambigCategoryRegex + "|" + disambigTemplateRegex + ")", Pattern.CASE_INSENSITIVE) ; } else { if (disambigCategoryRegex != null) disambigPattern = Pattern.compile(disambigCategoryRegex, Pattern.CASE_INSENSITIVE) ; else disambigPattern = Pattern.compile(disambigTemplateRegex, Pattern.CASE_INSENSITIVE) ; } return disambigPattern ; } public Pattern getRedirectPattern() { if (redirectPattern != null) return redirectPattern ; StringBuffer redirectRegex = new StringBuffer("\\#") ; redirectRegex.append("(") ; for (String ri:redirectIdentifiers) { redirectRegex.append(ri) ; redirectRegex.append("|") ; } redirectRegex.deleteCharAt(redirectRegex.length()-1) ; redirectRegex.append(")[:\\s]*(?:\\[\\[(.*)\\]\\]|(.*))") ; redirectPattern = Pattern.compile(redirectRegex.toString(), Pattern.CASE_INSENSITIVE) ; return redirectPattern ; } private Map<String,Integer> getAliasMap() { if (aliasMap != null) return aliasMap ; aliasMap = new HashMap<String,Integer>() ; int index = 0 ; for (NamespaceAlias alias:namespaceAliases) { aliasMap.put(alias.from.toLowerCase(), index) ; index++ ; } return aliasMap ; } public NamespaceAlias getAlias(String fromNamespace) { Integer index = getAliasMap().get(fromNamespace.toLowerCase()) ; if (index == null) return null ; return namespaceAliases.get(index) ; } } public static class NamespaceAlias { public String getFrom() { return from; } public String getTo() { return to; } @Attribute private String from ; @Attribute private String to ; } }