package org.wikibrain.core.lang; import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.commons.lang3.StringUtils; import org.wikibrain.core.WikiBrainException; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.regex.Pattern; /** * Provides access to language-specific parsing information. * The data is loaded from tsv resources. */ public class LanguageInfo { private static Logger LOG = LoggerFactory.getLogger(LanguageInfo.class); private static final String INFO_FILENAME = "language_info.tsv"; /** * All Language parser information. * This array is parallel with the Language array. * Since we don't have parsing information for some languages, some values will be null. */ public static LanguageInfo LANGUAGE_INFOS[] = new LanguageInfo[Language.LANGUAGES.length]; static { InputStream stream = null; try { stream = Language.class.getClassLoader() .getResourceAsStream(INFO_FILENAME); loadAllLanguages(stream); } catch (WikiBrainException e) { throw new RuntimeException(e); // What else can we do? } finally { if (stream != null) IOUtils.closeQuietly(stream); } } public static LanguageInfo getByLanguage(Language lang) { return LANGUAGE_INFOS[lang.getId() - 1]; } public static LanguageInfo getById(int langId) { return LANGUAGE_INFOS[langId-1]; } public static LanguageInfo getByLangCode(String langCode) { return LANGUAGE_INFOS[Language.getByLangCode(langCode).getId() - 1]; } private Language language; private List<String> categoryNames = new ArrayList<String>(); private List<String> disambiguationCategoryNames = new ArrayList<String>(); private List<AltNamespaceStruct> alternativeArticleNamespaces = new ArrayList<AltNamespaceStruct>(); private Pattern redirectPattern = null; private Pattern categoryPattern = null; private Pattern defaultCategoryPattern = null; private Pattern categoryReplacePattern = null; private Pattern mainTemplatePattern = null; private Pattern seeAlsoTemplatePattern = null; private Pattern mainInlinePattern = null; private Pattern seeAlsoInlinePattern = null; private Pattern seeAlsoHeaderPattern = null; private int numLinks; private int numArticles; private LanguageInfo(Language language) { this.language = language; } public boolean hasAlternativeArticleNamespaces(){ return (alternativeArticleNamespaces.size() > 0); } public void setRedirectNames(List<String> names) { String rd_pattern = list2NonCapturingGroup(names)+":{0,1}\\s*\\[\\[(.+?)\\]\\]"; redirectPattern = Pattern.compile(rd_pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } public void setCategoryNames(List<String> names) { this.categoryNames = names; String cat_pattern = "\\A"+list2NonCapturingGroup(names)+ "\\s*:\\s*(.+)"; categoryPattern = Pattern.compile(cat_pattern,Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); String default_cat_pattern = "\\A"+names.get(0)+ "\\s*:\\s*(.+)"; defaultCategoryPattern = Pattern.compile(default_cat_pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); String cat_replace_pattern = "\\A("+list2NonCapturingGroup(names)+ ")\\s*:\\s*(.+)"; categoryReplacePattern = Pattern.compile(cat_replace_pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } public void setMainTemplates(List<String> names) { if (names.size() > 0){ String maintemplate_pattern = "\\A"+list2NonCapturingGroup(names)+"\\Z"; mainTemplatePattern = Pattern.compile(maintemplate_pattern,Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } } public void setSeeAlsoTemplates(List<String> names) { if (names.size() > 0){ String seealso_pattern = "\\A"+list2NonCapturingGroup(names)+"\\Z"; seeAlsoTemplatePattern = Pattern.compile(seealso_pattern,Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } } public void setSeeAlsoHeaders(List<String> names) { if (names.size() > 0){ String seealsoheader_pattern = "\\A"+list2NonCapturingGroup(names)+"\\Z"; seeAlsoHeaderPattern = Pattern.compile(seealsoheader_pattern,Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } } public void setMainInlines(List<String> names) { if (names.size() > 0){ String maininline_pattern = list2NonCapturingGroup(names); mainInlinePattern = Pattern.compile(maininline_pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } } public void setSeeAlsoInlines(List<String> names) { if (names.size() > 0){ String seealsoinline_pattern = list2NonCapturingGroup(names); seeAlsoInlinePattern = Pattern.compile(seealsoinline_pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } } public void setDisambiguationCategoryNames(List<String> names) { this.disambiguationCategoryNames = names; } public void setAlternativeArticleNamespaces(String cell) { this.alternativeArticleNamespaces = getAltNamespaces(cell); } public Language getLanguage() { return language; } public List<String> getCategoryNames() { return this.categoryNames; } public List<String> getDisambiguationCategoryNames() { return disambiguationCategoryNames; } public List<AltNamespaceStruct> getAlternativeArticleNamespaces() { return alternativeArticleNamespaces; } public Pattern getRedirectPattern() { return redirectPattern; } public Pattern getCategoryPattern() { return categoryPattern; } public Pattern getDefaultCategoryPattern() { return defaultCategoryPattern; } public Pattern getCategoryReplacePattern() { return categoryReplacePattern; } public Pattern getMainTemplatePattern() { return mainTemplatePattern; } public Pattern getSeeAlsoTemplatePattern() { return seeAlsoTemplatePattern; } public Pattern getMainInlinePattern() { return mainInlinePattern; } public Pattern getSeeAlsoInlinePattern() { return seeAlsoInlinePattern; } public Pattern getSeeAlsoHeaderPattern() { return seeAlsoHeaderPattern; } public static class AltNamespaceStruct{ public final String prefix; public final Integer nsId; public AltNamespaceStruct(String scsv){ String[] parts = scsv.split(";"); this.prefix = parts[0]; this.nsId = Integer.parseInt(parts[1]); } @Override public String toString(){ return prefix + ";" + nsId; } @Override public boolean equals(Object o){ if (o instanceof String){ return (this.prefix.equals(o)); }else if(o instanceof AltNamespaceStruct){ AltNamespaceStruct ans = (AltNamespaceStruct)o; return (ans.nsId.equals(nsId) && ans.prefix.equals(o)); }else{ return false; } } } private List<AltNamespaceStruct> getAltNamespaces(String cell){ List<AltNamespaceStruct> rVal = new ArrayList<AltNamespaceStruct>(); if (cell.length() > 0){ String[] nss = cell.split(","); for(String ns : nss){ AltNamespaceStruct ans = new AltNamespaceStruct(ns); rVal.add(ans); } } return rVal; } public int getNumLinks() { return numLinks; } public int getNumArticles() { return numArticles; } public void setNumLinks(int numLinks) { this.numLinks = numLinks; } public void setNumArticles(int numArticles) { this.numArticles = numArticles; } public String getDefaultCategoryNamespaceName(){ return categoryNames.get(0); } private static List<String> csv2List(String csv){ if (csv.length() > 0){ if (csv.charAt(0)=='"'&&csv.charAt(csv.length()-1)=='"'){ csv = csv.substring(1,csv.length()-1); } return Arrays.asList(csv.split(",")); }else{ return new ArrayList<String>(); } } public static String list2NonCapturingGroup(Collection<String> list){ return "(?:" + StringUtils.join(list, "|") + ")"; } private static void loadLanguage(String fields[], String line) throws WikiBrainException { String tokens[] = StringUtils.splitPreserveAllTokens(line, "\t"); if (tokens.length != fields.length) { throw new WikiBrainException("invalid number of fields in " + StringEscapeUtils.escapeJava(line)); } Language lang = Language.getByLangCode(tokens[0]); LanguageInfo info = new LanguageInfo(lang); for (int i = 1; i < fields.length; i++) { if (fields[i].equals("alternativeArticleNamespaces")) { info.setAlternativeArticleNamespaces(tokens[i]); } else { try { if (fields[i].startsWith("num")) { BeanUtils.setProperty(info, fields[i], Integer.valueOf(tokens[i])); } else { BeanUtils.setProperty(info, fields[i], csv2List(tokens[i])); } } catch (IllegalAccessException e) { throw new WikiBrainException("unknown property in LanguageInfo: " + fields[i]); } catch (InvocationTargetException e) { throw new WikiBrainException(e); } } } LANGUAGE_INFOS[lang.getId() - 1] = info; } private static void loadAllLanguages(InputStream stream) throws WikiBrainException { List<String> lines; try { lines = IOUtils.readLines(stream, "UTF-8"); } catch (IOException e) { throw new WikiBrainException(e); } // read and validate header String fields[] = StringUtils.splitPreserveAllTokens(lines.get(0), "\t"); if (!fields[0].equals("langCode")) { throw new WikiBrainException( "invalid header in " + INFO_FILENAME + ": " + StringEscapeUtils.escapeJava(lines.get(0))); } for (String line : lines.subList(1, lines.size())) { loadLanguage(fields, line); } } }