package org.wikibrain.parser.wiki; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.model.RawPage; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import static com.google.common.base.Preconditions.checkArgument; public class SubarticleParser { private final LanguageInfo lang; public SubarticleParser(LanguageInfo lang) { this.lang = lang; } public ParsedLink.SubarticleType isSeeAlsoHeader(LanguageInfo lang, String headerName){ if (lang.getSeeAlsoHeaderPattern() != null && headerName != null){ Matcher m = lang.getSeeAlsoHeaderPattern().matcher(headerName); if (m.find()) return ParsedLink.SubarticleType.SEEALSO_HEADER; } return null; } private static final Pattern templatePipePattern = Pattern.compile("([^\\|]+)"); private static final Pattern templateLinkPattern = Pattern.compile("\\[\\[(.+?)\\]\\]"); public List<String> getContentsOfTemplatePipe(String templateText){ // to catch links in templates like this in Hebrew: {{ערך מורחב|ערכים=[[ספרות עברית]], [[ספרות ישראלית]]}} List<String> rVal = new ArrayList<String>(); Matcher m = templateLinkPattern.matcher(templateText); while(m.find()){ rVal.add(m.group(1)); } if (rVal.size() == 0){ m = templatePipePattern.matcher(templateText); int index = 0; while(m.find()){ if (index > 0){ // don't want to include the template name rVal.add(m.group(1)); } index++; } } return rVal; } public static String removeTemplateAnchor(String templateText){ String[] parts = templateText.split("\\{\\{!\\}\\}"); return parts[0]; } public ParsedLink.SubarticleType isTemplateSubarticle(String templateName, String templateText) { ParsedLink.SubarticleType rVal = null; if (lang.getMainTemplatePattern() != null){ Matcher m = lang.getMainTemplatePattern().matcher(templateName); if (m.find()) rVal = ParsedLink.SubarticleType.MAIN_TEMPLATE; } if (rVal == null && lang.getSeeAlsoTemplatePattern() != null){ Matcher m = lang.getSeeAlsoTemplatePattern().matcher(templateName); if (m.find()) rVal = ParsedLink.SubarticleType.SEEALSO_TEMPLATE; } if (rVal != null){ rVal = handleSpecialTemplateBasedSubarticleSpecialCases(templateName, templateText, rVal); } return rVal; } private static int LEFT_WINDOW = 150; // max distance to look for newline char; given that subarticle lines tend to be short, this should be more than enough public ParsedLink.SubarticleType isInlineSubarticle(int location, RawPage pageXml){ if ((lang.getMainInlinePattern() != null || lang.getSeeAlsoInlinePattern() != null) && location > 0){ Boolean valid = false; Integer beginningOfValidText = null; String window = null; Integer searchLoc = null; if (location-LEFT_WINDOW > 0){ int leftBoundary = location-LEFT_WINDOW; window = pageXml.getBody().substring(leftBoundary, location); searchLoc = window.length()-1; char curChar; boolean found = false; do{ curChar = window.charAt(searchLoc); if (curChar == '\n' || curChar == '\r'){ found = true; } searchLoc--; }while(searchLoc >= 0 && !found); if (found){ valid = true; beginningOfValidText = leftBoundary+searchLoc; checkArgument(beginningOfValidText >= 0, window); } }else{ beginningOfValidText = 0; valid = true; } if(!valid) return null; String textUntilNewLine = pageXml.getBody().substring(beginningOfValidText, location); if (lang.getMainInlinePattern() != null){ Matcher m = lang.getMainInlinePattern().matcher(textUntilNewLine); if (m.find()){ return ParsedLink.SubarticleType.MAIN_INLINE; } } if (lang.getSeeAlsoInlinePattern() != null){ Matcher m = lang.getSeeAlsoInlinePattern().matcher(textUntilNewLine); if (m.find()) return ParsedLink.SubarticleType.SEEALSO_INLINE; } } return null; } private final static Pattern special_DanishSeOgs = Pattern.compile("Tekst\\s*=\\s*Se også", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); public ParsedLink.SubarticleType handleSpecialTemplateBasedSubarticleSpecialCases(String templateName, String templateText, ParsedLink.SubarticleType normalType) { ParsedLink.SubarticleType rVal = normalType; // danish if (lang.getLanguage().getLangCode().equals("de")){ if (normalType.equals(ParsedLink.SubarticleType.MAIN_TEMPLATE)){ Matcher m = special_DanishSeOgs.matcher(templateText); if (m.find()){ rVal = ParsedLink.SubarticleType.SEEALSO_TEMPLATE; } } } return rVal; } }