package com.s24.wiki; import java.util.HashSet; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import com.beust.jcommander.internal.Lists; import com.google.common.base.CharMatcher; import com.google.common.base.Splitter; import edu.jhu.nlp.wikipedia.WikiPage; public class EnglishGrammarPageParser extends PageParser { private final static Pattern pattern = Pattern.compile("\\{\\{en-noun\\|(.*)\\}\\}"); private final static CharMatcher regularStems = CharMatcher.anyOf("|~-?!"); private final static CharMatcher illegalCharacters = CharMatcher.anyOf("'%&/.´`*$@{[]"); private final boolean includeRegularStems; public EnglishGrammarPageParser(PageParserCallback cb, boolean includeRegularStems) { super(cb); this.includeRegularStems = includeRegularStems; } @Override public void parse(WikiPage page) { Matcher matcher = pattern.matcher(page.getWikiText()); Set<String> left = new HashSet<>(); Set<String> right = new HashSet<>(); String word = page.getTitle().trim().toLowerCase(Locale.US); left.add(word); right.add(word); // legal words ony if (!illegalCharacters.matchesAnyOf(word) && !CharMatcher.DIGIT.matchesAnyOf(word)) { // explcit plural given if (matcher.find()) { String stem = matcher.group(1).toLowerCase(Locale.US); // no illegal stems (identical, nostem) for (String s : Splitter.on('|').trimResults().split(stem)) { if (!regularStems.matchesAllOf(s) && !illegalCharacters.matchesAnyOf(s) && !CharMatcher.DIGIT.matchesAnyOf(s)) { if ("s".equalsIgnoreCase(s) || "es".equalsIgnoreCase(s)) { if (includeRegularStems) { left.add(word + s); } } else if (s.length() > 2) { left.add(s); } } else if (includeRegularStems && "-".equals(s)) { left.add(word + "s"); } else if (includeRegularStems && "~".equals(s)) { left.add(word + "s"); } } } else if (includeRegularStems) { left.add(word + "s"); } } // System.out.println("Found: " + StringUtils.join(left, ",") + " => " + right.iterator().next()); callback.callback(Lists.newArrayList(left), Lists.newArrayList(right)); } @Override protected String getName() { return "Übersicht"; } }