package com.s24.wiki;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.jhu.nlp.wikipedia.WikiPage;

/**
 * Page parser that extends the word forms reported by {@link GermanGrammarPageParser}
 * with the terms linked in a page's <code>{{Unterbegriffe}}</code> section.
 *
 * <p>For every page accepted by {@code isValidPage}, the callback receives the grammar
 * forms as the left list and the grammar forms followed by the lower-cased linked
 * sub-terms (without duplicates) as the right list.
 */
public class GermanSubwordPageParser extends GermanPageParser {

   /**
    * The <code>{{Unterbegriffe}}</code> marker line followed by one or more lines that
    * start with a colon. Every line, including the last one, must end with a line feed.
    */
   private static final Pattern SECTION_PATTERN =
         Pattern.compile("\\{\\{Unterbegriffe\\}\\}\n(:.*\n)+");

   /**
    * A wiki link such as <code>[[Wort]]</code> whose target consists of letters only and
    * starts with an upper-case letter. <code>\p{Lu}</code> is used instead of
    * <code>\p{Upper}</code> because the latter matches only US-ASCII A-Z by default and
    * would skip words beginning with an umlaut (Ä, Ö, Ü).
    */
   private static final Pattern WORD_PATTERN =
         Pattern.compile("\\[\\[(\\p{Lu}[\\p{L}]+)\\]\\]");

   /**
    * Creates the parser.
    *
    * @param cb callback handed to the superclass; it is invoked once per parsed valid page
    */
   public GermanSubwordPageParser(PageParserCallback cb) {
      super(cb);
   }

   /**
    * Parses the given page and reports its grammar forms together with its sub-terms.
    *
    * <p>Pages rejected by {@code isValidPage} are ignored and the callback is not
    * invoked for them. If the page has no <code>{{Unterbegriffe}}</code> section, the
    * right list contains the same entries as the left list.
    *
    * @param page the wiki page to inspect
    */
   @Override
   public void parse(WikiPage page) {
      if (!isValidPage(page)) {
         return;
      }

      Matcher section = SECTION_PATTERN.matcher(page.getWikiText());

      // Collect the first list of every callback issued by the grammar parser;
      // the second list it reports is not used here.
      final List<String> left = new ArrayList<String>();
      PageParser grammar = new GermanGrammarPageParser(new PageParserCallback() {
         @Override
         public void callback(List<String> l, List<String> r) {
            left.addAll(l);
         }
      });
      grammar.parse(page);

      // The right side starts as a copy of the grammar forms and is then extended.
      List<String> right = new ArrayList<String>(left);

      // Only the first section occurrence on the page is evaluated.
      if (section.find()) {
         Matcher words = WORD_PATTERN.matcher(section.group());
         while (words.find()) {
            // NOTE(review): toLowerCase() uses the default locale; kept as-is because the
            // casing used by GermanGrammarPageParser is not visible here -- confirm before
            // switching to an explicit locale.
            String item = words.group(1).toLowerCase();
            if (!right.contains(item)) {
               right.add(item);
            }
         }
      }

      callback.callback(left, right);
   }

   /**
    * Returns the name of this parser.
    *
    * @return the constant {@code "Unterbegriffe"}
    */
   @Override
   protected String getName() {
      return "Unterbegriffe";
   }
}