package com.s24.wiki.links; import java.io.IOException; import java.net.URL; import java.net.URLEncoder; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.google.common.base.Predicate; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; public class WikipediaExpanderDao implements ExpanderDao { private Logger log = LoggerFactory.getLogger(WikipediaExpanderDao.class); public Set<String> expand(String keyword) { SortedSet<String> result = new TreeSet<String>(); result.add(nomalizeValue(keyword)); try { List<String> attributes = Lists.newArrayList("links", "categories"); URL url = new URL("http://de.wikipedia.org/w/api.php?action=parse&format=xml&page=" + URLEncoder.encode(keyword, "UTF-8") + "&prop=" + StringUtils.join(attributes , '|')); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(url.openStream()); NodeList links = doc.getElementsByTagName("links"); if (links.getLength() == 1 && links.item(0).getChildNodes().getLength() == 1) { result.addAll(expand(links.item(0).getChildNodes().item(0).getTextContent())); } for (String attribute : attributes) { NodeList parent = doc.getElementsByTagName(attribute); if (parent.getLength() == 1) { NodeList children = parent.item(0).getChildNodes(); for (int i = 0; i < children.getLength(); i++) { Node node = children.item(i); result.add(nomalizeValue(node.getTextContent())); } } } } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } return filterResult(result); } private String nomalizeValue(String str) { return str.replace('_', ' ').replaceAll("\\(.*?\\)", "").trim(); } private Set<String> filterResult(final Set<String> result) { return Sets.filter(result, new Predicate<String>() { @Override public boolean apply(String input) { return !input.contains(":"); } }); } private boolean isDefinition(Set<String> result) { return result.contains("Wikipedia:Begriffsklärung"); } @Override public Map<String, Set<String>> expand(List<String> queries) { Map<String, Set<String>> terms = Maps.newHashMap(); for (String query : queries) { Set<String> result = expand(query); // found a page && is not a definition page if (result.size() > 1 && !isDefinition(result)) { result = filterResult(result); terms.put(query, result); log.info(query + " -> " + StringUtils.join(result, "|")); } } return terms; } }