package core.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.StringReader; import java.net.URL; import java.net.URLConnection; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.text.html.HTMLEditorKit.ParserCallback; import javax.swing.text.html.parser.ParserDelegator; public class WikiReader { private final static Pattern PARAGRAPH = Pattern.compile("<p>.*</p>"); //$NON-NLS-1$ public String getWikiLink(String article, String lang) { article = article.replaceAll(" ", "_"); //$NON-NLS-1$ //$NON-NLS-2$ return "http://" + lang + ".wikipedia.org/wiki/" + article; //$NON-NLS-1$ //$NON-NLS-2$ } public String readFirstParagraph(String article, String lang) throws IOException { URL url = new URL(this.getWikiLink(article, lang)); URLConnection connection = url.openConnection(); BufferedReader r = null; try { r = new BufferedReader(new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8"))); //$NON-NLS-1$ String line = null; StringBuilder b = new StringBuilder(); while ((line = r.readLine()) != null) { b.append(line); b.append("\n"); //$NON-NLS-1$ if (b.indexOf("</p>") != -1) { //$NON-NLS-1$ // we read everything we needed break; } } String wikiText = b.toString(); Matcher m = PARAGRAPH.matcher(wikiText); if (!m.find()) { throw new IOException("Error while reading the first paragraph"); //$NON-NLS-1$ } String result = wikiText.substring(m.start(), m.end()); return this.removeHtml(result); } finally { if (r != null) { try { r.close(); } catch (IOException e) {}; } } } public String removeHtml(String s) throws IOException { final StringBuilder b = new StringBuilder(); ParserDelegator delegator = new ParserDelegator(); // the third parameter is TRUE to ignore charset directive delegator.parse(new StringReader(s), new ParserCallback() { public void handleText(char[] data, int pos) { b.append(data); }; }, true); return b.toString(); } }