package com.cognitionis.wiki_basickit; import java.io.*; import java.net.MalformedURLException; import java.net.URL; /** * @author Hector Llorens * @since 2011 */ public class Wiki_bk { public static String wiki2txt(String title) { return wiki2txt(title, "en", "ascii"); } public static String wiki2txt(String title, String lang) { String charset="ascii"; if (!lang.equalsIgnoreCase("en")){ charset="utf8"; } return wiki2txt(title, "en", charset); } public static String wiki2txt(String title, String lang, String charset) { try { title = title.trim().replaceAll(" ", "_"); if (title.matches("(http://)?(en|es).wikipedia.*")) { if (title.matches("(http://)?(en|es)\\.wikipedia\\.org/wiki/.+")) { lang = title.replaceFirst("(http://)?(en|es)\\..*", "$2"); title = title.replaceFirst(".*wiki/(.+)", "$1"); } else { throw new MalformedURLException("Malformed URL: " + title); } } if (lang == null) { lang = "en"; } File f = new File(title.replaceAll("/", "-") + "-" + lang + ".txt"); if (!f.exists()) { URL url; String line; String input = ""; url = new URL("http://" + lang + ".wikipedia.org/w/index.php?title=" + title + "&printable=yes"); BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream())); BufferedWriter writer = new BufferedWriter(new FileWriter(new File(title.replaceAll("/", "-") + ".cleanhtml"))); try { while ((line = reader.readLine()) != null) { input += line + " "; //System.err.println(line); } // IMP: faster than write only the body in the bucle like comented below // clean html for parsers: remove headers, remove link, img and other tags input = input.replaceFirst(".*<body[^>]*>(.*)</body>.*", "<html>$1</html>").replaceAll("(?i)<[/]?(a|img|small|br|input)(\\s+[^>]*)?>", "").replaceAll("&(nbsp|reg);", " ").replaceAll("\\s+", " ")+"\n"; //.replaceAll("<([^/])", "\n<$1"); //.replaceAll("-->", "-->\n"); writer.write(input); } finally { if (reader != null) { reader.close(); } if (writer != null) { writer.close(); } } // NOTE THAT NOW EN AND ES ARE THE SAME (unless charset is set) if (lang.equalsIgnoreCase("en")) { WikiHtml2PlainHandler wikihtml2plain = new WikiHtml2PlainHandler(); wikihtml2plain.init(charset); wikihtml2plain.saveFile(title.replaceAll("/", "-") + ".cleanhtml", title.replaceAll("/", "-") + "-" + lang + ".txt"); } else { System.err.println("Leaving accents an non-ascii chars"); WikiHtml2PlainESHandler wikihtml2plainES = new WikiHtml2PlainESHandler(); wikihtml2plainES.saveFile(title.replaceAll("/", "-") + ".cleanhtml", title.replaceAll("/", "-") + "-" + lang + ".txt"); } if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Creating " + title.replaceAll("/", "-") + "-" + lang + ".txt"); } File f2 = new File(title.replaceAll("/", "-") + ".cleanhtml"); f2.delete(); } return title.replaceAll("/", "-") + "-" + lang + ".txt"; } catch (Exception e) { System.err.println("Errors found :\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); } return null; } } }