package evidencemining.parse.wikipedia; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Stack; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; /** * HTML to plain-text. Second step for Wikipedia evidence Mining. Each HTML file * is transformed into a plaintext file, only containing the plain text and * entity annotations. Structural elements like tables, headlines, references * will be removed. * * @author Stefan Zwicklbauer */ public class S1HtmlToPlainTextWithEntities extends HTMLEditorKit.ParserCallback { public static String MAINDIR = "/mnt/storage/zwicklbauer/WikiParse/ger_wiki/dump/html/"; public static String SAVEDIR = "/mnt/storage/zwicklbauer/WikiParse/ger_wiki/dump/plain/"; // public static String REDIRECTFILE = "/home/zwicklbauer/HDTGeneration/redirects_en.nt"; public static final int NUMBERTHREADS = 40; private StringBuffer stringBuffer; private Stack<IndexType> indentStack; // private static HashMap<String, String> redirects = new HashMap<String, String>(); private boolean isEntity; private boolean tableContent; private boolean isList; private boolean isHeader; private boolean isStyle; private StringBuffer substring; private static HashSet<String> filesHash = new HashSet<String>(); private HashMap<String, Integer> writtenFiles; public static class IndexType { public String type; public int counter; // used for ordered lists public IndexType(String type) { this.type = type; counter = 0; } } public S1HtmlToPlainTextWithEntities() { super(); stringBuffer = new StringBuffer(); indentStack = new Stack<IndexType>(); isEntity = false; tableContent = false; isList = false; isHeader = false; isStyle = false; substring = new StringBuffer(); this.writtenFiles = new HashMap<String, Integer>(); } public static void createFileHash(File dir) { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory()) { String filen = files[i].getName().replaceAll(".html", ""); filen = filen.replaceAll("'", "%"); try { filen = URLDecoder.decode(filen, "UTF-8"); filen = URLEncoder.encode(filen, "UTF-8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } filesHash.add(filen); } else { createFileHash(files[i]); } } } // public static void createRedirectHashMap(File main) { // Model model = ModelFactory.createDefaultModel(); // model.read(REDIRECTFILE); // StmtIterator iter = model.listStatements(); // while (iter.hasNext()) { // Statement stmt = iter.nextStatement(); // Resource subject = stmt.getSubject(); // String sourceurl = subject.toString().replaceAll( // "http://dbpedia.org/resource/", ""); // // try { // sourceurl = URLDecoder.decode(sourceurl, "UTF-8"); // sourceurl = URLEncoder.encode(sourceurl, "UTF-8"); // } catch (UnsupportedEncodingException e) { // e.printStackTrace(); // } // // RDFNode object = stmt.getObject(); // String targeturl = ""; // if (object instanceof Resource) { // targeturl = object.toString().replaceAll( // "http://dbpedia.org/resource/", ""); // try { // targeturl = URLDecoder.decode(targeturl, "UTF-8"); // targeturl = URLEncoder.encode(targeturl, "UTF-8"); // } catch (UnsupportedEncodingException e) { // e.printStackTrace(); // } // } // if (filesHash.contains(targeturl)) { // redirects.put(sourceurl, targeturl); // } // } // } public static String convert(String html) { S1HtmlToPlainTextWithEntities parser = new S1HtmlToPlainTextWithEntities(); Reader in = new StringReader(html); try { // the HTML to convert parser.parse(in); } catch (Exception e) { } finally { try { in.close(); } catch (IOException ioe) { // this should never happen } } return parser.getText(); } public void parse(Reader in) throws IOException { ParserDelegator delegator = new ParserDelegator(); // the third parameter is TRUE to ignore charset directive delegator.parse(in, this, Boolean.TRUE); } private boolean checkFile(String filename) { return filesHash.contains(filename); } public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t.toString().equals("a") && checkLink(a.toString())) { @SuppressWarnings("rawtypes") Enumeration e = a.getAttributeNames(); Object object = null; while (e.hasMoreElements()) { Object obj = e.nextElement(); if (obj.toString().equalsIgnoreCase("href")) { object = obj; break; } } if (!String.valueOf(a.getAttribute(object)).contains("#")) { String link = a.getAttribute(object).toString(); link = link.replaceAll(".html", ""); link = link.replaceAll("'", "%"); try { link = URLDecoder.decode(link, "UTF-8"); link = URLEncoder.encode(link, "UTF-8"); } catch (UnsupportedEncodingException e1) { e1.printStackTrace(); } if (checkFile(link)) { // if (redirects.containsKey(link)) { // String back = link; // link = redirects.get(link); // System.out.print("Redirect From: "+back+"To: "+link); // } substring.append("<a " + object.toString() + "=\"" + link + "\">"); isEntity = true; } } } else if (t.toString().equals("p")) { if (stringBuffer.length() > 0 && !stringBuffer.substring(stringBuffer.length() - 1) .equals("\n")) { newLine(); } newLine(); } else if (t.toString().equals("ol")) { indentStack.push(new IndexType("ol")); newLine(); } else if (t.toString().equals("ul")) { isList = true; indentStack.push(new IndexType("ul")); newLine(); } else if (t.toString().equals("li")) { indentStack.push(new IndexType("li")); } else if (t.toString().equals("dl")) { newLine(); } else if (t.toString().equals("dt")) { newLine(); } else if (t.toString().equals("dd")) { indentStack.push(new IndexType("dd")); newLine(); } else if (t.toString().equals("h1") || t.toString().equals("h2") || t.toString().equals("h3") || t.toString().equals("h4") || t.toString().equals("h5") || t.toString().equals("h6")) { isHeader = true; } else if (t.toString().equals("style")) { isStyle = true; } if (t.toString().equals("table")) { tableContent = true; } } private boolean checkLink(String l) { if (l.contains("href=") && l.contains("id=w") && !tableContent && !l.contains("template") && !l.contains("Template") && !isList && !isHeader && !l.toLowerCase().contains("http:") && !l.contains("/")) { return true; } return false; } private void newLine() { } public void handleEndTag(HTML.Tag t, int pos) { if (t.toString().equals("a") && isEntity) { String s[] = substring.toString().split(">"); if (!s[1].matches("[ *]")) { substring.append("</a>"); stringBuffer.append(substring.toString()); } substring = new StringBuffer(); isEntity = false; } else if (t.toString().equals("p")) { newLine(); } else if (t.toString().equals("ol")) { indentStack.pop(); ; newLine(); } else if (t.toString().equals("ul")) { indentStack.pop(); ; newLine(); isList = false; } else if (t.toString().equals("li")) { indentStack.pop(); ; newLine(); } else if (t.toString().equals("dd")) { indentStack.pop(); ; } else if (t.toString().equals("h1") || t.toString().equals("h2") || t.toString().equals("h3") || t.toString().equals("h4") || t.toString().equals("h5") || t.toString().equals("h6")) { isHeader = false; } else if (t.toString().equals("style")) { isStyle = false; } if (t.toString().equals("table")) { tableContent = false; } } public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t.toString().equals("br")) { newLine(); } } public void handleText(char[] text, int pos) { String s = new String(text); s = s.replaceAll("\\[\\d*\\]", " "); s = s.replaceAll("<[0-9A-Za-z \\ /\n\r\t]*>", " "); s = s.replaceAll("[^A-Za-z0-9 \n\t.,!? äöüÄÜÖ]", " "); s = s.replaceAll("[\\s]+", " "); if (!tableContent && !isList && !s.contains("Template") && !isHeader && !isStyle) { if (isEntity) { substring.append(s); } else { stringBuffer.append(s); } } } public String getText() { return stringBuffer.toString(); } public static void main(String args[]) { File mainDirectory = new File(MAINDIR); S1HtmlToPlainTextWithEntities.createFileHash(mainDirectory); // S1HtmlToPlainTextWithEntities.createRedirectHashMap(mainDirectory); S1HtmlToPlainTextWithEntities s1 = new S1HtmlToPlainTextWithEntities(); s1.readFile(mainDirectory); ; } public void readFile(File file) { File[] files = file.listFiles(); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) { readFile(files[i]); } else { String oldFileName = files[i].getName(); File outputFile = new File(SAVEDIR + oldFileName); processFile(files[i], outputFile); } } } public void processFile(File input, File outputFile) { String c = ""; try { String line = null; BufferedReader reader = new BufferedReader(new FileReader(input)); while ((line = reader.readLine()) != null) { c += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } String output = convert(c); output = output.replaceAll("DEFAULTSORT", ""); if (output.length() > 10) { int length = -1; if (writtenFiles.containsKey(outputFile.getName())) { length = writtenFiles.get(outputFile.getName()); } if (length == -1 || length < output.length()) { PrintWriter writer; try { writer = new PrintWriter(new FileWriter(outputFile, false)); writer.write("<?xml version='1.0' encoding='utf-8'?><content>" + output + "</content>"); writer.flush(); writer.close(); writtenFiles.put(outputFile.getName(), output.length()); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } } class Thread implements Runnable { private List<File> toprocess; public Thread(List<File> toprocess) { super(); this.toprocess = toprocess; } @Override public void run() { for (int i = 0; i < toprocess.size(); i++) { doAction(toprocess.get(i)); } } private void doAction(File f) { if (!f.isDirectory()) { String oldFileName = f.getName(); File outputFile = new File(SAVEDIR + oldFileName); processFile(f, outputFile); } else { processDir(f); } } private void processDir(File f) { File[] dir = f.listFiles(); for (int i = 0; i < dir.length; i++) { if (!dir[i].isDirectory()) { doAction(dir[i]); } else { processDir(dir[i]); } } } } }