package com.cognitionis.wiki_basickit; import com.cognitionis.utils_basickit.SAXReader; import java.util.*; import org.xml.sax.*; /** * @author Hector Llorens * @since 2011 */ public class WikiHtml2PlainESHandler extends SAXReader { boolean inText = false, inSection = false, hasText=false, hasSentence=false, inSentence=false; boolean inH=false; // section titles boolean inSup=false; // references / citations int inTable=0; // StringBuilder docidStrb; StringBuilder textStrb; StringBuilder sentenceStrb; StringBuilder H2Strb; // String docid; String root_tag = null; ArrayList<String> sentences; @Override public void startElement(final String uri, final String localName, final String tag, final Attributes attributes) throws SAXException { //System.err.println("found "+tag); if (textStrb == null) { textStrb = new StringBuilder(); } //System.out.println("tag: "+tag); if (root_tag == null) { root_tag = tag; } if (tag.equalsIgnoreCase("table")) { inTable++; } if (tag.equalsIgnoreCase("sup")) { inSup=true; } if (tag.matches("h[1234]")) { H2Strb=null; H2Strb=new StringBuilder(); inH=true; } if (tag.equalsIgnoreCase("html")) { if (!hasText) { textStrb = null; textStrb = new StringBuilder(); hasSentence = false; sentences = null; } hasText = true; inText = true; } if (tag.equalsIgnoreCase("p")) { if ((hasText && inText) || !hasText) { if (!hasSentence) { hasSentence = true; sentences = null; sentences = new ArrayList<String>(); textStrb = null; // For the garbage collector - free memory } // reload sentenceStrb sentenceStrb = null; // For the garbage collector - free memory sentenceStrb = new StringBuilder(); inSentence = true; } } } /* * Only text, excluding all tags */ @Override public void characters(final char[] c, final int start, final int length) { //System.err.print(c); /* if (inDocid) { docidStrb.append(c, start, length); } */ if (hasText) { if (inText) { if (hasSentence) { if (inSentence && inTable==0 && !inSup) { sentenceStrb.append(c, start, length); } if (inH && inTable==0 && !inSup) { H2Strb.append(c, start, length); } } else { textStrb.append(c, start, length); } } } else { if (hasSentence) { if (inSentence && inTable==0 && !inSup) { sentenceStrb.append(c, start, length); } } else { textStrb.append(c, start, length); } } } @Override public void endElement(final String uri, final String localName, final String tag) throws SAXException { if (tag.equalsIgnoreCase("html") && inText) { inText = false; if (!hasSentence) { System.out.println("no sentences"); strBuilder = textStrb; } else { int n = sentences.size() - 1; for (int i = 0; i < n; i++) { strBuilder.append(sentences.get(i) + "\n\n"); } strBuilder.append(sentences.get(n)); sentences = null; } textStrb = null; // For the garbage collector - free memory } if (tag.equalsIgnoreCase("p") && inSentence) { inSentence = false; if(sentenceStrb.length()>0){ String temp=sentenceStrb.toString().replaceAll("(\n|\r)", " ").replaceAll("\\s+", " ").replaceAll("(—|–|\\|)", " - ").replaceAll("’", "'").replaceAll("…", " ").replaceAll("(“|”)", "\"").trim(); //temp=java.text.Normalizer.normalize(temp, java.text.Normalizer.Form.NFD); sentences.add(temp); } sentenceStrb = null; // For the garbage collector - free memory } if (tag.equalsIgnoreCase("table") && inTable>0) { inTable--; } if (tag.equalsIgnoreCase("sup") && inSup) { inSup=false; } if (tag.matches("h[1234]")) { inH=false; if(H2Strb.length()>0 && !H2Strb.toString().replaceAll("(\n|\r|\\s*\\[\\s*edit(ar)?\\s*\\]\\s*)", "").matches("(Media|Animated maps|See also|Notes|References|External links|Véase también|Referencias|Libros relacionados|Enlaces externos)")){ String temp=H2Strb.toString().replaceAll("(\n|\r|\\s*\\[\\s*edit(ar)?\\s*\\]\\s*\t)", " ").replaceAll("\\s+", " ").replaceAll("(—|–|\\|)", " - ").replaceAll("’", "'").replaceAll("…", " ").replaceAll("(“|”)", "\"").trim(); //temp=java.text.Normalizer.normalize(temp, java.text.Normalizer.Form.NFD); sentences.add(temp+"."); } H2Strb = null; // For the garbage collector - free memory } // ho puc fer quan s'acaba el document si no tenia text... if (tag.equalsIgnoreCase(root_tag)) { if (!hasText) { if (hasSentence) { int n = sentences.size() - 1; for (int i = 0; i < n; i++) { strBuilder.append(sentences.get(i) + "\n"); } strBuilder.append(sentences.get(n)); sentences = null; } else { strBuilder = textStrb; textStrb = null; // For the garbage collector - free memory } } } } //es pot gastar start i enddocument...a. }