package com.cognitionis.wiki_basickit;
import java.util.*;
import org.xml.sax.*;
import com.cognitionis.utils_basickit.SAXReader;
/**
 * SAX handler that reduces Wiki-style (X)HTML to plain text.
 *
 * <p>The text of every {@code <p>} element (and, inside an {@code <html>}
 * element, of every {@code h1}-{@code h4} heading that is not black-listed)
 * is collected as one "sentence". Text inside {@code <table>} or
 * {@code <sup>} elements is ignored. When the {@code <html>} element (or,
 * for documents without one, the root element) closes, the collected
 * sentences are written to {@code strBuilder}. That field is not declared
 * in this class; presumably it is inherited from {@link SAXReader}.
 *
 * <p>If no {@code <p>} element was seen, the raw character data is handed
 * over instead.
 *
 * @author Hector Llorens
 * @since 2011
 */
public class WikiHtml2PlainHandler extends SAXReader {

    /** Heading tags whose text is collected (case-sensitive, unlike the other tag checks). */
    private static final java.util.regex.Pattern HEADING_TAG =
            java.util.regex.Pattern.compile("h[1234]");
    /** Removed from a heading before it is compared with {@link #SKIPPED_HEADINGS}. */
    private static final java.util.regex.Pattern HEADING_EDIT_LINK =
            java.util.regex.Pattern.compile("(\n|\r|\\s*\\[\\s*edit(ar)?\\s*\\]\\s*)");
    /** Section titles that are never emitted. */
    private static final java.util.regex.Pattern SKIPPED_HEADINGS =
            java.util.regex.Pattern.compile("(Media|Animated maps|See also|Notes|References|External links)");
    /** Replaced by a single blank when normalising paragraph text. */
    private static final java.util.regex.Pattern PARAGRAPH_BREAKS =
            java.util.regex.Pattern.compile("(\n|\r|\\p{javaSpaceChar})");
    /** Replaced by a single blank when normalising heading text. */
    private static final java.util.regex.Pattern HEADING_BREAKS =
            java.util.regex.Pattern.compile("(\n|\r|\\s*\\[\\s*edit(ar)?\\s*\\]\\s*|\t)");
    private static final java.util.regex.Pattern WHITESPACE_RUN =
            java.util.regex.Pattern.compile("\\s+");
    /** Em dash and en dash, written as escapes so the source encoding does not matter. */
    private static final java.util.regex.Pattern DASHES =
            java.util.regex.Pattern.compile("(\u2014|\u2013)");
    /** Right single quotation mark, written as an escape for the same reason. */
    private static final java.util.regex.Pattern CURLY_APOSTROPHE =
            java.util.regex.Pattern.compile("\u2019");
    private static final java.util.regex.Pattern NON_ASCII =
            java.util.regex.Pattern.compile("[^\\p{ASCII}]");

    boolean inText = false, inSection = false, hasText = false, hasSentence = false, inSentence = false;
    boolean inH = false; // inside a section title (h1-h4)
    boolean inSup = false; // inside <sup>: references / citations
    int inTable = 0; // nesting level of <table> elements
    StringBuilder textStrb; // raw text, used while no <p> has been seen
    StringBuilder sentenceStrb; // text of the <p> currently open
    StringBuilder H2Strb; // text of the heading currently open
    String root_tag = null; // name of the first element seen; never reset
    ArrayList<String> sentences;
    String encoding = "utf8";
    /** Number of currently open elements; 0 means the root element has just closed. */
    private int depth = 0;

    /**
     * Selects the output character set.
     *
     * @param charset when equal to {@code "ascii"}, collected text is NFD-normalised
     *                and every non-ASCII character is dropped; any other value
     *                leaves the text untouched
     */
    public void init(String charset) {
        encoding = charset;
    }

    /**
     * Updates the parsing state for an opening tag.
     *
     * @param uri        namespace URI (unused)
     * @param localName  local name (unused)
     * @param tag        qualified element name, used for all tag checks
     * @param attributes element attributes (unused)
     * @throws SAXException declared by the SAX contract; not thrown here
     */
    @Override
    public void startElement(final String uri, final String localName,
            final String tag, final Attributes attributes) throws SAXException {
        depth++;
        if (textStrb == null) {
            textStrb = new StringBuilder();
        }
        if (root_tag == null) {
            root_tag = tag;
        }
        if (tag.equalsIgnoreCase("table")) {
            inTable++;
        }
        if (tag.equalsIgnoreCase("sup")) {
            inSup = true;
        }
        if (HEADING_TAG.matcher(tag).matches()) {
            H2Strb = new StringBuilder();
            inH = true;
        }
        if (tag.equalsIgnoreCase("html")) {
            if (!hasText) {
                // First <html>: discard anything collected before it.
                textStrb = new StringBuilder();
                hasSentence = false;
                sentences = null;
            }
            hasText = true;
            inText = true;
        }
        // Paragraphs count inside <html>, or anywhere when no <html> has been seen.
        if (tag.equalsIgnoreCase("p") && (inText || !hasText)) {
            if (!hasSentence) {
                // Switch from raw-text mode to sentence mode.
                hasSentence = true;
                sentences = new ArrayList<String>();
                textStrb = null;
            }
            sentenceStrb = new StringBuilder();
            inSentence = true;
        }
    }

    /**
     * Collects character data (text only, tags excluded) into the buffer
     * selected by the current state.
     *
     * @param c      character array handed over by the parser
     * @param start  offset of the first character to read
     * @param length number of characters to read
     */
    @Override
    public void characters(final char[] c, final int start, final int length) {
        if (hasText && !inText) {
            return; // an <html> element was seen and is already closed
        }
        if (!hasSentence) {
            textStrb.append(c, start, length);
            return;
        }
        if (inTable != 0 || inSup) {
            return; // table cells and citations are not part of the plain text
        }
        if (inSentence) {
            sentenceStrb.append(c, start, length);
        }
        // Headings are only collected for documents that have an <html> element.
        if (hasText && inH) {
            H2Strb.append(c, start, length);
        }
    }

    /**
     * Updates the parsing state for a closing tag and, when the
     * {@code <html>} element or the document root closes, writes the
     * collected text to {@code strBuilder}.
     *
     * @param uri       namespace URI (unused)
     * @param localName local name (unused)
     * @param tag       qualified element name, used for all tag checks
     * @throws SAXException declared by the SAX contract; not thrown here
     */
    @Override
    public void endElement(final String uri,
            final String localName,
            final String tag)
            throws SAXException {
        if (depth > 0) {
            depth--;
        }
        if (tag.equalsIgnoreCase("html") && inText) {
            inText = false;
            if (!hasSentence) {
                System.out.println("no sentences");
                strBuilder = textStrb;
            } else {
                flushSentences("\n\n");
            }
            textStrb = null;
        }
        if (tag.equalsIgnoreCase("p") && inSentence) {
            inSentence = false;
            if (sentenceStrb.length() > 0) {
                sentences.add(normalise(sentenceStrb, PARAGRAPH_BREAKS));
            }
            sentenceStrb = null;
        }
        if (tag.equalsIgnoreCase("table") && inTable > 0) {
            inTable--;
        }
        if (tag.equalsIgnoreCase("sup") && inSup) {
            inSup = false;
        }
        if (HEADING_TAG.matcher(tag).matches()) {
            inH = false;
            // H2Strb is null when a nested heading has already been closed.
            if (H2Strb != null && H2Strb.length() > 0
                    && !SKIPPED_HEADINGS.matcher(
                            HEADING_EDIT_LINK.matcher(H2Strb).replaceAll("")).matches()) {
                // NOTE: the NFD + ASCII reduction done in normalise() does not always work.
                sentences.add(normalise(H2Strb, HEADING_BREAKS) + ".");
            }
            H2Strb = null;
        }
        // Documents without an <html> element are flushed when the root closes.
        // The depth check keeps a nested element that merely shares the root's
        // name from triggering the flush early.
        if (depth == 0 && tag.equalsIgnoreCase(root_tag) && !hasText) {
            if (hasSentence) {
                flushSentences("\n");
            } else {
                strBuilder = textStrb;
                textStrb = null;
            }
        }
    }
    // startDocument/endDocument could be used for the set-up and flush instead.

    /**
     * Appends all collected sentences to {@code strBuilder}, separated by
     * {@code separator}, and releases the list. Does nothing but release
     * the list when it is empty.
     */
    private void flushSentences(final String separator) {
        if (sentences != null) {
            for (int i = 0; i < sentences.size(); i++) {
                if (i > 0) {
                    strBuilder.append(separator);
                }
                strBuilder.append(sentences.get(i));
            }
        }
        sentences = null;
    }

    /**
     * Cleans up collected text: replaces {@code breaks} matches with a blank,
     * collapses whitespace runs, rewrites em/en dashes as {@code " - "} and
     * curly apostrophes as {@code '}, trims, and reduces the result to ASCII
     * when the {@code "ascii"} encoding is selected.
     */
    private String normalise(final CharSequence raw, final java.util.regex.Pattern breaks) {
        String text = breaks.matcher(raw).replaceAll(" ");
        text = WHITESPACE_RUN.matcher(text).replaceAll(" ");
        text = DASHES.matcher(text).replaceAll(" - ");
        text = CURLY_APOSTROPHE.matcher(text).replaceAll("'");
        text = text.trim();
        if ("ascii".equals(encoding)) {
            text = java.text.Normalizer.normalize(text, java.text.Normalizer.Form.NFD);
            text = NON_ASCII.matcher(text).replaceAll("");
        }
        return text;
    }
}