/*
*
* This SAX parser treats a basic NLP corpus document in XML
*
* The basic elements are a main tag (e.g., TimeML), then a DOC tag that may contain a DOC_?(ID|NO) tag,
* and basic <TEXT> and <s> tags wrapping the complete text and each segmented sentence.
*
* If <TEXT> -> Only consider text in <TEXT>
* If <s> -> Only consider text in <s>
* If <TEXT> && <s> -> Only consider text in <s> in <TEXT>
*
* <TEXT> takes priority: if there is a <TEXT> element, anything outside it (even <s> elements) is not considered.
*
* Removes all the tags and returns the text between them (including sentence segmentation if present).
* Replaces XML escapes (e.g. the entity for '&') with the original characters.
*
*/
package com.cognitionis.utils_basickit;
import com.cognitionis.utils_basickit.SAXReader;
import java.util.*;
import org.xml.sax.*;
/**
 * SAX handler that reduces a basic XML-encoded NLP corpus document to plain text.
 *
 * <p>Selection rules (tag names are matched case-insensitively):
 * <ul>
 *   <li>If the document has a {@code <TEXT>} element, only content inside it is kept.</li>
 *   <li>If the kept region has {@code <s>} elements, only content inside them is kept and
 *       the sentences are emitted one per line (line breaks inside a sentence are removed).</li>
 *   <li>Otherwise all character data of the kept region is emitted unchanged.</li>
 * </ul>
 *
 * <p>The result is written to the {@code strBuilder} field inherited from {@code SAXReader}
 * (declared outside this file). No visible code resets the parse-state fields between
 * documents, so a fresh handler per document is presumably expected - verify against callers.
 *
 * @author Hector Llorens
 * @since 2011
 */
public class Xml2PlainHandler extends SAXReader {

    /**
     * Parse-state flags: {@code in*} = currently inside that element,
     * {@code has*} = such an element has been seen. {@code inDocid} is unused
     * (DOC_?(ID|NO) extraction is disabled) and kept only for compatibility.
     */
    boolean inText = false, inSentence = false, hasText = false, hasSentence = false, inDocid = false;
    /** Character data collected while no sentence segmentation has been seen. */
    StringBuilder textStrb;
    /** Character data of the {@code <s>} element currently open. */
    StringBuilder sentenceStrb;
    /** Qualified name of the first element reported by the parser. */
    String root_tag = null;
    /** Finished sentences waiting to be flushed to the output. */
    ArrayList<String> sentences;
    /** Number of currently open elements; reaches 0 exactly when the root element closes. */
    private int depth = 0;

    /**
     * Tracks entry into {@code <TEXT>} and {@code <s>} elements and switches the
     * collection mode accordingly.
     *
     * @param uri        namespace URI (unused)
     * @param localName  local name (unused)
     * @param tag        qualified element name
     * @param attributes element attributes (unused)
     * @throws SAXException declared by the overridden method; not thrown here
     */
    @Override
    public void startElement(final String uri, final String localName,
            final String tag, final Attributes attributes) throws SAXException {
        depth++;
        if (textStrb == null) {
            textStrb = new StringBuilder();
        }
        if (root_tag == null) {
            root_tag = tag;
        }

        if (tag.equalsIgnoreCase("text")) {
            if (!hasText) {
                // First <TEXT>: everything collected so far lies outside it and is discarded,
                // including a sentence that may still be open around this element.
                textStrb = new StringBuilder();
                hasSentence = false;
                inSentence = false;
                sentenceStrb = null;
                sentences = null;
            }
            hasText = true;
            inText = true;
        }

        // A sentence counts only inside <TEXT>, or anywhere when the document has no <TEXT>.
        if (tag.equalsIgnoreCase("s") && (inText || !hasText)) {
            // The list is also null after a previous <TEXT> has been flushed.
            if (!hasSentence || sentences == null) {
                hasSentence = true;
                sentences = new ArrayList<String>();
                textStrb = null; // unsegmented text is no longer needed
            }
            sentenceStrb = new StringBuilder();
            inSentence = true;
        }
    }

    /**
     * Collects character data (tags excluded) into the buffer selected by the current mode.
     * The parser may deliver one run of text in several calls, hence the appends.
     *
     * @param c      character array supplied by the parser
     * @param start  offset of the first character to read
     * @param length number of characters to read
     */
    @Override
    public void characters(final char[] c, final int start, final int length) {
        if (hasText && !inText) {
            return; // the document has a <TEXT> element and we are outside of it
        }
        if (hasSentence) {
            if (inSentence) {
                sentenceStrb.append(c, start, length);
            }
        } else {
            textStrb.append(c, start, length);
        }
    }

    /**
     * Finishes sentences and flushes the collected content to {@code strBuilder} when a
     * {@code <TEXT>} element closes, or when the root closes in a document without {@code <TEXT>}.
     *
     * @param uri       namespace URI (unused)
     * @param localName local name (unused)
     * @param tag       qualified element name
     * @throws SAXException declared by the overridden method; not thrown here
     */
    @Override
    public void endElement(final String uri,
            final String localName,
            final String tag)
            throws SAXException {
        depth--;

        if (tag.equalsIgnoreCase("text") && inText) {
            inText = false;
            if (hasSentence) {
                flushSentences();
            } else if (textStrb != null) {
                if (strBuilder == null || strBuilder.length() == 0) {
                    strBuilder = textStrb;
                } else {
                    // A later <TEXT> element must not overwrite what an earlier one produced.
                    strBuilder.append(textStrb);
                }
            }
            textStrb = null;
        }

        // Must run before the root check below so that a root-level <s> is stored first.
        if (tag.equalsIgnoreCase("s") && inSentence) {
            inSentence = false;
            if (sentences == null) {
                sentences = new ArrayList<String>();
            }
            // No entity un-escaping is needed: the SAX parser reports character data with
            // entity references already expanded.
            sentences.add(sentenceStrb.toString().replaceAll("(\n|\r)", ""));
            sentenceStrb = null;
        }

        // Root element closed and the document had no <TEXT>: flush what was collected.
        // Depth is used instead of comparing names so that a nested element sharing the
        // root's name cannot trigger an early flush. (endDocument could be used as well.)
        if (depth == 0 && !hasText) {
            if (hasSentence) {
                flushSentences();
            } else if (textStrb != null) {
                output().append(textStrb);
                textStrb = null;
            }
        }
    }

    /**
     * Appends the pending sentences to the output, one per line, and releases the list.
     * A line break is inserted first when the output already has content.
     */
    private void flushSentences() {
        if (sentences != null && !sentences.isEmpty()) {
            final StringBuilder out = output();
            if (out.length() > 0) {
                out.append('\n');
            }
            for (int i = 0; i < sentences.size(); i++) {
                if (i > 0) {
                    out.append('\n');
                }
                out.append(sentences.get(i));
            }
        }
        sentences = null;
    }

    /** Returns the inherited output buffer, creating it if it has not been set. */
    private StringBuilder output() {
        if (strBuilder == null) {
            strBuilder = new StringBuilder();
        }
        return strBuilder;
    }
}