/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.msnbc;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class MSNBC_XMLHandler extends DefaultHandler implements MSNBC_Result {
private static final Logger LOGGER = LoggerFactory.getLogger(MSNBC_XMLHandler.class);
private static final String DOCUMENT_TAG_NAME = "ReferenceProblem";
private static final String DOCUMENT_FILE_NAME_TAG_NAME = "ReferenceFileName";
private static final String MARKING_TAG_NAME = "ReferenceInstance";
private static final String MARKING_SURFACE_FORM_TAG_NAME = "SurfaceForm";
private static final String MARKING_OFFSET_TAG_NAME = "Offset";
private static final String MARKING_LENGH_TAG_NAME = "Length";
private static final String MARKING_MEANING_TAG_NAME = "ChosenAnnotation";
private static final String MARKING_NUMBER_OF_ANNOTATORS_TAG_NAME = "NumAnnotators";
private static final String MARKING_ANNOTATOR_ID_TAG_NAME = "AnnotatorId";
private static final String MARKING_ANNOTATION_TAG_NAME = "Annotation";
protected String textFileName;
protected List<MSNBC_NamedEntity> nes = new ArrayList<MSNBC_NamedEntity>();
protected int state = 0;
protected MSNBC_NamedEntity currentNE;
protected StringBuilder buffer = new StringBuilder();
public List<MSNBC_NamedEntity> getMarkings() {
return nes;
}
public String getTextFileName() {
return textFileName;
}
@Override
public void startDocument() throws SAXException {
state = 0;
textFileName = null;
nes.clear();
currentNE = null;
super.startDocument();
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
switch (qName) {
case DOCUMENT_FILE_NAME_TAG_NAME: {
state = 1;
buffer.setLength(0);
break;
}
case MARKING_TAG_NAME: {
currentNE = new MSNBC_NamedEntity();
break;
}
case MARKING_SURFACE_FORM_TAG_NAME: {
state = 2;
buffer.setLength(0);
break;
}
case MARKING_OFFSET_TAG_NAME: {
state = 3;
buffer.setLength(0);
break;
}
case MARKING_LENGH_TAG_NAME: {
state = 4;
buffer.setLength(0);
break;
}
case MARKING_MEANING_TAG_NAME: {
state = 5;
buffer.setLength(0);
break;
}
case DOCUMENT_TAG_NAME: // falls through
case MARKING_NUMBER_OF_ANNOTATORS_TAG_NAME:
case MARKING_ANNOTATOR_ID_TAG_NAME:
case MARKING_ANNOTATION_TAG_NAME: {
state = 0;
break;
}
default: {
LOGGER.warn("Found an unknown XML tag name \"" + localName + "\". It will be ignored.");
break;
}
}
super.startElement(uri, localName, qName, attributes);
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
super.characters(ch, start, length);
if (state > 0) {
buffer.append(ch, start, length);
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
switch (qName) {
case DOCUMENT_FILE_NAME_TAG_NAME: {
textFileName = buffer.toString().trim();
break;
}
case MARKING_TAG_NAME: {
if (currentNE.isComplete()) {
nes.add(currentNE);
} else {
LOGGER.warn("Got an incomplete named entity " + currentNE.toString() + ". It will be discarded.");
}
currentNE = null;
break;
}
case MARKING_SURFACE_FORM_TAG_NAME: {
if (currentNE != null) {
currentNE.setSurfaceForm(buffer.toString().trim());
} else {
LOGGER.error("Found a \"" + MARKING_SURFACE_FORM_TAG_NAME + "\" tag outside of a \"" + MARKING_TAG_NAME
+ "\" tag. It will be ignored.");
}
break;
}
case MARKING_OFFSET_TAG_NAME: {
if (currentNE != null) {
try {
int offset = Integer.parseInt(buffer.toString().trim());
currentNE.setStartPosition(offset);
} catch (NumberFormatException e) {
LOGGER.error("Couldn't parse the start position of a named entity. buffer=\"" + buffer + "\"");
}
} else {
LOGGER.error("Found a \"" + MARKING_OFFSET_TAG_NAME + "\" tag outside of a \"" + MARKING_TAG_NAME
+ "\" tag. It will be ignored.");
}
break;
}
case MARKING_LENGH_TAG_NAME: {
if (currentNE != null) {
try {
int length = Integer.parseInt(buffer.toString().trim());
currentNE.setLength(length);
} catch (NumberFormatException e) {
LOGGER.error("Couldn't parse the length of a named entity. buffer=\"" + buffer + "\"");
}
} else {
LOGGER.error("Found a \"" + MARKING_LENGH_TAG_NAME + "\" tag outside of a \"" + MARKING_TAG_NAME
+ "\" tag. It will be ignored.");
}
break;
}
case MARKING_MEANING_TAG_NAME: {
if (currentNE != null) {
currentNE.addUri(buffer.toString().trim());
} else {
LOGGER.error("Found a \"" + MARKING_MEANING_TAG_NAME + "\" tag outside of a \"" + MARKING_TAG_NAME
+ "\" tag. It will be ignored.");
}
break;
}
default: {
// nothing to do
}
}
super.endElement(uri, localName, qName);
}
}