/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.iitb;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class IITB_XMLHandler extends DefaultHandler {
private static final Logger LOGGER = LoggerFactory.getLogger(IITB_XMLHandler.class);
private static final String ANNOTATIONS_LIST_TAG_NAME = "iitb.CSAW.entityAnnotations";
private static final String ANNOTATION_TAG_NAME = "annotation";
private static final String DOCUMENT_FILE_NAME_TAG_NAME = "docName";
private static final String WIKI_TITLE_TAG_NAME = "wikiName";
private static final String ANNOTATION_OFFSET_TAG_NAME = "offset";
private static final String ANNOTATION_LENGH_TAG_NAME = "length";
private static final String USER_ID_TAG_NAME = "userId";
protected Map<String, Set<IITB_Annotation>> documentAnnotationsMap = new HashMap<String, Set<IITB_Annotation>>();
protected IITB_Annotation currentAnnotation;
protected StringBuilder buffer = new StringBuilder();
public Map<String, Set<IITB_Annotation>> getDocumentAnnotationsMap() {
return documentAnnotationsMap;
}
@Override
public void startDocument() throws SAXException {
currentAnnotation = null;
super.startDocument();
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
switch (qName) {
case DOCUMENT_FILE_NAME_TAG_NAME: // falls through
case WIKI_TITLE_TAG_NAME:
case ANNOTATION_OFFSET_TAG_NAME:
case ANNOTATION_LENGH_TAG_NAME: {
buffer.setLength(0);
break;
}
case ANNOTATION_TAG_NAME: {
currentAnnotation = new IITB_Annotation();
break;
}
case ANNOTATIONS_LIST_TAG_NAME: // falls through
case USER_ID_TAG_NAME: {
// nothing to do
break;
}
default: {
LOGGER.warn("Found an unknown XML tag name \"" + qName + "\". It will be ignored.");
break;
}
}
super.startElement(uri, localName, qName, attributes);
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
super.characters(ch, start, length);
if (currentAnnotation != null) {
buffer.append(ch, start, length);
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
switch (qName) {
case ANNOTATION_TAG_NAME: {
if (currentAnnotation.isComplete()) {
Set<IITB_Annotation> annotations;
if (documentAnnotationsMap.containsKey(currentAnnotation.documentName)) {
annotations = documentAnnotationsMap.get(currentAnnotation.documentName);
} else {
annotations = new HashSet<IITB_Annotation>();
documentAnnotationsMap.put(currentAnnotation.documentName, annotations);
}
annotations.add(currentAnnotation);
} else {
LOGGER.warn(
"Got an incomplete named entity " + currentAnnotation.toString() + ". It will be discarded.");
}
currentAnnotation = null;
break;
}
case DOCUMENT_FILE_NAME_TAG_NAME: {
if (currentAnnotation != null) {
currentAnnotation.documentName = buffer.toString().trim();
} else {
LOGGER.warn("Found a tag (\"" + DOCUMENT_FILE_NAME_TAG_NAME
+ "\") without an open annotation. It will be ignored.");
}
break;
}
case WIKI_TITLE_TAG_NAME: {
if (currentAnnotation != null) {
currentAnnotation.wikiTitle = buffer.toString().trim();
} else {
LOGGER.warn("Found a tag (\"" + WIKI_TITLE_TAG_NAME
+ "\") without an open annotation. It will be ignored.");
}
break;
}
case ANNOTATION_OFFSET_TAG_NAME: {
if (currentAnnotation != null) {
try {
currentAnnotation.offset = Integer.parseInt(buffer.toString().trim());
} catch (NumberFormatException e) {
LOGGER.error("Couldn't parse the offset of an annotation. buffer=\"" + buffer + "\"", e);
}
} else {
LOGGER.warn("Found a tag (\"" + ANNOTATION_OFFSET_TAG_NAME
+ "\") without an open annotation. It will be ignored.");
}
break;
}
case ANNOTATION_LENGH_TAG_NAME: {
if (currentAnnotation != null) {
try {
currentAnnotation.length = Integer.parseInt(buffer.toString().trim());
} catch (NumberFormatException e) {
LOGGER.error("Couldn't parse the length of an annotation. buffer=\"" + buffer + "\"", e);
}
} else {
LOGGER.warn("Found a tag (\"" + ANNOTATION_LENGH_TAG_NAME
+ "\") without an open annotation. It will be ignored.");
}
break;
}
default: {
// nothing to do
}
}
super.endElement(uri, localName, qName);
}
}