package org.aksw.gerbil.dataset.impl.senseval;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that builds one {@link Document} per {@code sentence} element.
 *
 * <p>
 * The text of every {@code wf} and {@code instance} element is appended to the
 * text of the current sentence. Tokens are separated by a single space, except
 * that a token consisting of exactly one of the characters {@code , . ; : ! ?}
 * is attached directly to the preceding text. Every {@code instance} element
 * additionally yields a {@link NamedEntity} marking that covers its token
 * inside the sentence text. Element names are compared case-insensitively.
 * </p>
 *
 * <p>
 * Token text is buffered until the closing tag is seen, because a SAX parser
 * is allowed to deliver the character data of one element in several
 * {@link #characters(char[], int, int)} calls (parsers commonly split around
 * entity references such as {@code &amp;}).
 * </p>
 *
 * <p>
 * NOTE(review): {@code wf} and {@code instance} elements are assumed not to be
 * nested inside each other; opening a token element discards any text buffered
 * for an enclosing one - confirm against the input format.
 * </p>
 */
public class SensevalSAXHandler extends DefaultHandler {

    public static final String SENTENCE_ELEMENT = "sentence";
    public static final String INSTANCE_ELEMENT = "instance";
    private static final String WF_ELEMENT = "wf";

    /** Single-character tokens that are appended without a leading space. */
    private static final String ATTACHED_PUNCTUATION = ",.;:!?";

    /** Receives the generated documents; supplied by the caller. */
    private final List<Document> documents;
    /** Text of the sentence that is currently being assembled. */
    private StringBuilder sentence = new StringBuilder();
    /** Markings collected for the sentence that is currently being assembled. */
    private List<Marking> markings = new ArrayList<Marking>();
    /** Character data collected for the currently open wf/instance element. */
    private final StringBuilder token = new StringBuilder();
    /** True while a wf/instance element is open and its text must be collected. */
    private boolean insideToken = false;
    /** Number of sentences finished so far; used as suffix of the document URI. */
    private int i = 0;
    /**
     * URI attached to the marking of the current instance element.
     * NOTE(review): this is always the empty string because no attribute of the
     * instance element is read - confirm whether that is intended.
     */
    private String instanceUri;

    /**
     * Creates a handler that adds the parsed documents to the given list.
     *
     * @param documents
     *            the list to which one document per sentence element is added
     */
    public SensevalSAXHandler(List<Document> documents) {
        this.documents = documents;
    }

    /**
     * Starts a new marking list for a sentence element, or starts collecting
     * token text for a wf/instance element. Other elements are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) {
            insideToken = false;
            markings = new ArrayList<Marking>();
        } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) {
            instanceUri = "";
            beginToken();
        } else if (qName.equalsIgnoreCase(WF_ELEMENT)) {
            beginToken();
        }
    }

    /**
     * Finishes the element that is being closed: a sentence becomes a document,
     * a wf/instance token is appended to the sentence text, and an instance
     * additionally gets a marking.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equalsIgnoreCase(SENTENCE_ELEMENT)) {
            i++;
            documents.add(new DocumentImpl(sentence.toString(), "http://senseval" + i, markings));
            // A fresh buffer makes the offsets of the next sentence start at 0 again.
            sentence = new StringBuilder();
        } else if (qName.equalsIgnoreCase(INSTANCE_ELEMENT)) {
            String word = takeToken();
            int wordStart = addWordToSentence(word);
            markings.add(new NamedEntity(wordStart, word.length(), instanceUri));
        } else if (qName.equalsIgnoreCase(WF_ELEMENT)) {
            addWordToSentence(takeToken());
        }
        // Text that follows a closing tag (e.g. whitespace between elements) is ignored.
        insideToken = false;
    }

    /**
     * Collects the character data of the currently open wf/instance element.
     * May be called several times for the same element; all chunks are joined.
     * Character data outside of these elements is ignored.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (insideToken) {
            token.append(ch, start, length);
        }
    }

    /**
     * Returns the list that was handed to the constructor.
     *
     * @return the list containing the documents created so far
     */
    public List<Document> getDocuments() {
        return documents;
    }

    /** Clears the token buffer and enables collecting of character data. */
    private void beginToken() {
        token.setLength(0);
        insideToken = true;
    }

    /** Returns the buffered token text and empties the buffer. */
    private String takeToken() {
        String word = token.toString();
        token.setLength(0);
        return word;
    }

    /**
     * Appends the given word to the current sentence text. A separating space
     * is inserted unless the sentence is still empty or the word is a single
     * attached punctuation character. An empty word leaves the sentence
     * untouched.
     *
     * @param word
     *            the token text to append
     * @return the offset inside the sentence text at which the word starts
     */
    private int addWordToSentence(String word) {
        if (word.length() == 0) {
            return sentence.length();
        }
        if (sentence.length() > 0 && !isAttachedPunctuation(word)) {
            sentence.append(' ');
        }
        int wordStart = sentence.length();
        sentence.append(word);
        return wordStart;
    }

    /** Checks whether the word is exactly one of the characters , . ; : ! ? */
    private static boolean isAttachedPunctuation(String word) {
        return word.length() == 1 && ATTACHED_PUNCTUATION.indexOf(word.charAt(0)) >= 0;
    }
}