/** * */ package de.berlin.hu.uima.cr.ddi.parser; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.u_compare.shared.semantic.NamedEntity; import org.uimafit.util.JCasUtil; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import de.berlin.hu.types.PubmedDocument; import de.berlin.hu.util.Constants; import sprint.uima.types.*; import java.util.ArrayList; import java.util.Iterator; /** * @author Tim Rocktäschel * */ //FIXME: probably not working anymore public class DDICorpusContentHandlerImpl implements ContentHandler { // tags private static final String CORPUS_TAG = "corpus"; // not needed private static final String DOCUMENT_TAG = "document"; private static final String SENTENCE_TAG = "sentence"; private static final String SENTENCEANALYSES_TAG = "sentenceanalyses"; private static final String TOKENIZATIONS_TAG = "tokenizations"; private static final String TOKENIZATION_TAG = "tokenization"; // single tags private static final String ENTITY_TAG = "entity"; private static final String PAIR_TAG = "pair"; private static final String TOKEN_TAG = "token"; // attributes private static final String CHAR_OFFSET_ATTR = "charOffset"; private static final String ID_ATTR = "id"; private static final String TEXT_ATTR = "text"; private static final String ENTITY_TYPE_ATTR = "type"; private static final String ENTITY_1_ATTR = "e1"; private static final String ENTITY_2_ATTR = "e2"; private static final String INTERACTION_ATTR = "interaction"; private static final String TOKENIZER_ATTR = "tokenizer"; private static final String POS_ATTR = "POS"; // TODO: add annotation // parses // parse // dependency private enum ElementType {Ignore, Corpus, Document, Sentence, Sentenceanalyses, Tokenizations, Tokenization}; private ElementType currentElementType; private JCas jcas; private StringBuffer documentTextStringBuffer; private int sentenceOffset; private int nextSentenceOffset; private String documentId; private Sentence currentSentence; private ArrayList<Pair> currentPairs; private boolean firstEntitySeen; public DDICorpusContentHandlerImpl(JCas jcas) { this.jcas = jcas; } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) */ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { // is not needed if (CORPUS_TAG.equalsIgnoreCase(qName)) { currentElementType = ElementType.Corpus; } else if (DOCUMENT_TAG.equalsIgnoreCase(qName)) { createDocumentAnnotation(atts); firstEntitySeen = false; } else if (SENTENCE_TAG.equalsIgnoreCase(qName)) { createSentenceAnnotation(atts); } else if (ENTITY_TAG.equalsIgnoreCase(qName)) { createEntityAnnotation(atts); firstEntitySeen = true; } else if (PAIR_TAG.equalsIgnoreCase(qName)) { // createPairAnnotation(atts); } else if (SENTENCEANALYSES_TAG.equalsIgnoreCase(qName)) { currentElementType = ElementType.Sentenceanalyses; } else if (TOKENIZATIONS_TAG.equalsIgnoreCase(qName)) { currentElementType = ElementType.Tokenizations; } else if (TOKENIZATION_TAG.equalsIgnoreCase(qName)) { String tokenizerName = atts.getValue(TOKENIZER_ATTR); // TODO make switch between Charniak-Lease and split possible if (tokenizerName.equals("Charniak-Lease")) { // if (tokenizerName.equals("split")) { currentElementType = ElementType.Tokenization; } } else if (TOKEN_TAG.equalsIgnoreCase(qName)) { if (currentElementType.equals(ElementType.Tokenization)) { createTokenAnnotation(atts); } } else { currentElementType = ElementType.Ignore; } } private void createDocumentAnnotation(Attributes atts) { currentElementType = ElementType.Document; documentTextStringBuffer = new StringBuffer(); sentenceOffset = 0; nextSentenceOffset = 0; documentId = atts.getValue(ID_ATTR); } private void createSentenceAnnotation(Attributes atts) { currentElementType = ElementType.Sentence; String sentenceText = atts.getValue(TEXT_ATTR) + " "; documentTextStringBuffer.append(sentenceText); sentenceOffset = nextSentenceOffset; nextSentenceOffset += sentenceText.length(); Sentence sentence = new Sentence(jcas, sentenceOffset, nextSentenceOffset - 1); sentence.setID(atts.getValue(ID_ATTR)); sentence.addToIndexes(); currentSentence = sentence; currentPairs = new ArrayList<Pair>(); } private void createEntityAnnotation(Attributes atts) { String charOffset = atts.getValue(CHAR_OFFSET_ATTR); String[] boundaries = atts.getValue(CHAR_OFFSET_ATTR).split("-"); // e.g. charOffset="0-2,5-5" gets to 0-5 int begin = sentenceOffset + Integer.valueOf(boundaries[0]); int end = sentenceOffset + Integer.valueOf(boundaries[boundaries.length-1]); if (!firstEntitySeen) { end = end + 1; } Entity entity = new Entity(jcas, begin, end); entity.setID(atts.getValue(ID_ATTR)); entity.setEntityType(atts.getValue(ENTITY_TYPE_ATTR)); entity.setCharOffset(charOffset); entity.addToIndexes(); NamedEntity namedEntity = new NamedEntity(jcas, begin, end); namedEntity.setId(atts.getValue(ID_ATTR)); namedEntity.setEntityType(atts.getValue(ENTITY_TYPE_ATTR)); namedEntity.setSource(Constants.GOLDSTANDARD); namedEntity.addToIndexes(); } private void createPairAnnotation(Attributes atts) { Pair pair = new Pair(jcas); pair.setID(atts.getValue(ID_ATTR)); pair.setInteraction(Boolean.parseBoolean(atts.getValue(INTERACTION_ATTR))); String entity1ID = atts.getValue(ENTITY_1_ATTR); String entity2ID = atts.getValue(ENTITY_2_ATTR); // TODO: use JCasUtil.iterator instead? Iterator<Entity> entityIterator = JCasUtil.iterate(jcas, Entity.class, currentSentence).iterator(); Entity entity1 = null; Entity entity2 = null; while (entityIterator.hasNext()) { Entity currentEntity = entityIterator.next(); if (entity1ID.equals(currentEntity.getID())) { entity1 = currentEntity; } else if (entity2ID.equals(currentEntity.getID())) { entity2 = currentEntity; } } pair.setEntity1(entity1); pair.setEntity2(entity2); if (entity1.getBegin() < entity2.getBegin()) { pair.setBegin(entity1.getBegin()); pair.setEnd(entity2.getEnd()); } else { pair.setBegin(entity2.getBegin()); pair.setEnd(entity1.getEnd()); } pair.addToIndexes(); currentPairs.add(pair); } private void createTokenAnnotation(Attributes atts) { String[] boundaries = atts.getValue(CHAR_OFFSET_ATTR).split("-"); int begin = sentenceOffset + Integer.valueOf(boundaries[0]); int end = sentenceOffset + Integer.valueOf(boundaries[1]) + 1; Token entity = new Token(jcas, begin, end); entity.setID(atts.getValue(ID_ATTR)); entity.setPOS(atts.getValue(POS_ATTR)); entity.addToIndexes(); } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String) */ public void endElement(String uri, String localName, String qName) throws SAXException { if (CORPUS_TAG.equalsIgnoreCase(qName)) { //TODO } else if (DOCUMENT_TAG.equalsIgnoreCase(qName)) { jcas.setDocumentText(documentTextStringBuffer.toString()); CorpusDocument corpusDocument = new CorpusDocument(jcas, 0, documentTextStringBuffer.length()); corpusDocument.setID(documentId); corpusDocument.addToIndexes(); PubmedDocument pubmedDocument = new PubmedDocument(jcas, 0, documentTextStringBuffer.length()); pubmedDocument.setPmid(documentId); pubmedDocument.addToIndexes(); } else if (SENTENCE_TAG.equalsIgnoreCase(qName)) { FSArray pairs = new FSArray(jcas, currentPairs.size()); for (int j = 0; j < pairs.size(); j++) { pairs.set(j, currentPairs.get(j)); } currentSentence.setPairs(pairs); } else if (SENTENCEANALYSES_TAG.equalsIgnoreCase(qName)) { //TODO } else if (TOKENIZATIONS_TAG.equalsIgnoreCase(qName)) { //TODO } else if (TOKENIZATION_TAG.equalsIgnoreCase(qName)) { currentElementType = ElementType.Sentenceanalyses; } } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator) */ public void setDocumentLocator(Locator locator) { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#startDocument() */ public void startDocument() throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#endDocument() */ public void endDocument() throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String) */ public void startPrefixMapping(String prefix, String uri) throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String) */ public void endPrefixMapping(String prefix) throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#characters(char[], int, int) */ public void characters(char[] ch, int start, int length) throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int) */ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String) */ public void processingInstruction(String target, String data) throws SAXException { } /* (non-Javadoc) * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String) */ public void skippedEntity(String name) throws SAXException { } }