/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.ui.curation.component.model; import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.TypeUtil.getAdapter; import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getAddr; import static org.apache.uima.fit.util.CasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.uima.UIMAException; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.security.core.context.SecurityContextHolder; import de.tudarmstadt.ukp.clarin.webanno.api.AnnotationSchemaService; import de.tudarmstadt.ukp.clarin.webanno.api.CorrectionDocumentService; import de.tudarmstadt.ukp.clarin.webanno.api.DocumentService; import de.tudarmstadt.ukp.clarin.webanno.api.WebAnnoConst; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil; import de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2; import de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.Configuration; import de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.ConfigurationSet; import de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.DiffResult; import de.tudarmstadt.ukp.clarin.webanno.curation.casdiff.CasDiff2.LinkCompareBehavior; import de.tudarmstadt.ukp.clarin.webanno.curation.storage.CurationDocumentService; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocumentState; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer; import de.tudarmstadt.ukp.clarin.webanno.model.Mode; import de.tudarmstadt.ukp.clarin.webanno.model.Project; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument; import de.tudarmstadt.ukp.clarin.webanno.security.UserDao; import de.tudarmstadt.ukp.clarin.webanno.security.model.User; import de.tudarmstadt.ukp.clarin.webanno.ui.curation.util.MergeCas; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * This class is responsible for two things. Firstly, it creates a pre-merged cas, which contains * all annotations, where all annotators agree on. This is done by copying a random cas and removing * all differing annotations. * * Secondly, the class creates an instance of {@link CurationContainer}, which is the wicket model * for the curation panel. The {@link CurationContainer} contains the text for all sentences, which * are displayed at a specific page. */ public class SuggestionBuilder { private final Logger log = LoggerFactory.getLogger(getClass()); private final AnnotationSchemaService annotationService; private final DocumentService documentService; private final CorrectionDocumentService correctionDocumentService; private final CurationDocumentService curationDocumentService; private final UserDao userRepository; int diffRangeBegin, diffRangeEnd; boolean firstload = true; public static Map<Integer, Set<Integer>> crossSentenceLists; // Map<Integer, Integer> segmentBeginEnd = new HashMap<Integer, Integer>(); public SuggestionBuilder(DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, UserDao aUserDao) { documentService = aDocumentService; correctionDocumentService = aCorrectionDocumentService; curationDocumentService = aCurationDocumentService; annotationService = aAnnotationService; userRepository = aUserDao; } public CurationContainer buildCurationContainer(AnnotatorState aBModel) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { CurationContainer curationContainer = new CurationContainer(); // initialize Variables SourceDocument sourceDocument = aBModel.getDocument(); Map<Integer, Integer> segmentBeginEnd = new HashMap<Integer, Integer>(); Map<Integer, Integer> segmentNumber = new HashMap<Integer, Integer>(); Map<String, Map<Integer, Integer>> segmentAdress = new HashMap<String, Map<Integer, Integer>>(); // get annotation documents List<AnnotationDocument> finishedAnnotationDocuments = new ArrayList<AnnotationDocument>(); for (AnnotationDocument annotationDocument : documentService.listAnnotationDocuments(aBModel .getDocument())) { if (annotationDocument.getState().equals(AnnotationDocumentState.FINISHED)) { finishedAnnotationDocuments.add(annotationDocument); } } Map<String, JCas> jCases = new HashMap<String, JCas>(); AnnotationDocument randomAnnotationDocument = null; JCas mergeJCas; // get the correction/automation JCas for the logged in user if (aBModel.getMode().equals(Mode.AUTOMATION) || aBModel.getMode().equals(Mode.CORRECTION)) { jCases = listJcasesforCorrection(randomAnnotationDocument, sourceDocument, aBModel.getMode()); mergeJCas = getMergeCas(aBModel, sourceDocument, jCases, randomAnnotationDocument); String username = jCases.keySet().iterator().next(); updateSegment(aBModel, segmentBeginEnd, segmentNumber, segmentAdress, jCases.get(username), username, aBModel.getWindowBeginOffset(), aBModel.getWindowEndOffset()); } else { jCases = listJcasesforCuration(finishedAnnotationDocuments, randomAnnotationDocument, aBModel.getMode()); mergeJCas = getMergeCas(aBModel, sourceDocument, jCases, randomAnnotationDocument); updateSegment(aBModel, segmentBeginEnd, segmentNumber, segmentAdress, mergeJCas, WebAnnoConst.CURATION_USER, WebAnnoCasUtil.getFirstSentence(mergeJCas).getBegin(), mergeJCas.getDocumentText().length()); } List<Type> entryTypes = null; segmentAdress.put(WebAnnoConst.CURATION_USER, new HashMap<Integer, Integer>()); for (Sentence sentence : selectCovered(mergeJCas, Sentence.class, diffRangeBegin, diffRangeEnd)) { segmentAdress.get(WebAnnoConst.CURATION_USER).put(sentence.getBegin(), getAddr(sentence)); } if (entryTypes == null) { entryTypes = getEntryTypes(mergeJCas, aBModel.getAnnotationLayers(), annotationService); } // for cross-sentences annotation, update the end of the segment if (firstload) { long start = System.currentTimeMillis(); log.debug("Updating cross sentence annotation list..."); updateCrossSentAnnoList(segmentBeginEnd, segmentNumber, jCases, entryTypes); firstload = false; log.debug("Cross sentence annotation list complete in {}ms", (System.currentTimeMillis() - start)); } long diffStart = System.currentTimeMillis(); log.debug("Calculating differences..."); int count = 0; for (Integer begin : segmentBeginEnd.keySet()) { Integer end = segmentBeginEnd.get(begin); count ++; if (count % 100 == 0) { log.debug("Processing differences: {} of {} sentences...", count, segmentBeginEnd.size()); } DiffResult diff = CasDiff2.doDiffSingle(annotationService, aBModel.getProject(), entryTypes, LinkCompareBehavior.LINK_ROLE_AS_LABEL, jCases, begin, end); SourceListView curationSegment = new SourceListView(); curationSegment.setBegin(begin); curationSegment.setEnd(end); curationSegment.setSentenceNumber(segmentNumber.get(begin)); if (diff.hasDifferences() || !diff.getIncompleteConfigurationSets().isEmpty()) { // Is this confSet a diff due to stacked annotations (with same configuration)? boolean stackedDiff = false; stackedDiffSet: for (ConfigurationSet d : diff.getDifferingConfigurationSets() .values()) { for (Configuration c : d.getConfigurations()) { if (c.getCasGroupIds().size() != d.getCasGroupIds().size()) { stackedDiff = true; break stackedDiffSet; } } } if (stackedDiff) { curationSegment.setSentenceState(SentenceState.DISAGREE); } else if (!diff.getIncompleteConfigurationSets().isEmpty()) { curationSegment.setSentenceState(SentenceState.DISAGREE); } else { curationSegment.setSentenceState(SentenceState.AGREE); } } else { curationSegment.setSentenceState(SentenceState.AGREE); } for (String username : segmentAdress.keySet()) { curationSegment.getSentenceAddress().put(username, segmentAdress.get(username).get(begin)); } curationContainer.getCurationViewByBegin().put(begin, curationSegment); } log.debug("Difference calculation completed in {}ms", (System.currentTimeMillis() - diffStart)); return curationContainer; } private void updateCrossSentAnnoList(Map<Integer, Integer> aSegmentBeginEnd, Map<Integer, Integer> aSegmentNumber, Map<String, JCas> aJCases, List<Type> aEntryTypes) { // FIXME Remove this side-effect and instead return this hashmap crossSentenceLists = new HashMap<>(); // Extract the sentences for all the CASes Map<JCas, List<Sentence>> idxSentences = new HashMap<>(); for (JCas c : aJCases.values()) { idxSentences.put(c, new ArrayList<>(select(c, Sentence.class))); } Set<Integer> sentenceBegins = aSegmentBeginEnd.keySet(); int count = 0; for (int sentBegin : sentenceBegins) { count ++; if (count % 100 == 0) { log.debug("Updating cross-sentence annoations: {} of {} sentences...", count, sentenceBegins.size()); } int sentEnd = aSegmentBeginEnd.get(sentBegin); int currentSentenceNumber = -1; Set<Integer> crossSents = new HashSet<>(); for (Type t : aEntryTypes) { for (JCas c : aJCases.values()) { // Determine sentence number for the current segment begin. This takes quite // a while, so we only do it for the first CAS in the batch. Will be the // same for all others anyway. if (currentSentenceNumber == -1) { currentSentenceNumber = aSegmentNumber.get(sentBegin); } // update cross-sentence annotation lists for (AnnotationFS fs : selectCovered(c.getCas(), t, diffRangeBegin, diffRangeEnd)) { // CASE 1. annotation begins here if (sentBegin <= fs.getBegin() && fs.getBegin() <= sentEnd) { if (fs.getEnd() < sentBegin || sentEnd < fs.getEnd()) { Sentence s = getSentenceByAnnoEnd(idxSentences.get(c), fs.getEnd()); int thatSent = idxSentences.get(c).indexOf(s) + 1; crossSents.add(thatSent); } } // CASE 2. Annotation ends here else if (sentBegin <= fs.getEnd() && fs.getEnd() <= sentEnd) { if (fs.getBegin() < sentBegin || sentEnd < fs.getBegin()) { int thatSent = WebAnnoCasUtil.getSentenceNumber(c, fs.getBegin()); crossSents.add(thatSent); } } } for (AnnotationFS fs : selectCovered(c.getCas(), t, sentBegin, diffRangeEnd)) { if (fs.getBegin() <= sentEnd && fs.getEnd() > sentEnd) { Sentence s = getSentenceByAnnoEnd(idxSentences.get(c), fs.getEnd()); aSegmentBeginEnd.put(sentBegin, s.getEnd()); } } } } crossSentenceLists.put(currentSentenceNumber, crossSents); } } /** * Get a sentence at the end of an annotation */ private static Sentence getSentenceByAnnoEnd(List<Sentence> aSentences, int aEnd) { int prevEnd = 0; Sentence sent = null; for (Sentence sentence : aSentences) { if (prevEnd >= aEnd) { return sent; } sent = sentence; prevEnd = sent.getEnd(); } return sent; } private Map<String, JCas> listJcasesforCorrection(AnnotationDocument randomAnnotationDocument, SourceDocument aDocument, Mode aMode) throws UIMAException, ClassNotFoundException, IOException { Map<String, JCas> jCases = new HashMap<String, JCas>(); User user = userRepository.get(SecurityContextHolder.getContext().getAuthentication() .getName()); randomAnnotationDocument = documentService.getAnnotationDocument(aDocument, user); // Upgrading should be an explicit action during the opening of a document at the end // of the open dialog - it must not happen during editing because the CAS addresses // are used as IDs in the UI // repository.upgradeCasAndSave(aDocument, aMode, user.getUsername()); JCas jCas = documentService.readAnnotationCas(randomAnnotationDocument); jCases.put(user.getUsername(), jCas); return jCases; } public Map<String, JCas> listJcasesforCuration(List<AnnotationDocument> annotationDocuments, AnnotationDocument randomAnnotationDocument, Mode aMode) throws UIMAException, ClassNotFoundException, IOException { Map<String, JCas> jCases = new HashMap<String, JCas>(); for (AnnotationDocument annotationDocument : annotationDocuments) { String username = annotationDocument.getUser(); if (!annotationDocument.getState().equals(AnnotationDocumentState.FINISHED)) { continue; } if (randomAnnotationDocument == null) { randomAnnotationDocument = annotationDocument; } // Upgrading should be an explicit action during the opening of a document at the end // of the open dialog - it must not happen during editing because the CAS addresses // are used as IDs in the UI // repository.upgradeCasAndSave(annotationDocument.getDocument(), aMode, username); JCas jCas = documentService.readAnnotationCas(annotationDocument); jCases.put(username, jCas); } return jCases; } /** * Fetches the CAS that the user will be able to edit. In AUTOMATION/CORRECTION mode, this is * the CAS for the CORRECTION_USER and in CURATION mode it is the CAS for the CURATION user. * * @param aBratAnnotatorModel * the model. * @param aDocument * the source document. * @param jCases * the JCases. * @param randomAnnotationDocument * an annotation document. * @return the JCas. * @throws UIMAException * hum? * @throws ClassNotFoundException * hum? * @throws IOException * if an I/O error occurs. * @throws AnnotationException * hum? */ public JCas getMergeCas(AnnotatorState aBratAnnotatorModel, SourceDocument aDocument, Map<String, JCas> jCases, AnnotationDocument randomAnnotationDocument) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { JCas mergeJCas = null; try { if (aBratAnnotatorModel.getMode().equals(Mode.AUTOMATION) || aBratAnnotatorModel.getMode().equals(Mode.CORRECTION)) { // Upgrading should be an explicit action during the opening of a document at the // end // of the open dialog - it must not happen during editing because the CAS addresses // are used as IDs in the UI // repository.upgradeCasAndSave(aDocument, aBratAnnotatorModel.getMode(), // aBratAnnotatorModel.getUser().getUsername()); mergeJCas = correctionDocumentService.readCorrectionCas(aDocument); } else { // Upgrading should be an explicit action during the opening of a document at the // end // of the open dialog - it must not happen during editing because the CAS addresses // are used as IDs in the UI // repository.upgradeCasAndSave(aDocument, aBratAnnotatorModel.getMode(), // aBratAnnotatorModel.getUser().getUsername()); mergeJCas = curationDocumentService.readCurationCas(aDocument); } } // Create jcas, if it could not be loaded from the file system catch (Exception e) { if (aBratAnnotatorModel.getMode().equals(Mode.AUTOMATION) || aBratAnnotatorModel.getMode().equals(Mode.CORRECTION)) { mergeJCas = createCorrectionCas(mergeJCas, aBratAnnotatorModel, randomAnnotationDocument); } else { mergeJCas = createCurationCas(aBratAnnotatorModel.getProject(), randomAnnotationDocument, jCases, aBratAnnotatorModel.getAnnotationLayers()); } } return mergeJCas; } /** * Puts JCases into a list and get a random annotation document that will be used as a base for * the diff. */ private void updateSegment(AnnotatorState aBratAnnotatorModel, Map<Integer, Integer> aIdxSentenceBeginEnd, Map<Integer, Integer> aIdxSentenceBeginNumber, Map<String, Map<Integer, Integer>> aSegmentAdress, JCas aJCas, String aUsername, int aWindowStart, int aWindowEnd) { diffRangeBegin = aWindowStart; diffRangeEnd = aWindowEnd; // Get the number of the first sentence - instead of fetching the number over and over // we can just increment this one. int sentenceNumber = WebAnnoCasUtil.getSentenceNumber(aJCas, diffRangeBegin); aSegmentAdress.put(aUsername, new HashMap<Integer, Integer>()); for (Sentence sentence : selectCovered(aJCas, Sentence.class, diffRangeBegin, diffRangeEnd)) { aIdxSentenceBeginEnd.put(sentence.getBegin(), sentence.getEnd()); aIdxSentenceBeginNumber.put(sentence.getBegin(), sentenceNumber); aSegmentAdress.get(aUsername).put(sentence.getBegin(), getAddr(sentence)); sentenceNumber += 1; } } public static List<Type> getEntryTypes(JCas mergeJCas, List<AnnotationLayer> aLayers, AnnotationSchemaService aAnnotationService) { List<Type> entryTypes = new LinkedList<Type>(); for (AnnotationLayer layer : aLayers) { if (layer.getName().equals(Token.class.getName())) { continue; } if (layer.getType().equals(WebAnnoConst.CHAIN_TYPE)) { continue; } entryTypes.add(getAdapter(aAnnotationService, layer).getAnnotationType( mergeJCas.getCas())); } return entryTypes; } /** * For the first time a curation page is opened, create a MergeCas that contains only agreeing * annotations Using the CAS of the curator user. * * @param aProject * the project * @param randomAnnotationDocument * an annotation document. * @param jCases * the JCases * @param aAnnotationLayers * the layers. * @return the JCas. * @throws IOException * if an I/O error occurs. */ public JCas createCurationCas(Project aProject, AnnotationDocument randomAnnotationDocument, Map<String, JCas> jCases, List<AnnotationLayer> aAnnotationLayers) throws IOException, UIMAException { User userLoggedIn = userRepository .get(SecurityContextHolder.getContext().getAuthentication().getName()); JCas mergeJCas = documentService.readAnnotationCas(randomAnnotationDocument); jCases.put(WebAnnoConst.CURATION_USER, mergeJCas); List<Type> entryTypes = getEntryTypes(mergeJCas, aAnnotationLayers, annotationService); DiffResult diff = CasDiff2.doDiffSingle(annotationService, aProject, entryTypes, LinkCompareBehavior.LINK_ROLE_AS_LABEL, jCases, 0, mergeJCas.getDocumentText().length()); mergeJCas = MergeCas.geMergeCas(diff, jCases); curationDocumentService.writeCurationCas(mergeJCas, randomAnnotationDocument.getDocument(), userLoggedIn, false); return mergeJCas; } private JCas createCorrectionCas(JCas mergeJCas, AnnotatorState aBratAnnotatorModel, AnnotationDocument randomAnnotationDocument) throws UIMAException, ClassNotFoundException, IOException { User userLoggedIn = userRepository.get(SecurityContextHolder.getContext() .getAuthentication().getName()); mergeJCas = documentService.readAnnotationCas(aBratAnnotatorModel.getDocument(), userLoggedIn); correctionDocumentService.writeCorrectionCas(mergeJCas, randomAnnotationDocument.getDocument(), userLoggedIn); return mergeJCas; } }