/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.ui.automation.util; import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.TypeUtil.getAdapter; import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getAddr; import static org.apache.uima.fit.util.CasUtil.getType; import static org.apache.uima.fit.util.CasUtil.select; import static org.apache.uima.fit.util.CasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import javax.persistence.NoResultException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.uima.UIMAException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.SofaFS; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.dao.DataRetrievalFailureException; import org.springframework.security.core.context.SecurityContextHolder; import de.tudarmstadt.ukp.clarin.webanno.api.AnnotationSchemaService; import de.tudarmstadt.ukp.clarin.webanno.api.CorrectionDocumentService; import de.tudarmstadt.ukp.clarin.webanno.api.DocumentService; import de.tudarmstadt.ukp.clarin.webanno.api.WebAnnoConst; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.ArcAdapter; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.SpanAdapter; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.TypeAdapter; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.TypeUtil; import de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil; import de.tudarmstadt.ukp.clarin.webanno.automation.model.AutomationStatus; import de.tudarmstadt.ukp.clarin.webanno.automation.model.MiraTemplate; import de.tudarmstadt.ukp.clarin.webanno.automation.service.AutomationService; import de.tudarmstadt.ukp.clarin.webanno.curation.storage.CurationDocumentService; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocumentState; import de.tudarmstadt.ukp.clarin.webanno.security.UserDao; import de.tudarmstadt.ukp.clarin.webanno.security.model.User; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import edu.lium.mira.Mira; /** * A utility class for the automation modules * * */ public class AutomationUtil { private static Logger LOG = LoggerFactory.getLogger(AutomationUtil.class); private static final String NILL = "__nill__"; public static void repeateSpanAnnotation(AnnotatorState aBModel, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, int aStart, int aEnd, AnnotationFeature aFeature, String aValue) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { AnnotationDocument annoDoc = aDocumentService.getAnnotationDocument(aBModel.getDocument(), aBModel.getUser()); JCas annoCas = aDocumentService.readAnnotationCas(annoDoc); // get selected text, concatenations of tokens String selectedText = WebAnnoCasUtil.getSelectedText(annoCas, aStart, aEnd); SpanAdapter adapter = (SpanAdapter) getAdapter(aAnnotationService, aFeature.getLayer()); for (SourceDocument d : aDocumentService.listSourceDocuments(aBModel.getProject())) { if (d.isTrainingDocument()) { continue; } loadDocument(d, aDocumentService, aCorrectionDocumentService, aBModel.getUser()); JCas jCas = aCorrectionDocumentService.readCorrectionCas(d); for (Sentence sentence : select(jCas, Sentence.class)) { String sentenceText = sentence.getCoveredText().toLowerCase(); for (int i = -1; (i = sentenceText.indexOf(selectedText.toLowerCase(), i)) != -1; i = i + selectedText.length()) { if (selectCovered(jCas, Token.class, sentence.getBegin() + i, sentence.getBegin() + i + selectedText.length()).size() > 0) { adapter.add(jCas, sentence.getBegin() + i, sentence.getBegin() + i + selectedText.length() - 1, aFeature, aValue); } } } aCorrectionDocumentService.writeCorrectionCas(jCas, d, aBModel.getUser()); } } public static void repeateRelationAnnotation(AnnotatorState aBModel, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AnnotationFS fs, AnnotationFeature aFeature, String aValue) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { for (SourceDocument d : aDocumentService.listSourceDocuments(aBModel.getProject())) { if (d.isTrainingDocument()) { continue; } loadDocument(d, aDocumentService, aCorrectionDocumentService, aBModel.getUser()); JCas jCas = aCorrectionDocumentService.readCorrectionCas(d); ArcAdapter adapter = (ArcAdapter) getAdapter(aAnnotationService, aFeature.getLayer()); String sourceFName = adapter.getSourceFeatureName(); String targetFName = adapter.getTargetFeatureName(); Type type = getType(jCas.getCas(), aFeature.getLayer().getName()); Type spanType = getType(jCas.getCas(), adapter.getAttachTypeName()); Feature arcSpanFeature = spanType.getFeatureByBaseName(adapter.getAttachFeatureName()); Feature dependentFeature = type.getFeatureByBaseName(targetFName); Feature governorFeature = type.getFeatureByBaseName(sourceFName); AnnotationFS dependentFs = null; AnnotationFS governorFs = null; if (adapter.getAttachFeatureName() != null) { dependentFs = (AnnotationFS) fs.getFeatureValue(dependentFeature) .getFeatureValue(arcSpanFeature); governorFs = (AnnotationFS) fs.getFeatureValue(governorFeature) .getFeatureValue(arcSpanFeature); } else { dependentFs = (AnnotationFS) fs.getFeatureValue(dependentFeature); governorFs = (AnnotationFS) fs.getFeatureValue(governorFeature); } if (adapter.isCrossMultipleSentence()) { List<AnnotationFS> mSpanAnnos = new ArrayList<>( getAllAnnoFss(jCas, governorFs.getType())); repeatRelation(0, jCas.getDocumentText().length()-1, aFeature, aValue, jCas, adapter, dependentFs, governorFs, mSpanAnnos); } else { for (Sentence sent : select(jCas, Sentence.class)) { List<AnnotationFS> spanAnnos = selectCovered(governorFs.getType(), sent); repeatRelation(sent.getBegin(), sent.getEnd(), aFeature, aValue, jCas, adapter, dependentFs, governorFs, spanAnnos); } } aCorrectionDocumentService.writeCorrectionCas(jCas, d, aBModel.getUser()); } } private static void repeatRelation(int aStart, int aEnd, AnnotationFeature aFeature, String aValue, JCas jCas, ArcAdapter adapter, AnnotationFS aDepFS, AnnotationFS aGovFS, List<AnnotationFS> aSpanAnnos) throws AnnotationException { String dCoveredText = aDepFS.getCoveredText(); String gCoveredText = aGovFS.getCoveredText(); AnnotationFS d = null, g = null; Type attachSpanType = aDepFS.getType(); for (AnnotationFS fs : aSpanAnnos) { if (dCoveredText.equals(fs.getCoveredText())) { if (g != null && isSamAnno(attachSpanType, fs, aDepFS)) { adapter.add(g, fs, jCas, aStart, aEnd, aFeature, aValue); g = null; d = null; continue;// so we don't go to the other if } else if (d == null && isSamAnno(attachSpanType, fs, aDepFS)) { d = fs; continue; // so we don't go to the other if } } // we don't use else, in case gov and dep are the same if (gCoveredText.equals(fs.getCoveredText()) ) { if (d != null && isSamAnno(attachSpanType, fs, aGovFS)) { adapter.add(fs, d, jCas, aStart, aEnd, aFeature, aValue); g = null; d = null; } else if (g == null && isSamAnno(attachSpanType, fs, aGovFS)) { g = fs; } } } } private static Collection<AnnotationFS> getAllAnnoFss(JCas aJcas, Type aType) { Collection<AnnotationFS> spanAnnos = select(aJcas.getCas(), aType); Collections.sort(new ArrayList<AnnotationFS>(spanAnnos), new Comparator<AnnotationFS>() { @Override public int compare(AnnotationFS arg0, AnnotationFS arg1) { return arg0.getBegin() - arg1.getBegin(); } }); return spanAnnos; } private static boolean isSamAnno(Type aType, AnnotationFS aMFs, AnnotationFS aFs) { for (Feature f : aType.getFeatures()) { // anywhere is ok if (f.getName().equals(CAS.FEATURE_FULL_NAME_BEGIN)) { continue; } // anywhere is ok if (f.getName().equals(CAS.FEATURE_FULL_NAME_END)) { continue; } if (!f.getRange().isPrimitive() && aMFs.getFeatureValue(f) instanceof SofaFS) { continue; } // do not attach relation on empty span annotations if (aMFs.getFeatureValueAsString(f) == null){ continue; } if (aFs.getFeatureValueAsString(f) == null){ continue; } if (!aMFs.getFeatureValueAsString(f).equals(aFs.getFeatureValueAsString(f))) { return false; } } return true; } /** * Repeat annotation will repeat annotations of same pattern to all documents on the project * load CAS from document in case no initial CORRECTION_CAS is not created before */ public static void loadDocument(SourceDocument aDocument, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, User logedInUser) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { JCas jCas = null; if (!aCorrectionDocumentService.existsCorrectionCas(aDocument)) { try { AnnotationDocument logedInUserAnnotationDocument = aDocumentService .getAnnotationDocument(aDocument, logedInUser); jCas = aDocumentService.readAnnotationCas(logedInUserAnnotationDocument); aDocumentService.upgradeCas(jCas.getCas(), logedInUserAnnotationDocument); aCorrectionDocumentService.writeCorrectionCas(jCas, aDocument, logedInUser); } catch (IOException e) { throw e; } catch (DataRetrievalFailureException e) { jCas = aDocumentService.readAnnotationCas( aDocumentService.createOrGetAnnotationDocument(aDocument, logedInUser)); // upgrade this cas aDocumentService.upgradeCas(jCas.getCas(), aDocumentService.createOrGetAnnotationDocument(aDocument, logedInUser)); aCorrectionDocumentService.writeCorrectionCas(jCas, aDocument, logedInUser); } catch (NoResultException e) { jCas = aDocumentService.readAnnotationCas( aDocumentService.createOrGetAnnotationDocument(aDocument, logedInUser)); // upgrade this cas aDocumentService.upgradeCas(jCas.getCas(), aDocumentService.createOrGetAnnotationDocument(aDocument, logedInUser)); aCorrectionDocumentService.writeCorrectionCas(jCas, aDocument, logedInUser); } } else { jCas = aCorrectionDocumentService.readCorrectionCas(aDocument); // upgrade this automation cas aCorrectionDocumentService.upgradeCorrectionCas(jCas.getCas(), aDocument); } } public static void deleteSpanAnnotation(AnnotatorState aBModel, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, int aStart, int aEnd, AnnotationFeature aFeature, String aValue) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { AnnotationDocument annoDoc = aDocumentService.getAnnotationDocument(aBModel.getDocument(), aBModel.getUser()); JCas annoCas = aDocumentService.readAnnotationCas(annoDoc); // get selected text, concatenations of tokens String selectedText = WebAnnoCasUtil.getSelectedText(annoCas, aStart, aEnd); for (SourceDocument d : aDocumentService.listSourceDocuments(aBModel.getProject())) { if (d.isTrainingDocument()) { continue; } loadDocument(d, aDocumentService, aCorrectionDocumentService, aBModel.getUser()); JCas jCas = aCorrectionDocumentService.readCorrectionCas(d); AutomationTypeAdapter adapter = (AutomationTypeAdapter) getAdapter(aAnnotationService, aFeature.getLayer()); for (Sentence sentence : select(jCas, Sentence.class)) { String sentenceText = sentence.getCoveredText().toLowerCase(); for (int i = -1; (i = sentenceText.indexOf(selectedText.toLowerCase(), i)) != -1; i = i + selectedText.length()) { if (selectCovered(jCas, Token.class, sentence.getBegin() + i, sentence.getBegin() + i + selectedText.length()).size() > 0) { adapter.delete(jCas, aFeature, sentence.getBegin() + i, sentence.getBegin() + i + selectedText.length() - 1, aValue); } } } aCorrectionDocumentService.writeCorrectionCas(jCas,d, aBModel.getUser()); } } public static void deleteRelationAnnotation(AnnotatorState aBModel, DocumentService aDocumentService, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AnnotationFS fs, AnnotationFeature aFeature, String aValue) throws UIMAException, ClassNotFoundException, IOException, AnnotationException { for (SourceDocument d : aDocumentService.listSourceDocuments(aBModel.getProject())) { if (d.isTrainingDocument()) { continue; } loadDocument(d, aDocumentService, aCorrectionDocumentService, aBModel.getUser()); JCas jCas = aCorrectionDocumentService.readCorrectionCas(d); ArcAdapter adapter = (ArcAdapter) getAdapter(aAnnotationService, aFeature.getLayer()); String sourceFName = adapter.getSourceFeatureName(); String targetFName = adapter.getTargetFeatureName(); Type type = getType(jCas.getCas(), aFeature.getLayer().getName()); Type spanType = getType(jCas.getCas(), adapter.getAttachTypeName()); Feature arcSpanFeature = spanType.getFeatureByBaseName(adapter.getAttachFeatureName()); Feature dependentFeature = type.getFeatureByBaseName(targetFName); Feature governorFeature = type.getFeatureByBaseName(sourceFName); AnnotationFS dependentFs = null; AnnotationFS governorFs = null; if (adapter.getAttachFeatureName() != null) { dependentFs = (AnnotationFS) fs.getFeatureValue(dependentFeature) .getFeatureValue(arcSpanFeature); governorFs = (AnnotationFS) fs.getFeatureValue(governorFeature) .getFeatureValue(arcSpanFeature); } else { dependentFs = (AnnotationFS) fs.getFeatureValue(dependentFeature); governorFs = (AnnotationFS) fs.getFeatureValue(governorFeature); } int beginOffset = 0; int endOffset = jCas.getDocumentText().length() - 1; String depCoveredText = dependentFs.getCoveredText(); String govCoveredText = governorFs.getCoveredText(); adapter.delete(jCas, aFeature, beginOffset, endOffset, depCoveredText, govCoveredText, aValue); aCorrectionDocumentService.writeCorrectionCas(jCas, d, aBModel.getUser()); } } // generates training document that will be used to predict the training document // to add extra features, for example add POS tag as a feature for NE classifier public static void addOtherFeatureTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException { File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); if (!miraDir.exists()) { FileUtils.forceMkdir(miraDir); } AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate); String username = SecurityContextHolder.getContext().getAuthentication().getName(); User user = aUserDao.get(username); for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { File trainFile = new File(miraDir, feature.getId() + ".train"); boolean documentChanged = false; for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) { if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) { documentChanged = true; break; } } if (!documentChanged && trainFile.exists()) { continue; } BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile)); AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter( aAnnotationService, feature.getLayer()); for (SourceDocument sourceDocument : aRepository.listSourceDocuments(feature .getProject())) { if ((sourceDocument.isTrainingDocument() && sourceDocument.getFeature() != null && sourceDocument .getFeature().equals(feature))) { JCas jCas = aRepository.readAnnotationCas(sourceDocument, user); for (Sentence sentence : select(jCas, Sentence.class)) { trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n"); } sourceDocument.setProcessed(false); status.setTrainDocs(status.getTrainDocs() - 1); } } trainOut.close(); } } /** * If the training file or the test file already contain the "Other laye" annotations, get the * UIMA annotation and add it as a feature - no need to train and predict for this "other layer" */ private static void addOtherFeatureFromAnnotation(AnnotationFeature aFeature, DocumentService aRepository, AnnotationSchemaService aAnnotationService, UserDao aUserDao, List<List<String>> aPredictions, SourceDocument aSourceDocument) throws UIMAException, ClassNotFoundException, IOException { String username = SecurityContextHolder.getContext().getAuthentication().getName(); User user = aUserDao.get(username); AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter( aAnnotationService, aFeature.getLayer()); List<String> annotations = new ArrayList<String>(); if (aSourceDocument == null) {// this is training - all sources documents will be converted // to a single // training file for (SourceDocument sourceDocument : aRepository.listSourceDocuments(aFeature .getProject())) { if ((sourceDocument.isTrainingDocument())) { JCas jCas = aRepository.readAnnotationCas(sourceDocument, user); for (Sentence sentence : select(jCas, Sentence.class)) { if (aFeature.getLayer().isMultipleTokens()) { annotations.addAll((List<String>) ((SpanAdapter) adapter) .getMultipleAnnotation(sentence, aFeature).values()); } else { annotations.addAll(adapter.getAnnotation(sentence, aFeature)); } } } } aPredictions.add(annotations); } else { JCas jCas = aRepository.readAnnotationCas(aSourceDocument, user); for (Sentence sentence : select(jCas, Sentence.class)) { if (aFeature.getLayer().isMultipleTokens()) { annotations.addAll((List<String>) ((SpanAdapter) adapter) .getMultipleAnnotation(sentence, aFeature).values()); } else { annotations.addAll(adapter.getAnnotation(sentence, aFeature)); } } aPredictions.add(annotations); } } public static void addTabSepTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, AutomationService aAutomationService) throws IOException, UIMAException, ClassNotFoundException, AutomationException { File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); if (!miraDir.exists()) { FileUtils.forceMkdir(miraDir); } AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate); boolean documentChanged = false; for (SourceDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature() .getProject())) { if (!document.isProcessed()) { documentChanged = true; break; } } if (!documentChanged) { return; } for (SourceDocument sourceDocument : aAutomationService.listTabSepDocuments(aTemplate .getTrainFeature().getProject())) { if (sourceDocument.getFeature() != null) { // This is a target layer train document continue; } File trainFile = new File(miraDir, sourceDocument.getId() + sourceDocument.getProject().getId() + ".train"); BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile)); File tabSepFile = new File(aRepository.getDocumentFolder(sourceDocument), sourceDocument.getName()); LineIterator it = IOUtils.lineIterator(new FileReader(tabSepFile)); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { trainOut.append("\n"); } else { StringTokenizer st = new StringTokenizer(line, "\t"); if (st.countTokens() != 2) { trainOut.close(); throw new AutomationException("This is not a valid TAB-SEP document"); } trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken())); } } sourceDocument.setProcessed(false); status.setTrainDocs(status.getTrainDocs() - 1); trainOut.close(); } } public static void generateTrainDocument(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, boolean aBase) throws IOException, UIMAException, ClassNotFoundException, AutomationException { LOG.info("Starting to generate training document"); File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); if (!miraDir.exists()) { FileUtils.forceMkdir(miraDir); } String username = SecurityContextHolder.getContext().getAuthentication().getName(); User user = aUserDao.get(username); AnnotationFeature feature = aTemplate.getTrainFeature(); boolean documentChanged = false; // A. training document for other train layers were changed for (AnnotationFeature otherrFeature : aTemplate.getOtherFeatures()) { for (SourceDocument document : aRepository.listSourceDocuments(aTemplate .getTrainFeature().getProject())) { if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(otherrFeature)) { documentChanged = true; break; } } } // B. Training document for the main training layer were changed for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) { if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(feature))) { documentChanged = true; break; } } // C. New Curation document arrives for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) { if (!document.isProcessed() && document.getState().equals(SourceDocumentState.CURATION_FINISHED)) { documentChanged = true; break; } } // D. tab-sep training documents for (SourceDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature() .getProject())) { if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) { documentChanged = true; break; } } if (!documentChanged) { return; } File trainFile; if (aBase) { trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.ft"); } else { trainFile = new File(miraDir, feature.getLayer().getId() + "-" + feature.getId() + ".train.base"); } AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate); BufferedWriter trainOut = new BufferedWriter(new FileWriter(trainFile)); AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter( aAnnotationService, feature.getLayer()); // Training documents (Curated or webanno-compatible imported ones - read using UIMA) List<SourceDocument> sourceDocs = aRepository.listSourceDocuments(feature.getProject()); int sourceDocsCounter = 0; for (SourceDocument sourceDocument : sourceDocs) { if ((sourceDocument.isTrainingDocument() && sourceDocument.getFeature() != null && sourceDocument .getFeature().equals(feature))) { JCas jCas = aRepository.readAnnotationCas(sourceDocument, user); for (Sentence sentence : select(jCas, Sentence.class)) { if (aBase) {// base training document trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n"); } else {// training document with other features trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n"); } } sourceDocument.setProcessed(!aBase); if (!aBase) { status.setTrainDocs(status.getTrainDocs() - 1); } } else if (sourceDocument.getState().equals(SourceDocumentState.CURATION_FINISHED)) { JCas jCas = aCurationDocumentService.readCurationCas(sourceDocument); for (Sentence sentence : select(jCas, Sentence.class)) { if (aBase) {// base training document trainOut.append(getMiraLine(sentence, null, adapter).toString() + "\n"); } else {// training document with other features trainOut.append(getMiraLine(sentence, feature, adapter).toString() + "\n"); } } sourceDocument.setProcessed(!aBase); if (!aBase) { status.setTrainDocs(status.getTrainDocs() - 1); } } sourceDocsCounter++; LOG.info("Processed source document " + sourceDocsCounter + " of " + sourceDocs.size()); } // Tab-sep documents to be used as a target layer train document int goldStandardDocsCounter = 0; List<SourceDocument> goldStandardDocs = aAutomationService .listTabSepDocuments(feature.getProject()); for (SourceDocument document : goldStandardDocs) { if (document.getFormat().equals(WebAnnoConst.TAB_SEP) && document.getFeature() != null && document.getFeature().equals(feature)) { File tabSepFile = new File(aRepository.getDocumentFolder(document), document.getName()); LineIterator it = IOUtils.lineIterator(new FileReader(tabSepFile)); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { trainOut.append("\n"); } else { StringTokenizer st = new StringTokenizer(line, "\t"); if (st.countTokens() != 2) { trainOut.close(); throw new AutomationException("This is not a valid TAB-SEP document"); } if (aBase) { trainOut.append(getMiraLineForTabSep(st.nextToken(), "")); } else { trainOut.append(getMiraLineForTabSep(st.nextToken(), st.nextToken())); } } } } goldStandardDocsCounter++; LOG.info("Processed gold standard document " + goldStandardDocsCounter + " of " + goldStandardDocs.size()); } trainOut.close(); LOG.info("Completed generating training document"); } public static void generatePredictDocument(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws IOException, UIMAException, ClassNotFoundException { File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); if (!miraDir.exists()) { FileUtils.forceMkdir(miraDir); } String username = SecurityContextHolder.getContext().getAuthentication().getName(); User user = aUserDao.get(username); AnnotationFeature feature = aTemplate.getTrainFeature(); boolean documentChanged = false; for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) { if (!document.isProcessed() && !document.isTrainingDocument()) { documentChanged = true; break; } } if (!documentChanged) { return; } AutomationTypeAdapter adapter = (AutomationTypeAdapter) TypeUtil.getAdapter( aAnnotationService, feature.getLayer()); for (SourceDocument document : aRepository.listSourceDocuments(feature.getProject())) { if (!document.isProcessed() && !document.isTrainingDocument()) { File predFile = new File(miraDir, document.getId() + ".pred.ft"); BufferedWriter predOut = new BufferedWriter(new FileWriter(predFile)); JCas jCas; try { jCas = aCorrectionDocumentService.readCorrectionCas(document); } catch (Exception e) { jCas = aRepository.readAnnotationCas(document, user); } for (Sentence sentence : select(jCas, Sentence.class)) { predOut.append(getMiraLine(sentence, null, adapter).toString() + "\n"); } predOut.close(); } } } private static StringBuffer getMiraLine(Sentence sentence, AnnotationFeature aLayerFeature, AutomationTypeAdapter aAdapter) throws CASException { StringBuffer sb = new StringBuffer(); String tag = ""; List<String> annotations = new ArrayList<String>(); Map<Integer, String> multAnno = null; if (aLayerFeature != null) { if (aLayerFeature.getLayer().isMultipleTokens()) { multAnno = ((SpanAdapter) aAdapter).getMultipleAnnotation(sentence, aLayerFeature); } else { annotations = aAdapter.getAnnotation(sentence, aLayerFeature); } } int i = 0; for (Token token : selectCovered(Token.class, sentence)) { String word = token.getCoveredText(); char[] words = word.toCharArray(); String prefix1 = "", prefix2 = "", prefix3 = "", prefix4 = "", suffix1 = "", suffix2 = "", suffix3 = "", suffix4 = ""; if (aLayerFeature == null || aLayerFeature.getLayer().isLockToTokenOffset()) { prefix1 = Character.toString(words[0]) + " "; prefix2 = (words.length > 1 ? prefix1.trim() + (Character.toString(words[1]).trim().equals("") ? "__nil__" : Character .toString(words[1])) : "__nil__") + " "; prefix3 = (words.length > 2 ? prefix2.trim() + (Character.toString(words[2]).trim().equals("") ? "__nil__" : Character .toString(words[2])) : "__nil__") + " "; prefix4 = (words.length > 3 ? prefix3.trim() + (Character.toString(words[3]).trim().equals("") ? "__nil__" : Character .toString(words[3])) : "__nil__") + " "; suffix1 = Character.toString(words[words.length - 1]) + " "; suffix2 = (words.length > 1 ? (Character.toString(words[words.length - 2]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 2])) + suffix1.trim() : "__nil__") + " "; suffix3 = (words.length > 2 ? (Character.toString(words[words.length - 3]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 3])) + suffix2.trim() : "__nil__") + " "; suffix4 = (words.length > 3 ? (Character.toString(words[words.length - 4]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 4])) + suffix3.trim() : "__nil__") + " "; } String nl = "\n"; if (aLayerFeature != null) { if (aLayerFeature.getLayer().isMultipleTokens()) { tag = multAnno.get(getAddr(token)) == null ? "O" : multAnno.get(getAddr(token)); } else { tag = annotations.size() == 0 ? NILL : annotations.get(i); i++; } } sb.append(word + " " + prefix1 + prefix2 + prefix3 + prefix4 + suffix1 + suffix2 + suffix3 + suffix4 + tag + nl); } return sb; } private static StringBuffer getMiraLineForTabSep(String aToken, String aFeature) throws CASException { StringBuffer sb = new StringBuffer(); char[] words = aToken.toCharArray(); String prefix1 = Character.toString(words[0]) + " "; String prefix2 = (words.length > 1 ? prefix1.trim() + (Character.toString(words[1]).trim().equals("") ? "__nil__" : Character .toString(words[1])) : "__nil__") + " "; String prefix3 = (words.length > 2 ? prefix2.trim() + (Character.toString(words[2]).trim().equals("") ? "__nil__" : Character .toString(words[2])) : "__nil__") + " "; String prefix4 = (words.length > 3 ? prefix3.trim() + (Character.toString(words[3]).trim().equals("") ? "__nil__" : Character .toString(words[3])) : "__nil__") + " "; String suffix1 = Character.toString(words[words.length - 1]) + " "; String suffix2 = (words.length > 1 ? (Character.toString(words[words.length - 2]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 2])) + suffix1.trim() : "__nil__") + " "; String suffix3 = (words.length > 2 ? (Character.toString(words[words.length - 3]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 3])) + suffix2.trim() : "__nil__") + " "; String suffix4 = (words.length > 3 ? (Character.toString(words[words.length - 4]).trim() .equals("") ? "__nil__" : Character.toString(words[words.length - 4])) + suffix3.trim() : "__nil__") + " "; String nl = "\n"; sb.append(aToken + " " + prefix1 + prefix2 + prefix3 + prefix4 + suffix1 + suffix2 + suffix3 + suffix4 + aFeature + nl); return sb; } /** * When additional layers are used as training feature, the training document should be * auto-predicted with the other layers. Example, if the train layer is Named Entity and POS * layer is used as additional feature, the training document should be predicted using the POS * layer documents for POS annotation * * @param aTemplate * the template. * @param aRepository * the repository. * @throws IOException * hum? * @throws ClassNotFoundException * hum? */ public static void otherFeatureClassifiers(MiraTemplate aTemplate, DocumentService aRepository, AutomationService aAutomationService) throws IOException, ClassNotFoundException { Mira mira = new Mira(); int frequency = 2; double sigma = 1; int iterations = 10; int beamSize = 0; boolean maxPosteriors = false; String templateName = null; boolean documentChanged = false; for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { for (SourceDocument document : aRepository.listSourceDocuments(aTemplate .getTrainFeature().getProject())) { if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) { documentChanged = true; break; } } } if (!documentChanged) { return; } for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { templateName = createTemplate(feature, getMiraTemplateFile(feature, aAutomationService), 0); File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); File trainFile = new File(miraDir, feature.getId() + ".train"); String initalModelName = ""; String trainName = trainFile.getAbsolutePath(); String modelName = aAutomationService.getMiraModel(feature, true, null).getAbsolutePath(); boolean randomInit = false; if (!feature.getLayer().isLockToTokenOffset()) { mira.setIobScorer(); } mira.loadTemplates(templateName); mira.setClip(sigma); mira.maxPosteriors = maxPosteriors; mira.beamSize = beamSize; int numExamples = mira.count(trainName, frequency); mira.initModel(randomInit); if (!initalModelName.equals("")) { mira.loadModel(initalModelName); } for (int i = 0; i < iterations; i++) { mira.train(trainName, iterations, numExamples, i); mira.averageWeights(iterations * numExamples); } mira.saveModel(modelName); } } /** * Classifier for an external tab-sep file (token TAB feature) * * @param aTemplate * the template. * @throws IOException * hum? * @throws ClassNotFoundException * hum? */ public static void tabSepClassifiers(MiraTemplate aTemplate, AutomationService aAutomationService) throws IOException, ClassNotFoundException { Mira mira = new Mira(); int frequency = 2; double sigma = 1; int iterations = 10; int beamSize = 0; boolean maxPosteriors = false; String templateName = null; boolean documentChanged = false; for (SourceDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature() .getProject())) { if (!document.isProcessed()) { documentChanged = true; break; } } if (!documentChanged) { return; } for (SourceDocument sourceDocument : aAutomationService.listTabSepDocuments(aTemplate .getTrainFeature().getProject())) { if (sourceDocument.getFeature() != null) { // This is a target layer train document continue; } File miraDir = aAutomationService.getMiraDir(aTemplate.getTrainFeature()); File trainFile = new File(miraDir, sourceDocument.getId() + sourceDocument.getProject().getId() + ".train"); templateName = createTemplate(null, getMiraTemplateFile(aTemplate.getTrainFeature(), aAutomationService), 0); String initalModelName = ""; String trainName = trainFile.getAbsolutePath(); String modelName = aAutomationService.getMiraModel(aTemplate.getTrainFeature(), true, sourceDocument).getAbsolutePath(); boolean randomInit = false; mira.loadTemplates(templateName); mira.setClip(sigma); mira.maxPosteriors = maxPosteriors; mira.beamSize = beamSize; int numExamples = mira.count(trainName, frequency); mira.initModel(randomInit); if (!initalModelName.equals("")) { mira.loadModel(initalModelName); } for (int i = 0; i < iterations; i++) { mira.train(trainName, iterations, numExamples, i); mira.averageWeights(iterations * numExamples); } mira.saveModel(modelName); } } public static String createTemplate(AnnotationFeature aFeature, File templateFile, int aOther) throws IOException { StringBuffer sb = new StringBuffer(); if (aFeature == null || aFeature.getLayer().isLockToTokenOffset()) { setMorphoTemplate(sb, aOther); } else { setNgramForLable(sb, aOther); } sb.append("\n"); sb.append("B\n"); FileUtils.writeStringToFile(templateFile, sb.toString()); return templateFile.getAbsolutePath(); } private static void setNgramForLable(StringBuffer aSb, int aOther) { int i = 1; aSb.append("U" + String.format("%02d", i) + "%x[0,0]\n"); i++; /* * aSb.append("U" + String.format("%02d", i) + "%x[0,1]\n"); i++; aSb.append("U" + * String.format("%02d", i) + "%x[0,0]" + "%x[0,1]\n"); i++; */ aSb.append("U" + String.format("%02d", i) + "%x[-1,0]" + "%x[0,0]\n"); i++; /* * aSb.append("U" + String.format("%02d", i) + "%x[-1,1]" + "%x[0,1]\n"); i++; */ int temp = 1; int tempOther = aOther; if (aOther > 0) {// consider other layer annotations as features while (aOther > 0) { aOther--; aSb.append("U" + String.format("%02d", i) + "%x[0," + temp + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0,0] %x[0," + temp + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1," + temp + "] %x[0," + temp + "]\n"); i++; temp++; } } aSb.append("\n"); i = 1; aSb.append("B" + String.format("%02d", i) + "%x[0,0]\n"); i++; /* * aSb.append("B" + String.format("%02d", i) + "%x[0,1]\n"); i++; aSb.append("B" + * String.format("%02d", i) + "%x[0,0]" + "%x[0,1]\n"); i++; */ aSb.append("B" + String.format("%02d", i) + "%x[-1,0]" + "%x[0,0]\n"); i++; /* * aSb.append("B" + String.format("%02d", i) + "%x[-1,1]" + "%x[0,1]\n"); i++; */ aSb.append("\n"); temp = 1; if (tempOther > 0) {// consider other layer annotations as features while (aOther > 0) { aOther--; aSb.append("B" + String.format("%02d", i) + "%x[0," + temp + "]\n"); i++; aSb.append("B" + String.format("%02d", i) + "%x[0,0] %x[0," + temp + "]\n"); i++; aSb.append("B" + String.format("%02d", i) + "%x[-1," + temp + "] %x[0," + temp + "]\n"); i++; temp++; } } } // only for token based automation, we need morphological features. private static void setMorphoTemplate(StringBuffer aSb, int aOther) { int i = 1; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0," + i + "]\n"); i++; aSb.append("\n"); aSb.append("U" + String.format("%02d", i) + "%x[0,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-2,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[2,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-2,0]" + "%x[-1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1,0]" + "%x[0,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0,0]" + "%x[1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[1,0]" + "%x[2,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-2,0]" + "%x[-1,0]" + "%x[0,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1,0]" + "%x[0,0]" + "%x[1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0,0]" + "%x[1,0]" + "%x[2,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-2,0]" + "%x[-1,0]" + "%x[0,0]" + "%x[1,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1,0]" + "%x[0,0]" + "%x[1,0]" + "%x[2,0]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-2,0]" + "%x[-1,0]" + "%x[0,0" + "%x[1,0]" + "%x[2,0]]\n"); aSb.append("\n"); int temp = 1; if (aOther > 0) {// consider other layer annotations as features while (aOther > 0) { aOther--; aSb.append("U" + String.format("%02d", i) + "%x[0," + temp + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[0,0] %x[0," + temp + "]\n"); i++; aSb.append("U" + String.format("%02d", i) + "%x[-1," + temp + "] %x[0," + temp + "]\n"); i++; temp++; } } aSb.append("\n"); } public static File getMiraTemplateFile(AnnotationFeature aFeature, AutomationService aAutomationService) { return new File(aAutomationService.getMiraDir(aFeature).getAbsolutePath(), aFeature.getId() + "-template"); } /** * Based on the other layer, predict features for the training document * * @param aTemplate * the template. * @param aRepository * the repository. * @return the prediction. * @throws UIMAException * hum? * @throws ClassNotFoundException * hum? * @throws IOException * hum? * @throws AnnotationException * hum? * * @throws AutomationException * if an error occurs. */ public static String generateFinalClassifier(MiraTemplate aTemplate, DocumentService aRepository, CurationDocumentService aCurationDocumentService, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException { int frequency = 2; double sigma = 1; int iterations = 10; int beamSize = 0; boolean maxPosteriors = false; AnnotationFeature layerFeature = aTemplate.getTrainFeature(); List<List<String>> predictions = new ArrayList<List<String>>(); File miraDir = aAutomationService.getMiraDir(layerFeature); Mira mira = new Mira(); File predFile = new File(miraDir, layerFeature.getLayer().getId() + "-" + layerFeature.getId() + ".train.ft"); File predcitedFile = new File(predFile.getAbsolutePath() + "-pred"); boolean documentChanged = false; // A. training document for other train layers were changed for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { for (SourceDocument document : aRepository.listSourceDocuments(aTemplate .getTrainFeature().getProject())) { if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(feature)) { documentChanged = true; break; } } } // B. Training document for the main training layer were changed for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) { if (!document.isProcessed() && (document.getFeature() != null && document.getFeature().equals(layerFeature))) { documentChanged = true; break; } } // C. New Curation document arrives for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) { if (!document.isProcessed() && document.getState().equals(SourceDocumentState.CURATION_FINISHED)) { documentChanged = true; break; } } // D. tab-sep training documents for (SourceDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature() .getProject())) { if (!document.isProcessed() && document.getFeature() != null && document.getFeature().equals(layerFeature)) { documentChanged = true; break; } } if (!documentChanged) { return aTemplate.getResult(); } // if no other layer is used, use this as main train document, // otherwise, add all the // predictions and modify template File baseTrainFile = new File(miraDir, layerFeature.getLayer().getId() + "-" + layerFeature.getId() + ".train.base"); File trainFile = new File(miraDir, layerFeature.getLayer().getId() + "-" + layerFeature.getId() + ".train"); // generate final classifier, using all features generated String trainName = trainFile.getAbsolutePath(); String finalClassifierModelName = aAutomationService.getMiraModel(layerFeature, false, null) .getAbsolutePath(); getFeatureOtherLayer(aTemplate, aRepository, aAnnotationService, aAutomationService, aUserDao, beamSize, maxPosteriors, predictions, mira, predFile, predcitedFile, null); getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, layerFeature, predictions, mira, predFile, predcitedFile); generateTrainDocument(aTemplate, aRepository, aCurationDocumentService, aAnnotationService, aAutomationService, aUserDao, false); String trainTemplate; if (predictions.size() == 0) { trainTemplate = createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), 0); FileUtils.copyFile(baseTrainFile, trainFile); } else { trainTemplate = createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), predictions.size()); buildTrainFile(baseTrainFile, trainFile, predictions); } boolean randomInit = false; if (!layerFeature.getLayer().isLockToTokenOffset()) { mira.setIobScorer(); } mira.loadTemplates(trainTemplate); mira.setClip(sigma); mira.maxPosteriors = maxPosteriors; mira.beamSize = beamSize; int numExamples = mira.count(trainName, frequency); mira.initModel(randomInit); String trainResult = ""; for (int i = 0; i < iterations; i++) { trainResult = mira.train(trainName, iterations, numExamples, i); mira.averageWeights(iterations * numExamples); } mira.saveModel(finalClassifierModelName); // all training documents are processed by now for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) { if (document.isTrainingDocument()) { document.setProcessed(true); } } for (SourceDocument document : aAutomationService.listTabSepDocuments(layerFeature .getProject())) { document.setProcessed(true); } return trainResult; } private static void getFeatureOtherLayer(MiraTemplate aTemplate, DocumentService aRepository, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao, int beamSize, boolean maxPosteriors, List<List<String>> predictions, Mira mira, File predFtFile, File predcitedFile, SourceDocument document) throws FileNotFoundException, IOException, ClassNotFoundException, UIMAException { // other layers as training document for (AnnotationFeature feature : aTemplate.getOtherFeatures()) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(feature, true, null).getAbsolutePath(); if (!new File(modelName).exists()) { addOtherFeatureFromAnnotation(feature, aRepository, aAnnotationService, aUserDao, predictions, document); continue; } String testName = predFtFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; mira.test(input, stream); LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } } private static void getFeaturesTabSep(MiraTemplate aTemplate, AutomationService aAutomationService, int beamSize, boolean maxPosteriors, AnnotationFeature layerFeature, List<List<String>> predictions, Mira mira, File predFile, File predcitedFile) throws FileNotFoundException, IOException, ClassNotFoundException, AutomationException { for (SourceDocument document : aAutomationService.listTabSepDocuments(aTemplate.getTrainFeature() .getProject())) { int shiftColumns = 0; int nbest = 1; String modelName = aAutomationService.getMiraModel(layerFeature, true, document) .getAbsolutePath(); if (!new File(modelName).exists()) { continue; } String testName = predFile.getAbsolutePath(); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; try { mira.test(input, stream); } catch (Exception e) { throw new AutomationException(document.getName() + " is Invalid TAB-SEP file!"); } LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } predictions.add(annotations); } } /** * Based on the other layer, add features for the prediction document * * @param aTemplate * the template. * @param aRepository * the repository. * @throws UIMAException * hum? * @throws ClassNotFoundException * hum? * @throws IOException * hum? * @throws AnnotationException * hum? * @throws AutomationException * hum? */ public static void addOtherFeatureToPredictDocument(MiraTemplate aTemplate, DocumentService aRepository, AnnotationSchemaService aAnnotationService, AutomationService aAutomationService, UserDao aUserDao) throws UIMAException, ClassNotFoundException, IOException, AnnotationException, AutomationException { AnnotationFeature layerFeature = aTemplate.getTrainFeature(); File miraDir = aAutomationService.getMiraDir(layerFeature); for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) { List<List<String>> predictions = new ArrayList<List<String>>(); if (!document.isProcessed() && !document.isTrainingDocument()) { File predFtFile = new File(miraDir, document.getId() + ".pred.ft"); Mira mira = new Mira(); int beamSize = 0; boolean maxPosteriors = false; File predcitedFile = new File(predFtFile.getAbsolutePath() + "-pred"); getFeatureOtherLayer(aTemplate, aRepository, aAnnotationService, aAutomationService, aUserDao, beamSize, maxPosteriors, predictions, mira, predFtFile, predcitedFile, document); getFeaturesTabSep(aTemplate, aAutomationService, beamSize, maxPosteriors, layerFeature, predictions, mira, predFtFile, predcitedFile); File basePredFile = new File(miraDir, document.getId() + ".pred"); if (predictions.size() == 0) { createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), 0); FileUtils.copyFile(predFtFile, basePredFile); } else { createTemplate(aTemplate.getTrainFeature(), getMiraTemplateFile(layerFeature, aAutomationService), predictions.size()); buildPredictFile(predFtFile, basePredFile, predictions, aTemplate.getTrainFeature()); } } } } // add all predicted features and its own label at the end, to train a classifier. private static void buildTrainFile(File aBaseFile, File aTrainFile, List<List<String>> aPredictions) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(aBaseFile)); StringBuffer trainBuffer = new StringBuffer(); int i = 0; while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { trainBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); String label = ""; String feature = ""; // Except the last token, which is the label, maintain the line while (st.hasMoreTokens()) { feature = st.nextToken(); if (label.equals("")) { // first time label = feature; continue; } trainBuffer.append(label + " "); label = feature; } for (List<String> prediction : aPredictions) { trainBuffer.append(prediction.get(i) + " "); } // add its own label trainBuffer.append(label + "\n"); i++; } IOUtils.write(trainBuffer.toString(), new FileOutputStream(aTrainFile)); } // add additional features predicted so that it will have the same number of features as the // classifier private static void buildPredictFile(File apredFt, File aPredFile, List<List<String>> aPredictions, AnnotationFeature aFeature) throws IOException { LineIterator it = IOUtils.lineIterator(new FileReader(apredFt)); StringBuffer predBuffer = new StringBuffer(); int i = 0; while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { predBuffer.append("\n"); continue; } StringTokenizer st = new StringTokenizer(line, " "); // if the target feature is on multiple token, we do not need the morphological features // in the prediction file if (aFeature.getLayer().isMultipleTokens()) { predBuffer.append(st.nextToken() + " "); } else { while (st.hasMoreTokens()) { predBuffer.append(st.nextToken() + " "); } } for (List<String> prediction : aPredictions) { predBuffer.append(prediction.get(i) + " "); } // add its predBuffer.append("\n"); i++; } IOUtils.write(predBuffer.toString(), new FileOutputStream(aPredFile)); } /** * Add new annotation to the CAS using the MIRA prediction. This is different from the add * methods in the {@link TypeAdapter}s in such a way that the begin and end offsets are always * exact so that no need to re-compute * * @param aJcas * the JCas. * @param aFeature * the feature. * @param aLabelValues * the values. * @throws AnnotationException * if the annotations could not be created/updated. * @throws IOException * if an I/O error occurs. */ public static void automate(JCas aJcas, AnnotationFeature aFeature, List<String> aLabelValues) throws AnnotationException, IOException { String typeName = aFeature.getLayer().getName(); String attachTypeName = aFeature.getLayer().getAttachType() == null ? null : aFeature .getLayer().getAttachType().getName(); Type type = CasUtil.getType(aJcas.getCas(), typeName); Feature feature = type.getFeatureByBaseName(aFeature.getName()); int i = 0; String prevNe = "O"; int begin = 0; int end = 0; // remove existing annotations of this type, after all it is an // automation, no care clearAnnotations(aJcas, type); if (!aFeature.getLayer().isLockToTokenOffset() || aFeature.getLayer().isMultipleTokens()) { for (Token token : select(aJcas, Token.class)) { String value = aLabelValues.get(i); AnnotationFS newAnnotation; if (value.equals("O") && prevNe.equals("O")) { i++; continue; } else if (value.equals("O") && !prevNe.equals("O")) { newAnnotation = aJcas.getCas().createAnnotation(type, begin, end); newAnnotation.setFeatureValueFromString(feature, prevNe.replace("B-", "")); prevNe = "O"; aJcas.getCas().addFsToIndexes(newAnnotation); } else if (!value.equals("O") && prevNe.equals("O")) { begin = token.getBegin(); end = token.getEnd(); prevNe = value; } else if (!value.equals("O") && !prevNe.equals("O")) { if (value.replace("B-", "").replace("I-", "") .equals(prevNe.replace("B-", "").replace("I-", "")) && value.startsWith("B-")) { newAnnotation = aJcas.getCas().createAnnotation(type, begin, end); newAnnotation.setFeatureValueFromString(feature, prevNe.replace("B-", "") .replace("I-", "")); prevNe = value; begin = token.getBegin(); end = token.getEnd(); aJcas.getCas().addFsToIndexes(newAnnotation); } else if (value.replace("B-", "").replace("I-", "") .equals(prevNe.replace("B-", "").replace("I-", ""))) { i++; end = token.getEnd(); continue; } else { newAnnotation = aJcas.getCas().createAnnotation(type, begin, end); newAnnotation.setFeatureValueFromString(feature, prevNe.replace("B-", "") .replace("I-", "")); prevNe = value; begin = token.getBegin(); end = token.getEnd(); aJcas.getCas().addFsToIndexes(newAnnotation); } } i++; } } else { // check if annotation is on an AttachType Feature attachFeature = null; Type attachType; if (attachTypeName != null) { attachType = CasUtil.getType(aJcas.getCas(), attachTypeName); attachFeature = attachType.getFeatureByBaseName(attachTypeName); } for (Token token : select(aJcas, Token.class)) { AnnotationFS newAnnotation = aJcas.getCas().createAnnotation(type, token.getBegin(), token.getEnd()); newAnnotation.setFeatureValueFromString(feature, aLabelValues.get(i)); i++; if (attachFeature != null) { token.setFeatureValue(attachFeature, newAnnotation); } aJcas.getCas().addFsToIndexes(newAnnotation); } } } public static void predict(MiraTemplate aTemplate, DocumentService aRepository, CorrectionDocumentService aCorrectionDocumentService, AutomationService aAutomationService, UserDao aUserDao) throws CASException, UIMAException, ClassNotFoundException, IOException, AnnotationException { AnnotationFeature layerFeature = aTemplate.getTrainFeature(); File miraDir = aAutomationService.getMiraDir(layerFeature); AutomationStatus status = aAutomationService.getAutomationStatus(aTemplate); for (SourceDocument document : aRepository.listSourceDocuments(layerFeature.getProject())) { if (!document.isProcessed() && !document.isTrainingDocument()) { File predFile = new File(miraDir, document.getId() + ".pred"); Mira mira = new Mira(); int shiftColumns = 0; int nbest = 1; int beamSize = 0; boolean maxPosteriors = false; String modelName = aAutomationService.getMiraModel(layerFeature, false, null) .getAbsolutePath(); String testName = predFile.getAbsolutePath(); File predcitedFile = new File(predFile.getAbsolutePath() + "-pred"); PrintStream stream = new PrintStream(predcitedFile); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); if (testName != null) { input = new BufferedReader(new FileReader(testName)); } mira.loadModel(modelName); mira.setShiftColumns(shiftColumns); mira.nbest = nbest; mira.beamSize = beamSize; mira.maxPosteriors = maxPosteriors; mira.test(input, stream); LOG.info("Prediction is wrtten to a MIRA File. To be done is writing back to the CAS"); LineIterator it = IOUtils.lineIterator(new FileReader(predcitedFile)); List<String> annotations = new ArrayList<String>(); while (it.hasNext()) { String line = it.next(); if (line.trim().equals("")) { continue; } StringTokenizer st = new StringTokenizer(line, " "); String tag = ""; while (st.hasMoreTokens()) { tag = st.nextToken(); } annotations.add(tag); } LOG.info(annotations.size() + " Predictions found to be written to the CAS"); JCas jCas = null; String username = SecurityContextHolder.getContext().getAuthentication().getName(); User user = aUserDao.get(username); try { AnnotationDocument annoDocument = aRepository.getAnnotationDocument(document, user); jCas = aRepository.readAnnotationCas(annoDocument); } catch (DataRetrievalFailureException e) { } automate(jCas, layerFeature, annotations); LOG.info("Predictions found are written to the CAS"); aCorrectionDocumentService.writeCorrectionCas(jCas, document, user); document.setProcessed(true); status.setAnnoDocs(status.getAnnoDocs() - 1); } } } public static void clearAnnotations(JCas aJCas, Type aType) throws IOException { List<AnnotationFS> annotationsToRemove = new ArrayList<AnnotationFS>(); for (AnnotationFS a : select(aJCas.getCas(), aType)) { annotationsToRemove.add(a); } for (AnnotationFS annotation : annotationsToRemove) { aJCas.removeFsFromIndexes(annotation); } } }