/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.api.dao; import static de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.DOCUMENT; import static de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.PROJECT; import static de.tudarmstadt.ukp.clarin.webanno.api.ProjectService.SOURCE; import static de.tudarmstadt.ukp.clarin.webanno.api.WebAnnoConst.INITIAL_CAS_PSEUDO_USER; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import javax.annotation.Resource; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.uima.UIMAException; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; 
import org.apache.uima.cas.CAS; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.util.CasCreationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.MDC; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import org.springframework.transaction.annotation.Transactional; import de.tudarmstadt.ukp.clarin.webanno.api.AnnotationSchemaService; import de.tudarmstadt.ukp.clarin.webanno.api.CasStorageService; import de.tudarmstadt.ukp.clarin.webanno.api.DocumentService; import de.tudarmstadt.ukp.clarin.webanno.api.ImportExportService; import de.tudarmstadt.ukp.clarin.webanno.api.WebAnnoConst; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer; import de.tudarmstadt.ukp.clarin.webanno.model.LinkMode; import de.tudarmstadt.ukp.clarin.webanno.model.Mode; import de.tudarmstadt.ukp.clarin.webanno.model.MultiValueMode; import de.tudarmstadt.ukp.clarin.webanno.model.Project; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument; import de.tudarmstadt.ukp.clarin.webanno.model.TagSet; import de.tudarmstadt.ukp.clarin.webanno.support.ZipUtils; import de.tudarmstadt.ukp.clarin.webanno.support.logging.Logging; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import 
de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;

/**
 * Default {@link ImportExportService} implementation. It converts uploaded files into CASes
 * using the readers registered in the {@code formats} properties and exports stored CASes using
 * the writers registered there.
 * <p>
 * The {@code formats} properties are expected to contain, per format id, the keys
 * {@code <id>.label} and optionally {@code <id>.reader} and/or {@code <id>.writer} holding fully
 * qualified class names.
 */
@Component(ImportExportService.SERVICE_NAME)
public class ImportExportServiceImpl
    implements ImportExportService
{
    /** Suffix of the property key holding the human-readable label of a format. */
    private static final String LABEL_SUFFIX = ".label";

    /** Suffix of the property key holding the reader class name of a format. */
    private static final String READER_SUFFIX = ".reader";

    /** Suffix of the property key holding the writer class name of a format. */
    private static final String WRITER_SUFFIX = ".writer";

    /** Writer that needs to be configured with the layer/feature layout of the project. */
    private static final String TSV3_WRITER_CLASS_NAME =
            "de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv3Writer";

    private final Logger log = LoggerFactory.getLogger(getClass());

    @Value(value = "${repository.path}")
    private File dir;

    @Resource(name = "casStorageService")
    private CasStorageService casStorageService;

    @Resource(name = "annotationService")
    private AnnotationSchemaService annotationService;

    @Resource(name = "documentService")
    private DocumentService documentService;

    @Resource(name = "formats")
    private Properties readWriteFileFormats;

    public ImportExportServiceImpl()
    {
        // Nothing to do
    }

    /**
     * Exports the annotation document of the given user, stripping the original file extension
     * from the exported file name. Delegates to
     * {@link #exportAnnotationDocument(SourceDocument, String, Class, String, Mode, boolean)}
     * with {@code aStripExtension = true}.
     */
    @SuppressWarnings("rawtypes")
    @Override
    @Transactional
    public File exportAnnotationDocument(SourceDocument aDocument, String aUser, Class aWriter,
            String aFileName, Mode aMode)
        throws UIMAException, IOException, ClassNotFoundException
    {
        return exportAnnotationDocument(aDocument, aUser, aWriter, aFileName, aMode, true);
    }

    /**
     * Loads the serialized CAS selected by the mode, upgrades it and writes it out using the
     * given writer.
     *
     * @param aDocument
     *            the source document whose annotations are exported.
     * @param aUser
     *            the user whose CAS is exported in annotation, automation and correction mode.
     * @param aWriter
     *            the writer class used to produce the export.
     * @param aFileName
     *            the file name recorded in the document metadata of the exported CAS.
     * @param aMode
     *            the mode; for any mode other than annotation, automation or correction the CAS
     *            of the curation user is exported.
     * @param aStripExtension
     *            forwarded to the writer as its strip-extension parameter.
     * @return the exported file.
     * @throws FileNotFoundException
     *             if the serialized CAS file does not exist.
     */
    @SuppressWarnings("rawtypes")
    @Override
    @Transactional
    public File exportAnnotationDocument(SourceDocument aDocument, String aUser, Class aWriter,
            String aFileName, Mode aMode, boolean aStripExtension)
        throws UIMAException, IOException, ClassNotFoundException
    {
        File annotationFolder = casStorageService.getAnnotationFolder(aDocument);

        // For correction, export the corrected document of the user (CORRECTION_USER.ser is
        // the automated result displayed for the user to correct it, not the final result).
        // For automation, export either the corrected or the automated document of the user.
        // In every other mode, the merge (curation) result is exported.
        String serializedCasFileName;
        if (aMode.equals(Mode.ANNOTATION) || aMode.equals(Mode.AUTOMATION)
                || aMode.equals(Mode.CORRECTION)) {
            serializedCasFileName = aUser + ".ser";
        }
        else {
            serializedCasFileName = WebAnnoConst.CURATION_USER + ".ser";
        }

        File serializedCasFile = new File(annotationFolder, serializedCasFileName);
        if (!serializedCasFile.exists()) {
            throw new FileNotFoundException("CAS file [" + serializedCasFileName
                    + "] not found in [" + annotationFolder + "]");
        }

        CAS cas = CasCreationUtils.createCas((TypeSystemDescription) null, null, null);
        CasPersistenceUtils.readSerializedCas(cas.getJCas(), serializedCasFile);

        // Bring the CAS up to the current type system before handing it to the writer
        annotationService.upgradeCas(cas, aDocument, aUser);

        File exportFile = exportCasToFile(cas, aDocument, aFileName, aWriter, aStripExtension);

        Project project = aDocument.getProject();
        try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID,
                String.valueOf(project.getId()))) {
            log.info("Exported annotation document [{}]({}) for user [{}] from project [{}]({})",
                    aDocument.getName(), aDocument.getId(), aUser, project.getName(),
                    project.getId());
        }

        return exportFile;
    }

    /**
     * Validates an uploaded training file and stores it in the repository. TAB-SEP files are
     * only checked for their two-column layout; all other formats are converted to a CAS which
     * is additionally stored as the initial CAS of the document. If validation or conversion
     * fails, the source document is removed again and the failure is reported as an
     * {@link IOException}.
     */
    @Override
    @Transactional
    public void uploadTrainingDocument(File aFile, SourceDocument aDocument)
        throws IOException
    {
        // Check if the file has a valid format / can be converted without error
        JCas cas = null;
        try {
            // Constant first so that a document without a format is reported as an unknown
            // format by importCasFromFile instead of failing with a NullPointerException
            if (WebAnnoConst.TAB_SEP.equals(aDocument.getFormat())) {
                if (!isTabSepFileFormatCorrect(aFile)) {
                    throw new IOException(
                            "This TAB-SEP file is not in correct format. It should have two columns separated by TAB!");
                }
            }
            else {
                cas = importCasFromFile(aFile, aDocument.getProject(), aDocument.getFormat());
                casStorageService.analyzeAndRepair(aDocument, INITIAL_CAS_PSEUDO_USER,
                        cas.getCas());
            }
        }
        catch (IOException e) {
            documentService.removeSourceDocument(aDocument);
            throw e;
        }
        catch (Exception e) {
            documentService.removeSourceDocument(aDocument);
            throw new IOException(e.getMessage(), e);
        }

        // Copy the original file into the repository
        File targetFile = documentService.getSourceDocumentFile(aDocument);
        FileUtils.forceMkdir(targetFile.getParentFile());
        FileUtils.copyFile(aFile, targetFile);

        // Copy the initial conversion of the file into the repository
        if (cas != null) {
            CasPersistenceUtils.writeSerializedCas(cas,
                    documentService.getCasFile(aDocument, INITIAL_CAS_PSEUDO_USER));
        }

        try (MDC.MDCCloseable closable = MDC.putCloseable(Logging.KEY_PROJECT_ID,
                String.valueOf(aDocument.getProject().getId()))) {
            Project project = aDocument.getProject();
            log.info("Imported training document [{}]({}) to project [{}]({})",
                    aDocument.getName(), aDocument.getId(), project.getName(), project.getId());
        }
    }

    /**
     * @return the sorted labels of all formats that have a non-blank label and a non-blank
     *         reader class configured.
     */
    @Override
    public List<String> getReadableFormatLabels()
    {
        return listFormatLabels(READER_SUFFIX);
    }

    /**
     * @return the id of a format whose label equals the given label, or the empty string if
     *         there is none.
     */
    @Override
    public String getReadableFormatId(String aLabel)
    {
        return findFormatIdByLabel(aLabel);
    }

    /**
     * @return a map from format id to reader class for all formats that have a non-blank label
     *         and a non-blank reader class configured.
     * @throws ClassNotFoundException
     *             if a configured reader class cannot be loaded.
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public Map<String, Class<CollectionReader>> getReadableFormats()
        throws ClassNotFoundException
    {
        Map<String, Class<CollectionReader>> readableFormats = new HashMap<>();
        for (String key : readWriteFileFormats.stringPropertyNames()) {
            if (!key.contains(LABEL_SUFFIX) || isBlank(readWriteFileFormats.getProperty(key))) {
                continue;
            }
            String formatId = key.substring(0, key.lastIndexOf(LABEL_SUFFIX));
            String readerClassName = readWriteFileFormats.getProperty(formatId + READER_SUFFIX);
            if (!isBlank(readerClassName)) {
                readableFormats.put(formatId, (Class) Class.forName(readerClassName));
            }
        }
        return readableFormats;
    }

    /**
     * @return the sorted labels of all formats that have a non-blank label and a non-blank
     *         writer class configured.
     */
    @Override
    public List<String> getWritableFormatLabels()
    {
        return listFormatLabels(WRITER_SUFFIX);
    }

    /**
     * @return the id of a format whose label equals the given label, or the empty string if
     *         there is none.
     */
    @Override
    public String getWritableFormatId(String aLabel)
    {
        return findFormatIdByLabel(aLabel);
    }

    /**
     * @return a map from format id to writer class for all formats that have a label key and a
     *         non-blank writer class configured. Unlike {@link #getReadableFormats()}, the label
     *         value itself is not required to be non-blank.
     * @throws ClassNotFoundException
     *             if a configured writer class cannot be loaded.
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public Map<String, Class<JCasAnnotator_ImplBase>> getWritableFormats()
        throws ClassNotFoundException
    {
        Map<String, Class<JCasAnnotator_ImplBase>> writableFormats = new HashMap<>();
        for (String key : readWriteFileFormats.stringPropertyNames()) {
            if (!key.contains(LABEL_SUFFIX)) {
                continue;
            }
            String formatId = key.substring(0, key.lastIndexOf(LABEL_SUFFIX));
            String writerClassName = readWriteFileFormats.getProperty(formatId + WRITER_SUFFIX);
            // A blank entry is treated like a missing one instead of failing in Class.forName
            if (!isBlank(writerClassName)) {
                writableFormats.put(formatId, (Class) Class.forName(writerClassName));
            }
        }
        return writableFormats;
    }

    /**
     * Converts the given file into a CAS using the reader registered for the given format. The
     * CAS is created with the merged project and built-in type systems. If the reader did not
     * create tokens and/or sentences, the missing ones are added using the
     * {@link BreakIteratorSegmenter}.
     *
     * @param aFile
     *            the file to import.
     * @param aProject
     *            the project providing the type system.
     * @param aFormat
     *            the id of the format as used in {@link #getReadableFormats()}.
     * @return the imported CAS.
     * @throws IOException
     *             if no reader is registered for the format or the reader does not find the
     *             file.
     */
    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public JCas importCasFromFile(File aFile, Project aProject, String aFormat)
        throws UIMAException, IOException, ClassNotFoundException
    {
        Class readerClass = getReadableFormats().get(aFormat);
        if (readerClass == null) {
            throw new IOException("No reader available for format [" + aFormat + "]");
        }

        // Prepare a CAS with the project type system. Work on a copy so that the list handed
        // out by the annotation service is not modified.
        List<TypeSystemDescription> typeSystems = new ArrayList<TypeSystemDescription>(
                annotationService.getProjectTypes(aProject));
        typeSystems.add(TypeSystemDescriptionFactory.createTypeSystemDescription());
        TypeSystemDescription allTypes = CasCreationUtils.mergeTypeSystems(typeSystems);
        CAS cas = JCasFactory.createJCas(allTypes).getCas();

        // Resolve the folder via the absolute file so a bare file name still has a parent
        String sourceFolder = aFile.getAbsoluteFile().getParentFile().getAbsolutePath();

        // Convert the source document to CAS
        CollectionReader reader = CollectionReaderFactory.createReader(readerClass,
                ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, sourceFolder,
                ResourceCollectionReaderBase.PARAM_PATTERNS,
                new String[] { "[+]" + aFile.getName() });
        if (!reader.hasNext()) {
            throw new FileNotFoundException("Annotation file [" + aFile.getName()
                    + "] not found in [" + sourceFolder + "]");
        }
        reader.getNext(cas);
        JCas jCas = cas.getJCas();

        // Create sentence / token annotations if they are missing
        boolean hasTokens = JCasUtil.exists(jCas, Token.class);
        boolean hasSentences = JCasUtil.exists(jCas, Sentence.class);
        if (!hasTokens || !hasSentences) {
            AnalysisEngine pipeline = createEngine(createEngineDescription(
                    BreakIteratorSegmenter.class,
                    BreakIteratorSegmenter.PARAM_WRITE_TOKEN, !hasTokens,
                    BreakIteratorSegmenter.PARAM_WRITE_SENTENCE, !hasSentences));
            pipeline.process(jCas);
        }

        return jCas;
    }

    /**
     * Writes the given CAS using the given writer. The writer output goes to a fresh temporary
     * directory so that every export resides in its own directory; this is useful as the
     * written file can have multiple extensions based on the writer class used. If the writer
     * produced more than one file, the files are packaged up as a ZIP file. The returned file is
     * located next to the temporary directory, which is removed before returning.
     *
     * @param cas
     *            the CAS to export; its document metadata and tagset descriptions are updated.
     * @param aDocument
     *            the source document the CAS belongs to.
     * @param aFileName
     *            the file name to record in the document metadata.
     * @param aWriter
     *            the writer class.
     * @param aStripExtension
     *            forwarded to the writer as its strip-extension parameter.
     * @return the exported file or ZIP archive.
     * @throws IOException
     *             if the temporary directory cannot be set up, the writer produced no output or
     *             the output cannot be copied/zipped.
     */
    @Override
    public File exportCasToFile(CAS cas, SourceDocument aDocument, String aFileName,
            @SuppressWarnings("rawtypes") Class aWriter, boolean aStripExtension)
        throws IOException, UIMAException
    {
        // Update the source file name in case it is changed for some reason. This is necessary
        // for the writers to create the files under the correct names.
        Project project = aDocument.getProject();
        File sourceFolder = new File(dir.getAbsolutePath() + PROJECT + project.getId()
                + DOCUMENT + aDocument.getId() + SOURCE);
        String sourceFolderUrl = sourceFolder.toURI().toURL().toExternalForm();
        DocumentMetaData documentMetadata = DocumentMetaData.get(cas.getJCas());
        documentMetadata.setDocumentUri(
                new File(sourceFolder, aFileName).toURI().toURL().toExternalForm());
        documentMetadata.setDocumentBaseUri(sourceFolderUrl);
        documentMetadata.setCollectionId(sourceFolderUrl);

        // Update with the correct tagset name
        for (AnnotationFeature feature : annotationService.listAnnotationFeature(project)) {
            TagSet tagSet = feature.getTagset();
            if (tagSet != null
                    && !feature.getLayer().getType().equals(WebAnnoConst.CHAIN_TYPE)) {
                updateCasWithTagSet(cas, feature.getLayer().getName(), tagSet.getName());
            }
        }

        File exportTempDir = File.createTempFile("webanno", "export");
        try {
            // Turn the temporary file into a directory of the same (unique) name
            if (!exportTempDir.delete() || !exportTempDir.mkdirs()) {
                throw new IOException(
                        "Unable to create temporary export folder [" + exportTempDir + "]");
            }

            AnalysisEngineDescription writer;
            if (TSV3_WRITER_CLASS_NAME.equals(aWriter.getName())) {
                writer = createTsv3WriterDescription(cas, project, aWriter, exportTempDir,
                        aStripExtension);
            }
            else {
                writer = createPlainWriterDescription(aWriter, exportTempDir, aStripExtension);
            }

            runPipeline(cas, writer);

            File[] exportedFiles = exportTempDir.listFiles();
            if (exportedFiles == null || exportedFiles.length == 0) {
                throw new IOException("Writer [" + aWriter.getName()
                        + "] did not produce any output for document [" + aDocument.getName()
                        + "]");
            }

            // If the writer produced more than one file, we package it up as a ZIP file
            File exportFile;
            if (exportedFiles.length > 1) {
                exportFile = new File(exportTempDir.getAbsolutePath() + ".zip");
                try {
                    ZipUtils.zipFolder(exportTempDir, exportFile);
                }
                catch (Exception e) {
                    // Do not hand out a path to an archive that was never (fully) written
                    throw new IOException("Unable to create zip file [" + exportFile + "]", e);
                }
            }
            else {
                exportFile = new File(exportTempDir.getParent(), exportedFiles[0].getName());
                FileUtils.copyFile(exportedFiles[0], exportFile);
            }

            return exportFile;
        }
        finally {
            // Clean up quietly so that a cleanup problem neither hides an exception thrown
            // above nor turns a successful export into a failure
            if (exportTempDir.exists() && !FileUtils.deleteQuietly(exportTempDir)) {
                log.warn("Unable to delete temporary export folder [{}]", exportTempDir);
            }
        }
    }

    /**
     * Creates the description of a writer that only needs the target location and the
     * strip-extension flag.
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private AnalysisEngineDescription createPlainWriterDescription(Class aWriter,
            File aTargetFolder, boolean aStripExtension)
        throws UIMAException
    {
        return createEngineDescription(aWriter,
                JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, aTargetFolder,
                JCasFileWriter_ImplBase.PARAM_STRIP_EXTENSION, aStripExtension);
    }

    /**
     * Creates the description of the WebAnno TSV3 writer, configured with those span, chain and
     * relation layers of the project for which the CAS actually contains annotations.
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private AnalysisEngineDescription createTsv3WriterDescription(CAS aCas, Project aProject,
            Class aWriter, File aTargetFolder, boolean aStripExtension)
        throws UIMAException
    {
        List<String> slotFeatures = new ArrayList<String>();
        List<String> slotTargets = new ArrayList<String>();
        List<String> linkTypes = new ArrayList<String>();
        Set<String> spanLayers = new HashSet<String>();
        Set<String> slotLayers = new HashSet<String>();
        List<String> chainLayers = new ArrayList<String>();
        List<String> relationLayers = new ArrayList<String>();

        for (AnnotationLayer layer : annotationService.listAnnotationLayer(aProject)) {
            String layerType = layer.getType();
            String layerName = layer.getName();

            if (WebAnnoConst.SPAN_TYPE.equals(layerType)) {
                // TSV will not use layers without annotations
                if (!annotationExists(aCas, layerName)) {
                    continue;
                }
                // A span layer having at least one role-bearing link feature is a slot layer
                boolean isSlotLayer = false;
                for (AnnotationFeature f : annotationService.listAnnotationFeature(layer)) {
                    if (MultiValueMode.ARRAY.equals(f.getMultiValueMode())
                            && LinkMode.WITH_ROLE.equals(f.getLinkMode())) {
                        isSlotLayer = true;
                        slotFeatures.add(layerName + ":" + f.getName());
                        slotTargets.add(f.getType());
                        linkTypes.add(f.getLinkTypeName());
                    }
                }
                if (isSlotLayer) {
                    slotLayers.add(layerName);
                }
                else {
                    spanLayers.add(layerName);
                }
            }
            else if (WebAnnoConst.CHAIN_TYPE.equals(layerType)) {
                if (chainAnnotationExists(aCas, layerName + "Chain")) {
                    chainLayers.add(layerName);
                }
            }
            else if (WebAnnoConst.RELATION_TYPE.equals(layerType)) {
                // TSV will not use layers without annotations
                if (annotationExists(aCas, layerName)) {
                    relationLayers.add(layerName);
                }
            }
        }
        spanLayers.addAll(slotLayers);

        return createEngineDescription(aWriter,
                JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, aTargetFolder,
                JCasFileWriter_ImplBase.PARAM_STRIP_EXTENSION, aStripExtension,
                "spanLayers", spanLayers,
                "slotFeatures", slotFeatures,
                "slotTargets", slotTargets,
                "linkTypes", linkTypes,
                "chainLayers", chainLayers,
                "relationLayers", relationLayers);
    }

    /**
     * Collects the sorted labels of all formats which have a non-blank label and a non-blank
     * property with the given suffix (reader or writer class).
     */
    private List<String> listFormatLabels(String aImplementationSuffix)
    {
        List<String> labels = new ArrayList<String>();
        for (String key : readWriteFileFormats.stringPropertyNames()) {
            String label = readWriteFileFormats.getProperty(key);
            if (!key.contains(LABEL_SUFFIX) || isBlank(label)) {
                continue;
            }
            String formatId = key.substring(0, key.lastIndexOf(LABEL_SUFFIX));
            if (!isBlank(readWriteFileFormats.getProperty(formatId + aImplementationSuffix))) {
                labels.add(label);
            }
        }
        Collections.sort(labels);
        return labels;
    }

    /**
     * Looks up the id of a format carrying the given (non-blank) label. Returns the empty
     * string if no label property matches.
     */
    private String findFormatIdByLabel(String aLabel)
    {
        for (String key : readWriteFileFormats.stringPropertyNames()) {
            String label = readWriteFileFormats.getProperty(key);
            if (key.contains(LABEL_SUFFIX) && !isBlank(label) && label.equals(aLabel)) {
                return key.substring(0, key.lastIndexOf(LABEL_SUFFIX));
            }
        }
        return "";
    }

    /**
     * Checks whether the CAS contains at least one annotation of the given type. A type that is
     * not part of the type system of the CAS cannot have annotations.
     */
    private boolean annotationExists(CAS aCas, String aType)
    {
        Type type = aCas.getTypeSystem().getType(aType);
        return type != null && !CasUtil.select(aCas, type).isEmpty();
    }

    /**
     * Checks whether the CAS contains at least one feature structure of the given (chain) type.
     * A type that is not part of the type system of the CAS cannot have instances.
     */
    private boolean chainAnnotationExists(CAS aCas, String aType)
    {
        Type type = aCas.getTypeSystem().getType(aType);
        return type != null && !CasUtil.selectFS(aCas, type).isEmpty();
    }

    /**
     * Check if a TAB-Sep training file is in correct format before importing: every non-blank
     * line has to consist of exactly two TAB-separated columns. A file that cannot be read is
     * reported as not correct.
     */
    private boolean isTabSepFileFormatCorrect(File aFile)
    {
        // try-with-resources makes sure the file handle is released on every path
        try (FileReader reader = new FileReader(aFile)) {
            LineIterator lines = new LineIterator(reader);
            while (lines.hasNext()) {
                String line = lines.next();
                if (line.trim().length() == 0) {
                    continue;
                }
                if (line.split("\t").length != 2) {
                    return false;
                }
            }
            return true;
        }
        catch (Exception e) {
            return false;
        }
    }

    /**
     * A Helper method to add {@link TagsetDescription} to {@link CAS}. If the CAS already
     * contains a description for the layer, its tagset name is updated if it differs; otherwise
     * a new description is created.
     *
     * @param aCas
     *            the CAS.
     * @param aLayer
     *            the layer.
     * @param aTagSetName
     *            the tagset.
     */
    private static void updateCasWithTagSet(CAS aCas, String aLayer, String aTagSetName)
    {
        Type tagsetType = CasUtil.getType(aCas, TagsetDescription.class);
        Feature layerFeature = tagsetType.getFeatureByBaseName("layer");
        Feature nameFeature = tagsetType.getFeatureByBaseName("name");

        // Modify the name of an existing tagset description for this layer, if there is one
        for (FeatureStructure fs : CasUtil.select(aCas, tagsetType)) {
            String layer = fs.getStringValue(layerFeature);
            // Descriptions without a layer cannot match and are skipped
            if (layer == null || !layer.equals(aLayer)) {
                continue;
            }
            // Only touch the CAS if the tagset name is changed
            if (!aTagSetName.equals(fs.getStringValue(nameFeature))) {
                fs.setStringValue(nameFeature, aTagSetName);
                // NOTE(review): the description is already indexed here; confirm whether
                // adding it to the indexes again is required
                aCas.addFsToIndexes(fs);
            }
            return;
        }

        // No description for this layer yet: create one
        FeatureStructure fs = aCas.createFS(tagsetType);
        fs.setStringValue(layerFeature, aLayer);
        fs.setStringValue(nameFeature, aTagSetName);
        aCas.addFsToIndexes(fs);
    }
}