/* * Copyright (C) 2013-2015, VistaTEC or third-party contributors as indicated * by the @author tags or express copyright attribution statements applied by * the authors. All third-party contributions are distributed under license by * VistaTEC. * * This file is part of Ocelot. * * Ocelot is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Ocelot is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, write to: * * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 * USA * * Also, see the full LGPL text here: <http://www.gnu.org/copyleft/lesser.html> */ package com.vistatec.ocelot.xliff.okapi; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import net.sf.okapi.common.Event; import net.sf.okapi.common.FileUtil; import net.sf.okapi.common.IResource; import net.sf.okapi.common.LocaleId; import net.sf.okapi.common.annotation.AltTranslation; import net.sf.okapi.common.annotation.AltTranslationsAnnotation; import net.sf.okapi.common.annotation.GenericAnnotation; import net.sf.okapi.common.annotation.GenericAnnotationType; import net.sf.okapi.common.annotation.GenericAnnotations; import net.sf.okapi.common.annotation.ITSLQIAnnotations; import net.sf.okapi.common.annotation.ITSProvenanceAnnotations; import net.sf.okapi.common.annotation.XLIFFPhase; import net.sf.okapi.common.annotation.XLIFFPhaseAnnotation; import net.sf.okapi.common.annotation.XLIFFTool; import net.sf.okapi.common.annotation.XLIFFToolAnnotation; import net.sf.okapi.common.resource.ITextUnit; import net.sf.okapi.common.resource.Property; import net.sf.okapi.common.resource.RawDocument; import net.sf.okapi.common.resource.StartSubDocument; import net.sf.okapi.common.resource.TextContainer; import net.sf.okapi.filters.xliff.Parameters; import net.sf.okapi.filters.xliff.XLIFFFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.vistatec.ocelot.its.model.LanguageQualityIssue; import com.vistatec.ocelot.its.model.OtherITSMetadata; import com.vistatec.ocelot.its.model.Provenance; import com.vistatec.ocelot.its.model.TextAnalysisMetaData; import com.vistatec.ocelot.its.model.okapi.OkapiProvenance; import com.vistatec.ocelot.rules.DataCategoryField; import com.vistatec.ocelot.rules.StateQualifier; import com.vistatec.ocelot.segment.model.BaseSegmentVariant; import com.vistatec.ocelot.segment.model.OcelotSegment; import com.vistatec.ocelot.segment.model.enrichment.Enrichment; import com.vistatec.ocelot.segment.model.okapi.Note; import com.vistatec.ocelot.segment.model.okapi.Notes; import com.vistatec.ocelot.segment.model.okapi.OkapiSegment; import com.vistatec.ocelot.segment.model.okapi.TextContainerVariant; import com.vistatec.ocelot.xliff.XLIFFParser; import com.vistatec.ocelot.xliff.freme.EnrichmentConverterXLIFF12; /** * Parse XLIFF file for use in the workbench. The Event list is used when * writing out files through Okapi; updates to the workbench segments must then * be reflected(synchronized) in the proper Event. */ public class OkapiXLIFF12Parser implements XLIFFParser { private static Logger LOG = LoggerFactory .getLogger(OkapiXLIFF12Parser.class); private LinkedList<Event> events; private XLIFFFilter filter; private int documentSegmentNum; private String sourceLang, targetLang; private EnrichmentConverterXLIFF12 enrichmentConverter; @Override public String getSourceLang() { return this.sourceLang; } public void setSourceLang(String sourceLang) { this.sourceLang = sourceLang; } @Override public String getTargetLang() { return this.targetLang; } public void setTargetLang(String targetLang) { this.targetLang = targetLang; } public Event getSegmentEvent(int segEventNumber) { return this.events.get(segEventNumber); } public List<Event> getSegmentEvents() { return this.events; } @Override public List<OcelotSegment> parse(File xliffFile) throws IOException { events = new LinkedList<Event>(); List<OcelotSegment> segments = new LinkedList<OcelotSegment>(); documentSegmentNum = 1; List<String> locales = FileUtil.guessLanguages(xliffFile .getAbsolutePath()); LocaleId sourceLocale = null, targetLocale = null; sourceLocale = (locales.size() >= 1) ? LocaleId.fromString(locales .get(0)) : LocaleId.EMPTY; targetLocale = (locales.size() >= 2) ? LocaleId.fromString(locales .get(1)) : LocaleId.EMPTY; FileInputStream is = new FileInputStream(xliffFile); RawDocument fileDoc = new RawDocument(is, "UTF-8", sourceLocale, targetLocale); this.filter = new XLIFFFilter(); Parameters filterParams = new Parameters(); filterParams.setAddAltTrans(true); this.filter.setParameters(filterParams); this.filter.open(fileDoc); int fileEventNum = 0; while (this.filter.hasNext()) { Event event = this.filter.next(); events.add(event); if (event.isStartSubDocument()) { StartSubDocument fileElement = (StartSubDocument) event .getResource(); XLIFFToolAnnotation toolAnn = fileElement .getAnnotation(XLIFFToolAnnotation.class); if (toolAnn == null) { toolAnn = new XLIFFToolAnnotation(); fileElement.setAnnotation(toolAnn); } if (toolAnn.get("Ocelot") == null) { toolAnn.add(new XLIFFTool("Ocelot", "Ocelot"), fileElement); } if (fileElement.getProperty("sourceLanguage") != null) { String fileSourceLang = fileElement.getProperty( "sourceLanguage").getValue(); if (getSourceLang() != null && !getSourceLang().equals(fileSourceLang)) { LOG.warn("Mismatch between source languages in file elements"); } setSourceLang(fileSourceLang); fileDoc.setSourceLocale(LocaleId.fromString(fileSourceLang)); } if (fileElement.getProperty("targetLanguage") != null) { String fileTargetLang = fileElement.getProperty( "targetLanguage").getValue(); if (getTargetLang() != null && !getTargetLang().equals(fileTargetLang)) { LOG.warn("Mismatch between target languages in file elements"); } setTargetLang(fileTargetLang); fileDoc.setTargetLocale(LocaleId.fromString(fileTargetLang)); } enrichmentConverter = new EnrichmentConverterXLIFF12(sourceLang, targetLang); } else if (event.isTextUnit()) { ITextUnit tu = (ITextUnit) event.getResource(); segments.add(convertTextUnitToSegment(tu, fileEventNum)); } fileEventNum++; } is.close(); return segments; } public OkapiSegment convertTextUnitToSegment(ITextUnit tu, int fileEventNum) { TextContainer srcTu = tu.getSource(); TextContainer tgtTu = new TextContainer(); Set<LocaleId> targetLocales = tu.getTargetLocales(); if (targetLocales.size() > 1) { LocaleId chosenTargetLocale = targetLocales.iterator().next(); LOG.warn("More than 1 target locale: " + targetLocales); LOG.warn("Using target locale '" + chosenTargetLocale + "'"); tgtTu = tu.getTarget(chosenTargetLocale); } else if (targetLocales.size() == 1) { for (LocaleId tgt : targetLocales) { tgtTu = tu.getTarget(tgt); } } else { tu.setTarget(LocaleId.fromString(getTargetLang()), tgtTu); } TextContainer oriTgtTu = retrieveOriginalTarget(tgtTu); List<Enrichment> sourceEnrichments = enrichmentConverter .retrieveEnrichments(srcTu, tu, LocaleId.fromString(getSourceLang()).getLanguage()); List<Enrichment> targetEnrichments = enrichmentConverter .retrieveEnrichments(tgtTu, tu, LocaleId.fromString(getTargetLang()).getLanguage()); List<Enrichment> originalTargetEnrichments = enrichmentConverter .retrieveEnrichments(oriTgtTu, tu, LocaleId.fromString(getTargetLang()).getLanguage()); OkapiSegment.Builder segBuilder = new OkapiSegment.Builder() .segmentNumber(documentSegmentNum++) .eventNumber(fileEventNum) .source(new TextContainerVariant(srcTu)) .target(new TextContainerVariant(tgtTu)) .originalTarget( oriTgtTu != null ? new TextContainerVariant(oriTgtTu) : null).tuId(tu.getId()); Property stateQualifier = tgtTu.getProperty("state-qualifier"); if (stateQualifier != null) { StateQualifier sq = StateQualifier.get(stateQualifier.getValue()); if (sq != null) { segBuilder.stateQualifier(sq); } else { LOG.info("Ignoring state-qualifier value '" + stateQualifier.getValue() + "'"); } } XLIFFPhaseAnnotation phaseAnn = tu .getAnnotation(XLIFFPhaseAnnotation.class); if (phaseAnn != null) { XLIFFPhase refPhase = phaseAnn.getReferencedPhase(); segBuilder.phaseName(refPhase.getPhaseName()); } OkapiSegment segment = segBuilder.build(); if (segment.getSource() instanceof BaseSegmentVariant && !sourceEnrichments.isEmpty()) { ((BaseSegmentVariant) segment.getSource()) .addEnrichmentList(sourceEnrichments); ((BaseSegmentVariant) segment.getSource()).setEnriched(true); } if (segment.getTarget() instanceof BaseSegmentVariant && !targetEnrichments.isEmpty()) { ((BaseSegmentVariant) segment.getTarget()) .addEnrichmentList(targetEnrichments); } if (segment.getOriginalTarget() instanceof BaseSegmentVariant && !originalTargetEnrichments.isEmpty()) { ((BaseSegmentVariant) segment.getOriginalTarget()) .addEnrichmentList(originalTargetEnrichments); } List<Enrichment> totEnrichments = new ArrayList<Enrichment>( sourceEnrichments); totEnrichments.addAll(targetEnrichments); readNotes(segment, tu); return attachITSDataToSegment(segment, tu, srcTu, tgtTu, totEnrichments); } private void readNotes(OkapiSegment seg, ITextUnit tu) { Property p = tu.getProperty(Property.NOTE); if (p != null) { // XLIFF 1.2 doesn't support note IDs, so we always display notes Notes notes = new Notes(); notes.add(new Note(Note.OCELOT_ID_PREFIX + "1", p.getValue())); seg.setNotes(notes); } } private OkapiSegment attachITSDataToSegment(OkapiSegment seg, ITextUnit tu, TextContainer srcTu, TextContainer tgtTu, List<Enrichment> enrichments) { ITSLQIAnnotations lqiAnns = retrieveITSLQIAnnotations(tu, srcTu, tgtTu); List<LanguageQualityIssue> lqiList = new ArrayList<>(); for (GenericAnnotation ga : lqiAnns .getAnnotations(GenericAnnotationType.LQI)) { lqiList.add(new LanguageQualityIssue(ga)); } seg.addAllLQI(lqiList); ITSProvenanceAnnotations provAnns = retrieveITSProvAnnotations(tu, srcTu, tgtTu); List<GenericAnnotation> provAnnList = provAnns .getAnnotations(GenericAnnotationType.PROV); if (provAnnList != null) { List<Provenance> provList = new ArrayList<>(); for (GenericAnnotation ga : provAnnList) { provList.add(new OkapiProvenance(ga)); } seg.addAllProvenance(provList); } List<OtherITSMetadata> otherList = new ArrayList<OtherITSMetadata>(); // otherList.addAll(EnrichmentConverter.convertEnrichments2ITSMetadata(enrichments, // seg)); enrichmentConverter.convertEnrichments2ITSMetadata(seg); // seg.addAllTextAnalysis(retrieveITSTAAnnotations(tu, srcTu, tgtTu)); if (tgtTu != null) { for (GenericAnnotation mtAnn : retrieveITSMTConfidenceAnnotations(tgtTu)) { otherList .add(new OtherITSMetadata( DataCategoryField.MT_CONFIDENCE, mtAnn.getDouble(GenericAnnotationType.MTCONFIDENCE_VALUE))); } } seg.addAllOtherITSMetadata(otherList); return seg; } private List<TextAnalysisMetaData> retrieveITSTAAnnotations(ITextUnit tu, TextContainer srcTu, TextContainer tgtTu) { List<TextAnalysisMetaData> taAnnotations = new ArrayList<TextAnalysisMetaData>(); // Iterable<IAnnotation> annotations = tu.getAnnotations(); // OK!!!! taAnnotations.addAll(createTaAnnotations(tu.getAnnotation(GenericAnnotations.class), TextAnalysisMetaData.SEGMENT)); taAnnotations.addAll(createTaAnnotations(srcTu.getAnnotation(GenericAnnotations.class), TextAnalysisMetaData.SOURCE)); if (tgtTu != null) { taAnnotations.addAll(createTaAnnotations(tgtTu.getAnnotation(GenericAnnotations.class), TextAnalysisMetaData.TARGET)); } // tu.getAnnotation(GenericAnnotation) return taAnnotations; } private List<TextAnalysisMetaData> createTaAnnotations( GenericAnnotations annotations, String entityType) { List<TextAnalysisMetaData> taAnnotations = new ArrayList<TextAnalysisMetaData>(); if (annotations != null) { Iterator<GenericAnnotation> annotsIt = annotations.iterator(); GenericAnnotation annot = null; TextAnalysisMetaData taAnnot = null; while (annotsIt.hasNext()) { annot = annotsIt.next(); if (annot.getType().equals(GenericAnnotationType.TA)) { taAnnot = new TextAnalysisMetaData(); taAnnot.setEntity(entityType); // taAnnot.setTaAnnotatorsRef(taAnnotatorsRef); taAnnot.setTaClassRef(annot .getString(GenericAnnotationType.TA_CLASS)); taAnnot.setTaConfidence(annot .getDouble(GenericAnnotationType.TA_CONFIDENCE)); taAnnot.setTaIdentRef(annot .getString(GenericAnnotationType.TA_IDENT)); taAnnotations.add(taAnnot); } else if (annot.getType().equals(GenericAnnotationType.ANNOT)) { String annotValue = annot.getString(GenericAnnotationType.ANNOT_VALUE); if(annotValue != null && annotValue.startsWith("text-analysis|")){ int index = annotValue.indexOf("text-analysis|") + "text-analysis|".length(); taAnnot = new TextAnalysisMetaData(); taAnnot.setTaAnnotatorsRef(annotValue.substring(index)); taAnnotations.add(taAnnot); } } } } return taAnnotations; } public ITSLQIAnnotations retrieveITSLQIAnnotations(ITextUnit tu, TextContainer srcTu, TextContainer tgtTu) { ITSLQIAnnotations lqiAnns = tu.getAnnotation(ITSLQIAnnotations.class); lqiAnns = lqiAnns == null ? new ITSLQIAnnotations() : lqiAnns; ITSLQIAnnotations srcLQIAnns = srcTu .getAnnotation(ITSLQIAnnotations.class); if (srcLQIAnns != null) { lqiAnns.addAll(srcLQIAnns); } if (tgtTu != null) { ITSLQIAnnotations tgtLQIAnns = tgtTu .getAnnotation(ITSLQIAnnotations.class); if (tgtLQIAnns != null) { lqiAnns.addAll(tgtLQIAnns); } } return lqiAnns; } public ITSProvenanceAnnotations retrieveITSProvAnnotations(ITextUnit tu, TextContainer srcTu, TextContainer tgtTu) { ITSProvenanceAnnotations provAnns = tu .getAnnotation(ITSProvenanceAnnotations.class); provAnns = provAnns == null ? new ITSProvenanceAnnotations() : provAnns; ITSProvenanceAnnotations srcProvAnns = srcTu .getAnnotation(ITSProvenanceAnnotations.class); if (srcProvAnns != null) { provAnns.addAll(srcProvAnns); } if (tgtTu != null) { ITSProvenanceAnnotations tgtProvAnns = tgtTu .getAnnotation(ITSProvenanceAnnotations.class); if (tgtProvAnns != null) { provAnns.addAll(tgtProvAnns); } } return provAnns; } public List<GenericAnnotation> retrieveITSMTConfidenceAnnotations( TextContainer tgtTu) { GenericAnnotations tgtAnns = tgtTu .getAnnotation(GenericAnnotations.class); List<GenericAnnotation> mtAnns = new LinkedList<GenericAnnotation>(); if (tgtAnns != null) { mtAnns = tgtAnns.getAnnotations(GenericAnnotationType.MTCONFIDENCE); } return mtAnns; } public TextContainer retrieveOriginalTarget(TextContainer target) { AltTranslationsAnnotation altTrans = target .getAnnotation(AltTranslationsAnnotation.class); if (altTrans != null) { Iterator<AltTranslation> iterAltTrans = altTrans.iterator(); while (iterAltTrans.hasNext()) { AltTranslation altTran = iterAltTrans.next(); // Check if alt-trans is Ocelot generated. XLIFFTool altTool = altTran.getTool(); if (altTool != null && altTool.getName().equals("Ocelot")) { // We should be able to replace this with |return // altTrans.getTarget;| // once an issue with the XLIFF reader is fixed (Okapi 412). ITextUnit tu = altTran.getEntry(); for (LocaleId trg : tu.getTargetLocales()) { return altTran.getTarget(); // If there is a target // return it } // No target: create one empty return tu.createTarget( LocaleId.fromString(getTargetLang()), true, IResource.CREATE_EMPTY); } } } return null; } protected XLIFFFilter getFilter() { return this.filter; } }