package com.vistatec.ocelot.xliff.freme;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import net.sf.okapi.lib.xliff2.core.ExtContent;
import net.sf.okapi.lib.xliff2.core.ExtElement;
import net.sf.okapi.lib.xliff2.core.ExtElements;
import net.sf.okapi.lib.xliff2.core.Fragment;
import net.sf.okapi.lib.xliff2.core.MTag;
import net.sf.okapi.lib.xliff2.core.Unit;
import net.sf.okapi.lib.xliff2.its.DataCategory;
import net.sf.okapi.lib.xliff2.its.TermTag;
import net.sf.okapi.lib.xliff2.its.TextAnalysis;

import org.slf4j.LoggerFactory;

import com.vistatec.ocelot.segment.model.enrichment.Enrichment;
import com.vistatec.ocelot.segment.model.enrichment.EntityEnrichment;
import com.vistatec.ocelot.segment.model.enrichment.TerminologyEnrichment;

/**
 * This class provides methods for converting XLIFF 2.0 tags to enrichments.
 */
public class EnrichmentConverterXLIFF20 extends EnrichmentConverter {

    /**
     * Constructor.
     *
     * @param sourceLang
     *            the XLIFF file source language.
     * @param targetLang
     *            the XLIFF file target language.
     */
    public EnrichmentConverterXLIFF20(String sourceLang, String targetLang) {
        super(sourceLang, targetLang, LoggerFactory
                .getLogger(EnrichmentConverterXLIFF20.class));
    }

    /**
     * Retrieves the enrichments from a XLIFF 2.0 unit.
     * <p>
     * The coded text of the fragment is scanned once. Opening markers that
     * carry ITS items (or are term tags) start an enrichment, and the
     * matching closing marker completes it. Markers left without ITS items
     * are removed from the fragment and their characters are stripped from
     * the coded text, so this method modifies <code>fragment</code>. The
     * enrichments found in the unit's extension elements are appended at the
     * end.
     *
     * @param unit
     *            the unit
     * @param fragment
     *            the current involved fragment; when <code>null</code> an
     *            empty list is returned
     * @param language
     *            the language passed on when reading the triple enrichments
     * @return the list of enrichments for this unit.
     */
    public List<Enrichment> retrieveEnrichments(Unit unit, Fragment fragment,
            String language) {

        List<Enrichment> enrichments = new ArrayList<Enrichment>();
        if (fragment == null) {
            return enrichments;
        }

        // Text accumulated so far; its length supplies enrichment offsets.
        StringBuilder wholeText = new StringBuilder();
        // Enrichments whose opening marker was seen but not yet closed.
        List<EnrichmentWrapper> currEnrichments = new ArrayList<EnrichmentWrapper>();
        // Coded-text indexes to drop, collected in ascending order.
        List<Integer> codePositionToRemove = new ArrayList<Integer>();
        List<DataCategory> dataCategoryToDelete = new ArrayList<DataCategory>();

        String termAnnotator = null;
        if (unit.getAnnotatorsRef() != null) {
            termAnnotator = unit.getAnnotatorsRef().get("terminology");
        }

        String codedText = fragment.getCodedText();
        for (int textIdx = 0; textIdx < codedText.length(); textIdx++) {
            switch (codedText.charAt(textIdx)) {
            case Fragment.MARKER_OPENING: {
                MTag openingTag = fragment.getMTag(codedText, textIdx);
                if (openingTag != null
                        && (openingTag.hasITSItem() || openingTag instanceof TermTag)) {
                    manageMarkerOpeningXliff2_0(openingTag, currEnrichments,
                            dataCategoryToDelete, wholeText.toString(),
                            termAnnotator);
                    if (!openingTag.hasITSItem()) {
                        int tagKey = fragment.getTags().getKey(openingTag);
                        fragment.getTags().remove(tagKey);
                        // Drop this character and the next one, and skip the
                        // next one in the scan.
                        codePositionToRemove.add(textIdx++);
                        codePositionToRemove.add(textIdx);
                    }
                }
                // NOTE(review): when the marker is kept, the character that
                // follows it is handled by the default branch on the next
                // iteration and appended to wholeText, which shifts later
                // offsets. Confirm this is intended.
                break;
            }
            case Fragment.MARKER_CLOSING: {
                MTag closingTag = fragment.getMTag(codedText, textIdx);
                if (closingTag == null) {
                    // Same tolerance as the opening branch: an unresolved
                    // marker is ignored instead of failing with a
                    // NullPointerException.
                    break;
                }
                EnrichmentWrapper enrichmentWrapper = findEnrichmentByTagId(
                        closingTag.getId(), currEnrichments);
                if (enrichmentWrapper != null) {
                    enrichmentWrapper.getEnrichment().setOffsetEndIdx(
                            wholeText.length());
                    currEnrichments.remove(enrichmentWrapper);
                    enrichments.add(enrichmentWrapper.getEnrichment());
                    if (closingTag.getITSItems() != null
                            && !closingTag.getITSItems().isEmpty()) {
                        // Remove from the closing tag the data categories
                        // already consumed on an opening tag.
                        for (DataCategory annot : dataCategoryToDelete) {
                            if (annot.equals(closingTag.getITSItems().get(
                                    annot.getClass()))) {
                                closingTag.getITSItems().remove(annot);
                            }
                        }
                    }
                    if (!closingTag.hasITSItem()) {
                        fragment.remove(closingTag);
                        codePositionToRemove.add(textIdx++);
                        codePositionToRemove.add(textIdx);
                    }
                }
                break;
            }
            default:
                wholeText.append(codedText.charAt(textIdx));
                break;
            }
        }

        fragment.setCodedText(dropCodedTextPositions(codedText,
                codePositionToRemove));
        enrichments.addAll(retrieveTriplesEnrichments(unit.getExtElements(),
                enrichments, language));
        return enrichments;
    }

    /**
     * Builds a copy of the coded text without the characters found at the
     * given positions.
     *
     * @param codedText
     *            the original coded text
     * @param positions
     *            the indexes to drop, in ascending order
     * @return the coded text without the characters at those indexes.
     */
    private static String dropCodedTextPositions(String codedText,
            List<Integer> positions) {

        StringBuilder stripped = new StringBuilder(codedText.length());
        int copyFrom = 0;
        for (Integer position : positions) {
            stripped.append(codedText, copyFrom, position);
            copyFrom = position + 1;
        }
        stripped.append(codedText, copyFrom, codedText.length());
        return stripped.toString();
    }

    /**
     * Manages an opening marker for XLIFF 2.0: depending on the type of the
     * tag, the proper enrichment is created.
     * <p>
     * A tag with ITS items is inspected for a text-analysis annotation: an
     * entity enrichment is created when the annotation has an identifier
     * reference, and the annotation is removed from the tag when it holds no
     * other data. A tag without ITS items that is a term tag produces a
     * terminology enrichment.
     *
     * @param tag
     *            the tag
     * @param currEnrichments
     *            the list of current enrichments
     * @param dataCategoryToDelete
     *            the list of data category to be deleted
     * @param wholeText
     *            the current text; its length is used as the start offset
     * @param termAnnotator
     *            the term annotator.
     */
    private void manageMarkerOpeningXliff2_0(MTag tag,
            List<EnrichmentWrapper> currEnrichments,
            List<DataCategory> dataCategoryToDelete, String wholeText,
            String termAnnotator) {

        if (tag.getITSItems() != null) {
            TextAnalysis taAnnot = (TextAnalysis) tag.getITSItems().get(
                    TextAnalysis.class);
            if (taAnnot != null) {
                if (taAnnot.getTaIdentRef() != null) {
                    EntityEnrichment entityEnr = new EntityEnrichment(
                            taAnnot.getTaIdentRef());
                    entityEnr.setAnnotatorRef(taAnnot.getAnnotatorRef());
                    entityEnr.setOffsetStartIdx(wholeText.length());
                    currEnrichments.add(new EnrichmentWrapper(entityEnr, tag
                            .getId()));
                }
                if (canDeleteTAAnnotation(taAnnot)) {
                    tag.getITSItems().remove(taAnnot);
                    dataCategoryToDelete.add(taAnnot);
                }
            }
        } else if (tag instanceof TermTag) {
            // NOTE(review): a term tag whose ITS items are non-null never
            // reaches this branch. Confirm that is intended.
            TerminologyEnrichment termEnric = new TerminologyEnrichment();
            termEnric.setOffsetStartIdx(wholeText.length());
            if (termAnnotator != null) {
                termEnric.setAnnotator(termAnnotator);
            }
            termEnric.setTermInfoRef(((TermTag) tag).getTermInfoRef());
            currEnrichments.add(new EnrichmentWrapper(termEnric, tag.getId()));
        }
    }

    /**
     * Checks if the text-analysis annotation can be deleted, that is if its
     * class reference, confidence, identifier and source are all missing or
     * empty.
     *
     * @param taAnnot
     *            the text-analysis annotation
     * @return <code>true</code> if it can be deleted; <code>false</code>
     *         otherwise
     */
    private boolean canDeleteTAAnnotation(TextAnalysis taAnnot) {

        return (taAnnot.getTaClassRef() == null || taAnnot.getTaClassRef()
                .isEmpty())
                && taAnnot.getTaConfidence() == null
                && (taAnnot.getTaIdent() == null || taAnnot.getTaIdent()
                        .isEmpty())
                && (taAnnot.getTaSource() == null || taAnnot.getTaSource()
                        .isEmpty());
    }

    /**
     * Finds the enrichment retrieved from a specific tag.
     *
     * @param tagId
     *            the tag ID; a <code>null</code> ID matches nothing
     * @param enrichmentList
     *            the list of enrichments; may be <code>null</code>
     * @return the first wrapper whose tag ID equals <code>tagId</code>;
     *         <code>null</code> if there is none.
     */
    private EnrichmentWrapper findEnrichmentByTagId(String tagId,
            List<EnrichmentWrapper> enrichmentList) {

        if (enrichmentList == null || tagId == null) {
            return null;
        }
        for (EnrichmentWrapper candidate : enrichmentList) {
            // Compare from the non-null side so a wrapper without a tag ID
            // cannot raise a NullPointerException.
            if (tagId.equals(candidate.getTagId())) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Retrieves the triple enrichments for a XLIFF file 2.0. The triple
     * enrichments are those enrichments expressed in triples (link and
     * terminology).
     * <p>
     * Only extension elements named <code>ex:json-ld</code> whose first
     * child is text content are read; that text is handed to the
     * string-based overload.
     *
     * @param elements
     *            the extra elements in the current XLIFF 2.0 unit
     * @param enrichments
     *            the list of enrichments found so far.
     * @param language
     *            the language passed on to the string-based overload
     * @return the enrichments read from the extension elements; empty if
     *         there are none.
     */
    private List<Enrichment> retrieveTriplesEnrichments(
            final ExtElements elements, final List<Enrichment> enrichments,
            String language) {

        List<Enrichment> triplesEnrichments = new ArrayList<Enrichment>();
        if (elements == null) {
            return triplesEnrichments;
        }
        Iterator<ExtElement> elemsIt = elements.iterator();
        while (elemsIt.hasNext()) {
            ExtElement elem = elemsIt.next();
            boolean isJsonLd = "ex".equals(elem.getQName().getPrefix())
                    && "json-ld".equals(elem.getQName().getLocalPart());
            if (isJsonLd && !elem.getChildren().isEmpty()
                    && elem.getChildren().get(0) instanceof ExtContent) {
                String jsonString = ((ExtContent) elem.getChildren().get(0))
                        .getText();
                triplesEnrichments.addAll(retrieveTriplesEnrichments(
                        jsonString, enrichments, language));
            }
        }
        return triplesEnrichments;
    }
}

/**
 * Wrapper class for enrichments. It contains information about the enrichment
 * and the ID of the tag from which the enrichment has been retrieved.
 */
class EnrichmentWrapper {

    /** The enrichment. */
    private Enrichment enrichment;

    /** The tag ID. */
    private String tagId;

    /**
     * Constructor.
     *
     * @param enrichment
     *            the enrichment
     * @param tagId
     *            the tag ID
     */
    public EnrichmentWrapper(Enrichment enrichment, String tagId) {
        this.enrichment = enrichment;
        this.tagId = tagId;
    }

    /**
     * Gets the enrichment.
     *
     * @return the enrichment.
     */
    public Enrichment getEnrichment() {
        return enrichment;
    }

    /**
     * Sets the enrichment.
     *
     * @param enrichment
     *            the enrichment
     */
    public void setEnrichment(Enrichment enrichment) {
        this.enrichment = enrichment;
    }

    /**
     * Gets the tag ID.
     *
     * @return the tag ID.
     */
    public String getTagId() {
        return tagId;
    }

    /**
     * Sets the tag ID.
     *
     * @param tagId
     *            the tag ID.
     */
    public void setTagId(String tagId) {
        this.tagId = tagId;
    }
}