package com.vistatec.ocelot.xliff.freme;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import net.sf.okapi.common.ISkeleton;
import net.sf.okapi.common.annotation.GenericAnnotation;
import net.sf.okapi.common.annotation.GenericAnnotationType;
import net.sf.okapi.common.resource.Code;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextPart;
import org.slf4j.LoggerFactory;
import com.vistatec.ocelot.segment.model.enrichment.Enrichment;
import com.vistatec.ocelot.segment.model.enrichment.EntityEnrichment;
import com.vistatec.ocelot.segment.model.enrichment.LinkEnrichment;
import com.vistatec.ocelot.segment.model.enrichment.TerminologyEnrichment;
/**
 * Converts the annotations carried by XLIFF 1.2 inline codes, together with
 * the JSON-LD content found in the text unit skeleton, to enrichments.
 */
public class EnrichmentConverterXLIFF12 extends EnrichmentConverter {

    /**
     * Prefix searched for in a code's outer data in order to locate the
     * terminology info reference value.
     */
    private static final String REF_ATTRIBUTE_PREFIX = "ref=\"";

    /**
     * Constructor.
     *
     * @param sourceLang
     *            the XLIFF file source language
     * @param targetLang
     *            the XLIFF file target language
     */
    public EnrichmentConverterXLIFF12(String sourceLang, String targetLang) {
        super(sourceLang, targetLang, LoggerFactory
                .getLogger(EnrichmentConverterXLIFF12.class));
    }

    /**
     * Checks if a code carries at least one generic annotation.
     *
     * @param code
     *            the code
     * @return <code>true</code> if the code has a non-empty annotation set;
     *         <code>false</code> otherwise.
     */
    private boolean hasAnnotations(Code code) {
        return code.getGenericAnnotations() != null
                && code.getGenericAnnotations().size() > 0;
    }

    /**
     * Retrieves enrichments from a specific XLIFF 1.2 text unit. Codes embedded
     * into the text container and the text unit skeleton are inspected. Codes
     * whose annotations have all been converted to enrichments are removed
     * from the fragment they belong to.
     *
     * @param textContainer
     *            the text container; when <code>null</code> only the skeleton
     *            is inspected
     * @param textUnit
     *            the text unit owning the skeleton; when <code>null</code> the
     *            skeleton inspection is skipped
     * @param language
     *            the language forwarded to the JSON triples conversion
     * @return the list of retrieved enrichments
     */
    public List<Enrichment> retrieveEnrichments(TextContainer textContainer,
            ITextUnit textUnit, String language) {

        List<Enrichment> enrichments = new ArrayList<Enrichment>();
        if (textContainer != null) {
            // Only the length is used: it is the running offset of the text
            // seen so far across all the parts.
            StringBuilder wholeText = new StringBuilder();
            List<Enrichment> currEnrichments = new ArrayList<Enrichment>();
            for (TextPart part : textContainer.getParts()) {
                TextFragment text = part.getContent();
                String codedText = text.getCodedText();
                // Codes are collected per part: a code must only be removed
                // from the fragment it was found in.
                List<Code> codesToRemove = new ArrayList<Code>();
                Code openingCode = null;
                for (int i = 0; i < codedText.length(); i++) {
                    switch (codedText.charAt(i)) {
                    case TextFragment.MARKER_OPENING: {
                        // the character following a marker encodes the index
                        // of the code in the fragment code list.
                        int codeIdx = TextFragment.toIndex(codedText
                                .charAt(++i));
                        openingCode = text.getCodes().get(codeIdx);
                        if (hasAnnotations(openingCode)) {
                            currEnrichments.addAll(convertAnnots2Enrichments(
                                    openingCode, wholeText.length()));
                            // if all the annotations from this code have been
                            // translated to enrichments, the code must be
                            // removed.
                            if (!hasAnnotations(openingCode)) {
                                codesToRemove.add(openingCode);
                            }
                        }
                        break;
                    }
                    case TextFragment.MARKER_CLOSING: {
                        int codeIdx = TextFragment.toIndex(codedText
                                .charAt(++i));
                        Code closingCode = text.getCodes().get(codeIdx);
                        if (hasAnnotations(closingCode)) {
                            // check annotations in the closing code.
                            // NOTE(review): the closing code is compared with
                            // the most recently seen opening code of this
                            // part, which is only its real counterpart when
                            // codes are not nested - confirm.
                            manageClosingCodeAnnots(closingCode, openingCode);
                            if (!hasAnnotations(closingCode)) {
                                codesToRemove.add(closingCode);
                            }
                        }
                        // update end index for all current enrichments.
                        for (Enrichment enrich : currEnrichments) {
                            enrich.setOffsetEndIdx(wholeText.length());
                        }
                        enrichments.addAll(currEnrichments);
                        currEnrichments.clear();
                        break;
                    }
                    default:
                        // NOTE(review): any marker other than opening/closing
                        // (and the index character following it) also ends up
                        // here and therefore counts towards the offsets -
                        // confirm this matches how offsets are consumed.
                        wholeText.append(codedText.charAt(i));
                        break;
                    }
                }
                for (Code code : codesToRemove) {
                    text.removeCode(code);
                }
            }
        }
        // retrieve all triple enrichments.
        ISkeleton skeleton = (textUnit != null) ? textUnit.getSkeleton() : null;
        enrichments.addAll(retrieveTriplesEnrichments(skeleton, enrichments,
                language));
        return enrichments;
    }

    /**
     * Converts all the annotations included in a code to proper enrichments.
     * The converted annotations are removed from the code.
     *
     * @param code
     *            the code
     * @param startOffsetIndex
     *            the offset start index for the enrichments to be created.
     * @return the list of created enrichments.
     */
    private List<Enrichment> convertAnnots2Enrichments(Code code,
            int startOffsetIndex) {

        List<Enrichment> enrichments = new ArrayList<Enrichment>();
        if (code.getGenericAnnotations() == null) {
            return enrichments;
        }
        List<GenericAnnotation> annotationsToDelete = new ArrayList<GenericAnnotation>();
        String entityAnnotator = null;
        String termAnnotator = null;
        Iterator<GenericAnnotation> annotationsIt = code
                .getGenericAnnotations().iterator();
        while (annotationsIt.hasNext()) {
            GenericAnnotation annotation = annotationsIt.next();
            switch (annotation.getType()) {
            // Entity Enrichment
            case GenericAnnotationType.TA: {
                if (annotation.getString(GenericAnnotationType.TA_IDENT) != null) {
                    enrichments.add(convertAnnotation2EntityEnrichment(
                            annotation, entityAnnotator, startOffsetIndex));
                    annotationsToDelete.add(annotation);
                }
                break;
            }
            // Terminology Enrichment
            case GenericAnnotationType.TERM: {
                enrichments.add(convertAnnotation2TerminologyEnrichment(
                        annotation, termAnnotator, startOffsetIndex, code));
                annotationsToDelete.add(annotation);
                break;
            }
            // Annotators Ref
            case GenericAnnotationType.ANNOT: {
                String annotValue = annotation
                        .getString(GenericAnnotationType.ANNOT_VALUE);
                if (annotValue != null) {
                    // A single value may reference both kinds of annotators;
                    // the annotation is queued for deletion only once.
                    boolean converted = false;
                    if (annotValue
                            .contains(EnrichmentAnnotationsConstants.TERM_ANNOTATORS_REF_STRING)) {
                        termAnnotator = manageTermAnnotatorsRef(annotValue,
                                enrichments);
                        converted = true;
                    }
                    if (annotValue
                            .contains(EnrichmentAnnotationsConstants.TA_ANNOTATORS_REF_STRING)) {
                        entityAnnotator = manageTAAnnotatorsRef(annotValue,
                                enrichments);
                        converted = true;
                    }
                    if (converted) {
                        annotationsToDelete.add(annotation);
                    }
                }
                break;
            }
            default:
                break;
            }
        }
        // removal is deferred so that the annotation set is not modified
        // while it is being iterated.
        for (GenericAnnotation annot : annotationsToDelete) {
            code.getGenericAnnotations().remove(annot);
        }
        return enrichments;
    }

    /**
     * Converts the annotation to an Entity Enrichment.
     *
     * @param annotation
     *            the annotation; its <code>TA_IDENT</code> value must not be
     *            <code>null</code>
     * @param entityAnnotator
     *            the entity annotator; ignored when <code>null</code>
     * @param startOffsetIndex
     *            the enrichment offset start index
     * @return the Entity Enrichment.
     */
    private EntityEnrichment convertAnnotation2EntityEnrichment(
            GenericAnnotation annotation, String entityAnnotator,
            int startOffsetIndex) {

        String value = annotation.getString(GenericAnnotationType.TA_IDENT);
        if (value.startsWith(GenericAnnotationType.REF_PREFIX)) {
            value = getRefString(value);
        }
        logger.debug("Found an Entity Enrichment with value \"{}\".", value);
        EntityEnrichment enrichment = new EntityEnrichment(value);
        enrichment.setOffsetStartIdx(startOffsetIndex);
        if (entityAnnotator != null) {
            enrichment.setAnnotatorRef(entityAnnotator);
        }
        return enrichment;
    }

    /**
     * Converts the annotation to a Terminology Enrichment. The term info
     * reference is read from the first <code>ref="..."</code> occurrence in
     * the code outer data; it is left unset when the outer data has no such
     * occurrence or the value is not terminated by a double quote.
     *
     * @param annotation
     *            the annotation
     * @param termAnnotator
     *            the term annotator; ignored when <code>null</code>
     * @param startOffsetIndex
     *            the enrichment offset start index
     * @param code
     *            the code
     * @return the terminology enrichment.
     */
    private TerminologyEnrichment convertAnnotation2TerminologyEnrichment(
            GenericAnnotation annotation, String termAnnotator,
            int startOffsetIndex, Code code) {

        TerminologyEnrichment termEnrichment = new TerminologyEnrichment();
        termEnrichment.setOffsetStartIdx(startOffsetIndex);
        if (termAnnotator != null) {
            termEnrichment.setAnnotator(termAnnotator);
        }
        String outerData = code.getOuterData();
        if (outerData != null) {
            // The search result is tested before the prefix length is added,
            // otherwise a missing prefix could never be detected.
            int prefixIdx = outerData.indexOf(REF_ATTRIBUTE_PREFIX);
            if (prefixIdx != -1) {
                int valueStart = prefixIdx + REF_ATTRIBUTE_PREFIX.length();
                int valueEnd = outerData.indexOf('"', valueStart);
                if (valueEnd != -1) {
                    termEnrichment.setTermInfoRef(outerData.substring(
                            valueStart, valueEnd));
                }
            }
        }
        return termEnrichment;
    }

    /**
     * Gets the referenced value for a specific string of the type "REF:...".
     *
     * @param refValue
     *            the reference value string
     * @return the text following the reference prefix, or the string itself
     *         when it does not contain the prefix.
     */
    private String getRefString(String refValue) {
        int refIdx = refValue.indexOf(GenericAnnotationType.REF_PREFIX);
        if (refIdx == -1) {
            return refValue;
        }
        return refValue.substring(refIdx
                + GenericAnnotationType.REF_PREFIX.length());
    }

    /**
     * Extracts the annotator reference introduced by a specific prefix. The
     * reference spans from the end of the prefix to the next blank, or to the
     * end of the string if there is no blank.
     *
     * @param annotValue
     *            the annotation value; it must contain the prefix
     * @param refPrefix
     *            the prefix introducing the annotator reference
     * @return the annotator reference.
     */
    private String extractAnnotatorRef(String annotValue, String refPrefix) {
        int prefixIdx = annotValue.indexOf(refPrefix);
        int endIndex = annotValue.indexOf(" ", prefixIdx);
        if (endIndex == -1) {
            endIndex = annotValue.length();
        }
        return annotValue.substring(prefixIdx + refPrefix.length(), endIndex);
    }

    /**
     * Manages the text-analysis annotators ref. This is the case the annotators
     * ref value is "text-analysis|...". The annotator is also assigned to the
     * entity enrichments already included in the list.
     *
     * @param annotValue
     *            the annotation value
     * @param enrichments
     *            the list of enrichments.
     * @return the text-analysis annotator ref.
     */
    private String manageTAAnnotatorsRef(String annotValue,
            List<Enrichment> enrichments) {

        String entityAnnotator = extractAnnotatorRef(annotValue,
                EnrichmentAnnotationsConstants.TA_ANNOTATORS_REF_STRING);
        for (Enrichment entityEnrich : findEnrichments(Enrichment.ENTITY_TYPE,
                enrichments)) {
            ((EntityEnrichment) entityEnrich).setAnnotatorRef(entityAnnotator);
        }
        return entityAnnotator;
    }

    /**
     * Manages the Term annotators ref. This is the case the annotators ref
     * value is "terminology|...". The annotator is also assigned to the
     * terminology enrichments already included in the list.
     *
     * @param annotValue
     *            the annotation value
     * @param enrichments
     *            the list of enrichments
     * @return the terminology annotators ref.
     */
    private String manageTermAnnotatorsRef(String annotValue,
            List<Enrichment> enrichments) {

        String termAnnotator = extractAnnotatorRef(annotValue,
                EnrichmentAnnotationsConstants.TERM_ANNOTATORS_REF_STRING);
        for (Enrichment termEnrich : findEnrichments(
                Enrichment.TERMINOLOGY_TYPE, enrichments)) {
            ((TerminologyEnrichment) termEnrich).setAnnotator(termAnnotator);
        }
        return termAnnotator;
    }

    /**
     * Finds all the enrichments of a specific type among a list of enrichments.
     *
     * @param type
     *            the type
     * @param enrichments
     *            the list of enrichments; may be <code>null</code>
     * @return the list of enrichments of the specified type; never
     *         <code>null</code>.
     */
    private List<Enrichment> findEnrichments(String type,
            List<Enrichment> enrichments) {

        List<Enrichment> retEnrichments = new ArrayList<Enrichment>();
        if (enrichments != null) {
            for (Enrichment currEnrich : enrichments) {
                if (currEnrich.getType().equals(type)) {
                    retEnrichments.add(currEnrich);
                }
            }
        }
        return retEnrichments;
    }

    /**
     * Checks if the annotations in the closing code are still listed among the
     * annotations of the related opening code. An annotation is deleted from
     * the closing code if the opening code has no annotation of the same type.
     * When no opening code is available the closing code is left untouched.
     *
     * @param closingCode
     *            the closing code.
     * @param openingCode
     *            the opening code; may be <code>null</code>.
     */
    private void manageClosingCodeAnnots(Code closingCode, Code openingCode) {
        if (openingCode == null
                || closingCode.getGenericAnnotations() == null) {
            return;
        }
        List<GenericAnnotation> annotToDelete = new ArrayList<GenericAnnotation>();
        Iterator<GenericAnnotation> annotIt = closingCode
                .getGenericAnnotations().iterator();
        while (annotIt.hasNext()) {
            GenericAnnotation annot = annotIt.next();
            if (!existAnnotInCode(openingCode, annot)) {
                annotToDelete.add(annot);
            }
        }
        // removal is deferred so that the annotation set is not modified
        // while it is being iterated.
        for (GenericAnnotation annotation : annotToDelete) {
            closingCode.getGenericAnnotations().remove(annotation);
        }
    }

    /**
     * Checks if a code has an annotation of the same type as the given one.
     *
     * @param code
     *            the code; may be <code>null</code>
     * @param annot
     *            the annotation
     * @return <code>true</code> if the code has an annotation with the same
     *         type; <code>false</code> otherwise.
     */
    private boolean existAnnotInCode(Code code, GenericAnnotation annot) {
        if (code == null || code.getGenericAnnotations() == null) {
            return false;
        }
        Iterator<GenericAnnotation> annotIt = code.getGenericAnnotations()
                .iterator();
        while (annotIt.hasNext()) {
            if (annotIt.next().getType().equals(annot.getType())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Retrieves triple enrichments from the skeleton. The skeleton string is
     * searched for the JSON tag name; the text between the end of that opening
     * tag and the following closing marker tag is handed over to the
     * string-based conversion. Nothing is retrieved if the skeleton is
     * <code>null</code>, does not include the JSON tag, or the tag is
     * malformed.
     *
     * @param skeleton
     *            the skeleton; may be <code>null</code>
     * @param enrichments
     *            all the enrichments found so far.
     * @param language
     *            the language forwarded to the string-based conversion
     * @return the list of enrichments retrieved from the skeleton; never
     *         <code>null</code>.
     */
    private List<Enrichment> retrieveTriplesEnrichments(ISkeleton skeleton,
            List<Enrichment> enrichments, String language) {

        List<Enrichment> triplesEnrichments = new ArrayList<Enrichment>();
        if (skeleton == null) {
            return triplesEnrichments;
        }
        String skelString = skeleton.toString();
        if (skelString == null) {
            return triplesEnrichments;
        }
        int tagNameIdx = skelString
                .indexOf(EnrichmentAnnotationsConstants.JSON_TAG_NAME);
        if (tagNameIdx == -1) {
            return triplesEnrichments;
        }
        int openTagEndIdx = skelString.indexOf(">", tagNameIdx);
        if (openTagEndIdx == -1) {
            logger.warn("The JSON tag in the skeleton is not closed: triple enrichments are skipped.");
            return triplesEnrichments;
        }
        int jsonStartIdx = openTagEndIdx + 1;
        int jsonEndIdx = skelString.indexOf("</" + LinkEnrichment.MARKER_TAG,
                jsonStartIdx);
        if (jsonEndIdx == -1) {
            logger.warn("No closing tag found for the JSON content in the skeleton: triple enrichments are skipped.");
            return triplesEnrichments;
        }
        String stringJson = skelString.substring(jsonStartIdx, jsonEndIdx);
        triplesEnrichments.addAll(retrieveTriplesEnrichments(stringJson,
                enrichments, language));
        return triplesEnrichments;
    }
}