package com.vistatec.ocelot.xliff.freme;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;

import org.slf4j.Logger;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;

import com.vistatec.ocelot.its.model.EnrichmentMetaData;
import com.vistatec.ocelot.its.model.TerminologyMetaData;
import com.vistatec.ocelot.its.model.TextAnalysisMetaData;
import com.vistatec.ocelot.segment.model.BaseSegmentVariant;
import com.vistatec.ocelot.segment.model.OcelotSegment;
import com.vistatec.ocelot.segment.model.enrichment.ELinkEnrichmentsConstants;
import com.vistatec.ocelot.segment.model.enrichment.Enrichment;
import com.vistatec.ocelot.segment.model.enrichment.EntityEnrichment;
import com.vistatec.ocelot.segment.model.enrichment.LinkEnrichment;
import com.vistatec.ocelot.segment.model.enrichment.TerminologyEnrichment;
/**
 * This class should be extended by all classes providing methods for converting
 * XLIFF file tags into enrichments. It also provides methods for converting
 * enrichments to ITS meta data to be displayed in the tables in the Ocelot left
 * panel.
 */
public abstract class EnrichmentConverter {

    /** The logger. */
    protected final Logger logger;

    /** The source language (possibly a locale string such as "en-US"). */
    private final String sourceLang;

    /** The target language (possibly a locale string such as "it-IT"). */
    private final String targetLang;

    /**
     * Constructor.
     *
     * @param sourceLang
     *            the source language.
     * @param targetLang
     *            the target language.
     * @param logger
     *            the logger.
     */
    public EnrichmentConverter(String sourceLang, String targetLang,
            Logger logger) {
        this.sourceLang = sourceLang;
        this.targetLang = targetLang;
        this.logger = logger;
    }

    /**
     * Retrieves the triple enrichments (link and terminology) from triples
     * formatted as a JSON-LD string. Terminology enrichments from the
     * <code>enrichments</code> list that are matched against the triples are
     * removed from that list, since they are superseded by the enrichments
     * built here.
     *
     * @param jsonString
     *            the triples formatted as a JSON-LD string
     * @param enrichments
     *            the list of enrichments retrieved so far
     * @param language
     *            the language assigned to any link enrichment built here
     * @return the list of enrichments built from the triples.
     */
    protected List<Enrichment> retrieveTriplesEnrichments(String jsonString,
            List<Enrichment> enrichments, String language) {
        List<Enrichment> triplesEnrichments = new ArrayList<Enrichment>();
        Model tripleModel = ModelFactory.createDefaultModel();
        tripleModel.read(new StringReader(jsonString), null,
                EnrichmentAnnotationsConstants.JSON_LD_FORMAT);
        ResIterator resourcesIt = tripleModel.listSubjects();
        if (resourcesIt != null) {
            Resource currRes = null;
            while (resourcesIt.hasNext()) {
                currRes = resourcesIt.next();
                // Check if an enrichment (entity or terminology) exists for
                // the current resource.
                Enrichment enrichment = findEnrichmentForURI(enrichments,
                        currRes.getURI());
                if (enrichment != null) {
                    if (enrichment.getType().equals(Enrichment.ENTITY_TYPE)) {
                        // The resource is related to an entity enrichment -->
                        // then the triples represent a link enrichment.
                        LinkEnrichment link = new LinkEnrichment(
                                enrichment.getOffsetStartIdx(),
                                enrichment.getOffsetEndIdx(), language);
                        ELinkEnrichmentsConstants.fillLinkEnrichment(link,
                                tripleModel, currRes.getURI());
                        triplesEnrichments.add(link);
                    } else if (enrichment.getType().equals(
                            Enrichment.TERMINOLOGY_TYPE)) {
                        // The resource is related to a terminology enrichment
                        // --> then the triples represent sources, targets and
                        // senses for this terminology enrichment.
                        triplesEnrichments.addAll(buildTerminologyEnrichments(
                                tripleModel, currRes.getURI(),
                                enrichment.getOffsetStartIdx(),
                                enrichment.getOffsetEndIdx()));
                        // The matched placeholder enrichment is superseded by
                        // the enrichments just built from the triples.
                        enrichments.remove(enrichment);
                    }
                }
            }
        }
        return triplesEnrichments;
    }

    /**
     * Builds all the terminology enrichments represented by the triples model.
     *
     * @param tripleModel
     *            the triples model.
     * @param termResURI
     *            the terminology resource URI
     * @param offsetStartIdx
     *            the terminology enrichment offset start index
     * @param offsetEndIdx
     *            the terminology enrichment offset end index
     * @return the list of terminology enrichments; one enrichment is created
     *         for each statement of the resource whose source term could be
     *         resolved.
     */
    private List<Enrichment> buildTerminologyEnrichments(Model tripleModel,
            String termResURI, int offsetStartIdx, int offsetEndIdx) {
        List<Enrichment> termEnrichments = new ArrayList<Enrichment>();
        Resource mainRes = tripleModel.createResource(termResURI);
        StmtIterator mainStmtIt = tripleModel.listStatements(mainRes, null,
                (RDFNode) null);
        List<Statement> tripleStmts = null;
        TerminologyEnrichment termEnrich = null;
        while (mainStmtIt.hasNext()) {
            Statement mainStmt = mainStmtIt.next();
            tripleStmts = new ArrayList<Statement>();
            tripleStmts.add(mainStmt);
            String sense = findSense(tripleModel, mainStmt, tripleStmts);
            String[] sourceTarget = findSourceAndTarget(tripleModel, mainStmt,
                    tripleStmts);
            // Only statements providing a source term produce an enrichment.
            if (sourceTarget[0] != null) {
                termEnrich = new TerminologyEnrichment();
                termEnrich.setSourceTerm(sourceTarget[0]);
                termEnrich.setTargetTerm(sourceTarget[1]);
                termEnrich.setSense(sense);
                termEnrich.setTermTriples(tripleStmts);
                termEnrich.setOffsetStartIdx(offsetStartIdx);
                termEnrich.setOffsetEndIdx(offsetEndIdx);
                termEnrich.setTermInfoRef(termResURI);
                termEnrichments.add(termEnrich);
            }
        }
        return termEnrichments;
    }

    /**
     * Finds the enrichment related to the URI passed as parameter. It could be
     * either an entity or a terminology enrichment.
     *
     * @param enrichments
     *            the list of enrichments.
     * @param uri
     *            the URI
     * @return the first enrichment related to the URI, or <code>null</code> if
     *         none matches.
     */
    private Enrichment findEnrichmentForURI(List<Enrichment> enrichments,
            String uri) {
        Enrichment enrichment = null;
        if (enrichments != null) {
            for (Enrichment currEnrich : enrichments) {
                if (currEnrich.getType().equals(Enrichment.ENTITY_TYPE)) {
                    // Entity enrichments match on their entity URL.
                    if (uri.equals(((EntityEnrichment) currEnrich)
                            .getEntityURL())) {
                        enrichment = currEnrich;
                        break;
                    }
                } else if (currEnrich.getType().equals(
                        Enrichment.TERMINOLOGY_TYPE)) {
                    // Terminology enrichments match on their term info
                    // reference.
                    if (uri.equals(((TerminologyEnrichment) currEnrich)
                            .getTermInfoRef())) {
                        enrichment = currEnrich;
                        break;
                    }
                }
            }
        }
        return enrichment;
    }

    /**
     * Finds the sense for the current terminology triple by looking up the
     * <code>rdfs:comment</code> property of the statement's object resource.
     *
     * @param tripleModel
     *            the triples model
     * @param mainTermStmt
     *            the terminology triple
     * @param tripleStmts
     *            the list of triples statements related to this terminology
     *            enrichment; the sense statement, if found, is added to it
     * @return the sense if it exists; <code>null</code> otherwise
     */
    private String findSense(Model tripleModel, Statement mainTermStmt,
            List<Statement> tripleStmts) {
        String sense = null;
        StmtIterator senseStmtIt = tripleModel.listStatements(mainTermStmt
                .getObject().asResource(), tripleModel.createProperty(
                "http://www.w3.org/2000/01/rdf-schema#", "comment"),
                (RDFNode) null);
        Statement senseStmt = null;
        if (senseStmtIt != null && senseStmtIt.hasNext()) {
            senseStmt = senseStmtIt.next();
            sense = senseStmt.getObject().asLiteral().getString();
            tripleStmts.add(senseStmt);
        }
        return sense;
    }

    /**
     * Finds source and target for the current terminology triple, by following
     * <code>ontolex:reference</code> statements back to their
     * <code>ontolex:writtenRep</code> literals and matching the literal
     * language against the converter's source and target languages.
     *
     * @param tripleModel
     *            the triples model.
     * @param mainTermStmt
     *            the terminology main statement.
     * @param tripleStmts
     *            the list of triples related to this terminology enrichment;
     *            the traversed statements are added to it.
     * @return an array of strings containing the source at the first index and
     *         the target at the second index; either entry may be
     *         <code>null</code> when no literal in that language was found.
     */
    protected String[] findSourceAndTarget(Model tripleModel,
            Statement mainTermStmt, List<Statement> tripleStmts) {
        String[] sourceTarget = new String[2];
        // Literal languages in the model carry no region subtag, so compare
        // against the primary language only (e.g. "en" from "en-US").
        String sourceLanguage = getPrimaryLanguage(sourceLang);
        String targetLanguage = getPrimaryLanguage(targetLang);
        StmtIterator referenceStmtIt = tripleModel.listStatements(null,
                tripleModel.createProperty(
                        "http://www.w3.org/ns/lemon/ontolex#", "reference"),
                mainTermStmt.getObject());
        if (referenceStmtIt != null) {
            Statement referenceStmt = null;
            while (referenceStmtIt.hasNext()) {
                referenceStmt = referenceStmtIt.next();
                tripleStmts.add(referenceStmt);
                // The written representation hangs off the canonical form
                // resource, whose URI mirrors the sense resource URI.
                String sourceURI = referenceStmt.getSubject().getURI()
                        .replace("#Sense", "#CanonicalForm");
                StmtIterator termIt = tripleModel.listStatements(tripleModel
                        .createResource(sourceURI), tripleModel.createProperty(
                        "http://www.w3.org/ns/lemon/ontolex#", "writtenRep"),
                        (RDFNode) null);
                if (termIt != null && termIt.hasNext()) {
                    Statement sourcTgtStmt = termIt.next();
                    tripleStmts.add(sourcTgtStmt);
                    if (sourceLanguage.equals(sourcTgtStmt.getObject()
                            .asLiteral().getLanguage())) {
                        sourceTarget[0] = sourcTgtStmt.getObject().asLiteral()
                                .getString();
                    } else if (targetLanguage.equals(sourcTgtStmt.getObject()
                            .asLiteral().getLanguage())) {
                        sourceTarget[1] = sourcTgtStmt.getObject().asLiteral()
                                .getString();
                    }
                }
            }
        }
        return sourceTarget;
    }

    /**
     * Strips the region subtag from a language code.
     *
     * @param lang
     *            a language code, e.g. "en" or "en-US"
     * @return the primary language, e.g. "en".
     */
    private static String getPrimaryLanguage(String lang) {
        int hyphenIdx = lang.indexOf("-");
        return hyphenIdx != -1 ? lang.substring(0, hyphenIdx) : lang;
    }

    /**
     * Converts the enrichments assigned to a segment to a list of ITS meta
     * data, for both the source and the target variants.
     *
     * @param segment
     *            the Ocelot segment.
     */
    public void convertEnrichments2ITSMetadata(OcelotSegment segment) {
        if (segment.getSource() instanceof BaseSegmentVariant) {
            convertEnrichment2ITSMetaData(segment,
                    (BaseSegmentVariant) segment.getSource(),
                    EnrichmentMetaData.SOURCE);
        }
        // instanceof is false for null, so no explicit null check is needed.
        if (segment.getTarget() instanceof BaseSegmentVariant) {
            convertEnrichment2ITSMetaData(segment,
                    (BaseSegmentVariant) segment.getTarget(),
                    EnrichmentMetaData.TARGET);
        }
    }

    /**
     * Converts the enrichments assigned to a specific variant to a list of ITS
     * meta data. Text-analysis meta data for an entity already listed on the
     * segment is merged into the existing entry; terminology meta data is only
     * added if not already present.
     *
     * @param segment
     *            the Ocelot segment
     * @param variant
     *            the variant
     * @param segmentPart
     *            a string stating the part of the segment involved.
     */
    public static void convertEnrichment2ITSMetaData(OcelotSegment segment,
            BaseSegmentVariant variant, String segmentPart) {
        Set<Enrichment> variantEnrichments = variant.getEnirchments();
        if (variantEnrichments != null) {
            String sourceText = variant.getDisplayText();
            TextAnalysisMetaData taAnnot = null;
            TerminologyMetaData termAnnot = null;
            for (Enrichment enrich : variantEnrichments) {
                if (enrich.getType().equals(Enrichment.ENTITY_TYPE)) {
                    taAnnot = createTaMetaData((EntityEnrichment) enrich,
                            sourceText, segmentPart);
                    TextAnalysisMetaData existingMetaData = findTaMetaData(
                            taAnnot.getEntity(), segment.getTextAnalysis(),
                            segmentPart);
                    if (existingMetaData == null) {
                        segment.addTextAnalysis(taAnnot);
                    } else {
                        existingMetaData.merge(taAnnot);
                    }
                } else if (enrich.getType().equals(Enrichment.TERMINOLOGY_TYPE)) {
                    termAnnot = createTermMetaData(
                            (TerminologyEnrichment) enrich, sourceText,
                            segmentPart);
                    if (!segment.getTerms().contains(termAnnot)) {
                        segment.addTerm(termAnnot);
                    }
                }
            }
        }
    }

    /**
     * Removes from the segment all the meta data related to the enrichments of
     * a variant. Text-analysis meta data is cleared field by field and only
     * deleted once empty; terminology meta data is removed directly.
     *
     * @param segment
     *            the segment
     * @param variant
     *            the variant
     * @param target
     *            <code>true</code> if the variant is the target;
     *            <code>false</code> if it is the source.
     */
    public static void removeEnrichmentMetaData(OcelotSegment segment,
            BaseSegmentVariant variant, boolean target) {
        if (variant.getEnirchments() != null) {
            // These values do not change across iterations.
            String sourceText = variant.getDisplayText();
            String segPart = target ? TextAnalysisMetaData.TARGET
                    : TextAnalysisMetaData.SOURCE;
            TerminologyMetaData termAnnot = null;
            TextAnalysisMetaData taAnnot = null;
            for (Enrichment enrich : variant.getEnirchments()) {
                if (enrich.getType().equals(Enrichment.ENTITY_TYPE)) {
                    taAnnot = createTaMetaData((EntityEnrichment) enrich,
                            sourceText, null);
                    TextAnalysisMetaData existingMetaData = findTaMetaData(
                            taAnnot.getEntity(), segment.getTextAnalysis(),
                            segPart);
                    if (existingMetaData != null) {
                        // Clear only the fields this enrichment contributed.
                        if (taAnnot.getTaAnnotatorsRef() != null) {
                            existingMetaData.setTaAnnotatorsRef(null);
                        }
                        if (taAnnot.getTaClassRef() != null) {
                            existingMetaData.setTaClassRef(null);
                        }
                        if (taAnnot.getTaConfidence() != null) {
                            existingMetaData.setTaConfidence(null);
                        }
                        if (taAnnot.getTaIdentRef() != null) {
                            existingMetaData.setTaIdentRef(null);
                        }
                        if (existingMetaData.isEmpty()) {
                            segment.removeTextAnalysis(existingMetaData);
                        }
                    }
                } else if (enrich.getType().equals(Enrichment.TERMINOLOGY_TYPE)) {
                    termAnnot = createTermMetaData(
                            (TerminologyEnrichment) enrich, sourceText,
                            segPart);
                    segment.removeTerm(termAnnot);
                }
            }
        }
    }

    /**
     * Finds the Text-Analysis meta data related to a specific entity and
     * segment part.
     *
     * @param entity
     *            the entity string
     * @param metaDataList
     *            the list of meta data
     * @param segPart
     *            the segment part the meta data must belong to
     * @return the Text-Analysis meta data related to the entity, or
     *         <code>null</code> if none matches.
     */
    private static TextAnalysisMetaData findTaMetaData(String entity,
            List<TextAnalysisMetaData> metaDataList, String segPart) {
        TextAnalysisMetaData taMetaData = null;
        if (metaDataList != null) {
            for (TextAnalysisMetaData metaData : metaDataList) {
                // Objects.equals guards against entries whose entity or
                // segment part is null, which would otherwise throw an NPE.
                if (Objects.equals(metaData.getEntity(), entity)
                        && Objects.equals(metaData.getSegPart(), segPart)) {
                    taMetaData = metaData;
                    break;
                }
            }
        }
        return taMetaData;
    }

    /**
     * Creates a text-analysis meta data starting from an entity enrichment.
     *
     * @param enrichment
     *            the enrichment
     * @param wholeText
     *            the whole text assigned to the involved segment part
     * @param segmentPart
     *            a string stating which segment part is involved (source,
     *            target or segment)
     * @return the text-analysis meta data
     */
    private static TextAnalysisMetaData createTaMetaData(
            EntityEnrichment enrichment, String wholeText, String segmentPart) {
        TextAnalysisMetaData taAnnot = new TextAnalysisMetaData();
        taAnnot.setTaIdentRef(enrichment.getEntityURL());
        // The entity text is the enriched portion of the variant text.
        taAnnot.setEntity(wholeText.substring(enrichment.getOffsetStartIdx(),
                enrichment.getOffsetEndIdx()));
        taAnnot.setTaAnnotatorsRef(enrichment.getAnnotatorRef());
        taAnnot.setSegPart(segmentPart);
        return taAnnot;
    }

    /**
     * Creates a terminology meta data starting from a terminology enrichment.
     *
     * @param enrichment
     *            the terminology enrichment
     * @param wholeText
     *            the whole text assigned to the involved segment part
     * @param segmentPart
     *            a string stating which segment part is involved (source,
     *            target or segment)
     * @return the terminology meta data
     */
    private static TerminologyMetaData createTermMetaData(
            TerminologyEnrichment enrichment, String wholeText,
            String segmentPart) {
        TerminologyMetaData termAnnot = new TerminologyMetaData();
        termAnnot.setAnnotatorsRef(enrichment.getAnnotator());
        termAnnot.setSense(enrichment.getSense());
        termAnnot.setTermSource(enrichment.getSourceTerm());
        termAnnot.setTermTarget(enrichment.getTargetTerm());
        // The term text is the enriched portion of the variant text.
        termAnnot.setTerm(wholeText.substring(enrichment.getOffsetStartIdx(),
                enrichment.getOffsetEndIdx()));
        termAnnot.setSegPart(segmentPart);
        return termAnnot;
    }
}