/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.utils;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.project.ttc.engines.BilingualAligner.RequiresSize2Exception;
import eu.project.ttc.engines.morpho.CompoundUtils;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.models.index.TermMeasure;
import eu.project.ttc.resources.BilingualDictionary;
/**
 * Static helpers for bilingual alignment: context-vector translation
 * strategies and single-lemma-term decomposition.
 *
 * <p>All methods are stateless; this class is not meant to be instantiated.
 */
public class AlignerUtils {

	private static final Logger LOGGER = LoggerFactory.getLogger(AlignerUtils.class);

	/** Distributes source cooccurrence counts over candidate translations pro rata of their target frequencies. */
	public static final int TRANSLATION_STRATEGY_PRORATA = 1;

	/** Gives all source cooccurrences to the most frequent candidate translation. */
	public static final int TRANSLATION_STRATEGY_MOST_FREQUENT = 2;

	/** Gives all source cooccurrences to the candidate translation with the best specificity (WR) measure. */
	public static final int TRANSLATION_STRATEGY_MOST_SPECIFIC = 3;

	/**
	 * Splits source cooccurrences equally among all candidate translations.
	 *
	 * <p>Made {@code public}: it was previously {@code private} although the
	 * Javadoc of {@link #translateVector} advertised it as a selectable
	 * strategy, so callers could never actually pass it.
	 */
	public static final int TRANSLATION_STRATEGY_EQUI_REPARTITION = 4;

	// Utility class: prevent instantiation.
	private AlignerUtils() {}

	/**
	 * Translates all {@link ContextVector} components (i.e. its coTerms) into
	 * the target language of this aligner by the mean of one of the available
	 * strategies:
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_FREQUENT}
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_PRORATA}
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_EQUI_REPARTITION}
	 *  - {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_SPECIFIC}
	 *
	 * @see BilingualDictionary
	 * @param sourceVector
	 * 			The source context vector object to be translated into target language
	 * @param dictionary
	 * 			The dico used in the translation process
	 * @param translationStrategy
	 * 			The translation strategy of the <code>sourceVector</code>.
	 * 			Possible values: {@link AlignerUtils#TRANSLATION_STRATEGY_MOST_FREQUENT},
	 * 			{@link AlignerUtils#TRANSLATION_STRATEGY_PRORATA},
	 * 			{@link AlignerUtils#TRANSLATION_STRATEGY_EQUI_REPARTITION},
	 * 			{@link AlignerUtils#TRANSLATION_STRATEGY_MOST_SPECIFIC}
	 * @param targetTermino
	 * 			The target-language term index used to resolve candidate translations
	 * @return
	 * 			The translated context vector
	 * @throws IllegalArgumentException if <code>translationStrategy</code> is not one of the four constants
	 */
	public static ContextVector translateVector(ContextVector sourceVector,
			BilingualDictionary dictionary, int translationStrategy, TermIndex targetTermino) {
		ContextVector targetVector = new ContextVector();
		CustomTermIndex swtLemmaIndex = targetTermino.getCustomIndex(TermIndexes.SINGLE_WORD_LEMMA);
		for(ContextVector.Entry entry:sourceVector.getEntries()) {
			/*
			 * Collect, for each coTerm of the source vector, the set of target
			 * terms whose lemma is a dictionary translation of the coTerm's lemma.
			 * Only the first term of each lemma bucket is retained.
			 */
			Set<Term> translations = Sets.newHashSet();
			for(String targetLemma:dictionary.getTranslations(entry.getCoTerm().getLemma())) {
				Collection<Term> translatedTerms = swtLemmaIndex.getTerms(targetLemma);
				if(!translatedTerms.isEmpty())
					translations.add(translatedTerms.iterator().next());
			}
			switch (translationStrategy) {
			case TRANSLATION_STRATEGY_PRORATA:
				fillTargetVectorSProrata(targetVector, entry, translations);
				break;
			case TRANSLATION_STRATEGY_MOST_FREQUENT:
				fillTargetVectorSMost(targetVector, entry, translations, targetTermino.getFrequencyMeasure());
				break;
			case TRANSLATION_STRATEGY_MOST_SPECIFIC:
				fillTargetVectorSMost(targetVector, entry, translations, targetTermino.getWRMeasure());
				break;
			case TRANSLATION_STRATEGY_EQUI_REPARTITION:
				fillTargetVectorSEquiRepartition(targetVector, entry, translations);
				break;
			default:
				throw new IllegalArgumentException("Invalid translation strategy: " + translationStrategy);
			}
		}
		return targetVector;
	}

	/**
	 * This method implements the strategy {@link #TRANSLATION_STRATEGY_PRORATA}
	 * for context vector translation.
	 *
	 * Explanation of strategy:
	 *
	 * Example of source term in french : chat &lt;noir: 10, chien: 3&gt;
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * Suppose that frequencies in target term index are :
	 *  - black : 35
	 *  - dark : 15
	 *  - dog : 7
	 *
	 * The translated vector would be : &lt;black: 7, dark: 3, dog: 3&gt;
	 *
	 * because :
	 *  - total frequency in target term index for term "noir" is 35 + 15 = 50,
	 *       and 7 = ( 35 / 50 ) * 10 for "black"
	 *       and 3 = ( 15 / 50 ) * 10 for "dark"
	 *  - total frequency in target term index for term "dog" is 7,
	 *       and 3 = ( 7 / 7 ) * 3
	 *
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component to translate and add to target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the <code>sourceTermEntry</code> given by the
	 * 			bilingual dictionary.
	 */
	private static void fillTargetVectorSProrata(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations) {
		// Sum the frequencies of all candidate translations in the target terminology.
		int totalFreqInTargetTermino = 0;
		for(Term tt : candidateTranslations)
			totalFreqInTargetTermino += tt.getFrequency();
		/*
		 * Guard against division by zero: if every candidate translation has
		 * frequency 0 in the target terminology (or there are no candidates),
		 * there is nothing meaningful to distribute.
		 */
		if(totalFreqInTargetTermino == 0)
			return;
		// Distribute the source cooccurrence count pro rata of target frequencies.
		for(Term targetTerm:candidateTranslations) {
			int prorataCooccs = targetTerm.getFrequency() * sourceTermEntry.getNbCooccs() / totalFreqInTargetTermino;
			translatedVector.addEntry(targetTerm, prorataCooccs, sourceTermEntry.getAssocRate());
		}
	}

	/**
	 * This method implements the {@link #TRANSLATION_STRATEGY_MOST_FREQUENT}
	 * and {@link #TRANSLATION_STRATEGY_MOST_SPECIFIC} strategies for context
	 * vector translation (the strategy is selected by the measure passed in).
	 *
	 * Explanation of strategy:
	 *
	 * Example of source term in french : chat &lt;noir: 10, chien: 3&gt;
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * Suppose that frequencies in target term index are :
	 *  - black : 35
	 *  - dark : 15
	 *  - dog : 7
	 *
	 * The translated vector would be : &lt;black: 10, dog: 3&gt;
	 *
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component to translate and add to target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the <code>sourceTermEntry</code> given by the
	 * 			bilingual dictionary.
	 * @param termMeasure
	 * 			the measure (frequency or specificity) used to rank candidates
	 */
	private static void fillTargetVectorSMost(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations, TermMeasure termMeasure) {
		fillTargetVectorWithMostProperty(translatedVector, sourceTermEntry,
				candidateTranslations, termMeasure);
	}

	/**
	 * Implements the strategy {@link #TRANSLATION_STRATEGY_EQUI_REPARTITION}.
	 *
	 * Explanation of strategy:
	 *
	 * Example of source term in french : chat &lt;noir: 10, chien: 3&gt;
	 *
	 * Example of candidate translations for "noir" from dico: black, dark
	 * Example of candidate translations for "chien" from dico: dog
	 *
	 * The translated vector would be : &lt;black: 5, dark: 5, dog: 3&gt;
	 *
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component to translate and add to target vector
	 * @param candidateTranslations
	 * 			the candidate translations of the <code>sourceTermEntry</code>
	 */
	private static void fillTargetVectorSEquiRepartition(ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry, Set<Term> candidateTranslations) {
		/*
		 * Split the source cooccurrence count (and assoc rate) equally among
		 * all candidate translations. Integer division: remainders are dropped.
		 */
		for(Term targetTerm:candidateTranslations) {
			int nbCooccs = sourceTermEntry.getNbCooccs()/candidateTranslations.size();
			translatedVector.addEntry(
					targetTerm,
					nbCooccs,
					sourceTermEntry.getAssocRate()/candidateTranslations.size());
		}
	}

	/**
	 * Adds to <code>translatedVector</code> the single candidate translation
	 * maximizing <code>measure</code>, carrying over the source entry's
	 * cooccurrence count and association rate unchanged.
	 *
	 * @param translatedVector
	 * 			the target vector to be filled
	 * @param sourceTermEntry
	 * 			the source vector's component being translated
	 * @param candidateTranslations
	 * 			candidate translations; if empty, nothing is added
	 * @param measure
	 * 			the ranking measure (frequency or WR/specificity)
	 */
	private static void fillTargetVectorWithMostProperty(
			ContextVector translatedVector,
			ContextVector.Entry sourceTermEntry,
			Set<Term> candidateTranslations, final TermMeasure measure) {
		Term bestCandidate = null;
		double maxValue = -1d;
		for(Term t:candidateTranslations) {
			double value = measure.getValue(t);
			if(value > maxValue) {
				/*
				 * Bug fix: the running maximum used to be updated with
				 * t.getFrequency() regardless of which measure was being
				 * compared, which broke TRANSLATION_STRATEGY_MOST_SPECIFIC
				 * (a WR value was compared against a raw frequency). Keep
				 * the compared measure value instead.
				 */
				maxValue = value;
				bestCandidate = t;
			}
		}
		// bestCandidate is null iff candidateTranslations is empty.
		if(bestCandidate != null)
			translatedVector.addEntry(bestCandidate, sourceTermEntry.getNbCooccs(), sourceTermEntry.getAssocRate());
	}

	/**
	 * Gives the list of all possible single-lemma-term decompositions for a
	 * complex term.
	 *
	 * @param termIndex
	 * 			the term index in which swt terms and compound lemmas are looked up
	 * @param term
	 * 			the term to decompose
	 * @return
	 * 			a list of decompositions, each a list of terms; empty when no
	 * 			valid decomposition exists
	 * @throws RequiresSize2Exception
	 * 			if the term contains more than two inner single-word terms
	 */
	public static List<List<Term>> getSingleLemmaTerms(TermIndex termIndex, Term term) {
		List<Term> swtTerms = TermUtils.getSingleWordTerms(termIndex, term);
		List<List<Term>> lemmaSets = Lists.newArrayList();
		if(swtTerms.size() == 1) {
			if(term.getWords().size() > 1) {
				// A multi-word term should yield at least two inner swt terms.
				LOGGER.warn("Could not apply single lemma term decomposition for term {}. Expected at least two inner swt terms, but got {}", term, swtTerms);
				return Lists.newArrayList();
			}
			// sourceTerm is swtTerms.get(0);
			if(term.isCompound()) {
				// The compound itself counts as a (trivial) decomposition...
				lemmaSets.add(Lists.newArrayList(term));
				// ...plus every binary lemma split of the compound word, resolved
				// against the lowercase-lemma index of the term index.
				for(Pair<String> pair:CompoundUtils.asLemmaPairs(term.getWords().get(0).getWord())) {
					for(Term swt1:termIndex.getCustomIndex(TermIndexes.LEMMA_LOWER_CASE).getTerms(pair.getElement1())) {
						for(Term swt2:termIndex.getCustomIndex(TermIndexes.LEMMA_LOWER_CASE).getTerms(pair.getElement2())) {
							lemmaSets.add(new Pair<Term>(swt1, swt2).toList());
						}
					}
				}
			} else {
				// Plain single-word term: its only decomposition is itself.
				lemmaSets.add(Lists.newArrayList(term));
			}
		} else {
			if(swtTerms.size() == 2) {
				lemmaSets.add(swtTerms);
			} else
				throw new RequiresSize2Exception(term, swtTerms);
		}
		return lemmaSets;
	}
}