// BilingualAligner.java example
//
// Explorer
// termsuite-core-master
// - src
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.engines;

import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.MoreObjects;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.MinMaxPriorityQueue;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;

import eu.project.ttc.metrics.ExplainedValue;
import eu.project.ttc.metrics.Explanation;
import eu.project.ttc.metrics.IExplanation;
import eu.project.ttc.metrics.SimilarityDistance;
import eu.project.ttc.metrics.TextExplanation;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.models.index.TermMeasure;
import eu.project.ttc.models.index.TermValueProviders;
import eu.project.ttc.resources.BilingualDictionary;
import eu.project.ttc.utils.AlignerUtils;
import eu.project.ttc.utils.IteratorUtils;
import eu.project.ttc.utils.TermSuiteConstants;
 
 
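/**
 * Aligns terms of a source terminology with terms of a target terminology,
 * relying on a bilingual dictionary and on a similarity distance between
 * context vectors.
 *
 * @author Damien Cram
 *
 */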
public class BilingualAligner {
	
	private static final Logger LOGGER = LoggerFactory.getLogger(BilingualAligner.class);

	/* Message templates used by precondition checks, exceptions and logging */
	private static final String MSG_TERM_NOT_NULL = "Source term must not be null";
	private static final String MSG_REQUIRES_SIZE_2_LEMMAS = "The term %s must have exactly two single-word terms (single-word terms: %s)";
	private static final String MSG_SEVERAL_VECTORS_NOT_COMPUTED = "Several terms have no context vectors in target terminology (nb terms with vector: {}, nb terms without vector: {})";
	private static final String ERR_VECTOR_NOT_SET = "Cannot align on term %s. Cause: context vector not set.";

	
	/**
	 * The bonus factor applied to dictionary candidates when they are
	 * merged with distributional candidates.
	 * 
	 * NOTE: this constant is not referenced anywhere in this class.
	 */
	public static final double DICO_CANDIDATE_BONUS_FACTOR = 30;

	/** The bilingual dictionary used to translate lemmas and context vectors */
	private final BilingualDictionary dico;
	/** The terminology that source terms belong to */
	private final TermIndex sourceTermino;
	/** The terminology in which translation candidates are searched */
	private final TermIndex targetTermino;

	/** The distance used to compare context vectors; replaceable through {@link #setDistance(SimilarityDistance)} */
	private SimilarityDistance distance;
	
	/**
	 * Builds an aligner operating between two terminologies.
	 * 
	 * @param dico
	 * 			the bilingual dictionary used for translations
	 * @param sourceTermino
	 * 			the terminology of the terms to align
	 * @param targetTermino
	 * 			the terminology in which candidates are searched
	 * @param distance
	 * 			the similarity distance used to compare context vectors
	 */
	public BilingualAligner(BilingualDictionary dico, TermIndex sourceTermino, TermIndex targetTermino, SimilarityDistance distance) {
		this.dico = dico;
		this.sourceTermino = sourceTermino;
		this.targetTermino = targetTermino;
		this.distance = distance;
	}
	
	/**
	 * Replaces the similarity distance that this aligner uses to
	 * compare context vectors.
	 * 
	 * @param distance
	 * 			the similarity distance to use from now on
	 */
	public void setDistance(SimilarityDistance distance) {
		this.distance = distance;
	}

	
	/**
	 * 
	 * Translates the source term with the help of the dictionary
	 * and merges these dictionary candidates with the <code>nbCandidates</code>
	 * best distributional candidates found in the target terminology.
	 * 
	 * <code>sourceTerm</code>'s context vector must be computed. Target terms
	 * whose context vector is not computed are never proposed as candidates.
	 * 
	 * A same target term may appear twice in the returned list (once as a
	 * dictionary candidate, once as a distributional candidate): no
	 * deduplication is performed here.
	 * 
	 * @param sourceTerm
	 * 			the term to align with target term index
	 * @param nbCandidates
	 * 			the maximum number of {@link TranslationCandidate} in the returned list
	 * @param minCandidateFrequency
	 * 			the minimum frequency of a distributional target candidate
	 * @return
	 * 			A list of {@link TranslationCandidate} sorted by descending score. Each
	 * 			{@link TranslationCandidate} is a container for a target term index's term 
	 * 			and its translation score.
	 * @throws NullPointerException
	 * 			if <code>sourceTerm</code> is null
	 * @throws IllegalArgumentException
	 * 			if the context vector of <code>sourceTerm</code> is not computed
	 */
	public List<TranslationCandidate> alignDicoThenDistributional(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
		checkNotNull(sourceTerm);
		Preconditions.checkArgument(sourceTerm.isContextVectorComputed(), ERR_VECTOR_NOT_SET, sourceTerm.getGroupingKey());

		/*
		 * 1- find direct translations of the term in the dictionary, keep
		 * the best ones and rescale their scores according to specificity
		 */
		List<TranslationCandidate> mergedCandidates = Lists.newArrayList(
				sortTruncateNormalize(targetTermino, nbCandidates, alignDico(sourceTerm, Integer.MAX_VALUE)));
		applySpecificityBonus(targetTermino, mergedCandidates);

		/*
		 * 2- align against all single-word terms of the target terminology
		 * and merge with dictionary candidates
		 */
		mergedCandidates.addAll(alignDistributional(sourceTerm, nbCandidates, minCandidateFrequency));

		/*
		 * 3- Sort, truncate, and normalize. No prior sort is needed since
		 * sortTruncateNormalize sorts its own copy of the candidates.
		 */
		return sortTruncateNormalize(targetTermino, nbCandidates, mergedCandidates);
	}

	/**
	 * Aligns the source term against all single-word terms of the target
	 * terminology by comparing the translated context vector of the source
	 * term with the context vector of each target term.
	 * 
	 * Target terms having no computed context vector are skipped; a warning
	 * reporting how many were skipped is logged when there is at least one.
	 * 
	 * @param sourceTerm
	 * 			the term to align
	 * @param nbCandidates
	 * 			the maximum number of candidates to keep
	 * @param minCandidateFrequency
	 * 			the minimum frequency of a target candidate
	 * @return
	 * 			the best candidates sorted by descending score, their scores
	 * 			being normalized so as to sum to 1 (when the sum is positive)
	 */
	public List<TranslationCandidate> alignDistributional(Term sourceTerm, int nbCandidates,
			int minCandidateFrequency) {
		// Bounded queue: when full, the candidate with the lowest score is evicted
		Queue<TranslationCandidate> alignedCandidateQueue = MinMaxPriorityQueue.maximumSize(nbCandidates).create();
		ContextVector translatedSourceVector = AlignerUtils.translateVector(
				sourceTerm.getContextVector(),
				dico,
				AlignerUtils.TRANSLATION_STRATEGY_MOST_SPECIFIC,
				targetTermino);
		int nbVectorsNotComputed = 0;
		int nbVectorsComputed = 0;
		for(Term targetTerm:IteratorUtils.toIterable(targetTermino.singleWordTermIterator())) {
			if(targetTerm.getFrequency() < minCandidateFrequency)
				continue;
			if(!targetTerm.isContextVectorComputed()) {
				// counted so that the warning below can actually be emitted
				nbVectorsNotComputed++;
				continue;
			}
			nbVectorsComputed++;
			ExplainedValue v = distance.getExplainedValue(translatedSourceVector, targetTerm.getContextVector());
			alignedCandidateQueue.add(new TranslationCandidate(
					targetTerm, 
					AlignmentMethod.DISTRIBUTIONAL,
					v.getValue(), 
					v.getExplanation()));
		}
		if(nbVectorsNotComputed > 0) {
			LOGGER.warn(MSG_SEVERAL_VECTORS_NOT_COMPUTED, nbVectorsComputed, nbVectorsNotComputed);	
		}
		
		// The queue iterates in no particular order: sort explicitly before returning
		List<TranslationCandidate> alignedCandidates = Lists.newArrayList(alignedCandidateQueue);
		Collections.sort(alignedCandidates);
		normalizeCandidateScores(alignedCandidates);
		return alignedCandidates;
	}
	
	
	private static final String ERR_MSG_BAD_SOURCE_LEMMA_SET_SIZE = "Unexpected size for a source lemma set: %s. Expected size: 1 or 2";
	/**
	 * Aligns a source term, choosing the alignment method from the size of
	 * each list of single-lemma terms that
	 * {@link AlignerUtils#getSingleLemmaTerms(TermIndex, Term)} returns for it:
	 * <ul>
	 * <li>size 1: dictionary then distributional alignment of that term,
	 * asking for three times <code>nbCandidates</code> candidates,</li>
	 * <li>size 2: compositional alignment, then semi-distributional alignment
	 * as a fallback, only when no candidate has been collected so far.</li>
	 * </ul>
	 * Candidates are finally deduplicated on their term, sorted, truncated
	 * and normalized.
	 * 
	 * @param sourceTerm
	 * 			the term to align
	 * @param nbCandidates
	 * 			the maximum number of candidates in the returned list
	 * @param minCandidateFrequency
	 * 			the minimum frequency of a target candidate
	 * @return
	 * 			the list of candidates sorted by descending score
	 * @throws NullPointerException
	 * 			if <code>sourceTerm</code> is null
	 * @throws IllegalStateException
	 * 			if a list of single-lemma terms has a size other than 1 or 2
	 */
	public List<TranslationCandidate> align(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
		// Check before any dereference of sourceTerm
		Preconditions.checkNotNull(sourceTerm, MSG_TERM_NOT_NULL);
		// 3*nbCandidates would overflow for large values such as Integer.MAX_VALUE
		int nbSingleWordCandidates = Ints.saturatedCast(3L * nbCandidates);
		List<TranslationCandidate> mergedCandidates = Lists.newArrayList();
		List<List<Term>> sourceLemmaSets = AlignerUtils.getSingleLemmaTerms(sourceTermino, sourceTerm);
		for(List<Term> sourceLemmaSet:sourceLemmaSets) {
			int size = sourceLemmaSet.size();
			Preconditions.checkState(size == 1 || size == 2, 
					ERR_MSG_BAD_SOURCE_LEMMA_SET_SIZE, sourceLemmaSet);
			if(size == 1) {
				mergedCandidates.addAll(alignDicoThenDistributional(sourceLemmaSet.get(0), nbSingleWordCandidates, minCandidateFrequency));
				continue;
			}
			
			Term lemmaTerm1 = sourceLemmaSet.get(0);
			Term lemmaTerm2 = sourceLemmaSet.get(1);
			try {
				mergedCandidates.addAll(alignCompositionalSize2(lemmaTerm1, lemmaTerm2, nbCandidates, minCandidateFrequency));
			} catch(RequiresSize2Exception e) {
				// Best effort: this lemma set contributes no compositional candidate
				LOGGER.debug("Skipping compositional alignment of {}: {}", sourceLemmaSet, e.getMessage());
			}
			
			// Fallback only when nothing has been found so far, over all lemma sets
			if(mergedCandidates.isEmpty()) {
				try {
					mergedCandidates.addAll(alignSemiDistributionalSize2Syntagmatic(lemmaTerm1, lemmaTerm2, nbCandidates, minCandidateFrequency));
				} catch(RequiresSize2Exception e) {
					// Best effort: this lemma set contributes no semi-distributional candidate
					LOGGER.debug("Skipping semi-distributional alignment of {}: {}", sourceLemmaSet, e.getMessage());
				}
			}
		}
		
		removeDuplicatesOnTerm(mergedCandidates);
		return sortTruncateNormalize(targetTermino, nbCandidates, mergedCandidates);
	}

	/**
	 * Copies the given candidates into a list, sorts it in natural order,
	 * assigns 1-based ranks to all candidates, keeps at most
	 * <code>nbCandidates</code> of them and normalizes the scores of the
	 * kept ones.
	 * 
	 * @param termIndex
	 * 			not used by this method
	 * @param nbCandidates
	 * 			the maximum size of the returned list
	 * @param candidatesCandidates
	 * 			the candidates to process (the collection itself is not modified,
	 * 			but rank and score of its elements are)
	 * @return
	 * 			a view on the head of the sorted copy
	 */
	private List<TranslationCandidate> sortTruncateNormalize(TermIndex termIndex, int nbCandidates, Collection<TranslationCandidate> candidatesCandidates) {
		List<TranslationCandidate> sorted = Lists.newArrayList(candidatesCandidates);
		Collections.sort(sorted);

		// ranks are given before truncation
		int rank = 0;
		for(TranslationCandidate candidate:sorted) {
			rank++;
			candidate.setRank(rank);
		}

		int limit = Math.min(nbCandidates, sorted.size());
		List<TranslationCandidate> kept = sorted.subList(0, limit);
		normalizeCandidateScores(kept);
		return kept;
	}

	/*
	 * Rescales the score of each candidate with a factor depending
	 * on the WR measure value of its term. No candidate is removed.
	 */
	private void applySpecificityBonus(TermIndex termIndex, List<TranslationCandidate> list) {
		for(TranslationCandidate candidate:list) {
			double specificity = termIndex.getWRMeasure().getValue(candidate.getTerm());
			double factor = getSpecificityBonusFactor(specificity);
			candidate.setScore(candidate.getScore()*factor);
		}
	}

	/*
	 * Maps a WR value to a multiplicative factor:
	 * wr <= 1 gives 0.5, wr <= 2 gives 1, wr <= 10 gives 1.5, 
	 * wr <= 100 gives 2, anything else gives 5.
	 */
	private double getSpecificityBonusFactor(double wr) {
		final double[] upperBounds = {1, 2, 10, 100};
		final double[] factors = {0.5, 1, 1.5, 2};
		for(int i = 0; i < upperBounds.length; i++) {
			if(wr <= upperBounds[i])
				return factors[i];
		}
		return 5;
	}

	/**
	 * Looks up the translations of the lemma of the source term in the
	 * dictionary and returns one candidate per target term matching one of
	 * these translations and having a computed context vector. 
	 * 
	 * The score of a candidate is the distance between the translated context
	 * vector of the source term and the context vector of the candidate. 
	 * The returned list is neither sorted nor normalized.
	 * 
	 * @param sourceTerm
	 * 			the term to translate
	 * @param nbCandidates
	 * 			currently ignored: all matching candidates are returned
	 * @return
	 * 			a new list of dictionary candidates, empty if there is none
	 */
	public List<TranslationCandidate> alignDico(Term sourceTerm, int nbCandidates) {
		List<TranslationCandidate> dicoCandidates = Lists.newArrayList();
		Collection<String> translations = dico.getTranslations(sourceTerm.getLemma());
		if(translations.isEmpty())
			// No translation: translating the source vector would be wasted work
			return dicoCandidates;
		
		ContextVector translatedSourceVector = AlignerUtils.translateVector(
				sourceTerm.getContextVector(),
				dico,
				AlignerUtils.TRANSLATION_STRATEGY_MOST_SPECIFIC,
				targetTermino);

		// The index is the same for all translations: look it up once
		CustomTermIndex lemmaIndex = targetTermino.getCustomIndex(TermIndexes.LEMMA_LOWER_CASE);
		for(String candidateLemma:translations) {
			for (Term candidateTerm : lemmaIndex.getTerms(candidateLemma)) {
				if (candidateTerm.isContextVectorComputed())
					dicoCandidates.add(new TranslationCandidate(candidateTerm, AlignmentMethod.DICTIONARY,
							distance.getValue(translatedSourceVector, candidateTerm.getContextVector()),
							Explanation.emptyExplanation()
					));
			}
		}

		return dicoCandidates;
	}

	
	/**
	 * Tells whether at least one of the lists of single-lemma terms of 
	 * the given source term has exactly two elements.
	 * 
	 * @param sourceTerm
	 * 			the term to test
	 * @return
	 * 			<code>true</code> if such a list exists
	 */
	public boolean canAlignCompositional(Term sourceTerm) {
		for(List<Term> singleLemmaTerms:AlignerUtils.getSingleLemmaTerms(sourceTermino, sourceTerm)) {
			if(singleLemmaTerms.size() == 2)
				return true;
		}
		return false;
	}

	/**
	 * Aligns the source term with the compositional method, applied to every
	 * list of single-lemma terms of size 2 of the source term.
	 * 
	 * @param sourceTerm
	 * 			the term to align
	 * @param nbCandidates
	 * 			the maximum number of candidates in the returned list
	 * @param minCandidateFrequency
	 * 			passed as is to {@link #alignCompositionalSize2(Term, Term, int, int)}
	 * @return
	 * 			the candidates sorted by descending score
	 * @throws IllegalArgumentException
	 * 			if {@link #canAlignCompositional(Term)} is false for the source term
	 */
	public List<TranslationCandidate> alignCompositional(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
		Preconditions.checkArgument(canAlignCompositional(sourceTerm), "Cannot align <%s> with compositional method", sourceTerm);
		
		List<TranslationCandidate> collected = Lists.newArrayList();
		for(List<Term> pair:AlignerUtils.getSingleLemmaTerms(sourceTermino, sourceTerm)) {
			if(pair.size() != 2)
				continue;
			Term first = pair.get(0);
			Term second = pair.get(1);
			collected.addAll(alignCompositionalSize2(first, second, nbCandidates, minCandidateFrequency));
		}
		
		return sortTruncateNormalize(targetTermino, nbCandidates, collected);
	}

	/**
	 * Tells whether at least one of the lists of single-lemma terms of 
	 * the given source term has exactly two elements.
	 * 
	 * @param sourceTerm
	 * 			the term to test
	 * @return
	 * 			<code>true</code> if such a list exists
	 */
	public boolean canAlignSemiDistributional(Term sourceTerm) {
		List<List<Term>> singleLemmaTermSets = AlignerUtils.getSingleLemmaTerms(sourceTermino, sourceTerm);
		return singleLemmaTermSets
				.stream()
				.map(List::size)
				.anyMatch(size -> size == 2);
	}
	
	/**
	 * Aligns the source term with the semi-distributional method, applied to
	 * every list of single-lemma terms of size 2 of the source term.
	 * 
	 * @param sourceTerm
	 * 			the term to align
	 * @param nbCandidates
	 * 			the maximum number of candidates in the returned list
	 * @param minCandidateFrequency
	 * 			passed as is to {@link #alignSemiDistributionalSize2Syntagmatic(Term, Term, int, int)}
	 * @return
	 * 			the candidates sorted by descending score
	 * @throws IllegalArgumentException
	 * 			if {@link #canAlignSemiDistributional(Term)} is false for the source term
	 */
	public List<TranslationCandidate> alignSemiDistributional(Term sourceTerm, int nbCandidates, int minCandidateFrequency) {
		// Guard with the predicate and the message of this method, not the compositional ones
		Preconditions.checkArgument(canAlignSemiDistributional(sourceTerm), "Cannot align <%s> with semi-distributional method", sourceTerm);
		
		List<List<Term>> singleLemmaTermSets = AlignerUtils.getSingleLemmaTerms(sourceTermino, sourceTerm);
		
		List<TranslationCandidate> candidates = Lists.newArrayList();
		
		for(List<Term> singleLemmaTerms:singleLemmaTermSets) {
			if(singleLemmaTerms.size() == 2) {
				candidates.addAll(alignSemiDistributionalSize2Syntagmatic(
						singleLemmaTerms.get(0), 
						singleLemmaTerms.get(1), nbCandidates, minCandidateFrequency));
			}
		}
		
		return sortTruncateNormalize(targetTermino, nbCandidates, candidates);
	}

	
	/**
	 * Compositional alignment of a couple of single-lemma terms: both terms
	 * are translated with the dictionary and the translations are combined
	 * into candidates of the target terminology.
	 * 
	 * @param lemmaTerm1
	 * 			the first single-lemma term
	 * @param lemmaTerm2
	 * 			the second single-lemma term
	 * @param nbCandidates
	 * 			the maximum number of candidates in the returned list
	 * @param minCandidateFrequency
	 * 			not used by this method
	 * @return
	 * 			the candidates sorted by descending score
	 */
	public List<TranslationCandidate> alignCompositionalSize2(Term lemmaTerm1, Term lemmaTerm2, int nbCandidates, int minCandidateFrequency) {
		Collection<TranslationCandidate> combined = combineCandidates(
				alignDico(lemmaTerm1, Integer.MAX_VALUE),
				alignDico(lemmaTerm2, Integer.MAX_VALUE),
				AlignmentMethod.COMPOSITIONAL);
		return sortTruncateNormalize(targetTermino, nbCandidates, combined);
	}

	/**
	 * Signals that a term does not have exactly two single-word terms. 
	 * The message is built lazily from the term and its single-word terms.
	 */
	public static class RequiresSize2Exception extends RuntimeException {
		private static final long serialVersionUID = 1L;

		private final Term term;
		private final List<Term> swtTerms;
		
		public RequiresSize2Exception(Term term, List<Term> swtTerms) {
			this.term = term;
			this.swtTerms = swtTerms;
		}

		@Override
		public String getMessage() {
			String joinedSwtTerms = Joiner.on(TermSuiteConstants.COMMA).join(swtTerms);
			return String.format(MSG_REQUIRES_SIZE_2_LEMMAS, term, joinedSwtTerms);
		}
	}

	/**
	 * Joins two collections of swt candidates and uses the specificities (wrLog)
	 * of the combined terms as the candidate scores.
	 * 
	 * For each couple (candidate1, candidate2), the target terms indexed under
	 * "lemma1+lemma2" or "lemma2+lemma1" are retrieved, and only the ones having
	 * the lowest number of lemma+lemma keys are kept.
	 * 
	 * FIXME Bad way of scoring candidates. They should be scored by similarity of context vectors with the source context vector
	 * 
	 * @param candidates1
	 * 			the candidates for the first component
	 * @param candidates2
	 * 			the candidates for the second component
	 * @param method
	 * 			the alignment method to set on the created candidates
	 * @return
	 * 			the set of combined candidates, possibly empty
	 */
	private Collection<TranslationCandidate> combineCandidates(Collection<TranslationCandidate> candidates1,
			Collection<TranslationCandidate> candidates2, AlignmentMethod method) {
		Collection<TranslationCandidate> combination = Sets.newHashSet();
		TermMeasure wrLog = targetTermino.getWRLogMeasure();
		wrLog.compute();
		if(candidates1.isEmpty() || candidates2.isEmpty())
			// no couple to combine
			return combination;
		
		// The index is the same for all couples: look it up once
		CustomTermIndex index = targetTermino.getCustomIndex(TermIndexes.WORD_COUPLE_LEMMA_LEMMA);
		for(TranslationCandidate candidate1:candidates1) {
			for(TranslationCandidate candidate2:candidates2) {
				/*
				 * 1- create candidate combined terms. The list returned by the 
				 * index is copied so that the index's own list is never modified.
				 */
				List<Term> candidateCombinedTerms = Lists.newArrayList(
						index.getTerms(candidate1.getTerm().getLemma() + "+" + candidate2.getTerm().getLemma()));
				candidateCombinedTerms.addAll(index.getTerms(candidate2.getTerm().getLemma() + "+" + candidate1.getTerm().getLemma()));
				if(candidateCombinedTerms.isEmpty())
					continue;
				
				/*
				 * 2- Avoids retrieving too long terms by keeping the ones that have 
				 * the lowest number of lemma+lemma keys.
				 */
				Map<Term, Integer> nbLemmaLemmaKeys = Maps.newHashMap();
				int minimumNbClasses = Integer.MAX_VALUE;
				for(Term t:candidateCombinedTerms) {
					int nbKeys = TermValueProviders.WORD_LEMMA_LEMMA_PROVIDER.getClasses(targetTermino, t).size();
					nbLemmaLemmaKeys.put(t, nbKeys);
					minimumNbClasses = Math.min(minimumNbClasses, nbKeys);
				}
				
				/*
				 * 3- Create candidates from filtered terms
				 */
				for(Term t:candidateCombinedTerms) {
					if(nbLemmaLemmaKeys.get(t).intValue() != minimumNbClasses)
						continue;
					double specificity = wrLog.getValue(t);
					combination.add(new TranslationCandidate(
							t, 
							method,
							specificity, 
							new TextExplanation(String.format("Spécificité: %.1f", specificity))));
				}
			}
		}
		return combination;
	}

	/*
	 * Fails with a NullPointerException carrying an explicit
	 * message when the source term is null.
	 */
	private void checkNotNull(Term sourceTerm) {
		if(sourceTerm == null)
			throw new NullPointerException(MSG_TERM_NOT_NULL);
	}

	
		
	/**
	 * Semi-distributional alignment of a couple of single-lemma terms. 
	 * Each term is in turn translated with the dictionary while the other one is 
	 * aligned with the dictionary-then-distributional method, and both
	 * results are merged and deduplicated on their term.
	 * 
	 * @param lemmaTerm1
	 * 			the first single-lemma term
	 * @param lemmaTerm2
	 * 			the second single-lemma term
	 * @param nbCandidates
	 * 			the maximum number of candidates in the returned list
	 * @param minCandidateFrequency
	 * 			not used by this method
	 * @return
	 * 			the candidates sorted by descending score
	 */
	public List<TranslationCandidate> alignSemiDistributionalSize2Syntagmatic(Term lemmaTerm1, Term lemmaTerm2, int nbCandidates, int minCandidateFrequency) {
		List<TranslationCandidate> merged = Lists.newArrayList();
		// dictionary on term 1, vectors on term 2
		merged.addAll(semiDistributional(lemmaTerm1, lemmaTerm2));
		// dictionary on term 2, vectors on term 1
		merged.addAll(semiDistributional(lemmaTerm2, lemmaTerm1));

		removeDuplicatesOnTerm(merged);
		return sortTruncateNormalize(targetTermino, nbCandidates, merged);
	}

	/*
	 * Removes in place every candidate whose term has already been 
	 * met earlier in the list: the first occurrence wins.
	 */
	private void removeDuplicatesOnTerm(List<TranslationCandidate> candidates) {
		final Set<Term> seenTerms = Sets.newHashSet();
		candidates.removeIf(candidate -> !seenTerms.add(candidate.getTerm()));
	}

	/*
	 * Combines the dictionary candidates of dicoTerm with the 
	 * dictionary-then-distributional candidates of vectorTerm.
	 */
	private Collection<? extends TranslationCandidate> semiDistributional(Term dicoTerm, Term vectorTerm) {
		List<TranslationCandidate> dicoCandidates = alignDico(dicoTerm, Integer.MAX_VALUE);
		
		if(dicoCandidates.isEmpty()) {
			// Optimisation: no need to align since there is no possible combination
			List<TranslationCandidate> none = Lists.newArrayList();
			return none;
		}

		List<TranslationCandidate> vectorCandidates = alignDicoThenDistributional(vectorTerm, Integer.MAX_VALUE, 1);
		return combineCandidates(dicoCandidates, vectorCandidates, AlignmentMethod.SEMI_DISTRIBUTIONAL);
	}

	/*
	 * Divides every score by the sum of all scores. Scores are left
	 * untouched unless this sum is strictly positive.
	 */
	private void normalizeCandidateScores(List<TranslationCandidate> candidates) {
		double total = 0;
		for(TranslationCandidate candidate:candidates) {
			total += candidate.getScore();
		}
		
		// written as a negation so that a NaN total also skips normalization
		if(!(total > 0d))
			return;
		
		for(TranslationCandidate candidate:candidates) {
			candidate.setScore(candidate.getScore()/total);
		}
	}



	/**
	 * The method a translation candidate has been produced with. Each method 
	 * has a short and a long display name.
	 */
	public enum AlignmentMethod {
		DICTIONARY("dico", "dictionary"),
		DISTRIBUTIONAL("dist", "distributional"),
		COMPOSITIONAL("comp", "compositional"),
		SEMI_DISTRIBUTIONAL("s-dist", "semi-distributional");
		
		// enum constants are shared: their state must not be reassignable
		private final String shortName;
		private final String longName;

		AlignmentMethod(String shortName, String longName) {
			this.shortName = shortName;
			this.longName = longName;
		}
		
		/**
		 * @return the abbreviated name of this method, e.g. "dico"
		 */
		public String getShortName() {
			return shortName;
		}

		/**
		 * @return the full name of this method, e.g. "dictionary"
		 */
		public String getLongName() {
			return longName;
		}
	}

	
	/**
	 * A term of the target terminology proposed as a translation, together 
	 * with its score, its rank, the method that produced it and an explanation.
	 * 
	 * The natural order sorts candidates by descending score, then by term. 
	 * Equality and hash code are based on both term and score.
	 */
	public static class TranslationCandidate implements Comparable<TranslationCandidate> {
		private final Term term;
		private final AlignmentMethod method;
		private final IExplanation explanation;
		private double score;
		/* -1 as long as no rank has been set */
		private int rank=-1;

		private TranslationCandidate(Term term, AlignmentMethod method, double score, IExplanation explanation) {
			this.term = term;
			this.score = score;
			this.method = method;
			this.explanation = explanation;
		}

		public Term getTerm() {
			return term;
		}

		public AlignmentMethod getMethod() {
			return method;
		}

		public IExplanation getExplanation() {
			return explanation;
		}

		public double getScore() {
			return score;
		}

		public void setScore(double score) {
			this.score = score;
		}

		public int getRank() {
			return rank;
		}

		public void setRank(int rank) {
			this.rank = rank;
		}

		@Override
		public int compareTo(TranslationCandidate o) {
			// arguments are swapped on score so that the highest score comes first
			return ComparisonChain.start()
					.compare(o.score, this.score)
					.compare(this.term, o.term)
					.result();
		}

		@Override
		public boolean equals(Object obj) {
			if(!(obj instanceof TranslationCandidate))
				return false;
			TranslationCandidate other = (TranslationCandidate)obj;
			return Objects.equal(other.score, this.score) 
					&& Objects.equal(other.term, this.term);
		}

		@Override
		public int hashCode() {
			return Objects.hashCode(term, score);
		}

		@Override
		public String toString() {
			String formattedScore = String.format("%.2f", this.score);
			return MoreObjects.toStringHelper(this)
					.addValue(this.term.getGroupingKey())
					.addValue(this.method.toString())
					.add("s", formattedScore)
					.toString();
		}
	}


	/**
	 * @return the bilingual dictionary this aligner has been built with
	 */
	public BilingualDictionary getDico() {
		return dico;
	}
}