CompoundUtils.java example

Explorer
termsuite-core-master
- src

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/

package eu.project.ttc.engines.morpho;

import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.project.ttc.models.Component;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.TermValueProviders;
import eu.project.ttc.utils.Pair;
import eu.project.ttc.utils.TermSuiteConstants;

/**
 * 
 * A set of helper methods for compound words and for iteration
 * over word components (see {@link TermValueProviders}).
 * 
 * @author Damien Cram
 *
 */
public class CompoundUtils {

	private static final String ERR_MSG_CANNOT_MERGE_AN_EMPTY_SET = "Cannot merge an empty set of component";
	private static final String ERR_MSG_COMPONENTS_OVERLAP = "Cannot merge two components if they overlap. Got [%s,%s] followed by [%s,%s].";
	private static final String ERR_MSG_COMPONENT_OFFSET_ARE_TOO_BIG = "Component %s does not belong to word %s (length=%s), because offsets [%s,%s] are too big.";
	private static final String ERR_MSG_WORD_LEMMA_NULL = "Word lemma needs to not be null";


	/**
	 * Returns all the components that can be built from a compound word
	 * by merging runs of consecutive atomic components.
	 * 
	 * E.g. ab|cd|ef returns
	 * 		abcdef,
	 * 		abcd, cdef,
	 * 		ab, cd, ef
	 * 
	 * Merged components are collected in a hash set before being copied
	 * to the returned list, so the order of the list is the iteration
	 * order of that set and must not be relied upon.
	 * 
	 * @param word 
	 * 			the compound word
	 * @return
	 * 			the list of all merged components, empty when the word
	 * 			has no component
	 * @throws NullPointerException
	 * 				when the word has at least one component and its lemma is <code>null</code>
	 * @throws IllegalArgumentException
	 * 				when {@link #merge(Word, Iterable)} rejects a run of components
	 */
	public static List<Component> allSizeComponents(Word word) {
		List<? extends Component> atoms = word.getComponents();
		int total = atoms.size();
		Set<Component> merged = Sets.newHashSet();

		// From the longest run (the whole word) down to single components.
		for (int span = total; span > 0; span--) {
			for (int from = 0; from + span <= total; from++) {
				merged.add(merge(word, atoms.subList(from, from + span)));
			}
		}
		return Lists.newArrayList(merged);
	}

	/**
	 * 
	 * Merges <code>n</code> components of a compound word, given from 
	 * left to right, into a single {@link Component} object.
	 * 
	 * The <code>lemma</code> of the returned {@link Component} is
	 * the concatenation of the word lemma substrings covered by the 
	 * 1st to n-1-th param components, of the word lemma substring 
	 * lying in any gap between two successive param components, 
	 * and of the last param component's own <code>lemma</code>.
	 * 
	 * The returned component begins where the first param component begins
	 * and ends where the last param component ends.
	 * 
	 * @param word
	 * 			The compound word
	 * @param components
	 * 			The components of the word to merge, ordered by offset. 
	 * 			Gaps between successive components are allowed.
	 * @return
	 * 			The merged component
	 * 
	 * @throws NullPointerException
	 * 				when the lemma of <code>word</code> is <code>null</code>
	 * @throws IllegalArgumentException
	 * 				when the <code>components</code> param is empty
	 * @throws IllegalArgumentException
	 * 				when a component begins before the end of the component preceding it
	 * 				(overlapping or out-of-order components)
	 * @throws IllegalArgumentException
	 * 				when the end offset of the 2nd or of any later component is greater 
	 * 				than the length of the <code>word</code> lemma.
	 */
	public static Component merge(Word word, Iterable<? extends Component> components) {
		String wordLemma = Preconditions.checkNotNull(word.getLemma(), ERR_MSG_WORD_LEMMA_NULL);

		Iterator<? extends Component> it = components.iterator();
		Preconditions.checkArgument(it.hasNext(), ERR_MSG_CANNOT_MERGE_AN_EMPTY_SET);

		Component previous = it.next();
		int begin = previous.getBegin();
		StringBuilder lemmaBuilder = new StringBuilder();
		while (it.hasNext()) {
			Component current = it.next();
			Preconditions.checkArgument(
					current.getBegin() >= previous.getEnd(),
					ERR_MSG_COMPONENTS_OVERLAP,
					previous.getBegin(), previous.getEnd(),
					current.getBegin(), current.getEnd()
				);
			Preconditions.checkArgument(
					current.getEnd() <= wordLemma.length(),
					ERR_MSG_COMPONENT_OFFSET_ARE_TOO_BIG,
					current, word, wordLemma.length(),
					current.getBegin(), current.getEnd()
				);

			/*
			 * Every component but the last one contributes the part of 
			 * the word lemma it covers, not its own lemma.
			 */
			lemmaBuilder.append(wordLemma.substring(previous.getBegin(), previous.getEnd()));

			if (previous.getEnd() < current.getBegin()) {
				/*
				 * Fills the gap with the lemma substring
				 */
				lemmaBuilder.append(wordLemma.substring(previous.getEnd(), current.getBegin()));
			}

			previous = current;
		}

		// The last component (the only one for a singleton) contributes its own lemma.
		lemmaBuilder.append(previous.getLemma());
		return new Component(lemmaBuilder.toString(), begin, previous.getEnd());
	}


	/**
	 * 
	 * Produces the set of all pairs of non-overlapping components
	 * for a given word, the candidate components being the ones 
	 * returned by {@link #allSizeComponents(Word)}.
	 * 
	 * E.g. ab|cd|ef returns:
	 * 		ab+cd, ab+ef, cd+ef, ab+cdef, abcd+ef
	 * 
	 * The order of the returned list is the iteration order of a hash set
	 * and must not be relied upon.
	 * 
	 * @param word
	 * 			the compound word
	 * @return
	 * 			the exhaustive list of pairs.
	 */
	public static List<Pair<Component>> innerComponentPairs(Word word) {
		List<Component> candidates = allSizeComponents(word);
		Set<Pair<Component>> disjoint = Sets.newHashSet();
		int count = candidates.size();
		for (int i = 0; i < count; i++) {
			Component left = candidates.get(i);
			for (int j = i + 1; j < count; j++) {
				Pair<Component> candidate = new Pair<Component>(left, candidates.get(j));
				/*
				 * NOTE(review): the test reads the elements back from the pair 
				 * instead of using the two components directly. Presumably Pair 
				 * stores its elements in their natural order, which makes this test 
				 * independent of the (arbitrary) order of the candidates list. 
				 * To be confirmed against Pair: if Pair keeps the constructor order, 
				 * valid pairs are dropped whenever the rightmost component 
				 * comes first in the list.
				 */
				if (candidate.getElement1().getEnd() <= candidate.getElement2().getBegin()) {
					// no overlap
					disjoint.add(candidate);
				}
			}
		}
		return Lists.newArrayList(disjoint);
	}

	/**
	 * Builds a string key for a pair of components that does not 
	 * depend on the order of the two elements in the pair: 
	 * the two component lemmas are sorted with {@link String#compareTo(String)} 
	 * and joined with {@link TermSuiteConstants#PLUS}.
	 * 
	 * @param pair
	 * 			the pair of components
	 * @return
	 * 			the smaller lemma, the separator, then the greater lemma
	 */
	public static String toIndexString(Pair<Component> pair) {
		String lemma1 = pair.getElement1().getLemma();
		String lemma2 = pair.getElement2().getLemma();
		boolean ordered = lemma1.compareTo(lemma2) <= 0;

		StringBuilder sb = new StringBuilder();
		sb.append(ordered ? lemma1 : lemma2);
		sb.append(TermSuiteConstants.PLUS);
		sb.append(ordered ? lemma2 : lemma1);
		return sb.toString();
	}

	/**
	 * 
	 * <b>WARNING: This method does not behave as {@link #innerComponentPairs(Word)}.</b> 
	 * This method enforces that returned pairs cover the input word completely and 
	 * without any overlap.
	 *
	 * Example 1: with a word that is not a compound, it returns an empty list.
	 * 
	 * Example 2: with a word that is a size-2 compound, it returns the only pair of lemmas possible:
	 * 	
	 * <code>
	 * 	w = "ab|cd"
	 *  returnedPairs are [["ab","cd"]]
	 * </code>
	 * 
	 * Example 3: with a word that is a size-3 compound, it returns two pairs of lemmas:
	 * 	
	 * <code>
	 * 	w = "ab|cd|ef"
	 *  returnedPairs are [["ab","cdef"], ["abcd","ef"]]
	 * </code>
	 * 
	 * Example 4: with a word that is a size-n compound, it returns n-1 pairs of lemmas:
	 * 
	 * <code>
	 * 	w = "comp1|comp2|...|compn"
	 *  returnedPairs are [
	 *  	["comp1","comp2comp3...compn"],
	 *  	["comp1comp2","comp3comp4...compn"], 
	 *  	..., 
	 *  	["comp1comp2...compn-1","compn"]
	 *  ]
	 * </code>
	 * 
	 * Each lemma is the lemma of the component produced by 
	 * {@link #merge(Word, Iterable)} for the matching side of the split.
	 * 
	 * @param word
	 * 			The input compound word
	 * @return
	 * 			one pair of lemmas per split position, in split order; 
	 * 			an empty list when the word is not a compound
	 */
	public static List<Pair<String>> asLemmaPairs(Word word) {
		List<Pair<String>> pairs = Lists.newArrayList();
		if (!word.isCompound()) {
			return pairs;
		}

		List<? extends Component> atoms = word.getComponents();
		int n = atoms.size();
		// A split at position k puts the k first components on the left side.
		for (int split = 1; split < n; split++) {
			String head = merge(word, atoms.subList(0, split)).getLemma();
			String tail = merge(word, atoms.subList(split, n)).getLemma();
			pairs.add(new Pair<String>(head, tail));
		}
		return pairs;
	}

}