/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.engines.morpho;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.project.ttc.models.Component;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.TermValueProviders;
import eu.project.ttc.utils.Pair;
import eu.project.ttc.utils.TermSuiteConstants;
/**
*
* A set of helper methods for compound words and for iteration
* over word components (see {@link TermValueProviders}).
*
* @author Damien Cram
*
*/
public class CompoundUtils {

    private static final String ERR_MSG_CANNOT_MERGE_AN_EMPTY_SET = "Cannot merge an empty set of component";
    private static final String ERR_MSG_COMPONENTS_OVERLAP = "Cannot merge two components if they overlap. Got [%s,%s] followed by [%s,%s].";
    private static final String ERR_MSG_COMPONENT_OFFSET_ARE_TOO_BIG = "Component %s does not belong to word %s (length=%s), because offsets [%s,%s] are too big.";
    private static final String ERR_MSG_WORD_LEMMA_NULL = "Word lemma needs to not be null";

    /**
     * Returns all possible components for a compound word
     * by merging every run of consecutive atomic components.
     *
     * E.g. ab|cd|ef returns
     *   abcdef,
     *   abcd, cdef,
     *   ab, cd, ef
     *
     * Duplicates (as defined by {@link Component}'s equality) are removed.
     * The returned list is in generation order: longest merges first and,
     * for a given size, from left to right.
     *
     * @param word
     *          the compound word
     * @return
     *          the list of all possible merged components; empty when the
     *          word has no component
     */
    public static List<Component> allSizeComponents(Word word) {
        List<? extends Component> atoms = word.getComponents();
        int total = atoms.size();
        // Insertion-ordered set: de-duplicates while keeping the output order stable.
        Set<Component> merged = Sets.newLinkedHashSet();
        for (int size = total; size > 0; size--) {
            for (int start = 0; start + size <= total; start++) {
                // merge() only iterates over its argument, so a view is enough.
                merged.add(merge(word, atoms.subList(start, start + size)));
            }
        }
        return Lists.newArrayList(merged);
    }

    /**
     *
     * Merges <code>n</code> consecutive components of a compound
     * word into a single {@link Component} object.
     *
     * The <code>lemma</code> of the returned {@link Component} is
     * the concatenation of the word lemma substrings covered by the
     * 1st to n-1-th param components (including any gap between two
     * successive components) and the last param component's own
     * <code>lemma</code>. The returned component starts at the first
     * component's begin offset and ends at the last component's end offset.
     *
     *
     * @param word
     *          The compound word
     * @param components
     *          The list of consecutive components of the word to merge
     * @return
     *          The merged component
     *
     * @throws NullPointerException
     *          when the lemma of <code>word</code> is <code>null</code>
     * @throws IllegalArgumentException
     *          when the <code>components</code> param is empty
     * @throws IllegalArgumentException
     *          when the <code>components</code> overlap or are not given in
     *          increasing offset order
     * @throws IllegalArgumentException
     *          when the end offset of any component, including the first one,
     *          exceeds the length of the <code>word</code> lemma.
     */
    public static Component merge(Word word, Iterable<? extends Component> components) {
        String wordLemma = Preconditions.checkNotNull(word.getLemma(), ERR_MSG_WORD_LEMMA_NULL);
        Iterator<? extends Component> it = components.iterator();
        Preconditions.checkArgument(it.hasNext(), ERR_MSG_CANNOT_MERGE_AN_EMPTY_SET);

        Component previous = it.next();
        // The first component must be validated too, otherwise a single
        // out-of-range component would be returned without any error.
        checkOffsets(word, wordLemma, previous);
        int begin = previous.getBegin();

        StringBuilder lemmaBuilder = new StringBuilder();
        while (it.hasNext()) {
            Component current = it.next();
            Preconditions.checkArgument(
                    current.getBegin() >= previous.getEnd(),
                    ERR_MSG_COMPONENTS_OVERLAP,
                    previous.getBegin(), previous.getEnd(),
                    current.getBegin(), current.getEnd()
                );
            checkOffsets(word, wordLemma, current);

            // Non-final components contribute the word lemma substring they cover...
            lemmaBuilder.append(wordLemma.substring(previous.getBegin(), previous.getEnd()));
            if (previous.getEnd() < current.getBegin()) {
                // ...plus the gap up to the next component, if any.
                lemmaBuilder.append(wordLemma.substring(previous.getEnd(), current.getBegin()));
            }
            previous = current;
        }
        // The last component contributes its own lemma rather than a substring.
        lemmaBuilder.append(previous.getLemma());
        return new Component(lemmaBuilder.toString(), begin, previous.getEnd());
    }

    /**
     * Fails with an {@link IllegalArgumentException} when the end offset of
     * the component goes beyond the end of the word lemma.
     */
    private static void checkOffsets(Word word, String wordLemma, Component component) {
        Preconditions.checkArgument(
                component.getEnd() <= wordLemma.length(),
                ERR_MSG_COMPONENT_OFFSET_ARE_TOO_BIG,
                component, word, wordLemma.length(),
                component.getBegin(), component.getEnd()
            );
    }

    /**
     *
     * Produces the set of all pairs of non-overlapping components
     * for a given word, the candidates being the components returned
     * by {@link #allSizeComponents(Word)}.
     *
     * E.g. ab|cd|ef returns:
     *   ab+cd, ab+ef, cd+ef, ab+cdef, abcd+ef
     *
     * In each returned pair, the component given first is the one
     * that starts first in the word.
     *
     * @param word
     *          the compound word
     * @return
     *          the exhaustive list of pairs.
     */
    public static List<Pair<Component>> innerComponentPairs(Word word) {
        Set<Pair<Component>> pairs = Sets.newLinkedHashSet();
        List<Component> components = allSizeComponents(word);
        for (int i = 0; i < components.size(); i++) {
            Component c1 = components.get(i);
            for (int j = i + 1; j < components.size(); j++) {
                Component c2 = components.get(j);
                /*
                 * Order the two candidates by position explicitly, so that
                 * the outcome depends neither on the order of the candidate
                 * list nor on how Pair stores its elements.
                 * NOTE(review): Pair is declared elsewhere; whether its
                 * constructor reorders its arguments is not visible here.
                 */
                Component left = c1.getBegin() <= c2.getBegin() ? c1 : c2;
                Component right = left == c1 ? c2 : c1;
                if (left.getEnd() <= right.getBegin()) {
                    // no overlap
                    pairs.add(new Pair<Component>(left, right));
                }
            }
        }
        return Lists.newArrayList(pairs);
    }

    /**
     * Builds an index key for a pair of components: the two component
     * lemmas joined by {@link TermSuiteConstants#PLUS}, the smaller lemma
     * (according to {@link String#compareTo(String)}) coming first. The key
     * is therefore the same whatever the order of the elements in the pair.
     *
     * @param pair
     *          the pair of components
     * @return
     *          the index key of the pair
     */
    public static String toIndexString(Pair<Component> pair) {
        String lemma1 = pair.getElement1().getLemma();
        String lemma2 = pair.getElement2().getLemma();
        boolean ordered = lemma1.compareTo(lemma2) <= 0;
        StringBuilder sb = new StringBuilder();
        sb.append(ordered ? lemma1 : lemma2);
        sb.append(TermSuiteConstants.PLUS);
        sb.append(ordered ? lemma2 : lemma1);
        return sb.toString();
    }

    /**
     *
     * <b>WARNING: This method does not behave as {@link #innerComponentPairs(Word)}.</b>
     * This method enforces that returned pairs cover the input word completely and
     * without any overlap.
     *
     * Example 1: with a word that is not a compound, it returns an empty list.
     *
     * Example 2: with a word that is a size-2 compound, it returns the only pair of lemmas possible:
     *
     * <code>
     * w = "ab|cd"
     * returnedPairs are [["ab","cd"]]
     * </code>
     *
     * Example 3: with a word that is a size-3 compound, it returns two pairs of lemmas:
     *
     * <code>
     * w = "ab|cd|ef"
     * returnedPairs are [["ab","cdef"], ["abcd","ef"]]
     * </code>
     *
     * Example 4: with a word that is a size-n compound, it returns n-1 pairs of lemmas:
     *
     * <code>
     * w = "comp1|comp2|...|compn"
     * returnedPairs are [
     *      ["comp1","comp2comp3...compn"],
     *      ["comp1comp2","comp3comp4...compn"],
     *      ...,
     *      ["comp1comp2...compn-1","compn"]
     * ]
     * </code>
     *
     * Each lemma of a pair is the lemma of the component obtained by
     * {@link #merge(Word, Iterable) merging} the atomic components on one
     * side of the split point.
     *
     * @param word
     *          The input compound word
     * @return
     *          the list of lemma pairs, one per split point, from the
     *          leftmost split to the rightmost one; empty when the word
     *          is not a compound
     */
    public static List<Pair<String>> asLemmaPairs(Word word) {
        List<Pair<String>> pairs = Lists.newArrayList();
        if (!word.isCompound()) {
            return pairs;
        }
        int n = word.getComponents().size();
        for (int split = 1; split < n; split++) {
            String head = merge(word, word.getComponents().subList(0, split)).getLemma();
            String tail = merge(word, word.getComponents().subList(split, n)).getLemma();
            pairs.add(new Pair<String>(head, tail));
        }
        return pairs;
    }
}