/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import com.google.common.base.Optional;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.engines.desc.TermSuiteResourceException;
import eu.project.ttc.engines.morpho.CompoundUtils;
import eu.project.ttc.models.Component;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.models.index.TermMeasure;
import eu.project.ttc.resources.GeneralLanguageResource;
import eu.project.ttc.tools.TermSuiteResource;
public class TermUtils {
/**
 * Message template used by {@link #getExtensionAffix(TermIndex, Term, Term)} when the
 * base term cannot be located inside the extension term. Format arguments, in order:
 * the extension term, the base term.
 */
private static final String MSG_NOT_AN_EXTENSION = "Term '%s' is no extension of term '%s'";
/**
 * Message template used by {@link #getExtensionAffix(TermIndex, Term, Term)} when the
 * base term is located inside the extension term, but neither at its very beginning
 * nor at its very end. Format arguments, in order: the extension term, the base term.
 */
private static final String MSG_NOT_AN_AFFIX = "Term '%s' is contained into term '%s', but not an affix.";
/**
 * Orders terms by decreasing frequency, i.e. the most frequent term comes first.
 */
public static Comparator<Term> frequencyComparator = new Comparator<Term>() {
    @Override
    public int compare(Term first, Term second) {
        // The operands are swapped on purpose: this yields a descending order.
        ComparisonChain byDecreasingFrequency = ComparisonChain.start()
                .compare(second.getFrequency(), first.getFrequency());
        return byDecreasingFrequency.result();
    }
};
/**
 * Creates a {@link TermFormGetter} bound to the given term index.
 *
 * @param termIndex
 *          the term index handed to the {@link TermFormGetter} constructor
 * @param downcaseForms
 *          flag handed as-is to the {@link TermFormGetter} constructor
 * @return
 *          a newly created {@link TermFormGetter}
 */
public static TermFormGetter formGetter(TermIndex termIndex, boolean downcaseForms) {
    TermFormGetter getter = new TermFormGetter(termIndex, downcaseForms);
    return getter;
}
/**
 * Prints every term of the index, together with its variations, without any
 * filtering. Delegates to {@link #showIndex(TermIndex, PrintStream, Optional)}
 * with an absent watch expression.
 *
 * @param index
 *          the term index to print
 * @param stream
 *          the stream to print to
 */
public static void showIndex(TermIndex index, PrintStream stream) {
    showIndex(index, stream, Optional.<Pattern>absent());
}
/**
 * Prints the terms of the index and, under each printed term, the grouping key of
 * each of its variations.
 *
 * @param index
 *          the term index to print
 * @param stream
 *          the stream to print to
 * @param watchExpression
 *          when present, only the terms whose grouping key contains a match of
 *          this pattern are printed; when absent, every term is printed
 */
public static void showIndex(TermIndex index, PrintStream stream, Optional<Pattern> watchExpression) {
    for (Term term : index.getTerms()) {
        boolean selected = !watchExpression.isPresent()
                || watchExpression.get().matcher(term.getGroupingKey()).find();
        if (!selected) {
            continue;
        }
        stream.println(term);
        // Graphical variants are not printed (disabled code kept for reference):
        // for(Term t:term.getGraphicalVariants())
        //     stream.format("\tgraphical: %s\n" , t.getGroupingKey());
        for (TermVariation variation : term.getVariations()) {
            String variantKey = variation.getVariant().getGroupingKey();
            stream.format("\tsyntactic: %s\n", variantKey);
        }
    }
}
/**
 * Prints the first <code>n</code> terms of the index, once sorted with the
 * comparator supplied by the given measure.
 *
 * At most <code>n</code> terms are printed; nothing is printed when
 * <code>n</code> is zero or negative.
 *
 * @param index
 *          the term index whose terms are printed
 * @param measure
 *          the measure providing the sort order, obtained through
 *          <code>getTermComparator(true)</code> (presumably the "best first"
 *          ordering -- to be confirmed against {@link TermMeasure})
 * @param out
 *          the stream to print to
 * @param n
 *          the maximum number of terms to print
 */
public static void showTopNTermsBy(TermIndex index, TermMeasure measure, PrintStream out, int n) {
    List<Term> terms = Lists.newArrayList(index.getTerms());
    Collections.sort(terms, measure.getTermComparator(true));
    // Clamp the limit so that exactly min(n, size) terms are shown. The former
    // post-print "i++ > n" test let n + 2 terms through.
    int limit = Math.min(Math.max(n, 0), terms.size());
    for (Term t : terms.subList(0, limit)) {
        out.println(t);
    }
}
/**
 * Prints the compound terms of the index whose frequency reaches the given
 * threshold, most frequent first.
 *
 * @param index
 *          the term index to scan
 * @param out
 *          the stream to print to
 * @param threshhold
 *          the minimum frequency (inclusive) a compound must have to be printed
 */
public static void showCompounds(TermIndex index, PrintStream out, int threshhold) {
    List<Term> compounds = Lists.newArrayList();
    for (Term candidate : index.getTerms()) {
        if (!candidate.isCompound()) {
            continue;
        }
        if (candidate.getFrequency() >= threshhold) {
            compounds.add(candidate);
        }
    }
    Collections.sort(compounds, frequencyComparator);
    for (Term compound : compounds) {
        out.println(compound);
    }
}
/**
 * Finds in a term index the single-word terms an input term is made of.
 *
 * For each word of the input term, the grouping key of that word is built with
 * {@link #toGroupingKey(TermWord)} and looked up in <code>termIndex</code>.
 * Words for which the index holds no term are silently skipped, so the
 * returned list may be shorter than the word list of the input term.
 *
 * @param termIndex
 *          The {@link TermIndex} in which single word terms must be found.
 * @param term
 *          The input term.
 * @return
 *          The list of single word terms found, in the order of the words of
 *          <code>term</code>. Never <code>null</code>.
 */
public static List<Term> getSingleWordTerms(TermIndex termIndex, Term term) {
    List<Term> singleWordTerms = Lists.newArrayList();
    for (TermWord word : term.getWords()) {
        String wordKey = toGroupingKey(word);
        Term match = termIndex.getTermByGroupingKey(wordKey);
        if (match == null) {
            continue;
        }
        singleWordTerms.add(match);
    }
    return singleWordTerms;
}
/**
 * Normalizes the whitespace of a text: every run of consecutive whitespace
 * characters (as defined by {@link Character#isWhitespace(char)}) is replaced
 * with a single {@link TermSuiteConstants#WHITESPACE}, then the result is
 * trimmed with {@link String#trim()}.
 *
 * The very first character goes through the same normalization as the others,
 * so that a leading whitespace that {@link String#trim()} does not recognize
 * (code point above U+0020, e.g. U+2003) is handled the same way as a trailing
 * or an inner one.
 *
 * @param coveredText
 *          the text to normalize, must not be <code>null</code>
 * @return
 *          the normalized text, the empty string if <code>coveredText</code> is empty
 */
public static String collapseText(String coveredText) {
    StringBuilder builder = new StringBuilder(coveredText.length());
    boolean previousWasWhitespace = false;
    for (int i = 0; i < coveredText.length(); i++) {
        char c = coveredText.charAt(i);
        if (Character.isWhitespace(c)) {
            if (previousWasWhitespace) {
                // Still inside a whitespace run: already represented in the output.
                continue;
            }
            c = TermSuiteConstants.WHITESPACE;
        }
        builder.append(c);
        previousWasWhitespace = Character.isWhitespace(c);
    }
    return builder.toString().trim();
}
/**
 * Prints on the standard output the first <code>topN</code> entries of a
 * context vector, one per line, as the lemma of the co-term followed by its
 * number of co-occurrences. Entries are visited in their natural order (they
 * are copied into a sorted set first).
 *
 * @param contextVector
 *          the context vector to print
 * @param topN
 *          the maximum number of entries to print
 */
public static void showContextVector(ContextVector contextVector, int topN) {
    Set<ContextVector.Entry> sortedEntries = Sets.newTreeSet(contextVector.getEntries());
    int printed = 0;
    for (ContextVector.Entry entry : sortedEntries) {
        if (printed >= topN) {
            break;
        }
        System.out.format("\t%-12s: %d\n", entry.getCoTerm().getLemma(), entry.getNbCooccs());
        printed++;
    }
}
/**
 * Returns the strictness of t1 based on t2, i.e. the ratio of the occurrences
 * of t1 that do not overlap with an occurrence of t2.
 *
 * The occurrences of t1 are copied, and the copy is filtered by
 * <code>TermOccurrenceUtils.removeOverlaps</code> (presumably removing from
 * the copy every occurrence overlapping one of t2 -- see that helper). The
 * number of remaining occurrences is then divided by the frequency of t1.
 *
 * @param t1
 *          the term to analyze
 * @param t2
 *          the base term
 * @return
 *          fstrict(t1) / f(t1)
 */
public static double getStrictness(Term t1, Term t2) {
    // Work on a copy, so that the occurrences of t1 itself are left untouched.
    Collection<TermOccurrence> strictOccurrences = Lists.newArrayList(t1.getOccurrences());
    TermOccurrenceUtils.removeOverlaps(t2.getOccurrences(), strictOccurrences);
    double strictCount = strictOccurrences.size();
    return strictCount / t1.getFrequency();
}
/**
 * Finds in a {@link TermIndex} the biggest extension affix term of a term
 * depending on a base term.
 *
 * For example, the term "offshore wind turbine" is an extension of
 * "wind turbine". The extension affix is the term "offshore".
 *
 * When the base term starts the extension term, the affix is searched with
 * {@link #findBiggestSuffix(TermIndex, List)} among the words following the
 * base. When the base term ends the extension term, the affix is searched with
 * {@link #findBiggestPrefix(TermIndex, List)} among the words preceding the base.
 *
 * @param termIndex
 *          The term index that both terms belong to.
 * @param base
 *          The base term
 * @param extension
 *          The extension term
 * @return
 *          the extension affix found in <code>termIndex</code>, <code>null</code>
 *          if none has been found.
 * @throws IllegalStateException if <code>extension</code> is not an extension
 *          of the term <code>base</code>, or if <code>base</code> is located
 *          strictly inside <code>extension</code> (neither at its start nor at
 *          its end).
 */
public static Term getExtensionAffix(TermIndex termIndex, Term base, Term extension) {
    int index = TermUtils.getPosition(base, extension);
    if (index == -1) {
        throw new IllegalStateException(
                String.format(MSG_NOT_AN_EXTENSION, extension, base));
    }

    List<TermWord> extensionWords = extension.getWords();
    int baseSize = base.getWords().size();
    int extensionSize = extensionWords.size();

    if (index == 0) {
        // The base opens the extension: the affix lies after the base.
        return findBiggestSuffix(termIndex, extensionWords.subList(baseSize, extensionSize));
    }
    if (index + baseSize == extensionSize) {
        // The base closes the extension: the affix lies before the base.
        return findBiggestPrefix(termIndex, extensionWords.subList(0, index));
    }
    throw new IllegalStateException(
            String.format(MSG_NOT_AN_AFFIX, extension, base));
}
/**
 * Finds in a {@link TermIndex} the biggest prefix of a sequence of
 * {@link TermWord}s that exists as a term.
 *
 * Prefixes are tried from the longest (the whole sequence) down to the
 * shortest (the first word only); the first one whose grouping key is known
 * to the index wins.
 *
 * @param termIndex
 *          the term index
 * @param words
 *          the initial sequence of {@link TermWord}s
 * @return
 *          A {@link Term} found in <code>termIndex</code> that makes the
 *          biggest possible prefix sequence for <code>words</code>,
 *          <code>null</code> if no prefix exists as a term.
 */
public static Term findBiggestPrefix(TermIndex termIndex, List<TermWord> words) {
    int length = words.size();
    while (length > 0) {
        String prefixKey = TermSuiteUtils.getGroupingKey(words.subList(0, length));
        Term candidate = termIndex.getTermByGroupingKey(prefixKey);
        if (candidate != null) {
            return candidate;
        }
        length--;
    }
    return null;
}
/**
 * Finds in a {@link TermIndex} the biggest suffix of a sequence of
 * {@link TermWord}s that exists as a term.
 *
 * Suffixes are tried from the longest (the whole sequence) down to the
 * shortest (the last word only); the first one whose grouping key is known
 * to the index wins.
 *
 * @param termIndex
 *          the term index
 * @param words
 *          the initial sequence of {@link TermWord}s
 * @return
 *          A {@link Term} found in <code>termIndex</code> that makes the
 *          biggest possible suffix sequence for <code>words</code>,
 *          <code>null</code> if no suffix exists as a term.
 */
public static Term findBiggestSuffix(TermIndex termIndex, List<TermWord> words) {
    final int end = words.size();
    int start = 0;
    while (start < end) {
        String suffixKey = TermSuiteUtils.getGroupingKey(words.subList(start, end));
        Term candidate = termIndex.getTermByGroupingKey(suffixKey);
        if (candidate != null) {
            return candidate;
        }
        start++;
    }
    return null;
}
/**
 * Tells whether {@link #getPosition(Term, Term)} locates <code>term</code>
 * inside <code>inTerm</code>.
 *
 * @param term
 *          the candidate inner term
 * @param inTerm
 *          the container term
 * @return
 *          <code>true</code> if a position has been found, <code>false</code> otherwise
 */
public static boolean isIncludedIn(Term term, Term inTerm) {
    int position = getPosition(term, inTerm);
    return -1 != position;
}
/**
 * Tells whether {@link #getPosition(Term, Term)} locates <code>term</code>
 * at the very beginning (index 0) of <code>ofTerm</code>.
 *
 * @param term
 *          the candidate prefix term
 * @param ofTerm
 *          the container term
 * @return
 *          <code>true</code> if <code>term</code> is found at index 0 of
 *          <code>ofTerm</code>, <code>false</code> otherwise
 */
public static boolean isPrefixOf(Term term, Term ofTerm) {
    int position = getPosition(term, ofTerm);
    return 0 == position;
}
/**
 * Tells whether the words of <code>term</code> are the last words of
 * <code>ofTerm</code>.
 *
 * The tail of <code>ofTerm</code> is compared directly with the words of
 * <code>term</code> instead of deriving the answer from
 * {@link #getPosition(Term, Term)}: that method only reports one position (so
 * a suffix is missed when the sub-term also occurs earlier in the container),
 * and adding its "not found" value (-1) to the size of <code>term</code> used
 * to wrongly yield <code>true</code> whenever <code>term</code> had exactly
 * one more word than <code>ofTerm</code>.
 *
 * @param term
 *          the candidate suffix term
 * @param ofTerm
 *          the container term
 * @return
 *          <code>true</code> if <code>term</code> has at least one word, is not
 *          longer than <code>ofTerm</code>, and its words equal the last words
 *          of <code>ofTerm</code>; <code>false</code> otherwise
 */
public static boolean isSuffixOf(Term term, Term ofTerm) {
    List<TermWord> suffixWords = term.getWords();
    List<TermWord> containerWords = ofTerm.getWords();
    int offset = containerWords.size() - suffixWords.size();
    if (suffixWords.isEmpty() || offset < 0) {
        return false;
    }
    return containerWords.subList(offset, containerWords.size()).equals(suffixWords);
}
/**
 * Finds the index of appearance of a term's sub-term.
 *
 * The words of <code>subTerm</code> are searched as a contiguous sub-list of
 * the words of <code>term</code>, words being compared with
 * <code>equals</code>. The search relies on
 * {@link Collections#indexOfSubList(List, List)}, which restarts properly
 * after a partial match (e.g. "a b" is found at index 1 in "a a b").
 *
 * @param subTerm
 *          the inner term, must be included in <code>term</code>
 * @param term
 *          the container term.
 * @return
 *          the starting index of the first occurrence of <code>subTerm</code>
 *          in <code>term</code>. -1 otherwise, including when
 *          <code>subTerm</code> has no word.
 */
public static int getPosition(Term subTerm, Term term) {
    List<TermWord> subWords = subTerm.getWords();
    if (subWords.isEmpty()) {
        // indexOfSubList would report 0 for an empty target; an empty
        // sub-term is reported as "not found" instead.
        return -1;
    }
    return Collections.indexOfSubList(term.getWords(), subWords);
}
/**
 * Reads the frequency of a term in the general language resource of a language.
 *
 * The resource is located on the classpath from
 * <code>TermSuiteResource.GENERAL_LANGUAGE.getPath(l)</code>, loaded into a
 * fresh {@link GeneralLanguageResource}, and queried with the lemma and the
 * pattern of the term. The resource is loaded again on every call.
 *
 * The stream opened on the resource is always closed before returning,
 * whether or not the loading succeeded.
 *
 * @param l
 *          the language selecting the general language resource
 * @param t
 *          the term whose lemma and pattern are looked up
 * @return
 *          the frequency returned by the general language resource
 * @throws TermSuiteResourceException if the resource cannot be loaded or queried
 */
public static int getGeneralFrequency(Lang l, Term t) {
    String resName = TermSuiteResource.GENERAL_LANGUAGE.getPath(l);
    GeneralLanguageResource generalLanguage = new GeneralLanguageResource();
    InputStream resourceStream = null;
    try {
        resourceStream = TermUtils.class.getClassLoader().getResourceAsStream(resName);
        generalLanguage.load(resourceStream);
        return generalLanguage.getFrequency(t.getLemma(), t.getPattern());
    } catch (Exception e) {
        throw new TermSuiteResourceException("Could not read resource " + resName, e);
    } finally {
        if (resourceStream != null) {
            try {
                resourceStream.close();
            } catch (IOException ignored) {
                // Best effort: a failure to close must not mask the result
                // or the original exception.
            }
        }
    }
}
/**
 * Computes the ratio between the frequency of an extension term and the
 * frequency of its extension affix, as a floating-point division.
 *
 * @param extension
 *          the extension term (numerator)
 * @param extensionAffix
 *          the extension affix term (denominator)
 * @return
 *          frequency of <code>extension</code> divided by the frequency of
 *          <code>extensionAffix</code>
 */
public static double getExtensionGain(Term extension, Term extensionAffix) {
    double extensionFrequency = extension.getFrequency();
    return extensionFrequency / extensionAffix.getFrequency();
}
/** Layout of a single-word grouping key: lower-cased syntactic label, colon, space, lemma. */
private static final String GROUPING_KEY_FORMAT = "%s: %s";

/**
 * Builds the grouping key of a single term word, made of its lower-cased
 * syntactic label followed by the lemma of its word.
 *
 * @param termWord
 *          the term word
 * @return
 *          the grouping key, formatted as <code>label: lemma</code>
 */
public static String toGroupingKey(TermWord termWord) {
    String label = termWord.getSyntacticLabel().toLowerCase();
    String lemma = termWord.getWord().getLemma();
    return String.format(GROUPING_KEY_FORMAT, label, lemma);
}
/**
 * Transforms a sequence of words into a list of component sets, one set per
 * word and in the same order.
 *
 * For a compound word, the set holds the components returned by
 * <code>CompoundUtils.allSizeComponents</code>. For any other word, the set
 * holds a single component built from the lemma of the word and spanning the
 * whole lemma (from offset 0 to the lemma length).
 *
 * @param words
 *          the words to transform
 * @return
 *          the list of component sets
 */
public static List<Set<Component>> toComponentSets(Iterable<Word> words) {
    List<Set<Component>> componentSets = Lists.newArrayList();
    for (Word word : words) {
        Set<Component> components;
        if (word.isCompound()) {
            components = Sets.newHashSet(CompoundUtils.allSizeComponents(word));
        } else {
            // A simple word is its own and only component.
            components = Sets.newHashSet(new Component(word.getLemma(), 0, word.getLemma().length()));
        }
        componentSets.add(components);
    }
    return componentSets;
}
}