/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ package eu.project.ttc.utils; import java.io.PrintStream; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import com.google.common.base.Optional; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.project.ttc.engines.desc.Lang; import eu.project.ttc.engines.desc.TermSuiteResourceException; import eu.project.ttc.engines.morpho.CompoundUtils; import eu.project.ttc.models.Component; import eu.project.ttc.models.ContextVector; import eu.project.ttc.models.Term; import eu.project.ttc.models.TermIndex; import eu.project.ttc.models.TermOccurrence; import eu.project.ttc.models.TermVariation; import eu.project.ttc.models.TermWord; import eu.project.ttc.models.Word; import eu.project.ttc.models.index.TermIndexes; import eu.project.ttc.models.index.TermMeasure; import eu.project.ttc.resources.GeneralLanguageResource; import eu.project.ttc.tools.TermSuiteResource; public class TermUtils { private static final String MSG_NOT_AN_EXTENSION = "Term '%s' is no extension of term '%s'"; private static final String MSG_NOT_AN_AFFIX = "Term '%s' is contained into term '%s', but not an affix."; /** * Most frequent first */ public static Comparator<Term> frequencyComparator = new Comparator<Term>() { @Override public int compare(Term o1, Term o2) { return ComparisonChain.start() .compare(o2.getFrequency(), o1.getFrequency()) .result(); } }; public static TermFormGetter formGetter(TermIndex termIndex, boolean downcaseForms) { return new TermFormGetter(termIndex, downcaseForms); } public static void showIndex(TermIndex index, PrintStream stream) { Optional<Pattern> watchExpression = Optional.absent(); showIndex(index, stream, watchExpression); } public static void showIndex(TermIndex index, PrintStream stream, Optional<Pattern> watchExpression) { for(Term term:index.getTerms()) { if(!watchExpression.isPresent() || (watchExpression.isPresent() && watchExpression.get().matcher(term.getGroupingKey()).find()) ) { stream.println(term); // for(Term t:term.getGraphicalVariants()) // stream.format("\tgraphical: %s\n" , t.getGroupingKey()); for(TermVariation variation:term.getVariations()) stream.format("\tsyntactic: %s\n" , variation.getVariant().getGroupingKey()); } } } public static void showTopNTermsBy(TermIndex index, TermMeasure measure, PrintStream out, int n) { List<Term> terms = Lists.newArrayList(index.getTerms()); Collections.sort(terms, measure.getTermComparator(true)); int i = 0; for(Term t:terms) { out.println(t); if(i++ > n) break; } } public static void showCompounds(TermIndex index, PrintStream out, int threshhold) { List<Term> terms = Lists.newArrayList(); for(Term term:index.getTerms()) { if(term.isCompound() && term.getFrequency() >= threshhold) terms.add(term); } Collections.sort(terms, frequencyComparator); for(Term term:terms) out.println(term); } /** * * Finds in an input term all single-word terms it is made off. * If the input term has compounds, this method will iterate * over each compound and try to find a matching swt for each compound. * * This method creates an index on TermIndex based on key * {@link TermIndexes#SINGLE_WORD_LEMMA}. * * @param termIndex * The {@link TermIndex} in which single word terms must be found. * @param term * The input term. * @return * The list of single word terms. * * @see Term#asComponentIterator(boolean) */ public static List<Term> getSingleWordTerms(TermIndex termIndex, Term term) { List<Term> terms = Lists.newArrayList(); for(TermWord tw:term.getWords()) { Term swt = termIndex.getTermByGroupingKey(toGroupingKey(tw)); if(swt != null) terms.add(swt); } return terms; } public static String collapseText(String coveredText) { char[] charArray = coveredText.toCharArray(); if(charArray.length == 0) return ""; char last = charArray[0]; StringBuilder builder = new StringBuilder(); builder.append(last); for(int i=1;i<charArray.length; i++) { char c = charArray[i]; if(Character.isWhitespace(c)) { c = TermSuiteConstants.WHITESPACE; if(Character.isWhitespace(last)) continue; } builder.append(c); last = c; } return builder.toString().trim(); } public static void showContextVector(ContextVector contextVector, int topN) { Set<ContextVector.Entry> entries = Sets.newTreeSet(contextVector.getEntries()); int i = 0; for(ContextVector.Entry e:entries) { i++; if(i>topN) break; System.out.format("\t%-12s: %d\n", e.getCoTerm().getLemma(), e.getNbCooccs()); } } /** * Returns the strictness of t1 based on t2, i.e. the ratio of appearance * in an occurrence that do not overlap with t2. * * @param t1 * the term to analyze * @param t2 * the base term * @return * fstrict(t1) / f(t1) */ public static double getStrictness(Term t1, Term t2) { Collection<TermOccurrence> occ1 = Lists.newArrayList(t1.getOccurrences()); TermOccurrenceUtils.removeOverlaps(t2.getOccurrences(), occ1); double t1Strict = occ1.size(); double t1F = t1.getFrequency(); return t1Strict / t1F; } /** * * Finds in a {@link TermIndex} the biggest extension affix term of a term depending * on a base term. * * For example, the term "offshore wind turbine" is an extension of * "wind turbine". The extension affix is the term "offshore". * * @param termIndex * The term index that both terms belong to. * @param base * The base term * @param extension * The extension term * @return * the extension affix found in <code>termIndex</code>, <code>null</code> if none * has been found. * @throws IllegalArgumentException if <code>extension</code> id not an * extension of the term <code>base</code>. */ public static Term getExtensionAffix(TermIndex termIndex, Term base, Term extension) { int index = TermUtils.getPosition(base, extension); if(index == -1) throw new IllegalStateException(String.format(MSG_NOT_AN_EXTENSION, extension, base) ); /* * true if prefix, false if suffix */ boolean isPrefix = false; if(index == 0) isPrefix = true; else if(index + base.getWords().size() == extension.getWords().size()) isPrefix = false; // suffix else { throw new IllegalStateException(String.format(MSG_NOT_AN_AFFIX, extension, base) ); } if(isPrefix) return findBiggestSuffix( termIndex, extension.getWords().subList(index + base.getWords().size(), extension.getWords().size()) ); else return findBiggestPrefix( termIndex, extension.getWords().subList(0, index) ); } /** * Finds in a {@link TermIndex} the biggest prefix of a sequence of * {@link TermWord}s that exists as a term. * * @param termIndex * the term index * @param words * the initial sequence of {@link TermWord}s * @return * A {@link Term} found in <code>termIndex</code> that makes the * biggest possible prefix sequence for <code>words</code>. */ public static Term findBiggestPrefix(TermIndex termIndex, List<TermWord> words) { Term t; String gKey; for(int i = words.size(); i > 0 ; i--) { gKey = TermSuiteUtils.getGroupingKey(words.subList(0, i)); t = termIndex.getTermByGroupingKey(gKey); if(t!=null) return t; } return null; } /** * Finds in a {@link TermIndex} the biggest suffix of a sequence of * {@link TermWord}s that exists as a term. * * @param termIndex * the term index * @param words * the initial sequence of {@link TermWord}s * @return * A {@link Term} found in <code>termIndex</code> that makes the * biggest possible suffix sequence for <code>words</code>. */ public static Term findBiggestSuffix(TermIndex termIndex, List<TermWord> words) { Term t; String gKey; for(int i = 0; i < words.size() ; i++) { gKey = TermSuiteUtils.getGroupingKey(words.subList(i, words.size())); t = termIndex.getTermByGroupingKey(gKey); if(t!=null) return t; } return null; } public static boolean isIncludedIn(Term term, Term inTerm) { return getPosition(term, inTerm) != -1; } public static boolean isPrefixOf(Term term, Term ofTerm) { return getPosition(term, ofTerm) == 0; } public static boolean isSuffixOf(Term term, Term ofTerm) { return getPosition(term, ofTerm) + term.getWords().size() == ofTerm.getWords().size(); } /** * Finds the index of appearance of a term's sub-term. * * * @param subTerm * the inner term, must be included in <code>term</code> * @param term * the container term. * @return * the starting index of <code>subTerm</code> in <code>term</code>. -1 otherwise. */ public static int getPosition(Term subTerm, Term term) { int startingIndex = -1; int j = 0; for(int i=0; i<term.getWords().size(); i++) { if(term.getWords().get(i).equals(subTerm.getWords().get(j))) { j++; if(startingIndex == -1) startingIndex = i; } else { startingIndex = -1; j = 0; } if(j == subTerm.getWords().size()) return startingIndex; } return -1; } /** * * @param l * @param t * @return */ public static int getGeneralFrequency(Lang l, Term t) { String resName = TermSuiteResource.GENERAL_LANGUAGE.getPath(l); GeneralLanguageResource generalLanguage = new GeneralLanguageResource(); try { generalLanguage.load(TermUtils.class.getClassLoader().getResourceAsStream(resName)); return generalLanguage.getFrequency(t.getLemma(), t.getPattern()); } catch (Exception e) { throw new TermSuiteResourceException("Could not read resource " + resName, e); } } public static double getExtensionGain(Term extension, Term extensionAffix) { return ((double)extension.getFrequency())/extensionAffix.getFrequency(); } private static final String GROUPING_KEY_FORMAT = "%s: %s"; public static String toGroupingKey(TermWord termWord) { return String.format(GROUPING_KEY_FORMAT, termWord.getSyntacticLabel().toLowerCase(), termWord.getWord().getLemma()); } /** * * Transforms a term into a list of component sets. * * This * * * * @param term * @return */ public static List<Set<Component>> toComponentSets(Iterable<Word> words) { List<Set<Component>> sets = Lists.newArrayList(); for(Word w:words) { if(w.isCompound()) sets.add(Sets.newHashSet(CompoundUtils.allSizeComponents(w))); else { sets.add(Sets.newHashSet(new Component(w.getLemma(), 0, w.getLemma().length()))); } } return sets; } }