/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ package eu.project.ttc.models; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Set; import java.util.SortedSet; import com.google.common.base.Joiner; import com.google.common.base.Optional; import com.google.common.collect.AbstractIterator; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.project.ttc.utils.IteratorUtils; import eu.project.ttc.utils.TermSuiteConstants; import eu.project.ttc.utils.TermSuiteUtils; import eu.project.ttc.utils.TermUtils; public class Term implements Comparable<Term> { private static final String NO_OCCURRENCE = "[No occurrence]"; // private List<TermOccurrence> occurrences = Lists.newArrayList(); private OccurrenceStore occurrenceStore; private Set<Document> documents = Sets.newHashSet(); private Set<TermVariation> variations = Sets.newTreeSet(); private Set<TermVariation> bases = Sets.newTreeSet(); private Set<Term> extensions = Sets.newHashSet(); private Set<Term> extensionBases = Sets.newHashSet(); /* * The identifier and display string of this term */ private String groupingKey; /* * The numerical id in the term index */ private int id; /* * The term rank */ private int rank; private double normalizedTermFrequency; private double normalizedGeneralTermFrequency; /* * The weirdness ratio of this term */ private double specificity; /* * The logarithm of weirdness ratio of this term */ // private double wrLog; /* * The z-score of wrLog */ // private double wrLogZScore; /* * The frequency of this term */ private int frequency = 0; /* * The syntactic pattern of this term */ private String pattern; /* * The spotting rule */ private String spottingRule; /* * A flag that is true if this term is a fixed expression */ private boolean fixedExpression = false; /* * The morphological components of this term */ private List<TermWord> termWords = Lists.newArrayList(); /* * The context vector */ private Optional<ContextVector> contextVector = Optional.absent(); /** * The term class * @see TermClass */ private TermClass termClass; Term(OccurrenceStore occurrenceStore, int id) { this.occurrenceStore = occurrenceStore; this.id = id; } Term(OccurrenceStore occurrenceStore, int id, String termId, List<TermWord> termWords, String spottingRule) { this(occurrenceStore, id); this.groupingKey = termId; this.spottingRule = spottingRule; this.termWords = termWords; } public void setFrequency(int frequency) { this.frequency = frequency; } // public double getWR() { // return getNormalizedTermFrequency() / getNormalizedGeneralTermFrequency(); // } public void setPattern(String pattern) { this.pattern = pattern; } public Collection<TermOccurrence> getOccurrences() { return Collections.unmodifiableCollection(occurrenceStore.getOccurrences(this)); } // public boolean isEmpty() { // return occurrences.isEmpty(); // } /** * Increments the frequency, updates the inner list of source documents of this term * and optionnaly updates the inner list of occurrences of this term if param * <code>keepOccurrence</code> is set to true. * * @param e * the occurrence object to add * @param keepOccurrence * set this param to true if you need the occurrence to be stored within the object * * @see #getOccurrences() */ public void addOccurrence(TermOccurrence e, boolean keepOccurrence) { this.frequency++; this.documents.add(e.getSourceDocument()); if(keepOccurrence) occurrenceStore.addOccurrence(this,e); } /** * Adds the parameter occurrence and stores it to the inner occurrence list. * * @see #addOccurrence(TermOccurrence, boolean) * @param e * the occurrence object */ public void addOccurrence(TermOccurrence e) { this.addOccurrence(e, true); } public void addAll(Collection<TermOccurrence> c) { occurrenceStore.addAllOccurrences(this, c); } @Override public int compareTo(Term o) { return ComparisonChain.start() .compare(o.groupingKey.length(), this.groupingKey.length()) .compare(o.groupingKey, this.groupingKey) .result(); } @Override public int hashCode() { return groupingKey.hashCode(); } @Override public boolean equals(Object obj) { if (obj instanceof Term) return this.groupingKey.equals(((Term) obj).groupingKey); else return false; } public String getGroupingKey() { return groupingKey; } @Override public String toString() { return this.groupingKey; } public boolean isSingleWord() { return termWords.size() == 1; } public boolean isMultiWord() { return termWords.size() > 1; } /** * Builds a {@link TermVariation} object and add it to {@link #variations} and * variant{@link #bases}. * * @param variant * @param type * @param info */ public TermVariation addTermVariation(Term variant, VariationType type, Object info) { TermVariation tv = new TermVariation(type, this, variant, info); addTermVariation(tv); return tv; } /** * * Adds the {@link TermVariation} object to {@link #variations} and variant{@link #bases}. * @param termVariation */ public void addTermVariation(TermVariation termVariation) { this.variations.add(termVariation); termVariation.getVariant().bases.add(termVariation); } /** * Removes the param variation from this{@link #variations} and * from variant's {@link #bases}. * @param variation */ public void removeTermVariation(TermVariation variation) { this.variations.remove(variation); variation.getVariant().bases.remove(variation); } /** * * Calls {@link #asComponentIterator(boolean)} with param <code>true</code>. * * @return */ // public Iterator<LemmaStemHolder> asComponentIterator() { // return asComponentIterator(true); // } /** * Turns the term into a list {@link LemmaStemHolder} where each word of the term * is given as itself if not compound, or as a list of its components if compound. * * @param compoundLevel * set to <code>true</code> if this method should iterate over * components when words are compound, set it to <code>false</code> * if this method should iterate over plain words even though * they are compounds. * * @return * The list of words and/or (depending on <code>compoundLevel</code>) compounds. */ // public Iterator<LemmaStemHolder> asComponentIterator(final boolean compoundLevel) { // return new AbstractIterator<LemmaStemHolder>() { // private final Iterator<TermWord> it = Term.this.termWords.iterator(); // private Iterator<Component> currentWordIt; // // @Override // protected LemmaStemHolder computeNext() { // if(currentWordIt != null && currentWordIt.hasNext()) { // return currentWordIt.next(); // } else if(it.hasNext()) { // TermWord w = it.next(); // if(compoundLevel && w.getWord().isCompound()) { // this.currentWordIt = w.getWord().getComponents().iterator(); // return this.currentWordIt.next(); // } else { // currentWordIt = null; // return w.getWord(); // } // } else // return endOfData(); // } // }; // } public String getPattern() { if(pattern == null) { List<String> labels = Lists.newArrayListWithCapacity(termWords.size()); for(TermWord w:termWords) labels.add(w.getSyntacticLabel()); pattern = Joiner.on(' ').join(labels); } return pattern; } public List<TermWord> getWords() { return this.termWords; } public int getFrequency() { return frequency; } // public void removeOccurrence(String file, int begin, int end) { // // TODO Operation requires a linked list and is still too long. HashMap ? // Iterator<TermOccurrence> it = this.occurrenceStore.occurrenceIterator(this); // while(it.hasNext()) { // TermOccurrence occ = it.next(); // if(occ.getBegin() == begin && occ.getEnd() == end && file.equals(occ.getSourceDocument().getUrl())) { // it.remove(); // break; // } // } // this.frequency --; // } public TermWord firstWord() { return this.termWords.get(0); } public boolean isCompound() { return isSingleWord() && firstWord().getWord().isCompound(); } public int getId() { return id; } public String getSpottingRule() { return spottingRule; } public boolean isVariant() { return !bases.isEmpty(); } /** * Use {@link TermUtils#formGetter(TermIndex, boolean)} instead. * @return */ @Deprecated public Set<String> getForms() { List<String> forms = Lists.newArrayList(); for(TermOccurrence o:getOccurrences()) forms.add(o.getForm()); LinkedHashMap<String, Integer> counters = TermSuiteUtils.getCounters(forms); return counters.keySet(); } /** * Use {@link TermUtils#formGetter(TermIndex, boolean)} instead. * * @return */ @Deprecated public String getPilot() { Iterator<String> it = getForms().iterator(); if(it.hasNext()) return it.next(); else return NO_OCCURRENCE; } /** * Returns the concatenation of inner words' lemmas. */ public String getLemma() { StringBuilder builder = new StringBuilder(); int i = 0; for(TermWord tw:this.getWords()) { if(i>0) builder.append(TermSuiteConstants.WHITESPACE); builder.append(tw.getWord().getLemma()); i++; } return builder.toString(); } public List<VariationPath> getVariationPaths(int depth) { ArrayList<VariationPath> accu = Lists.newArrayList(); accumulateVariations( this, new ArrayList<TermVariation>(), depth, accu ); return accu; } private void accumulateVariations(Term baseTerm, List<TermVariation> currentPath, int depth, List<VariationPath> accu) { if(depth == 0 || (!currentPath.isEmpty() && this.equals(baseTerm)) // cycle prevention ) return; for(TermVariation tv:this.variations) { currentPath.add(tv); accu.add(new VariationPath(currentPath)); tv.getVariant().accumulateVariations(baseTerm, currentPath, depth-1, accu); } } /** * * Returns the context vector of this term. * * The context vector must have been explicitly invoked by user using * the <code>#computeContextVector(contextSize)</code> method. * * @see #isContextVectorComputed() * @return * the {@link ContextVector} of this term * @throws IllegalStateException * if the inner context vector does not exist (invoke <code>#computeContextVector(contextSize)</code>) * to generate this vector if missing. */ public ContextVector getContextVector() { if(this.contextVector.isPresent()) return this.contextVector.get(); else throw new IllegalStateException("Context vector not set on term " + this); } /** * * True if the context vector of this term has been computed. * float * @see #getContextVector() * @return */ public boolean isContextVectorComputed() { return this.contextVector.isPresent(); } /** * * Regenerate the single-word contextVector of this term and returns it. * * @param coTermsType * @param contextSize * @param cooccFrequencyThreshhold * @param useTermClasses * @return * The computed {@link ContextVector} object */ public ContextVector computeContextVector(OccurrenceType coTermsType, int contextSize, int cooccFrequencyThreshhold, boolean useTermClasses) { // 1- compute context vector ContextVector vector = new ContextVector(this, useTermClasses); vector.addAllCooccurrences(Iterators.concat(contextIterator(coTermsType, contextSize))); vector.removeCoTerm(this); this.contextVector = Optional.of(vector); // 2- filter entries that under the co-occurrence threshold if(cooccFrequencyThreshhold > 1) { for(ContextVector.Entry e:this.contextVector.get().getEntries()) { if(e.getNbCooccs()<cooccFrequencyThreshhold) this.contextVector.get().removeCoTerm(e.getCoTerm()); } } return this.contextVector.get(); } public Iterator<Iterator<TermOccurrence>> contextIterator(final OccurrenceType coTermsType, final int contextSize) { return new AbstractIterator<Iterator<TermOccurrence>>() { private Iterator<TermOccurrence> it = Term.this.occurrenceStore.occurrenceIterator(Term.this); @Override protected Iterator<TermOccurrence> computeNext() { if(this.it.hasNext()) return it.next().contextIterator(coTermsType, contextSize); else return endOfData(); } }; } public int getDocumentFrequency() { return this.documents.size(); } public void normalize(CrossTable crossTable) { } public Number getValue() { return 0; } public void setContextVector(ContextVector vector) { this.contextVector = Optional.of(vector); } public void clearContext() { this.contextVector = Optional.absent(); } public void setTermClass(TermClass termClass) { this.termClass = termClass; } public TermClass getTermClass() { return termClass; } // public double getWRLog() { // return Math.log10(1 + getWR()); // } public Set<TermVariation> getVariations() { return Collections.unmodifiableSet(variations); } public Set<TermVariation> getBases() { return Collections.unmodifiableSet(bases); } private Iterator<TermVariation> getTermVariationsIterator(final Iterable<TermVariation> iterable, final VariationType... variantTypes) { return new AbstractIterator<TermVariation>() { private Iterator<TermVariation> it = iterable.iterator(); private TermVariation current = null; private Set<VariationType> types = Sets.newHashSet(variantTypes); @Override protected TermVariation computeNext() { while(it.hasNext()) { this.current = it.next(); if(this.types.contains(this.current.getVariationType())) return this.current; } return endOfData(); } }; } /** * Get all variations of given {@link VariationType}s * * @param variantTypes * @return */ public Iterable<TermVariation> getVariations(final VariationType... variantTypes) { return IteratorUtils.toIterable(getTermVariationsIterator(this.variations, variantTypes)); } /** * Get all bases of given {@link VariationType}s * * @param variantTypes * @return */ public Iterable<TermVariation> getBases(final VariationType... variantTypes) { return IteratorUtils.toIterable(getTermVariationsIterator(this.bases, variantTypes)); } public void setFrequencyNorm(double normalizedTermFrequency) { this.normalizedTermFrequency = normalizedTermFrequency; } public void setGeneralFrequencyNorm(double normalizedGeneralTermFrequency) { this.normalizedGeneralTermFrequency = normalizedGeneralTermFrequency; } /** * The average number of occurrences of this term in the * general language corpus for each slice of 1000 words. * * @return */ public double getGeneralFrequencyNorm() { return normalizedGeneralTermFrequency; } /** * The average number of occurrences of this term in the * corpus for each slice of 1000 words. * * @return */ public double getFrequencyNorm() { return normalizedTermFrequency; } public void addExtension(Term t) { this.extensions.add(t); t.extensionBases.add(this); } public Set<Term> getExtensions() { return Collections.unmodifiableSet(this.extensions); } public Set<Term> getExtensionBases() { return Collections.unmodifiableSet(this.extensionBases); } /** * Do not use this, this will disappear on version 3.0. * @param variations */ @Deprecated public void setVariations(SortedSet<TermVariation> variations) { this.variations = Sets.newTreeSet(variations); } /** * Do not use this, this will disappear on version 3.0. * @param variations */ @Deprecated public void setBases(SortedSet<TermVariation> bases) { this.bases = Sets.newTreeSet(bases); } public int getRank() { return rank; } public void setRank(int rank) { this.rank = rank; } public double getSpecificity() { return specificity; } public void setSpecificity(double specificity) { this.specificity = specificity; } public void setFixedExpression(boolean fixedExpression) { this.fixedExpression = fixedExpression; } public boolean isFixedExpression() { return this.fixedExpression; } }