/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.models.index;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermBuilder;
import eu.project.ttc.models.TermClass;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.selectors.TermSelector;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.TermSuiteUtils;
/**
 * The in-memory implementation of a {@link TermIndex}.
 *
 * <p>Terms are indexed both by id and by grouping key, words by lemma and
 * documents by URL, all in plain hash maps. No synchronization is performed
 * in this class.</p>
 *
 * @author Damien Cram
 *
 */
public class MemoryTermIndex implements TermIndex {
    private static final Logger LOGGER = LoggerFactory.getLogger(MemoryTermIndex.class);

    private static final String MSG_NO_SUCH_PROVIDER = "No such value provider: %s";
    public static final String MSG_NO_SUCH_TERM = "No such term in term index: %s";
    private static final String MSG_TERM_KEY_EXISTS = "A term with this grouping key is already in term index: %s";
    private static final String MSG_TERM_ID_EXISTS = "A term with this id is already in term index: %s";
    private static final String MSG_WORD_EXISTS = "A word with this lemma is already in term index: %s";

    private static final String MEASURE_WR = "wr";
    private static final String MEASURE_WRLOG = "wrLog";
    private static final String MEASURE_FREQUENCY = "frequency";

    /**
     * The occurrence store
     */
    private final OccurrenceStore occurrenceStore;

    /*
     * The root index of terms. Variants must not be referenced at
     * this level of index. They may be indexed from their base-term
     * instead.
     */
    private final Map<Integer, Term> termsById = Maps.newHashMap();
    /* Never read nor written by any method of this class. */
    private List<Term> rankedTerms = Lists.newArrayList();
    private final Map<String, Term> termsByGroupingKey = Maps.newHashMap();
    private final Map<String, CustomTermIndex> customIndexes = Maps.newHashMap();
    private final Map<String, TermMeasure> termMeasures = Maps.newHashMap();
    /* Words, keyed by lemma. */
    private final Map<String, Word> wordIndex = Maps.newHashMap();
    /* Documents, keyed by URL. */
    private final Map<String, Document> documents = Maps.newHashMap();
    private final Set<TermClass> termClasses = Sets.newHashSet();

    private final String name;
    private final Lang lang;
    private String corpusId;

    /* Id given to the next document created by getDocument(String). */
    private int currentDocumentId = 0;
    /* Cursor used by newId() to find a term id that is not in use. */
    private int currentId = 0;
    private int nbWordAnnotations = 0;
    private int nbSpottedTerms = 0;

    /**
     * Creates an empty term index and registers its three built-in
     * measures ("wr", "wrLog" and "frequency").
     *
     * @param name
     *          the name of this term index
     * @param lang
     *          the language of this term index
     * @param occurrenceStore
     *          the store notified when terms are removed from this index
     */
    public MemoryTermIndex(String name, Lang lang, OccurrenceStore occurrenceStore) {
        this.lang = lang;
        this.name = name;
        this.occurrenceStore = occurrenceStore;
        this.termMeasures.put(MEASURE_WR, new WRMeasure(this));
        this.termMeasures.put(MEASURE_WRLOG, new WRLogMeasure(this));
        this.termMeasures.put(MEASURE_FREQUENCY, new FrequencyMeasure(this));
    }

    /**
     * Registers a term by grouping key and by id, indexes it in every
     * existing custom index and registers all of its words.
     *
     * @param term
     *          the term to add
     * @throws IllegalArgumentException
     *          if a term with the same grouping key or the same id is
     *          already in this index
     */
    @Override
    public void addTerm(Term term) {
        String groupingKey = term.getGroupingKey();
        Preconditions.checkArgument(
                !this.termsByGroupingKey.containsKey(groupingKey),
                MSG_TERM_KEY_EXISTS,
                groupingKey);
        Preconditions.checkNotNull(term.getId());
        Preconditions.checkArgument(
                !this.termsById.containsKey(term.getId()),
                MSG_TERM_ID_EXISTS,
                term.getId());
        this.termsByGroupingKey.put(groupingKey, term);
        this.termsById.put(term.getId(), term);
        for (CustomTermIndex termIndex : this.customIndexes.values()) {
            termIndex.indexTerm(this, term);
        }
        // Words of the term silently replace any word already indexed under the same lemma.
        for (TermWord tw : term.getWords()) {
            privateAddWord(tw.getWord(), false);
        }
    }

    /**
     * Registers a word under its lemma.
     *
     * @param word
     *          the word to add
     * @throws IllegalArgumentException
     *          if a word with the same lemma is already in this index
     */
    @Override
    public void addWord(Word word) {
        privateAddWord(word, true);
    }

    /*
     * Puts the word in the word index under its lemma, replacing any
     * previous entry. When failIfAlreadyPresent is true, an existing
     * entry for that lemma is rejected instead.
     */
    private void privateAddWord(Word word, boolean failIfAlreadyPresent) {
        String lemma = word.getLemma();
        if (failIfAlreadyPresent) {
            Preconditions.checkArgument(
                    !this.wordIndex.containsKey(lemma),
                    MSG_WORD_EXISTS,
                    lemma);
        }
        this.wordIndex.put(lemma, word);
    }

    /**
     * @return an unmodifiable view of all the terms of this index
     */
    @Override
    public Collection<Term> getTerms() {
        return Collections.unmodifiableCollection(this.termsByGroupingKey.values());
    }

    /**
     * @param wordId
     *          the key of the word, i.e. its lemma
     * @return the word indexed under that key, or <code>null</code> if none
     */
    @Override
    public Word getWord(String wordId) {
        return this.wordIndex.get(wordId);
    }

    /**
     * Gets the word indexed under the lemma of the given annotation,
     * creating and indexing it from the annotation's lemma and stem
     * when there is none yet.
     *
     * @param anno
     *          the word annotation
     * @return the existing or newly created word
     */
    public Word addWord(WordAnnotation anno) {
        String lemma = anno.getLemma();
        Word word = this.wordIndex.get(lemma);
        if (word == null) {
            word = new Word(lemma, anno.getStem());
            this.wordIndex.put(lemma, word);
        }
        return word;
    }

    /**
     * Records one occurrence of a term. The term is looked up by the
     * grouping key computed from the annotation; when it is not in the
     * index yet, it is built from the annotation's words, patterns and
     * spotting rule name and added to the index.
     *
     * @param annotation
     *          the term occurrence annotation
     * @param fileUrl
     *          the URL of the document the occurrence belongs to
     * @param keepOccurrence
     *          passed through to {@link Term#addOccurrence}
     * @return the term the occurrence has been added to
     */
    @Override
    public Term addTermOccurrence(TermOccAnnotation annotation, String fileUrl, boolean keepOccurrence) {
        this.nbSpottedTerms++;
        String termGroupingKey = TermSuiteUtils.getGroupingKey(annotation);
        Term term = this.termsByGroupingKey.get(termGroupingKey);
        if (term == null) {
            TermBuilder builder = TermBuilder.start(this);
            int nbWords = annotation.getWords().size();
            for (int i = 0; i < nbWords; i++) {
                Word w = this.addWord(annotation.getWords(i));
                builder.addWord(w, annotation.getPattern(i));
            }
            builder.setSpottingRule(annotation.getSpottingRuleName());
            term = builder.createAndAddToIndex();
        }
        term.addOccurrence(
                new TermOccurrence(
                        term,
                        annotation.getCoveredText(),
                        this.getDocument(fileUrl),
                        annotation.getBegin(),
                        annotation.getEnd()),
                keepOccurrence);
        return term;
    }

    /**
     * @return an iterator over the terms for which
     *          {@link Term#isSingleWord()} is <code>true</code>
     */
    @Override
    public Iterator<Term> singleWordTermIterator() {
        return new SingleMultiWordIterator(true);
    }

    /**
     * @return an iterator over the terms for which
     *          {@link Term#isSingleWord()} is <code>false</code>
     */
    @Override
    public Iterator<Term> multiWordTermIterator() {
        return new SingleMultiWordIterator(false);
    }

    /**
     * @return an iterator over the terms that are both single-word
     *          and compound
     */
    @Override
    public Iterator<Term> compoundWordTermIterator() {
        return new CompoundIterator();
    }

    /*
     * Base class of the filtering iterators. It wraps an iterator over
     * the live values of the grouping key index.
     */
    private abstract class TermIterator extends AbstractIterator<Term> {
        protected final Iterator<Term> it;

        private TermIterator() {
            super();
            this.it = MemoryTermIndex.this.termsByGroupingKey.values().iterator();
        }
    }

    private class SingleMultiWordIterator extends TermIterator {
        /*
         * Iterates over single-word terms if true, over multi-word terms if false.
         */
        private final boolean singleWord;

        private SingleMultiWordIterator(boolean singleWord) {
            super();
            this.singleWord = singleWord;
        }

        @Override
        protected Term computeNext() {
            while (it.hasNext()) {
                Term candidate = it.next();
                if (candidate.isSingleWord() == this.singleWord) {
                    return candidate;
                }
            }
            return endOfData();
        }
    }

    private class CompoundIterator extends TermIterator {
        @Override
        protected Term computeNext() {
            while (it.hasNext()) {
                Term candidate = it.next();
                if (candidate.isSingleWord() && candidate.isCompound()) {
                    return candidate;
                }
            }
            return endOfData();
        }
    }

    /**
     * Finds a term id that is not used by any term of this index.
     *
     * <p>The returned id is not reserved: successive invocations return
     * the same value as long as no term has been added with that id.</p>
     *
     * @return an id that no term of this index currently has
     */
    @Override
    public int newId() {
        while (this.termsById.containsKey(currentId)) {
            this.currentId++;
        }
        return this.currentId;
    }

    /**
     * Gets the custom index registered under the given name. When there
     * is none, the value provider of that name is looked up with
     * {@link TermValueProviders#get} and a new custom index is created
     * from it.
     *
     * @param indexName
     *          the name of the custom index
     * @return the existing or newly created custom index
     * @throws IllegalArgumentException
     *          if the index does not exist and no value provider is
     *          found for that name
     */
    @Override
    public CustomTermIndex getCustomIndex(String indexName) {
        CustomTermIndex customIndex = this.customIndexes.get(indexName);
        if (customIndex == null) {
            TermValueProvider valueProvider = TermValueProviders.get(indexName, this);
            customIndex = createCustomIndex(indexName, valueProvider);
        }
        return customIndex;
    }

    /**
     * Creates a custom index, registers it under the given name and
     * indexes all the current terms into it.
     *
     * @param indexName
     *          the name of the custom index
     * @param valueProvider
     *          the value provider of the custom index, must not be <code>null</code>
     * @return the newly created custom index
     * @throws IllegalArgumentException
     *          if <code>valueProvider</code> is <code>null</code>
     */
    @Override
    public CustomTermIndex createCustomIndex(String indexName,
            TermValueProvider valueProvider) {
        Preconditions.checkArgument(valueProvider != null,
                MSG_NO_SUCH_PROVIDER,
                indexName);
        CustomTermIndexImpl customIndex = new CustomTermIndexImpl(valueProvider);
        // No uniqueness check: an index already registered under this name is replaced.
        this.customIndexes.put(indexName, customIndex);
        LOGGER.debug("Indexing {} terms to index {}", this.getTerms().size(), indexName);
        for (Term t : this.getTerms()) {
            customIndex.indexTerm(this, t);
        }
        return customIndex;
    }

    /**
     * Unregisters the custom index of the given name, if any.
     *
     * @param indexName
     *          the name of the custom index
     */
    @Override
    public void dropCustomIndex(String indexName) {
        this.customIndexes.remove(indexName);
    }

    /**
     * @return an unmodifiable view of all the words of this index
     */
    @Override
    public Collection<Word> getWords() {
        return Collections.unmodifiableCollection(this.wordIndex.values());
    }

    /**
     * @param groupingKey
     *          the grouping key of the term
     * @return the term having that grouping key, or <code>null</code> if none
     */
    @Override
    public Term getTermByGroupingKey(String groupingKey) {
        return this.termsByGroupingKey.get(groupingKey);
    }

    /**
     * @param id
     *          the id of the term
     * @return the term having that id, or <code>null</code> if none
     */
    @Override
    public Term getTermById(int id) {
        return this.termsById.get(id);
    }

    /**
     * Removes from the word index every word whose lemma is not the
     * lemma of a word of at least one term of this index.
     */
    @Override
    public void cleanOrphanWords() {
        Set<String> usedWordLemmas = Sets.newHashSet();
        for (Term t : getTerms()) {
            for (TermWord tw : t.getWords()) {
                usedWordLemmas.add(tw.getWord().getLemma());
            }
        }
        Iterator<Entry<String, Word>> it = wordIndex.entrySet().iterator();
        while (it.hasNext()) {
            Entry<String, Word> entry = it.next();
            if (!usedWordLemmas.contains(entry.getValue().getLemma())) {
                it.remove();
            }
        }
    }

    /**
     * Removes a term from this index and from the occurrence store.
     *
     * @param t
     *          the term to remove
     */
    @Override
    public void removeTerm(Term t) {
        removeTermOnly(t);
        occurrenceStore.removeTerm(t);
    }

    /*
     * Removes the term from all the structures held by this index
     * (root indexes, custom indexes, variation links, context vectors)
     * but leaves the occurrence store untouched.
     */
    private void removeTermOnly(Term t) {
        termsByGroupingKey.remove(t.getGroupingKey());
        termsById.remove(t.getId());

        // remove from custom indexes
        for (CustomTermIndex customIndex : customIndexes.values()) {
            customIndex.removeTerm(this, t);
        }

        // remove the variations of this term (copied first, because they are removed while looping)
        Set<TermVariation> toRem = Sets.newHashSet();
        for (TermVariation v : t.getVariations()) {
            toRem.add(v);
        }
        for (TermVariation v : toRem) {
            t.removeTermVariation(v);
        }

        // remove this term from its bases, i.e. from the terms it is a variant of
        toRem = Sets.newHashSet();
        for (TermVariation v : t.getBases()) {
            toRem.add(v);
        }
        for (TermVariation v : toRem) {
            v.getBase().removeTermVariation(v);
        }

        /*
         * Removes from context vectors.
         *
         * We assume that if this term has a context vector
         * then all other terms may have this term as co-term,
         * thus they must be checked for removal.
         *
         */
        if (t.isContextVectorComputed()) {
            for (Term o : termsById.values()) {
                if (o.isContextVectorComputed()) {
                    o.getContextVector().removeCoTerm(t);
                }
            }
        }
    }

    /**
     * @return the name of this term index
     */
    @Override
    public String getName() {
        return this.name;
    }

    /**
     * The hash code of a term index is the hash code of its name.
     * Note that {@link Object#equals(Object)} is not overridden by this class.
     */
    @Override
    public int hashCode() {
        return this.name.hashCode();
    }

    /**
     * Gets the document of the given URL, creating it with the next
     * sequential document id if this URL is not known yet.
     *
     * @param url
     *          the URL of the document
     * @return the existing or newly created document
     */
    @Override
    public Document getDocument(String url) {
        Document document = documents.get(url);
        if (document == null) {
            document = new Document(currentDocumentId++, url);
            documents.put(url, document);
        }
        return document;
    }

    /**
     * @return an unmodifiable view of all the documents of this index
     */
    @Override
    public Collection<Document> getDocuments() {
        return Collections.unmodifiableCollection(this.documents.values());
    }

    /**
     * Indexes every occurrence of every term within its source document.
     */
    @Override
    public void createOccurrenceIndex() {
        for (Term t : this.getTerms()) {
            for (TermOccurrence o : t.getOccurrences()) {
                /*
                 * Explicitly index all occurrences within each source document. The context
                 * generation would not work without that step.
                 *
                 * FIXME Move these occurrence indexes inside the present AE (because the
                 * indexes are never used anywhere else).
                 */
                o.getSourceDocument().indexTermOccurrence(o);
            }
        }
    }

    /**
     * Clears the occurrence index of every document of this index.
     */
    @Override
    public void clearOccurrenceIndex() {
        for (Document d : this.getDocuments()) {
            d.clearOccurrenceIndex();
        }
    }

    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this).addValue(name)
                .add("terms", this.termsById.size())
                .toString();
    }

    /**
     * @return the language of this term index
     */
    @Override
    public Lang getLang() {
        return this.lang;
    }

    /**
     * @return the corpus id, <code>null</code> until it has been set
     */
    @Override
    public String getCorpusId() {
        return corpusId;
    }

    /**
     * @param corpusId
     *          the corpus id
     */
    @Override
    public void setCorpusId(String corpusId) {
        this.corpusId = corpusId;
    }

    /**
     * @return an unmodifiable view of the term classes of this index
     */
    @Override
    public Collection<TermClass> getTermClasses() {
        return Collections.unmodifiableSet(termClasses);
    }

    /**
     * Creates a term class, registers it and sets it as the term class
     * of each of its terms.
     *
     * @param classHead
     *          the head of the class
     * @param classTerms
     *          the terms of the class, must contain <code>classHead</code>
     * @throws IllegalArgumentException
     *          if <code>classHead</code> is not one of <code>classTerms</code>
     */
    @Override
    public void classifyTerms(Term classHead, Iterable<Term> classTerms) {
        Preconditions.checkArgument(Iterables.contains(classTerms, classHead), "head must be contained in class terms");
        TermClass termClass = new TermClass(classHead, classTerms);
        this.termClasses.add(termClass);
        for (Term member : termClass) {
            member.setTermClass(termClass);
        }
    }

    /**
     * @param nbWordAnnotations
     *          the number of word annotations
     */
    @Override
    public void setWordAnnotationsNum(int nbWordAnnotations) {
        this.nbWordAnnotations = nbWordAnnotations;
    }

    /**
     * @return the number of word annotations, either set explicitly or
     *          counted by {@link #importCas(JCas, boolean)}
     */
    @Override
    public int getWordAnnotationsNum() {
        return this.nbWordAnnotations;
    }

    /**
     * @return the measure registered under the name "wr"
     */
    @Override
    public TermMeasure getWRMeasure() {
        return this.termMeasures.get(MEASURE_WR);
    }

    /**
     * @return the measure registered under the name "wrLog"
     */
    @Override
    public TermMeasure getWRLogMeasure() {
        return this.termMeasures.get(MEASURE_WRLOG);
    }

    /**
     * @return the measure registered under the name "frequency"
     */
    @Override
    public TermMeasure getFrequencyMeasure() {
        return this.termMeasures.get(MEASURE_FREQUENCY);
    }

    /**
     * @return an unmodifiable view of all the measures of this index
     */
    @Override
    public Iterable<TermMeasure> getMeasures() {
        return Collections.unmodifiableCollection(this.termMeasures.values());
    }

    /**
     * @return the number of spotted terms, either set explicitly or
     *          counted by {@link #addTermOccurrence}
     */
    @Override
    public int getSpottedTermsNum() {
        return nbSpottedTerms;
    }

    /**
     * @param spottedTermsNum
     *          the number of spotted terms
     */
    @Override
    public void setSpottedTermsNum(int spottedTermsNum) {
        this.nbSpottedTerms = spottedTermsNum;
    }

    /**
     * @return the occurrence store given at construction time
     */
    @Override
    public OccurrenceStore getOccurrenceStore() {
        return this.occurrenceStore;
    }

    /**
     * Removes from this index all the terms accepted by the selector,
     * then forwards the selector to the occurrence store.
     *
     * @param selector
     *          the selector of the terms to delete
     */
    @Override
    public void deleteMany(TermSelector selector) {
        // Selected terms are collected first, because termsById is modified by the removal.
        List<Term> rem = Lists.newArrayList();
        for (Term t : termsById.values()) {
            if (selector.select(t)) {
                rem.add(t);
            }
        }
        for (Term t : rem) {
            removeTermOnly(t);
        }
        occurrenceStore.deleteMany(selector);
    }

    /**
     * Imports the annotations of a CAS: each {@link WordAnnotation}
     * increments the word annotation counter and each
     * {@link TermOccAnnotation} is added as a term occurrence of the
     * document designated by the URI of the CAS source document
     * annotation.
     *
     * @param cas
     *          the CAS to import
     * @param keepOccurrence
     *          passed through to {@link #addTermOccurrence}
     */
    @Override
    public void importCas(JCas cas, boolean keepOccurrence) {
        // NOTE(review): get() is invoked unconditionally; presumably it fails when the
        // CAS has no source document annotation - confirm against JCasUtils.
        SourceDocumentInformation sdi = JCasUtils.getSourceDocumentAnnotation(cas).get();
        FSIterator<Annotation> iterator = cas.getAnnotationIndex().iterator();
        while (iterator.hasNext()) {
            Annotation anno = iterator.next();
            if (anno instanceof WordAnnotation) {
                this.nbWordAnnotations++;
            } else if (anno instanceof TermOccAnnotation) {
                addTermOccurrence((TermOccAnnotation) anno, sdi.getUri(), keepOccurrence);
            }
        }
    }
}