/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ package eu.project.ttc.models.index; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; import com.google.common.collect.AbstractIterator; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.project.ttc.engines.desc.Lang; import eu.project.ttc.models.Document; import eu.project.ttc.models.OccurrenceStore; import eu.project.ttc.models.Term; import eu.project.ttc.models.TermBuilder; import eu.project.ttc.models.TermClass; import eu.project.ttc.models.TermIndex; 
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.TermWord;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.index.selectors.TermSelector;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.TermSuiteUtils;

/**
 * The in-memory implementation of a {@link TermIndex}.
 *
 * <p>Keeps all terms, words, documents, custom indexes and term measures in
 * plain {@link java.util.HashMap}-backed structures. Not thread-safe.
 *
 * @author Damien Cram
 *
 */
public class MemoryTermIndex implements TermIndex {
    private static final Logger LOGGER = LoggerFactory.getLogger(MemoryTermIndex.class);
    private static final String MSG_NO_SUCH_PROVIDER = "No such value provider: %s";
    public static final String MSG_NO_SUCH_TERM = "No such term in term index: %s";

    // Keys under which the built-in term measures are registered in termMeasures.
    private static final String MEASURE_WR = "wr";
    private static final String MEASURE_WRLOG = "wrLog";
    private static final String MEASURE_FREQUENCY = "frequency";

    /**
     * The occurrence store
     */
    private OccurrenceStore occurrenceStore;

    /*
     * The root index of terms. Variants must not be referenced at
     * this level of index. They may be indexed from their base-term
     * instead.
     */
    private Map<Integer, Term> termsById = Maps.newHashMap();
    // NOTE(review): rankedTerms is never read or written anywhere in this file —
    // looks like dead state; confirm before removing.
    private List<Term> rankedTerms = Lists.newArrayList();
    private Map<String, Term> termsByGroupingKey = Maps.newHashMap();
    // Secondary indexes created on demand, keyed by index name (see getCustomIndex).
    private Map<String, CustomTermIndex> customIndexes = Maps.newHashMap();
    private Map<String, TermMeasure> termMeasures = Maps.newHashMap();
    // Words keyed by lemma (see privateAddWord / getWord).
    private Map<String, Word> wordIndex = Maps.newHashMap();
    // Documents keyed by URL (see getDocument).
    private Map<String, Document> documents = Maps.newHashMap();
    private Set<TermClass> termClasses = Sets.newHashSet();

    private String name;
    private Lang lang;
    private String corpusId;

    // Monotonic counter used to assign ids to newly created Documents.
    private int currentDocumentId = 0;
    // Cursor for newId(): advanced past ids already present in termsById.
    private int currentId = 0;
    private int nbWordAnnotations = 0;
    private int nbSpottedTerms = 0;

    /**
     * Creates the index and registers the three built-in measures
     * (wr, wrLog, frequency).
     *
     * @param name            the index name (also used as hashCode basis)
     * @param lang            the language of the indexed corpus
     * @param occurrenceStore backing store for term occurrences
     */
    public MemoryTermIndex(String name, Lang lang, OccurrenceStore occurrenceStore) {
        this.lang = lang;
        this.name = name;
        this.occurrenceStore = occurrenceStore;
        this.termMeasures.put(MEASURE_WR, new WRMeasure(this));
        this.termMeasures.put(MEASURE_WRLOG, new WRLogMeasure(this));
        this.termMeasures.put(MEASURE_FREQUENCY, new FrequencyMeasure(this));
    }

    /**
     * Adds a term to all root indexes and to every existing custom index,
     * and registers its words (without failing if a word is already known).
     *
     * @throws IllegalArgumentException if a term with the same grouping key
     *         or the same id is already indexed
     */
    @Override
    public void addTerm(Term term) {
        Preconditions.checkArgument(
                !this.termsByGroupingKey.containsKey(term.getGroupingKey()));
        Preconditions.checkNotNull(term.getId());
        Preconditions.checkArgument(!this.termsById.containsKey(term.getId()));
        this.termsByGroupingKey.put(term.getGroupingKey(), term);
        this.termsById.put(term.getId(), term);
        // Keep every already-created custom index in sync with the new term.
        for(CustomTermIndex termIndex:this.customIndexes.values())
            termIndex.indexTerm(this, term);
        // Words of an added term are registered leniently (no duplicate check).
        for(TermWord tw:term.getWords())
            privateAddWord(tw.getWord(), false);
    }

    /**
     * Adds a word, failing if a word with the same lemma is already indexed.
     */
    @Override
    public void addWord(Word word) {
        privateAddWord(word, true);
    }

    // Inserts the word keyed by its lemma; optionally rejects duplicates.
    private void privateAddWord(Word word, boolean failIfAlredyPresent) {
        if(failIfAlredyPresent)
            Preconditions.checkArgument(
                    !this.wordIndex.containsKey(word.getLemma()));
        this.wordIndex.put(word.getLemma(), word);
    }

    /**
     * Returns an unmodifiable view of all indexed terms.
     */
    @Override
    public Collection<Term> getTerms() {
        return Collections.unmodifiableCollection(this.termsByGroupingKey.values());
    }

    /**
     * Looks a word up by its lemma, or {@code null} if unknown.
     */
    @Override
    public Word getWord(String wordId) {
        return this.wordIndex.get(wordId);
    }

    /**
     * Gets or lazily creates the {@link Word} for a UIMA word annotation,
     * keyed by the annotation's lemma.
     */
    public Word addWord(WordAnnotation anno) {
        String swKey = anno.getLemma();
        Word word = this.wordIndex.get(swKey);
        if(word == null) {
            word = new Word(anno.getLemma(), anno.getStem());
            this.wordIndex.put(swKey, word);
        }
        return word;
    }

    /**
     * Records one spotted occurrence of a term, creating the term (and its
     * words) on first sight.
     *
     * @param annotation     the UIMA term-occurrence annotation
     * @param fileUrl        URL of the source document
     * @param keepOccurrence whether the occurrence should be kept by the term
     * @return the (possibly newly created) term
     */
    @Override
    public Term addTermOccurrence(TermOccAnnotation annotation, String fileUrl, boolean keepOccurrence) {
        this.nbSpottedTerms++;
        String termGroupingKey = TermSuiteUtils.getGroupingKey(annotation);
        Term term = this.termsByGroupingKey.get(termGroupingKey);
        if(term == null) {
            // First time this grouping key is seen: build the term word by word.
            TermBuilder builder = TermBuilder.start(this);
            for (int i = 0; i < annotation.getWords().size(); i++) {
                WordAnnotation wa = annotation.getWords(i);
                Word w = this.addWord(wa);
                builder.addWord(
                        w,
                        annotation.getPattern(i)
                    );
            }
            builder.setSpottingRule(annotation.getSpottingRuleName());
            term = builder.createAndAddToIndex();
        }
        term.addOccurrence(
                new TermOccurrence(
                        term,
                        annotation.getCoveredText(),
                        this.getDocument(fileUrl),
                        annotation.getBegin(),
                        annotation.getEnd()),
                keepOccurrence
            );
        return term;
    }

    /**
     * Iterates over single-word terms only.
     */
    @Override
    public Iterator<Term> singleWordTermIterator() {
        return new SingleMultiWordIterator(true);
    }

    /**
     * Iterates over multi-word terms only.
     */
    @Override
    public Iterator<Term> multiWordTermIterator() {
        return new SingleMultiWordIterator(false);
    }

    /**
     * Iterates over single-word terms whose word is a compound.
     */
    @Override
    public Iterator<Term> compoundWordTermIterator() {
        return new CompoundIterator();
    };

    // Base class for term iterators: walks all terms in grouping-key order.
    private abstract class TermIterator extends AbstractIterator<Term> {
        protected Term t;
        protected Iterator<Term> it;

        private TermIterator() {
            super();
            this.it = MemoryTermIndex.this.termsByGroupingKey.values().iterator();
        }
    }

    private class SingleMultiWordIterator extends TermIterator {
        /*
         * true: iterate single-word terms; false: iterate multi-word terms
         * (computeNext keeps terms whose isSingleWord() equals this flag).
         */
        private boolean singleMultiWordToggle;

        private SingleMultiWordIterator(boolean singleMultiWordToogle) {
            super();
            this.singleMultiWordToggle = singleMultiWordToogle;
        }

        @Override
        protected Term computeNext() {
            while(it.hasNext()) {
                if((t = it.next()).isSingleWord() == this.singleMultiWordToggle)
                    return t;
            }
            return endOfData();
        }
    }

    private class CompoundIterator extends TermIterator {
        @Override
        protected Term computeNext() {
            while(it.hasNext()) {
                // Only single-word compound terms qualify (a multi-word term
                // containing a compound word is skipped).
                if((t = it.next()).isSingleWord() && t.isCompound())
                    return t;
            }
            return endOfData();
        }
    }

    /**
     * Returns the smallest id >= the current cursor that is not yet used by
     * any term. Note: the cursor only advances when the returned id is later
     * registered in termsById; two consecutive calls without an intervening
     * addTerm return the same id.
     */
    @Override
    public int newId() {
        while(this.termsById.containsKey(currentId))
            this.currentId++;
        return this.currentId;
    }

    // @Override
    // public TermBuilder newTerm(String termId) {
    //     return new TermBuilder(this).setGroupingKey(termId);
    // }

    /**
     * Returns the custom index registered under the given name, creating it
     * (with the default value provider for that name) on first access.
     */
    @Override
    public CustomTermIndex getCustomIndex(String indexName) {
        if(this.customIndexes.get(indexName) == null) {
            TermValueProvider valueProvider = TermValueProviders.get(indexName, this);
            createCustomIndex(indexName, valueProvider);
        }
        return this.customIndexes.get(indexName);
    }

    /**
     * Creates (or replaces) a custom index and populates it with all terms
     * currently in the root index.
     *
     * @throws IllegalArgumentException if {@code valueProvider} is null
     *         (i.e. no provider exists for {@code indexName})
     */
    @Override
    public CustomTermIndex createCustomIndex(String indexName,
            TermValueProvider valueProvider) {
        // Preconditions.checkArgument(
        //        !this.customIndexes.containsKey(indexName),
        //        String.format("Custom term index %s already exists.", indexName));
        Preconditions.checkArgument(valueProvider != null,
                MSG_NO_SUCH_PROVIDER, indexName);

        CustomTermIndexImpl customIndex = new CustomTermIndexImpl(valueProvider);
        this.customIndexes.put(indexName, customIndex);

        // Backfill the new index with every term already known.
        LOGGER.debug("Indexing {} terms to index {}",
                this.getTerms().size(),
                indexName);
        for(Term t:this.getTerms())
            customIndex.indexTerm(this, t);
        return customIndex;
    }

    /**
     * Removes the named custom index (no-op if absent).
     */
    @Override
    public void dropCustomIndex(String indexName) {
        this.customIndexes.remove(indexName);
    }

    /**
     * Returns an unmodifiable view of all indexed words.
     */
    @Override
    public Collection<Word> getWords() {
        return Collections.unmodifiableCollection(this.wordIndex.values());
    }

    /**
     * Looks a term up by grouping key, or {@code null} if unknown.
     */
    @Override
    public Term getTermByGroupingKey(String groupingKey) {
        // Preconditions.checkArgument(this.termsByGroupingKey.containsKey(groupingKey),MSG_NO_SUCH_TERM, groupingKey);
        return this.termsByGroupingKey.get(groupingKey);
    }

    /**
     * Looks a term up by id, or {@code null} if unknown.
     */
    @Override
    public Term getTermById(int id) {
        return this.termsById.get(id);
    }

    /**
     * Drops from the word index every word whose lemma is no longer used by
     * any indexed term.
     */
    @Override
    public void cleanOrphanWords() {
        Set<String> usedWordLemmas = Sets.newHashSet();
        for(Term t:getTerms()) {
            for(TermWord tw:t.getWords())
                usedWordLemmas.add(tw.getWord().getLemma());
        }
        Iterator<Entry<String, Word>> it = wordIndex.entrySet().iterator();
        Entry<String, Word> entry;
        while (it.hasNext()) {
            entry = it.next();
            if(!usedWordLemmas.contains(entry.getValue().getLemma()))
                it.remove();
        }
    }

    /**
     * Removes a term from all indexes and from the occurrence store.
     */
    @Override
    public void removeTerm(Term t) {
        removeTermOnly(t);
        occurrenceStore.removeTerm(t);
    }

    // Removes the term from every in-memory structure, but NOT from the
    // occurrence store (deleteMany relies on this to batch store deletions).
    private void removeTermOnly(Term t) {
        termsByGroupingKey.remove(t.getGroupingKey());
        termsById.remove(t.getId());

        // remove from custom indexes
        for(CustomTermIndex customIndex:customIndexes.values())
            customIndex.removeTerm(this, t);

        // remove the variations this term is the base of
        // (copy first to avoid mutating the collection while iterating)
        Set<TermVariation> toRem = Sets.newHashSet();
        for(TermVariation v:t.getVariations())
            toRem.add(v);
        for(TermVariation v:toRem)
            t.removeTermVariation(v);

        // remove the variations where this term is the variant,
        // detaching them from their respective base terms
        toRem = Sets.newHashSet();
        for(TermVariation v:t.getBases())
            toRem.add(v);
        for(TermVariation v:toRem)
            v.getBase().removeTermVariation(v);

        /*
         * Removes from context vectors.
         *
         * We assumes that if this term has a context vector
         * then all others terms may have this term as co-term,
         * thus they must be checked from removal.
         *
         */
        if(t.isContextVectorComputed()) {
            for(Term o:termsById.values()) {
                if(o.isContextVectorComputed())
                    o.getContextVector().removeCoTerm(t);
            }
        }
    }

    /**
     * Returns the index name.
     */
    @Override
    public String getName() {
        return this.name;
    }

    // NOTE(review): hashCode is overridden (based on name) without a matching
    // equals override, so equal-named indexes hash alike but compare by
    // identity — confirm this asymmetry is intentional.
    @Override
    public int hashCode() {
        return this.name.hashCode();
    }

    /**
     * Gets or lazily creates the {@link Document} for the given URL,
     * assigning it the next document id on creation.
     */
    @Override
    public Document getDocument(String url) {
        Document document = documents.get(url);
        if(document == null) {
            document = new Document(currentDocumentId++, url);
            documents.put(url, document);
        }
        return document;
    }

    /**
     * Returns all documents seen so far (live, modifiable view).
     */
    @Override
    public Collection<Document> getDocuments() {
        return this.documents.values();
    }

    /**
     * Indexes every kept occurrence of every term inside its source document.
     */
    @Override
    public void createOccurrenceIndex() {
        for(Term t:this.getTerms()) {
            for(TermOccurrence o:t.getOccurrences()) {
                /*
                 * Explicitely index all occurrences within each source document. The context
                 * generation would not work without that step.
                 *
                 * FIXME Move these occurrence indexes inside the present AE (because the
                 * indexes are never used anywhere else).
                 */
                o.getSourceDocument().indexTermOccurrence(o);
            }
        }
    }

    /**
     * Clears the per-document occurrence indexes built by
     * {@link #createOccurrenceIndex()}.
     */
    @Override
    public void clearOccurrenceIndex() {
        for(Document d:this.getDocuments())
            d.clearOccurrenceIndex();
    }

    @Override
    public String toString() {
        return MoreObjects.toStringHelper(this).addValue(name)
                .add("terms", this.termsById.size())
                .toString();
    }

    /**
     * Returns the corpus language.
     */
    @Override
    public Lang getLang() {
        return this.lang;
    }

    /**
     * Returns the corpus id (may be null until set).
     */
    @Override
    public String getCorpusId() {
        return corpusId;
    }

    @Override
    public void setCorpusId(String corpusId) {
        this.corpusId = corpusId;
    }

    /**
     * Returns an unmodifiable view of the term classes.
     */
    @Override
    public Collection<TermClass> getTermClasses() {
        return Collections.unmodifiableSet(termClasses);
    }

    /**
     * Groups the given terms into a new {@link TermClass} headed by
     * {@code classHead}, and back-links each member term to the class.
     *
     * @throws IllegalArgumentException if {@code classHead} is not one of
     *         {@code classTerms}
     */
    @Override
    public void classifyTerms(Term classHead, Iterable<Term> classTerms) {
        Preconditions.checkArgument(Iterables.contains(classTerms, classHead),
                "head must be contained in class terms");
        TermClass termClass = new TermClass(classHead, classTerms);
        this.termClasses.add(termClass);
        for(Term t2:termClass)
            t2.setTermClass(termClass);
    }

    @Override
    public void setWordAnnotationsNum(int nbWordAnnotations) {
        this.nbWordAnnotations = nbWordAnnotations;
    }

    /**
     * Returns the number of word annotations counted during CAS import.
     */
    @Override
    public int getWordAnnotationsNum() {
        return this.nbWordAnnotations;
    }

    /**
     * Returns the built-in "wr" measure.
     */
    @Override
    public TermMeasure getWRMeasure() {
        return this.termMeasures.get(MEASURE_WR);
    }

    /**
     * Returns the built-in "wrLog" measure.
     */
    @Override
    public TermMeasure getWRLogMeasure() {
        return this.termMeasures.get(MEASURE_WRLOG);
    }

    /**
     * Returns the built-in "frequency" measure.
     */
    @Override
    public TermMeasure getFrequencyMeasure() {
        return this.termMeasures.get(MEASURE_FREQUENCY);
    }

    /**
     * Returns all registered term measures.
     */
    @Override
    public Iterable<TermMeasure> getMeasures() {
        return this.termMeasures.values();
    }

    /**
     * Returns the number of term occurrences spotted so far.
     */
    @Override
    public int getSpottedTermsNum() {
        return nbSpottedTerms;
    }

    @Override
    public void setSpottedTermsNum(int spottedTermsNum) {
        this.nbSpottedTerms = spottedTermsNum;
    }

    /**
     * Returns the backing occurrence store.
     */
    @Override
    public OccurrenceStore getOccurrenceStore() {
        return this.occurrenceStore;
    }

    /**
     * Removes every term matched by the selector from the in-memory indexes,
     * then delegates the bulk deletion to the occurrence store.
     */
    @Override
    public void deleteMany(TermSelector selector) {
        // Collect first: removeTermOnly mutates termsById, so we must not
        // remove while iterating its values.
        List<Term> rem = Lists.newArrayList();
        for(Term t:termsById.values()) {
            if(selector.select(t))
                rem.add(t);
        }
        for(Term t:rem)
            removeTermOnly(t);
        occurrenceStore.deleteMany(selector);
    }

    /**
     * Imports a UIMA CAS: counts word annotations and records every term
     * occurrence annotation against the CAS's source document URI.
     *
     * @param cas            the analyzed CAS to import
     * @param keepOccurrence whether occurrences should be kept by their terms
     */
    @Override
    public void importCas(JCas cas, boolean keepOccurrence) {
        SourceDocumentInformation sdi = JCasUtils.getSourceDocumentAnnotation(cas).get();
        FSIterator<Annotation> iterator = cas.getAnnotationIndex().iterator();
        while(iterator.hasNext()) {
            Annotation anno = iterator.next();
            if(anno instanceof WordAnnotation) {
                this.nbWordAnnotations++;
            } else if(anno instanceof TermOccAnnotation) {
                addTermOccurrence((TermOccAnnotation)anno, sdi.getUri(), keepOccurrence);
            }
        }
    }
}