/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package eu.project.ttc.engines;
import java.math.BigInteger;
import java.util.List;
import java.util.ListIterator;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;
import eu.project.ttc.engines.variant.VariantRule;
import eu.project.ttc.engines.variant.VariantRuleIndex;
import eu.project.ttc.history.TermHistoryResource;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.models.index.CustomIndexStats;
import eu.project.ttc.models.index.CustomTermIndex;
import eu.project.ttc.models.index.TermIndexes;
import eu.project.ttc.resources.ObserverResource;
import eu.project.ttc.resources.ObserverResource.SubTaskObserver;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.resources.YamlVariantRules;
/**
 * UIMA annotator that, once the whole collection has been processed, applies
 * the YAML variant rules to every pair of terms sharing a key in each of the
 * custom term indexes listed in {@link #RUN_CONFIGS}. Each matching rule adds
 * a {@link TermVariation} from the source term to the target term.
 *
 * <p>Nothing is done at CAS level: all the work happens in
 * {@link #collectionProcessComplete()}.
 */
public class SyntacticTermGatherer extends JCasAnnotator_ImplBase {
    private static final Logger LOGGER = LoggerFactory.getLogger(SyntacticTermGatherer.class);

    /** Name under which progress is reported to the optional observer resource. */
    public static final String TASK_NAME = "Syntactic variant gathering";

    /** The observer is notified each time this many comparisons have been made. */
    private static final int OBSERVING_STEP = 1000;

    /** Maximum size passed to {@code cleanEntriesByMaxSize} when pruning the custom indexes. */
    private static final int WARNING_CRITICAL_SIZE = 2500;

    /** Rules whose name starts with this prefix produce MORPHOLOGICAL variations. */
    private static final String M_PREFIX = "M";

    @ExternalResource(key=ObserverResource.OBSERVER, mandatory=false)
    protected ObserverResource observerResource;

    @ExternalResource(key=TermIndexResource.TERM_INDEX, mandatory=true)
    private TermIndexResource termIndexResource;

    public static final String YAML_VARIANT_RULES = "YamlVariantRules";
    @ExternalResource(key = YAML_VARIANT_RULES, mandatory = true)
    private YamlVariantRules yamlVariantRules;

    @ExternalResource(key =TermHistoryResource.TERM_HISTORY, mandatory = true)
    private TermHistoryResource historyResource;

    /** Number of rule applications expected over all run configurations. */
    private BigInteger totalComparisons = BigInteger.valueOf(0);

    /*
     * Number of rule applications made so far for the current gathering key.
     * Volatile because it is written by the gathering thread and read by the
     * progress-logging timer thread; long because the count can exceed
     * Integer.MAX_VALUE on big indexes.
     */
    private volatile long nbComparisons = 0;

    private Optional<SubTaskObserver> taskObserver = Optional.absent();

    /** Associates a custom term index name with the rule index used to gather over it. */
    static class RunConfig {
        final String indexName;
        final VariantRuleIndex variantRuleIndex;

        RunConfig(String indexName, VariantRuleIndex variantRuleIndex) {
            this.indexName = indexName;
            this.variantRuleIndex = variantRuleIndex;
        }
    }

    /*
     * Do not deactivate gathering on key_lemma_lemma, otherwise we lose
     * morphological gathering based on single-word (with [compound] tag in yaml).
     * TODO: understand why
     */
    private static final RunConfig[] RUN_CONFIGS = new RunConfig[] {
        new RunConfig(TermIndexes.WORD_COUPLE_LEMMA_LEMMA, VariantRuleIndex.DEFAULT),
        new RunConfig(TermIndexes.WORD_COUPLE_LEMMA_STEM, VariantRuleIndex.DEFAULT),
        new RunConfig(TermIndexes.TERM_HAS_PREFIX_LEMMA, VariantRuleIndex.PREFIX),
        new RunConfig(TermIndexes.TERM_HAS_DERIVATES_LEMMA, VariantRuleIndex.DERIVATION)
    };

    /**
     * Initializes the variant rules against the term index and, when an
     * observer resource is bound, fetches the sub-task observer for
     * {@link #TASK_NAME}.
     *
     * @param context the UIMA context, forwarded to the superclass
     * @throws ResourceInitializationException if the superclass initialization fails
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.yamlVariantRules.initialize(this.termIndexResource.getTermIndex());
        if (observerResource != null) {
            taskObserver = Optional.of(observerResource.getTaskObserver(TASK_NAME));
        }
    }

    /**
     * Prunes each custom index, computes the total number of comparisons to
     * perform, then runs the rule-based gathering over each index and drops
     * the index afterwards. Returns immediately when the term index is empty.
     *
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void collectionProcessComplete()
            throws AnalysisEngineProcessException {
        TermIndex termIndex = this.termIndexResource.getTermIndex();
        LOGGER.info("Starting syntactic term gathering for TermIndex {}", termIndex.getName());

        if (termIndex.getTerms().isEmpty()) {
            return;
        }

        /*
         * Prepare observer and indexes
         */
        for (RunConfig runConfig : RUN_CONFIGS) {
            CustomTermIndex customIndex = termIndex.getCustomIndex(runConfig.indexName);
            customIndex.cleanSingletonKeys();

            // clean biggest classes
            customIndex.cleanEntriesByMaxSize(WARNING_CRITICAL_SIZE);

            CustomIndexStats stats = new CustomIndexStats(customIndex);

            // Display class sizes
            Stopwatch sw1 = Stopwatch.createStarted();
            LOGGER.debug("Biggest class is {}, size: {}", stats.getBiggestClass(), stats.getBiggestSize());

            // NOTE(review): this counts the keys of the size counters, which looks like
            // the number of distinct class sizes rather than the number of classes - confirm.
            int nbSizeCounters = 0;
            for (Integer classSize : stats.getSizeCounters().keySet()) {
                nbSizeCounters++;
                int nbEntriesForSize = stats.getSizeCounters().get(classSize).size();
                long s = classSize.longValue();
                /*
                 * nbEntriesForSize * s * (s - 1), computed entirely with
                 * BigInteger: the former int arithmetic overflowed before
                 * the result was widened.
                 */
                BigInteger comparisons = BigInteger.valueOf(nbEntriesForSize)
                        .multiply(BigInteger.valueOf(s))
                        .multiply(BigInteger.valueOf(s - 1));
                totalComparisons = totalComparisons.add(comparisons);
            }
            LOGGER.debug("Number of term pairs to test: {}", totalComparisons);
            sw1.stop();
            LOGGER.debug("Time to get the comparisons number: {}", sw1.elapsed(TimeUnit.MILLISECONDS));
            LOGGER.debug("Number of classes: {}", nbSizeCounters);

            if (taskObserver.isPresent()) {
                taskObserver.get().setTotalTaskWork(totalComparisons.longValue());
            }
        }

        LOGGER.debug("Gathering with default variant rule indexing (source and target patterns)");
        for (RunConfig runConfig : RUN_CONFIGS) {
            gather(runConfig.indexName, runConfig.variantRuleIndex);
            termIndex.dropCustomIndex(runConfig.indexName);
        }
    }

    /**
     * Applies the variant rules, in both directions, to every unordered pair
     * of terms found in the same class of the given custom index. Progress is
     * logged every 5 seconds by a timer, which is always cancelled on exit.
     *
     * @param gatheringKey the name of the custom index to gather over
     * @param variantRuleIndex the rule index passed to the rule matcher
     */
    private void gather(final String gatheringKey, VariantRuleIndex variantRuleIndex) {
        LOGGER.debug("Rule-based gathering over the pregathering key {}", gatheringKey);

        // create the index
        CustomTermIndex customIndex = this.termIndexResource.getTermIndex().getCustomIndex(gatheringKey);
        LOGGER.debug("Rule-based gathering over {} classes", customIndex.size());

        // Log the progress every 5 seconds
        Timer progressLoggerTimer = new Timer("Syn. Variant Gathering Timer");
        progressLoggerTimer.schedule(new TimerTask() {
            @Override
            public void run() {
                SyntacticTermGatherer.LOGGER.info("progress for key {}: ({}%)",
                        gatheringKey,
                        String.format("%.2f", ((float) nbComparisons * 100) / totalComparisons.longValue())
                        );
            }
        }, 5000L, 5000L);

        try {
            // Do the gathering in each class
            nbComparisons = 0;
            for (String cls : customIndex.keySet()) {
                List<Term> terms = customIndex.getTerms(cls);
                traceClass(cls, terms);

                for (ListIterator<Term> sourceIt = terms.listIterator(); sourceIt.hasNext();) {
                    Term source = sourceIt.next();
                    // Targets start right after the source: each unordered pair is visited once
                    for (ListIterator<Term> targetIt = terms.listIterator(sourceIt.nextIndex()); targetIt.hasNext();) {
                        Term target = targetIt.next();
                        // Two rule applications per pair, one for each direction
                        nbComparisons += 2;
                        applyGatheringRules(variantRuleIndex, source, target);
                        applyGatheringRules(variantRuleIndex, target, source);
                        if (nbComparisons % OBSERVING_STEP == 0 && taskObserver.isPresent()) {
                            taskObserver.get().work(OBSERVING_STEP);
                        }
                    }
                }
            }
        } finally {
            // Always stop the timer thread, even when rule matching throws
            progressLoggerTimer.cancel();
        }
    }

    /**
     * Traces the class being processed with up to six example grouping keys.
     * Does nothing unless trace logging is enabled and the class has more
     * than one term.
     *
     * @param cls the key of the class in the custom index
     * @param terms the terms of that class
     */
    private void traceClass(String cls, List<Term> terms) {
        if (terms.size() <= 1 || !LOGGER.isTraceEnabled()) {
            return;
        }
        List<String> examples = Lists.newLinkedList();
        int cnt = 0;
        for (Term t : terms) {
            examples.add(t.getGroupingKey());
            cnt++;
            if (cnt > 5) {
                break;
            }
        }
        LOGGER.trace("Rule-based gathering over the '{}' term class of size {}: {}",
                cls, terms.size(), Joiner.on(" ").join(examples));
    }

    /**
     * Looks up the rule matching the ordered pair (source, target) and, when
     * there is one, records the corresponding variation.
     */
    private void applyGatheringRules(VariantRuleIndex variantRuleIndex, Term source, Term target) {
        VariantRule matchingRule = yamlVariantRules.getMatchingRule(variantRuleIndex, source, target);
        if (matchingRule != null) {
            applyMatchingRule(matchingRule, source, target);
        }
    }

    /**
     * Adds a variation from source to target, typed MORPHOLOGICAL when the
     * rule name starts with {@link #M_PREFIX} and SYNTACTICAL otherwise, then
     * records the event in the term history for watched terms.
     */
    private void applyMatchingRule(VariantRule matchingRule, Term source, Term target) {
        // Warn about terms whose frequency has not been set
        checkFrequency(source);
        checkFrequency(target);

//        /*
//         * Reverse the variation when target frequency is
//         * bigger than source frequency
//         */
//        if(baseTargetComparator.compare(source, target) > 0) {
//            // swaps terms, sets the most frequent and shortest as the source
//            Term aux = source;
//            source = target;
//            target = aux;
//        }

        VariationType variationType = matchingRule.getName().startsWith(M_PREFIX)
                ? VariationType.MORPHOLOGICAL
                : VariationType.SYNTACTICAL;
        TermVariation tv = source.addTermVariation(
                target,
                variationType,
                matchingRule.getName());
        watch(source, target, tv);
    }

    /**
     * Saves a history event for the source and/or the target of the variation
     * when their grouping key is watched.
     */
    private void watch(Term source, Term target, TermVariation tv) {
        if (historyResource.getHistory().isWatched(source.getGroupingKey())) {
            historyResource.getHistory().saveEvent(
                    source.getGroupingKey(),
                    this.getClass(),
                    "Term has a new variation: " + tv);
        }
        if (historyResource.getHistory().isWatched(target.getGroupingKey())) {
            historyResource.getHistory().saveEvent(
                    target.getGroupingKey(),
                    this.getClass(),
                    "Term has a new variation base: " + tv);
        }
    }

    /** Logs a warning when the frequency of the term is zero. */
    private void checkFrequency(Term term) {
        if (term.getFrequency() == 0) {
            LOGGER.warn("Frequency of term {} must be greater than 0 before running SyntactticTermGatherer AE", term.getGroupingKey());
        }
    }

    @Override
    public void process(JCas cas) throws AnalysisEngineProcessException {
        // nothing to do at cas level
    }

//    private static Comparator<Term> baseTargetComparator = new Comparator<Term>() {
//        public int compare(Term a, Term b) {
//            return ComparisonChain.start()
//                    .compare(b.getFrequency(), a.getFrequency())
//                    .compare(a.getWords().size(), b.getWords().size())
//                    .result();
//        }
//    };
}