CasStatCounter.java example

Explorer
termsuite-core-master
- src
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.engines;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.mutable.MutableInt;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;

import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;

/**
 * Compiles and logs CAS stats.
 * <p>
 * For each processed CAS, annotations are counted by the short name of their
 * type. Optionally, one TSV line of progress figures is appended to a trace file
 * every {@link #DOCUMENT_PERIOD} documents, and a summary of the term index is
 * logged when the collection has been fully processed.
 * 
 * @author Damien Cram
 *
 */
public class CasStatCounter extends JCasAnnotator_ImplBase {
	private static final Logger LOGGER = LoggerFactory.getLogger(CasStatCounter.class);
	
	/** Number of annotations seen so far, keyed by the short name of their type. */
	private Map<String, MutableInt> counters = Maps.newHashMap();
	
	/**
	 * Label used as a prefix of the summary log lines. When not set, no summary
	 * is logged at the end of the collection.
	 */
	public static final String STAT_NAME = "StatName";
	@ConfigurationParameter(name=STAT_NAME, mandatory=false)
	private String statName;

	/**
	 * Number of documents between two lines written to the trace file. Periodic
	 * tracing is only active when a trace file is configured and this value is
	 * strictly positive.
	 */
	public static final String DOCUMENT_PERIOD = "DocumentPeriod";
	@ConfigurationParameter(name=DOCUMENT_PERIOD, mandatory=false, defaultValue = "-1")
	private int docPeriod;
	private boolean periodicStatEnabled = false;
	/** Number of CASes processed so far. */
	private int docIt;
	/** Sum of the document sizes reported by the source document annotations. */
	private long cumulatedFileSize;
	
	/** Path of the file receiving the periodic TSV trace lines. */
	public static final String TO_TRACE_FILE = "ToTraceFile";
	@ConfigurationParameter(name=TO_TRACE_FILE, mandatory=false)
	private String traceFileName;
	/** Writer to the trace file, or <code>null</code> when no trace file is configured. */
	private Writer fileWriter;

	/*
	 * Columns: elapsed time (ms), number of documents, cumulated file size,
	 * number of terms in the term index, number of word annotations.
	 */
	private static final String TSV_LINE_FORMAT="%d\t%d\t%d\t%d\t%d\n";
	
	@ExternalResource(key=TermIndexResource.TERM_INDEX, mandatory=true)
	private TermIndexResource termIndexResource;

	/** Measures the time elapsed since {@link #initialize(UimaContext)}. */
	private Stopwatch sw;

	/**
	 * Starts the stopwatch and, when a trace file is configured, opens a writer
	 * to it and enables periodic tracing if the document period is positive.
	 * 
	 * @param context
	 * 			the UIMA context of this engine
	 * @throws ResourceInitializationException
	 * 			if the trace file cannot be opened for writing
	 */
	@Override
	public void initialize(UimaContext context) throws ResourceInitializationException {
		super.initialize(context);
		this.sw = Stopwatch.createStarted();
		if(traceFileName != null) {
			File file = new File(traceFileName);
			try {
				this.fileWriter = new FileWriter(file);
			} catch (IOException e) {
				LOGGER.error("Could not create a writer to file {}", traceFileName);
				throw new ResourceInitializationException(e);
			}
			this.periodicStatEnabled = docPeriod > 0;
			LOGGER.info("Tracing time performance to file {}", file.getAbsolutePath());
		}
	}
	
	/**
	 * Updates the document counter, the cumulated file size and the per-type
	 * annotation counters with the content of the given CAS, then writes a
	 * trace line if the document period has been reached.
	 * 
	 * @param aJCas
	 * 			the CAS to count annotations from
	 * @throws AnalysisEngineProcessException
	 * 			if the trace line cannot be written to the trace file
	 */
	@Override
	public void process(JCas aJCas) throws AnalysisEngineProcessException {
		this.docIt++;
		Optional<SourceDocumentInformation> sourceDocumentAnnotation = JCasUtils.getSourceDocumentAnnotation(aJCas);
		if(sourceDocumentAnnotation.isPresent())
			this.cumulatedFileSize += sourceDocumentAnnotation.get().getDocumentSize();
		FSIterator<Annotation> it =  aJCas.getAnnotationIndex().iterator();
		while(it.hasNext()) {
			String typeName = it.next().getType().getShortName();
			MutableInt counter = counters.get(typeName);
			if(counter == null) 
				counters.put(typeName, new MutableInt(1));
			else
				counter.increment();
		}
		if(periodicStatEnabled && this.docIt % this.docPeriod == 0) {
			try {
				traceToFile();
			} catch (IOException e) {
				throw new AnalysisEngineProcessException(e);
			}
		}
	}
	
	/**
	 * Appends one TSV line with the current figures to the trace file and
	 * flushes it.
	 * 
	 * @throws IOException
	 * 			if the line cannot be written
	 */
	private void traceToFile() throws IOException {
		// No word annotation may have been counted yet (e.g. empty documents):
		// report 0 rather than failing on a missing counter.
		MutableInt wordAnnotations = this.counters.get(WordAnnotation.class.getSimpleName());
		String line = String.format(TSV_LINE_FORMAT,
			this.sw.elapsed(TimeUnit.MILLISECONDS),
			this.docIt,
			this.cumulatedFileSize,
			this.termIndexResource.getTermIndex().getTerms().size(),
			wordAnnotations == null ? 0 : wordAnnotations.intValue()
		);
		LOGGER.debug(line);
		this.fileWriter.write(line);
		this.fileWriter.flush();
	}

	/**
	 * Closes the trace file writer, if any, and disables periodic tracing.
	 * Safe to call several times and when no trace file was configured.
	 */
	private void closeTraceWriter() {
		if(this.fileWriter == null)
			return;
		try {
			this.fileWriter.close();
		} catch (IOException e) {
			LOGGER.warn("Could not close the trace file {}", traceFileName, e);
		} finally {
			this.fileWriter = null;
			this.periodicStatEnabled = false;
		}
	}

	/**
	 * Releases the trace file writer when the engine is destroyed, so that the
	 * file handle does not depend on garbage collection to be freed.
	 */
	@Override
	public void destroy() {
		closeTraceWriter();
		super.destroy();
	}

	/**
	 * Last-resort release of the trace file writer, in case {@link #destroy()}
	 * was never invoked.
	 */
	@Override
	protected void finalize() throws Throwable {
		try {
			closeTraceWriter();
		} finally {
			super.finalize();
		}
	}

	/**
	 * Logs the summary stats if a stat name has been configured.
	 */
	@Override
	public void collectionProcessComplete()
			throws AnalysisEngineProcessException {
		if(statName != null)
			logStats();
	}

	/**
	 * Logs the most frequent annotation type and the counts of terms, words,
	 * occurrences and variants found in the term index.
	 */
	private void logStats() {
		// most frequent annotation types first, ties broken by type name
		Ordering<String> a = Ordering.natural().reverse().onResultOf(Functions.forMap(counters)).compound(Ordering.natural());
		Map<String, MutableInt> map = ImmutableSortedMap.copyOf(counters, a);
		
		Iterator<Entry<String, MutableInt>> it = map.entrySet().iterator();
		if(it.hasNext()) {// it will be empty if pipeline is run on empty collection
			Entry<String, MutableInt> mostFrequentAnno = it.next();
			LOGGER.info("[{}] {}: {} ", statName, mostFrequentAnno.getKey(), mostFrequentAnno.getValue().intValue());
		}
		int nbSyntacticVariants = 0;
		int nbMorphologicalVariants = 0;
		int nbGraphicalVariants = 0;
		int nbOccurrences = 0;
		int nbPrimaryOccOccurrences = 0;
		TermIndex tIndex = termIndexResource.getTermIndex();
		for(Term t:tIndex.getTerms()) {
			nbMorphologicalVariants+=Iterables.size(t.getVariations(VariationType.MORPHOLOGICAL));
			nbSyntacticVariants+=Iterables.size(t.getVariations(VariationType.SYNTACTICAL));
			nbGraphicalVariants+=Iterables.size(t.getVariations(VariationType.GRAPHICAL));
			nbOccurrences+=t.getOccurrences().size();
			for(TermOccurrence o:t.getOccurrences()) {
				if(o.isPrimaryOccurrence())
					nbPrimaryOccOccurrences++;
			}
		}
		// graphical variants are bidirectional
		nbGraphicalVariants/=2;
		
		LOGGER.info("[{}] Nb terms:    {} [sw: {}, mw: {}]", statName, 
				tIndex.getTerms().size(), 
				Iterators.size(tIndex.singleWordTermIterator()),
				Iterators.size(tIndex.multiWordTermIterator()));
		LOGGER.info("[{}] Nb words:    {} [compounds: {}]", statName, 
				tIndex.getWords().size(), 
				Iterators.size(tIndex.compoundWordTermIterator()));
		LOGGER.info("[{}] Nb occurrences: {} [primary: {}]", statName, 
				nbOccurrences, 
				nbPrimaryOccOccurrences);
		LOGGER.info("[{}] Nb variants: {} [morph: {}, syn: {}, graph: {}]", statName, 
				nbMorphologicalVariants + nbSyntacticVariants + nbGraphicalVariants, 
				nbMorphologicalVariants, 
				nbSyntacticVariants, 
				nbGraphicalVariants);
	}
}