package eu.project.ttc.termino.export; import java.io.IOException; import java.io.Writer; import java.util.Collections; import java.util.List; import com.google.common.collect.Lists; import eu.project.ttc.api.TermSuiteException; import eu.project.ttc.models.Term; import eu.project.ttc.models.TermIndex; import eu.project.ttc.models.TermOccurrence; import eu.project.ttc.models.TermVariation; import eu.project.ttc.models.VariationType; public class VariantEvalExporter { private TermIndex termIndex; private Writer writer; private int nbVariantsPerTerm; private int contextSize; private int nbExampleOccurrences; private int topN; private VariantEvalExporter(TermIndex termIndex, Writer writer, int nbVariantsPerTerm, int contextSize, int nbExampleOccurrences, int topN) { super(); this.termIndex = termIndex; this.writer = writer; this.nbVariantsPerTerm = nbVariantsPerTerm; this.contextSize = contextSize; this.nbExampleOccurrences = nbExampleOccurrences; this.topN = topN; } public static void export(TermIndex termIndex, Writer writer, int nbVariantsPerTerm, int contextSize, int nbExampleOccurrences, int topN) { new VariantEvalExporter(termIndex, writer, nbVariantsPerTerm, contextSize, nbExampleOccurrences, topN).doExport(); } private void doExport() { try { int rank = 0; int variantCnt = 0; for(Term t:termIndex.getTerms()) { if(t.isVariant()) continue; printBase(++rank, t); int variantRank = 0; for(TermVariation variation:t.getVariations(VariationType.MORPHOLOGICAL, VariationType.SYNTACTICAL)) { if(variantRank >= nbVariantsPerTerm) break; variantCnt++; variantRank++; printVariation(rank, variantRank, variation); printTermOccurrences(variation.getVariant()); } if(variantCnt>this.topN) break; } } catch (IOException e) { throw new TermSuiteException(e); } } private void printVariation(int termRank, int variantRank, TermVariation variation) throws IOException { Term variant = variation.getVariant(); String pilot = variant.getForms().iterator().next(); writer.write(Integer.toString(termRank)); writer.write("\t"); writer.write("V_" + Integer.toString(variantRank)); writer.write("\t"); writer.write(String.format("<%s>", variation.getInfo().toString())); writer.write("\t"); writer.write(String.format("%s (%d)", pilot, variant.getFrequency())); writer.write("\t"); writer.write(String.format("[%s]", variant.getGroupingKey())); writer.write("\t"); writer.write("{is_variant: _0_or_1_, variant_type: _syn_termino_other_}"); writer.write("\n"); } private void printBase(int rank, Term t) throws IOException { writer.write(Integer.toString(rank)); writer.write("\t"); writer.write("T"); writer.write("\t"); writer.write(t.getForms().iterator().next()); writer.write("\t"); writer.write(String.format("[%s]", t.getGroupingKey())); writer.write("\n"); } private void printTermOccurrences(Term term) throws IOException { List<TermOccurrence> occurrences = Lists.newArrayList(term.getOccurrences()); Collections.shuffle(occurrences); int occCnt = 0; for(TermOccurrence occurrence:occurrences) { if(occCnt > this.nbExampleOccurrences) break; printOccurrence(occurrence); occCnt++; } } private void printOccurrence(TermOccurrence occurrence) throws IOException { writer.write("#\t\t ..."); String textualContext = occurrence.getTextualContext(contextSize); writer.write(textualContext); writer.write("\n"); } }