/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.test.unit.io;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.offset;
import static org.assertj.core.api.Assertions.tuple;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.junit.Before;
import org.junit.Test;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import eu.project.ttc.api.JsonOptions;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.models.CompoundType;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermBuilder;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.WordBuilder;
import eu.project.ttc.models.index.JsonTermIndexIO;
import eu.project.ttc.models.index.MemoryTermIndex;
import eu.project.ttc.models.occstore.MemoryOccurrenceStore;
import eu.project.ttc.test.unit.TestUtil;
public class JsonTermIndexIOSpec {
public static final String jsonFile1 = "org/project/ttc/test/json/termIndex1.json";
private TermIndex termIndex;
private Term term1;
private Term term2;
private Word word1;
private Word word2;
private Word word3;
private Document doc1;
private Document doc2;
private Document doc3;
private String json1;
@Before
public void initTermIndex() {
termIndex = new MemoryTermIndex("Titi va voir Toto", Lang.FR, new MemoryOccurrenceStore());
termIndex.setCorpusId("ccid");
termIndex.setWordAnnotationsNum(222);
termIndex.setSpottedTermsNum(111);
doc1 = termIndex.getDocument("source1");
doc2 = termIndex.getDocument("source2");
doc3 = termIndex.getDocument("source3");
word1 = new Word("word1", "stem1");
word2 = new Word("word2", "stem2");
word3 = WordBuilder.start()
.setLemma("word3")
.setStem("stem3")
.addComponent(0, 2,"wop")
.addComponent(2, 5,"rd3")
.setCompoundType(CompoundType.NATIVE)
.create();
term1 = TermBuilder.start(termIndex)
.setRank(1)
.addWord(word1, "L1")
.addWord(word2, "L2")
.addOccurrence(10, 12, doc2, "coveredText 3")
.addOccurrence(20, 30, doc3, "coveredText 4")
.setSpottingRule("spotRule1")
.setSpecificity(1.1)
.createAndAddToIndex();
term2 = TermBuilder.start(termIndex)
.setRank(2)
.addWord(word1, "L1")
.addWord(word2, "L2")
.addWord(word3, "L3")
.setSpottingRule("spotRule1")
.addOccurrence(0, 2, doc2, "coveredText 1")
.addOccurrence(10, 12, doc1, "coveredText 2")
.addOccurrence(14, 20, doc2, "coveredText 2")
.setSpecificity(2.2)
.createAndAddToIndex();
term1.addTermVariation(term2, VariationType.SYNTACTICAL, "variationRule1");
term1.addTermVariation(term2, VariationType.GRAPHICAL, 0.956d);
// generate context vectors
ContextVector v = new ContextVector(term1);
v.addEntry(term2, 21, 2.0);
term1.setContextVector(v);
}
@Before
public void initJsonTermIndex() {
json1 = TestUtil.readFile(jsonFile1);
}
@Test
public void testSaveLoadReturnWithNoVariant() throws IOException {
term1.removeTermVariation(term1.getVariations(VariationType.SYNTACTICAL).iterator().next());
StringWriter writer = new StringWriter();
JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true));
String string = writer.toString();
JsonTermIndexIO.load(new StringReader(string), new JsonOptions().withOccurrences(true));
}
@Test
public void testSaveLoadReturn() throws IOException {
StringWriter writer = new StringWriter();
JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true));
String string = writer.toString();
TermIndex termIndex2 = JsonTermIndexIO.load(new StringReader(string), new JsonOptions().withOccurrences(true));
assertEquals(111, termIndex2.getSpottedTermsNum());
assertEquals(222, termIndex2.getWordAnnotationsNum());
assertThat(termIndex2.getTerms()).hasSameElementsAs(termIndex.getTerms());
assertThat(termIndex2.getWords()).hasSameElementsAs(termIndex.getWords());
for(Term t:termIndex.getTerms()) {
Term t2 = termIndex2.getTermByGroupingKey(t.getGroupingKey());
assertThat(t2.getOccurrences()).hasSameElementsAs(t.getOccurrences());
assertThat(t2.getVariations()).hasSameElementsAs(t.getVariations());
assertThat(t2.getBases()).hasSameElementsAs(t.getBases());
assertThat(t2.getForms()).hasSameElementsAs(t.getForms());
assertThat(t2.getFrequency()).isEqualTo(t.getFrequency());
assertThat(t2.getSpecificity()).isEqualTo(t.getSpecificity());
assertThat(t2.getFrequencyNorm()).isEqualTo(t.getFrequencyNorm());
assertThat(t2.getGeneralFrequencyNorm()).isEqualTo(t.getGeneralFrequencyNorm());
assertThat(t2.getSpottingRule()).isEqualTo(t.getSpottingRule());
assertThat(t2.getPattern()).isEqualTo(t.getPattern());
assertThat(t2.getWords()).isEqualTo(t.getWords());
assertThat(t2.getRank()).isEqualTo(t.getRank());
if(t2.getId() == term1.getId()) {
assertTrue(t.isContextVectorComputed());
assertTrue(t2.isContextVectorComputed());
assertThat(t2.getContextVector()).isEqualTo(t.getContextVector());
} else if(t2.getId() == term2.getId()) {
assertFalse(t.isContextVectorComputed());
assertFalse(t2.isContextVectorComputed());
} else {
fail("should never happen");
}
}
for(Word w:termIndex.getWords()) {
Word w2 = termIndex2.getWord(w.getLemma());
assertThat(w2.getStem()).isEqualTo(w.getStem());
assertThat(w2.isCompound()).isEqualTo(w.isCompound());
assertThat(w2.getCompoundType()).isEqualTo(w.getCompoundType());
assertThat(w2.getComponents()).hasSameElementsAs(w.getComponents());
}
}
@Test
public void testExportTermIndexToJsonWithoutOccurrences() throws IOException {
StringWriter writer = new StringWriter();
JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(false));
ObjectMapper mapper = new ObjectMapper();
Map<String,Object> map = mapper.readValue(writer.toString(),
new TypeReference<HashMap<String,Object>>(){});
@SuppressWarnings("unchecked")
Map<String,Object> t1 = (Map<String,Object>)((List<?>)map.get("terms")).iterator().next();
assertThat(t1.keySet()).contains("id", "key").doesNotContain("occurrences");
}
@Test
public void testLoadJsonTermIndex() throws IOException {
TermIndex termIndex = JsonTermIndexIO.load(new StringReader(json1), new JsonOptions().withOccurrences(true));
assertEquals("Toto va à la plage", termIndex.getName());
assertEquals("Toto va à la montagne", termIndex.getCorpusId());
assertEquals(Lang.EN, termIndex.getLang());
assertEquals(123, termIndex.getWordAnnotationsNum());
assertEquals(456, termIndex.getSpottedTermsNum());
// test term rank
assertThat(termIndex.getTerms()).hasSize(3)
.extracting("rank")
.containsOnly(1, 2, 3)
;
// test terms
assertThat(termIndex.getTerms()).hasSize(3)
.extracting("groupingKey")
.containsOnly("na: word1 word2", "n: word1", "a: word2")
;
// test terms
Term t1 = termIndex.getTermByGroupingKey("na: word1 word2");
Term t2 = termIndex.getTermByGroupingKey("n: word1");
Term t3 = termIndex.getTermByGroupingKey("a: word2");
assertThat(t1.getId()).isEqualTo(1);
assertThat(t1.getSpecificity()).isCloseTo(0.321d, offset(0.000001d));
assertThat(t1.getFrequencyNorm()).isCloseTo(0.123d, offset(0.000001d));
assertThat(t1.getGeneralFrequencyNorm()).isCloseTo(0.025d, offset(0.000001d));
assertThat(t1.getFrequency()).isEqualTo(6);
assertThat(t1.getVariations(VariationType.GRAPHICAL)).extracting("variant").containsOnly(t2);
assertThat(t1.getVariations(VariationType.SYNTACTICAL)).hasSize(0);
assertThat(t1.getBases())
.hasSize(2)
.extracting("base")
.containsOnly(t2, t3);
// test words
assertThat(termIndex.getWords()).hasSize(2)
.extracting("lemma", "stem")
.containsOnly(
tuple("word1", "stem1"),
tuple("word2", "stem2")
);
// test word composition
Iterator<Word> iterator = termIndex.getWords().iterator();
Word w1 = iterator.next();
assertFalse(w1.isCompound());
assertThat(w1.getComponents()).hasSize(0);
Word w2 = iterator.next();
assertTrue(w2.isCompound());
assertThat(w2.getComponents())
.extracting("lemma", "begin", "end")
.containsOnly(
tuple("wor", 0, 3),
tuple("d3", 3, 5)
);
assertThat(t1.getContextVector().getEntries())
.hasSize(2)
.extracting("coTerm.id", "nbCooccs", "assocRate")
.contains(
tuple(2, 18, 1.2000000476837158d),
tuple(3, 12, 6.5d)
);
}
@Test
public void testExportTermIndexToJsonWithOccurrencesAndContext() throws IOException {
StringWriter writer = new StringWriter();
JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true));
ObjectMapper mapper = new ObjectMapper();
// System.out.println(writer.toString());
Map<String,Object> map = mapper.readValue(writer.toString(),
new TypeReference<HashMap<String,Object>>(){});
assertThat(map.keySet()).hasSize(5).containsOnly("metadata", "words", "terms", "variations", "input_sources");
// test metadata
Map<String,String> metadata = (LinkedHashMap<String,String>)map.get("metadata");
assertThat(metadata).containsOnlyKeys("name", "corpus-id", "wordsNum", "spottedTermsNum", "lang", "occurrence_storage");
// test input sources1
@SuppressWarnings("unchecked")
Map<String,String> inputSources = (LinkedHashMap<String,String>)map.get("input_sources");
assertThat(inputSources).containsOnlyKeys("1", "2", "3");
assertThat(inputSources.values()).containsOnly("source1", "source2", "source3");
// test words
List<?> wordList = (List<?>)map.get("words");
assertThat(wordList).hasSize(3).extracting("lemma").containsOnly("word1", "word2", "word3");
LinkedHashMap<?,?> w3 = null;
for(Object wl:wordList) {
if(((LinkedHashMap<?, ?>)wl).get("lemma").equals("word3"))
w3 = (LinkedHashMap<?,?>)wl;
}
assertEquals("word3", w3.get("lemma"));
assertEquals("stem3", w3.get("stem"));
assertEquals("NATIVE", w3.get("compound_type"));
List<?> components = (List<?>)w3.get("components");
assertThat(components).hasSize(2).extracting("lemma", "begin", "end").contains(tuple("wop", 0, 2), tuple("rd3", 2, 5));
// test terms
BiMap<String, String> sources = HashBiMap.create(inputSources);
List<?> termList = (List<?>)map.get("terms");
assertThat(termList).hasSize(2).extracting("id").containsOnly(term1.getId(), term2.getId());
LinkedHashMap<?,?> t1 = (LinkedHashMap<?,?>)termList.get(0);
assertThat(t1.get("rank")).isEqualTo(1);
assertThat(t1.get("spec")).isEqualTo(1.1);
assertThat((List<?>)t1.get("words")).extracting("lemma", "syn").containsOnly(tuple("word1", "L1"), tuple("word2", "L2"));
assertThat((List<?>)t1.get("occurrences")).hasSize(2).extracting("begin", "end", "file", "text").containsOnly(
tuple(10, 12, Integer.parseInt(sources.inverse().get("source2")), "coveredText 3"),
tuple(20, 30, Integer.parseInt(sources.inverse().get("source3")), "coveredText 4")
);
final Map<?,?> t1Ctxt = (Map<?,?>)t1.get("context");
assertEquals(21, t1Ctxt.get("total_cooccs"));
assertThat((List<?>)t1Ctxt.get("cooccs"))
.hasSize(1)
.extracting("co_term", "cnt", "assoc_rate")
.contains(tuple("l1l2l3: word1 word2 word3", 21, 2.0d));
LinkedHashMap<?,?> t2 = (LinkedHashMap<?,?>)termList.get(1);
assertThat((List<?>)t2.get("occurrences")).hasSize(3).extracting("begin", "end", "file", "text").containsOnly(
tuple(0, 2, Integer.parseInt(sources.inverse().get("source2")), "coveredText 1"),
tuple(10, 12, Integer.parseInt(sources.inverse().get("source1")), "coveredText 2"),
tuple(14, 20, Integer.parseInt(sources.inverse().get("source2")), "coveredText 2")
);
assertThat((List<?>)t2.get("words")).extracting("lemma", "syn").containsOnly(tuple("word1", "L1"), tuple("word2", "L2"), tuple("word3", "L3"));
// test syntactic variants
List<?> variantList = (List<?>)map.get("variations");
assertThat(variantList).hasSize(2)
.extracting("base", "variant", "info", "type")
.contains(
tuple(term1.getGroupingKey(), term2.getGroupingKey(), "variationRule1", "syn"),
tuple(term1.getGroupingKey(), term2.getGroupingKey(), "0.956", "graph")
);
}
}