/*
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.fangorn;
import static java.util.Arrays.asList;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import au.edu.unimelb.csse.queryParser.QueryBuilder;
import au.edu.unimelb.csse.search.SimpleHitCollector;
import au.edu.unimelb.csse.search.TreebankQuery;
import au.edu.unimelb.csse.search.complete.AllResults;
import au.edu.unimelb.csse.search.complete.Result;
import au.edu.unimelb.csse.search.join.TermJoinType;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpParser;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter;
/**
 * Round-trip test for {@link FangornWriter}: a small English document is segmented, parsed and
 * written to an index, which is then queried to check what was stored.
 */
public class FangornWriterTest
{
    /**
     * Runs segmenter, parser and {@link FangornWriter} over a three-sentence document, then
     * queries the index written to {@code target/test-output} for {@code //NP} and compares,
     * for every hit, the stored collection id, document id, begin/end fields and the JSON
     * description of the matches against the expected values.
     *
     * @throws Exception
     *             if the pipeline, the index access or the query fails.
     */
    @Test
    public void test()
        throws Exception
    {
        File outputFile = new File("target/test-output");

        JCas jcas = JCasFactory.createJCas();
        jcas.setDocumentLanguage("en");
        jcas.setDocumentText("This is a test. I may work. Or it may not work.");

        // The writer output is later checked for these two identifiers.
        DocumentMetaData meta = DocumentMetaData.create(jcas);
        meta.setCollectionId("dummyCollection");
        meta.setDocumentId("dummyId");

        AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class);

        AnalysisEngineDescription parser = createEngineDescription(OpenNlpParser.class,
                OpenNlpParser.PARAM_WRITE_PENN_TREE, true);

        AnalysisEngineDescription writer = createEngineDescription(FangornWriter.class,
                FangornWriter.PARAM_TARGET_LOCATION, outputFile);

        SimplePipeline.runPipeline(jcas, segmenter, parser, writer);

        List<String> actual = new ArrayList<String>();

        IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(outputFile));
        try {
            QueryBuilder builder = new QueryBuilder("//NP");
            TreebankQuery tq = builder.parse(TermJoinType.SIMPLE_WITH_FC, false);
            SimpleHitCollector hitCollector = new SimpleHitCollector(100);
            searcher.search(tq, hitCollector);
            AllResults allResults = new AllResults(hitCollector.hits, hitCollector.totalHits, tq);
            Result[] resultMeta = allResults.collect(searcher);

            // One line per hit: stored metadata fields followed by the match description.
            for (int i = 0; i < hitCollector.totalHits; i++) {
                Document doc = searcher.doc(hitCollector.hits[i]);
                actual.add(String.format("%s %s %s %s %s",
                        doc.get(FangornWriter.FIELD_COLLECTION_ID),
                        doc.get(FangornWriter.FIELD_DOCUMENT_ID),
                        doc.get(FangornWriter.FIELD_BEGIN),
                        doc.get(FangornWriter.FIELD_END),
                        // Single quotes keep the expected strings below free of escapes.
                        resultMeta[i].asJSONString().replace('"', '\'')));
            }
        }
        finally {
            // Close the searcher even if the query or document access above throws.
            searcher.close();
        }

        List<String> expected = asList(
                "dummyCollection dummyId 0 15 {'num':'2','ms':[{'m':[{'s':'','e':'1_0_2_8','o':'0','t':'0'}]},{'m':[{'s':'','e':'4_2_3_6','o':'0','t':'0'}]}]}",
                "dummyCollection dummyId 16 27 {'num':'1','ms':[{'m':[{'s':'','e':'1_0_2_7','o':'0','t':'0'}]}]}",
                "dummyCollection dummyId 28 47 {'num':'1','ms':[{'m':[{'s':'','e':'2_1_2_9','o':'0','t':'0'}]}]}");

        // Compare as joined text so a failure shows all lines side by side.
        assertEquals(StringUtils.join(expected, "\n"), StringUtils.join(actual, "\n"));
    }
}