/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.msnbc;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;
/**
 * Checks that {@link MSNBCDataset} loads the single test document found in
 * the test resource directories and exposes the expected URI, text and
 * markings.
 */
public class MSNBCDatasetTest {

    /*
     * The test file contains two special cases. One marking with the URI
     * "*null*" and two markings that are overlapping.
     */
    private static final String TEST_ANNOTATION_DIR = "src/test/resources/datasets/msnbc/annot";
    private static final String TEST_TEXT_DIR = "src/test/resources/datasets/msnbc/texts";
    private static final String DATASET_NAME = "testDataset";
    private static final String EXPECTED_DOCUMENT_URI = "http://testDataset/test.txt";
    private static final String EXPECTED_TEXT = "Home Depot CEO Nardelli quits \nHome-improvement retailer's chief executive had been criticized over pay \n \nATLANTA - Bob Nardelli abruptly resigned Wednesday as chairman and chief executive of The Home Depot Inc. after a six-year tenure that saw the world’s largest home improvement store chain post big profits but left investors disheartened by poor stock performance.";
    private static final Marking[] EXPECTED_MARKINGS = new Marking[] {
            (Marking) new NamedEntity(0, 10,
                    new HashSet<String>(Arrays.asList("http://en.wikipedia.org/wiki/Home_Depot",
                            "http://dbpedia.org/resource/Home_Depot"))),
            (Marking) new NamedEntity(11, 3, new HashSet<String>(Arrays.asList("*null*"))),
            (Marking) new NamedEntity(15, 8,
                    new HashSet<String>(Arrays.asList("http://en.wikipedia.org/wiki/Robert_Nardelli",
                            "http://dbpedia.org/resource/Robert_Nardelli"))),
            (Marking) new NamedEntity(107, 7,
                    new HashSet<String>(Arrays.asList("http://en.wikipedia.org/wiki/Atlanta,_Georgia",
                            "http://dbpedia.org/resource/Atlanta,_Georgia"))),
            (Marking) new NamedEntity(117, 12,
                    new HashSet<String>(Arrays.asList("http://en.wikipedia.org/wiki/Robert_Nardelli",
                            "http://dbpedia.org/resource/Robert_Nardelli"))),
            (Marking) new NamedEntity(193, 19, new HashSet<String>(Arrays
                    .asList("http://en.wikipedia.org/wiki/Home_Depot", "http://dbpedia.org/resource/Home_Depot"))) };

    /**
     * Loads the test dataset and compares the single resulting document with
     * the expected URI, text and set of markings.
     *
     * @throws GerbilException
     *             if the dataset cannot be initialized or its instances
     *             cannot be retrieved
     */
    @Test
    public void test() throws GerbilException {
        MSNBCDataset dataset = new MSNBCDataset(TEST_TEXT_DIR, TEST_ANNOTATION_DIR);
        // Close the dataset in a finally block so that it is released even if
        // init() throws or one of the assertions below fails.
        try {
            dataset.setName(DATASET_NAME);
            dataset.init();
            Assert.assertEquals(1, dataset.getInstances().size());
            Document document = dataset.getInstances().get(0);
            Assert.assertEquals(EXPECTED_DOCUMENT_URI, document.getDocumentURI());
            Assert.assertEquals(EXPECTED_TEXT, document.getText());

            Set<Marking> expectedNEs = new HashSet<Marking>(Arrays.asList(EXPECTED_MARKINGS));
            // Every marking of the document has to be an expected one. This
            // loop is kept for its descriptive failure message.
            for (Marking marking : document.getMarkings()) {
                Assert.assertTrue("Couldn't find " + marking.toString() + " inside " + expectedNEs.toString(),
                        expectedNEs.contains(marking));
            }
            Assert.assertEquals(expectedNEs.size(), document.getMarkings().size());
            // The size check alone would not notice a document that repeats
            // one marking while missing another, so the sets are compared too.
            Assert.assertEquals(expectedNEs, new HashSet<Marking>(document.getMarkings()));
        } finally {
            IOUtils.closeQuietly(dataset);
        }
    }
}