/*
* Copyright 2015
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.reuters;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.*;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class ExtractReutersTest
{
private static final String REUTERS_DIR = "src/test/resources/reuters-sgml";
@Test
public void testExtract()
throws Exception
{
int expectedDocs = 1000;
String expectedTitleFirst = "BAHIA COCOA REVIEW";
Date expectedDateFirst = new GregorianCalendar(1987, 1, 26, 15, 1, 1).getTime();
String expectedBodyFirst = "Showers";
String expectedTopicFirst = "cocoa";
ReutersDocument.LEWISSPLIT expectedlLewissplitFirst = ReutersDocument.LEWISSPLIT.TRAIN;
ReutersDocument.CGISPLIT expectedCgisplitFirst = ReutersDocument.CGISPLIT.TRAINING_SET;
int oldIdFirst = 5544;
int newIdFirst = 1;
String expectedTitle4 = "NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE";
Date expectedDate4 = new GregorianCalendar(1987, 1, 26, 15, 10, 44).getTime();
String expectedBody4 = "The U.S. Agriculture Department";
Set<String> expectedTopic4 = new HashSet<>(
Arrays.asList(
new String[] { "grain", "wheat", "corn", "barley", "oat", "sorghum" }));
ReutersDocument.LEWISSPLIT expectedlLewissplit4 = ReutersDocument.LEWISSPLIT.TRAIN;
ReutersDocument.CGISPLIT expectedCgisplit4 = ReutersDocument.CGISPLIT.TRAINING_SET;
int oldId4 = 5548;
int newId4 = 5;
String expectedTitleLast = "NATIONAL AMUSEMENTS AGAIN UPS VIACOM <VIA> BID";
Date expectedDateLast = new GregorianCalendar(1987, 2, 3, 9, 17, 32).getTime();
String expectedBodyLast = "Viacom International Inc said ";
String expectedTopicLast = "acq";
ReutersDocument.LEWISSPLIT expectedlLewissplitLast = ReutersDocument.LEWISSPLIT.TRAIN;
ReutersDocument.CGISPLIT expectedCgisplitLast = ReutersDocument.CGISPLIT.TRAINING_SET;
int oldIdLast = 16320;
int newIdLast = 1000;
List<ReutersDocument> docs = ExtractReuters.extract(new File(REUTERS_DIR).toPath());
assertEquals(expectedDocs, docs.size());
/* assert first doc */
ReutersDocument doc0 = docs.get(0);
assertEquals(expectedTitleFirst, doc0.getTitle());
assertEquals(expectedDateFirst.toString(), doc0.getDate().toString());
assertTrue(doc0.getTopics().contains(expectedTopicFirst));
assertTrue(doc0.getBody().startsWith(expectedBodyFirst));
Assert.assertEquals(expectedlLewissplitFirst, doc0.getLewissplit());
Assert.assertEquals(expectedCgisplitFirst, doc0.getCgisplit());
Assert.assertEquals(oldIdFirst, doc0.getOldid());
Assert.assertEquals(newIdFirst, doc0.getNewid());
ReutersDocument doc4 = docs.get(4);
assertEquals(expectedTitle4, doc4.getTitle());
assertEquals(expectedDate4.toString(), doc4.getDate().toString());
assertEquals(expectedTopic4, doc4.getTopics());
assertTrue(doc0.getBody().startsWith(expectedBodyFirst));
Assert.assertEquals(expectedlLewissplit4, doc4.getLewissplit());
Assert.assertEquals(expectedCgisplit4, doc4.getCgisplit());
Assert.assertEquals(oldId4, doc4.getOldid());
Assert.assertEquals(newId4, doc4.getNewid());
/* assert last doc */
ReutersDocument doc999 = docs.get(999);
assertEquals(expectedTitleLast, doc999.getTitle());
assertEquals(expectedDateLast.toString(), doc999.getDate().toString());
assertTrue(doc999.getTopics().contains(expectedTopicLast));
assertTrue(doc999.getBody().startsWith(expectedBodyLast));
Assert.assertEquals(expectedlLewissplitLast, doc999.getLewissplit());
Assert.assertEquals(expectedCgisplitLast, doc999.getCgisplit());
Assert.assertEquals(oldIdLast, doc999.getOldid());
Assert.assertEquals(newIdLast, doc999.getNewid());
}
}