/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.reuters;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.net.URI;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/**
 * A class that holds text and metadata for a Reuters-21578 document.
 * <p>
 * Fields can be populated either through the typed setters or through the key-based
 * {@link #set(String, String)} / {@link #set(String, Set)} dispatchers.
 */
public class ReutersDocument
{
    private static final Log LOG = LogFactory.getLog(ReutersDocument.class);

    /**
     * Pattern used by {@link #setDate(String)}.
     * <p>
     * {@code HH} (hour of day, 0-23) is used because the pattern carries no am/pm marker; with
     * {@code hh} an hour value of 12 is parsed as hour 0.
     * <p>
     * NOTE(review): {@code SS} is interpreted by {@link SimpleDateFormat} as a millisecond count,
     * so a fraction of ".79" yields 79 ms. Confirm whether hundredths of a second were intended.
     */
    private static final String DATE_PATTERN = "dd-MMM-yyyy HH:mm:ss.SS";

    private LEWISSPLIT lewissplit;
    private CGISPLIT cgisplit;
    private int oldid;
    private int newid;
    private Date date;
    private Set<String> topics;
    private Set<String> places;
    private Set<String> people;
    private Set<String> orgs;
    private Set<String> exchanges;
    private Set<String> companies;
    private String unknown;
    private String title;
    private String dateline;
    private String body;
    private URI path;

    /**
     * Creates a document with an empty body, a {@code newid} of -1 and empty category sets.
     */
    public ReutersDocument()
    {
        body = "";
        newid = -1;
        topics = new HashSet<>();
        places = new HashSet<>();
        people = new HashSet<>();
        orgs = new HashSet<>();
        exchanges = new HashSet<>();
        companies = new HashSet<>();
    }

    /**
     * @return the path stored via {@link #setPath(URI)}, or {@code null} if none was set
     */
    public URI getPath()
    {
        return path;
    }

    /**
     * @param path
     *            the path to store for this document
     */
    public void setPath(URI path)
    {
        this.path = path;
    }

    /**
     * Replaces one of the category sets, selected by key.
     * <p>
     * Recognized keys are {@code TOPICS}, {@code PLACES}, {@code PEOPLE}, {@code ORGS},
     * {@code EXCHANGES} and {@code COMPANIES}. Any other key is logged as a warning and
     * otherwise ignored.
     *
     * @param key
     *            the name of the category set to replace
     * @param value
     *            the new set; it is stored as-is, not copied
     */
    public void set(String key, Set<String> value)
    {
        switch (key) {
        case "TOPICS":
            setTopics(value);
            break;
        case "PLACES":
            setPlaces(value);
            break;
        case "PEOPLE":
            setPeople(value);
            break;
        case "ORGS":
            setOrgs(value);
            break;
        case "EXCHANGES":
            setExchanges(value);
            break;
        case "COMPANIES":
            setCompanies(value);
            break;
        default:
            LOG.warn("Invalid key: " + key);
        }
    }

    /**
     * Sets a single-valued field, or adds one entry to a category set, selected by key.
     * <p>
     * For the category keys ({@code TOPICS}, {@code PLACES}, {@code PEOPLE}, {@code ORGS},
     * {@code EXCHANGES}, {@code COMPANIES}) the value is added to the existing set. For all
     * other recognized keys the value is converted as needed and replaces the current one.
     *
     * @param key
     *            the field name
     * @param value
     *            the textual value
     * @throws ParseException
     *             if the key is {@code DATE} and the value cannot be parsed
     * @throws NumberFormatException
     *             if the key is {@code OLDID} or {@code NEWID} and the value is not an integer
     * @throws IllegalArgumentException
     *             if the key is not recognized, or if the value does not name a constant of
     *             {@link LEWISSPLIT} / {@link CGISPLIT} for the respective key
     */
    public void set(String key, String value)
        throws ParseException
    {
        switch (key) {
        case "LEWISSPLIT":
            setLewissplit(value);
            break;
        case "CGISPLIT":
            setCgisplit(value);
            break;
        case "OLDID":
            setOldid(value);
            break;
        case "NEWID":
            setNewid(value);
            break;
        case "DATE":
            setDate(value);
            break;
        case "TOPICS":
            addTopic(value);
            break;
        case "PLACES":
            addPlace(value);
            break;
        case "PEOPLE":
            addPeople(value);
            break;
        case "ORGS":
            addOrg(value);
            break;
        case "EXCHANGES":
            addExchange(value);
            break;
        case "COMPANIES":
            addCompany(value);
            break;
        case "UNKNOWN":
            setUnknown(value);
            break;
        case "TITLE":
            setTitle(value);
            break;
        case "DATELINE":
            setDateline(value);
            break;
        case "BODY":
            setBody(value);
            break;
        default:
            throw new IllegalArgumentException(
                    "Unrecognized key/value pair: '" + key + "'/'" + value + "'.");
        }
    }

    private void addCompany(String value)
    {
        companies.add(value);
    }

    private void addExchange(String value)
    {
        exchanges.add(value);
    }

    private void addOrg(String value)
    {
        orgs.add(value);
    }

    private void addPeople(String value)
    {
        people.add(value);
    }

    private void addPlace(String value)
    {
        places.add(value);
    }

    private void addTopic(String value)
    {
        topics.add(value);
    }

    private void setNewid(String value)
    {
        setNewid(Integer.parseInt(value));
    }

    private void setOldid(String value)
    {
        setOldid(Integer.parseInt(value));
    }

    /**
     * Converts a textual split label to an enum constant name: upper-cased and with hyphens
     * replaced by underscores.
     * <p>
     * {@link Locale#ROOT} keeps the result independent of the default locale (e.g. the Turkish
     * dotted capital I would otherwise break the enum lookup).
     */
    private static String toEnumName(String label)
    {
        return label.toUpperCase(Locale.ROOT).replace('-', '_');
    }

    /**
     * @return the old id
     */
    public int getOldid()
    {
        return oldid;
    }

    /**
     * @param oldid
     *            the old id
     */
    public void setOldid(int oldid)
    {
        this.oldid = oldid;
    }

    /**
     * @return the Lewis split value, or {@code null} if none was set
     */
    public LEWISSPLIT getLewissplit()
    {
        return lewissplit;
    }

    /**
     * Sets the Lewis split from its textual form. The text is upper-cased and hyphens are
     * replaced by underscores before it is resolved against {@link LEWISSPLIT}.
     *
     * @param lewissplit
     *            the textual split label, e.g. {@code "not-used"}
     * @throws IllegalArgumentException
     *             if the normalized text does not name a {@link LEWISSPLIT} constant
     */
    public void setLewissplit(String lewissplit)
    {
        setLewissplit(LEWISSPLIT.valueOf(toEnumName(lewissplit)));
    }

    /**
     * @param lewissplit
     *            the Lewis split value
     */
    public void setLewissplit(LEWISSPLIT lewissplit)
    {
        this.lewissplit = lewissplit;
    }

    /**
     * @return the CGI split value, or {@code null} if none was set
     */
    public CGISPLIT getCgisplit()
    {
        return cgisplit;
    }

    /**
     * @param cgisplit
     *            the CGI split value
     */
    public void setCgisplit(CGISPLIT cgisplit)
    {
        this.cgisplit = cgisplit;
    }

    /**
     * Sets the CGI split from its textual form. The text is upper-cased and hyphens are
     * replaced by underscores before it is resolved against {@link CGISPLIT}.
     *
     * @param cgisplit
     *            the textual split label, e.g. {@code "training-set"}
     * @throws IllegalArgumentException
     *             if the normalized text does not name a {@link CGISPLIT} constant
     */
    public void setCgisplit(String cgisplit)
    {
        setCgisplit(CGISPLIT.valueOf(toEnumName(cgisplit)));
    }

    /**
     * @return the new id; -1 unless changed after construction
     */
    public int getNewid()
    {
        return newid;
    }

    /**
     * @param newid
     *            the new id
     */
    public void setNewid(int newid)
    {
        this.newid = newid;
    }

    /**
     * @return the document date, or {@code null} if none was set
     */
    public Date getDate()
    {
        return date;
    }

    /**
     * Parses the given text with the pattern {@code dd-MMM-yyyy HH:mm:ss.SS} (US locale) and
     * stores the result.
     *
     * @param date
     *            the textual date, e.g. {@code "26-FEB-1987 15:01:01.79"}
     * @throws ParseException
     *             if the text does not match the pattern
     */
    public void setDate(String date)
        throws ParseException
    {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call instead of
        // a shared static one.
        SimpleDateFormat parser = new SimpleDateFormat(DATE_PATTERN, Locale.US);
        setDate(parser.parse(date));
    }

    /**
     * @param date
     *            the document date; stored as-is, not copied
     */
    public void setDate(Date date)
    {
        this.date = date;
    }

    /**
     * @return the topics set held by this document (not a copy)
     */
    public Set<String> getTopics()
    {
        return topics;
    }

    /**
     * @param topics
     *            the topics set; stored as-is, not copied
     */
    public void setTopics(Set<String> topics)
    {
        this.topics = topics;
    }

    /**
     * @return the places set held by this document (not a copy)
     */
    public Set<String> getPlaces()
    {
        return places;
    }

    /**
     * @param places
     *            the places set; stored as-is, not copied
     */
    public void setPlaces(Set<String> places)
    {
        this.places = places;
    }

    /**
     * @return the people set held by this document (not a copy)
     */
    public Set<String> getPeople()
    {
        return people;
    }

    /**
     * @param people
     *            the people set; stored as-is, not copied
     */
    public void setPeople(Set<String> people)
    {
        this.people = people;
    }

    /**
     * @return the organizations set held by this document (not a copy)
     */
    public Set<String> getOrgs()
    {
        return orgs;
    }

    /**
     * @param orgs
     *            the organizations set; stored as-is, not copied
     */
    public void setOrgs(Set<String> orgs)
    {
        this.orgs = orgs;
    }

    /**
     * @return the exchanges set held by this document (not a copy)
     */
    public Set<String> getExchanges()
    {
        return exchanges;
    }

    /**
     * @param exchanges
     *            the exchanges set; stored as-is, not copied
     */
    public void setExchanges(Set<String> exchanges)
    {
        this.exchanges = exchanges;
    }

    /**
     * @return the companies set held by this document (not a copy)
     */
    public Set<String> getCompanies()
    {
        return companies;
    }

    /**
     * @param companies
     *            the companies set; stored as-is, not copied
     */
    public void setCompanies(Set<String> companies)
    {
        this.companies = companies;
    }

    /**
     * @return the value stored under the {@code UNKNOWN} key, or {@code null} if none was set
     */
    public String getUnknown()
    {
        return unknown;
    }

    /**
     * @param unknown
     *            the value for the {@code UNKNOWN} field
     */
    public void setUnknown(String unknown)
    {
        this.unknown = unknown;
    }

    /**
     * @return the title, or {@code null} if none was set
     */
    public String getTitle()
    {
        return title;
    }

    /**
     * @param title
     *            the title
     */
    public void setTitle(String title)
    {
        this.title = title;
    }

    /**
     * @return the dateline, or {@code null} if none was set
     */
    public String getDateline()
    {
        return dateline;
    }

    /**
     * @param dateline
     *            the dateline
     */
    public void setDateline(String dateline)
    {
        this.dateline = dateline;
    }

    /**
     * @return the body text; the empty string unless changed after construction
     */
    public String getBody()
    {
        return body;
    }

    /**
     * @param body
     *            the body text
     */
    public void setBody(String body)
    {
        this.body = body;
    }

    /**
     * Values accepted for the {@code LEWISSPLIT} field.
     */
    public enum LEWISSPLIT
    {
        TRAIN, TEST, NOT_USED
    }

    /**
     * Values accepted for the {@code CGISPLIT} field.
     */
    public enum CGISPLIT
    {
        TRAINING_SET, PUBLISHED_TESTSET
    }
}