/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.reuters; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import org.apache.commons.io.FilenameUtils; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.HashMap; import java.util.Map; /** * Read a Reuters-21578 corpus that has been transformed into text format using {@code ExtractReuters} in * the {@code lucene-benchmarks} project. * <p> * The {@link #PARAM_SOURCE_LOCATION} parameter should typically point to the file name pattern * {@code reut2-*.txt}, preceded by the corpus root directory. * * @see <a href="http://www.daviddlewis.com/resources/testcollections/reuters21578/">Reuters-21587 Corpus</a> * @see <a href="http://lucene.apache.org/core/5_3_1/benchmark/org/apache/lucene/benchmark/utils/ExtractReuters.html">ExtractReuters</a> * @see <a href="https://github.com/apache/mahout/blob/master/examples/bin/cluster-reuters.sh">cluster-reuters.sh</a> */ @MimeTypeCapability({MimeTypes.TEXT_X_REUTERS21578}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) public class Reuters21578TxtReader extends JCasResourceCollectionReader_ImplBase { @Override public void getNext(JCas jCas) throws IOException, CollectionException { Resource resource = getResourceIterator().next(); File file = new File(resource.getResolvedUri()); try { initCas(jCas.getCas(), file); } catch (CASException e) { throw new CollectionException(e); } } private void initCas(CAS aCas, File aFile) throws IOException, CASException { Map<String, String> doc = readFile(aFile); DocumentMetaData docMetaData = DocumentMetaData.create(aCas); docMetaData.setDocumentTitle(doc.get("title")); docMetaData.setDocumentUri(aFile.toURI().toString()); docMetaData.setDocumentId(aFile.getParentFile().getName() + "_" + FilenameUtils.getBaseName(aFile.getName())); docMetaData.setDocumentBaseUri(aFile.getParent()); docMetaData.setCollectionId(getSourceLocation()); aCas.setDocumentLanguage(getLanguage()); aCas.setDocumentText(doc.get("text")); } /** * Read a Reuters text file into a Map * * @param reutersFile a Reuters text file * @return a Map with keys {@code dateline}, {@code title}, and {@code text} * @throws IOException if the file cannot be read */ private static Map<String, String> readFile(File reutersFile) throws IOException { BufferedReader reader = new BufferedReader(new FileReader(reutersFile)); String dateline = reader.readLine(); reader.readLine(); // skip empty line String title = reader.readLine(); reader.readLine(); // skip empty line String text = reader.readLine(); reader.close(); Map<String, String> doc = new HashMap<>(); doc.put("title", title); doc.put("dateline", dateline); doc.put("text", text); return doc; } }