/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.reuters; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import java.io.IOException; import java.text.ParseException; import java.util.LinkedList; import java.util.Queue; /** * Read a Reuters-21578 corpus in SGML format. * <p> * Set the directory that contains the SGML files with {@link #PARAM_SOURCE_LOCATION}. */ @MimeTypeCapability({MimeTypes.APPLICATION_X_REUTERS21578_SGML}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) public class Reuters21578SgmlReader extends JCasResourceCollectionReader_ImplBase { private Queue<ReutersDocument> documentQueue; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); documentQueue = new LinkedList<>(); } @Override public void getNext(JCas jCas) throws IOException, CollectionException { if (documentQueue.isEmpty()) { /* read next SGML file */ assert getResourceIterator().hasNext(); Resource resource = getResourceIterator().next(); try { documentQueue.addAll(ExtractReuters .extractFile(resource.getInputStream(), resource.getResolvedUri())); } catch (ParseException e) { throw new CollectionException(e); } } /* process 1st element of document queue */ try { ReutersDocument doc = documentQueue.poll(); initCas(jCas.getCas(), doc); MetaDataStringField date = new MetaDataStringField(jCas); date.setKey("DATE"); date.setValue(doc.getDate().toString()); date.addToIndexes(); } catch (CASException e) { throw new CollectionException(e); } } @Override public boolean hasNext() throws IOException, CollectionException { return !documentQueue.isEmpty() || getResourceIterator().hasNext(); } @Override public Progress[] getProgress() { return new Progress[0]; } private void initCas(CAS aCas, ReutersDocument doc) throws IOException, CASException { DocumentMetaData docMetaData = DocumentMetaData.create(aCas); docMetaData.setDocumentTitle(doc.getTitle()); docMetaData.setDocumentUri(doc.getPath().toString()); docMetaData.setDocumentId(Integer.toString(doc.getNewid())); docMetaData.setDocumentBaseUri(getSourceLocation()); docMetaData.setCollectionId(getSourceLocation()); aCas.setDocumentLanguage(getLanguage()); aCas.setDocumentText(doc.getBody()); } }