/** * This file is part of General Entity Annotator Benchmark. * * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * General Entity Annotator Benchmark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. */ package org.aksw.gerbil.execute; import java.util.ArrayList; import java.util.List; import org.aksw.gerbil.transfer.nif.Document; import org.aksw.gerbil.transfer.nif.Marking; import org.aksw.gerbil.transfer.nif.Span; import org.aksw.gerbil.transfer.nif.data.DocumentImpl; import org.aksw.gerbil.transfer.nif.data.NamedEntity; import org.aksw.gerbil.transfer.nif.data.SpanImpl; import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity; import org.aksw.gerbil.utils.filter.MarkingFilter; import org.aksw.gerbil.utils.filter.TypeBasedMarkingFilter; import com.hp.hpl.jena.vocabulary.OWL; import com.hp.hpl.jena.vocabulary.RDFS; /** * This class reduces the information contained inside a given document. It is * needed to create tasks using the documents loaded from a dataset, i.e., * removing the result from the documents before sending them to the annotator. 
*
 * @author Michael Röder (roeder@informatik.uni-leipzig.de)
 *
 */
public class DocumentInformationReducer {

    /**
     * Builds a document that carries only the text and the URI of the given
     * document. No markings are handed to the new document.
     *
     * @param document
     *            the document whose text and URI are taken over
     * @return a new {@link DocumentImpl} created from the text and the URI of
     *         the given document
     */
    public static Document reduceToPlainText(Document document) {
        Document plainDocument = new DocumentImpl(document.getText(), document.getDocumentURI());
        return plainDocument;
    }

    /**
     * Builds a document that carries the text, the URI and one
     * {@link SpanImpl} for every {@link Span} marking of the given document.
     * Each {@link SpanImpl} is created through its {@code SpanImpl(Span)}
     * constructor (presumably a position-only copy; confirm against
     * {@link SpanImpl}).
     *
     * @param document
     *            the document that should be reduced
     * @return a new {@link DocumentImpl} with the text, the URI and the newly
     *         created span markings
     */
    public static Document reduceToTextAndSpans(Document document) {
        List<Span> originalSpans = document.getMarkings(Span.class);
        List<Marking> spanCopies = new ArrayList<Marking>(originalSpans.size());
        for (Span originalSpan : originalSpans) {
            spanCopies.add(new SpanImpl(originalSpan));
        }
        return createDocumentWithMarkings(document, spanCopies);
    }

    /**
     * Builds a document that carries the text, the URI and one
     * {@link NamedEntity} for every {@link TypedNamedEntity} marking of the
     * given document that is accepted by a {@link TypeBasedMarkingFilter}
     * configured with {@code false} and the URIs of {@code rdfs:Class} and
     * {@code owl:Class}.
     * <p>
     * NOTE(review): the filter presumably rejects markings that are typed as
     * one of these two classes; confirm against
     * {@link TypeBasedMarkingFilter}.
     *
     * @param document
     *            the document that should be reduced
     * @return a new {@link DocumentImpl} with the text, the URI and the newly
     *         created entity markings
     */
    public static Document reduceToTextAndEntities(Document document) {
        MarkingFilter<TypedNamedEntity> typeFilter = new TypeBasedMarkingFilter<TypedNamedEntity>(false,
                RDFS.Class.getURI(), OWL.Class.getURI());
        List<TypedNamedEntity> typedEntities = document.getMarkings(TypedNamedEntity.class);
        List<Marking> reducedEntities = new ArrayList<Marking>(typedEntities.size());
        for (TypedNamedEntity typedEntity : typedEntities) {
            if (!typeFilter.isMarkingGood(typedEntity)) {
                // The filter did not accept this marking, so it is left out.
                continue;
            }
            reducedEntities.add(toNamedEntity(typedEntity));
        }
        return createDocumentWithMarkings(document, reducedEntities);
    }

    /**
     * Creates a {@link NamedEntity} from the start position, the length and
     * the URIs of the given typed entity. Nothing else of the typed entity is
     * passed on.
     */
    private static NamedEntity toNamedEntity(TypedNamedEntity typedEntity) {
        return new NamedEntity(typedEntity.getStartPosition(), typedEntity.getLength(), typedEntity.getUris());
    }

    /**
     * Creates a new document from the text and the URI of the given source
     * document together with the given list of markings.
     */
    private static Document createDocumentWithMarkings(Document source, List<Marking> markings) {
        return new DocumentImpl(source.getText(), source.getDocumentURI(), markings);
    }
}