/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.acubelab.smaph.main;

import it.unipi.di.acube.batframework.data.*;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;
import it.acubelab.smaph.SmaphAnnotatorDebugger;
import it.cnr.isti.hpc.erd.WikipediaToFreebase;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;

/**
 * An {@link A2WDataset} view over another dataset that keeps only the
 * gold-standard tags and annotations whose Wikipedia title is known to a
 * {@link WikipediaToFreebase} mapping.
 *
 * <p>
 * The filtering is done once, in the constructor. Size and text instances are
 * delegated unchanged to the wrapped dataset; the gold-standard lists returned
 * by this class have one (possibly empty) set per set of the wrapped dataset's
 * corresponding gold-standard list.
 */
public class ERDDatasetFilter implements A2WDataset {
	/** Filtered C2W gold standard, one set per set of the wrapped dataset. */
	private List<HashSet<Tag>> ERDTopics;
	/** The wrapped dataset; size, name and texts are delegated to it. */
	private final A2WDataset ds;
	/** Mentions (position, length) of the annotations kept in {@link #ERDAnnotations}. */
	private List<HashSet<Mention>> ERDMentions;
	/** Filtered A2W gold standard, one set per set of the wrapped dataset. */
	private List<HashSet<Annotation>> ERDAnnotations;

	/**
	 * Wraps {@code ds} and eagerly filters its C2W and A2W gold standards.
	 *
	 * @param ds
	 *            the dataset to filter.
	 * @param wikiApi
	 *            used to resolve Wikipedia ids to titles.
	 * @param wikiToFreebase
	 *            the mapping that decides which titles are kept.
	 * @throws IOException
	 *             if resolving a Wikipedia id to its title fails.
	 */
	public ERDDatasetFilter(A2WDataset ds, WikipediaApiInterface wikiApi,
			WikipediaToFreebase wikiToFreebase) throws IOException {
		this.ds = ds;
		FilterERDTopics(ds.getC2WGoldStandardList(), wikiApi, wikiToFreebase);
		FilterERDAnnotations(ds.getA2WGoldStandardList(), wikiApi,
				wikiToFreebase);
	}

	/**
	 * Tells whether the entity with the given Wikipedia id passes the filter.
	 * The id is resolved to a title, which is then checked by
	 * {@link #EntityIsNE(WikipediaApiInterface, WikipediaToFreebase, String)}.
	 *
	 * @param wikiApi
	 *            used to resolve {@code wid} to a title.
	 * @param wikiToFreebase
	 *            the mapping queried for the title.
	 * @param wid
	 *            the Wikipedia id of the entity.
	 * @return whether the resolved title is non-null and known to
	 *         {@code wikiToFreebase}.
	 * @throws IOException
	 *             if resolving the title fails.
	 */
	public static boolean EntityIsNE(WikipediaApiInterface wikiApi,
			WikipediaToFreebase wikiToFreebase, int wid) throws IOException {
		String title = wikiApi.getTitlebyId(wid);
		return EntityIsNE(wikiApi, wikiToFreebase, title);
	}

	/**
	 * Tells whether the entity with the given Wikipedia title passes the
	 * filter.
	 *
	 * @param wikiApi
	 *            not used by this overload; kept so that both overloads share
	 *            the same leading parameters.
	 * @param wikiToFreebase
	 *            the mapping queried for the title.
	 * @param title
	 *            the Wikipedia title, possibly {@code null}.
	 * @return {@code false} if {@code title} is {@code null}, otherwise the
	 *         result of {@code wikiToFreebase.hasEntity(title)}.
	 * @throws IOException
	 *             declared for signature compatibility with the id-based
	 *             overload.
	 */
	public static boolean EntityIsNE(WikipediaApiInterface wikiApi,
			WikipediaToFreebase wikiToFreebase, String title)
			throws IOException {
		return title != null && wikiToFreebase.hasEntity(title);
	}

	/**
	 * Decides whether a concept is kept, and writes the decision together with
	 * the concept's title to the debugger output. The title is resolved exactly
	 * once and reused both for the check and for the log line.
	 */
	private static boolean keepConcept(WikipediaApiInterface wikiApi,
			WikipediaToFreebase wikiToFreebase, int wid) throws IOException {
		String title = wikiApi.getTitlebyId(wid);
		if (!EntityIsNE(wikiApi, wikiToFreebase, title)) {
			SmaphAnnotatorDebugger.out.printf("Discarding title=%s%n", title);
			return false;
		}
		SmaphAnnotatorDebugger.out.printf("Including title=%s%n", title);
		return true;
	}

	/**
	 * Builds {@link #ERDAnnotations} and {@link #ERDMentions} from the given
	 * A2W gold standard, keeping only the annotations whose concept passes the
	 * filter. A set is appended for every input set, even when nothing is kept,
	 * so list indexes stay aligned with the input list.
	 */
	private void FilterERDAnnotations(
			List<HashSet<Annotation>> a2wGoldStandardList,
			WikipediaApiInterface wikiApi, WikipediaToFreebase wikiToFreebase)
			throws IOException {
		ERDMentions = new Vector<HashSet<Mention>>();
		ERDAnnotations = new Vector<HashSet<Annotation>>();
		for (HashSet<Annotation> anns : a2wGoldStandardList) {
			HashSet<Annotation> filteredAnns = new HashSet<>();
			ERDAnnotations.add(filteredAnns);
			HashSet<Mention> filteredMentions = new HashSet<>();
			ERDMentions.add(filteredMentions);
			for (Annotation ann : anns) {
				if (!keepConcept(wikiApi, wikiToFreebase, ann.getConcept()))
					continue;
				filteredAnns.add(ann);
				filteredMentions.add(new Mention(ann.getPosition(), ann
						.getLength()));
			}
		}
	}

	/**
	 * Builds {@link #ERDTopics} from the given C2W gold standard, keeping only
	 * the tags whose concept passes the filter. Kept tags are re-created from
	 * their concept id. A set is appended for every input set, even when
	 * nothing is kept, so list indexes stay aligned with the input list.
	 */
	private void FilterERDTopics(List<HashSet<Tag>> c2wGoldStandardList,
			WikipediaApiInterface wikiApi, WikipediaToFreebase wikiToFreebase)
			throws IOException {
		ERDTopics = new Vector<>();
		for (HashSet<Tag> tags : c2wGoldStandardList) {
			HashSet<Tag> erdTags = new HashSet<>();
			ERDTopics.add(erdTags);
			for (Tag t : tags) {
				if (!keepConcept(wikiApi, wikiToFreebase, t.getConcept()))
					continue;
				erdTags.add(new Tag(t.getConcept()));
			}
		}
	}

	/** Returns the size reported by the wrapped dataset. */
	@Override
	public int getSize() {
		return ds.getSize();
	}

	/** Returns the wrapped dataset's name followed by {@code " (ERD)"}. */
	@Override
	public String getName() {
		return ds.getName() + " (ERD)";
	}

	/** Returns the text instances of the wrapped dataset, unfiltered. */
	@Override
	public List<String> getTextInstanceList() {
		return ds.getTextInstanceList();
	}

	/** Returns the total number of annotations that survived the filter. */
	@Override
	public int getTagsCount() {
		int count = 0;
		for (HashSet<Annotation> s : ERDAnnotations)
			count += s.size();
		return count;
	}

	/** Returns the filtered C2W gold standard. */
	@Override
	public List<HashSet<Tag>> getC2WGoldStandardList() {
		return ERDTopics;
	}

	/** Returns the mentions of the annotations that survived the filter. */
	@Override
	public List<HashSet<Mention>> getMentionsInstanceList() {
		return ERDMentions;
	}

	/** Returns the same list as {@link #getA2WGoldStandardList()}. */
	@Override
	public List<HashSet<Annotation>> getD2WGoldStandardList() {
		return getA2WGoldStandardList();
	}

	/** Returns the filtered A2W gold standard. */
	@Override
	public List<HashSet<Annotation>> getA2WGoldStandardList() {
		return ERDAnnotations;
	}
}