/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.bat.annotator;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.ScoredAnnotation;
import it.unipi.di.acube.batframework.data.ScoredTag;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.Sa2WSystem;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;
import it.uniroma1.lcl.babelfy.commons.BabelfyConstraints;
import it.uniroma1.lcl.babelfy.commons.BabelfyParameters;
import it.uniroma1.lcl.babelfy.commons.IBabelfy;
import it.uniroma1.lcl.babelfy.commons.annotation.CharOffsetFragment;
import it.uniroma1.lcl.babelfy.commons.annotation.DisambiguationConstraint;
import it.uniroma1.lcl.babelfy.commons.annotation.MCS;
import it.uniroma1.lcl.babelfy.commons.annotation.PoStaggingOptions;
import it.uniroma1.lcl.babelfy.commons.annotation.SemanticAnnotation;
import it.uniroma1.lcl.babelfy.commons.annotation.SemanticAnnotationResource;
import it.uniroma1.lcl.babelfy.commons.annotation.SemanticAnnotationType;
import it.uniroma1.lcl.babelfy.core.Babelfy;
import it.uniroma1.lcl.jlt.util.Language;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.aksw.gerbil.bat.converter.DBpediaToWikiId;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.collect.Sets;

/**
 * The Babelfy Annotator.
 *
 * <p>
 * <i>Andrea Moro: "I recommend to use the maximum amount of available characters (3500) at each request (i.e., try to
 * put a document all together or split it in chunks of 3500 characters) both for scalability and performance
 * reasons."</i><br>
 * This means that we have to split up documents longer than 3500 characters. Unfortunately, Babelfy seems to measure
 * the length on the escaped text, which means that every text could be up to three times longer than the unescaped
 * text. The chunk size used by this class is {@link #BABELFY_MAX_TEXT_LENGTH}={@value #BABELFY_MAX_TEXT_LENGTH}.
 * </p>
 *
 * <p>
 * NOTE(review): the configured limit equals the raw 3500 character limit and does not leave room for the escaping
 * overhead described above - confirm whether a smaller value is intended.
 * </p>
 */
@Deprecated
public class BabelfyAnnotator implements Sa2WSystem {

    /** Maximum number of characters of a single chunk that is sent to Babelfy. */
    private static final int BABELFY_MAX_TEXT_LENGTH = 3500;

    /** Name returned by {@link #getName()}. */
    public static final String NAME = "Babelfy";

    /** Passed to {@link DBpediaToWikiId} to map the DBpedia URLs returned by Babelfy to Wikipedia ids. */
    @Autowired
    private WikipediaApiInterface wikiApi;

    /**
     * Creates the annotator.
     *
     * @param wikiApi
     *            the Wikipedia API handle used for mapping DBpedia URLs to Wikipedia ids
     */
    public BabelfyAnnotator(WikipediaApiInterface wikiApi) {
        this.wikiApi = wikiApi;
    }

    /**
     * @return the constant {@link #NAME}
     */
    @Override
    public String getName() {
        return NAME;
    }

    /**
     * This implementation does not record any timing information.
     *
     * @return always {@code -1}
     */
    public long getLastAnnotationTime() {
        return -1;
    }

    /**
     * Solves C2W by reducing the result of {@link #solveSa2W(String)} with {@link ProblemReduction}.
     */
    @Override
    public HashSet<Tag> solveC2W(String text) throws AnnotationException {
        return ProblemReduction.Sc2WToC2W(ProblemReduction.Sa2WToSc2W(solveSa2W(text)));
    }

    /**
     * Solves A2W by reducing the result of {@link #solveSa2W(String)} with {@link ProblemReduction}.
     */
    @Override
    public HashSet<Annotation> solveA2W(String text) throws AnnotationException {
        return ProblemReduction.Sa2WToA2W(solveSa2W(text));
    }

    /**
     * Solves Sc2W by reducing the result of {@link #solveSa2W(String)} with {@link ProblemReduction}.
     */
    @Override
    public HashSet<ScoredTag> solveSc2W(String text) throws AnnotationException {
        return ProblemReduction.Sa2WToSc2W(solveSa2W(text));
    }

    /**
     * Solves D2W by requesting annotations for the given mentions and reducing the result with
     * {@link ProblemReduction#Sa2WToD2W}.
     */
    @Override
    public HashSet<Annotation> solveD2W(String text, HashSet<Mention> mentions) throws AnnotationException {
        return ProblemReduction.Sa2WToD2W(solveGeneralSa2W(text, mentions), mentions, -1.0f);
    }

    /**
     * Solves Sa2W by annotating the text without any given mentions.
     */
    @Override
    public HashSet<ScoredAnnotation> solveSa2W(String text) throws AnnotationException {
        return solveGeneralSa2W(text, null);
    }

    /**
     * Splits the text into chunks, sends every chunk to Babelfy and collects the returned annotations with positions
     * relative to the complete text.
     *
     * @param text
     *            the text that should be annotated
     * @param mentions
     *            the mentions that should be disambiguated or {@code null} if Babelfy should search for named
     *            entities on its own
     * @return the annotations for which a Wikipedia id (&gt;= 0) could be determined
     * @throws AnnotationException
     *             declared for callers; not thrown directly by this method
     */
    protected HashSet<ScoredAnnotation> solveGeneralSa2W(String text, HashSet<Mention> mentions)
            throws AnnotationException {
        HashSet<ScoredAnnotation> annotations = Sets.newHashSet();
        List<String> chunks = splitText(text);

        BabelfyParameters bfyParameters = new BabelfyParameters();
        bfyParameters.setAnnotationResource(SemanticAnnotationResource.WIKI);
        bfyParameters.setMCS(MCS.OFF);
        bfyParameters.setExtendCandidatesWithAIDAmeans(true);
        if (mentions != null) {
            // the mentions are given, so the input fragments are the only spans that should be returned
            bfyParameters.setThreshold(0.0);
            bfyParameters.setPoStaggingOptions(PoStaggingOptions.INPUT_FRAGMENTS_AS_NOUNS);
            bfyParameters.setDisambiguationConstraint(
                    DisambiguationConstraint.DISAMBIGUATE_ALL_RETURN_INPUT_FRAGMENTS);
        } else {
            bfyParameters.setThreshold(0.3);
            bfyParameters.setAnnotationType(SemanticAnnotationType.NAMED_ENTITIES);
        }
        IBabelfy bfy = new Babelfy(bfyParameters);

        // number of characters of the chunks that have already been processed, i.e., the offset of the current chunk
        int prevChars = 0;
        for (String chunk : chunks) {
            BabelfyConstraints constraints = createConstraints(mentions, prevChars, chunk.length());
            List<SemanticAnnotation> bfyAnnotations = sendRequest(bfy, chunk, constraints);
            for (SemanticAnnotation bfyAnn : bfyAnnotations) {
                int wikiID = DBpediaToWikiId.getId(wikiApi, bfyAnn.getDBpediaURL());
                if (wikiID >= 0) {
                    CharOffsetFragment fragment = bfyAnn.getCharOffsetFragment();
                    // the end of the fragment is treated as inclusive, hence the +1 for the length
                    annotations.add(new ScoredAnnotation(prevChars + fragment.getStart(),
                            fragment.getEnd() - fragment.getStart() + 1, wikiID, (float) bfyAnn.getScore()));
                }
            }
            prevChars += chunk.length();
        }
        return annotations;
    }

    /**
     * Creates the constraints for a single chunk containing all given mentions that lie completely inside the chunk.
     * A mention that ends exactly at the end of the chunk is part of the chunk.
     *
     * @param mentions
     *            the mentions of the complete text or {@code null}
     * @param chunkStart
     *            the position of the first character of the chunk inside the complete text
     * @param chunkLength
     *            the number of characters of the chunk
     * @return the constraints for the chunk (without any fragments if no mention matches)
     */
    private BabelfyConstraints createConstraints(HashSet<Mention> mentions, int chunkStart, int chunkLength) {
        BabelfyConstraints constraints = new BabelfyConstraints();
        if (mentions == null) {
            return constraints;
        }
        int chunkEnd = chunkStart + chunkLength;
        for (Mention m : mentions) {
            // exclusive end position of the mention
            int mentionEnd = m.getPosition() + m.getLength();
            if ((m.getPosition() >= chunkStart) && (mentionEnd <= chunkEnd)) {
                // fragments use positions relative to the chunk with an inclusive end
                constraints.addFragmentToDisambiguate(
                        new CharOffsetFragment(m.getPosition() - chunkStart, mentionEnd - 1 - chunkStart));
            }
        }
        return constraints;
    }

    /**
     * Sends a single chunk to Babelfy. The method is synchronized, so only one request at a time is sent per
     * annotator instance.
     *
     * @param bfy
     *            the Babelfy client
     * @param chunk
     *            the text that should be annotated
     * @param constraints
     *            the constraints of the request
     * @return the annotations returned by Babelfy
     */
    protected synchronized List<SemanticAnnotation> sendRequest(IBabelfy bfy, String chunk,
            BabelfyConstraints constraints) {
        return bfy.babelfy(chunk, Language.EN, constraints);
    }

    /**
     * Splits the given text at space characters into chunks that are shorter than
     * {@link #BABELFY_MAX_TEXT_LENGTH} wherever a space is available. Every chunk but the first one starts with the
     * space it has been split at, so concatenating the chunks results in the original text. If a part of the text
     * does not contain a space within the limit, the chunk is extended to the next space (or to the end of the text if
     * there is no further space) and therefore might be longer than the limit.
     *
     * @param text
     *            the text that should be split
     * @return the chunks in the order in which they occur in the text; contains at least one element
     */
    protected List<String> splitText(String text) {
        List<String> chunks = new ArrayList<String>();
        int start = 0;
        // As long as the remaining text does not fit into a single chunk
        while ((text.length() - start) > BABELFY_MAX_TEXT_LENGTH) {
            // We have to use the next space, even if it is too far away
            int end = text.indexOf(' ', start + 1);
            if (end < 0) {
                // There is no position left to split at, the remaining text has to be a single chunk
                break;
            }
            // Search for the last space that keeps the chunk shorter than the limit
            int nextEnd = text.indexOf(' ', end + 1);
            while ((nextEnd >= 0) && ((nextEnd - start) < BABELFY_MAX_TEXT_LENGTH)) {
                end = nextEnd;
                nextEnd = text.indexOf(' ', end + 1);
            }
            // Add the chunk
            chunks.add(text.substring(start, end));
            start = end;
        }
        // Add the last chunk
        chunks.add(text.substring(start));
        return chunks;
    }
}