/** * This file is part of General Entity Annotator Benchmark. * * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * General Entity Annotator Benchmark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. */ package org.aksw.gerbil.semantic.sameas.impl.wiki; import org.apache.commons.lang3.StringEscapeUtils; public class WikipediaXMLParser { private static final String XML_REDIRECTS_START_TAG = "<redirects>"; private static final String XML_REDIRECTS_END_TAG = "</redirects>"; private static final String XML_REDIRECT_TAG_START = "<r"; private static final char XML_REDIRECT_TAG_END = '>'; private static final String XML_REDIRECT_TO_ATTRIBUTE_START = "to=\""; private static final char XML_REDIRECT_TO_ATTRIBUTE_END = '"'; /** * <p> * Extracts the value of the <code>to</code> attribute of the first redirect * that it can find inside the given XML string. * </p> * * <p> * It is assumed that the given String looks like this: <code> * ... * <redirects> * ... * <r from="title" to="redirected title"/> * ... * </redirects> * ... * </code> * </p> * * @param xmlString * XML string from which the redirect should be parsed. * @return The title to which the given title is redirected or null, if such * a title couldn't be found. */ public String extractRedirect(String xmlString) { if (xmlString == null) { return null; } int startPos = xmlString.indexOf(XML_REDIRECTS_START_TAG); if (startPos < 0) { // couldn't find redirects tag return null; } startPos += XML_REDIRECTS_START_TAG.length(); int redirectsEnd = xmlString.indexOf(XML_REDIRECTS_END_TAG, startPos); if (redirectsEnd < 0) { // couldn't find redirects end tag (no valid XML) return null; } startPos = xmlString.indexOf(XML_REDIRECT_TAG_START, startPos); if ((startPos < 0) || (startPos >= redirectsEnd)) { // couldn't find redirect tag return null; } startPos += XML_REDIRECT_TAG_START.length(); int tagEndPos = xmlString.indexOf(XML_REDIRECT_TAG_END, startPos); if ((tagEndPos < 0) || (tagEndPos >= redirectsEnd)) { // couldn't find the end of the redirect tag (no valid XML) return null; } startPos = xmlString.indexOf(XML_REDIRECT_TO_ATTRIBUTE_START, startPos); if ((startPos < 0) || (startPos >= tagEndPos)) { // couldn't find the 'to' attribute return null; } startPos += XML_REDIRECT_TO_ATTRIBUTE_START.length(); int endPos = xmlString.indexOf(XML_REDIRECT_TO_ATTRIBUTE_END, startPos); if ((endPos < 0) || (endPos >= tagEndPos)) { // couldn't find the end of the 'to' attribute value (no valid XML) return null; } return StringEscapeUtils.unescapeXml(xmlString.substring(startPos, endPos)); } }