/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.semantic.sameas.impl;

import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashSet;
import java.util.Set;

import org.aksw.gerbil.semantic.sameas.SingleUriSameAsRetriever;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link SingleUriSameAsRetriever} that derives a second spelling of a URI
 * by changing the percent-encoding of its last segment (everything after the
 * last '/' or '#').
 *
 * <ul>
 * <li>If the last segment contains at least one escape sequence ('%' followed
 * by two hex digits), the decoded form of the segment is generated.</li>
 * <li>Otherwise the encoded form of the segment is generated.</li>
 * </ul>
 *
 * The part of the URI in front of the last segment is never touched.
 */
public class UriEncodingHandlingSameAsRetriever implements SingleUriSameAsRetriever {

    private static final Logger LOGGER = LoggerFactory.getLogger(UriEncodingHandlingSameAsRetriever.class);

    private static final String CHARSET_NAME = "UTF-8";

    /**
     * Returns the given URI together with the variant that has the opposite
     * encoding of its last segment.
     *
     * @param uri
     *            the URI for which a differently encoded variant is requested
     * @return a set containing the given URI and its variant, or {@code null}
     *         if the URI is {@code null}, contains neither '/' nor '#', the
     *         variant is identical to the given URI, or the decoding/encoding
     *         failed
     */
    @Override
    public Set<String> retrieveSameURIs(String uri) {
        if (uri == null) {
            return null;
        }
        int startPos = findLastPathSegment(uri);
        if (startPos < 0) {
            return null;
        }
        String prefix = uri.substring(0, startPos);
        String segment = uri.substring(startPos);
        String variant;
        if (containsEncodedParts(uri, startPos)) {
            try {
                // URLDecoder implements form decoding and would turn a literal
                // '+' into a space. Inside a URI segment '+' is an ordinary
                // character, so it is protected by escaping it beforehand.
                variant = URLDecoder.decode(segment.replace("+", "%2B"), CHARSET_NAME);
            } catch (Exception e) {
                // Besides the checked UnsupportedEncodingException, malformed
                // escapes (e.g. "%zz" next to a valid one) raise an
                // IllegalArgumentException. Both lead to a null result.
                LOGGER.error("Exception while trying to decode URI. Returning null.", e);
                return null;
            }
        } else {
            try {
                // URLEncoder implements form encoding and writes a space as
                // '+'. A real '+' of the input has already been escaped to
                // "%2B" at this point, so every remaining '+' stands for a
                // space and is rewritten to its percent-encoded form.
                variant = URLEncoder.encode(segment, CHARSET_NAME).replace("+", "%20");
            } catch (Exception e) {
                LOGGER.error("Exception while trying to encode URI. Returning null.", e);
                return null;
            }
        }
        Set<String> uris = new HashSet<String>();
        uris.add(uri);
        uris.add(prefix + variant);
        // A set of size 1 means the variant equals the input, i.e., there is
        // nothing new to report.
        return (uris.size() > 1) ? uris : null;
    }

    /**
     * Delegates to {@link #retrieveSameURIs(String)}; the domain is ignored.
     *
     * @param domain
     *            not used by this implementation
     * @param uri
     *            the URI for which a differently encoded variant is requested
     * @return the result of {@link #retrieveSameURIs(String)} for the given
     *         URI
     */
    @Override
    public Set<String> retrieveSameURIs(String domain, String uri) {
        return retrieveSameURIs(uri);
    }

    /**
     * Searches for encoded parts in the given URI starting from the given
     * index. An encoded part starts with a '%' followed by two hex characters.
     *
     * @param uri
     *            the URI that is searched
     * @param startPos
     *            the index of the first character that is taken into account
     * @return true if such a part has been found, else false
     */
    protected static boolean containsEncodedParts(String uri, int startPos) {
        // Number of characters of a potential escape sequence matched so far:
        // 0 = nothing, 1 = '%', 2 = '%' and one hex digit.
        int matched = 0;
        int length = uri.length();
        for (int i = startPos; i < length; ++i) {
            char c = uri.charAt(i);
            if (c == '%') {
                // A '%' always (re)starts a candidate sequence.
                matched = 1;
            } else if ((matched > 0) && isHexDigit(c)) {
                ++matched;
                if (matched == 3) {
                    return true;
                }
            } else {
                matched = 0;
            }
        }
        return false;
    }

    /**
     * Checks whether the given character is one of the ASCII hex digits
     * 0-9, A-F or a-f.
     */
    private static boolean isHexDigit(char c) {
        return ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'F')) || ((c >= 'a') && (c <= 'f'));
    }

    /**
     * Searches the start position of the last part of a URIs path, i.e., the
     * position directly behind the last '/' or '#', whichever occurs later.
     *
     * @param uri
     *            the URI that is searched
     * @return the index of the first character behind the last '/' or '#'
     *         (which equals the length of the URI if it ends with one of
     *         them) or -1 if the URI contains neither of the two characters
     */
    protected static int findLastPathSegment(String uri) {
        int lastSeparator = Math.max(uri.lastIndexOf('/'), uri.lastIndexOf('#'));
        return (lastSeparator < 0) ? -1 : (lastSeparator + 1);
    }
}