/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.entityhub.indexing.freebase.processor; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils; import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixProvider; import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService; import org.apache.stanbol.entityhub.core.model.InMemoryValueFactory; import org.apache.stanbol.entityhub.indexing.core.EntityProcessor; import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig; import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum; import org.apache.stanbol.entityhub.servicesapi.model.Reference; import org.apache.stanbol.entityhub.servicesapi.model.Representation; import org.apache.stanbol.entityhub.servicesapi.model.Text; import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory; import org.apache.stanbol.entityhub.servicesapi.yard.Yard; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class FreebaseKeyProcessor implements EntityProcessor { private static final Logger log = LoggerFactory.getLogger(FreebaseKeyProcessor.class); /** * Allows to enable/disable <code>owl:sameAs</code> links to dbpedia */ public static final String PARAM_DBPEDIA_STATE = "dbpedia"; public static final boolean DEFAULT_DBPEDIA_STATE = true; /** * Allows to enable/disable <code>owl:sameAs</code> links to musicbrainz */ public static final String PARAM_MUSICBRAINZ_STATE = "musicbrainz"; public static final boolean DEFAULT_MUSICBRAINZ_STATE = true; // public static final String PARAM__STATE = ""; // public static final boolean DEFAULT__STATE = true; // // public static final String PARAM__STATE = ""; // public static final boolean DEFAULT__STATE = true; private static final String KEY_NS = "http://rdf.freebase.com/key/"; private static final String FB_NS = "http://rdf.freebase.com/ns/"; private static final int FB_NS_LEN = FB_NS.length(); private static final String WP_PREFIX = "wikipedia."; private static final int WP_PREFIX_LEN = WP_PREFIX.length(); private static final String WP_EN = KEY_NS + WP_PREFIX + "en"; private static final String MB_KEY = KEY_NS + "authority.musicbrainz"; private static final String MB_NS = "http://musicbrainz.org/"; private static final CharSequence MUSIC_PROP_PREFIX = "music."; private static final int MUSIC_PROP_PREFIX_LEN = MUSIC_PROP_PREFIX.length(); private static final Set<String> MB_TYPES = new HashSet<String>(); static{ MB_TYPES.add("recording"); MB_TYPES.add("artist"); MB_TYPES.add("release"); } private static final String SAME_AS = NamespaceEnum.owl + "sameAs"; private static final String RDF_TYPE = NamespaceEnum.rdf + "type"; private static final String RDFS_LABEL = NamespaceEnum.rdfs + "label"; public static final String PARAM_LINK_PROPERTY = "link-property"; public static final String DEFAULT_LINK_PROPERTY = SAME_AS; private String linkProperty; private boolean dbpediaState; private boolean musicbrainzState; @Override public void setConfiguration(Map<String,Object> config) { IndexingConfig indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG); NamespacePrefixService nsPrefixService = indexingConfig.getNamespacePrefixService(); Object value = config.get(PARAM_LINK_PROPERTY); if(value != null){ linkProperty = nsPrefixService.getFullName(value.toString()); if(linkProperty == null){ throw new IllegalArgumentException("Unknown Namespace Prefix use in " + PARAM_LINK_PROPERTY+'='+value+"!"); } } else { linkProperty = DEFAULT_LINK_PROPERTY; } value = config.get(PARAM_DBPEDIA_STATE); if(value != null){ dbpediaState = Boolean.parseBoolean(value.toString()); } else { dbpediaState = DEFAULT_DBPEDIA_STATE; } value = config.get(PARAM_MUSICBRAINZ_STATE); if(value != null){ musicbrainzState = Boolean.parseBoolean(value.toString()); } else { musicbrainzState = DEFAULT_MUSICBRAINZ_STATE; } } @Override public boolean needsInitialisation() { return false; } @Override public void initialise() { } @Override public void close() { } @Override public Representation process(Representation rep) { //wikipedia if(dbpediaState){ //we try to link only a single page. So get the English label and //search for the according dbpedia key Text enLabel = rep.getFirst(RDFS_LABEL, "en"); String mainKey = enLabel != null ? decodeKey(enLabel.getText()).replace(' ', '_') : null; Iterator<Text> wpEnKeys = rep.getText(WP_EN); Collection<String> keys = new ArrayList<String>(); boolean foundMain = false; if(wpEnKeys.hasNext()){ //link to the English dbpedia while(!foundMain & wpEnKeys.hasNext()){ String key = decodeKey(wpEnKeys.next().getText()); if(key.equals(mainKey)){ foundMain = true; rep.addReference(linkProperty, linkeDbPedia(null, key)); } else { keys.add(key); } } if(!foundMain){ //add all links for(String key : keys){ rep.addReference(linkProperty, linkeDbPedia(null, key)); } } } else { //search for other wikipedia keys Map<String,String> wikipediaFields = new HashMap<String,String>(); //(1) collect the fields for(Iterator<String> fields = rep.getFieldNames();fields.hasNext();){ String field = fields.next(); int nsIndex = field.lastIndexOf('/')+1; if(field.indexOf(WP_PREFIX, nsIndex) == nsIndex && //no '_' in the property name field.indexOf('_',nsIndex+WP_PREFIX_LEN+2) < 1){ String language = field.substring(nsIndex+WP_PREFIX.length(), field.length()); wikipediaFields.put(field, language); } // else no key:wikipedia.* field } //(2) add the values to avoid concurrent modification exceptions for(Entry<String,String> entry : wikipediaFields.entrySet()){ for(Iterator<Text> langWpKeys = rep.getText(entry.getKey()); langWpKeys.hasNext();){ rep.addReference(linkProperty, linkeDbPedia(entry.getValue(),langWpKeys.next().getText())); } } } } if(musicbrainzState){ Iterator<Text> mbKeys = rep.getText(MB_KEY); if(mbKeys.hasNext()){ String key = mbKeys.next().getText(); //we need the type Iterator<Reference> types = rep.getReferences(RDF_TYPE); String type = null; while(types.hasNext() && !MB_TYPES.contains(type)){ String fbType = types.next().getReference(); if(MUSIC_PROP_PREFIX.equals(fbType.subSequence(FB_NS_LEN, FB_NS_LEN+MUSIC_PROP_PREFIX_LEN))){ type = fbType.substring(FB_NS_LEN+MUSIC_PROP_PREFIX_LEN); } } if(type != null){ StringBuilder uri = new StringBuilder(MB_NS); uri.append(type).append('/').append(key).append("#_"); rep.addReference(linkProperty, uri.toString()); } } } return rep; } private String linkeDbPedia(String language, String key) { final StringBuilder uri; if(language == null){ uri = new StringBuilder("http://dbpedia.org/resource/"); } else { uri = new StringBuilder("http://").append(language).append(".dbpedia.org/resource/"); } return uri.append(key).toString(); } /** * Decodes Freebase.com keys using the '<code>$0000</code>' encoding for chars. * This encoding uses a 4 digit hex number to represent chars See the * Freebase documentation for details. * @param encodedKey * @return */ public static String decodeKey(String encodedKey){ StringBuilder key = null; //lazy initialisation for performance int index = 0; final int length = encodedKey.length(); while(index < length){ int next = encodedKey.indexOf('$', index); if(next < 0){ if(key == null){ return encodedKey; //no decoding needed } next = length; } if(key == null){ //init the StringBuilder with the maximum possible size key = new StringBuilder(encodedKey.length()); } if(next > index){ //add chars that do not need decoding key.append(encodedKey, index, next); } if(next < length){ //decode char try { if(next+4 < length){ key.appendCodePoint(Integer.parseInt( encodedKey.substring(next+1, next+5), 16)); } else { String section = encodedKey.substring(next, length); log.warn("Unable to decode Secton ["+next+"-"+(length)+"|'" + section+"'] from key '"+ encodedKey+"'! -> add plain " + "section instead!"); key.append(section); } } catch (NumberFormatException e) { String section = encodedKey.substring(next, next+5); log.warn("Unable to decode Secton ["+next+"-"+(next+5)+"|'" + section+"'] from key '"+ encodedKey+"'! -> add plain " + "section instead!"); key.append(section); } } index = next+5; //add the $0000 } return key.toString(); } }