/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.entityhub.indexing.source.jenatdb; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import org.osgi.service.cm.ConfigurationException; import com.hp.hpl.jena.graph.Node; /** * Allows to filter Tiples based on the language of the value. Triples with * values other than <code>{@link Node#isLiteral()} == true</code> are accepted. * This is also true for all Literals that do not have a language assigned. * @author Rupert Westenthaler * */ public class LiteralLanguageFilter implements RdfImportFilter { /** * Allows to configure the literal languages included/excluded during the * import of RDF data<p> * <b>Syntax: </b><code>{lang1},!{lang2},*</code> * <ul> * <li>'{lang}' includes an language * <li>'!{lang}'excludes an language * <li>',' is the separator, additional spaces are trimmed * <li>'*' will include all properties not explicitly excluded * </ul> */ public static final String PARAM_LITERAL_LANGUAGES = "if-literal-language"; private Set<String> configuredLanguages; private Set<String> excludedLanguages; private boolean includeAll; public LiteralLanguageFilter(){} /** * For unit tests * @param config the test config */ protected LiteralLanguageFilter(String config){ parseLanguages(config); } @Override public void setConfiguration(Map<String,Object> config) { Object value = config.get(PARAM_LITERAL_LANGUAGES); if(value == null){ includeAll = true; excludedLanguages = Collections.emptySet(); configuredLanguages = Collections.emptySet(); } else { parseLanguages(value.toString()); } } private void parseLanguages(String config){ configuredLanguages = new HashSet<String>(); excludedLanguages = new HashSet<String>(); String[] languages = config.split(","); for(int i = 0;i < languages.length;i++){ languages[i] = languages[i].trim().toLowerCase(Locale.ROOT); if(includeAll == false && languages[i].equals("*")){ includeAll = true; } } for(String lang : languages) { if(lang.isEmpty() || lang.equals("*")){ continue; //ignore null values and * is already processed } //lang = lang.toLowerCase(); //country codes are upper case if(lang.charAt(0) == '!'){ //exclude lang = lang.substring(1); if(lang.isEmpty()){ continue; //only a '!' without an lanugage } if(configuredLanguages.contains(lang)){ throw new IllegalArgumentException( "Langauge '"+lang+"' is both included and excluded (config: " + config+")"); } excludedLanguages.add(lang); } else{ if(excludedLanguages.contains(lang)){ throw new IllegalArgumentException( "Langauge '"+lang+"' is both included and excluded (config: " + config+")"); } configuredLanguages.add(lang); } } } @Override public boolean needsInitialisation() { return false; } @Override public void initialise() { } @Override public void close() { } @Override public boolean accept(Node s, Node p, Node o) { if(o.isLiteral()){ if(includeAll && excludedLanguages.isEmpty()){ return true; //deactivated } String lang = o.getLiteralLanguage(); if(lang != null && !lang.isEmpty()){ if(includeAll){ return !excludedLanguages.contains(lang); } else { return configuredLanguages.contains(lang); } } else { //no plain literal (null) or default language (empty) return true; //accept it } } else { return true; //accept all none literals } } }