/* Copyright 2008, 2009, 2010 by the Oxford University Computing Laboratory

   This file is part of HermiT.

   HermiT is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   HermiT is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with HermiT. If not, see <http://www.gnu.org/licenses/>.
*/
package org.semanticweb.HermiT.datatypes.rdfplainliteral;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.semanticweb.HermiT.datatypes.ValueSpaceSubset;

import dk.brics.automaton.Automaton;
import dk.brics.automaton.BasicAutomata;
import dk.brics.automaton.BasicOperations;
import dk.brics.automaton.Datatypes;
import dk.brics.automaton.RegExp;

/**
 * A subset of the rdf:PlainLiteral value space described by a finite automaton.
 *
 * <p>Each data value is encoded as a single word of the form
 * {@code string + SEPARATOR + languageTag}. A plain {@link String} value is encoded
 * with an empty language tag, and the language tag of an
 * {@link RDFPlainLiteralDataValue} is lower-cased before it is matched
 * (see {@link #containsDataValue(Object)}).
 */
public class RDFPlainLiteralPatternValueSpaceSubset implements ValueSpaceSubset {
    /** Character placed between the lexical string and the language tag in an encoded word. */
    public static final char SEPARATOR='\u0001';

    /** Accepts exactly the one-character word consisting of {@link #SEPARATOR}. */
    protected static final Automaton s_separator;
    /** Optional tail of a language range match: nothing, or '-' followed by any string. */
    protected static final Automaton s_languagePatternEnd;
    /** Accepts the language tags described by {@link #languageTagAutomaton()}. */
    protected static final Automaton s_languageTag;
    /** Accepts a language tag or the empty string. */
    protected static final Automaton s_languageTagOrEmpty;
    /** Suffix of an encoded word without a language tag: the separator alone. */
    protected static final Automaton s_emptyLangTag;
    /** Suffix of an encoded word with a language tag: the separator followed by a tag. */
    protected static final Automaton s_nonemptyLangTag;
    /** Suffix of an encoded word with or without a language tag. */
    protected static final Automaton s_anyLangTag;
    /** The automaton that the brics {@code Datatypes} registry returns for "string". */
    protected static final Automaton s_xsdString;
    /** Maps a datatype URI to the automaton accepting the encoded words of that datatype. */
    protected static final Map<String,Automaton> s_anyDatatype;
    /** Accepts any sequence of characters accepted by {@link #s_anyChar}. */
    protected static final Automaton s_anyString;
    /** Accepts a single character from the class built by {@link #xmlChar()}. */
    protected static final Automaton s_anyChar;
    /** Accepts any string followed by the separator and a nonempty language tag. */
    protected static final Automaton s_anyStringWithNonemptyLangTag;

    static {
        s_separator=BasicAutomata.makeChar(SEPARATOR);
        s_languagePatternEnd=BasicOperations.optional(BasicAutomata.makeChar('-').concatenate(BasicAutomata.makeAnyString()));
        s_languageTag=languageTagAutomaton();
        s_languageTagOrEmpty=s_languageTag.union(BasicAutomata.makeEmptyString());
        s_emptyLangTag=s_separator;
        s_nonemptyLangTag=s_separator.concatenate(s_languageTag);
        s_anyLangTag=s_separator.concatenate(s_languageTagOrEmpty);
        s_xsdString=Datatypes.get("string");
        s_anyDatatype=new HashMap<String,Automaton>();
        // All xsd string types carry no language tag; only rdf:PlainLiteral allows one.
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"string",s_xsdString.concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"normalizedString",normalizedStringAutomaton().concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"token",tokenAutomaton().concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"Name",Datatypes.get("Name2").concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"NCName",Datatypes.get("NCName").concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"NMTOKEN",Datatypes.get("Nmtoken2").concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.XSD_NS+"language",Datatypes.get("language").concatenate(s_emptyLangTag));
        s_anyDatatype.put(RDFPlainLiteralDatatypeHandler.RDF_NS+"PlainLiteral",s_xsdString.concatenate(s_anyLangTag));
        s_anyChar=xmlChar();
        s_anyString=s_anyChar.repeat();
        s_anyStringWithNonemptyLangTag=s_anyString.concatenate(s_nonemptyLangTag);
    }

    /**
     * Builds the automaton for language tags. The pattern is assembled from the parts
     * named in the inline comments (language, script, region, variant, extension, privateuse).
     *
     * @return the automaton accepting the language tag pattern
     */
    protected static Automaton languageTagAutomaton() {
        return new RegExp(
            "("+
                "([a-zA-Z]{2,3}"+
                    "("+
                        "(-[a-zA-Z]{3}){0,3}"+                  // extlang
                    ")?"+
                ")|"+
                "[a-zA-Z]{4}|"+                                 // 4ALPHA
                "[a-zA-Z]{5,8}"+                                // 5*8ALPHA
            ")"+                                                // language
            "(-[a-zA-Z]{4})?"+                                  // script
            "(-([a-zA-Z]{2}|[0-9]{3}))?"+                       // region
            "(-([a-zA-Z0-9]{5,8}|([0-9][a-z0-9]{3})))*"+        // variant
            "(-([a-wy-zA-WY-Z0-9](-[a-zA-Z0-9]{2,8})+))*"+      // extension
            "(-x(-[a-zA-Z0-9]{1,8})+)?"                         // privateuse
        ).toAutomaton();
    }

    /**
     * Builds the automaton accepting a single character allowed in a string value.
     *
     * @return the single-character automaton
     */
    protected static Automaton xmlChar() {
        return new RegExp("[\u0009\n\u0020-\u007F\u00A0-\uD7FF\uE000-\uFFFD]").toAutomaton();
    }

    /**
     * Builds the automaton used for xsd:normalizedString: any sequence of characters from the
     * class used by {@link #xmlChar()} except tab and newline.
     *
     * @return the automaton for normalized strings
     */
    protected static Automaton normalizedStringAutomaton() {
        return new RegExp("([\u0020-\u007F\u00A0-\uD7FF\uE000-\uFFFD])*").toAutomaton();
    }

    /**
     * Builds the automaton used for xsd:token: the empty string, or runs of non-space characters
     * separated by single spaces, with no leading or trailing space.
     *
     * @return the automaton for tokens
     */
    protected static Automaton tokenAutomaton() {
        return new RegExp("([\u0021-\uD7FF\uE000-\uFFFD]+(\u0020[\u0021-\uD7FF\uE000-\uFFFD]+)*)?").toAutomaton();
    }

    /** Automaton over encoded words that defines the members of this subset. */
    protected final Automaton m_automaton;

    /**
     * Creates a subset whose members are the data values encoded by the words of the given automaton.
     *
     * @param automaton the automaton over words of the form {@code string + SEPARATOR + languageTag}
     */
    public RDFPlainLiteralPatternValueSpaceSubset(Automaton automaton) {
        m_automaton=automaton;
    }

    /**
     * Checks whether this subset contains at least the given number of values.
     *
     * @param number the required number of values
     * @return {@code true} if {@code getFiniteStrings(number)} yields no set (taken to mean that
     *         the language is too large to enumerate within that limit) or yields at least
     *         {@code number} words
     */
    public boolean hasCardinalityAtLeast(int number) {
        Set<String> elements=m_automaton.getFiniteStrings(number);
        return elements==null || elements.size()>=number;
    }

    /**
     * Checks whether the given data value belongs to this subset.
     *
     * @param dataValue a {@link String} (no language tag) or an {@link RDFPlainLiteralDataValue}
     * @return {@code true} if the encoded form of the value is accepted by the automaton;
     *         {@code false} for values of any other type
     */
    public boolean containsDataValue(Object dataValue) {
        if (dataValue instanceof String) {
            String string=(String)dataValue;
            return m_automaton.run(string+SEPARATOR);
        }
        if (dataValue instanceof RDFPlainLiteralDataValue) {
            RDFPlainLiteralDataValue value=(RDFPlainLiteralDataValue)dataValue;
            // Language tags are ASCII; Locale.ROOT keeps lower-casing independent of the default
            // locale (e.g. a Turkish locale would otherwise turn 'I' into a dotless 'i').
            String languageTag=value.getLanguageTag().toLowerCase(Locale.ROOT);
            return m_automaton.run(value.getString()+SEPARATOR+languageTag);
        }
        return false;
    }

    /**
     * Adds all values of this subset to the given collection. Words with an empty language tag
     * are added as {@link String}s, the others as {@link RDFPlainLiteralDataValue}s.
     *
     * @param dataValues the collection receiving the values
     * @throws IllegalStateException if the automaton does not yield a finite set of words
     */
    public void enumerateDataValues(Collection<Object> dataValues) {
        Set<String> elements=m_automaton.getFiniteStrings();
        if (elements==null) {
            throw new IllegalStateException("The value space range is infinite.");
        }
        for (String element : elements) {
            // The last separator splits the word into the string and the language tag.
            int separatorIndex=element.lastIndexOf(SEPARATOR);
            String string=element.substring(0,separatorIndex);
            String languageTag=element.substring(separatorIndex+1);
            if (languageTag.length()==0) {
                dataValues.add(string);
            }
            else {
                dataValues.add(new RDFPlainLiteralDataValue(string,languageTag));
            }
        }
    }

    /**
     * Returns a textual form of this subset consisting of the automaton's own string
     * representation wrapped in {@code rdf:PlainLiteral{...}}.
     */
    @Override
    public String toString() {
        StringBuilder buffer=new StringBuilder();
        buffer.append("rdf:PlainLiteral{");
        buffer.append(m_automaton.toString());
        buffer.append('}');
        return buffer.toString();
    }

    /**
     * Converts a length-based subset into an automaton over encoded words.
     *
     * <p>Each interval is turned into "strings of the allowed length" followed by either the
     * empty-tag suffix (mode {@code ABSENT}) or the nonempty-tag suffix (any other mode), and
     * the per-interval automata are combined by union. The previous implementation combined
     * them by intersection; since the two suffix languages are disjoint, that produced the
     * empty language as soon as intervals of both modes were present.
     * NOTE(review): this assumes the interval list denotes a disjunction of intervals -
     * confirm against RDFPlainLiteralLengthValueSpaceSubset.
     *
     * @param valueSpaceSubset the length-based subset to convert
     * @return the automaton for the subset, or {@code null} if the subset has no intervals
     */
    public static Automaton toAutomaton(RDFPlainLiteralLengthValueSpaceSubset valueSpaceSubset) {
        List<RDFPlainLiteralLengthInterval> intervals=valueSpaceSubset.m_intervals;
        Automaton result=null;
        for (int intervalIndex=intervals.size()-1;intervalIndex>=0;--intervalIndex) {
            RDFPlainLiteralLengthInterval interval=intervals.get(intervalIndex);
            Automaton stringPart=stringPartForLength(interval.m_minLength,interval.m_maxLength);
            Automaton languageTagPart;
            if (interval.m_languageTagMode==RDFPlainLiteralLengthInterval.LanguageTagMode.ABSENT) {
                languageTagPart=s_emptyLangTag;
            }
            else {
                languageTagPart=s_nonemptyLangTag;
            }
            Automaton intervalAutomaton=stringPart.concatenate(languageTagPart);
            if (result==null) {
                result=intervalAutomaton;
            }
            else {
                result=result.union(intervalAutomaton);
            }
        }
        return result;
    }

    /**
     * Builds the automaton for all encoded words whose string part has a length within the given
     * bounds, with or without a language tag.
     *
     * @param minLength the minimal length of the string part
     * @param maxLength the maximal length of the string part; {@link Integer#MAX_VALUE} means unbounded
     * @return the automaton for the length restriction
     */
    public static Automaton toAutomaton(int minLength,int maxLength) {
        assert minLength<=maxLength;
        return stringPartForLength(minLength,maxLength).concatenate(s_anyLangTag);
    }

    /**
     * Builds the automaton for strings (without separator and language tag) of a length within
     * the given bounds; {@link Integer#MAX_VALUE} as the upper bound means unbounded.
     */
    private static Automaton stringPartForLength(int minLength,int maxLength) {
        if (maxLength==Integer.MAX_VALUE) {
            if (minLength==0) {
                return s_anyString;
            }
            return s_anyString.intersection(BasicOperations.repeat(s_anyChar,minLength));
        }
        return s_anyString.intersection(BasicOperations.repeat(s_anyChar,minLength,maxLength));
    }

    /**
     * Checks whether the given pattern is accepted by the {@link RegExp} constructor.
     *
     * @param pattern the regular expression to check
     * @return {@code false} if constructing the {@link RegExp} throws an
     *         {@link IllegalArgumentException}, {@code true} otherwise
     */
    public static boolean isValidPattern(String pattern) {
        try {
            new RegExp(pattern);
            return true;
        }
        catch (IllegalArgumentException e) {
            return false;
        }
    }

    /**
     * Builds the automaton for encoded words whose string part matches the given pattern, with or
     * without a language tag.
     *
     * @param pattern the regular expression for the string part
     * @return the automaton for the pattern restriction
     */
    public static Automaton getPatternAutomaton(String pattern) {
        Automaton stringPart=new RegExp(pattern).toAutomaton();
        return stringPart.concatenate(s_anyLangTag);
    }

    /**
     * Builds the automaton for encoded words whose language tag matches the given language range.
     * The range {@code "*"} matches every nonempty language tag; any other range matches the
     * lower-cased range itself, optionally followed by '-' and an arbitrary suffix.
     *
     * @param languageRange the language range
     * @return the automaton for the language range restriction
     */
    public static Automaton getLanguageRangeAutomaton(String languageRange) {
        if ("*".equals(languageRange)) {
            return s_anyStringWithNonemptyLangTag;
        }
        // Locale.ROOT: must lower-case the same way as containsDataValue, regardless of default locale.
        String normalizedRange=languageRange.toLowerCase(Locale.ROOT);
        Automaton languageTagPart=BasicAutomata.makeString(normalizedRange).concatenate(s_languagePatternEnd);
        return s_anyString.concatenate(s_separator).concatenate(languageTagPart);
    }

    /**
     * Looks up the automaton registered for the given datatype URI.
     *
     * @param datatypeURI the URI of the datatype
     * @return the automaton for the datatype, or {@code null} if none is registered
     */
    public static Automaton getDatatypeAutomaton(String datatypeURI) {
        return s_anyDatatype.get(datatypeURI);
    }
}