/* * Copyright (c) 2006 Henri Sivonen * Copyright (c) 2007-2010 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package org.whattf.datatype; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import org.mozilla.javascript.Context; import org.mozilla.javascript.ContextFactory; import org.mozilla.javascript.RhinoException; import org.relaxng.datatype.DatatypeException; import org.whattf.io.DataUri; import org.whattf.io.DataUriException; import org.whattf.io.Utf8PercentDecodingReader; import com.hp.hpl.jena.iri.IRI; import com.hp.hpl.jena.iri.IRIException; import com.hp.hpl.jena.iri.IRIFactory; import com.hp.hpl.jena.iri.Violation; public class IriRef extends AbstractDatatype { /** * The singleton instance. */ public static final IriRef THE_INSTANCE = new IriRef(); protected IriRef() { super(); } private final static boolean WARN = System.getProperty("org.whattf.datatype.warn","").equals("true") ? true : false; /** * The "known" Jena IRI violation codes we catch and handle specifically. * Note that this enum intentionally does not hold a complete list of all * the violation codes that Jena can return, and "ZZZ_DUMMY_DEFAULT" is not * an actual Jena IRI violation code (it's instead added here for our own * internal purposes). */ private enum KnownViolationCode { COMPATIBILITY_CHARACTER, CONTROL_CHARACTER, DNS_LABEL_DASH_START_OR_END, DOUBLE_WHITESPACE, EMPTY_SCHEME, HAS_PASSWORD, ILLEGAL_CHARACTER, ILLEGAL_PERCENT_ENCODING, IP_V4_HAS_FOUR_COMPONENTS, IP_V4_OCTET_RANGE, IP_V6_OR_FUTURE_ADDRESS_SYNTAX, NON_INITIAL_DOT_SEGMENT, NOT_DNS_NAME, PORT_SHOULD_NOT_BE_WELL_KNOWN, REQUIRED_COMPONENT_MISSING, SCHEME_MUST_START_WITH_LETTER, UNDEFINED_UNICODE_CHARACTER, UNICODE_WHITESPACE, UNREGISTERED_NONIETF_SCHEME_TREE, WHITESPACE, ZZZ_DUMMY_DEFAULT } private final CharSequencePair splitScheme(CharSequence iri) { StringBuilder sb = new StringBuilder(); Boolean atSchemeBeginning = true; for (int i = 0; i < iri.length(); i++) { char c = toAsciiLowerCase(iri.charAt(i)); if (atSchemeBeginning) { // Skip past any leading characters that the HTML5 spec defines // as space characters: space, tab, LF, FF, CR if (' ' == c || '\t' == c || '\n' == c || '\f' == c || '\r' == c) { continue; } if ('a' <= c && 'z' >= c) { atSchemeBeginning = false; sb.append(c); } else { return null; } } else { if (('a' <= c && 'z' >= c) || ('0' <= c && '9' >= c) || c == '+' || c == '.') { sb.append(c); continue; } else if (c == ':') { return new CharSequencePair(sb, iri.subSequence(i + 1, iri.length())); } else { return null; } } } return null; } public void checkValid(CharSequence literal) throws DatatypeException { // TODO Find out if it is safe to put this in a field IRIFactory fac = new IRIFactory(); fac.shouldViolation(true, false); fac.securityViolation(true, false); fac.dnsViolation(true, false); fac.mintingViolation(false, false); fac.useSpecificationIRI(true); fac.useSchemeSpecificRules("http", true); fac.useSchemeSpecificRules("https", true); fac.useSchemeSpecificRules("ftp", true); fac.useSchemeSpecificRules("mailto", true); // XXX broken fac.useSchemeSpecificRules("file", true); fac.useSchemeSpecificRules("data", true); // XXX broken // XXX javascript? // fac.setQueryCharacterRestrictions(false); IRI iri; boolean data = false; try { CharSequencePair pair = splitScheme(literal); if (pair == null) { // no scheme or scheme is private iri = fac.construct(trimHtmlSpaces(literal.toString())); } else { CharSequence scheme = pair.getHead(); CharSequence tail = pair.getTail(); if (isWellKnown(scheme)) { iri = fac.construct(trimHtmlSpaces(literal.toString())); } else if ("javascript".contentEquals(scheme)) { // StringBuilder sb = new StringBuilder(2 + // literal.length()); // sb.append("x-").append(literal); // iri = fac.construct(sb.toString()); iri = null; // Don't bother user with generic IRI syntax Reader reader = new BufferedReader( new Utf8PercentDecodingReader(new StringReader( "function(event){" + tail.toString() + "}"))); // XXX CharSequenceReader reader.mark(1); int c = reader.read(); if (c != 0xFEFF) { reader.reset(); } try { Context context = ContextFactory.getGlobal().enterContext(); context.setOptimizationLevel(0); context.setLanguageVersion(Context.VERSION_1_6); // -1 for lineno arg prevents Rhino from appending // "(unnamed script#1)" to all error messages context.compileReader(reader, null, -1, null); } finally { Context.exit(); } } else if ("data".contentEquals(scheme)) { data = true; iri = fac.construct(trimHtmlSpaces(literal.toString())); } else if (isHttpAlias(scheme)) { StringBuilder sb = new StringBuilder(5 + tail.length()); sb.append("http:").append(tail); iri = fac.construct(trimHtmlTrailingSpaces(sb.toString())); } else { StringBuilder sb = new StringBuilder(2 + literal.length()); sb.append("x-").append(literal); iri = fac.construct(trimHtmlTrailingSpaces(sb.toString())); } } } catch (IRIException e) { Violation v = e.getViolation(); /* * Violation codes that are not "known" codes get assigned the * dummy value so that handling of them will fall through to * the default case. */ KnownViolationCode vc = KnownViolationCode.valueOf("ZZZ_DUMMY_DEFAULT"); try { /* * If this violation code is one of the "known" Jena IRI * violation codes we want to handle specifically, then use it * as-is. */ vc = KnownViolationCode.valueOf(v.codeName()); } catch (Exception ex) { } switch (vc) { case HAS_PASSWORD: if (WARN) { throw newDatatypeException( underbarStringToSentence(v.component()) + " component contains a password.", WARN); } else { return; } case NON_INITIAL_DOT_SEGMENT: if (WARN) { throw newDatatypeException( "Path component contains a segment \u201C/../\u201D not at the beginning of a relative reference, or it contains a \u201C/./\u201D. These should be removed.", WARN); } else { return; } case PORT_SHOULD_NOT_BE_WELL_KNOWN: if (WARN) { throw newDatatypeException( "Ports under 1024 should be accessed using the appropriate scheme name.", WARN); } else { return; } case COMPATIBILITY_CHARACTER: if (WARN) { throw newDatatypeException( underbarStringToSentence(v.codeName()) + " in " + toAsciiLowerCase(v.component()) + " component.", WARN); } else { return; } case DNS_LABEL_DASH_START_OR_END: throw newDatatypeException("Host component contains a DNS name with a \u201C-\u201D (dash) character at the beginning or end."); case DOUBLE_WHITESPACE: case WHITESPACE: throw newDatatypeException("Whitespace in " + toAsciiLowerCase(v.component()) + " component. " + "Use \u201C%20\u201D in place of spaces."); case EMPTY_SCHEME: throw newDatatypeException("Scheme component is empty."); case ILLEGAL_PERCENT_ENCODING: throw newDatatypeException(underbarStringToSentence(v.component()) + " component contains a percent sign that is not followed by two hexadecimal digits."); case IP_V4_HAS_FOUR_COMPONENTS: throw newDatatypeException("Host component is entirely numeric but does not have four components like an IPv4 address."); case IP_V4_OCTET_RANGE: throw newDatatypeException("Host component contains a number not in the range 0-255, or a number with a leading zero."); case IP_V6_OR_FUTURE_ADDRESS_SYNTAX: throw newDatatypeException("Host component contains an IPv6 (or IPvFuture) syntax violation."); case NOT_DNS_NAME: throw newDatatypeException("Host component did not meet the restrictions on DNS names."); case REQUIRED_COMPONENT_MISSING: throw newDatatypeException("A component that is required by the scheme is missing."); case SCHEME_MUST_START_WITH_LETTER: throw newDatatypeException("Scheme component must start with a letter."); case UNREGISTERED_NONIETF_SCHEME_TREE: throw newDatatypeException("Scheme component has a \u201C-\u201D (dash) character, but does not start with \u201Cx-\u201D, and the prefix is not known as the prefix of an alternative tree for URI schemes."); case CONTROL_CHARACTER: case ILLEGAL_CHARACTER: case UNDEFINED_UNICODE_CHARACTER: case UNICODE_WHITESPACE: throw newDatatypeException(underbarStringToSentence(v.codeName()) + " in " + toAsciiLowerCase(v.component()) + " component."); default: throw newDatatypeException(v.codeName() + " in " + toAsciiLowerCase(v.component()) + " component."); } } catch (IOException e) { throw newDatatypeException(e.getMessage()); } catch (RhinoException e) { throw newDatatypeException(e.getMessage()); } if (isAbsolute()) { if (iri != null && !iri.isAbsolute()) { throw newDatatypeException("Not an absolute IRI."); } } if (iri != null) { if ("".equals(iri.toString())) { throw newDatatypeException("Must be non-empty."); } if (data) { try { DataUri dataUri = new DataUri(iri); InputStream is = dataUri.getInputStream(); while (is.read() >= 0) { // spin } } catch (DataUriException e) { throw newDatatypeException(e.getIndex(), e.getHead(), e.getLiteral(), e.getTail()); } catch (IOException e) { throw newDatatypeException(e.getMessage()); } } } } private final boolean isHttpAlias(CharSequence scheme) { return "feed".contentEquals(scheme) || "webcal".contentEquals(scheme); } private final boolean isWellKnown(CharSequence scheme) { return "http".contentEquals(scheme) || "https".contentEquals(scheme) || "ftp".contentEquals(scheme) || "mailto".contentEquals(scheme) || "file".contentEquals(scheme); } protected boolean isAbsolute() { return false; } /** * Turn "FOO_BAR_BAZ" into "Foo bar baz". */ protected static final String underbarStringToSentence(String str) { if (str == null) { return null; } char[] buf = new char[str.length()]; // preserve case of first character buf[0] = str.charAt(0); for (int i = 1; i < str.length(); i++) { char c = str.charAt(i); if (c >= 'A' && c <= 'Z') { c += 0x20; } else if (c == 0x5f) { // convert underbar to space c = 0x20; } buf[i] = c; } return new String(buf); } /** * Trim any leading and trailing space characters, as defined in HTML5. */ protected static final String trimHtmlSpaces(String str) { return trimHtmlLeadingSpaces(trimHtmlTrailingSpaces(str)); } /** * Trim any leading space characters, as defined in HTML5. * HTML space characters: space, tab, LF, FF, CR */ protected static final String trimHtmlLeadingSpaces(String str) { if (str == null) { return null; } for (int i = str.length(); i > 0; --i) { char c = str.charAt(str.length() - i); if (!(' ' == c || '\t' == c || '\n' == c || '\f' == c || '\r' == c)) { return str.substring(str.length() - i, str.length()); } } return ""; } /** * Trim any trailing space characters, as defined in HTML5. * HTML space characters: space, tab, LF, FF, CR */ protected static final String trimHtmlTrailingSpaces(String str) { if (str == null) { return null; } for (int i = str.length() - 1; i >= 0; --i) { char c = str.charAt(i); if (!(' ' == c || '\t' == c || '\n' == c || '\f' == c || '\r' == c)) { return str.substring(0, i + 1); } } return ""; } @Override public String getName() { return "IRI reference"; } private class CharSequencePair { private final CharSequence head; private final CharSequence tail; /** * @param head * @param tail */ public CharSequencePair(final CharSequence head, final CharSequence tail) { this.head = head; this.tail = tail; } /** * Returns the head. * * @return the head */ public CharSequence getHead() { return head; } /** * Returns the tail. * * @return the tail */ public CharSequence getTail() { return tail; } } }