/* -*- Mode: java; c-basic-indent: 4; tab-width: 4 -*- */
package freenet.client.filter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.MalformedInputException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.StringTokenizer;
import freenet.clients.http.ToadletContextImpl;
import freenet.l10n.NodeL10n;
import freenet.support.HTMLDecoder;
import freenet.support.HTMLEncoder;
import freenet.support.Logger;
import freenet.support.Logger.LogLevel;
import freenet.support.URLDecoder;
import freenet.support.URLEncodedFormatException;
import freenet.support.io.NullWriter;
public class HTMLFilter implements ContentDataFilter, CharsetExtractor {
private static boolean logMINOR;
private static boolean logDEBUG;
private static final boolean deleteWierdStuff = true;
private static final boolean deleteErrors = true;
/** If true, allow documents that don't have an <html> tag or have other tags before it.
* In all cases we disallow text before the first valid tag. This is because if we don't,
* charset detection can be ambiguous, potentially resulting in attacks. */
private static final boolean allowNoHTMLTag = true;
// FIXME make these configurable on a per-document level.
// Maybe by merging with TagReplacerCallback???
// For now they're just global.
/** -1 means don't allow it */
public static int metaRefreshSamePageMinInterval = 1;
/** -1 means don't allow it */
public static int metaRefreshRedirectMinInterval = 30;
@Override
public void readFilter(InputStream input, OutputStream output, String charset, HashMap<String, String> otherParams,
FilterCallback cb) throws DataFilterException, IOException {
if(cb == null) cb = new NullFilterCallback();
logMINOR = Logger.shouldLog(LogLevel.MINOR, this);
logDEBUG = Logger.shouldLog(LogLevel.DEBUG, this);
if(logMINOR) Logger.minor(this, "readFilter(): charset="+charset);
Reader r = null;
Writer w = null;
InputStreamReader isr = null;
OutputStreamWriter osw = null;
try {
isr = new InputStreamReader(input, charset);
osw = new OutputStreamWriter(output, charset);
r = new BufferedReader(isr, 4096);
w = new BufferedWriter(osw, 4096);
} catch(UnsupportedEncodingException e) {
throw UnknownCharsetException.create(e, charset);
}
HTMLParseContext pc = new HTMLParseContext(r, w, charset, cb, false);
pc.run();
w.flush();
}
@Override
public void writeFilter(InputStream input, OutputStream output, String charset, HashMap<String, String> otherParams,
FilterCallback cb) throws DataFilterException, IOException {
throw new UnsupportedOperationException();
}
@Override
public String getCharset(byte[] input, int length, String parseCharset) throws DataFilterException, IOException {
logMINOR = Logger.shouldLog(LogLevel.MINOR, this);
if(logMINOR) Logger.minor(this, "getCharset(): default="+parseCharset);
if(length > getCharsetBufferSize() && Logger.shouldLog(LogLevel.MINOR, this)) {
Logger.minor(this, "More data than was strictly needed was passed to the charset extractor for extraction");
}
ByteArrayInputStream strm = new ByteArrayInputStream(input, 0, length);
Writer w = new NullWriter();
Reader r;
try {
r = new BufferedReader(new InputStreamReader(strm, parseCharset), 4096);
} catch (UnsupportedEncodingException e) {
strm.close();
throw e;
}
HTMLParseContext pc = new HTMLParseContext(r, w, null, new NullFilterCallback(), true);
try {
pc.run();
} catch (MalformedInputException e) {
// Not this charset
return null;
} catch (IOException e) {
throw e;
} catch (Throwable t) {
// Ignore ALL errors
if(logMINOR) Logger.minor(this, "Caught "+t+" trying to detect MIME type with "+parseCharset);
}
try {
r.close();
} catch (IOException e) {
throw e;
} catch (Throwable t) {
if(logMINOR) Logger.minor(this, "Caught "+t+" closing stream after trying to detect MIME type with "+parseCharset);
}
if(logMINOR) Logger.minor(this, "Returning charset "+pc.detectedCharset);
return pc.detectedCharset;
}
class HTMLParseContext {
Reader r;
Writer w;
String charset;
String detectedCharset;
final FilterCallback cb;
final boolean onlyDetectingCharset;
boolean isXHTML=false;
Stack<String> openElements;
boolean failedDetectCharset;
/** If <head> is found, then it is true. It is needed that if <title> or <meta> is found outside <head> or if a <body> is found first, then insert a <head> too*/
boolean wasHeadElementFound=false;
/** We can only have <head> once, and <meta>/<title> can't be outside it. This helps with robustness against charset attacks and allows us to stop looking for <meta> as soon as we see </head> when detecting charset. */
boolean headEnded=false;
HTMLParseContext(Reader r, Writer w, String charset, FilterCallback cb, boolean onlyDetectingCharset) {
this.r = r;
this.w = w;
this.charset = charset;
this.cb = cb;
this.onlyDetectingCharset = onlyDetectingCharset;
openElements=new Stack<String>();
}
public void setisXHTML(boolean value) {
isXHTML=value;
}
public boolean getisXHTML() {
return isXHTML;
}
public void pushElementInStack(String element) {
openElements.push(element);
}
public String popElementFromStack() {
if(openElements.size()>0)
return openElements.pop();
else
return null;
}
public String peekTopElement() {
if(openElements.isEmpty()) return null;
return openElements.peek();
}
void run() throws IOException, DataFilterException {
/**
* TOKENIZE Modes:
* <p>0) in text transitions: '<' ->(1) 1) in tag, not in
* quotes/comment/whitespace transitions: whitespace -> (4) (save
* current element) '"' -> (2) '--' at beginning of tag -> (3) '>' ->
* process whole tag 2) in tag, in quotes transitions: '"' -> (1)
* '>' -> grumble about markup in quotes in tag might confuse older
* user-agents (stay in current state) 3) in tag, in comment
* transitions: '-->' -> save/ignore comment, go to (0) '<' or '>' ->
* grumble about markup in comments 4) in tag, in whitespace
* transitions: '"' -> (2) '>' -> save tag, (0) anything else not
* whitespace -> (1)
* </p>
*/
StringBuilder b = new StringBuilder(100);
StringBuilder balt = new StringBuilder(4000);
List<String> splitTag = new ArrayList<String>();
String currentTag = null;
char pprevC = 0;
char prevC = 0;
char c = 0;
mode = INTEXT;
// No text before <html>
boolean textAllowed = false;
boolean firstChar = true;
while (true) {
// If detecting charset, stop after </head> even if haven't found <meta> charset tag.
if(onlyDetectingCharset && failedDetectCharset)
return;
// If detecting charset, and found it, stop afterwards.
if(onlyDetectingCharset && detectedCharset != null)
return;
int x;
try {
x = r.read();
}
/**
* libgcj up to at least 4.2.2 has a bug: InputStreamReader.refill() throws this exception when BufferedInputReader.refill() returns false for EOF. See:
* line 299 at InputStreamReader.java (in refill()): http://www.koders.com/java/fidD8F7E2EB1E4C22DA90EBE0130306AE30F876AB00.aspx?s=refill#L279
* line 355 at BufferedInputStream.java (in refill()): http://www.koders.com/java/fid1949641524FAC0083432D79793F554CD85F46759.aspx?s=refill#L355
* TODO: remove this when the gcj bug is fixed and the affected gcj versions are outdated.
*/
catch(java.io.CharConversionException cce) {
if(freenet.node.Node.checkForGCJCharConversionBug()) /* only ignore the exception on affected libgcj */
x = -1;
else
throw cce;
}
if (x == -1) {
switch (mode) {
case INTEXT :
if(textAllowed) {
saveText(b, currentTag, w, this);
} else {
if(!b.toString().trim().equals(""))
throwFilterException(l10n("textBeforeHTML"));
}
break;
case INTAG:
w.write("<!-- truncated page: last tag not unfinished -->");
break;
case INTAGQUOTES:
w.write("<!-- truncated page: deleted unfinished tag: still in quotes -->");
break;
case INTAGSQUOTES:
w.write("<!-- truncated page: deleted unfinished tag: still in single quotes -->");
break;
case INTAGWHITESPACE:
w.write("<!-- truncated page: deleted unfinished tag: still in whitespace -->");
break;
case INTAGCOMMENT:
w.write("<!-- truncated page: deleted unfinished comment -->");
break;
case INTAGCOMMENTCLOSING:
w.write("<!-- truncated page: deleted unfinished comment, might be closing -->");
break;
default:
// Dump unfinished tag
break;
}
break;
} else {
pprevC = prevC;
prevC = c;
c = (char) x;
if(c == 0xFEFF) {
if(firstChar) {
// BOM
if(w != null)
w.write(c);
} else {
// Null character (zero width non breaking space). Get rid.
}
continue;
}
if(c == 0) {
// Delete nulls. They can cause all sorts of problems and also can result from messing around with charsets.
continue;
}
firstChar = false;
switch (mode) {
case INTEXT :
if (c == '<') {
if(textAllowed) {
saveText(b, currentTag, w, this);
} else {
if(!b.toString().trim().equals(""))
throwFilterException(l10n("textBeforeHTML"));
}
b.setLength(0);
balt.setLength(0);
mode = INTAG;
} else {
b.append(c);
}
break;
case INTAG :
balt.append(c);
if (HTMLDecoder.isWhitespace(c)) {
splitTag.add(b.toString());
mode = INTAGWHITESPACE;
b.setLength(0);
} else if ((c == '<') && Character.isWhitespace(balt.charAt(0))) {
// Previous was an un-escaped < in a script.
if(textAllowed) {
saveText(b, currentTag, w, this);
} else {
if(!b.toString().trim().equals(""))
throwFilterException(l10n("textBeforeHTML"));
}
balt.setLength(0);
b.setLength(0);
splitTag.clear();
} else if (c == '>') {
splitTag.add(b.toString());
b.setLength(0);
String s = processTag(splitTag, w, this);
currentTag = s;
splitTag.clear();
balt.setLength(0);
mode = INTEXT;
if(s != null && (allowNoHTMLTag || (s.equals("html") || (!isXHTML) && s.equalsIgnoreCase("html"))))
textAllowed = true;
} else if (
(b.length() == 2)
&& (c == '-')
&& (prevC == '-')
&& (pprevC == '!')) {
mode = INTAGCOMMENT;
b.append(c);
} else if (c == '"') {
mode = INTAGQUOTES;
b.append(c);
} else if (c == '\'') {
mode = INTAGSQUOTES;
b.append(c);
} else if (c == '/') { /* Probable end tag */
currentTag = null; /* We didn't remember what was the last tag, so ... */
b.append(c);
} else {
b.append(c);
}
break;
case INTAGQUOTES :
// Inside double-quotes, single quotes are just another character, perfectly legal in a URL.
if (c == '"') {
mode = INTAG;
b.append(c); // Part of the element
} else if (c == '>') {
b.append(">");
} else if (c == '<') {
b.append("<");
// } else if (c=='&') {
// b.append("&");
} else if (c== '\u00A0') {
b.append(" ");
}
else {
b.append(c);
}
break;
case INTAGSQUOTES :
if (c == '\'') {
mode = INTAG;
b.append(c); // Part of the element
} else if (c == '<') {
b.append("<");
} else if (c == '>') {
b.append(">");
// }else if (c=='&') {
// b.append("&");
} else if (c== '\u00A0') {
b.append(" ");
}
else {
b.append(c);
}
break;
/*
* Comments are often used to temporarily disable
* markup; I shall allow it. (avian) White space is
* not permitted between the markup declaration
* open delimiter ("
* <!") and the comment open delimiter ("--"), but
* is permitted between the comment close delimiter
* ("--") and the markup declaration close
* delimiter (">"). A common error is to include a
* string of hyphens ("---") within a comment.
* Authors should avoid putting two or more
* adjacent hyphens inside comments. However, the
* only browser that actually gets it right is IE
* (others either don't allow it or allow other
* chars as well). The only safe course of action
* is to allow any and all chars, but eat them.
* (avian)
*/
case INTAGCOMMENT :
if ((b.length() >= 4) && (c == '-') && (prevC == '-')) {
b.append(c);
mode = INTAGCOMMENTCLOSING;
} else
b.append(c);
break;
case INTAGCOMMENTCLOSING :
if (c == '>') {
saveComment(b, w, this);
b.setLength(0);
mode = INTEXT;
} else {
b.append(c);
if(c != '-')
mode = INTAGCOMMENT;
}
break;
case INTAGWHITESPACE :
if (c == '"') {
mode = INTAGQUOTES;
b.append(c);
} else if (c == '\'') {
// e.g. <div align = 'center'> (avian)
// This will be converted automatically to double quotes \"
// Note that SINGLE QUOTES ARE LEGAL IN URLS ...
// If we have single quotes inside single quotes, we could get into a major mess here... but that's really malformed code, and it will still be safe, it will just be unreadable.
mode = INTAGSQUOTES;
b.append(c);
} else if (c == '>') {
if (!killTag)
currentTag = processTag(splitTag, w, this);
else
currentTag = null;
killTag = false;
splitTag.clear();
b.setLength(0);
balt.setLength(0);
mode = INTEXT;
if(currentTag != null && (allowNoHTMLTag || (currentTag.equals("html") || (!isXHTML) && currentTag.equalsIgnoreCase("html"))))
textAllowed = true;
} else if ((c == '<') && Character.isWhitespace(balt.charAt(0))) {
// Previous was an un-escaped < in a script.
if(textAllowed) {
saveText(b, currentTag, w, this);
} else {
if(!b.toString().trim().equals(""))
throwFilterException(l10n("textBeforeHTML"));
}
balt.setLength(0);
b.setLength(0);
splitTag.clear();
mode = INTAG;
} else if (HTMLDecoder.isWhitespace(c)) {
// More whitespace, what fun
} else {
mode = INTAG;
b.append(c);
}
}
}
}
/**While detecting the charset, if head is not closed inside
* the interval which we are examining, something is wrong, and it's
* possible that the file has been given a freakishly large head, so
* that we'll miss a charset declaration.*/
if(onlyDetectingCharset && openElements.contains("head")) {
throw new MalformedInputException(1024*64);
}
//Writing the remaining tags for XHTML if any
if(getisXHTML())
{
while(openElements.size()>0)
w.write("</"+openElements.pop()+">");
}
w.flush();
return;
}
int mode;
static final int INTEXT = 0;
static final int INTAG = 1;
static final int INTAGQUOTES = 2;
static final int INTAGSQUOTES = 3;
static final int INTAGCOMMENT = 4;
static final int INTAGCOMMENTCLOSING = 5;
static final int INTAGWHITESPACE = 6;
boolean killTag = false; // just this one
boolean writeStyleScriptWithTag = false; // just this one
boolean expectingBadComment = false;
// has to be set on or off explicitly by tags
boolean inStyle = false; // has to be set on or off explicitly by tags
boolean inScript = false; // has to be set on or off explicitly by tags
boolean killText = false; // has to be set on or off explicitly by tags
boolean killStyle = false;
int styleScriptRecurseCount = 0;
String currentStyleScriptChunk = "";
StringBuilder writeAfterTag = new StringBuilder(1024);
public void closeXHTMLTag(String element, Writer w) throws IOException {
// Assume that missing closes are way more common than extra closes.
if(openElements.isEmpty()) return;
if(element.equals(openElements.peek())) {
w.write("</"+openElements.pop()+">");
}
else {
if(openElements.contains(element)) {
while(true) {
String top = openElements.pop();
w.write("</"+top+">");
if(top.equals(element)) return;
}
} // Else it has already been closed.
}
}
}
void saveText(StringBuilder s, String tagName, Writer w, HTMLParseContext pc)
throws IOException {
if(pc.onlyDetectingCharset) return;
if(logDEBUG) Logger.debug(this, "Saving text: "+s.toString());
if (pc.killText) {
return;
}
StringBuilder out = new StringBuilder(s.length()*2);
for(int i=0;i<s.length();i++) {
char c = s.charAt(i);
if(c == '<' && !(pc.inStyle || pc.inScript)) {
//Scripts and styles parsed elsewhere
out.append("<");
}
else if((c < 32) && (c != '\t') && (c != '\n') && (c != '\r')) {
// Not a real character
// STRONGLY suggests somebody is using a bogus charset.
// This could be in order to break the filter.
if(logDEBUG) Logger.debug(this, "Removing '"+c+"' from the output stream");
continue;
}
else {
out.append(c);
}
}
String sout = out.toString();
if (pc.inStyle || pc.inScript) {
pc.currentStyleScriptChunk += sout;
return; // is parsed and written elsewhere
}
if(pc.cb != null)
pc.cb.onText(HTMLDecoder.decode(sout), tagName); /* Tag name is given as type for the text */
w.write(sout);
}
String processTag(List<String> splitTag, Writer w, HTMLParseContext pc)
throws IOException, DataFilterException {
// First, check that it is a recognized tag
if(logDEBUG) {
for(int i=0;i<splitTag.size();i++)
Logger.debug(this, "Tag["+i+"]="+splitTag.get(i));
}
ParsedTag t = new ParsedTag(splitTag);
if (!pc.killTag) {
t = t.sanitize(pc);
if (t != null) {
// We have to check whether <head> exists etc even if we are just checking the charset.
// This enables us to quit when we see </head>.
//We need to make sure that <head> is present in the document. If it is not, then GWT javascript won't get loaded.
//To achieve this, we keep track whether we processed the <head>
if(t.element.compareTo("head")==0 && !t.startSlash){
pc.wasHeadElementFound=true;
} else if(t.element.compareTo("head")==0 && t.startSlash) {
pc.headEnded = true;
if(pc.onlyDetectingCharset) pc.failedDetectCharset = true;
//If we found a <title> or a <meta> without a <head>, then we need to add them to a <head>
}else if((t.element.compareTo("meta")==0 || t.element.compareTo("title")==0) && pc.wasHeadElementFound==false){
pc.openElements.push("head");
pc.wasHeadElementFound=true;
String headContent=pc.cb.processTag(new ParsedTag("head", new HashMap<String, String>()));
if(headContent!=null && !pc.onlyDetectingCharset){
w.write(headContent);
}
}else if((t.element.compareTo("meta")==0 || t.element.compareTo("title")==0) && pc.headEnded){
throwFilterException(l10n("metaOutsideHead"));
//If we found a <body> and haven't closed <head> already, then we do
}else if(t.element.compareTo("body") == 0 && pc.openElements.contains("head")){
if(!pc.onlyDetectingCharset) w.write("</head>");
pc.headEnded = true;
if(pc.onlyDetectingCharset) pc.failedDetectCharset = true;
pc.openElements.pop();
//If we found a <body> and no <head> before it, then we insert it
}else if(t.element.compareTo("body")==0 && pc.wasHeadElementFound==false){
pc.wasHeadElementFound=true;
String headContent=pc.cb.processTag(new ParsedTag("head", new HashMap<String, String>()));
if(headContent!=null){
if(!pc.onlyDetectingCharset) w.write(headContent+"</head>");
pc.headEnded = true;
if(pc.onlyDetectingCharset) pc.failedDetectCharset = true;
}
}
if(!pc.onlyDetectingCharset) {
//If the tag needs replacement, then replace it
String newContent=pc.cb.processTag(t);
if(newContent!=null){
w.write(newContent);
if(t.endSlash==false){
pc.openElements.push(t.element);
}
}else{
if (pc.writeStyleScriptWithTag) {
pc.writeStyleScriptWithTag = false;
String style = pc.currentStyleScriptChunk;
if ((style == null) || (style.length() == 0))
pc.writeAfterTag.append("<!-- "+l10n("deletedUnknownStyle")+" -->");
else
w.write(style);
pc.currentStyleScriptChunk = "";
}
t.write(w,pc);
if (pc.writeAfterTag.length() > 0) {
w.write(pc.writeAfterTag.toString());
pc.writeAfterTag = new StringBuilder(1024);
}
}
} else
pc.writeStyleScriptWithTag = false;
}
if(t == null || t.startSlash || t.endSlash) {
if(!pc.openElements.isEmpty())
return pc.openElements.peek();
if (pc.writeAfterTag.length() > 0) {
w.write(pc.writeAfterTag.toString());
pc.writeAfterTag = new StringBuilder(1024);
}
return null;
} else return t.element;
} else {
pc.killTag = false;
pc.writeStyleScriptWithTag = false;
return null;
}
}
void saveComment(StringBuilder s, Writer w, HTMLParseContext pc)
throws IOException {
if(pc.onlyDetectingCharset) return;
if((s.length() > 3) && (s.charAt(0) == '!') && (s.charAt(1) == '-') && (s.charAt(2) == '-')) {
s.delete(0, 3);
if(s.charAt(s.length()-1) == '-')
s.setLength(s.length()-1);
if(s.charAt(s.length()-1) == '-')
s.setLength(s.length()-1);
}
if(logDEBUG) Logger.debug(this, "Saving comment: "+s.toString());
if (pc.expectingBadComment)
return; // ignore it
if (pc.inStyle || pc.inScript) {
pc.currentStyleScriptChunk += s;
return; // </style> handler should write
}
if (pc.killTag) {
pc.killTag = false;
return;
}
StringBuilder sb = new StringBuilder();
for(int i=0;i<s.length();i++) {
char c = s.charAt(i);
if(c == '<') {
sb.append("<");
} else if(c == '>') {
sb.append(">");
} else {
sb.append(c);
}
}
s = sb;
w.write("<!-- ");
w.write(s.toString());
w.write(" -->");
}
static void throwFilterException(String msg) throws DataFilterException {
// FIXME
String longer = l10n("failedToParseLabel");
throw new DataFilterException(longer, longer, msg);
}
public static class ParsedTag {
public final String element;
public final String[] unparsedAttrs;
final boolean startSlash;
final boolean endSlash;
/*
* public ParsedTag(ParsedTag t) { this.element = t.element;
* this.unparsedAttrs = (String[]) t.unparsedAttrs.clone();
* this.startSlash = t.startSlash; this.endSlash = t.endSlash; }
*/
public ParsedTag(String elementName,Map<String,String> attributes){
this.element=elementName;
startSlash=false;
endSlash=true;
String[] attrs=new String[attributes.size()];
int pos=0;
for(Entry<String,String> entry:attributes.entrySet()){
attrs[pos++]=entry.getKey()+"=\""+entry.getValue()+"\"";
}
this.unparsedAttrs = attrs;
}
public ParsedTag(ParsedTag t, String[] outAttrs) {
this.element = t.element;
this.unparsedAttrs = outAttrs;
this.startSlash = t.startSlash;
this.endSlash = t.endSlash;
}
public ParsedTag(ParsedTag t, Map<String,String> attributes){
String[] attrs=new String[attributes.size()];
int pos=0;
for(Entry<String,String> entry:attributes.entrySet()){
attrs[pos++]=entry.getKey()+"=\""+entry.getValue()+"\"";
}
this.element = t.element;
this.unparsedAttrs = attrs;
this.startSlash = t.startSlash;
this.endSlash = t.endSlash;
}
public ParsedTag(List<String> v) {
int len = v.size();
if (len == 0) {
element = null;
unparsedAttrs = new String[0];
startSlash = endSlash = false;
return;
}
String s = v.get(len - 1);
if (((len - 1 != 0) || (s.length() > 1)) && s.endsWith("/")) {
s = s.substring(0, s.length() - 1);
v.set(len - 1, s);
if (s.length() == 0)
len--;
endSlash = true;
// Don't need to set it back because everything is an I-value
} else endSlash = false;
s = v.get(0);
if ((s.length() > 1) && s.startsWith("/")) {
s = s.substring(1);
v.set(0, s);
startSlash = true;
} else startSlash = false;
element = v.get(0);
if (len > 1) {
unparsedAttrs = new String[len - 1];
for (int x = 1; x < len; x++)
unparsedAttrs[x - 1] = v.get(x);
} else
unparsedAttrs = new String[0];
if(logDEBUG) Logger.debug(this, "Element = "+element);
}
public ParsedTag sanitize(HTMLParseContext pc) throws DataFilterException {
TagVerifier tv =
allowedTagsVerifiers.get(element.toLowerCase());
if(logDEBUG) Logger.debug(this, "Got verifier: "+tv+" for "+element);
if (tv == null) {
if (deleteWierdStuff) {
return null;
} else {
String err = "<!-- "+HTMLEncoder.encode(l10n("unknownTag", "tag", element))+ " -->";
if (!deleteErrors)
throwFilterException(l10n("unknownTagLabel") + ' ' + err);
return null;
}
}
return tv.sanitize(this, pc);
}
@Override
public String toString() {
if (element == null)
return "";
StringBuilder sb = new StringBuilder("<");
if (startSlash)
sb.append('/');
sb.append(element);
if (unparsedAttrs != null) {
int n = unparsedAttrs.length;
for (int i = 0; i < n; i++) {
sb.append(' ').append(unparsedAttrs[i]);
}
}
if (endSlash)
sb.append(" /");
sb.append('>');
return sb.toString();
}
public Map<String,String> getAttributesAsMap(){
Map<String,String> map=new HashMap<String, String>();
for(String attr: unparsedAttrs) {
String name=attr.substring(0,attr.indexOf('='));
String value=attr.substring(attr.indexOf('=')+2,attr.length()-1);
map.put(name, value);
}
return map;
}
public void htmlwrite(Writer w,HTMLParseContext pc) throws IOException {
String s = toString();
if(pc.getisXHTML())
{
if(ElementInfo.isVoidElement(element) && s.charAt(s.length()-2)!='/')
{
s=s.substring(0,s.length()-1)+" />";
}
}
if (s != null) {
w.write(s);
}
}
public void write(Writer w,HTMLParseContext pc) throws IOException {
if(!startSlash)
{
if(ElementInfo.tryAutoClose(element) && element.equals(pc.peekTopElement()))
pc.closeXHTMLTag(element, w);
if(pc.getisXHTML() && !ElementInfo.isVoidElement(element))
pc.pushElementInStack(element);
htmlwrite(w,pc);
}
else
{
if(pc.getisXHTML())
{
pc.closeXHTMLTag(element, w);
}
else
{
htmlwrite(w,pc);
}
}
}
}
public static Set<String> getAllowedHTMLTags() {
return Collections.unmodifiableSet(allowedHTMLTags);
}
private static final Set<String> allowedHTMLTags = new HashSet<String>();
static final Map<String, TagVerifier> allowedTagsVerifiers =
Collections.unmodifiableMap(getAllowedTagVerifiers());
private static final String[] emptyStringArray = new String[0];
private static Map<String, TagVerifier> getAllowedTagVerifiers()
{
Map<String, TagVerifier> allowedTagsVerifiers = new HashMap<String, TagVerifier>();
allowedTagsVerifiers.put("?xml", new XmlTagVerifier());
allowedTagsVerifiers.put(
"!doctype",
new DocTypeTagVerifier("!doctype"));
allowedTagsVerifiers.put("html", new HtmlTagVerifier());
allowedTagsVerifiers.put(
"head",
new TagVerifier(
"head",
new String[] { "id" },
// Don't support profiles.
// We don't know what format they might be in, whether they will be parsed even though they have bogus MIME types (which seems likely), etc.
new String[] { /*"profile"*/ },
null));
allowedTagsVerifiers.put(
"title",
new TagVerifier("title", new String[] { "id" }));
allowedTagsVerifiers.put("meta", new MetaTagVerifier());
allowedTagsVerifiers.put(
"body",
new CoreTagVerifier(
"body",
new String[] { "bgcolor", "text", "link", "vlink", "alink" },
null,
new String[] { "background" },
new String[] { "onload", "onunload" }));
String[] group =
{ "div", "h1", "h2", "h3", "h4", "h5", "h6", "p", "caption" };
for (String x: group)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
new String[] { "align" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
String[] group2 =
{
"span",
"address",
"em",
"strong",
"dfn",
"code",
"samp",
"kbd",
"var",
"cite",
"abbr",
"acronym",
"sub",
"sup",
"dt",
"dd",
"tt",
"i",
"b",
"big",
"small",
"strike",
"s",
"u",
"noframes",
"fieldset",
// Delete <noscript> / </noscript>. So we can at least see the non-scripting code.
// "noscript",
"xmp",
"listing",
"plaintext",
"center",
"bdo",
"aside",
"header",
"nav",
"footer",
"article",
"section",
"hgroup"};
for (String x: group2)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
emptyStringArray,
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"blockquote",
new CoreTagVerifier(
"blockquote",
emptyStringArray,
new String[] { "cite" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"q",
new CoreTagVerifier(
"q",
emptyStringArray,
new String[] { "cite" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"br",
new BaseCoreTagVerifier(
"br",
new String[] { "clear" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"pre",
new CoreTagVerifier(
"pre",
new String[] { "width", "xml:space" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"ins",
new CoreTagVerifier(
"ins",
new String[] { "datetime" },
new String[] { "cite" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"del",
new CoreTagVerifier(
"del",
new String[] { "datetime" },
new String[] { "cite" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"ul",
new CoreTagVerifier(
"ul",
new String[] { "type", "compact" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"ol",
new CoreTagVerifier(
"ol",
new String[] { "type", "compact", "start" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"li",
new CoreTagVerifier(
"li",
new String[] { "type", "value" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"dl",
new CoreTagVerifier(
"dl",
new String[] { "compact" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"dir",
new CoreTagVerifier(
"dir",
new String[] { "compact" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"menu",
new CoreTagVerifier(
"menu",
new String[] { "compact" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"table",
new CoreTagVerifier(
"table",
new String[] {
"summary",
"width",
"border",
"frame",
"rules",
"cellspacing",
"cellpadding",
"align",
"bgcolor" },
emptyStringArray,
new String[] { "background" },
emptyStringArray));
allowedTagsVerifiers.put(
"thead",
new CoreTagVerifier(
"thead",
new String[] { "align", "char", "charoff", "valign" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"tfoot",
new CoreTagVerifier(
"tfoot",
new String[] { "align", "char", "charoff", "valign" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"tbody",
new CoreTagVerifier(
"tbody",
new String[] { "align", "char", "charoff", "valign" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"colgroup",
new CoreTagVerifier(
"colgroup",
new String[] {
"span",
"width",
"align",
"char",
"charoff",
"valign" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"col",
new CoreTagVerifier(
"col",
new String[] {
"span",
"width",
"align",
"char",
"charoff",
"valign" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"tr",
new CoreTagVerifier(
"tr",
new String[] {
"align",
"char",
"charoff",
"valign",
"bgcolor" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"th",
new CoreTagVerifier(
"th",
new String[] {
"abbr",
"axis",
"headers",
"scope",
"rowspan",
"colspan",
"align",
"char",
"charoff",
"valign",
"nowrap",
"bgcolor",
"width",
"height" },
emptyStringArray,
new String[] { "background" },
emptyStringArray));
allowedTagsVerifiers.put(
"td",
new CoreTagVerifier(
"td",
new String[] {
"abbr",
"axis",
"headers",
"scope",
"rowspan",
"colspan",
"align",
"char",
"charoff",
"valign",
"nowrap",
"bgcolor",
"width",
"height" },
emptyStringArray,
new String[] { "background" },
emptyStringArray));
allowedTagsVerifiers.put(
"a",
new LinkTagVerifier(
"a",
new String[] {
"accesskey",
"tabindex",
"name",
"shape",
"coords",
"target" },
emptyStringArray,
emptyStringArray,
new String[] { "onfocus", "onblur" }));
allowedTagsVerifiers.put(
"link",
new LinkTagVerifier(
"link",
new String[] { "media", "target" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"base",
new BaseHrefTagVerifier(
"base",
new String[] { "id", "target" },
new String[] { /* explicitly sanitized by class */ }));
allowedTagsVerifiers.put(
"img",
new CoreTagVerifier(
"img",
new String[] {
"alt",
"name",
"height",
"width",
"ismap",
"align",
"border",
"hspace",
"vspace" },
new String[] { "longdesc", "usemap" },
new String[] { "src" },
emptyStringArray));
// FIXME: object tag -
// http://www.w3.org/TR/html4/struct/objects.html#h-13.3
// FIXME: param tag -
// http://www.w3.org/TR/html4/struct/objects.html#h-13.3.2
// applet tag PROHIBITED - we do not support applets (FIXME?)
allowedTagsVerifiers.put(
"map",
new CoreTagVerifier(
"map",
new String[] { "name" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"area",
new CoreTagVerifier(
"area",
new String[] {
"accesskey",
"tabindex",
"shape",
"coords",
"nohref",
"alt",
"target" },
new String[] { "href" },
emptyStringArray,
new String[] { "onfocus", "onblur" }));
allowedTagsVerifiers.put(
"audio", // currently just minimal support
new MediaTagVerifier(
"audio",
new String[] { // allowed tags
"preload",
"controls"},
emptyStringArray, // uris
new String[] { "src" }, // inline uris
emptyStringArray));
allowedTagsVerifiers.put("style", new StyleTagVerifier());
allowedTagsVerifiers.put(
"font",
new BaseCoreTagVerifier(
"font",
new String[] { "size", "color", "face" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"basefont",
new BaseCoreTagVerifier(
"basefont",
new String[] { "size", "color", "face" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"hr",
new CoreTagVerifier(
"hr",
new String[] { "align", "noshade", "size", "width" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"frameset",
new CoreTagVerifier(
"frameset",
new String[] { "rows", "cols" },
emptyStringArray,
emptyStringArray,
new String[] { "onload", "onunload" },
false));
allowedTagsVerifiers.put(
"frame",
new BaseCoreTagVerifier(
"frame",
new String[] {
"name",
"frameborder",
"marginwidth",
"marginheight",
"noresize",
"scrolling" },
new String[] { "longdesc" },
new String[] { "src" }));
allowedTagsVerifiers.put(
"iframe",
new BaseCoreTagVerifier(
"iframe",
new String[] {
"name",
"frameborder",
"marginwidth",
"marginheight",
"scrolling",
"align",
"height",
"width" },
new String[] { "longdesc"},
new String[] { "src" }));
allowedTagsVerifiers.put(
"form",
new FormTagVerifier(
"form",
new String[] {
"name" }, // FIXME add a whitelist filter for accept
// All other attributes are handled by FormTagVerifier.
new String[] { },
new String[] { "onsubmit", "onreset" }));
allowedTagsVerifiers.put(
"input",
new InputTagVerifier(
"input",
new String[] {
"accesskey",
"tabindex",
"type",
"name",
"value",
"checked",
"disabled",
"readonly",
"size",
"maxlength",
"alt",
"ismap",
"accept",
"align" },
new String[] { "usemap" },
new String[] { "src" },
new String[] { "onfocus", "onblur", "onselect", "onchange" }));
allowedTagsVerifiers.put(
"button",
new CoreTagVerifier(
"button",
new String[] {
"accesskey",
"tabindex",
"name",
"value",
"type",
"disabled" },
emptyStringArray,
emptyStringArray,
new String[] { "onfocus", "onblur" }));
allowedTagsVerifiers.put(
"select",
new CoreTagVerifier(
"select",
new String[] {
"name",
"size",
"multiple",
"disabled",
"tabindex" },
emptyStringArray,
emptyStringArray,
new String[] { "onfocus", "onblur", "onchange" }));
allowedTagsVerifiers.put(
"optgroup",
new CoreTagVerifier(
"optgroup",
new String[] { "disabled", "label" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"option",
new CoreTagVerifier(
"option",
new String[] { "selected", "disabled", "label", "value" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"textarea",
new CoreTagVerifier(
"textarea",
new String[] {
"accesskey",
"tabindex",
"name",
"rows",
"cols",
"disabled",
"readonly" },
emptyStringArray,
emptyStringArray,
new String[] { "onfocus", "onblur", "onselect", "onchange" }));
allowedTagsVerifiers.put(
"isindex",
new BaseCoreTagVerifier(
"isindex",
new String[] { "prompt" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"label",
new CoreTagVerifier(
"label",
new String[] { "for", "accesskey" },
emptyStringArray,
emptyStringArray,
new String[] { "onfocus", "onblur" }));
allowedTagsVerifiers.put(
"legend",
new CoreTagVerifier(
"legend",
new String[] { "accesskey", "align" },
emptyStringArray,
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put("script", new ScriptTagVerifier());
/* MathML 3.0 support for presentation markup, deprecated attributes
* not included so don't try using them. xref not supported as it is
* mainly used to link presentation and content in parallel markup.
*
* Content markup not supported as it is larger and presumably not
* used that much, and **HAS SECURITY ISSUES**: Content markup uses
* Content Dictionaries, which by default are loaded from a default
* URL on the web.
* See attributes: cdgroup, definitionURL, cd.
* Elements: csymbol, annotation, annotation-xml. */
allowedTagsVerifiers.put(
"math",
new CoreTagVerifier(
"math",
new String[] {
"accent",
"accentunder",
"align",
"alignmentscope",
"altimg-height",
"altimg-valign",
"altimg-width",
"alttext",
"bevelled",
"charalign",
"charspacing",
"close",
"columnalign",
"columnlines",
"columnspacing",
"columnspan",
"columnwidth",
"crossout",
"decimalpoint",
"depth",
"denomalign",
"dir",
"display",
"displaystyle",
"edge",
"equalcolumns",
"equalrows",
"fence",
"form",
"frame",
"framespacing",
"groupalign",
"height",
"indentalign",
"indentalignfirst",
"indentalignlast",
"indentshift",
"indentshiftfirst",
"indentshiftlast",
"indenttarget",
"infixlinebreakstyle",
"largeop",
"leftoverhang",
"length",
"linebreak",
"linebreakmultchar",
"linebreakstyle",
"lineleading",
"location",
"lquote",
"lspace",
"linethickness",
"longdivstyle",
"mathbackground",
"mathcolor",
"mathsize",
"mathvariant",
"maxsize",
"maxwidth",
"minlabelspacing",
"minsize",
"movablelimits",
"mslinethickness",
"notation",
"numalign",
"open",
"overflow",
"position",
"rightoverhang",
"rowalign",
"rowlines",
"rowspacing",
"rowspan",
"rquote",
"rspace",
"scriptlevel",
"scriptminsize",
"scriptsizemultiplier",
"separator",
"separators",
"shift",
"side",
"stackalign",
"stretchy",
"subscriptshift",
"superscriptshift",
"symmetric",
"voffset",
"width" },
new String[] { "href" },
new String[] { "altimg" },
emptyStringArray));
//MathML Presentation tags follow
String[] mathmlempty =
{
"mprescripts",
"none"};
for (String x: mathmlempty)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
emptyStringArray,
emptyStringArray,
emptyStringArray,
emptyStringArray));
String[] mathmlpresent =
{
"merror",
"mphantom",
"mroot",
"msqrt"};
for (String x: mathmlpresent)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
new String[] { "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"msub",
new CoreTagVerifier(
"msub",
new String[] { "mathbackground", "mathcolor", "subscriptshift" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"msup",
new CoreTagVerifier(
"msup",
new String[] { "mathbackground", "mathcolor", "superscriptshift" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
String[] mathmlscripts =
{
"msubsup",
"mmultiscripts"};
for (String x: mathmlscripts)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
new String[] { "mathbackground", "mathcolor", "subscriptshift", "superscriptshift" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"msrow",
new CoreTagVerifier(
"msrow",
new String[] { "mathbackground", "mathcolor", "position" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"msgroup",
new CoreTagVerifier(
"msgroup",
new String[] { "mathbackground", "mathcolor", "position", "shift" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"menclose",
new CoreTagVerifier(
"menclose",
new String[] { "mathbackground", "mathcolor", "notation" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"msline",
new CoreTagVerifier(
"msline",
new String[] { "leftoverhang", "length", "mathbackground", "mathcolor", "mslinethickness", "position", "rightoverhang" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"maligngroup",
new CoreTagVerifier(
"maligngroup",
new String[] { "groupalign", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"malignmark",
new CoreTagVerifier(
"malignmark",
new String[] { "edge", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mrow",
new CoreTagVerifier(
"mrow",
new String[] { "dir", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
String[] mathmlitem =
{
"mi",
"mn",
"mtext"};
for (String x: mathmlitem)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
new String[] { "dir", "mathbackground", "mathcolor", "mathsize", "mathvariant" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"ms",
new CoreTagVerifier(
"ms",
new String[] { "dir", "lquote", "mathbackground", "mathcolor", "mathsize", "mathvariant", "rquote" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mpadded",
new CoreTagVerifier(
"mpadded",
new String[] { "depth", "height", "lspace", "mathbackground", "mathcolor", "voffset", "width" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mspace",
new CoreTagVerifier(
"mspace",
new String[] {
"depth",
"dir",
"height",
"indentalign",
"indentalignfirst",
"indentalignlast",
"indentshift",
"indentshiftfirst",
"indentshiftlast",
"indenttarget",
"linebreak",
"mathbackground",
"mathcolor",
"mathsize",
"mathvariant",
"width" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mscarry",
new CoreTagVerifier(
"mscarry",
new String[] { "crossout", "location", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mscarries",
new CoreTagVerifier(
"mscarries",
new String[] { "crossout", "location", "mathbackground", "mathcolor", "position", "scriptsizemultiplier" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
String[] mathmltr =
{
"mtr",
"mlabeledtr"};
for (String x: mathmltr)
allowedTagsVerifiers.put(
x,
new CoreTagVerifier(
x,
new String[] { "columnalign", "groupalign", "mathbackground", "mathcolor", "rowalign" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mtd",
new CoreTagVerifier(
"mtd",
new String[] { "columnalign", "columnspan", "groupalign", "mathbackground", "mathcolor", "rowalign", "rowspan" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mfenced",
new CoreTagVerifier(
"mfenced",
new String[] { "close", "mathbackground", "mathcolor", "open", "separators" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mfrac",
new CoreTagVerifier(
"mfrac",
new String[] { "bevelled", "denomalign", "linethickness", "mathbackground", "mathcolor", "numalign" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mglyph",
new CoreTagVerifier(
"mglyph",
new String[] { "alt", "height", "mathbackground", "mathcolor", "valign", "width" },
new String[] { "href" },
new String[] { "src" },
emptyStringArray));
allowedTagsVerifiers.put(
"mstack",
new CoreTagVerifier(
"mstack",
new String[] { "align", "charalign", "charspacing", "mathbackground", "mathcolor", "stackalign" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mlongdiv",
new CoreTagVerifier(
"mlongdiv",
new String[] { "align", "charalign", "charspacing", "longdivstyle", "mathbackground", "mathcolor", "stackalign" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mtable",
new CoreTagVerifier(
"mtable",
new String[] {
"align",
"alignmentscope",
"columnalign",
"columnlines",
"columnspacing",
"columnwidth",
"displaystyle",
"equalcolumns",
"equalrows",
"frame",
"framespacing",
"groupalign",
"mathbackground",
"mathcolor",
"minlabelspacing",
"rowalign",
"rowlines",
"rowspacing",
"side",
"width" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"munder",
new CoreTagVerifier(
"munder",
new String[] { "accentunder", "align", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mo",
new CoreTagVerifier(
"mo",
new String[] {
"accent",
"dir",
"fence",
"form",
"indentalign",
"indentalignfirst",
"indentalignlast",
"indentshift",
"indentshiftfirst",
"indentshiftlast",
"indenttarget",
"largeop",
"linebreak",
"linebreakmultchar",
"linebreakstyle",
"lineleading",
"lspace",
"mathbackground",
"mathcolor",
"mathsize",
"mathvariant",
"maxsize",
"minsize",
"movablelimits",
"rspace",
"separator",
"stretchy",
"symmetric" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mover",
new CoreTagVerifier(
"mover",
new String[] { "accent", "align", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"munderover",
new CoreTagVerifier(
"munderover",
new String[] { "accent", "accentunder", "align", "mathbackground", "mathcolor" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
allowedTagsVerifiers.put(
"mstyle",
new CoreTagVerifier(
"mstyle",
new String[] {
"accent",
"accentunder",
"align",
"alignmentscope",
"bevelled",
"charalign",
"charspacing",
"close",
"columnalign",
"columnlines",
"columnspacing",
"columnspan",
"columnwidth",
"crossout",
"decimalpoint",
"depth",
"denomalign",
"dir",
"displaystyle",
"edge",
"equalcolumns",
"equalrows",
"fence",
"form",
"frame",
"framespacing",
"groupalign",
"height",
"indentalign",
"indentalignfirst",
"indentalignlast",
"indentshift",
"indentshiftfirst",
"indentshiftlast",
"indenttarget",
"infixlinebreakstyle",
"largeop",
"leftoverhang",
"length",
"linebreak",
"linebreakmultchar",
"linebreakstyle",
"lineleading",
"location",
"lquote",
"lspace",
"linethickness",
"longdivstyle",
"mathbackground",
"mathcolor",
"mathsize",
"mathvariant",
"maxsize",
"minlabelspacing",
"minsize",
"movablelimits",
"mslinethickness",
"notation",
"numalign",
"open",
"position",
"rightoverhang",
"rowalign",
"rowlines",
"rowspacing",
"rowspan",
"rquote",
"rspace",
"scriptlevel",
"scriptminsize",
"scriptsizemultiplier",
"separator",
"separators",
"shift",
"side",
"stackalign",
"stretchy",
"subscriptshift",
"superscriptshift",
"symmetric",
"voffset",
"width" },
new String[] { "href" },
emptyStringArray,
emptyStringArray));
// <maction> would go here though it seems a bit pointless and may require extra filtering
// MathML content tags would go here if anyone used them
return allowedTagsVerifiers;
}
static class TagVerifier {
private final String tag;
//Attributes which need no sanitation
private final HashSet<String> allowedAttrs;
//Attributes which will be sanitized by child classes
protected final HashSet<String> parsedAttrs;
private final HashSet<String> uriAttrs;
private final HashSet<String> inlineURIAttrs;
TagVerifier(String tag, String[] allowedAttrs) {
this(tag, allowedAttrs, null, null);
}
TagVerifier(String tag, String[] allowedAttrs, String[] uriAttrs, String[] inlineURIAttrs) {
this.tag = tag;
this.allowedAttrs = new HashSet<String>();
this.parsedAttrs = new HashSet<String>();
if (allowedAttrs != null) {
for (String allowedAttr: allowedAttrs)
this.allowedAttrs.add(allowedAttr);
}
this.uriAttrs = new HashSet<String>();
if (uriAttrs != null) {
for (String uriAttr: uriAttrs)
this.uriAttrs.add(uriAttr);
}
this.inlineURIAttrs = new HashSet<String>();
if (inlineURIAttrs != null) {
for (String inlineURIAttr: inlineURIAttrs)
this.inlineURIAttrs.add(inlineURIAttr);
}
}
ParsedTag sanitize(ParsedTag t, HTMLParseContext pc) throws DataFilterException {
/** Map contains the attributes, in order. The key is always the name
* of the attribute, but the value can be a raw Object if it has no value.
* "src" is different to "src=". Arguably we should probably use null in
* the first case and "" in the second case ... FIXME */
Map<String, Object> h = new LinkedHashMap<String, Object>();
boolean equals = false;
String prevX = "";
if (t.unparsedAttrs != null)
for (String s: t.unparsedAttrs) {
if (equals) {
equals = false;
s = stripQuotes(s);
h.remove(prevX);
h.put(prevX, s);
prevX = "";
} else {
int idx = s.indexOf('=');
if (idx == s.length() - 1) {
equals = true;
if (idx == 0) {
// prevX already set
} else {
prevX = s.substring(0, s.length() - 1);
prevX = prevX.toLowerCase();
}
} else if (idx > -1) {
String x = s.substring(0, idx);
if (x.length() == 0)
x = prevX;
x = x.toLowerCase();
String y;
if (idx == s.length() - 1)
y = "";
else
y = s.substring(idx + 1, s.length());
y = stripQuotes(y);
h.remove(x);
h.put(x, y);
prevX = x;
} else {
h.remove(s);
h.put(s, new Object());
prevX = s;
}
}
}
h = sanitizeHash(h, t, pc);
if (h == null) return null;
//Remove any blank entries
for(Iterator<Entry<String, Object>> it = h.entrySet().iterator(); it.hasNext();){
Map.Entry<String, Object> entry = it.next();
if(entry.getValue() == null || entry.getValue().equals("") && pc.isXHTML){
it.remove();
}
}
//If the tag has no attributes, and this is not allowable, remove it
if(h.isEmpty() && expungeTagIfNoAttributes()) return null;
if (t.startSlash)
return new ParsedTag(t, (String[])null);
String[] outAttrs = new String[h.size()];
int i = 0;
for (Map.Entry<String, Object> entry : h.entrySet()) {
String x = entry.getKey();
Object o = entry.getValue();
String y;
if (o instanceof String)
y = (String) o;
else
y = null;
StringBuilder out = new StringBuilder(x);
if (y != null)
out.append( "=\"" ).append( y ).append( '"' );
outAttrs[i++] = out.toString();
}
return new ParsedTag(t, outAttrs);
}
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = new LinkedHashMap<String, Object>();
for (Map.Entry<String, Object> entry : h.entrySet()) {
if(logDEBUG) Logger.debug(this, "HTML Filter is sanitizing: "+entry.getKey()+" = "+entry.getValue());
String x = entry.getKey();
Object o = entry.getValue();
boolean inline = inlineURIAttrs.contains(x);
//URI attributes require additional processing
if (inline || uriAttrs.contains(x)) {
if(!inline) {
if(logMINOR) Logger.minor(this, "Non-inline URI attribute: "+x);
} else {
if(logMINOR) Logger.minor(this, "Inline URI attribute: "+x);
}
// URI
if (o instanceof String) {
// Java's URL handling doesn't seem suitable
String uri = (String) o;
uri = HTMLDecoder.decode(uri);
uri = htmlSanitizeURI(uri, null, null, null, pc.cb, pc, inline);
if (uri == null) {
continue;
}
uri = HTMLEncoder.encode(uri);
o = uri;
}
// FIXME: rewrite absolute URLs, handle ?date= etc
if(logDEBUG) Logger.debug(this, "HTML Filter is putting "+(inline?"inline":"")+" uri attribute: "+x+" = "+o);
hn.put(x, o);
continue;
}
/*We create a placeholder for each parsed attribute in the
* sanitized output. This ensures the order of the attributes.
* Subclasses will take care of parsing and replacing these values.
* If they don't, we'll remove the placeholder later.*/
if(parsedAttrs.contains(x)) {
hn.put(x, null);
continue;
}
/*If the attribute is to be passed through without sanitation*/
if(allowedAttrs.contains(x)) {
hn.put(x, o);
continue;
}
// lang, xml:lang and dir can go on anything
// lang or xml:lang = language [ "-" country [ "-" variant ] ]
// The variant can be just about anything; no way to test (avian)
if (x.equals("xml:lang") ||x.equals("lang") || (x.equals("dir") && (o instanceof String) && (((String)o).equalsIgnoreCase("ltr") || ((String)o).equalsIgnoreCase("rtl")))) {
if(logDEBUG) Logger.debug(this, "HTML Filter is putting attribute: "+x+" = "+o);
hn.put(x, o);
}
}
return hn;
}
/*If this function returns true, this tag will be removed from
* the sanitized output if it has no attributes*/
protected boolean expungeTagIfNoAttributes() {
return false;
}
}
static String stripQuotes(String s) {
final String quotes = "\"'";
if (s.length() >= 2) {
int n = quotes.length();
for (int x = 0; x < n; x++) {
char cc = quotes.charAt(x);
if ((s.charAt(0) == cc) && (s.charAt(s.length() - 1) == cc)) {
if (s.length() > 2)
s = s.substring(1, s.length() - 1);
else
s = "";
break;
}
}
}
return s;
}
// static String[] titleString = new String[] {"title"};
static abstract class ScriptStyleTagVerifier extends TagVerifier {
ScriptStyleTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs) {
super(tag, allowedAttrs, uriAttrs, null);
}
abstract void setStyle(boolean b, HTMLParseContext pc);
abstract boolean getStyle(HTMLParseContext pc);
abstract void processStyle(HTMLParseContext pc);
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
if (p.startSlash) {
return finish(h, hn, pc);
} else {
return start(h, hn, pc);
}
}
Map<String, Object> finish(Map<String, Object> h, Map<String, Object> hn,
HTMLParseContext pc) throws DataFilterException {
if(logDEBUG) Logger.debug(this, "Finishing script/style");
// Finishing
setStyle(false, pc);
pc.styleScriptRecurseCount--;
if (pc.styleScriptRecurseCount < 0) {
if (deleteErrors)
pc.writeAfterTag.append(
"<!-- " + l10n("tooManyNestedStyleOrScriptTags") + " -->");
else
throwFilterException(l10n("tooManyNestedStyleOrScriptTagsLong"));
return null;
}
if(!pc.killStyle) {
processStyle(pc);
pc.writeStyleScriptWithTag = true;
} else {
pc.killStyle = false;
pc.currentStyleScriptChunk = "";
}
pc.expectingBadComment = false;
// Pass it on, no params for </style>
return hn;
}
Map<String, Object> start(Map<String, Object> h, Map<String, Object> hn, HTMLParseContext pc)
throws DataFilterException {
if(logDEBUG) Logger.debug(this, "Starting script/style");
pc.styleScriptRecurseCount++;
if (pc.styleScriptRecurseCount > 1) {
if (deleteErrors)
pc.writeAfterTag.append("<!-- " + l10n("tooManyNestedStyleOrScriptTags") + " -->");
else
throwFilterException(l10n("tooManyNestedStyleOrScriptTagsLong"));
return null;
}
setStyle(true, pc);
String type = getHashString(h, "type");
if (type != null) {
if (!type.equalsIgnoreCase("text/css") /* FIXME */
) {
pc.killStyle = true;
pc.expectingBadComment = true;
return null; // kill the tag
}
hn.put("type", "text/css");
}
return hn;
}
}
static class StyleTagVerifier extends ScriptStyleTagVerifier {
StyleTagVerifier() {
super(
"style",
new String[] { "id", "media", "title", "xml:space" },
emptyStringArray);
}
@Override
void setStyle(boolean b, HTMLParseContext pc) {
pc.inStyle = b;
}
@Override
boolean getStyle(HTMLParseContext pc) {
return pc.inStyle;
}
@Override
void processStyle(HTMLParseContext pc) {
try {
pc.currentStyleScriptChunk =
sanitizeStyle(pc.currentStyleScriptChunk, pc.cb, pc, false);
} catch (DataFilterException e) {
Logger.error(this, "Error parsing style: "+e, e);
pc.currentStyleScriptChunk = "";
}
}
}
static class ScriptTagVerifier extends ScriptStyleTagVerifier {
ScriptTagVerifier() {
super(
"script",
new String[] {
"id",
"charset",
"type",
"language",
"defer",
"xml:space" },
new String[] { "src" });
/*
* FIXME: src not supported type ignored (we will need to check
* this when if/when we support scripts charset ignored
*/
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> hn, ParsedTag p, HTMLParseContext pc)
throws DataFilterException {
// Call parent so we swallow the scripting
super.sanitizeHash(hn, p, pc);
return null; // Lose the tags
}
@Override
void setStyle(boolean b, HTMLParseContext pc) {
pc.inScript = b;
}
@Override
boolean getStyle(HTMLParseContext pc) {
return pc.inScript;
}
@Override
void processStyle(HTMLParseContext pc) {
pc.currentStyleScriptChunk =
sanitizeScripting(pc.currentStyleScriptChunk);
}
}
static class BaseCoreTagVerifier extends TagVerifier {
private static final String[] locallyVerifiedAttrs = new String[] {
"id",
"class",
"style"
};
BaseCoreTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs) {
super(tag, allowedAttrs, uriAttrs, inlineURIAttrs);
allowedHTMLTags.add(tag);
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
// %i18n dealt with by TagVerifier
// %coreattrs
String id = getHashString(h, "id");
if (id != null) {
hn.put("id", id);
// hopefully nobody will be stupid enough to encode URLs into
// the unique ID... :)
}
String classNames = getHashString(h, "class");
if (classNames != null) {
hn.put("class", classNames);
// ditto
}
String style = getHashString(h, "style");
if (style != null) {
style = sanitizeStyle(style, pc.cb, pc, true);
if (style != null)
style = escapeQuotes(style);
if (style != null)
hn.put("style", style);
}
String title = getHashString(h, "title");
if (title != null) {
// PARANOIA: title is PLAIN TEXT, right? In all user agents? :)
hn.put("title", title);
}
return hn;
}
}
static class CoreTagVerifier extends BaseCoreTagVerifier {
private final HashSet<String> eventAttrs;
private static final String[] stdEvents =
new String[] {
"onclick",
"ondblclick",
"onmousedown",
"onmouseup",
"onmouseover",
"onmousemove",
"onmouseout",
"onkeypress",
"onkeydown",
"onkeyup",
"onload",
"onfocus",
"onblur",
"oncontextmenu",
"onresize",
"onscroll",
"onunload",
"onmouseenter",
"onchange",
"onreset",
"onselect",
"onsubmit",
"onerror",
};
CoreTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs,
String[] eventAttrs) {
this(tag, allowedAttrs, uriAttrs, inlineURIAttrs, eventAttrs, true);
}
CoreTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs,
String[] eventAttrs,
boolean addStdEvents) {
super(tag, allowedAttrs, uriAttrs, inlineURIAttrs);
this.eventAttrs = new HashSet<String>();
if (eventAttrs != null) {
for (String eventAttr: eventAttrs) {
this.eventAttrs.add(eventAttr);
this.parsedAttrs.add(eventAttr);
}
}
if (addStdEvents) {
for (String stdEvent: stdEvents) {
this.eventAttrs.add(stdEvent);
this.parsedAttrs.add(stdEvent);
}
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
// events (default and added)
for (String name: eventAttrs) {
String arg = getHashString(h, name);
if (arg != null) {
arg = sanitizeScripting(arg);
if (arg != null)
hn.put(name, arg);
}
}
return hn;
}
}
static class LinkTagVerifier extends CoreTagVerifier {
private static final String[] locallyVerifiedAttrs = new String[] {
"type",
"charset",
"rel",
"rev",
"media",
"hreflang",
"href"
};
LinkTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs,
String[] eventAttrs) {
super(tag, allowedAttrs, uriAttrs, inlineURIAttrs, eventAttrs);
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
String hreflang = getHashString(h, "hreflang");
String charset = null;
String maybecharset = null;
String type = getHashString(h, "type");
if (type != null) {
String[] typesplit = splitType(type);
type = typesplit[0];
if ((typesplit[1] != null) && (typesplit[1].length() > 0))
charset = typesplit[1];
if(logDEBUG)
Logger.debug(
this,
"Processing link tag, type="
+ type
+ ", charset="
+ charset);
}
String c = getHashString(h, "charset");
if (c != null)
charset = c;
if(charset != null) {
try {
charset = URLDecoder.decode(charset, false);
} catch (URLEncodedFormatException e) {
charset = null;
}
}
if(charset != null && charset.indexOf('&') != -1)
charset = null;
if(charset != null && !Charset.isSupported(charset))
charset = null;
// Is it a style sheet?
// Also, sanitise rel type
// If neither rel nor rev, return null
String rel = getHashString(h, "rel");
String parsedRel = "", parsedRev = "";
boolean isStylesheet = false;
boolean isIcon = false;
if(rel != null) {
rel = rel.toLowerCase();
StringTokenizer tok = new StringTokenizer(rel, " ");
int i=0;
String prevToken = null;
StringBuffer sb = new StringBuffer(rel.length());
while (tok.hasMoreTokens()) {
String token = tok.nextToken();
if(token.equalsIgnoreCase("stylesheet")) {
if(token.equalsIgnoreCase("stylesheet")) {
isStylesheet = true;
if(!((i == 0 || i == 1 && prevToken != null && prevToken.equalsIgnoreCase("alternate"))))
return null;
if(tok.hasMoreTokens())
return null; // Disallow extra tokens after "stylesheet"
}
} else if (token.equalsIgnoreCase("icon")) {
isIcon = true;
} else if(!isStandardLinkType(token)) continue;
i++;
if(sb.length() == 0)
sb.append(token);
else {
sb.append(' ');
sb.append(token);
}
prevToken = token;
}
parsedRel = sb.toString();
}
String rev = getHashString(h, "rev");
if(rev != null) {
StringBuffer sb = new StringBuffer(rev.length());
rev = rev.toLowerCase();
StringTokenizer tok = new StringTokenizer(rev, " ");
sb = new StringBuffer(rev.length());
while (tok.hasMoreTokens()) {
String token = tok.nextToken();
if(!isStandardLinkType(token)) continue;
if(sb.length() == 0)
sb.append(token);
else {
sb.append(' ');
sb.append(token);
}
}
parsedRev = sb.toString();
}
// Allow no rel or rev, even on <link>, as per HTML spec.
if(parsedRel.length() != 0)
hn.put("rel", parsedRel);
if(parsedRev.length() != 0)
hn.put("rev", parsedRev);
if(rel != null) {
if(rel.equals("stylesheet") || rel.equals("alternate stylesheet"))
isStylesheet = true;
} else {
// Not a stylesheet.
if(type != null && type.startsWith("text/css"))
return null; // Not a stylesheet, so can't take a stylesheet type.
}
if(isStylesheet) {
if(charset == null) {
// Browser will use the referring document's charset if there
// is no BOM and we don't specify one in HTTP.
// So we need to pass this information to the filter.
// We cannot force the mime type with the charset, because if
// we do that, we might be wrong - if there is a BOM or @charset
// we want to use that. E.g. chinese pages might have the
// page in GB18030 and the borrowed CSS in ISO-8859-1 or UTF-8.
maybecharset = pc.charset;
}
String media = getHashString(h, "media");
if(media != null)
media = CSSReadFilter.filterMediaList(media);
if(media != null)
hn.put("media", media);
if(type != null && !type.startsWith("text/css"))
return null; // Different style language e.g. XSL, not supported.
type = "text/css";
}
String href = getHashString(h, "href");
if (href != null) {
href = HTMLDecoder.decode(href);
if (isIcon) {
href = htmlSanitizeURI(href, type, null, null, pc.cb, pc, false);
} else {
href = htmlSanitizeURI(href, type, charset, maybecharset, pc.cb, pc, false);
}
if (href != null) {
href = HTMLEncoder.encode(href);
hn.put("href", href);
if (type != null)
hn.put("type", type);
if (charset != null)
hn.put("charset", charset);
if ((charset != null) && (hreflang != null))
hn.put("hreflang", hreflang);
}
}
// FIXME: allow these if the charset and encoding are encoded into
// the URL
return hn;
}
// Does not include stylesheet
private static final HashSet<String> standardRelTypes = new HashSet<String>();
static {
for(String s : new String[] {
"alternate",
"start",
"next",
"prev",
"contents",
"index",
"glossary",
"copyright",
"chapter",
"section",
"subsection",
"appendix",
"help",
"bookmark"
}) standardRelTypes.add(s);
}
private boolean isStandardLinkType(String token) {
return standardRelTypes.contains(token.toLowerCase());
}
}
/** Verify media tags (audio and video). This needs its own
* verifier, because different from images, browsers use content
* sniffing to find out whether to display it as media
* content. Using text/plain as content type would allow
* exploiting this to run unfiltered files as media files. We fix
* this by encoding the mime type into the uri.*/
static class MediaTagVerifier extends CoreTagVerifier {
private static final String[] locallyVerifiedAttrs = new String[] {
"src"
};
MediaTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs,
String[] eventAttrs) {
super(tag, allowedAttrs, uriAttrs, inlineURIAttrs, eventAttrs);
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
String hreflang = getHashString(h, "hreflang");
String charset = null;
String maybecharset = null;
/* TODO: get the type from the filename. Currently we only
* have a filter for mp3, so this is the simplest possible
* solution.*/
String type = "audio/mpeg";
String src = getHashString(h, "src");
if (src != null) {
src = HTMLDecoder.decode(src);
src = htmlSanitizeURI(src, type, null, null, pc.cb, pc, false);
if (src != null) {
src = HTMLEncoder.encode(src);
hn.put("src", src);
}
}
return hn;
}
}
// We do not allow forms to act anywhere else than on /
static class FormTagVerifier extends CoreTagVerifier{
private static final String[] locallyVerifiedAttrs = new String[] {
"method",
"action",
"enctype",
"accept-charset"
};
FormTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] eventAttrs) {
super(tag, allowedAttrs, uriAttrs, null, eventAttrs);
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
if(p.startSlash) {
// Allow, but only with standard elements
return hn;
}
String method = getHashString(h, "method");
String action = getHashString(h, "action");
String finalAction;
try {
finalAction = pc.cb.processForm(method, action);
} catch (CommentException e) {
pc.writeAfterTag.append("<!-- ").append(HTMLEncoder.encode(e.toString())).append(" -->");
return null;
}
if(finalAction == null) return null;
hn.put("method", method);
hn.put("action", finalAction);
// Force enctype and accept-charset to acceptable values.
hn.put("enctype", "multipart/form-data");
hn.put("accept-charset", "UTF-8");
return hn;
}
}
static class InputTagVerifier extends CoreTagVerifier{
private final HashSet<String> allowedTypes;
private String[] types = new String[]{
"text",
"password",
"checkbox",
"radio",
"submit",
"reset",
// no ! file
"hidden",
"image",
"button"
};
InputTagVerifier(
String tag,
String[] allowedAttrs,
String[] uriAttrs,
String[] inlineURIAttrs,
String[] eventAttrs) {
super(tag, allowedAttrs, uriAttrs, inlineURIAttrs, eventAttrs);
this.allowedTypes = new HashSet<String>();
if (types != null) {
for (String type: types) {
this.allowedTypes.add(type);
}
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
// We drop the whole <input> if type isn't allowed
if(!allowedTypes.contains(hn.get("type"))){
return null;
}
return hn;
}
}
static class MetaTagVerifier extends TagVerifier {
private static final String[] allowedContentTypes = ContentFilter.HTML_MIME_TYPES;
private static final String[] locallyVerifiedAttrs = {
"http-equiv",
"name",
"content"
};
MetaTagVerifier() {
super("meta", new String[] { "id" });
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
/*
* Several possibilities: a) meta http-equiv=X content=Y b) meta
* name=X content=Y
*/
String http_equiv = getHashString(h, "http-equiv");
String name = getHashString(h, "name");
String content = getHashString(h, "content");
String scheme = getHashString(h, "scheme");
if(logMINOR) Logger.minor(this, "meta: name="+name+", content="+content+", http-equiv="+http_equiv+", scheme="+scheme);
if (content != null) {
if ((name != null) && (http_equiv == null)) {
if (name.equalsIgnoreCase("Author")) {
hn.put("name", name);
hn.put("content", content);
} else if (name.equalsIgnoreCase("Keywords")) {
hn.put("name", name);
hn.put("content", content);
} else if (name.equalsIgnoreCase("Description")) {
hn.put("name", name);
hn.put("content", content);
}
} else if ((http_equiv != null) && (name == null)) {
if (http_equiv.equalsIgnoreCase("Expires")) {
try {
ToadletContextImpl.parseHTTPDate(content);
hn.put("http-equiv", http_equiv);
hn.put("content", content);
} catch (ParseException e) {
// Delete it.
return null;
}
} else if (
http_equiv.equalsIgnoreCase("Content-Script-Type")) {
// We don't support script at this time.
} else if (
http_equiv.equalsIgnoreCase("Content-Style-Type")) {
// FIXME: charsets
if (content.equalsIgnoreCase("text/css")) {
// FIXME: selectable style languages - only matters
// when we have implemented more than one
// FIXME: if we ever do allow it... the spec
// http://www.w3.org/TR/html4/present/styles.html#h-14.2.1
// says only the last definition counts...
// but it only counts if it's in the HEAD section,
// so we DONT need to parse the whole doc
hn.put("http-equiv", http_equiv);
hn.put("content", content);
}
// FIXME: add some more headers - Dublin Core?
} else if (http_equiv.equalsIgnoreCase("Content-Type")) {
if(logMINOR) Logger.minor(this, "Found http-equiv content-type="+content);
String[] typesplit = splitType(content);
if(logDEBUG) {
for(int i=0;i<typesplit.length;i++)
Logger.debug(this, "["+i+"] = "+typesplit[i]);
}
boolean detected = false;
for (String allowedContentType: allowedContentTypes) {
if (typesplit[0].equalsIgnoreCase(allowedContentType)) {
if((typesplit[1] == null) || (pc.charset != null && typesplit[1]
.equalsIgnoreCase(pc.charset))) {
hn.put("http-equiv", http_equiv);
hn.put("content", typesplit[0]
+ (typesplit[1] != null ? "; charset="
+ typesplit[1] : ""));
} else if(typesplit[1] != null && pc.charset != null && !typesplit[1].equalsIgnoreCase(pc.charset)) {
throwFilterException(l10n("wrongCharsetInMeta"));
} else if(typesplit[1] != null) {
if(pc.detectedCharset != null)
throwFilterException(l10n("multipleCharsetsInMeta"));
pc.detectedCharset = typesplit[1].trim();
}
detected = true;
break;
}
}
if(!detected)
throwFilterException(l10n("invalidMetaType"));
} else if (
http_equiv.equalsIgnoreCase("Content-Language")) {
if(content.matches("((?>[a-zA-Z0-9]*)(?>-[A-Za-z0-9]*)*(?>,\\s*)?)*") && (!content.trim().equals(""))) {
hn.put("http-equiv", "Content-Language");
hn.put("content", content);
}
} else if (http_equiv.equalsIgnoreCase("refresh")) {
int idx = content.indexOf(';');
if(idx == -1 && metaRefreshSamePageMinInterval >= 0) {
try {
int seconds = Integer.parseInt(content);
if(seconds < 0) return null;
if(seconds < metaRefreshSamePageMinInterval)
seconds = metaRefreshSamePageMinInterval;
hn.put("http-equiv", "refresh");
hn.put("content", Integer.toString(seconds));
} catch (NumberFormatException e) {
// Delete.
pc.writeAfterTag.append("<!-- doesn't parse as number in meta refresh -->");
return null;
}
} else if(metaRefreshRedirectMinInterval >= 0) {
int seconds;
String before = content.substring(0, idx);
String after = content.substring(idx+1).trim();
try {
seconds = Integer.parseInt(before);
if(seconds < 0) return null;
if(seconds < metaRefreshRedirectMinInterval) seconds = metaRefreshRedirectMinInterval;
if(!after.toLowerCase().startsWith("url=")) {
pc.writeAfterTag.append("<!-- no url but doesn't parse as number in meta refresh -->");
return null;
}
after = after.substring("url=".length()).trim();
try {
String url = sanitizeURI(after, null, null, null, pc.cb, false);
hn.put("http-equiv", "refresh");
hn.put("content", ""+seconds+"; url="+HTMLEncoder.encode(url));
} catch (CommentException e) {
pc.writeAfterTag.append("<!-- "+e.getMessage()+"-->");
// Delete
return null;
}
} catch (NumberFormatException e) {
pc.writeAfterTag.append("<!-- doesn't parse as number in meta refresh possibly with url -->");
// Delete.
return null;
}
}
}
}
}
/* try HTML5 meta charset declaration. */
String charset = getHashString(h, "charset");
if (charset != null) {
if ((pc.detectedCharset != null) && !charset.equals(pc.detectedCharset)) {
throwFilterException(l10n("multipleCharsetsInMeta"));
}
pc.detectedCharset = charset;
}
return hn;
}
@Override
protected boolean expungeTagIfNoAttributes() {
return true;
}
}
static class DocTypeTagVerifier extends TagVerifier {
DocTypeTagVerifier(String tag) {
super(tag, null);
}
private static final Map<String, Object> DTDs = new HashMap<String, Object>();
static {
DTDs.put(
"-//W3C//DTD XHTML 1.0 Strict//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
DTDs.put(
"-//W3C//DTD XHTML 1.0 Transitional//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd");
DTDs.put(
"-//W3C//DTD XHTML 1.0 Frameset//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd");
DTDs.put(
"-//W3C//DTD XHTML 1.1//EN",
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
DTDs.put(
"-//W3C//DTD HTML 4.01//EN",
"http://www.w3.org/TR/html4/strict.dtd");
DTDs.put(
"-//W3C//DTD HTML 4.01 Transitional//EN",
"http://www.w3.org/TR/html4/loose.dtd");
DTDs.put(
"-//W3C//DTD HTML 4.01 Frameset//EN",
"http://www.w3.org/TR/html4/frameset.dtd");
DTDs.put("-//W3C//DTD HTML 3.2 Final//EN", new Object());
}
@Override
ParsedTag sanitize(ParsedTag t, HTMLParseContext pc) {
// HTML5 is just <!doctype html>
if(t.unparsedAttrs.length == 1) {
if (!t.unparsedAttrs[0].equalsIgnoreCase("html"))
return null;
return t;
}
if (!((t.unparsedAttrs.length == 3) || (t.unparsedAttrs.length == 4)))
return null;
if (!t.unparsedAttrs[0].equalsIgnoreCase("html"))
return null;
if(t.unparsedAttrs[1].equalsIgnoreCase("system") && t.unparsedAttrs.length == 3) {
// HTML5 allows <!DOCTYPE html SYSTEM "about:legacy-compat"> (either kind of quotes)
String s = stripQuotes(t.unparsedAttrs[2]);
if(s.equals("about:legacy-compat") && t.unparsedAttrs.length == 3) {
return t;
} else return null;
}
if (!t.unparsedAttrs[1].equalsIgnoreCase("public"))
return null;
String s = stripQuotes(t.unparsedAttrs[2]);
if (!DTDs.containsKey(s))
return null;
if (t.unparsedAttrs.length == 4) {
String ss = stripQuotes(t.unparsedAttrs[3]);
String spec = getHashString(DTDs, s);
if ((spec != null) && !spec.equals(ss))
return null;
}
return t;
}
}
static class XmlTagVerifier extends TagVerifier {
XmlTagVerifier() {
super("?xml", null);
}
@Override
ParsedTag sanitize(ParsedTag t, HTMLParseContext pc) throws DataFilterException {
if (t.unparsedAttrs.length != 2 && t.unparsedAttrs.length != 3) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid length");
return null;
}
if (t.unparsedAttrs.length == 3 && !t.unparsedAttrs[2].equals("?")) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid ending (length 2)");
return null;
}
if (t.unparsedAttrs.length == 2 && !t.unparsedAttrs[1].endsWith("?")) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid ending (length 3)");
return null;
}
if (!(t.unparsedAttrs[0].equals("version=\"1.0\"") || t.unparsedAttrs[0].equals("version='1.0'"))) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid version");
return null;
}
String encodingAttr = t.unparsedAttrs[1];
if(encodingAttr.startsWith("encoding=\"")) {
if(!encodingAttr.endsWith("\"")) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid encoding");
return null;
}
} else if(encodingAttr.startsWith("encoding='")) {
if(!encodingAttr.endsWith("'")) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid encoding");
return null;
}
} else {
if (logMINOR) Logger.minor(this, "Deleting xml declaration, invalid encoding");
return null;
}
String charset = encodingAttr.substring("encoding='".length(), encodingAttr.length()-1);
if (!charset.equalsIgnoreCase(pc.charset)) {
if(pc.charset != null && !charset.equalsIgnoreCase(pc.charset)) {
if (logMINOR) Logger.minor(this, "Deleting xml declaration (invalid charset "
+ charset + " should be "+pc.charset + ")");
return null;
} else if(pc.detectedCharset != null) {
throwFilterException(l10n("multipleCharsetsInMeta"));
} else {
pc.detectedCharset = charset;
}
}
return t;
}
}
static class HtmlTagVerifier extends TagVerifier {
private static final String[] locallyVerifiedAttrs = new String[] { "xmlns" };
HtmlTagVerifier() {
super("html", new String[] { "id", "version" });
for(String attr : locallyVerifiedAttrs) {
parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
String xmlns = getHashString(h, "xmlns");
if ((xmlns != null) && xmlns.equals("http://www.w3.org/1999/xhtml")) {
hn.put("xmlns", xmlns);
pc.setisXHTML(true);
}
return hn;
}
}
static class BaseHrefTagVerifier extends TagVerifier {
private static final String[] locallyVerifiedAttrs = new String[] {
"href"};
BaseHrefTagVerifier(String tag, String[] allowedAttrs, String[] uriAttrs) {
super(tag, allowedAttrs, uriAttrs, null);
for(String attr : locallyVerifiedAttrs) {
this.parsedAttrs.add(attr);
}
}
@Override
Map<String, Object> sanitizeHash(Map<String, Object> h,
ParsedTag p,
HTMLParseContext pc) throws DataFilterException {
Map<String, Object> hn = super.sanitizeHash(h, p, pc);
String baseHref = getHashString(h, "href");
if(baseHref != null) {
// Decode and encode for the same reason we do in sanitizeHash().
baseHref = HTMLDecoder.decode(baseHref);
String ref = pc.cb.onBaseHref(baseHref);
if(ref != null) {
hn.put("href", HTMLEncoder.encode(ref));
return hn;
}
}
pc.writeAfterTag.append("<!-- deleted invalid base href -->");
return null;
}
}
static String sanitizeStyle(String style, FilterCallback cb, HTMLParseContext hpc, boolean isInline) throws DataFilterException {
if(style == null) return null;
if(hpc.onlyDetectingCharset) return null;
Reader r = new StringReader(style);
Writer w = new StringWriter();
style = style.trim();
if(logMINOR) Logger.minor(HTMLFilter.class, "Sanitizing style: " + style);
CSSParser pc = new CSSParser(r, w, false, cb, hpc.charset, false, isInline);
try {
pc.parse();
} catch (IOException e) {
Logger.error(
HTMLFilter.class,
"IOException parsing inline CSS!");
} catch (Error e) {
if (e.getMessage().equals("Error: could not match input")) {
// this sucks, it should be a proper exception
Logger.normal(
HTMLFilter.class,
"CSS Parse Error!",
e);
return "/* "+l10n("couldNotParseStyle")+" */";
} else
throw e;
}
String s = w.toString();
if ((s == null) || (s.length() == 0))
return null;
// Core.logger.log(SaferFilter.class, "Style now: " + s, LogLevel.DEBUG);
if(logMINOR) Logger.minor(HTMLFilter.class, "Style finally: " + s);
return s;
}
static String escapeQuotes(String s) {
StringBuilder buf = new StringBuilder(s.length());
for (int x = 0; x < s.length(); x++) {
char c = s.charAt(x);
if (c == '\"') {
buf.append(""");
} else {
buf.append(c);
}
}
return buf.toString();
}
static String sanitizeScripting(String script) {
// Kill it. At some point we may want to allow certain recipes - FIXME
return null;
}
static String sanitizeURI(String uri, FilterCallback cb, boolean inline) throws CommentException {
return sanitizeURI(uri, null, null, null, cb, inline);
}
/*
* While we're only interested in the type and the charset, the format is a
* lot more flexible than that. (avian) TEXT/PLAIN; format=flowed;
* charset=US-ASCII IMAGE/JPEG; name=test.jpeg; x-unix-mode=0644
*/
public static String[] splitType(String type) {
StringFieldParser sfp;
String charset = null, param, name, value;
int x;
sfp = new StringFieldParser(type, ';');
type = sfp.nextField().trim();
while (sfp.hasMoreFields()) {
param = sfp.nextField();
x = param.indexOf('=');
if (x != -1) {
name = param.substring(0, x).trim();
value = param.substring(x + 1).trim();
if (name.equals("charset"))
charset = value;
}
}
return new String[] { type, charset };
}
// A simple string splitter
// StringTokenizer doesn't work well for our purpose. (avian)
static class StringFieldParser {
private String str;
private int maxPos, curPos;
private char c;
public StringFieldParser(String str) {
this(str, '\t');
}
public StringFieldParser(String str, char c) {
this.str = str;
this.maxPos = str.length();
this.curPos = 0;
this.c = c;
}
public boolean hasMoreFields() {
return curPos <= maxPos;
}
public String nextField() {
int start, end;
if (curPos > maxPos)
return null;
start = curPos;
while ((curPos < maxPos) && (str.charAt(curPos) != c))
curPos++;
end = curPos;
curPos++;
return str.substring(start, end);
}
}
static String htmlSanitizeURI(
String suri,
String overrideType,
String overrideCharset,
String maybeCharset,
FilterCallback cb,
HTMLParseContext pc,
boolean inline) {
try {
return sanitizeURI(suri, overrideType, overrideCharset, maybeCharset, cb, inline);
} catch (CommentException e) {
pc.writeAfterTag.append("<!-- ").append(HTMLEncoder.encode(e.toString())).append(" -->");
return null;
}
}
static String sanitizeURI(
String suri,
String overrideType,
String overrideCharset,
String maybeCharset,
FilterCallback cb, boolean inline) throws CommentException {
if(logMINOR)
Logger.minor(HTMLFilter.class, "Sanitizing URI: "+suri+" ( override type "+overrideType +" override charset "+overrideCharset+" ) inline="+inline, new Exception("debug"));
boolean addMaybe = false;
if((overrideCharset != null) && (overrideCharset.length() > 0))
overrideType += "; charset="+overrideCharset;
else if(maybeCharset != null)
addMaybe = true;
String retval = cb.processURI(suri, overrideType, false, inline);
if(addMaybe) {
if(retval.indexOf('?') != -1)
retval += "&maybecharset="+maybeCharset;
else
retval += "?maybecharset="+maybeCharset;
}
return retval;
}
static String getHashString(Map<String, Object> h, String key) {
Object o = h.get(key);
if (o == null)
return null;
if (o instanceof String)
return (String) o;
else
return null;
}
private static String l10n(String key) {
return NodeL10n.getBase().getString("HTMLFilter."+key);
}
private static String l10n(String key, String pattern, String value) {
return NodeL10n.getBase().getString("HTMLFilter."+key, pattern, value);
}
@Override
public BOMDetection getCharsetByBOM(byte[] input, int length) throws DataFilterException {
// No enhanced BOMs.
// FIXME XML BOMs???
return null;
}
@Override
public int getCharsetBufferSize() {
//Read in 64 kilobytes. The charset could be defined anywhere in the head section
return 1024*64;
}
}