/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.util.Iterator;
import java.util.Locale;
import org.commoncrawl.io.NIOHttpHeaders;
/**
 * Static helpers for picking apart HTTP header values. At present this covers
 * splitting a Content-Type value into its media type and its charset
 * parameter.
 */
public class HttpHeaderUtils {

  /** Parameter name plus '=' that introduces the charset value. */
  public static final String kCharset = "charset=";

  /** Mutable out-parameter that receives the result of a parse. */
  public static class ContentTypeAndCharset {
    /** Lower-cased media type, or null when no value was accepted. */
    public String _contentType = null;
    /** Lower-cased charset value, or null when none was found. */
    public String _charset = null;

    /** Sets both fields back to null. */
    void reset() {
      _contentType = null;
      _charset = null;
    }
  }

  /**
   * Parses every non-null value that
   * {@code headers.multiValueIterator("content-type")} yields and stores the
   * combined result in {@code metadataOut}.
   *
   * <p>{@code metadataOut} is cleared once, when the first non-null value is
   * seen; if there is no such value it is left untouched. Later values are
   * merged on top of earlier ones: a value that is rejected (empty, the bare
   * wildcard type, or without a slash) changes nothing, and a value that
   * repeats the current media type without a charset keeps the charset that
   * is already stored.
   *
   * @param headers     source of the content-type values
   * @param metadataOut receives the media type and charset
   */
  public static void parseContentType(NIOHttpHeaders headers,
      ContentTypeAndCharset metadataOut) {
    Iterator<String> values = headers.multiValueIterator("content-type");
    boolean sawValue = false;
    while (values.hasNext()) {
      String contentType = values.next();
      if (contentType == null) {
        continue;
      }
      if (!sawValue) {
        // Clear lazily so that a response without any content-type value
        // leaves the caller's object exactly as it was.
        metadataOut.reset();
        sawValue = true;
      }
      mergeContentType(metadataOut, contentType);
    }
  }

  /**
   * Parses a single Content-Type value. {@code metadataOut} is always reset
   * first, so after the call it reflects only {@code contentType}.
   *
   * @param metadataOut receives the media type and charset; both stay null
   *                    when the value is rejected
   * @param contentType raw header value; null is treated as "nothing to parse"
   */
  static final void parseContentType(ContentTypeAndCharset metadataOut,
      String contentType) {
    // reset output data structure
    metadataOut.reset();
    if (contentType != null) {
      mergeContentType(metadataOut, contentType);
    }
  }

  /**
   * Parses {@code contentType} and folds the result into {@code metadataOut}
   * without resetting it first.
   */
  private static void mergeContentType(ContentTypeAndCharset metadataOut,
      String contentType) {
    final int length = contentType.length();

    // Trim leading and trailing whitespace from the type. '(' is part of the
    // trailing trim set to catch media-type comments, which are not at all
    // standard, but may occur in rare cases.
    final int typeStart = skipPastLWS(contentType, 0);
    final int typeEnd = skipToLWSAndExtra(contentType, typeStart);

    // Walk the ';'-separated parameters looking for "charset=". When it
    // occurs more than once the last occurrence wins.
    int charsetStart = -1;
    int charsetEnd = -1;
    final int firstSeparator = contentType.indexOf(';', typeEnd);
    if (firstSeparator != -1) {
      int paramStart = firstSeparator + 1;
      do {
        int paramEnd = contentType.indexOf(';', paramStart);
        if (paramEnd == -1) {
          paramEnd = length;
        }
        int nameStart = Math.min(skipPastLWS(contentType, paramStart), paramEnd);
        // "charset=" contains no ';', so a match always lies inside
        // [nameStart, paramEnd).
        if (contentType.regionMatches(true, nameStart, kCharset, 0,
            kCharset.length())) {
          charsetStart = nameStart + kCharset.length();
          charsetEnd = paramEnd;
        }
        paramStart = paramEnd + 1;
      } while (paramStart < length);
    }

    boolean hasCharset = charsetStart != -1;
    if (hasCharset) {
      // Trim leading whitespace from the value, then strip quotes or cut the
      // value at whitespace, ';' or '('.
      charsetStart = Math.min(skipPastLWS(contentType, charsetStart), charsetEnd);
      if (charsetStart < length) {
        char firstChar = contentType.charAt(charsetStart);
        if (firstChar == '"' || firstChar == '\'') {
          ++charsetStart;
          int closingQuote = contentType.indexOf(firstChar, charsetStart);
          // Unterminated quote: fall back to the unquoted delimiter set.
          charsetEnd = (closingQuote != -1)
              ? closingQuote
              : skipToLWSAndExtra(contentType, charsetStart);
        } else {
          charsetEnd = Math.min(skipToLWSAndExtra(contentType, charsetStart),
              charsetEnd);
        }
      }
      // An empty value ("charset=", "charset=;", charset="") carries no
      // information, so report it the same way as a missing parameter.
      hasCharset = charsetStart < charsetEnd;
    }

    // If the server sent "*/*", it is meaningless, so do not store it. A value
    // without any slash is rejected as well. The slash test deliberately looks
    // at the whole string: some servers give junk after the charset parameter,
    // and this keeps the check tolerant of it.
    if (length == 0 || contentType.equals("*/*")
        || contentType.indexOf('/') == -1) {
      return;
    }

    final String previousType = metadataOut._contentType;
    // Locale.ROOT keeps the result independent of the default locale (a
    // Turkish default would otherwise map 'I' to a dotless i).
    final String newType = contentType.substring(typeStart, typeEnd)
        .toLowerCase(Locale.ROOT);
    metadataOut._contentType = newType;

    if (hasCharset) {
      metadataOut._charset = contentType.substring(charsetStart, charsetEnd)
          .toLowerCase(Locale.ROOT);
    } else if (previousType != null && !previousType.equals(newType)) {
      // The media type changed and the new value names no charset, so a
      // charset remembered from the previous type no longer applies.
      metadataOut._charset = null;
    }
  }

  /**
   * Skips linear whitespace (spaces and tabs).
   *
   * @param sourceString text to scan
   * @param startPos     index at which scanning starts
   * @return index of the first character at or after {@code startPos} that is
   *         neither ' ' nor '\t', or {@code sourceString.length()} if the scan
   *         reaches the end; {@code startPos} itself when it is already at or
   *         past the end
   */
  public static int skipPastLWS(String sourceString, int startPos) {
    int pos = startPos;
    while (pos < sourceString.length()) {
      char c = sourceString.charAt(pos);
      if (c != ' ' && c != '\t') {
        break;
      }
      pos++;
    }
    return pos;
  }

  /**
   * Advances to the end of a token.
   *
   * @param sourceString text to scan
   * @param startPos     index at which scanning starts
   * @return index of the first ' ', '\t', ';' or '(' at or after
   *         {@code startPos}, or {@code sourceString.length()} if none is
   *         found; {@code startPos} itself when it is already at or past the
   *         end
   */
  public static int skipToLWSAndExtra(String sourceString, int startPos) {
    int pos = startPos;
    while (pos < sourceString.length()) {
      char c = sourceString.charAt(pos);
      if (c == ' ' || c == '\t' || c == ';' || c == '(') {
        break;
      }
      pos++;
    }
    return pos;
  }
}