HttpHeaderUtils.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.util.Iterator;
import java.util.Locale;

import org.commoncrawl.io.NIOHttpHeaders;

/**
 * Helpers for extracting the media type and charset from HTTP
 * {@code Content-Type} header values.
 */
public class HttpHeaderUtils {

  /** Parameter name plus '=' that introduces the charset inside a Content-Type value. */
  public static final String kCharset = "charset=";

  /** Mutable holder for the media type and charset parsed out of a Content-Type value. */
  public static class ContentTypeAndCharset {

    /** Lower-cased media type (e.g. {@code text/html}), or {@code null} if none was stored. */
    public String _contentType = null;
    /** Lower-cased charset name, or {@code null} if none was stored. */
    public String _charset     = null;

    /** Sets both fields back to {@code null}. */
    void reset() {
      _contentType = null;
      _charset = null;
    }
  }

  /**
   * Parses every non-null value produced by
   * {@code headers.multiValueIterator("content-type")} into {@code metadataOut}.
   *
   * <p>Each value is parsed with
   * {@link #parseContentType(ContentTypeAndCharset, String)}, which resets
   * {@code metadataOut} before parsing, so on return {@code metadataOut}
   * reflects only the last non-null value. If there is no non-null value,
   * {@code metadataOut} is left untouched.
   *
   * @param headers     headers to read the content-type values from
   * @param metadataOut receives the parsed media type and charset
   */
  public static void parseContentType(NIOHttpHeaders headers,
      ContentTypeAndCharset metadataOut) {

    Iterator<String> j = headers.multiValueIterator("content-type");

    while (j.hasNext()) {

      String contentType = j.next();

      if (contentType != null) {
        parseContentType(metadataOut, contentType);
      }
    }
  }

  /**
   * Parses a single Content-Type value such as
   * {@code "text/html; charset=UTF-8"} into {@code metadataOut}.
   *
   * <p>{@code metadataOut} is always reset first. The media type is stored
   * only if the value is non-empty, is not exactly {@code "*}{@code /*"} and
   * contains a {@code '/'} somewhere; otherwise both fields stay {@code null}.
   * The charset is stored only together with a media type and only if a
   * non-empty {@code charset=} value was found. When several {@code charset=}
   * parameters are present the last one wins. Both stored strings are
   * lower-cased independently of the JVM default locale.
   *
   * @param metadataOut receives the parsed media type and charset
   * @param contentType raw header value; must not be {@code null}
   */
  static final void parseContentType(ContentTypeAndCharset metadataOut,
      String contentType) {

    // reset output data structure
    metadataOut.reset();

    final int length = contentType.length();

    // Trim leading and trailing whitespace from the type. '(' is part of the
    // terminator set to catch media-type comments, which are not at all
    // standard, but may occur in rare cases.
    final int typeStart = skipPastLWS(contentType, 0);
    final int typeEnd = skipToLWSAndExtra(contentType, typeStart);

    int charsetStart = 0;
    int charsetEnd = 0;
    boolean hasCharset = false;

    // Iterate over the ';'-separated parameters that follow the type.
    int firstSeparator = contentType.indexOf(';', typeEnd);
    if (firstSeparator != -1) {
      int paramStart = firstSeparator + 1;
      do {
        int paramEnd = contentType.indexOf(';', paramStart);
        if (paramEnd == -1) {
          paramEnd = length;
        }
        int nameStart = Math.min(skipPastLWS(contentType, paramStart), paramEnd);
        // kCharset contains no ';', so a match starting at nameStart can never
        // run past paramEnd; regionMatches also rejects regions that would run
        // past the end of the string.
        if (contentType.regionMatches(true, nameStart, kCharset, 0,
            kCharset.length())) {
          // Not breaking here means a later charset parameter overrides this one.
          charsetStart = nameStart + kCharset.length();
          charsetEnd = paramEnd;
          hasCharset = true;
        }
        paramStart = paramEnd + 1;
      } while (paramStart < length);
    }

    if (hasCharset) {
      // Trim leading whitespace from the charset value, never moving past the
      // end of its parameter.
      charsetStart = Math.min(skipPastLWS(contentType, charsetStart), charsetEnd);
      if (charsetStart < charsetEnd) {
        char firstChar = contentType.charAt(charsetStart);
        if (firstChar == '"' || firstChar == '\'') {
          // Quoted value: runs to the matching quote, or, if the quote is never
          // closed, to the next whitespace, ';' or '('.
          ++charsetStart;
          int closingQuote = contentType.indexOf(firstChar, charsetStart);
          charsetEnd = (closingQuote != -1) ? closingQuote
              : skipToLWSAndExtra(contentType, charsetStart);
        } else {
          // Bare value: stops at whitespace, ';' or '(' (media-type comment).
          charsetEnd = Math.min(skipToLWSAndExtra(contentType, charsetStart),
              charsetEnd);
        }
      }
      // An empty value ("charset=", "charset=;..." or charset="") carries no
      // information, so report it as "no charset" instead of an empty string.
      hasCharset = charsetStart < charsetEnd;
    }

    // If the server sent "*/*" it is meaningless, so do not store it. A value
    // without any slash is rejected as well. The checks look at the whole
    // header value rather than just the extracted type, because some servers
    // send junk after the charset parameter and this keeps us a bit more
    // tolerant.
    if (length != 0 && !contentType.equals("*/*")
        && contentType.indexOf('/') != -1) {
      // Locale.ROOT keeps ASCII names stable: under a Turkish default locale
      // toLowerCase() would turn 'I' into a dotless 'i' ("ISO" -> "\u0131so").
      metadataOut._contentType = contentType.substring(typeStart, typeEnd)
          .toLowerCase(Locale.ROOT);

      if (hasCharset) {
        metadataOut._charset = contentType.substring(charsetStart, charsetEnd)
            .toLowerCase(Locale.ROOT);
      }
    }
  }

  /**
   * Skips linear whitespace (spaces and tabs).
   *
   * @param sourceString string to scan
   * @param startPos     index to start scanning at
   * @return index of the first character at or after {@code startPos} that is
   *         neither a space nor a tab; {@code sourceString.length()} if only
   *         spaces and tabs remain; {@code startPos} itself if it is already
   *         at or beyond the end of the string
   */
  public static int skipPastLWS(String sourceString, int startPos) {
    int endPos = startPos;
    while (endPos < sourceString.length()) {
      char c = sourceString.charAt(endPos);
      if (c != ' ' && c != '\t') {
        break;
      }
      endPos++;
    }
    return endPos;
  }

  /**
   * Skips forward to the end of the current token.
   *
   * @param sourceString string to scan
   * @param startPos     index to start scanning at
   * @return index of the first space, tab, {@code ';'} or {@code '('} at or
   *         after {@code startPos}; {@code sourceString.length()} if there is
   *         none; {@code startPos} itself if it is already at or beyond the
   *         end of the string
   */
  public static int skipToLWSAndExtra(String sourceString, int startPos) {
    int endPos = startPos;
    while (endPos < sourceString.length()) {
      char c = sourceString.charAt(endPos);
      if (c == ' ' || c == '\t' || c == ';' || c == '(') {
        break;
      }
      endPos++;
    }
    return endPos;
  }

}