// DocOpScrub.java example
//
// Explorer
// fedone-master
// Copyright 2010 Google Inc. All Rights Reserved.

package org.waveprotocol.wave.model.document.util;

import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMap;
import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMapBuilder;
import org.waveprotocol.wave.model.document.operation.Attributes;
import org.waveprotocol.wave.model.document.operation.AttributesUpdate;
import org.waveprotocol.wave.model.document.operation.BufferedDocOp;
import org.waveprotocol.wave.model.document.operation.DocOp;
import org.waveprotocol.wave.model.document.operation.DocOpCursor;
import org.waveprotocol.wave.model.document.operation.impl.AttributesImpl;
import org.waveprotocol.wave.model.document.operation.impl.AttributesUpdateImpl;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuffer;
import org.waveprotocol.wave.model.document.operation.util.ImmutableStateMap.Attribute;
import org.waveprotocol.wave.model.document.operation.util.ImmutableUpdateMap.AttributeUpdate;
import org.waveprotocol.wave.model.util.CollectionUtils;
import org.waveprotocol.wave.model.util.Utf16Util;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Utility for scrubbing user-sensitive information out of operations.
 *
 * It is not intended to make reverse-engineering the original content totally
 * impossible, given enough time and effort. It is meant to prevent
 * casual/accidental viewing of user data in logs, while still preserving
 * valuable debugging information, such as the type of characters in a string
 * (so we can tell, for example, if the error is related to CJK input or not).
 *
 * All methods are static; the class cannot be instantiated.
 *
 * @author danilatos@google.com (Daniel Danilatos)
 */
public final class DocOpScrub {

  /**
   * Number of leading characters to avoid scrubbing in cases where we do
   * not scrub the entire string.
   */
  static final int CHARS_TO_LEAVE = 3;

  /**
   * Representative characters substituted for valid characters from the
   * Greek, Cyrillic, Armenian and CJK ranges respectively (see
   * {@link #scrubValidChar(char)}).
   */
  static final char PSI = 0x3c8, JIA = 0x42f, ARMENIAN = 0x554, WO = 0x6211;

  /**
   * Creates a scrubbed copy of the given operation.
   *
   * Character data is scrubbed entirely with {@link #scrubString(String)};
   * attributes, attribute updates and annotation boundaries are mostly
   * scrubbed (see {@link #scrubMostString(String)}). Element types and retain
   * counts are copied through unchanged.
   *
   * @param op does not have to be well formed. invalid characters will be
   *        clearly noted.
   * @return the output will be well formed only if the input was well formed
   */
  public static BufferedDocOp scrub(DocOp op) {
    final DocOpBuffer b = new DocOpBuffer();

    // Replay every component of the op into the buffer, scrubbing the parts
    // that can carry user data on the way through.
    op.apply(new DocOpCursor() {
      @Override
      public void deleteCharacters(String chars) {
        b.deleteCharacters(scrubString(chars));
      }

      @Override
      public void deleteElementEnd() {
        b.deleteElementEnd();
      }

      @Override
      public void deleteElementStart(String type, Attributes attrs) {
        b.deleteElementStart(type, scrubAttributes(attrs));
      }

      @Override
      public void replaceAttributes(Attributes oldAttrs, Attributes newAttrs) {
        b.replaceAttributes(scrubAttributes(oldAttrs), scrubAttributes(newAttrs));
      }

      @Override
      public void retain(int itemCount) {
        b.retain(itemCount);
      }

      @Override
      public void updateAttributes(AttributesUpdate attrUpdate) {
        b.updateAttributes(scrubAttributesUpdate(attrUpdate));
      }

      @Override
      public void annotationBoundary(AnnotationBoundaryMap map) {
        b.annotationBoundary(scrubAnnotationBoundary(map));
      }

      @Override
      public void characters(String chars) {
        b.characters(scrubString(chars));
      }

      @Override
      public void elementEnd() {
        b.elementEnd();
      }

      @Override
      public void elementStart(String type, Attributes attrs) {
        b.elementStart(type, scrubAttributes(attrs));
      }
    });

    return b.finishUnchecked();
  }

  /**
   * Creates a scrubbed copy of an annotation boundary map.
   *
   * End keys and change keys are scrubbed with
   * {@link #scrubMostAnnotationKey(String)}; old and new values are scrubbed
   * with {@link #scrubMostString(String)}.
   *
   * @param unscrubbed boundary map to scrub
   * @return mostly scrubbed boundary map
   */
  public static AnnotationBoundaryMap scrubAnnotationBoundary(AnnotationBoundaryMap unscrubbed) {
    AnnotationBoundaryMapBuilder b = new AnnotationBoundaryMapBuilder();
    for (int i = 0; i < unscrubbed.endSize(); i++) {
      b.end(scrubMostAnnotationKey(unscrubbed.getEndKey(i)));
    }
    for (int i = 0; i < unscrubbed.changeSize(); i++) {
      b.change(scrubMostAnnotationKey(unscrubbed.getChangeKey(i)),
          scrubMostString(unscrubbed.getOldValue(i)),
          scrubMostString(unscrubbed.getNewValue(i)));
    }
    return b.build();
  }

  /**
   * Creates a scrubbed copy of an attribute map. Both keys and values are
   * scrubbed with {@link #scrubMostString(String)}.
   *
   * @param unscrubbed attributes to scrub
   * @return mostly scrubbed attributes
   */
  public static Attributes scrubAttributes(Attributes unscrubbed) {
    // NOTE(review): distinct keys can scrub to the same string (e.g. keys that
    // share their first CHARS_TO_LEAVE characters and have the same length and
    // character classes). Confirm how fromUnsortedAttributes treats duplicate
    // keys.
    List<Attribute> list = new ArrayList<Attribute>();
    for (Map.Entry<String, String> entry : unscrubbed.entrySet()) {
      list.add(new Attribute(scrubMostString(entry.getKey()), scrubMostString(entry.getValue())));
    }
    return AttributesImpl.fromUnsortedAttributes(list);
  }

  /**
   * Creates a scrubbed copy of an attributes update. Keys, old values and new
   * values are all scrubbed with {@link #scrubMostString(String)}.
   *
   * @param unscrubbed attributes update to scrub
   * @return mostly scrubbed attributes update
   */
  public static AttributesUpdate scrubAttributesUpdate(AttributesUpdate unscrubbed) {
    // NOTE(review): as with scrubAttributes, scrubbed keys may collide; confirm
    // how fromUnsortedUpdates treats duplicate keys.
    List<AttributeUpdate> list = new ArrayList<AttributeUpdate>();
    int size = unscrubbed.changeSize();
    for (int i = 0; i < size; i++) {
      list.add(new AttributeUpdate(scrubMostString(unscrubbed.getChangeKey(i)),
          scrubMostString(unscrubbed.getOldValue(i)),
          scrubMostString(unscrubbed.getNewValue(i))));
    }
    return AttributesUpdateImpl.fromUnsortedUpdates(list);
  }

  /**
   * Scrubs most of an annotation key. Uses {@link #scrubMostString(String)} on
   * each slash (/) separated component of the key, preserving the original
   * slashes.
   *
   * @param unscrubbed key. must not be null.
   * @return mostly scrubbed key
   */
  public static String scrubMostAnnotationKey(String unscrubbed) {
    // pass -1 to split to prevent dropping trailing empty strings
    String[] parts = unscrubbed.split("/", -1);
    for (int i = 0; i < parts.length; i++) {
      parts[i] = scrubMostString(parts[i]);
    }
    return CollectionUtils.join('/', parts);
  }

  /**
   * Scrubs most of a string based on a heuristic that attempts to clean out
   * user sensitive data while still retaining some amount of information that
   * would be useful for debugging.
   *
   * If the string looks a bit like an email address, then everything but the @
   * symbol is scrubbed. Otherwise, everything but the first few characters is
   * scrubbed. In any case, invalid characters are always scrubbed for
   * readability, including in strings shorter than {@link #CHARS_TO_LEAVE}.
   * The motivation in the case of email addresses is both to make
   * it clear that the piece of data is an email address, and to be a little
   * stronger about removing the identifiable information
   *
   * NOTE(danilatos): Consider also always scrubbing confusing-to-print
   * characters such as RTL or ligature forming characters?
   *
   * @param unscrubbed unscrubbed string. may be null, in which case null is
   *        returned.
   * @return mostly scrubbed string
   */
  public static String scrubMostString(String unscrubbed) {
    if (unscrubbed == null) {
      return null;
    }

    int index = unscrubbed.indexOf('@');
    if (index != -1 && unscrubbed.lastIndexOf('@') == index) {
      // If it looks vaguely like an email address (contains a single '@'),
      // then scrub everything except for the '@' symbol. Either side of the
      // '@' may be empty.
      return scrubString(unscrubbed.substring(0, index))
          + '@'
          + scrubString(unscrubbed.substring(index + 1));
    }

    // Otherwise scrub everything but the first few characters, which we leave
    // to aid debugging. The retained prefix still has its invalid characters
    // scrubbed, so a string shorter than CHARS_TO_LEAVE is cleaned of invalid
    // characters too rather than being returned verbatim.
    int kept = Math.min(unscrubbed.length(), CHARS_TO_LEAVE);
    return scrubString(unscrubbed.substring(0, kept), false)
        + scrubString(unscrubbed.substring(kept), true);
  }

  /**
   * Scrubs every character of a string, valid or not.
   *
   * @param unscrubbed string to scrub. must not be null.
   * @return string of the same length with each char replaced by
   *         {@link #scrubChar(char, boolean)}
   */
  public static String scrubString(String unscrubbed) {
    return scrubString(unscrubbed, true);
  }

  /**
   * Scrubs a string one UTF-16 code unit at a time.
   *
   * @param unscrubbed string to scrub. must not be null.
   * @param scrubValidChars if true, valid characters are replaced as well;
   *        if false, only surrogates and invalid characters are replaced
   * @return scrubbed string, of the same length as the input
   */
  static String scrubString(String unscrubbed, boolean scrubValidChars) {
    char[] chars = new char[unscrubbed.length()];
    for (int i = 0; i < unscrubbed.length(); i++) {
      chars[i] = scrubChar(unscrubbed.charAt(i), scrubValidChars);
    }
    return new String(chars);
  }

  /**
   * Scrubs a single UTF-16 code unit.
   *
   * Surrogates always become '&lt;' (high) or '&gt;' (low). Characters that
   * {@link Utf16Util#isCodePointGoodForBlip(int)} does not report as OK are
   * always replaced by a marker: '|' for BIDI, '^' for CONTROL, '!' for
   * NONCHARACTER and '#' for anything else.
   *
   * @param c character to scrub
   * @param scrubValid whether characters reported as OK should be replaced by
   *        their range marker (see {@link #scrubValidChar(char)}) or kept
   * @return replacement character
   */
  static char scrubChar(char c, boolean scrubValid) {

    assert Utf16Util.isCodePoint(c) : "isCodePoint() should always be true for char";

    if (Utf16Util.isSurrogate(c)) {
      // High surrogate comes first. Matching pairs should appear as "<>"
      if (Utf16Util.isHighSurrogate(c)) {
        return '<';
      } else {
        assert Utf16Util.isLowSurrogate(c);
        return '>';
      }
    }

    switch (Utf16Util.isCodePointGoodForBlip(c)) {
    case OK:
      return scrubValid ? scrubValidChar(c) : c;
    case BIDI:
      return '|';
    case CONTROL:
      return '^';
    case NONCHARACTER:
      return '!';
    default:
      // Other invalid characters
      return '#';
    }
  }

  /**
   * Maps a valid character to a marker character that identifies the
   * (approximate) Unicode range it came from, hiding the actual character.
   *
   * @param c valid character, expected to be at least 0x20
   * @return marker for the character's range, or '?' for ranges not yet
   *         handled
   */
  private static char scrubValidChar(char c) {
    assert c >= 0x20;

    // Reference: http://www.alanwood.net/unicode/fontsbyrange.html#u0180
    // The ranges are approximate, some encompass more than they describe.
    // Branches with only an upper bound rely on the earlier branches having
    // already consumed everything below them.
    if (c <= 0x7f) { // Basic Latin
      return 'a';
    } else if (c <= 0xff) { // Latin-1 Supplement
      return 'b';
    } else if (c <= 0x17f) { // Latin Extended-A
      return 'c';
    } else if (c <= 0x24f) { // Latin Extended-B
      return 'd';
    } else if (0x2c60 <= c && c <= 0x2c7f) { // Latin Extended-C
      return 'e';
    } else if (0xa720 <= c && c <= 0xa7ff) { // Latin Extended-D
      return 'f';
    } else if (c <= 0x2AF) { // IPA Extensions
      return 'I';
    } else if (c <= 0x2FF) { // Spacing modifier letters
      return 'S';
    } else if (c <= 0x36f) { // Combining diacritical marks
      return ':';
    } else if (c <= 0x3FF) { // Greek and coptic
      return PSI;
    } else if (c <= 0x52F) { // Cyrillic & Cyrillic supplement
      return JIA;
    } else if (c <= 0x58f) { // Armenian
      return ARMENIAN;
    } else if (c <= 0x5ff) { // Hebrew
      return 'H';
    } else if (c <= 0x6ff) { // Arabic
      return 'A';
    } else if (0x900 <= c && c <= 0x97f) { // Devanagari
      return 'D';
    } else if (0xe00 <= c && c <= 0xe7f) { // Thai
      return 'T';
    } else if (0x1100 <= c && c <= 0x11ff) { // Some Hangul Jamo
      return 'K';
    } else if (0x20a0 <= c && c <= 0x2bff) { // Some symbol type things
      return '%';
    } else if (0x2e80 <= c && c <= 0x2eff ||
               0x3000 <= c && c <= 0x303f ||
               0x3200 <= c && c <= 0x9fff) { // Some CJK
      return WO;
    } else if (0x3040 <= c && c <= 0x30ff) { // Some Japanese
      return 'J';
    } else { // TODO others
      return '?';
    }
  }

  /** Not instantiable: static utility methods only. */
  private DocOpScrub() {}
}