// DocOpScrub.java example
//
// Explorer
// WaveInCloud-master
/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.waveprotocol.wave.model.document.util;

import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMap;
import org.waveprotocol.wave.model.document.operation.Attributes;
import org.waveprotocol.wave.model.document.operation.AttributesUpdate;
import org.waveprotocol.wave.model.document.operation.DocInitialization;
import org.waveprotocol.wave.model.document.operation.DocOp;
import org.waveprotocol.wave.model.document.operation.DocOpCursor;
import org.waveprotocol.wave.model.document.operation.impl.AttributesImpl;
import org.waveprotocol.wave.model.document.operation.impl.AttributesUpdateImpl;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuffer;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuilder;
import org.waveprotocol.wave.model.document.operation.impl.DocOpUtil;
import org.waveprotocol.wave.model.document.operation.util.ImmutableStateMap.Attribute;
import org.waveprotocol.wave.model.document.operation.util.ImmutableUpdateMap.AttributeUpdate;
import org.waveprotocol.wave.model.util.CollectionUtils;
import org.waveprotocol.wave.model.util.Preconditions;
import org.waveprotocol.wave.model.util.StringMap;
import org.waveprotocol.wave.model.util.StringSet;
import org.waveprotocol.wave.model.util.Utf16Util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Utility for scrubbing user-sensitive information out of operations.
 *
 * It is not intended to make reverse-engineering the original content totally
 * impossible, given enough time and effort. It is meant to prevent
 * casual/accidental viewing of user data in logs, while still preserving
 * valuable debugging information, such as the type of characters in a string
 * (so we can tell, for example, if the error is related to CJK input or not).
 *
 * @author danilatos@google.com (Daniel Danilatos)
 */
public final class DocOpScrub {

  /**
   * A lenient {@link AnnotationBoundaryMap}. End keys and changes are sorted by
   * key on construction so the result tends to be well formed, but no strong
   * validity check is performed on the data.
   */
  public static class UncheckedAnnotationBoundary implements AnnotationBoundaryMap {
    /** One annotation change (key, old value, new value); ordered by key only. */
    static class Triplet implements Comparable<Triplet> {
      private final String key;
      private final String oldVal;
      private final String newVal;

      private Triplet(String key, String oldVal, String newVal) {
        Preconditions.checkNotNull(key, "triplet key");
        this.key = key;
        this.oldVal = oldVal;
        this.newVal = newVal;
      }

      @Override
      public int compareTo(Triplet other) {
        return key.compareTo(other.key);
      }
    }

    /** Keys of ending annotations, sorted. */
    private final String[] ends;
    /** Annotation changes, sorted by key. */
    private final Triplet[] changes;

    /**
     * @param triplets flattened (key, oldValue, newValue) groups; the length
     *        must be a multiple of 3 and every key must be non-null
     * @param ends keys of the annotations ending at this boundary; the array
     *        is copied before sorting, so the caller's array is not modified
     */
    public UncheckedAnnotationBoundary(String[] triplets, String[] ends) {
      Preconditions.checkArgument(triplets.length % 3 == 0, "triplets.length not a multiple of 3");

      String[] sortedEnds = duplicate(ends);
      Arrays.sort(sortedEnds);
      this.ends = sortedEnds;

      Triplet[] sortedChanges = new Triplet[triplets.length / 3];
      for (int n = 0; n < sortedChanges.length; n++) {
        int base = n * 3;
        sortedChanges[n] = new Triplet(triplets[base], triplets[base + 1], triplets[base + 2]);
      }
      Arrays.sort(sortedChanges);
      this.changes = sortedChanges;
    }

    /**
     * Returns a fresh array holding the same elements as the input. Done by
     * hand because GWT does not seem to support Arrays.copyOf().
     */
    private static String[] duplicate(String[] source) {
      String[] target = new String[source.length];
      int next = 0;
      for (String element : source) {
        target[next++] = element;
      }
      return target;
    }

    @Override
    public int endSize() {
      return ends.length;
    }

    @Override
    public String getEndKey(int i) {
      return ends[i];
    }

    @Override
    public int changeSize() {
      return changes.length;
    }

    @Override
    public String getChangeKey(int i) {
      return changes[i].key;
    }

    @Override
    public String getOldValue(int i) {
      return changes[i].oldVal;
    }

    @Override
    public String getNewValue(int i) {
      return changes[i].newVal;
    }
  }

  /**
   * Flag read by the {@code maybeScrub} overloads to decide whether to scrub.
   * Starts out {@code true}.
   */
  private static boolean shouldScrubByDefault = true;

  /**
   * Sets whether {@link #maybeScrub(DocOp)} should scrub.
   *
   * @param shouldScrub new value of the default-scrubbing flag
   */
  public static void setShouldScrubByDefault(boolean shouldScrub) {
    shouldScrubByDefault = shouldScrub;
  }

  /**
   * Whether operations should be scrubbed by default. This should be true for
   * non-debug environments to protect privacy.
   *
   * See {@link #setShouldScrubByDefault(boolean)}.
   *
   * @return the current value of the default-scrubbing flag
   */
  public static boolean shouldScrubByDefault() {
    return shouldScrubByDefault;
  }

  /**
   * How many leading characters are left unscrubbed in the cases where the
   * whole string is not scrubbed.
   */
  static final int CHARS_TO_LEAVE = 3;

  /** Replacement character for scrubbed Greek and Coptic text (U+03C8). */
  static final char PSI = 0x3c8;

  /** Replacement character for scrubbed Cyrillic text (U+042F). */
  static final char JIA = 0x42f;

  /** Replacement character for scrubbed Armenian text (U+0554). */
  static final char ARMENIAN = 0x554;

  /** Replacement character for scrubbed CJK text (U+6211). */
  static final char WO = 0x6211;

  /** Strategy that maps a raw string to its scrubbed form. */
  public interface StringScrubber {
    /**
     * @param input the string to scrub
     * @return the scrubbed replacement for {@code input}
     */
    String scrub(String input);
  }

  /**
   * Remembers how each input was scrubbed, so that the same input always
   * yields the same output and two different inputs never share an output.
   * When the underlying scrubber produces a string that was already handed
   * out, a numeric suffix is appended to tell the results apart.
   */
  public static class ScrubCache {
    /** Scrubber applied to inputs that have not been seen before. */
    private final StringScrubber format;
    /** Input string mapped to the scrubbed form previously returned for it. */
    private final StringMap<String> alreadyScrubbed = CollectionUtils.createStringMap();
    /** Every scrubbed form returned so far. */
    private final StringSet scrubbings = CollectionUtils.createStringSet();
    /** Counter for disambiguating suffixes; shared by all inputs of this cache. */
    private int uniqueSuffix = 0;

    public ScrubCache(StringScrubber format) {
      this.format = format;
    }

    /**
     * @param input the string to scrub
     * @return the scrubbed form of {@code input}, distinct from the scrubbed
     *         form of every other input seen by this cache
     */
    String scrubUniquely(String input) {
      if (!alreadyScrubbed.containsKey(input)) {
        return scrubAndRemember(input);
      }
      return alreadyScrubbed.get(input);
    }

    /** Scrubs a not-yet-seen input, resolves collisions, and records the result. */
    private String scrubAndRemember(String input) {
      final String base = format.scrub(input);
      String candidate = base;
      while (scrubbings.contains(candidate)) {
        uniqueSuffix++;
        candidate = base + '_' + uniqueSuffix;
      }

      scrubbings.add(candidate);
      alreadyScrubbed.put(input, candidate);
      return candidate;
    }
  }

  /** Scrubs attribute names by delegating to {@link #scrubMostString(String)}. */
  private static final StringScrubber attrNameScrubber = new StringScrubber() {
    @Override public String scrub(String input) {
      return scrubMostString(input);
    }
  };

  /** Scrubs annotation keys by delegating to {@link #scrubMostAnnotationKey(String)}. */
  private static final StringScrubber annotationKeyScrubber = new StringScrubber() {
    @Override public String scrub(String input) {
      return scrubMostAnnotationKey(input);
    }
  };

  /**
   * Produces a scrubbed version of the given operation. Ill-formed input is
   * permitted but may lead to ill-formed output. Invalid characters will be
   * clearly noted.
   *
   * @param op the operation to scrub
   * @return the scrubbed operation, or, if scrubbing threw a runtime
   *         exception, an operation whose only content is a description of
   *         that failure
   */
  public static DocOp scrub(final DocOp op) {
    DocOp result;
    try {
      DocOpBuffer buffer = new DocOpBuffer();
      DocOpCursor scrubber = createScrubber(buffer);
      op.apply(scrubber);
      result = buffer.finishUnchecked();
    } catch (RuntimeException failure) {
      // This should not really happen, unless perhaps the input operation has
      // some diabolically broken implementation of apply or of its attribute
      // or annotation data structures.
      result = new DocOpBuilder().characters("Scrub exploded: " + failure).build();
    }
    return result;
  }

  /**
   * Variant of {@link #scrub(DocOp)} for {@link DocInitialization}s.
   *
   * @param op the initialization to scrub
   * @return the scrubbed operation, converted back to an initialization
   */
  public static DocInitialization scrub(final DocInitialization op) {
    // The cast selects the DocOp overload rather than recursing into this one.
    final DocOp scrubbed = scrub((DocOp) op);
    return DocOpUtil.asInitialization(scrubbed);
  }

  /**
   * Scrubs the given operation only if default scrubbing is enabled;
   * otherwise returns it unchanged.
   *
   * See {@link #shouldScrubByDefault()} and {@link #scrub(DocOp)}
   */
  public static DocOp maybeScrub(final DocOp op) {
    if (!shouldScrubByDefault) {
      return op;
    }
    return scrub(op);
  }

  /**
   * Scrubs the given initialization only if default scrubbing is enabled;
   * otherwise returns it unchanged.
   *
   * See {@link #shouldScrubByDefault()} and {@link #scrub(DocInitialization)}
   */
  public static DocInitialization maybeScrub(final DocInitialization op) {
    if (!shouldScrubByDefault) {
      return op;
    }
    return scrub(op);
  }

  /**
   * Wraps a cursor so that every component passed through it is scrubbed
   * before being forwarded to {@code target}.
   *
   * Character data is fully scrubbed, attribute names and values and
   * annotation keys and values are mostly scrubbed, and element types and
   * retain counts are forwarded untouched. Attribute names and annotation keys
   * each go through their own {@link ScrubCache}, created per returned cursor.
   *
   * @param target cursor that receives the scrubbed components
   * @return a cursor that scrubs and then forwards to {@code target}
   */
  public static DocOpCursor createScrubber(final DocOpCursor target) {
    final ScrubCache attrNames = new ScrubCache(attrNameScrubber);
    final ScrubCache annotationNames = new ScrubCache(annotationKeyScrubber);

    return new DocOpCursor() {
      // ---- Components forwarded without modification ----

      @Override
      public void retain(int itemCount) {
        target.retain(itemCount);
      }

      @Override
      public void elementEnd() {
        target.elementEnd();
      }

      @Override
      public void deleteElementEnd() {
        target.deleteElementEnd();
      }

      // ---- Character data ----

      @Override
      public void characters(String chars) {
        String scrubbedChars = scrubString(chars);
        target.characters(scrubbedChars);
      }

      @Override
      public void deleteCharacters(String chars) {
        String scrubbedChars = scrubString(chars);
        target.deleteCharacters(scrubbedChars);
      }

      // ---- Elements and attributes ----

      @Override
      public void elementStart(String type, Attributes attrs) {
        Attributes scrubbedAttrs = scrubAttributes(attrs, attrNames);
        target.elementStart(type, scrubbedAttrs);
      }

      @Override
      public void deleteElementStart(String type, Attributes attrs) {
        Attributes scrubbedAttrs = scrubAttributes(attrs, attrNames);
        target.deleteElementStart(type, scrubbedAttrs);
      }

      @Override
      public void replaceAttributes(Attributes oldAttrs, Attributes newAttrs) {
        // Old attributes are scrubbed before new ones; the name cache is
        // stateful, so the order is kept fixed.
        Attributes scrubbedOld = scrubAttributes(oldAttrs, attrNames);
        Attributes scrubbedNew = scrubAttributes(newAttrs, attrNames);
        target.replaceAttributes(scrubbedOld, scrubbedNew);
      }

      @Override
      public void updateAttributes(AttributesUpdate attrUpdate) {
        AttributesUpdate scrubbedUpdate = scrubAttributesUpdate(attrUpdate, attrNames);
        target.updateAttributes(scrubbedUpdate);
      }

      // ---- Annotations ----

      @Override
      public void annotationBoundary(AnnotationBoundaryMap map) {
        AnnotationBoundaryMap scrubbedMap = scrubAnnotationBoundary(map, annotationNames);
        target.annotationBoundary(scrubbedMap);
      }
    };
  }

  /**
   * Builds a scrubbed copy of an annotation boundary. Keys are scrubbed
   * through {@code nameScrubber} (end keys first, then change keys, in index
   * order), and old/new values through {@link #scrubMostString(String)}.
   *
   * @param unscrubbed the boundary to scrub
   * @param nameScrubber cache used to scrub annotation keys
   * @return an {@link UncheckedAnnotationBoundary} holding the scrubbed data
   */
  public static AnnotationBoundaryMap scrubAnnotationBoundary(AnnotationBoundaryMap unscrubbed,
      ScrubCache nameScrubber) {
    String[] scrubbedEnds = new String[unscrubbed.endSize()];
    String[] scrubbedChanges = new String[unscrubbed.changeSize() * 3];

    for (int end = 0; end < unscrubbed.endSize(); end++) {
      String endKey = unscrubbed.getEndKey(end);
      scrubbedEnds[end] = nameScrubber.scrubUniquely(endKey);
    }

    // Each change occupies three consecutive slots: key, old value, new value.
    for (int change = 0, slot = 0; change < unscrubbed.changeSize(); change++, slot += 3) {
      scrubbedChanges[slot] = nameScrubber.scrubUniquely(unscrubbed.getChangeKey(change));
      scrubbedChanges[slot + 1] = scrubMostString(unscrubbed.getOldValue(change));
      scrubbedChanges[slot + 2] = scrubMostString(unscrubbed.getNewValue(change));
    }

    return new UncheckedAnnotationBoundary(scrubbedChanges, scrubbedEnds);
  }

  /**
   * Builds a scrubbed copy of a set of attributes. Names are scrubbed through
   * {@code nameScrubber} and values through {@link #scrubMostString(String)}.
   *
   * @param unscrubbed the attributes to scrub
   * @param nameScrubber cache used to scrub attribute names
   * @return the scrubbed attributes, built without validity checking
   */
  public static Attributes scrubAttributes(Attributes unscrubbed, ScrubCache nameScrubber) {
    List<Attribute> scrubbed = new ArrayList<Attribute>();
    for (Map.Entry<String, String> attr : unscrubbed.entrySet()) {
      String name = nameScrubber.scrubUniquely(attr.getKey());
      String value = scrubMostString(attr.getValue());
      scrubbed.add(new Attribute(name, value));
    }
    return AttributesImpl.fromUnsortedAttributesUnchecked(scrubbed);
  }

  /**
   * Builds a scrubbed copy of an attributes update. Names are scrubbed
   * through {@code nameScrubber}; old and new values through
   * {@link #scrubMostString(String)}.
   *
   * @param unscrubbed the update to scrub
   * @param nameScrubber cache used to scrub attribute names
   * @return the scrubbed update, built without validity checking
   */
  public static AttributesUpdate scrubAttributesUpdate(AttributesUpdate unscrubbed,
      ScrubCache nameScrubber) {
    List<AttributeUpdate> scrubbed = new ArrayList<AttributeUpdate>();
    for (int change = 0; change < unscrubbed.changeSize(); change++) {
      String name = nameScrubber.scrubUniquely(unscrubbed.getChangeKey(change));
      String oldValue = scrubMostString(unscrubbed.getOldValue(change));
      String newValue = scrubMostString(unscrubbed.getNewValue(change));
      scrubbed.add(new AttributeUpdate(name, oldValue, newValue));
    }
    return AttributesUpdateImpl.fromUnsortedUpdatesUnchecked(scrubbed);
  }

  /**
   * Scrubs most of an annotation key. Uses {@link #scrubMostString(String)} on
   * each slash (/) separated component of the key, preserving the original
   * slashes (including leading and trailing ones).
   *
   * @param unscrubbed key; must not be null
   * @return mostly scrubbed key
   */
  public static String scrubMostAnnotationKey(String unscrubbed) {
    // A negative limit stops split() from dropping trailing empty strings, so
    // a key ending in '/' keeps its trailing slash after re-joining.
    String[] parts = unscrubbed.split("/", -1);
    for (int i = 0; i < parts.length; i++) {
      parts[i] = scrubMostString(parts[i]);
    }
    return CollectionUtils.join('/', parts);
  }

  /**
   * Scrubs most of a string based on a heuristic that attempts to clean out
   * user sensitive data while still retaining some amount of information that
   * would be useful for debugging.
   *
   * If the string looks a bit like an email address (it contains exactly one
   * '@'), then everything but the @ symbol is scrubbed. Otherwise, everything
   * but the first {@link #CHARS_TO_LEAVE} characters is scrubbed. In any case,
   * invalid characters are always scrubbed for readability, including in
   * strings shorter than {@link #CHARS_TO_LEAVE}. The motivation in the case
   * of email addresses is both to make it clear that the piece of data is an
   * email address, and to be a little stronger about removing the
   * identifiable information.
   *
   * NOTE(danilatos): Consider also always scrubbing confusing-to-print
   * characters such as RTL or ligature forming characters?
   *
   * @param unscrubbed unscrubbed string. may be null, in which case null is
   *        returned.
   * @return mostly scrubbed string
   */
  public static String scrubMostString(String unscrubbed) {
    if (unscrubbed == null) {
      return null;
    }

    int index = unscrubbed.indexOf('@');
    if (index != -1 && unscrubbed.lastIndexOf('@') == index) {
      // Looks vaguely like an email address (contains a single '@'):
      // scrub everything on both sides of the '@' symbol.
      return scrubString(unscrubbed.substring(0, index))
          + '@'
          + scrubString(unscrubbed.substring(index + 1));
    }

    // Otherwise leave the first few characters to aid debugging and scrub the
    // rest. The leading part still goes through scrubString so that invalid
    // characters are replaced even when the string is shorter than
    // CHARS_TO_LEAVE.
    int keep = Math.min(CHARS_TO_LEAVE, unscrubbed.length());
    return scrubString(unscrubbed.substring(0, keep), false)
        + scrubString(unscrubbed.substring(keep), true);
  }

  /**
   * Scrubs every character of the given string, valid characters included.
   *
   * @param unscrubbed string to scrub; must not be null
   * @return a string of the same length with each character scrubbed
   */
  public static String scrubString(String unscrubbed) {
    return scrubString(unscrubbed, true);
  }

  /**
   * Scrubs a string one UTF-16 code unit at a time via
   * {@link #scrubChar(char, boolean)}.
   *
   * @param unscrubbed string to scrub; must not be null
   * @param scrubValidChars whether valid characters are replaced too, or only
   *        the invalid ones
   * @return a string of the same length as the input
   */
  static String scrubString(String unscrubbed, boolean scrubValidChars) {
    final int length = unscrubbed.length();
    StringBuilder scrubbed = new StringBuilder(length);
    for (int pos = 0; pos < length; pos++) {
      scrubbed.append(scrubChar(unscrubbed.charAt(pos), scrubValidChars));
    }
    return scrubbed.toString();
  }

  /**
   * Scrubs a single UTF-16 code unit.
   *
   * Surrogates become '&lt;' (high) or '&gt;' (low), so a matching pair shows
   * up as "&lt;&gt;". Bidi characters become '|', control characters '^',
   * noncharacters '!', and any other invalid character '#'. A valid character
   * is replaced by its category marker when {@code scrubValid} is set and is
   * returned unchanged otherwise.
   *
   * @param c the code unit to scrub
   * @param scrubValid whether valid characters are replaced as well
   * @return the replacement character
   */
  static char scrubChar(char c, boolean scrubValid) {

    assert Utf16Util.isCodePoint(c) : "isCodePoint() should always be true for char";

    if (Utf16Util.isSurrogate(c)) {
      if (!Utf16Util.isHighSurrogate(c)) {
        assert Utf16Util.isLowSurrogate(c);
        return '>';
      }
      // High surrogate comes first, so a well-formed pair reads "<>".
      return '<';
    }

    final char replacement;
    switch (Utf16Util.isCodePointGoodForBlip(c)) {
    case OK:
      replacement = scrubValid ? scrubValidChar(c) : c;
      break;
    case BIDI:
      replacement = '|';
      break;
    case CONTROL:
      replacement = '^';
      break;
    case NONCHARACTER:
      replacement = '!';
      break;
    default:
      // Other invalid characters
      replacement = '#';
      break;
    }
    return replacement;
  }

  /**
   * Replaces a valid character with a marker identifying the script or block
   * it roughly belongs to, so logs still reveal what kind of text was
   * involved. The ranges are approximate; some encompass more than they
   * describe. The checks are order-dependent: the open-ended "up to" tests
   * rely on the tests before them having already returned.
   *
   * Reference: http://www.alanwood.net/unicode/fontsbyrange.html#u0180
   *
   * @param c a valid character, expected to be at least 0x20
   * @return the marker character for the range containing {@code c}, or '?'
   *         when no range matches
   */
  private static char scrubValidChar(char c) {
    assert c >= 0x20;

    if (c <= 0x7f) {
      return 'a'; // Basic Latin
    }
    if (c <= 0xff) {
      return 'b'; // Latin-1 Supplement
    }
    if (c <= 0x17f) {
      return 'c'; // Latin Extended-A
    }
    if (c <= 0x24f) {
      return 'd'; // Latin Extended-B
    }
    if (inRange(c, 0x2c60, 0x2c7f)) {
      return 'e'; // Latin Extended-C
    }
    if (inRange(c, 0xa720, 0xa7ff)) {
      return 'f'; // Latin Extended-D
    }
    if (c <= 0x2AF) {
      return 'I'; // IPA Extensions
    }
    if (c <= 0x2FF) {
      return 'S'; // Spacing modifier letters
    }
    if (c <= 0x36f) {
      return ':'; // Combining diacritical marks
    }
    if (c <= 0x3FF) {
      return PSI; // Greek and Coptic
    }
    if (c <= 0x52F) {
      return JIA; // Cyrillic & Cyrillic supplement
    }
    if (c <= 0x58f) {
      return ARMENIAN; // Armenian
    }
    if (c <= 0x5ff) {
      return 'H'; // Hebrew
    }
    if (c <= 0x6ff) {
      return 'A'; // Arabic
    }
    if (inRange(c, 0x900, 0x97f)) {
      return 'D'; // Devanagari
    }
    if (inRange(c, 0xe00, 0xe7f)) {
      return 'T'; // Thai
    }
    if (inRange(c, 0x1100, 0x11ff)) {
      return 'K'; // Some Hangul Jamo
    }
    if (inRange(c, 0x20a0, 0x2bff)) {
      return '%'; // Some symbol type things
    }
    if (inRange(c, 0x2e80, 0x2eff)
        || inRange(c, 0x3000, 0x303f)
        || inRange(c, 0x3200, 0x9fff)) {
      return WO; // Some CJK
    }
    if (inRange(c, 0x3040, 0x30ff)) {
      return 'J'; // Some Japanese
    }
    return '?'; // TODO others
  }

  /** Whether {@code c} lies in the inclusive range [{@code low}, {@code high}]. */
  private static boolean inRange(char c, int low, int high) {
    return low <= c && c <= high;
  }

  /** Private constructor: this class only exposes static members and is not instantiated. */
  private DocOpScrub() {}
}