// Copyright 2010 Google Inc. All Rights Reserved.
package org.waveprotocol.wave.model.document.util;
import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMap;
import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMapBuilder;
import org.waveprotocol.wave.model.document.operation.Attributes;
import org.waveprotocol.wave.model.document.operation.AttributesUpdate;
import org.waveprotocol.wave.model.document.operation.BufferedDocOp;
import org.waveprotocol.wave.model.document.operation.DocOp;
import org.waveprotocol.wave.model.document.operation.DocOpCursor;
import org.waveprotocol.wave.model.document.operation.impl.AttributesImpl;
import org.waveprotocol.wave.model.document.operation.impl.AttributesUpdateImpl;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuffer;
import org.waveprotocol.wave.model.document.operation.util.ImmutableStateMap.Attribute;
import org.waveprotocol.wave.model.document.operation.util.ImmutableUpdateMap.AttributeUpdate;
import org.waveprotocol.wave.model.util.CollectionUtils;
import org.waveprotocol.wave.model.util.Utf16Util;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Utility for scrubbing user-sensitive information out of operations.
*
* It is not intended to make reverse-engineering the original content totally
* impossible, given enough time and effort. It is meant to prevent
* casual/accidental viewing of user data in logs, while still preserving
* valuable debugging information, such as the type of characters in a string
* (so we can tell, for example, if the error is related to CJK input or not).
*
* @author danilatos@google.com (Daniel Danilatos)
*/
public final class DocOpScrub {

  /**
   * Number of leading characters to avoid scrubbing in cases where we do
   * not scrub the entire string
   */
  static final int CHARS_TO_LEAVE = 3;

  /**
   * Placeholder characters that {@code scrubValidChar} substitutes for
   * characters in the Greek, Cyrillic, Armenian and CJK ranges respectively.
   */
  static final char PSI = 0x3c8, JIA = 0x42f, ARMENIAN = 0x554, WO = 0x6211;

  /**
   * Creates a scrubbed copy of the given operation.
   *
   * Character data is fully scrubbed, attribute and annotation data is mostly
   * scrubbed (see {@link #scrubMostString(String)}), while retain counts,
   * element ends and element type names are copied through unchanged.
   *
   * @param op does not have to be well formed. invalid characters will be
   *        clearly noted.
   * @return the output will be well formed only if the input was well formed
   */
  public static BufferedDocOp scrub(DocOp op) {
    final DocOpBuffer b = new DocOpBuffer();

    // Replay the op into the buffer, scrubbing each component on the way.
    op.apply(new DocOpCursor() {
      @Override
      public void deleteCharacters(String chars) {
        b.deleteCharacters(scrubString(chars));
      }

      @Override
      public void deleteElementEnd() {
        b.deleteElementEnd();
      }

      @Override
      public void deleteElementStart(String type, Attributes attrs) {
        // The element type is kept as is; only the attributes are scrubbed.
        b.deleteElementStart(type, scrubAttributes(attrs));
      }

      @Override
      public void replaceAttributes(Attributes oldAttrs, Attributes newAttrs) {
        b.replaceAttributes(scrubAttributes(oldAttrs), scrubAttributes(newAttrs));
      }

      @Override
      public void retain(int itemCount) {
        b.retain(itemCount);
      }

      @Override
      public void updateAttributes(AttributesUpdate attrUpdate) {
        b.updateAttributes(scrubAttributesUpdate(attrUpdate));
      }

      @Override
      public void annotationBoundary(AnnotationBoundaryMap map) {
        b.annotationBoundary(scrubAnnotationBoundary(map));
      }

      @Override
      public void characters(String chars) {
        b.characters(scrubString(chars));
      }

      @Override
      public void elementEnd() {
        b.elementEnd();
      }

      @Override
      public void elementStart(String type, Attributes attrs) {
        // The element type is kept as is; only the attributes are scrubbed.
        b.elementStart(type, scrubAttributes(attrs));
      }
    });

    // NOTE(review): presumably finishUnchecked() skips well-formedness
    // validation, which would fit the "op does not have to be well formed"
    // contract above - confirm against DocOpBuffer.
    return b.finishUnchecked();
  }

  /**
   * Creates a scrubbed copy of an annotation boundary map.
   *
   * End keys and change keys are scrubbed with
   * {@link #scrubMostAnnotationKey(String)}; old and new values are scrubbed
   * with {@link #scrubMostString(String)}.
   *
   * @param unscrubbed boundary map to copy
   * @return mostly scrubbed boundary map
   */
  public static AnnotationBoundaryMap scrubAnnotationBoundary(AnnotationBoundaryMap unscrubbed) {
    AnnotationBoundaryMapBuilder b = new AnnotationBoundaryMapBuilder();

    for (int i = 0; i < unscrubbed.endSize(); i++) {
      b.end(scrubMostAnnotationKey(unscrubbed.getEndKey(i)));
    }

    for (int i = 0; i < unscrubbed.changeSize(); i++) {
      b.change(scrubMostAnnotationKey(unscrubbed.getChangeKey(i)),
          scrubMostString(unscrubbed.getOldValue(i)),
          scrubMostString(unscrubbed.getNewValue(i)));
    }

    return b.build();
  }

  /**
   * Creates a scrubbed copy of an attribute map. Both the attribute names and
   * the attribute values are scrubbed with {@link #scrubMostString(String)}.
   *
   * NOTE(review): distinct names can scrub to the same string; how
   * {@code AttributesImpl.fromUnsortedAttributes} treats duplicate names is
   * not visible from here - confirm.
   *
   * @param unscrubbed attributes to copy
   * @return mostly scrubbed attributes
   */
  public static Attributes scrubAttributes(Attributes unscrubbed) {
    List<Attribute> list = new ArrayList<Attribute>();
    for (Map.Entry<String, String> entry : unscrubbed.entrySet()) {
      list.add(new Attribute(scrubMostString(entry.getKey()), scrubMostString(entry.getValue())));
    }
    return AttributesImpl.fromUnsortedAttributes(list);
  }

  /**
   * Creates a scrubbed copy of an attributes update. The changed names, the
   * old values and the new values are all scrubbed with
   * {@link #scrubMostString(String)}.
   *
   * @param unscrubbed attributes update to copy
   * @return mostly scrubbed attributes update
   */
  public static AttributesUpdate scrubAttributesUpdate(AttributesUpdate unscrubbed) {
    List<AttributeUpdate> list = new ArrayList<AttributeUpdate>();
    int size = unscrubbed.changeSize();
    for (int i = 0; i < size; i++) {
      list.add(new AttributeUpdate(scrubMostString(unscrubbed.getChangeKey(i)),
          scrubMostString(unscrubbed.getOldValue(i)),
          scrubMostString(unscrubbed.getNewValue(i))));
    }
    return AttributesUpdateImpl.fromUnsortedUpdates(list);
  }

  /**
   * Scrubs most of an annotation key. Uses {@link #scrubMostString(String)} on
   * each slash (/) separated component of the key, preserving the original
   * slashes.
   *
   * @param unscrubbed key
   * @return mostly scrubbed key
   */
  public static String scrubMostAnnotationKey(String unscrubbed) {
    // pass -1 to split to prevent dropping trailing empty strings
    String[] parts = unscrubbed.split("/", -1);
    for (int i = 0; i < parts.length; i++) {
      parts[i] = scrubMostString(parts[i]);
    }
    return CollectionUtils.join('/', parts);
  }

  /**
   * Scrubs most of a string based on a heuristic that attempts to clean out
   * user sensitive data while still retaining some amount of information that
   * would be useful for debugging.
   *
   * If the string looks a bit like an email address, then everything but the @
   * symbol is scrubbed. Otherwise, everything but the first few characters is
   * scrubbed. In any case, invalid characters are always scrubbed for
   * readability, including in strings that are too short to have any of their
   * valid characters scrubbed. The motivation in the case of email addresses
   * is both to make it clear that the piece of data is an email address, and
   * to be a little stronger about removing the identifiable information.
   *
   * NOTE(danilatos): Consider also always scrubbing confusing-to-print
   * characters such as RTL or ligature forming characters?
   *
   * @param unscrubbed unscrubbed string. may be null, in which case null is
   *        returned.
   * @return mostly scrubbed string
   */
  public static String scrubMostString(String unscrubbed) {
    if (unscrubbed == null) {
      return null;
    }

    int index = unscrubbed.indexOf('@');
    if (index != -1 && unscrubbed.lastIndexOf('@') == index) {
      // If it looks vaguely like an email address (contains a single '@'),
      // then scrub everything except for the '@' symbol. Either side may be
      // empty.
      return scrubString(unscrubbed.substring(0, index)) + '@'
          + scrubString(unscrubbed.substring(index + 1));
    }

    // Otherwise scrub everything but the first few characters, which we leave
    // to aid debugging. The leading part still goes through scrubString so
    // that invalid characters are replaced even when the whole string is
    // shorter than CHARS_TO_LEAVE.
    int keep = Math.min(CHARS_TO_LEAVE, unscrubbed.length());
    return scrubString(unscrubbed.substring(0, keep), false)
        + scrubString(unscrubbed.substring(keep), true);
  }

  /**
   * Scrubs every character of the given string, valid or not.
   *
   * @param unscrubbed string to scrub. must not be null.
   * @return string of the same length with every character replaced
   */
  public static String scrubString(String unscrubbed) {
    return scrubString(unscrubbed, true);
  }

  /**
   * Scrubs a string one UTF-16 code unit at a time using
   * {@link #scrubChar(char, boolean)}.
   *
   * @param unscrubbed string to scrub. must not be null.
   * @param scrubValidChars whether valid characters are replaced as well. if
   *        false, only invalid characters are replaced.
   * @return scrubbed string of the same length as the input
   */
  static String scrubString(String unscrubbed, boolean scrubValidChars) {
    char[] chars = new char[unscrubbed.length()];
    for (int i = 0; i < unscrubbed.length(); i++) {
      chars[i] = scrubChar(unscrubbed.charAt(i), scrubValidChars);
    }
    return new String(chars);
  }

  /**
   * Scrubs a single UTF-16 code unit.
   *
   * Surrogates become '<' (high) or '>' (low). Characters that are not good
   * for a blip become '|' (bidi), '^' (control), '!' (noncharacter) or '#'
   * (anything else). Valid characters are either kept or replaced by a
   * placeholder for their approximate Unicode range.
   *
   * @param c code unit to scrub
   * @param scrubValid whether a valid character is replaced by a placeholder
   *        (true) or returned unchanged (false)
   * @return replacement character
   */
  static char scrubChar(char c, boolean scrubValid) {
    assert Utf16Util.isCodePoint(c) : "isCodePoint() should always be true for char";

    if (Utf16Util.isSurrogate(c)) {
      // High surrogate comes first. Matching pairs should appear as "<>"
      if (Utf16Util.isHighSurrogate(c)) {
        return '<';
      } else {
        assert Utf16Util.isLowSurrogate(c);
        return '>';
      }
    }

    switch (Utf16Util.isCodePointGoodForBlip(c)) {
      case OK:
        return scrubValid ? scrubValidChar(c) : c;
      case BIDI:
        return '|';
      case CONTROL:
        return '^';
      case NONCHARACTER:
        return '!';
      default:
        // Other invalid characters
        return '#';
    }
  }

  /**
   * Maps a valid character to a placeholder that identifies its approximate
   * Unicode range, so the script of the original text stays recognisable.
   *
   * @param c valid character, asserted to be at least 0x20
   * @return placeholder for the range containing c, or '?' if the range is
   *         not covered
   */
  private static char scrubValidChar(char c) {
    assert c >= 0x20;

    // Reference: http://www.alanwood.net/unicode/fontsbyrange.html#u0180
    // The ranges are approximate, some encompass more than they describe.
    if (c <= 0x7f) { // Basic Latin
      return 'a';
    } else if (c <= 0xff) { // Latin-1 Supplement
      return 'b';
    } else if (c <= 0x17f) { // Latin Extended-A
      return 'c';
    } else if (c <= 0x24f) { // Latin Extended-B
      return 'd';
    } else if (0x2c60 <= c && c <= 0x2c7f) { // Latin Extended-C
      return 'e';
    } else if (0xa720 <= c && c <= 0xa7ff) { // Latin Extended-D
      return 'f';
    } else if (c <= 0x2AF) { // IPA Extensions
      return 'I';
    } else if (c <= 0x2FF) { // Spacing modifier letters
      return 'S';
    } else if (c <= 0x36f) { // Combining diacritical marks
      return ':';
    } else if (c <= 0x3FF) { // Greek and coptic
      return PSI;
    } else if (c <= 0x52F) { // Cyrillic & Cyrillic supplement
      return JIA;
    } else if (c <= 0x58f) { // Armenian
      return ARMENIAN;
    } else if (c <= 0x5ff) { // Hebrew
      return 'H';
    } else if (c <= 0x6ff) { // Arabic
      return 'A';
    } else if (0x900 <= c && c <= 0x97f) { // Devanagari
      return 'D';
    } else if (0xe00 <= c && c <= 0xe7f) { // Thai
      return 'T';
    } else if (0x1100 <= c && c <= 0x11ff) { // Some Hangul Jamo
      return 'K';
    } else if (0x20a0 <= c && c <= 0x2bff) { // Some symbol type things
      return '%';
    } else if (0x2e80 <= c && c <= 0x2eff ||
        0x3000 <= c && c <= 0x303f ||
        0x3200 <= c && c <= 0x9fff) { // Some CJK
      return WO;
    } else if (0x3040 <= c && c <= 0x30ff) { // Some Japanese
      return 'J';
    } else { // TODO others
      return '?';
    }
  }

  /** Static utility class, not instantiable. */
  private DocOpScrub() {}
}