/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.waveprotocol.wave.model.document.util;

import org.waveprotocol.wave.model.document.operation.AnnotationBoundaryMap;
import org.waveprotocol.wave.model.document.operation.Attributes;
import org.waveprotocol.wave.model.document.operation.AttributesUpdate;
import org.waveprotocol.wave.model.document.operation.DocInitialization;
import org.waveprotocol.wave.model.document.operation.DocOp;
import org.waveprotocol.wave.model.document.operation.DocOpCursor;
import org.waveprotocol.wave.model.document.operation.impl.AttributesImpl;
import org.waveprotocol.wave.model.document.operation.impl.AttributesUpdateImpl;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuffer;
import org.waveprotocol.wave.model.document.operation.impl.DocOpBuilder;
import org.waveprotocol.wave.model.document.operation.impl.DocOpUtil;
import org.waveprotocol.wave.model.document.operation.util.ImmutableStateMap.Attribute;
import org.waveprotocol.wave.model.document.operation.util.ImmutableUpdateMap.AttributeUpdate;
import org.waveprotocol.wave.model.util.CollectionUtils;
import org.waveprotocol.wave.model.util.Preconditions;
import org.waveprotocol.wave.model.util.StringMap;
import org.waveprotocol.wave.model.util.StringSet;
import org.waveprotocol.wave.model.util.Utf16Util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Utility for scrubbing user-sensitive information out of operations.
 *
 * It is not intended to make reverse-engineering the original content totally
 * impossible, given enough time and effort. It is meant to prevent
 * casual/accidental viewing of user data in logs, while still preserving
 * valuable debugging information, such as the type of characters in a string
 * (so we can tell, for example, if the error is related to CJK input or not).
 *
 * @author danilatos@google.com (Daniel Danilatos)
 */
public final class DocOpScrub {

  /**
   * An implementation of {@link AnnotationBoundaryMap} that tries its best to
   * be well formed (by sorting its input data) but does not perform any strong
   * validity check.
   */
  public static class UncheckedAnnotationBoundary implements AnnotationBoundaryMap {

    /**
     * A single annotation change: a non-null key together with its old and new
     * values. Triplets are ordered by key only.
     */
    static class Triplet implements Comparable<Triplet> {
      private final String key, oldVal, newVal;

      private Triplet(String key, String oldVal, String newVal) {
        Preconditions.checkNotNull(key, "triplet key");
        this.key = key;
        this.oldVal = oldVal;
        this.newVal = newVal;
      }

      @Override
      public int compareTo(Triplet o) {
        return key.compareTo(o.key);
      }
    }

    /** Sorted keys of the annotations that end at this boundary. */
    private final String[] ends;

    /** Annotation changes at this boundary, sorted by key. */
    private final Triplet[] changes;

    /**
     * @param triplets flattened (key, old value, new value) entries; the length
     *        must be a multiple of 3. The array itself is not retained.
     * @param ends keys of ending annotations. The array is copied before being
     *        sorted, so the caller's array is left untouched.
     */
    public UncheckedAnnotationBoundary(String[] triplets, String[] ends) {
      Preconditions.checkArgument(triplets.length % 3 == 0,
          "triplets.length not a multiple of 3");

      this.ends = copy(ends);
      Arrays.sort(this.ends);

      this.changes = new Triplet[triplets.length / 3];
      for (int i = 0; i < triplets.length; i += 3) {
        this.changes[i / 3] = new Triplet(triplets[i], triplets[i + 1], triplets[i + 2]);
      }
      Arrays.sort(this.changes);
    }

    /**
     * Copies an array. GWT does not seem to support Arrays.copyOf()
     */
    private static String[] copy(String[] input) {
      String[] ret = new String[input.length];
      for (int i = 0; i < input.length; i++) {
        ret[i] = input[i];
      }
      return ret;
    }

    @Override
    public int changeSize() {
      return changes.length;
    }

    @Override
    public int endSize() {
      return ends.length;
    }

    @Override
    public String getChangeKey(int i) {
      return changes[i].key;
    }

    @Override
    public String getEndKey(int i) {
      return ends[i];
    }

    @Override
    public String getNewValue(int i) {
      return changes[i].newVal;
    }

    @Override
    public String getOldValue(int i) {
      return changes[i].oldVal;
    }
  }

  private static boolean shouldScrubByDefault = true;

  /**
   * Sets whether {@link #maybeScrub(DocOp)} should scrub.
   */
  public static void setShouldScrubByDefault(boolean shouldScrub) {
    shouldScrubByDefault = shouldScrub;
  }

  /**
   * Whether operations should be scrubbed by default. This should be true for
   * non-debug environments to protect privacy.
   *
   * See {@link #setShouldScrubByDefault(boolean)}.
   */
  public static boolean shouldScrubByDefault() {
    return shouldScrubByDefault;
  }

  /**
   * Number of leading characters to avoid scrubbing in cases where we do
   * not scrub the entire string
   */
  static final int CHARS_TO_LEAVE = 3;

  /**
   * Stand-in characters that {@link #scrubValidChar(char)} substitutes for the
   * Greek, Cyrillic, Armenian and CJK ranges respectively.
   */
  static final char PSI = 0x3c8, JIA = 0x42f, ARMENIAN = 0x554, WO = 0x6211;

  /** Strategy for turning a sensitive string into its scrubbed form. */
  public interface StringScrubber {
    String scrub(String input);
  }

  /**
   * Scrubs names so that, within one cache instance, the same input always
   * yields the same output and different inputs never yield the same output.
   * Collisions between scrubbed forms are resolved by appending
   * <code>_N</code>, where N is a counter local to the cache.
   */
  public static class ScrubCache {
    private final StringScrubber format;

    /** Maps each input seen so far to the scrubbed value handed out for it. */
    private final StringMap<String> alreadyScrubbed = CollectionUtils.createStringMap();

    /** Every scrubbed value handed out so far, used to detect collisions. */
    private final StringSet scrubbings = CollectionUtils.createStringSet();

    private int uniqueSuffix = 0;

    public ScrubCache(StringScrubber format) {
      this.format = format;
    }

    /**
     * Scrubs the input with this cache's scrubber, disambiguating the result
     * if another input has already been scrubbed to the same value.
     *
     * @param input the name to scrub
     * @return the scrubbed name; stable across repeated calls with the same
     *         input on this cache
     */
    String scrubUniquely(String input) {
      if (alreadyScrubbed.containsKey(input)) {
        return alreadyScrubbed.get(input);
      }

      String bareScrubbed = format.scrub(input);
      String scrubbed = bareScrubbed;
      // Keep bumping the suffix until the candidate has not been used before.
      while (scrubbings.contains(scrubbed)) {
        scrubbed = bareScrubbed + '_' + (++uniqueSuffix);
      }

      alreadyScrubbed.put(input, scrubbed);
      scrubbings.add(scrubbed);
      return scrubbed;
    }
  }

  /** Scrubs attribute names with {@link #scrubMostString(String)}. */
  private static final StringScrubber attrNameScrubber = new StringScrubber() {
    @Override
    public String scrub(String input) {
      return scrubMostString(input);
    }
  };

  /** Scrubs annotation keys with {@link #scrubMostAnnotationKey(String)}. */
  private static final StringScrubber annotationKeyScrubber = new StringScrubber() {
    @Override
    public String scrub(String input) {
      return scrubMostAnnotationKey(input);
    }
  };

  /**
   * Scrubs the given operation. Ill-formed input is permitted but may lead to
   * ill-formed output. Invalid characters will be clearly noted.
   *
   * If scrubbing throws a {@link RuntimeException}, an operation containing a
   * single characters component describing the failure is returned instead.
   */
  public static DocOp scrub(final DocOp op) {
    try {
      DocOpBuffer b = new DocOpBuffer();
      op.apply(createScrubber(b));
      return b.finishUnchecked();
    } catch (RuntimeException e) {
      // This should not really happen unless perhaps the input operation has some
      // diabolically broken implementation of apply or of attribute or annotation
      // datastructures.
      return new DocOpBuilder().characters("Scrub exploded: " + e).build();
    }
  }

  /**
   * Same as {@link #scrub(DocOp)} but deals with {@link DocInitialization}s
   */
  public static DocInitialization scrub(final DocInitialization op) {
    return DocOpUtil.asInitialization(scrub((DocOp) op));
  }

  /**
   * Maybe scrub the given operation.
   *
   * See {@link #shouldScrubByDefault()} and {@link #scrub(DocOp)}
   */
  public static DocOp maybeScrub(final DocOp op) {
    return shouldScrubByDefault ? scrub(op) : op;
  }

  /**
   * Maybe scrub the given initialization.
   *
   * See {@link #shouldScrubByDefault()} and {@link #scrub(DocInitialization)}
   */
  public static DocInitialization maybeScrub(final DocInitialization op) {
    return shouldScrubByDefault ? scrub(op) : op;
  }

  /**
   * Creates a cursor that scrubs every component it receives and forwards the
   * scrubbed component to the target. Element types and retain counts are
   * passed through unchanged; character data, attribute names/values and
   * annotation keys/values are scrubbed.
   *
   * Each returned cursor has its own name caches, so a given attribute name or
   * annotation key is scrubbed consistently within one cursor.
   *
   * @param target cursor receiving the scrubbed components
   * @return a scrubbing cursor wrapping the target
   */
  public static DocOpCursor createScrubber(final DocOpCursor target) {
    final ScrubCache attrNames = new ScrubCache(attrNameScrubber);
    final ScrubCache annotationNames = new ScrubCache(annotationKeyScrubber);

    return new DocOpCursor() {
      @Override
      public void deleteCharacters(String chars) {
        target.deleteCharacters(scrubString(chars));
      }

      @Override
      public void deleteElementEnd() {
        target.deleteElementEnd();
      }

      @Override
      public void deleteElementStart(String type, Attributes attrs) {
        target.deleteElementStart(type, scrubAttributes(attrs, attrNames));
      }

      @Override
      public void replaceAttributes(Attributes oldAttrs, Attributes newAttrs) {
        target.replaceAttributes(
            scrubAttributes(oldAttrs, attrNames),
            scrubAttributes(newAttrs, attrNames));
      }

      @Override
      public void retain(int itemCount) {
        target.retain(itemCount);
      }

      @Override
      public void updateAttributes(AttributesUpdate attrUpdate) {
        target.updateAttributes(scrubAttributesUpdate(attrUpdate, attrNames));
      }

      @Override
      public void annotationBoundary(AnnotationBoundaryMap map) {
        target.annotationBoundary(scrubAnnotationBoundary(map, annotationNames));
      }

      @Override
      public void characters(String chars) {
        target.characters(scrubString(chars));
      }

      @Override
      public void elementEnd() {
        target.elementEnd();
      }

      @Override
      public void elementStart(String type, Attributes attrs) {
        target.elementStart(type, scrubAttributes(attrs, attrNames));
      }
    };
  }

  /**
   * Scrubs an annotation boundary. Keys are scrubbed uniquely through the given
   * cache; old and new values are scrubbed with
   * {@link #scrubMostString(String)}.
   *
   * @param unscrubbed boundary to scrub
   * @param nameScrubber cache used to scrub annotation keys
   * @return a new, unchecked boundary holding the scrubbed data
   */
  public static AnnotationBoundaryMap scrubAnnotationBoundary(AnnotationBoundaryMap unscrubbed,
      ScrubCache nameScrubber) {
    String[] ends = new String[unscrubbed.endSize()];
    String[] changeTriplets = new String[unscrubbed.changeSize() * 3];

    for (int i = 0; i < unscrubbed.endSize(); i++) {
      ends[i] = nameScrubber.scrubUniquely(unscrubbed.getEndKey(i));
    }

    for (int i = 0; i < unscrubbed.changeSize(); i++) {
      changeTriplets[i * 3] = nameScrubber.scrubUniquely(unscrubbed.getChangeKey(i));
      changeTriplets[i * 3 + 1] = scrubMostString(unscrubbed.getOldValue(i));
      changeTriplets[i * 3 + 2] = scrubMostString(unscrubbed.getNewValue(i));
    }

    return new UncheckedAnnotationBoundary(changeTriplets, ends);
  }

  /**
   * Scrubs a set of attributes. Names are scrubbed uniquely through the given
   * cache; values are scrubbed with {@link #scrubMostString(String)}.
   *
   * @param unscrubbed attributes to scrub
   * @param nameScrubber cache used to scrub attribute names
   * @return unchecked attributes holding the scrubbed data
   */
  public static Attributes scrubAttributes(Attributes unscrubbed, ScrubCache nameScrubber) {
    List<Attribute> list = new ArrayList<Attribute>();
    for (Map.Entry<String, String> entry : unscrubbed.entrySet()) {
      list.add(new Attribute(
          nameScrubber.scrubUniquely(entry.getKey()),
          scrubMostString(entry.getValue())));
    }
    return AttributesImpl.fromUnsortedAttributesUnchecked(list);
  }

  /**
   * Scrubs an attributes update. Names are scrubbed uniquely through the given
   * cache; old and new values are scrubbed with
   * {@link #scrubMostString(String)}.
   *
   * @param unscrubbed update to scrub
   * @param nameScrubber cache used to scrub attribute names
   * @return an unchecked update holding the scrubbed data
   */
  public static AttributesUpdate scrubAttributesUpdate(AttributesUpdate unscrubbed,
      ScrubCache nameScrubber) {
    List<AttributeUpdate> list = new ArrayList<AttributeUpdate>();
    for (int i = 0; i < unscrubbed.changeSize(); i++) {
      list.add(new AttributeUpdate(
          nameScrubber.scrubUniquely(unscrubbed.getChangeKey(i)),
          scrubMostString(unscrubbed.getOldValue(i)),
          scrubMostString(unscrubbed.getNewValue(i))));
    }
    return AttributesUpdateImpl.fromUnsortedUpdatesUnchecked(list);
  }

  /**
   * Scrubs most of an annotation key. Uses {@link #scrubMostString(String)} on
   * each slash (/) separated component of the key, preserving the original
   * slashes.
   *
   * @param unscrubbed key
   * @return mostly scrubbed key
   */
  public static String scrubMostAnnotationKey(String unscrubbed) {
    // pass -1 to split to prevent dropping trailing empty strings
    String[] parts = unscrubbed.split("/", -1);
    for (int i = 0; i < parts.length; i++) {
      parts[i] = scrubMostString(parts[i]);
    }
    return CollectionUtils.join('/', parts);
  }

  /**
   * Scrubs most of a string based on a heuristic that attempts to clean out
   * user sensitive data while still retaining some amount of information that
   * would be useful for debugging.
   *
   * If the string looks a bit like an email address, then everything but the @
   * symbol is scrubbed. Otherwise, everything but the first few characters is
   * scrubbed. In any case, invalid characters are always scrubbed for
   * readability; this includes strings shorter than {@link #CHARS_TO_LEAVE},
   * whose valid characters are otherwise left as they are. The motivation in
   * the case of email addresses is both to make it clear that the piece of data
   * is an email address, and to be a little stronger about removing the
   * identifiable information
   *
   * NOTE(danilatos): Consider also always scrubbing confusing-to-print
   * characters such as RTL or ligature forming characters?
   *
   * @param unscrubbed unscrubbed string. may be null, in which case null is
   *        returned.
   * @return mostly scrubbed string
   */
  public static String scrubMostString(String unscrubbed) {
    if (unscrubbed == null) {
      return null;
    }

    int index = unscrubbed.indexOf('@');
    if (index != -1 && unscrubbed.lastIndexOf('@') == index) {
      // If it looks vaguely like an email address (contains a single '@'),
      // then scrub everything except for the '@' symbol. The position of the
      // only '@' is already known, so slice around it directly.
      return scrubString(unscrubbed.substring(0, index))
          + '@'
          + scrubString(unscrubbed.substring(index + 1));
    } else if (unscrubbed.length() >= CHARS_TO_LEAVE) {
      // Otherwise scrub everything but the first few characters,
      // which we leave to aid debugging.
      return scrubString(unscrubbed.substring(0, CHARS_TO_LEAVE), false)
          + scrubString(unscrubbed.substring(CHARS_TO_LEAVE), true);
    } else {
      // Too short to be worth hiding, but invalid characters must still be
      // replaced, exactly as for the leading characters of longer strings.
      return scrubString(unscrubbed, false);
    }
  }

  /**
   * Scrubs every character of the string, valid or not.
   *
   * @param unscrubbed string to scrub, must not be null
   * @return a string of the same length with every character replaced
   */
  public static String scrubString(String unscrubbed) {
    return scrubString(unscrubbed, true);
  }

  /**
   * Scrubs a string character by character with
   * {@link #scrubChar(char, boolean)}.
   *
   * @param unscrubbed string to scrub, must not be null
   * @param scrubValidChars whether valid characters are replaced as well; when
   *        false only surrogates and invalid characters are replaced
   * @return a string of the same length as the input
   */
  static String scrubString(String unscrubbed, boolean scrubValidChars) {
    char[] chars = new char[unscrubbed.length()];
    for (int i = 0; i < unscrubbed.length(); i++) {
      chars[i] = scrubChar(unscrubbed.charAt(i), scrubValidChars);
    }
    return new String(chars);
  }

  /**
   * Scrubs a single UTF-16 code unit. Surrogates and characters that are not
   * good for a blip are always replaced by a marker character; valid
   * characters are replaced only if requested.
   *
   * @param c code unit to scrub
   * @param scrubValid whether a valid character should be replaced by its
   *        range marker (see {@link #scrubValidChar(char)}) or kept as is
   * @return the replacement character
   */
  static char scrubChar(char c, boolean scrubValid) {
    assert Utf16Util.isCodePoint(c) : "isCodePoint() should always be true for char";

    if (Utf16Util.isSurrogate(c)) {
      // High surrogate comes first. Matching pairs should appear as "<>"
      if (Utf16Util.isHighSurrogate(c)) {
        return '<';
      } else {
        assert Utf16Util.isLowSurrogate(c);
        return '>';
      }
    }

    switch (Utf16Util.isCodePointGoodForBlip(c)) {
      case OK:
        return scrubValid ? scrubValidChar(c) : c;
      case BIDI:
        return '|';
      case CONTROL:
        return '^';
      case NONCHARACTER:
        return '!';
      default:
        // Other invalid characters
        return '#';
    }
  }

  /**
   * Maps a valid character to a marker identifying the (approximate) Unicode
   * range it belongs to, so that the script of the original text remains
   * recognisable after scrubbing.
   *
   * The order of the checks matters: the open-ended {@code c <= X} tests rely
   * on every lower range having been handled by an earlier branch.
   */
  private static char scrubValidChar(char c) {
    assert c >= 0x20;

    // Reference: http://www.alanwood.net/unicode/fontsbyrange.html#u0180
    // The ranges are approximate, some encompass more than they describe.
    if (c <= 0x7f) {
      // Basic Latin
      return 'a';
    } else if (c <= 0xff) {
      // Latin-1 Supplement
      return 'b';
    } else if (c <= 0x17f) {
      // Latin Extended-A
      return 'c';
    } else if (c <= 0x24f) {
      // Latin Extended-B
      return 'd';
    } else if (0x2c60 <= c && c <= 0x2c7f) {
      // Latin Extended-C
      return 'e';
    } else if (0xa720 <= c && c <= 0xa7ff) {
      // Latin Extended-D
      return 'f';
    } else if (c <= 0x2AF) {
      // IPA Extensions
      return 'I';
    } else if (c <= 0x2FF) {
      // Spacing modifier letters
      return 'S';
    } else if (c <= 0x36f) {
      // Combining diacritical marks
      return ':';
    } else if (c <= 0x3FF) {
      // Greek and coptic
      return PSI;
    } else if (c <= 0x52F) {
      // Cyrillic & Cyrillic supplement
      return JIA;
    } else if (c <= 0x58f) {
      // Armenian
      return ARMENIAN;
    } else if (c <= 0x5ff) {
      // Hebrew
      return 'H';
    } else if (c <= 0x6ff) {
      // Arabic
      return 'A';
    } else if (0x900 <= c && c <= 0x97f) {
      // Devanagari
      return 'D';
    } else if (0xe00 <= c && c <= 0xe7f) {
      // Thai
      return 'T';
    } else if (0x1100 <= c && c <= 0x11ff) {
      // Some Hangul Jamo
      return 'K';
    } else if (0x20a0 <= c && c <= 0x2bff) {
      // Some symbol type things
      return '%';
    } else if (0x2e80 <= c && c <= 0x2eff
        || 0x3000 <= c && c <= 0x303f
        || 0x3200 <= c && c <= 0x9fff) {
      // Some CJK
      return WO;
    } else if (0x3040 <= c && c <= 0x30ff) {
      // Some Japanese
      return 'J';
    } else {
      // TODO others
      return '?';
    }
  }

  private DocOpScrub() {}
}