blob: d034c86cae090212487a62674cb0315a808e3c36 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/randomized_encoder.h"
#include "base/strings/stringprintf.h"
#include "net/base/hex_utils.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace {
constexpr size_t kBitsPerByte = 8;
constexpr size_t kMaxLengthInBytes = 64;
constexpr size_t kMaxLengthInBits = kMaxLengthInBytes * kBitsPerByte;
// Get the |i|-th bit of |s| where |i| counts up from the 0-bit of the first
// character in |s|. It is expected that the caller guarantees that |i| is a
// valid bit-offset into |s|
bool GetBit(base::StringPiece s, size_t i) {
DCHECK_LT(i / kBitsPerByte, s.length());
return static_cast<bool>((s[i / kBitsPerByte]) & (1 << (i % kBitsPerByte)));
}
// This is a reference encoder implementation. This implementation performs the
// all bits encoding one full byte at a time and then packs the selected bits
// into a final output buffer.
std::string ReferenceEncodeImpl(base::StringPiece coins,
base::StringPiece noise,
base::StringPiece value,
size_t bit_offset,
size_t bit_stride) {
// Encode all of the bits.
std::string all_bits = noise.as_string();
size_t value_length = std::min(value.length(), kMaxLengthInBytes);
for (size_t i = 0; i < value_length; ++i) {
all_bits[i] = (value[i] & coins[i]) | (all_bits[i] & ~coins[i]);
}
// Select the only the ones matching bit_offset and bit_stride.
std::string output(kMaxLengthInBytes / bit_stride, 0);
size_t src_offset = bit_offset;
size_t dst_offset = 0;
while (src_offset < kMaxLengthInBits) {
bool bit_value = GetBit(all_bits, src_offset);
output[dst_offset / kBitsPerByte] |=
(bit_value << (dst_offset % kBitsPerByte));
src_offset += bit_stride;
dst_offset += 1;
}
return output;
}
// A test version of the RandomizedEncoder class. Exposes "ForTest" methods.
class TestRandomizedEncoder : public autofill::RandomizedEncoder {
public:
using RandomizedEncoder::RandomizedEncoder;
using RandomizedEncoder::GetCoins;
using RandomizedEncoder::GetNoise;
};
// Data structure used to drive the encoding test cases.
struct EncodeParams {
// The type of encoding to perform with the RandomizedEncoder.
autofill::AutofillRandomizedValue_EncodingType encoding_type;
// The bit offset to start from with the reference encoder.
size_t bit_offset;
// The bit stride to select the next bit to encode with the reference encoder.
size_t bit_stride;
};
// A table to test cases, mapping encoding scheme to the reference encoder.
const EncodeParams kEncodeParams[] = {
// One bit per byte. These all require 8 bytes to encode and have 8-bit
// strides, starting from a different initial bit offset.
{autofill::AutofillRandomizedValue_EncodingType_BIT_0, 0, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_1, 1, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_2, 2, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_3, 3, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_4, 4, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_5, 5, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_6, 6, 8},
{autofill::AutofillRandomizedValue_EncodingType_BIT_7, 7, 8},
// Four bits per byte. These require 32 bytes to encode and have 2-bit
// strides/
{autofill::AutofillRandomizedValue_EncodingType_EVEN_BITS, 0, 2},
{autofill::AutofillRandomizedValue_EncodingType_ODD_BITS, 1, 2},
// All bits per byte. This require 64 bytes to encode and has a 1-bit
// stride.
{autofill::AutofillRandomizedValue_EncodingType_ALL_BITS, 0u, 1},
};
using RandomizedEncoderTest = ::testing::TestWithParam<EncodeParams>;
} // namespace
TEST_P(RandomizedEncoderTest, Encode) {
const autofill::FormSignature form_signature = 0x1234567812345678;
const autofill::FieldSignature field_signature = 0xCAFEBABE;
const std::string data_type = "css_class";
const EncodeParams& params = GetParam();
const std::string value("This is some text for testing purposes.");
EXPECT_LT(value.length(), kMaxLengthInBytes);
TestRandomizedEncoder encoder("this is the seed", params.encoding_type);
// Encode the output string.
std::string actual_result =
encoder.Encode(form_signature, field_signature, data_type, value);
// Capture the coin and noise bits used for the form, field and metadata type.
std::string coins =
encoder.GetCoins(form_signature, field_signature, data_type);
std::string noise =
encoder.GetNoise(form_signature, field_signature, data_type);
// Use the reference encoder implementation to get the expected output.
std::string expected_result = ReferenceEncodeImpl(
coins, noise, value, params.bit_offset, params.bit_stride);
// The results should be the same.
EXPECT_EQ(kMaxLengthInBytes / params.bit_stride, actual_result.length());
EXPECT_EQ(expected_result, actual_result);
}
TEST_P(RandomizedEncoderTest, EncodeLarge) {
const autofill::FormSignature form_signature = 0x8765432187654321;
const autofill::FieldSignature field_signature = 0xDEADBEEF;
const std::string data_type = "html_name";
const EncodeParams& params = GetParam();
const std::string value(
"This is some text for testing purposes. It exceeds the maximum encoding "
"size. This serves to validate that truncation is performed. Lots and "
"lots of text. Yay!");
EXPECT_GT(value.length(), kMaxLengthInBytes);
TestRandomizedEncoder encoder("this is the seed", params.encoding_type);
// Encode the output string.
std::string actual_result =
encoder.Encode(form_signature, field_signature, data_type, value);
// Capture the coin and noise bits used for the form, field and metadata type.
std::string coins =
encoder.GetCoins(form_signature, field_signature, data_type);
std::string noise =
encoder.GetNoise(form_signature, field_signature, data_type);
// Use the reference encoder implementation to get the expected output.
std::string expected_result = ReferenceEncodeImpl(
coins, noise, value, params.bit_offset, params.bit_stride);
// The results should be the same.
EXPECT_EQ(kMaxLengthInBytes / params.bit_stride, actual_result.length());
EXPECT_EQ(expected_result, actual_result);
}
INSTANTIATE_TEST_CASE_P(All,
RandomizedEncoderTest,
::testing::ValuesIn(kEncodeParams));
namespace {
// Data structure used to drive the decoding test cases.
struct DecodeParams {
// The number of samples for this test.
size_t num_votes;
// The lower and upper bound of the confidence interval given |num_votes|
// samples. If fewer than (lower_bound * num_votes) samples are one then the
// bit is a zero. If more than (upper_bound * num_votes) samples are one then
// the bit is a one.
double lower_bound;
double upper_bound;
};
// The fundamental idea of this algorithm is to count the number of reported 1s
// and 0s from a crowdsourced set of votes and look for a threshold that gives
// us confidence that we have seen enough 1s or 0s to assume that the actual
// value is a 1 or 0.
//
// Due to the symmetriy of the problem, we can conveniently choose the following
// null-hypothesis:
//
// N0: The true value of a bit is random on each pageload.
//
// In this case, we expect a binomial distribution B(n, 0.5), for which we can
// find a confidence interval that a sample of size n ends up in this interval.
// If a sample is outside of this interval, we can assume with the given
// confidence that we can reject the null-hypothesis and assume that the actual
// value is 0 or 1.
//
// For the confidence interval, we use the Wilson Score interval.
// https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
//
// The confidence intervals are calculate as:
//
// import scipy.stats as st
// import math
//
// def ConfidenceInterval(ph, n, confidence):
// """
// Args:
// confidence: e.g. 0.95 for a 95% confidence
// ph = p hat (see Wilson Score inverval link above)
// n = number of samples
// """
// alpha = 1 - confidence
// # for the 95% confidence interval this gives 1.96.
// z = st.norm.ppf(1 - alpha/2)
//
// base = (ph + z*z/(2*n)) / (1 + z*z/n)
// offset = z / (1 + z*z/n) * math.sqrt(ph*(1-ph)/n + z*z/(4*n*n))
//
// return [base - offset, base + offset]
//
// Note: The string being encoded/decoded for each "sample" consists of a common
// prefix followed by the decimal value of the sample number (i.e., "foo 25").
// Avoid test cases that skew the distribution of the appended noise to far
// from random (for example, using 0-199, more than 55% of the appended noise
// starts with the ASCII digit '1').
const DecodeParams kDecodeParams[] = {
// 99.0 % confidence interval
{150, 0.39709349666174232, 0.60290650333825768},
{256, 0.42052859913480434, 0.57947140086519566},
// 99.5% confidence interval
{150, 0.38829956746162475, 0.61170043253837525},
{256, 0.41359977651950797, 0.58640022348049203},
};
using RandomizedDecoderTest = ::testing::TestWithParam<DecodeParams>;
// Generate a hex string representing the 128 bit value where each byte has
// value |i|. This lets us spread the seeds across the 128 bit space.
std::string Make128BitSeed(size_t i) {
EXPECT_LT(i, 256u);
return net::HexDump(
std::string(128 / kBitsPerByte, static_cast<char>(i & 0xFF)));
}
} // namespace
TEST_P(RandomizedDecoderTest, Decode) {
static const autofill::FormSignature form_signature = 0x8765432187654321;
static const autofill::FieldSignature field_signature = 0xDEADBEEF;
static const char data_type[] = "data_type";
static const base::StringPiece common_prefix(
"This is the common prefix to encode and recover ");
const size_t num_votes = GetParam().num_votes;
const double lower_bound = GetParam().lower_bound;
const double upper_bound = GetParam().upper_bound;
// This vector represents the aggregate counts of the number of times a
// separate encoding of out sample string had a given bit encoded as a one.
std::vector<size_t> num_times_bit_is_1(/*count=*/kMaxLengthInBits,
/*value=*/0);
// Perform |num_votes| independent encoding operations, with seeds (somewhat)
// evenly spread out across a 128-bit space.
for (size_t i = 0; i < num_votes; ++i) {
// Create a new encoder with a different secret each time.
TestRandomizedEncoder encoder(
Make128BitSeed(i),
autofill::AutofillRandomizedValue_EncodingType_ALL_BITS);
// Encode the common prefix plus some non-constant data.
std::string encoded =
encoder.Encode(form_signature, field_signature, data_type,
base::StringPrintf("%s%zu", common_prefix.data(), i));
// Update |num_times_bit_is_1| for each bit in the encoded string.
for (size_t b = 0; b < kMaxLengthInBits; ++b) {
num_times_bit_is_1[b] += GetBit(encoded, b);
}
}
// Use |num_times_bit_is_1| to reconstruct the encoded string, bit by bit,
// as well as a record of whether or not each bit in the reconstruction
// buffer was validated with sufficient confidence.
std::string output(kMaxLengthInBytes, static_cast<char>(0));
std::vector<bool> bit_is_valid(kMaxLengthInBits);
const double threshold_for_zero = lower_bound * num_votes;
const double threshold_for_one = upper_bound * num_votes;
for (size_t b = 0; b < kMaxLengthInBits; ++b) {
if (num_times_bit_is_1[b] < threshold_for_zero) {
// bit it already zero, just mark it as valid
bit_is_valid[b] = true;
} else if (num_times_bit_is_1[b] > threshold_for_one) {
output[b / kBitsPerByte] |= (1 << (b % kBitsPerByte));
bit_is_valid[b] = true;
}
}
// Validation: All bits overlapping the constant prefix should be valid.
for (size_t b = 0; b < common_prefix.length() * kBitsPerByte; ++b) {
EXPECT_TRUE(bit_is_valid[b]) << "True bit found to be noise at " << b;
}
// Validation: All of the recovered prefix bits should match the prefix.
for (size_t i = 0; i < common_prefix.length(); ++i) {
EXPECT_EQ(common_prefix[i], output[i]) << "Incorrect char at offset " << i;
}
// Validation: Most noise bits should be invalid, but we may get some false
// positives. Instead, we expect that no noise byte will have all of its
// bits turn up as valid.
for (size_t i = common_prefix.length(); i < kMaxLengthInBytes; ++i) {
size_t num_valid_bits = 0;
for (size_t b = 0; b < kBitsPerByte; ++b) {
num_valid_bits += bit_is_valid[i * kBitsPerByte + b];
}
EXPECT_LT(num_valid_bits, kBitsPerByte)
<< "Noise byte at offset " << i << " decoded as " << output[i];
}
}
INSTANTIATE_TEST_CASE_P(All,
RandomizedDecoderTest,
::testing::ValuesIn(kDecodeParams));