# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Recipe to bisect flaky tests in V8.

Bisection will start at a known bad to_revision and:
1. Calibrate the number of repetitions until enough confidence is reached.
2. Bisect backwards exponentially, doubling the offset in each step.
3. After finding a good from_revision, bisect into the range
from_revision..to_revision and report the suspect.

Tests are only run against existing isolated files, which are looked up in
Google Storage. All revisions during bisection are represented as offsets
to the start revision, which has offset 0.
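
For example, with initial_commit_offset=1 and a bad build at offset 0,
backwards bisection tests offsets 1, 3, 7, 15, ... (doubling the leap each
step, assuming builds exist at those offsets) until a good revision is
found, before bisecting into the resulting good..bad range.
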
See PROPERTIES for documentation on the recipe's interface.
"""

import re
from recipe_engine.post_process import DoesNotRun, DropExpectation, MustRun
from recipe_engine.recipe_api import Property

DEPS = [
'depot_tools/gitiles',
'depot_tools/gsutil',
'recipe_engine/json',
'recipe_engine/properties',
'recipe_engine/raw_io',
'recipe_engine/step',
'recipe_engine/tempfile',
'swarming',
'swarming_client',
]

PROPERTIES = {
# Master name of the builder that produced the builds for bisection.
'bisect_mastername': Property(kind=str),
# Name of the builder that produced the builds for bisection.
'bisect_buildername': Property(kind=str),
# Build config passed to V8's run-tests.py script (there it's parameter
# --mode, example: Release or Debug).
'build_config': Property(kind=str),
# Extra arguments to V8's run-tests.py script.
'extra_args': Property(default=None, kind=list),
# Number of commits that backwards bisection will initially leap over.
'initial_commit_offset': Property(default=1, kind=int),
# Name of the isolated file (e.g. bot_default, mjsunit).
'isolated_name': Property(kind=str),
# Initial number of test repetitions (passed to the
# --random-seed-stress-count option).
'repetitions': Property(default=5000, kind=int),
# Swarming dimensions classifying the type of bot the tests should run on.
# Passed as list of strings, each in the format name:value.
'swarming_dimensions': Property(default=None, kind=list),
# Fully qualified test name passed to run-tests.py. E.g. mjsunit/foobar.
'test_name': Property(kind=str),
# Timeout parameter passed to run-tests.py. Keep small when bisecting
# fast-running tests that occasionally hang.
'timeout_sec': Property(default=60, kind=int),
# Initial total timeout for one entire bisect step. During calibration, this
# time might be increased for more confidence. Set to 0 to disable and specify
# the 'repetitions' property instead.
'total_timeout_sec': Property(default=120, kind=int),
# Revision known to be bad, where backwards bisection will start.
'to_revision': Property(kind=str),
# Name of the testing variant passed to run-tests.py.
'variant': Property(kind=str),
}
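
# Example input properties (illustrative only; they mirror the simulated
# test data in GenTests below):
# {
#   'bisect_mastername': 'foo.v8',
#   'bisect_buildername': 'V8 Foobar',
#   'build_config': 'Debug',
#   'isolated_name': 'foo_isolated',
#   'test_name': 'mjsunit/foobar',
#   'variant': 'stress_foo',
#   'to_revision': 'a0',
#   'swarming_dimensions': ['os:Ubuntu-14.04', 'cpu:x86-64'],
# }
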
# The maximum number of steps for backwards and inwards bisection (safeguard to
# prevent infinite loops).
MAX_BISECT_STEPS = 16
# The maximum number of calibration attempts (safeguard to prevent infinite
# loops). Repetitions are doubled on each attempt until there's enough
# confidence.
MAX_CALIBRATION_ATTEMPTS = 5
# A build with isolates must exist within at most 32 revisions of any
# revision that should be tested. As a safeguard, we don't look any further.
MAX_ISOLATE_OFFSET = 32
# Maximum number of test name characters printed in UI in step names.
MAX_LABEL_SIZE = 32
# Minimum number of flakes needed to have confidence in a run.
MIN_FLAKE_THRESHOLD = 4
# Response of gsutil when non-existing objects are looked up.
GSUTIL_NO_MATCH_TXT = 'One or more URLs matched no objects'
# URL of V8 repository.
REPO = 'https://chromium.googlesource.com/v8/v8'
# Exit code of V8's run-tests.py when no tests were run.
EXIT_CODE_NO_TESTS = 2
# Output of V8's test runner when all tests passed.
TEST_PASSED_TEXT = """
=== All tests succeeded
""".strip()
# Output of V8's test runner when some tests failed.
TEST_FAILED_TEMPLATE = """
=== %d tests failed
""".strip()


class Command(object):
"""Helper class representing a command line to V8's run-tests.py."""
def __init__(self, test_name, build_config, variant, repetitions,
total_timeout_sec, timeout=60, extra_args=None):
self.repetitions = repetitions
self.test_name = test_name
self.total_timeout_sec = total_timeout_sec
self.base_cmd = [
'tools/run-tests.py',
'--progress=verbose',
'--mode=%s' % build_config,
'--outdir=out',
'--timeout=%d' % timeout,
'--swarming',
'--variants=%s' % variant,
] + (extra_args or []) + [test_name]
@property
def label(self):
"""Test name for UI output limited to MAX_LABEL_SIZE chars."""
if len(self.test_name) > MAX_LABEL_SIZE:
return self.test_name[:MAX_LABEL_SIZE - 3] + '...'
return self.test_name
def raw_cmd(self, multiplier):
if self.total_timeout_sec:
return (
self.base_cmd +
[
'--random-seed-stress-count=1000000',
'--total-timeout-sec=%d' % (self.total_timeout_sec * multiplier),
]
)
else:
return (
self.base_cmd +
['--random-seed-stress-count=%d' % (self.repetitions * multiplier)]
)
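
# Illustrative example (not part of the recipe logic): for a Command built
# with build_config='Release', variant='stress', timeout=60,
# total_timeout_sec=120, no extra_args, and test_name='mjsunit/foobar',
# raw_cmd(2) yields:
#   tools/run-tests.py --progress=verbose --mode=Release --outdir=out
#   --timeout=60 --swarming --variants=stress mjsunit/foobar
#   --random-seed-stress-count=1000000 --total-timeout-sec=240
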

class Depot(object):
"""Helper class for interacting with remote storage (GS bucket and git)."""
def __init__(self, api, mastername, buildername, isolated_name, revision):
"""
Args:
mastername: Master name of the builder that produced the builds for
bisection.
buildername: Name of the builder that produced the builds for bisection.
isolated_name: Name of the isolated file (e.g. bot_default, mjsunit).
revision: Start revision of bisection (known bad revision). All other
revisions during bisection will be represented as offsets to this
revision.
"""
self.api = api
self.gs_url_template = (
'gs://chromium-v8/isolated/%s/%s/%%s.json' % (mastername, buildername))
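# E.g. with mastername='foo.v8' and buildername='V8 Foobar', isolates for a
# revision <rev> are looked up at (illustrative values):
#   gs://chromium-v8/isolated/foo.v8/V8 Foobar/<rev>.json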
self.isolated_name = isolated_name
self.revision = revision
# Cache for mapping offsets to real revisions.
self.revisions = {0: revision}
# Cache for isolated hashes.
self.isolates = {}
# Offset cache for closest builds with isolates.
self.closest_builds = {}
def get_revision(self, offset):
"""Returns the git revision at the given offset (cached)."""
revision = self.revisions.get(offset)
if not revision:
commits, _ = self.api.gitiles.log(
REPO, '%s~%d' % (self.revision, offset), limit=1,
step_name='get revision #%d' % offset)
assert commits
for i, commit in enumerate(commits):
# Gitiles returns several commits. Fill our cache to avoid subsequent
# calls.
self.revisions[offset + i] = commit['commit']
return self.revisions[offset]
def has_build(self, offset):
"""Checks if an isolate exists for the given offset."""
rev = self.get_revision(offset)
link = '%s/+/%s' % (REPO, rev)
try:
self.api.gsutil.list(
self.gs_url_template % rev,
name='lookup isolates for #%d' % offset,
stderr=self.api.raw_io.output_text(),
)
return True
except self.api.step.StepFailure as e:
# Gsutil's API has no clean result format for missing objects. Hence, we
# check the stderr text for the no-match message. A missing object is
# treated as success, since we expect some builds not to exist.
if GSUTIL_NO_MATCH_TXT in e.result.stderr:
e.result.presentation.status = self.api.step.SUCCESS
return False
raise # pragma: no cover
finally:
self.api.step.active_result.presentation.links[rev[:8]] = link
def find_closest_build(self, offset, max_offset=None):
"""Looks backwards for the closest offset with an existing isolate (cached).
Args:
offset: The offset to the base revision where the lookup is started.
max_offset: Lookup stops at this offset if reached.
Returns:
The closest offset for which an isolate exists.
"""
closest = self.closest_builds.get(offset)
if closest is not None:
return closest
for i in range(MAX_ISOLATE_OFFSET):
closest = offset + i
if closest == max_offset or self.has_build(closest):
for j in range(offset, closest + 1):
# Cache the closest build for all offsets we tried.
self.closest_builds[j] = closest
return closest
raise self.api.step.StepFailure('Couldn\'t find isolates.')
def get_isolated_hash(self, offset):
"""Returns the isolated hash for a given offset (cached)."""
if offset in self.isolates:
return self.isolates[offset]
self.api.gsutil.download_url(
self.gs_url_template % self.get_revision(offset),
self.api.json.output(),
name='get isolates for #%s' % offset,
step_test_data=lambda: self.api.json.test_api.output(
{'foo_isolated': '[dummy hash for foo_isolated]'}),
)
step_result = self.api.step.active_result
self.isolates[offset] = step_result.json.output[self.isolated_name]
return self.isolates[offset]


class Runner(object):
"""Helper class for executing the V8 test runner to check for flakes."""
def __init__(self, api, depot, command):
self.api = api
self.depot = depot
self.command = command
self.multiplier = 1
def calibrate(self, offset):
"""Calibrates the multiplier for test time or repetitions of the runner for
the given offset.
Testing is repeated until MIN_FLAKE_THRESHOLD test failures are counted in
an attempt. The multiplier is doubled on each fresh attempt.
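
For example, with repetitions=64, successive attempts run 64, 128, 256, ...
repetitions (or, with a total timeout, 1x, 2x, 4x, ... the configured time
budget) until at least MIN_FLAKE_THRESHOLD failures are observed.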
"""
for _ in range(MAX_CALIBRATION_ATTEMPTS):
if self.check_num_flakes(offset) >= MIN_FLAKE_THRESHOLD:
return True
self.multiplier *= 2
return False
def check_num_flakes(self, offset):
"""Stress tests the given revision and returns the number of failures."""
isolated_hash = self.depot.get_isolated_hash(offset)
with self.api.tempfile.temp_dir('v8-flake-bisect-') as path:
task = self.api.swarming.task(
'check %s at #%d - %d' %
(self.command.label, offset, self.multiplier),
isolated_hash,
task_output_dir=path.join('task_output_dir'),
raw_cmd=self.command.raw_cmd(self.multiplier),
)
self.api.swarming.trigger_task(task)
try:
step_result = self.api.swarming.collect_task(task)
data = step_result.swarming.summary['shards'][0]
# Sanity checks.
# TODO(machenbach): Add this information to the V8 test runner's json
# output as parsing stdout is brittle.
assert TEST_PASSED_TEXT in data['outputs'][0]
return 0
except self.api.step.StepFailure as e:
data = e.result.swarming.summary['shards'][0]
if data['exit_codes'][0] == EXIT_CODE_NO_TESTS:
# The desired tests seem to not exist in this revision.
# TODO(machenbach): Add special logic for dealing with tests not
# existing. They might have been added in a revision and are flaky
# since then. Treat them as good revisions for now.
# Maybe we should not do this during initialization to make sure it's
# not a setup error?
return 0 # pragma: no cover
stdout = data['outputs'][0]
if TEST_PASSED_TEXT in stdout: # pragma: no cover
# It's possible that the return code is non-zero due to a test runner
# leak.
# TODO(machenbach): Remove this when https://crbug.com/v8/8001 is
# resolved.
return 0
match = re.search(r'=== (\d+) tests failed', stdout)
assert match
return int(match.group(1))


def bisect(api, depot, initial_commit_offset, is_bad_func, offset):
"""Exercises the bisection control flow.
Args:
api: Recipe api.
depot: Helper for accessing storage and git.
initial_commit_offset: Number of commits that backwards bisection will
initially leap over.
is_bad_func: Function (revision->bool) determining if a given revision is
bad.
offset: Offset at which to start bisection.
"""
def report_range(text, from_offset, to_offset):
from_revision = depot.get_revision(from_offset)
to_revision = depot.get_revision(to_offset)
offset_range = '#%d..#%d' % (from_offset, to_offset)
git_range = '%s..%s' % (from_revision[:8], to_revision[:8])
step_result = api.step(text % offset_range, cmd=None)
step_result.presentation.links[git_range] = '%s/+log/%s' % (REPO, git_range)
def report_revision(text, offset):
rev = depot.get_revision(offset)
step_result = api.step(text % ('#%d' % offset), cmd=None)
step_result.presentation.links[rev[:8]] = '%s/+/%s' % (REPO, rev)
def bisect_back(to_offset):
"""Bisects backwards from to_offset, doubling the delta in each
iteration.
Returns:
A tuple of (from_offset, to_offset), where from_offset..to_offset
represents the range of good..bad revision found.
"""
commit_offset = initial_commit_offset
for _ in range(MAX_BISECT_STEPS):
from_offset = to_offset + commit_offset
# Check if from_offset is bad and iterate backwards if so.
from_offset = depot.find_closest_build(from_offset)
report_revision('Checking %s', from_offset)
if is_bad_func(from_offset):
to_offset = from_offset
commit_offset *= 2
continue
return from_offset, to_offset
raise api.step.StepFailure(
'Could not find a good revision.') # pragma: no cover
def bisect_into(from_offset, to_offset):
"""Bisects into a given range from_offset..to_offset and determins a
suspect commit range.
"""
assert from_offset >= to_offset
known_good = from_offset
known_bad = to_offset
report_range('Bisecting %s', from_offset, to_offset)
for _ in range(MAX_BISECT_STEPS):
# End of bisection. Note that known_good..known_bad may span more than
# one commit due to missing isolates.
if from_offset - to_offset <= 1:
return known_good, known_bad
middle_offset = to_offset + (from_offset - to_offset) // 2
build_offset = depot.find_closest_build(middle_offset, from_offset)
if build_offset >= from_offset:
report_range('No builds in %s', from_offset, middle_offset)
# There are no isolates in lower half. Skip it and continue.
from_offset = middle_offset
continue
report_revision('Checking %s', build_offset)
if is_bad_func(build_offset):
to_offset = build_offset
known_bad = build_offset
else:
from_offset = build_offset
known_good = build_offset
from_offset, to_offset = bisect_back(offset)
from_offset, to_offset = bisect_into(from_offset, to_offset)
report_range('Result: Suspecting %s', from_offset, to_offset)


def setup_swarming(api, swarming_dimensions):
api.swarming_client.checkout('master')
api.swarming.default_expiration = 60 * 60
api.swarming.default_hard_timeout = 60 * 60
api.swarming.default_io_timeout = 20 * 60
api.swarming.default_idempotent = False
api.swarming.default_priority = 25
api.swarming.default_user = 'v8-flake-bisect'
api.swarming.add_default_tag('purpose:v8-flake-bisect')
api.swarming.set_default_dimension('pool', 'Chrome')
api.swarming.set_default_dimension('gpu', 'none')
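# Extra dimensions arrive as 'name:value' strings, e.g. 'os:Ubuntu-14.04'
# yields the dimension ('os', 'Ubuntu-14.04'). Split only on the first
# colon, as values may themselves contain colons.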
for item in swarming_dimensions:
k, v = item.split(':', 1)
api.swarming.set_default_dimension(k, v)


def RunSteps(api, bisect_mastername, bisect_buildername, build_config,
extra_args, initial_commit_offset, isolated_name, repetitions,
swarming_dimensions, test_name, timeout_sec, total_timeout_sec,
to_revision, variant):
# Set up swarming client.
setup_swarming(api, swarming_dimensions)
# Set up bisection helpers.
depot = Depot(
api, bisect_mastername, bisect_buildername, isolated_name, to_revision)
command = Command(
test_name, build_config, variant, repetitions, total_timeout_sec,
timeout_sec, extra_args)
runner = Runner(api, depot, command)
# Get confidence that the initial revision is flaky and calibrate the
# repetitions.
to_offset = depot.find_closest_build(0)
if not runner.calibrate(to_offset):
raise api.step.StepFailure('Could not reach enough confidence.')
# Run bisection.
bisect(api, depot, initial_commit_offset, runner.check_num_flakes, to_offset)


def GenTests(api):
def test(name):
return (
api.test(name) +
api.properties(
bisect_mastername='foo.v8',
bisect_buildername='V8 Foobar',
extra_args=['--foo-flag', '--bar-flag'],
isolated_name='foo_isolated',
build_config='Debug',
repetitions=64,
swarming_dimensions=['os:Ubuntu-14.04', 'cpu:x86-64'],
test_name='mjsunit/foobar',
timeout_sec=20,
to_revision='a0',
variant='stress_foo',
)
)
def isolated_lookup(offset, exists):
return api.step_data(
'gsutil lookup isolates for #%d' % offset,
api.raw_io.stream_output(
'' if exists else GSUTIL_NO_MATCH_TXT,
stream='stderr',
),
retcode=0 if exists else 1,
)
def get_revisions(offset, *revisions):
return api.step_data(
'get revision #%d' % offset,
api.json.output({'log': [
{'commit': revision} for revision in revisions
]}),
)
def is_flaky(offset, multiplier, flakes, test_name='mjsunit/foobar'):
test_data = api.swarming.canned_summary_output_raw()
if flakes:
output = TEST_FAILED_TEMPLATE % flakes
exit_code = 1
else:
output = TEST_PASSED_TEXT
exit_code = 0
test_data['shards'][0]['outputs'][0] = output
test_data['shards'][0]['exit_codes'][0] = exit_code
return api.step_data(
'check %s at #%d - %d' % (test_name, offset, multiplier),
api.swarming.summary(test_data),
retcode=exit_code,
)
def verify_suspects(from_offset, to_offset):
"""Verify that the correct reporting step for from_offset..to_offset is
emitted.
"""
git_range = 'a%d..a%d' % (from_offset, to_offset)
step_name = 'Result: Suspecting #%d..#%d' % (from_offset, to_offset)
def suspects_internal(check, steps):
check(step_name in steps)
check(steps[step_name]['~followup_annotations'][0] ==
'@@@STEP_LINK@%s@%s/+log/%s@@@' % (git_range, REPO, git_range))
return api.post_process(suspects_internal)
# Full bisect run with some corner cases. Overview of all revisions ordered
# new -> old.
# a0: no isolate
# a1: not flaky enough with 64 but flaky with 128 repetitions
# a2: flaky
# a3: flaky
# a4: no isolate
# a5: not flaky
# -> Should result in suspecting range a5..a3.
yield (
test('full_bisect') +
# Test path where total timeout isn't used.
api.properties(total_timeout_sec=0) +
# Data for resolving offsets to git hashes. Simulate gitiles page size of
# 3 commits per call.
get_revisions(1, 'a1', 'a2', 'a3') +
get_revisions(4, 'a4', 'a5', 'a6') +
# Isolate data simulation for all revisions.
isolated_lookup(0, False) +
isolated_lookup(1, True) +
isolated_lookup(2, True) +
isolated_lookup(3, True) +
isolated_lookup(4, False) +
isolated_lookup(5, True) +
# Calibration. We check for flakes until enough are found.
is_flaky(1, 1, 2) +
is_flaky(1, 2, 5) +
# Bisect backwards from a1 until good revision a5 is found.
is_flaky(2, 2, 3) +
is_flaky(5, 2, 0) +
# Bisect into a5..a2.
is_flaky(3, 2, 3) +
verify_suspects(5, 3)
)
# Similar to above but with fewer corner cases. This simulates bisection
# going into the upper half of a git range, which exercises different code
# paths in the recipe above.
yield (
test('full_bisect_upper') +
# Data for resolving offsets to git hashes. Simulate gitiles page size of
# 8, fetching all data in the first call.
get_revisions(1, 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8') +
# Isolate data simulation for all revisions.
isolated_lookup(0, True) +
isolated_lookup(1, True) +
isolated_lookup(3, True) +
isolated_lookup(4, True) +
isolated_lookup(5, True) +
isolated_lookup(7, True) +
# Calibration.
is_flaky(0, 1, 5) +
# Bisect backwards from a0 until good revision a7 is found.
is_flaky(1, 1, 3) +
is_flaky(3, 1, 3) +
is_flaky(7, 1, 0) +
# Bisect into a7..a3.
is_flaky(5, 1, 0) +
is_flaky(4, 1, 2) +
verify_suspects(5, 4) +
api.post_process(DropExpectation)
)
# Test bisecting through a large range of missing builds.
yield (
test('large_gap') +
get_revisions(1, 'a1', 'a2', 'a3', 'a4') +
# Simulate a large gap between #0 and #4.
isolated_lookup(0, True) +
isolated_lookup(1, False) +
isolated_lookup(2, False) +
isolated_lookup(3, False) +
isolated_lookup(4, True) +
# Bad build #0.
is_flaky(0, 1, 5) +
# Good build #4.
is_flaky(4, 1, 0) +
# Check that bisect continues properly after not finding a build in one
# half.
api.post_process(MustRun, 'No builds in #4..#2') +
api.post_process(MustRun, 'No builds in #2..#1') +
# Check that isolate lookup is cached for the negative case. We look only
# once for a build that's not found.
api.post_process(MustRun, 'gsutil lookup isolates for #2') +
api.post_process(DoesNotRun, 'gsutil lookup isolates for #2 (2)') +
verify_suspects(4, 0) +
api.post_process(DropExpectation)
)
def verify_failure_reason(reason):
"""Helper for verifing the failure reason text of the recipe."""
def _verify_failure_reason(check, steps):
check(steps['$result']['reason'] == reason)
return api.post_process(_verify_failure_reason)
# Simulate not finding any isolates.
yield (
test('no_isolates') +
sum((isolated_lookup(i, False) + get_revisions(i, 'a%d' % i)
for i in range(1, MAX_ISOLATE_OFFSET)),
isolated_lookup(0, False)) +
verify_failure_reason('Couldn\'t find isolates.') +
api.post_process(DropExpectation)
)
# Simulate not finding enough flakes during calibration.
# Also test cutting off overly long test names in step names.
long_test_name = (29 * '*') + 'too_long'
shortened_test_name = (29 * '*') + '...'
yield (
test('no_confidence') +
api.properties(test_name=long_test_name) +
isolated_lookup(0, True) +
is_flaky(0, 1, 0, test_name=shortened_test_name) +
is_flaky(0, 2, 2, test_name=shortened_test_name) +
is_flaky(0, 4, 1, test_name=shortened_test_name) +
is_flaky(0, 8, 3, test_name=shortened_test_name) +
is_flaky(0, 16, 3, test_name=shortened_test_name) +
verify_failure_reason('Could not reach enough confidence.') +
api.post_process(DropExpectation)
)