| # Copyright 2018 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Recipe to bisect flaky tests in V8. |
| |
| Bisection will start at a known bad to_revision and: |
| 1. Calibrate the number of repetitions until enough confidence is reached. |
| 2. Bisect backwards exponentially, doubling the offset in each step. |
| 3. After finding a good from_revision, bisect into the range |
| from_revision..to_revision and report the suspect. |
| |
| Tests are only run on existing isolated files, looked up on Google Storage. |
| |
| All revisions during bisections are represented as offsets to the start revision |
| which has offset 0. |
| |
| See PROPERTIES for documentation on the recipe's interface. |
| """ |
| |
| import re |
| |
| from recipe_engine.config import Single |
| from recipe_engine.post_process import ( |
| DoesNotRun, DropExpectation, Filter, MustRun) |
| from recipe_engine.post_process import ResultReasonRE |
| from recipe_engine.recipe_api import Property |
| |
| |
| |
# Recipe modules used by this recipe. The code below accesses them as
# attributes of `api` (e.g. api.gitiles, api.gsutil, api.swarming).
DEPS = [
  'depot_tools/gitiles',
  'depot_tools/gsutil',
  'recipe_engine/json',
  'recipe_engine/properties',
  'recipe_engine/raw_io',
  'recipe_engine/step',
  'recipe_engine/tempfile',
  'swarming',
  'swarming_client',
]
| |
| |
# Input properties of the recipe. Each key matches the RunSteps parameter of
# the same name. Numeric properties accept ints or floats; RunSteps converts
# them to int before use.
PROPERTIES = {
    # Master name of the builder that produced the builds for bisection.
    'bisect_mastername': Property(kind=str),
    # Name of the builder that produced the builds for bisection.
    'bisect_buildername': Property(kind=str),
    # Build config passed to V8's run-tests.py script (there it's parameter
    # --mode, example: Release or Debug).
    'build_config': Property(kind=str),
    # Extra arguments to V8's run-tests.py script.
    'extra_args': Property(default=None, kind=list),
    # Number of commits, backwards bisection will initially leap over.
    'initial_commit_offset': Property(default=1, kind=Single((int, float))),
    # Name of the isolated file (e.g. bot_default, mjsunit).
    'isolated_name': Property(kind=str),
    # Initial number of swarming shards.
    'num_shards': Property(default=2, kind=Single((int, float))),
    # Initial number of test repetitions (passed to --random-seed-stress-count
    # option).
    'repetitions': Property(default=5000, kind=Single((int, float))),
    # Switch to only attempt to reproduce with given revision. Skips bisection.
    'repro_only': Property(default=False, kind=bool),
    # Swarming dimensions classifying the type of bot the tests should run on.
    # Passed as list of strings, each in the format name:value.
    'swarming_dimensions': Property(default=None, kind=list),
    # Fully qualified test name passed to run-tests.py. E.g. mjsunit/foobar.
    'test_name': Property(kind=str),
    # Timeout parameter passed to run-tests.py. Keep small when bisecting
    # fast-running tests that occasionally hang.
    'timeout_sec': Property(default=60, kind=Single((int, float))),
    # Initial total timeout for one entire bisect step. During calibration, this
    # time might be increased for more confidence. Set to 0 to disable and specify
    # the 'repetitions' property instead.
    'total_timeout_sec': Property(default=120, kind=Single((int, float))),
    # Revision known to be bad, where backwards bisection will start.
    'to_revision': Property(kind=str),
    # Name of the testing variant passed to run-tests.py.
    'variant': Property(kind=str),
}
| |
# The maximum number of steps for backwards and inwards bisection (safeguard to
# prevent infinite loops).
MAX_BISECT_STEPS = 16

# The maximum number of calibration attempts (safeguard to prevent infinite
# loops). On each attempt, first the number of swarming shards and then the
# repetition/timeout multiplier is doubled until there's enough confidence.
MAX_CALIBRATION_ATTEMPTS = 5

# A build with isolates must be within a distance of maximum 32 revisions for
# any revision that should be tested. We don't look further as a safeguard.
MAX_ISOLATE_OFFSET = 32

# Maximum number of test name characters printed in UI in step names.
MAX_LABEL_SIZE = 32

# Maximum number of swarming shards to be used for a single attempt.
MAX_SWARMING_SHARDS = 8

# Minimum number of flakes needed to have confidence in a run.
MIN_FLAKE_THRESHOLD = 4

# Response of gsutil when non-existing objects are looked up.
GSUTIL_NO_MATCH_TXT = 'One or more URLs matched no objects'

# URL of V8 repository.
REPO = 'https://chromium.googlesource.com/v8/v8'

# Exit code of V8's run-tests.py when no tests were run.
EXIT_CODE_NO_TESTS = 2

# Output of V8's test runner when all tests passed.
TEST_PASSED_TEXT = """
=== All tests succeeded
""".strip()

# Output of V8's test runner when some tests failed. The placeholder takes the
# number of failed tests.
TEST_FAILED_TEMPLATE = """
=== %d tests failed
""".strip()
| |
| |
class Command(object):
  """Builds the argument list used to invoke V8's run-tests.py."""

  def __init__(self, test_name, build_config, variant, repetitions, repro_only,
               total_timeout_sec, timeout=60, extra_args=None):
    """
    Args:
      test_name: Fully qualified test name, appended as last argument.
      build_config: Value for the --mode flag.
      variant: Value for the --variants flag.
      repetitions: Base number of repetitions, used when no total timeout is
          set.
      repro_only: If true, one failure is enough and skipped tests are run.
      total_timeout_sec: Base total timeout of one run; a falsy value switches
          to repetition-based runs.
      timeout: Per-test timeout for the --timeout flag.
      extra_args: Optional additional arguments, placed before the test name.
    """
    self.test_name = test_name
    self.repetitions = repetitions
    self.total_timeout_sec = total_timeout_sec
    if repro_only:
      self.min_failures = 1
    else:
      self.min_failures = MIN_FLAKE_THRESHOLD

    fixed_args = [
      'tools/run-tests.py',
      '--progress=verbose',
      '--mode=%s' % build_config,
      '--outdir=out',
      '--timeout=%d' % timeout,
      '--swarming',
      '--variants=%s' % variant,
    ]
    # In repro-only mode we keep running skipped tests.
    skipped_args = ['--run-skipped'] if repro_only else []
    self.base_cmd = (
        fixed_args + skipped_args + list(extra_args or []) + [test_name])

  @property
  def label(self):
    """Test name for UI output, cut off at MAX_LABEL_SIZE characters."""
    name = self.test_name
    if len(name) <= MAX_LABEL_SIZE:
      return name
    return name[:MAX_LABEL_SIZE - 3] + '...'

  def raw_cmd(self, multiplier, offset):
    """Returns a fresh command line list for one test run.

    Args:
      multiplier: Factor applied to the total timeout or, if no total timeout
          is configured, to the number of repetitions.
      offset: Offset of the tested revision to the start revision.
    """
    if self.total_timeout_sec:
      # Time-bound run: use a huge repetition count and let the total timeout
      # end the run.
      stress_args = [
        '--random-seed-stress-count=1000000',
        '--total-timeout-sec=%d' % (self.total_timeout_sec * multiplier),
      ]
    else:
      stress_args = [
        '--random-seed-stress-count=%d' % (self.repetitions * multiplier),
      ]
    cmd = self.base_cmd + stress_args
    if offset <= 1024:
      # TODO(machenbach): Make this unconditional in 2019, when the feature has
      # become old enough to be compatible with long backwards bisection.
      # 1024 is a rough approximation of commits since the flag below was
      # introduced.
      cmd.append('--exit-after-n-failures=%d' % self.min_failures)
    return cmd
| |
| |
class Depot(object):
  """Access layer for remote storage (GS bucket with isolates and git).

  Revisions are addressed by their offset to the start revision, which has
  offset 0.
  """

  def __init__(self, api, mastername, buildername, isolated_name, revision):
    """
    Args:
      api: Recipe api.
      mastername: Master name of the builder that produced the builds for
          bisection.
      buildername: Name of the builder that produced the builds for bisection.
      isolated_name: Name of the isolated file (e.g. bot_default, mjsunit).
      revision: Start revision of bisection (known bad revision). All other
          revisions during bisection will be represented as offsets to this
          revision.
    """
    self.api = api
    self.isolated_name = isolated_name
    self.revision = revision
    # The remaining %s placeholder is filled with a git revision on lookup.
    self.gs_url_template = (
        'gs://chromium-v8/isolated/%s/%s/%%s.json' % (mastername, buildername))
    # Cache: offset -> git revision.
    self.revisions = {0: revision}
    # Cache: offset -> isolated hash.
    self.isolates = {}
    # Cache: offset -> closest offset that has a build with isolates.
    self.closest_builds = {}

  def get_revision(self, offset):
    """Returns the git revision at the given offset (cached)."""
    if self.revisions.get(offset):
      return self.revisions[offset]

    commits, _ = self.api.gitiles.log(
        REPO, '%s~%d' % (self.revision, offset), limit=1,
        step_name='get revision #%d' % offset)
    assert commits
    # Gitiles returns several commits. Remember all of them to avoid
    # subsequent calls.
    self.revisions.update(
        (offset + index, commit['commit'])
        for index, commit in enumerate(commits))
    return self.revisions[offset]

  def has_build(self, offset):
    """Checks if an isolate exists for the given offset."""
    revision = self.get_revision(offset)
    try:
      self.api.gsutil.list(
          self.gs_url_template % revision,
          name='lookup isolates for #%d' % offset,
          stderr=self.api.raw_io.output_text(),
      )
    except self.api.step.StepFailure as failure:
      # Gsutil's api has no good result format for missing objects, hence, we
      # search the output text for the missing-object message.
      if GSUTIL_NO_MATCH_TXT not in failure.result.stderr:
        raise  # pragma: no cover
      # A missing object is expected for some builds; present it as success.
      failure.result.presentation.status = self.api.step.SUCCESS
      return False
    else:
      return True
    finally:
      # Link the looked-up revision on the step in every case.
      self.api.step.active_result.presentation.links[revision[:8]] = (
          '%s/+/%s' % (REPO, revision))

  def find_closest_build(self, offset, max_offset=None):
    """Looks backwards for the closest offset with an existing isolate (cached).

    Args:
      offset: The offset to the base revision where the lookup is started.
      max_offset: Lookup stops at this offset if reached.
    Returns:
      The closest offset for which an isolate exists.
    Raises:
      StepFailure: If nothing was found within MAX_ISOLATE_OFFSET offsets.
    """
    cached = self.closest_builds.get(offset)
    if cached is not None:
      return cached

    for distance in range(MAX_ISOLATE_OFFSET):
      candidate = offset + distance
      if candidate != max_offset and not self.has_build(candidate):
        continue
      # Cache the result for all offsets we tried on the way.
      self.closest_builds.update(
          (tried, candidate) for tried in range(offset, candidate + 1))
      return candidate
    raise self.api.step.StepFailure('Couldn\'t find isolates.')

  def get_isolated_hash(self, offset):
    """Returns the isolated hash for a given offset (cached)."""
    if offset not in self.isolates:
      self.api.gsutil.download_url(
          self.gs_url_template % self.get_revision(offset),
          self.api.json.output(),
          name='get isolates for #%s' % offset,
          step_test_data=lambda: self.api.json.test_api.output(
              {'foo_isolated': '[dummy hash for foo_isolated]'}),
      )
      # The downloaded json maps isolated names to hashes.
      hashes = self.api.step.active_result.json.output
      self.isolates[offset] = hashes[self.isolated_name]
    return self.isolates[offset]
| |
| |
class Runner(object):
  """Helper class for executing the V8 test runner to check for flakes."""

  def __init__(self, api, depot, command, num_shards, repro_only):
    """
    Args:
      api: Recipe api.
      depot: Depot used to look up isolated hashes by offset.
      command: Command providing the test runner command line and UI label.
      num_shards: Initial number of swarming shards; capped at
          MAX_SWARMING_SHARDS.
      repro_only: If true, a single failure counts as enough.
    """
    self.api = api
    self.depot = depot
    self.command = command
    self.num_shards = min(num_shards, MAX_SWARMING_SHARDS)
    self.repro_only = repro_only
    # Factor passed to Command.raw_cmd; doubled during calibration once the
    # maximum number of shards is reached.
    self.multiplier = 1

  def _has_enough_failures(self, num_failures):
    """Returns whether the failure count is sufficient for this runner.

    One failure is enough in repro-only mode, otherwise MIN_FLAKE_THRESHOLD
    failures are required.
    """
    threshold = 1 if self.repro_only else MIN_FLAKE_THRESHOLD
    return num_failures >= threshold

  def calibrate(self, offset):
    """Calibrates the multiplier for test time or repetitions of the runner for
    the given offset.

    Testing is repeated until MIN_FLAKE_THRESHOLD test failures are counted in
    an attempt (a single failure in repro-only mode). First the number of
    swarming shards, then the multiplier is doubled on each fresh attempt.

    Args:
      offset (int): Distance to the start commit.
    Returns:
      True if enough failures were found within MAX_CALIBRATION_ATTEMPTS
      attempts, False otherwise.
    """
    for i in range(MAX_CALIBRATION_ATTEMPTS):
      # Nest to disambiguate step names during calibration.
      with self.api.step.nest('calibration attempt %d' % (i + 1)):
        num_failures = self.check_num_flakes(offset)
      if self._has_enough_failures(num_failures):
        return True
      if self.num_shards < MAX_SWARMING_SHARDS:
        # First double the swarming shards until reaching the maximum.
        self.num_shards = min(self.num_shards * 2, MAX_SWARMING_SHARDS)
      else:
        # Then double the repetition multiplier. Use lower swarming priority
        # given the number and time of the tasks. Note that the priority is
        # not reset afterwards.
        self.api.swarming.default_priority = 35
        self.multiplier *= 2
    return False

  def _default_task_pass_test_data(self):
    """Returns simulation test data of a collected task with passing tests."""
    test_data = self.api.swarming.test_api.canned_summary_output_raw()
    test_data['shards'][0]['output'] = TEST_PASSED_TEXT
    return (
        self.api.swarming.test_api.summary(test_data) +
        self.api.json.test_api.output({}) +
        self.api.raw_io.test_api.output('')
    )

  def check_num_flakes(self, offset):
    """Stress tests the given revision and returns the number of failures.

    Triggers one swarming task per shard and collects them in order. Collecting
    stops early as soon as enough failures were counted.

    Args:
      offset (int): Distance to the start commit.
    Returns:
      The number of test failures counted over the collected shards.
    """
    # TODO(machenbach): Use the sharding logic from the swarming module. We
    # don't use it yet, since swarming sets the GTEST_SHARD_INDEX environment
    # variable, which is used by the V8 test runner. This makes the test
    # disappear on all but one shards, because the test runner distributes tests
    # in a way such that each test only runs on one shard.
    # We first need a change of that logic on V8-side to suppress using
    # GTEST_SHARD_INDEX for flake bisection (e.g. by introducing another flag).
    # This V8-side commit needs to age enough before using it on infra-side,
    # so that it is available in each revision when bisecting backwards.

    isolated_hash = self.depot.get_isolated_hash(offset)
    step_prefix = 'check %s at #%d' % (self.command.label, offset)

    def trigger_task(path, shard):
      # TODO(machenbach): Would be nice to just use 'shard X' as step names for
      # trigger/collect. But swarming enforces unique task titles and we can't
      # use our optimization to not collect some tasks. Either properly
      # cancel the task, such that they are not in the list of pending tasks or
      # override the step names.
      task = self.api.swarming.task(
          '%s - shard %d' % (step_prefix, shard),
          isolated_hash,
          task_output_dir=path.join('task_output_dir_%d' % shard),
          raw_cmd=self.command.raw_cmd(self.multiplier, offset),
      )

      # Override cpu and gpu defaults for Android as such devices don't have
      # these dimensions. Tolerate configurations in which a dimension was
      # never set instead of failing with a KeyError.
      if task.dimensions.get('os') == 'Android':
        task.dimensions.pop('cpu', None)
        task.dimensions.pop('gpu', None)

      self.api.swarming.trigger_task(task)
      return task

    def collect_task(task):
      """Collects one task and returns its number of failed tests."""
      try:
        step_result = self.api.swarming.collect_task(
            task, step_test_data=self._default_task_pass_test_data())
        data = step_result.swarming.summary['shards'][0]
        # Sanity checks.
        # TODO(machenbach): Add this information to the V8 test runner's json
        # output as parsing stdout is brittle.

        output = data.get('output')
        assert TEST_PASSED_TEXT in output
        return 0
      except self.api.step.StepFailure as e:
        data = e.result.swarming.summary['shards'][0]
        assert data['exit_code'], (
            'The bot might have died. Please restart the analysis')
        if data['exit_code'] == EXIT_CODE_NO_TESTS:
          # The desired tests seem to not exist in this revision.
          # TODO(machenbach): Add special logic for dealing with tests not
          # existing. They might have been added in a revision and are flaky
          # since then. Treat them as good revisions for now.
          # Maybe we should not do this during initialization to make sure it's
          # not a setup error?
          return 0  # pragma: no cover

        output = data.get('output')
        if TEST_PASSED_TEXT in output:  # pragma: no cover
          # It's possible that the return code is non-zero due to a test runner
          # leak.
          # TODO(machenbach): Remove this when https://crbug.com/v8/8001 is
          # resolved.
          return 0
        match = re.search(r'=== (\d+) tests failed', output)
        assert match
        return int(match.group(1))

    with self.api.tempfile.temp_dir('v8-flake-bisect-') as path:
      with self.api.step.nest(step_prefix) as parent:
        tasks = [
          trigger_task(path, shard)
          for shard in range(self.num_shards)
        ]
        num_failures = 0
        for task in tasks:
          num_failures += collect_task(task)
          if self._has_enough_failures(num_failures):
            # Stop waiting for more tasks early if already enough failures are
            # found.
            # TODO(machenbach): Cancel the tasks we don't collect. During
            # calibration we might even want to figure out a better number of
            # shards? E.g. when doubling from 4 to 8, maybe 5 was enough and
            # should be used throughout.
            break
        if num_failures:
          parent.presentation.status = self.api.step.FAILURE
          parent.presentation.step_text = '%d failures' % num_failures
        return num_failures
| |
| |
def bisect(api, depot, initial_commit_offset, is_bad_func, offset):
  """Exercises the bisection control flow.

  First bisects backwards from `offset` until a good revision is found, then
  bisects into the resulting good..bad range and reports the suspected range
  as a step.

  Args:
    api: Recipe api.
    depot: Helper for accessing storage and git.
    initial_commit_offset: Number of commits, backwards bisection will
        initially leap over.
    is_bad_func: Function (offset -> truthy value) determining if the revision
        at a given offset is bad.
    offset: Offset at which to start bisection.
  Raises:
    StepFailure: If no good revision is found or if the bisection doesn't
        converge within MAX_BISECT_STEPS steps.
  """
  def report_range(text, from_offset, to_offset):
    """Emits an empty step named after the offset range, linking the git log."""
    from_revision = depot.get_revision(from_offset)
    to_revision = depot.get_revision(to_offset)
    offset_range = '#%d..#%d' % (from_offset, to_offset)
    git_range = '%s..%s' % (from_revision[:8], to_revision[:8])
    step_result = api.step(text % offset_range, cmd=None)
    step_result.presentation.links[git_range] = '%s/+log/%s' % (REPO, git_range)

  def report_revision(text, offset):
    """Emits an empty step named after the offset, linking the revision."""
    rev = depot.get_revision(offset)
    step_result = api.step(text % ('#%d' % offset), cmd=None)
    step_result.presentation.links[rev[:8]] = '%s/+/%s' % (REPO, rev)

  def bisect_back(to_offset):
    """Bisects backwards from to_offset, doubling the delta in each
    iteration.

    Returns:
      A tuple of (from_offset, to_offset), where from_offset..to_offset
      represents the range of good..bad revision found.
    """
    commit_offset = initial_commit_offset
    for _ in range(MAX_BISECT_STEPS):
      from_offset = to_offset + commit_offset

      # Check if from_offset is bad and iterate backwards if so.
      from_offset = depot.find_closest_build(from_offset)
      report_revision('Checking %s', from_offset)
      if is_bad_func(from_offset):
        to_offset = from_offset
        commit_offset *= 2
        continue

      return from_offset, to_offset
    raise api.step.StepFailure(
        'Could not find a good revision.')  # pragma: no cover

  def bisect_into(from_offset, to_offset):
    """Bisects into a given range from_offset..to_offset and determines a
    suspect commit range.

    Returns:
      A tuple (known_good, known_bad) of the closest offsets that were
      actually tested good and bad.
    """
    assert from_offset >= to_offset
    known_good = from_offset
    known_bad = to_offset
    report_range('Bisecting %s', from_offset, to_offset)
    for _ in range(MAX_BISECT_STEPS):
      # End of bisection. Note that possibly known_good..known_bad is a larger
      # range than 1 commit due to missing isolates.
      if from_offset - to_offset <= 1:
        return known_good, known_bad
      # Floor division keeps the offset an integer, also with true division
      # semantics. Offsets are used in range() and as cache keys.
      middle_offset = to_offset + (from_offset - to_offset) // 2
      build_offset = depot.find_closest_build(middle_offset, from_offset)

      if build_offset >= from_offset:
        report_range('No builds in %s', from_offset, middle_offset)
        # There are no isolates in lower half. Skip it and continue.
        from_offset = middle_offset
        continue

      report_revision('Checking %s', build_offset)
      if is_bad_func(build_offset):
        to_offset = build_offset
        known_bad = build_offset
      else:
        from_offset = build_offset
        known_good = build_offset

    # The step budget is exhausted. The last step might still have narrowed the
    # range down completely.
    if from_offset - to_offset <= 1:  # pragma: no cover
      return known_good, known_bad
    # Fail explicitly rather than implicitly returning None, which the caller
    # can't unpack.
    raise api.step.StepFailure(
        'Bisection did not converge after %d steps. Remaining range: '
        '#%d..#%d' % (
            MAX_BISECT_STEPS, known_good, known_bad))  # pragma: no cover

  from_offset, to_offset = bisect_back(offset)
  from_offset, to_offset = bisect_into(from_offset, to_offset)
  report_range('Result: Suspecting %s', from_offset, to_offset)
| |
def setup_swarming(api, swarming_dimensions):
  """Checks out the swarming client and sets swarming defaults for bisection.

  Args:
    api: Recipe api.
    swarming_dimensions: Optional list of strings, each in the format
        name:value. Every entry is set as default swarming dimension,
        overriding the defaults set here. None is treated as an empty list.
  """
  api.swarming_client.checkout('master')
  api.swarming.default_expiration = 60 * 60
  api.swarming.default_hard_timeout = 60 * 60
  api.swarming.default_io_timeout = 20 * 60
  api.swarming.default_idempotent = False
  api.swarming.default_priority = 25
  api.swarming.default_user = 'v8-flake-bisect'
  api.swarming.add_default_tag('purpose:v8-flake-bisect')
  api.swarming.set_default_dimension('pool', 'Chrome')
  api.swarming.set_default_dimension('gpu', 'none')
  api.swarming.task_output_stdout = 'all'

  # TODO(tikuta): Remove this after the switch (crbug.com/894045).
  api.swarming.use_go_client = True

  # The property defaults to None when no dimensions are passed.
  for item in swarming_dimensions or []:
    # Split at the first colon only, so that values may contain colons.
    k, v = item.split(':', 1)
    api.swarming.set_default_dimension(k, v)
| |
| |
def RunSteps(api, bisect_mastername, bisect_buildername, build_config,
             extra_args, initial_commit_offset, isolated_name, num_shards,
             repetitions, repro_only, swarming_dimensions, test_name,
             timeout_sec, total_timeout_sec, to_revision, variant):
  """Recipe entry point. See PROPERTIES for the meaning of the parameters.

  Calibrates the runner on the closest build to the given revision and, unless
  in repro-only mode, bisects from there.
  """
  # Numeric properties may be passed as floats. Normalize them to ints.
  total_timeout_sec = int(total_timeout_sec)
  timeout_sec = int(timeout_sec)
  repetitions = int(repetitions)
  num_shards = int(num_shards)
  initial_commit_offset = int(initial_commit_offset)

  # Prepare the swarming client and defaults.
  setup_swarming(api, swarming_dimensions)

  # Create the helpers used throughout bisection.
  depot = Depot(
      api, bisect_mastername, bisect_buildername, isolated_name, to_revision)
  command = Command(
      test_name, build_config, variant, repetitions, repro_only,
      total_timeout_sec, timeout_sec, extra_args)
  runner = Runner(api, depot, command, num_shards, repro_only)

  # The start revision itself might have no build. Start at the closest one.
  start_offset = depot.find_closest_build(0)

  # Gain confidence that the start revision is flaky. This optionally
  # calibrates the number of shards and repetitions.
  reproduced = runner.calibrate(start_offset)

  if repro_only:
    if not reproduced:
      # We treat it as an error if a flake believed to repro, doesn't repro.
      raise api.step.StepFailure('Could not reproduce flake.')
    api.step('Flake still reproduces.', cmd=None)
    return

  if not reproduced:
    raise api.step.StepFailure('Could not reach enough confidence.')

  # Everything is set up; bisect starting at the calibrated revision.
  bisect(
      api, depot, initial_commit_offset, runner.check_num_flakes, start_offset)
| |
| |
def GenTests(api):
  """Yields the simulation test cases of this recipe.

  Steps without explicit test data use the default test data of the respective
  step; for swarming collect steps that is a passing test run (see
  Runner._default_task_pass_test_data).
  """
  def test(name):
    """Returns a test case with the given name and a base set of properties."""
    return (
        api.test(name) +
        api.properties(
            bisect_mastername='foo.v8',
            bisect_buildername='V8 Foobar',
            extra_args=['--foo-flag', '--bar-flag'],
            isolated_name='foo_isolated',
            build_config='Debug',
            repetitions=64,
            swarming_dimensions=['os:Ubuntu-14.04', 'cpu:x86-64'],
            test_name='mjsunit/foobar',
            timeout_sec=20,
            to_revision='a0',
            variant='stress_foo',
        )
    )

  def isolated_lookup(offset, exists):
    """Returns step data for the gsutil isolate lookup at the given offset.

    A missing isolate is simulated with return code 1 and gsutil's no-match
    text on stderr.
    """
    return api.step_data(
        'gsutil lookup isolates for #%d' % offset,
        api.raw_io.stream_output(
            '' if exists else GSUTIL_NO_MATCH_TXT,
            stream='stderr',
        ),
        retcode=0 if exists else 1,
    )

  def get_revisions(offset, *revisions):
    """Returns step data for the revision lookup at the given offset, with the
    given revisions as log output.
    """
    return api.step_data(
        'get revision #%d' % offset,
        api.json.output({'log': [
            {'commit': revision} for revision in revisions
        ]}),
    )

  def is_flaky(offset, shard, flakes, calibration_attempt=0,
               test_name='mjsunit/foobar'):
    """Returns step data simulating a failed test run on one shard.

    Args:
      offset: Offset of the tested revision.
      shard: Index of the swarming shard.
      flakes: Number of failed tests reported in the test runner output.
      calibration_attempt: If non-zero, the step is expected to be nested in
          the calibration step of that attempt.
      test_name: The test label as it appears in step names.
    """
    test_data = api.swarming.canned_summary_output_raw()
    test_data['shards'][0]['output'] = TEST_FAILED_TEMPLATE % flakes
    test_data['shards'][0]['exit_code'] = 1
    step_prefix = ''
    if calibration_attempt:
      step_prefix = 'calibration attempt %d.' % calibration_attempt
    step_name = 'check %s at #%d' % (test_name, offset)
    return api.step_data(
        '%s%s.%s - shard %d' % (step_prefix, step_name, step_name, shard),
        api.swarming.summary(test_data),
        retcode=1,
    )

  def verify_suspects(from_offset, to_offset):
    """Verify that the correct reporting step for from_offset..to_offset is
    emitted.
    """
    git_range = 'a%d..a%d' % (from_offset, to_offset)
    step_name = 'Result: Suspecting #%d..#%d' % (from_offset, to_offset)
    def suspects_internal(check, steps):
      # The result step must exist and link to the git log of the range.
      check(step_name in steps)
      check(steps[step_name]['~followup_annotations'][0] ==
            '@@@STEP_LINK@%s@%s/+log/%s@@@' % (git_range, REPO, git_range))
    return api.post_process(suspects_internal)

  # Full bisect run with some corner cases. Overview of all revisions ordered
  # new -> old.
  # a0: no isolate
  # a1: not flaky enough with 2 shards but flaky with 4 shards
  # a2: flaky
  # a3: flaky
  # a4: no isolate
  # a5: not flaky
  # -> Should result in suspecting range a5..a3.
  yield (
      test('full_bisect') +
      # Test path where total timeout isn't used.
      api.properties(total_timeout_sec=0) +
      # Data for resolving offsets to git hashes. Simulate gitiles page size of
      # 3 commits per call.
      get_revisions(1, 'a1', 'a2', 'a3') +
      get_revisions(4, 'a4', 'a5', 'a6') +
      # Isolate data simulation for all revisions.
      isolated_lookup(0, False) +
      isolated_lookup(1, True) +
      isolated_lookup(2, True) +
      isolated_lookup(3, True) +
      isolated_lookup(4, False) +
      isolated_lookup(5, True) +
      # Calibration. We check for flakes until enough are found. First only one
      # shard reports 2 failures.
      is_flaky(1, 1, 2, calibration_attempt=1) +
      # Then 3 shards report 5 failures total.
      is_flaky(1, 0, 2, calibration_attempt=2) +
      is_flaky(1, 1, 1, calibration_attempt=2) +
      is_flaky(1, 2, 2, calibration_attempt=2) +
      # Bisect backwards from a1 until good revision a5 is found.
      is_flaky(2, 0, 3) +
      # Bisect into a5..a2.
      is_flaky(3, 0, 3) +
      verify_suspects(5, 3)
  )

  # Similar to above but fewer corner cases. This is for simulating bisection
  # going into the upper half of a git range, which has different code paths
  # above.
  yield (
      test('full_bisect_upper') +
      # Data for resolving offsets to git hashes. Simulate gitiles page size of
      # 8, fetching all data in the first call.
      get_revisions(1, 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8') +
      # Isolate data simulation for all revisions.
      isolated_lookup(0, True) +
      isolated_lookup(1, True) +
      isolated_lookup(3, True) +
      isolated_lookup(4, True) +
      isolated_lookup(5, True) +
      isolated_lookup(7, True) +
      # Calibration.
      is_flaky(0, 0, 5, calibration_attempt=1) +
      # Bisect backwards from a0 until good revision a7 is found.
      is_flaky(1, 0, 3) +
      is_flaky(3, 0, 3) +
      # Bisect into a7..a3.
      is_flaky(4, 0, 2) +
      verify_suspects(5, 4) +
      api.post_process(DropExpectation)
  )

  # Test bisecting through a large range of missing builds.
  yield (
      test('large_gap') +
      get_revisions(1, 'a1', 'a2', 'a3', 'a4') +
      # Simulate a large gap between #0 and #4.
      isolated_lookup(0, True) +
      isolated_lookup(1, False) +
      isolated_lookup(2, False) +
      isolated_lookup(3, False) +
      isolated_lookup(4, True) +
      # Bad build #0 while #4 is a good build using default test data.
      is_flaky(0, 0, 5, calibration_attempt=1) +
      # Check that bisect continues properly after not finding a build in one
      # half.
      api.post_process(MustRun, 'No builds in #4..#2') +
      api.post_process(MustRun, 'No builds in #2..#1') +
      # Check that isolate lookup is cached for the negative case. We look only
      # once for a build that's not found.
      api.post_process(MustRun, 'gsutil lookup isolates for #2') +
      api.post_process(DoesNotRun, 'gsutil lookup isolates for #2 (2)') +
      verify_suspects(4, 0) +
      api.post_process(DropExpectation)
  )

  # Simulate not finding any isolates.
  yield (
      test('no_isolates') +
      sum((isolated_lookup(i, False) + get_revisions(i, 'a%d' % i)
           for i in range(1, MAX_ISOLATE_OFFSET)),
          isolated_lookup(0, False)) +
      api.post_process(ResultReasonRE, 'Couldn\'t find isolates.') +
      api.post_process(DropExpectation)
  )

  # Simulate repro-only mode reproducing a flake.
  yield (
      test('repro_only') +
      api.properties(repro_only=True) +
      isolated_lookup(0, True) +
      is_flaky(0, 0, 1, calibration_attempt=1) +
      api.post_process(MustRun, 'Flake still reproduces.') +
      api.post_process(Filter(
          'calibration attempt 1.check mjsunit/foobar at #0.'
          '[trigger] check mjsunit/foobar at #0 - shard 0'))
  )

  # Simulate repro-only mode not reproducing a flake.
  yield (
      test('repro_only_failed') +
      api.properties(repro_only=True) +
      isolated_lookup(0, True) +
      api.post_process(ResultReasonRE, 'Could not reproduce flake.') +
      api.post_process(DropExpectation)
  )

  # Simulate running tasks on Android and verify correct dimensions.
  def check_dimensions(check, steps):
    """Checks that the trigger step's command has no 'cpu' or 'gpu' argument."""
    step = ('calibration attempt 1.check mjsunit/foobar at #0.'
            '[trigger] check mjsunit/foobar at #0 - shard 0 on Android')
    if check(step in steps):
      check(all(arg != 'cpu' for arg in steps[step]['cmd']))
      check(all(arg != 'gpu' for arg in steps[step]['cmd']))
  yield (
      test('android_dimensions') +
      api.properties(
          repro_only=True,
          swarming_dimensions=[
            'os:Android', 'cpu:x86-64', 'device_os:MMB29Q',
            'device_type:bullhead', 'pool:Chrome']) +
      isolated_lookup(0, True) +
      api.post_process(check_dimensions) +
      api.post_process(DropExpectation)
  )

  # Simulate not finding enough flakes during calibration.
  # Also test cutting off overly long test names in step names.
  long_test_name = (29 * '*') + 'too_long'
  shortened_test_name = (29 * '*') + '...'
  yield (
      test('no_confidence') +
      api.properties(test_name=long_test_name, num_shards=8) +
      isolated_lookup(0, True) +
      is_flaky(0, 0, 0, calibration_attempt=1, test_name=shortened_test_name) +
      is_flaky(0, 1, 2, calibration_attempt=2, test_name=shortened_test_name) +
      is_flaky(0, 2, 1, calibration_attempt=3, test_name=shortened_test_name) +
      is_flaky(0, 1, 3, calibration_attempt=4, test_name=shortened_test_name) +
      is_flaky(0, 0, 3, calibration_attempt=5, test_name=shortened_test_name) +
      api.post_process(ResultReasonRE, 'Could not reach enough confidence.') +
      api.post_process(DropExpectation)
  )