#!/usr/bin/env python
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Parser for linker map files.
The format of a linker map file depends on the linker that generates it. This
file uses a "coded linker name" to identify formats and variants:
'gold': The gold linker (usage is being deprecated by Chrome).
'lld_v0': LLD linker (no LTO), old format.
'lld-lto_v0': LLD linker with ThinLTO, old format.
'lld_v1': LLD linker (no LTO), new format.
'lld-lto_v1': LLD linker with ThinLTO, new format.
The |linker_name| parameter in various functions must take one of the above
coded linker name values.
"""
from __future__ import print_function
import argparse
import code
import itertools
import logging
import os
import re
import readline
import demangle
import models
# About linker maps:
# * "Discarded input sections" include symbols merged with other symbols
# (aliases), so the information there is not actually a list of unused things.
# * Linker maps include symbols that do not have names (with object path),
# whereas "nm" skips over these (they don't account for much though).
# * The parse time for compressed linker maps is dominated by ungzipping.
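# Illustrative usage of this module's entry points (a minimal sketch mirroring
# main() below; the map file path is hypothetical). The file is opened twice
# because detection consumes its first line:
#   with open('example.map', 'r') as map_file:
#     linker_name = DetectLinkerNameFromMapFile(map_file)
#   with open('example.map', 'r') as map_file:
#     section_sizes, syms = MapFileParser().Parse(linker_name, map_file)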
class MapFileParserGold(object):
"""Parses a linker map file from gold linker."""
# Map file writer for gold linker:
# https://github.com/gittup/binutils/blob/HEAD/gold/mapfile.cc
def __init__(self):
self._common_symbols = []
self._symbols = []
self._section_sizes = {}
self._lines = None
def Parse(self, lines):
"""Parses a linker map file.
Args:
lines: Iterable of lines, the first of which has been consumed to
identify file type.
Returns:
A tuple of (section_sizes, symbols).
"""
self._lines = iter(lines)
logging.debug('Scanning for Header')
while True:
line = self._SkipToLineWithPrefix('Common symbol', 'Memory map')
if line.startswith('Common symbol'):
self._common_symbols = self._ParseCommonSymbols()
logging.debug('.bss common entries: %d', len(self._common_symbols))
continue
elif line.startswith('Memory map'):
self._ParseSections()
break
return self._section_sizes, self._symbols
def _SkipToLineWithPrefix(self, prefix, prefix2=None):
for l in self._lines:
if l.startswith(prefix) or (prefix2 and l.startswith(prefix2)):
return l
def _ParsePossiblyWrappedParts(self, line, count):
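# E.g., with count=3, the wrapped common-symbol entry shown in
# _ParseCommonSymbols below is parsed as
# ['ff_cos_131072_fixed', '0x20000', 'obj/third_party/<snip>'].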
parts = line.split(None, count - 1)
if not parts:
return None
if len(parts) != count:
line = next(self._lines)
parts.extend(line.split(None, count - len(parts) - 1))
assert len(parts) == count, 'parts: ' + ' '.join(parts)
parts[-1] = parts[-1].rstrip()
return parts
def _ParseCommonSymbols(self):
# Common symbol size file
#
# ff_cos_131072 0x40000 obj/third_party/<snip>
# ff_cos_131072_fixed
# 0x20000 obj/third_party/<snip>
ret = []
next(self._lines) # Skip past blank line
name, size_str, path = None, None, None
for l in self._lines:
parts = self._ParsePossiblyWrappedParts(l, 3)
if not parts:
break
name, size_str, path = parts
sym = models.Symbol(models.SECTION_BSS, int(size_str[2:], 16),
full_name=name, object_path=path)
ret.append(sym)
return ret
def _ParseSections(self):
# .text 0x0028c600 0x22d3468
# .text.startup._GLOBAL__sub_I_bbr_sender.cc
# 0x0028c600 0x38 obj/net/net/bbr_sender.o
# .text._reset 0x00339d00 0xf0 obj/third_party/icu/icuuc/ucnv.o
# ** fill 0x0255fb00 0x02
# .text._ZN4base8AutoLockD2Ev
# 0x00290710 0xe obj/net/net/file_name.o
# 0x00290711 base::AutoLock::~AutoLock()
# 0x00290711 base::AutoLock::~AutoLock()
# .text._ZNK5blink15LayoutBlockFlow31mustSeparateMarginAfterForChildERK...
# 0xffffffffffffffff 0x46 obj/...
# 0x006808e1 blink::LayoutBlockFlow::...
# .text.OUTLINED_FUNCTION_0
# 0x002a2000 0x20 obj/net/net/tag.o
# .bss
# .bss._ZGVZN11GrProcessor11initClassIDI10LightingFPEEvvE8kClassID
# 0x02d4b294 0x4 obj/skia/skia/SkLightingShader.o
# 0x02d4b294 guard variable for void GrProcessor::initClassID
# .data 0x0028c600 0x22d3468
# .data.rel.ro._ZTVN3gvr7android19ScopedJavaGlobalRefIP12_jfloatArrayEE
# 0x02d1e668 0x10 ../../third_party/.../libfoo.a(bar.o)
# 0x02d1e668 vtable for gvr::android::GlobalRef<_jfloatArray*>
# ** merge strings
# 0x0255fb00 0x1f2424
# ** merge constants
# 0x0255fb00 0x8
# ** common 0x02db5700 0x13ab48
syms = self._symbols
while True:
line = self._SkipToLineWithPrefix('.')
if not line:
break
section_name = None
try:
# Parse section name and size.
parts = self._ParsePossiblyWrappedParts(line, 3)
if not parts:
break
section_name, section_address_str, section_size_str = parts
section_address = int(section_address_str[2:], 16)
section_size = int(section_size_str[2:], 16)
self._section_sizes[section_name] = section_size
if (section_name in (models.SECTION_BSS,
models.SECTION_RODATA,
models.SECTION_TEXT) or
section_name.startswith(models.SECTION_DATA)):
logging.info('Parsing %s', section_name)
if section_name == models.SECTION_BSS:
# Common symbols have no address.
syms.extend(self._common_symbols)
prefix_len = len(section_name) + 1 # + 1 for the trailing .
symbol_gap_count = 0
merge_symbol_start_address = section_address
sym_count_at_start = len(syms)
line = next(self._lines)
# Parse section symbols.
while True:
if not line or line.isspace():
break
if line.startswith(' **'):
zero_index = line.find('0')
if zero_index == -1:
# Line wraps.
name = line.strip()
line = next(self._lines)
else:
# Line does not wrap.
name = line[:zero_index].strip()
line = line[zero_index:]
address_str, size_str = self._ParsePossiblyWrappedParts(line, 2)
line = next(self._lines)
# These bytes are already accounted for.
if name == '** common':
continue
address = int(address_str[2:], 16)
size = int(size_str[2:], 16)
path = None
sym = models.Symbol(section_name, size, address=address,
full_name=name, object_path=path)
syms.append(sym)
if merge_symbol_start_address > 0:
merge_symbol_start_address += size
else:
# A normal symbol entry.
subsection_name, address_str, size_str, path = (
self._ParsePossiblyWrappedParts(line, 4))
size = int(size_str[2:], 16)
assert subsection_name.startswith(section_name), (
'subsection name was: ' + subsection_name)
mangled_name = subsection_name[prefix_len:]
name = None
address_str2 = None
while True:
line = next(self._lines).rstrip()
if not line or line.startswith(' .'):
break
# clang includes ** fill, but gcc does not.
if line.startswith(' ** fill'):
# Alignment is explicitly recorded in the map file. Rather than
# recording padding based on these entries, we calculate it from
# addresses, because fill lines are not present when compiling
# with gcc (only with clang).
continue
elif line.startswith(' **'):
break
elif name is None:
address_str2, name = self._ParsePossiblyWrappedParts(line, 2)
if address_str == '0xffffffffffffffff':
# The section needs special handling (e.g., a merge section).
# It also generally has a large offset after it, so don't
# penalize the subsequent symbol for this gap (e.g., a 50kb gap).
# There seems to be no correlation between where these gaps occur
# and the symbols they fall between.
# TODO(agrieve): Learn more about why this happens.
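# Illustrative example: for the .text._ZNK5blink15... entry shown above,
# the second line gives address_str2 = 0x006808e1 and size = 0x46, so the
# address becomes 0x006808e0 and the gap tracking below resumes from
# merge_symbol_start_address = 0x00680926.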
if address_str2:
address = int(address_str2[2:], 16) - 1
elif syms and syms[-1].address > 0:
# Merge sym with no second line showing real address.
address = syms[-1].end_address
else:
logging.warning('First symbol of section had address -1')
address = 0
merge_symbol_start_address = address + size
else:
address = int(address_str[2:], 16)
# Finish off active address gap / merge section.
if merge_symbol_start_address:
merge_size = address - merge_symbol_start_address
merge_symbol_start_address = 0
if merge_size > 0:
# merge_size == 0 for the initial symbol generally.
logging.debug('Merge symbol of size %d found at:\n %r',
merge_size, syms[-1])
# Set size=0 so that it will show up as padding.
sym = models.Symbol(
section_name, 0,
address=address,
full_name='** symbol gap %d' % symbol_gap_count)
symbol_gap_count += 1
syms.append(sym)
# .text.res_findResource_60
# 0x00178de8 0x12a obj/...
# 0x00178de9 res_findResource_60
# .text._ZN3url6ParsedC2Ev
# 0x0021ad62 0x2e obj/url/url/url_parse.o
# 0x0021ad63 url::Parsed::Parsed()
# .text.unlikely._ZN4base3CPUC2Ev
# 0x003f9d3c 0x48 obj/base/base/cpu.o
# 0x003f9d3d base::CPU::CPU()
full_name = name
if mangled_name and (not name or mangled_name.startswith('_Z') or
'._Z' in mangled_name):
full_name = mangled_name
# Handle outlined functions. These are an LLD feature, but we handle
# them here for gold as well to facilitate testing.
if full_name and full_name.startswith('OUTLINED_FUNCTION_'):
full_name = '** outlined function'
sym = models.Symbol(section_name, size, address=address,
full_name=full_name, object_path=path)
syms.append(sym)
section_end_address = section_address + section_size
if section_name != models.SECTION_BSS and (
syms[-1].end_address < section_end_address):
# Set size=0 so that it will show up as padding.
sym = models.Symbol(
section_name, 0,
address=section_end_address,
full_name=(
'** symbol gap %d (end of section)' % symbol_gap_count))
syms.append(sym)
logging.debug('Symbol count for %s: %d', section_name,
len(syms) - sym_count_at_start)
except:
logging.error('Problem line: %r', line)
logging.error('In section: %r', section_name)
raise
class MapFileParserLld(object):
"""Parses a linker map file from LLD."""
# Map file writer for LLD linker (for ELF):
# https://github.com/llvm-mirror/lld/blob/HEAD/ELF/MapFile.cpp
_LINE_RE_V0 = re.compile(r'([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
_LINE_RE_V1 = re.compile(
r'\s*[0-9a-f]+\s+([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
_LINE_RE = [_LINE_RE_V0, _LINE_RE_V1]
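# In both patterns, group (1) is used as the symbol address (the LMA column
# for the v1 format), group (2) is the size, group (3) is the alignment
# (unused), group (4) captures the indentation that encodes nesting level
# (8 spaces per level), and group (5) is the remaining token.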
def __init__(self, linker_name):
self._linker_name = linker_name
self._common_symbols = []
self._section_sizes = {}
@staticmethod
def ParseArmAnnotations(tok):
"""Decides whether a Level 3 token is an annotation.
Returns:
A 2-tuple (is_annotation, next_thumb2_mode):
is_annotation: Whether |tok| is an annotation.
next_thumb2_mode: New |thumb2_mode| value, or None to keep the old value.
"""
# Annotations for ARM match '$t', '$d.1', but not '$_21::invoke'.
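# Illustrative results of the rules below:
#   ParseArmAnnotations('$t')           -> (True, True)   # Enter Thumb2 mode.
#   ParseArmAnnotations('$d.1')         -> (True, None)   # Keep current mode.
#   ParseArmAnnotations('$_21::invoke') -> (False, None)  # Not an annotation.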
if tok.startswith('$') and (len(tok) == 2 or
(len(tok) >= 3 and tok[2] == '.')):
if tok.startswith('$t'):
return True, True # Is annotation, enter Thumb2 mode.
if tok.startswith('$a'):
return True, False # Is annotation, enter ARM32 mode.
return True, None # Is annotation, keep old |thumb2_mode| value.
return False, None # Not annotation, keep old |thumb2_mode| value.
def Tokenize(self, lines):
"""Generator to filter and tokenize linker map lines."""
# Extract e.g., 'lld_v0' -> 0, or 'lld-lto_v1' -> 1.
map_file_version = int(self._linker_name.split('_v')[1])
pattern = MapFileParserLld._LINE_RE[map_file_version]
# A Level 3 symbol can have |size == 0| in some situations (e.g., assembly
# code symbols). To provide better size estimates in this case, the "span"
# of a Level 3 symbol is computed as:
# (A) The |address| difference compared to the next Level 3 symbol.
# (B) If the Level 3 symbol is the last one among Level 3 lines nested
# in a Level 2 line: the difference between the Level 3 symbol's
# |address| and the containing Level 2 line's end address.
# To handle (A), |lines| is visited using a one-step lookahead, using
# |sentinel| to handle the last line. To handle (B), |level2_end_address|
# is updated whenever a Level 2 line is processed.
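# Illustrative example: for Level 3 symbols at addresses 0x600 and 0x614
# nested in a Level 2 line spanning [0x600, 0x620), the first symbol's span
# is 0x14 (case A) and the last symbol's span is 0xc (case B).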
sentinel = '0 0 0 0 THE_END'
assert pattern.match(sentinel)
level2_end_address = None
thumb2_mode = False
(line, address, size, level, tok) = (None, None, None, None, None)
for next_line in itertools.chain(lines, (sentinel,)):
m = pattern.match(next_line)
if m is None:
continue
next_address = int(m.group(1), 16)
next_size = int(m.group(2), 16)
next_level = (len(m.group(4)) // 8) + 1  # Add 1 so levels are 1-based, matching the comments.
next_tok = m.group(5)
if next_level == 3:
assert level >= 2, 'Cannot jump from Level 1 to Level 3.'
# Detect annotations. If found, maybe update |thumb2_mode|, then skip.
(is_annotation, next_thumb2_mode) = (
MapFileParserLld.ParseArmAnnotations(next_tok))
if is_annotation:
if next_thumb2_mode:
thumb2_mode = next_thumb2_mode
continue # Skip annotations.
if thumb2_mode:
# Adjust odd addresses to even. Alignment is not guaranteed for all
# symbols (e.g., data, or x86), so this is applied judiciously.
next_address &= ~1
else:
thumb2_mode = False # Resets on leaving Level 3.
if address is not None:
span = None
if level == 3:
span = next_address if next_level == 3 else level2_end_address
span -= address
elif level == 2:
level2_end_address = address + size
yield (line, address, size, level, span, tok)
line = next_line
address = next_address
size = next_size
level = next_level
tok = next_tok
def Parse(self, lines):
"""Parses a linker map file.
Args:
lines: Iterable of lines, the first of which has been consumed to
identify file type.
Returns:
A tuple of (section_sizes, symbols).
"""
# Newest format:
# VMA LMA Size Align Out In Symbol
# 194 194 13 1 .interp
# 194 194 13 1 <internal>:(.interp)
# 1a8 1a8 22d8 4 .ARM.exidx
# 1b0 1b0 8 4 obj/sandbox/syscall.o:(.ARM.exidx)
# 400 400 123400 64 .text
# 600 600 14 4 obj/...:(.text.OUTLINED_FUNCTION_0)
# 600 600 0 1 $x.3
# 600 600 14 1 OUTLINED_FUNCTION_0
# 123800 123800 20000 256 .rodata
# 123800 123800 4 4 ...:o:(.rodata._ZN3fooE.llvm.1234)
# 123800 123800 4 1 foo (.llvm.1234)
# 123804 123804 4 4 ...:o:(.rodata.bar.llvm.1234)
# 123804 123804 4 1 bar.llvm.1234
# Older format:
# Address Size Align Out In Symbol
# 00000000002002a8 000000000000001c 1 .interp
# 00000000002002a8 000000000000001c 1 <internal>:(.interp)
# ...
# 0000000000201000 0000000000000202 16 .text
# 0000000000201000 000000000000002a 1 /[...]/crt1.o:(.text)
# 0000000000201000 0000000000000000 0 _start
# 000000000020102a 0000000000000000 1 /[...]/crti.o:(.text)
# 0000000000201030 00000000000000bd 16 /[...]/crtbegin.o:(.text)
# 0000000000201030 0000000000000000 0 deregister_tm_clones
# 0000000000201060 0000000000000000 0 register_tm_clones
# 00000000002010a0 0000000000000000 0 __do_global_dtors_aux
# 00000000002010c0 0000000000000000 0 frame_dummy
# 00000000002010ed 0000000000000071 1 a.o:(.text)
# 00000000002010ed 0000000000000071 0 main
syms = []
cur_section = None
cur_section_is_useful = None
promoted_name_count = 0
tokenizer = self.Tokenize(lines)
# TODO(huangs): Use |span| from |tokenizer| to fix http://crbug.com/892648.
for (line, address, size, level, _, tok) in tokenizer:
# Level 1 data match the "Out" column. They specify sections or
# PROVIDE_HIDDEN lines.
if level == 1:
if not tok.startswith('PROVIDE_HIDDEN'):
self._section_sizes[tok] = size
cur_section = tok
# E.g., Want to convert "(.text._name)" -> "_name" later.
mangled_start_idx = len(cur_section) + 2
cur_section_is_useful = (
cur_section in (models.SECTION_BSS,
models.SECTION_RODATA,
models.SECTION_TEXT) or
cur_section.startswith(models.SECTION_DATA))
elif cur_section_is_useful:
# Level 2 data match the "In" column. They specify object paths and
# section names within objects, or '<internal>:...'.
if level == 2:
# Create the symbol, which Level 3 parsing may modify as syms[-1].
syms.append(models.Symbol(cur_section, size, address=address))
# E.g. path.o:(.text._name)
cur_obj, paren_value = tok.split(':')
# '(.text._name)' -> '_name'.
mangled_name = paren_value[mangled_start_idx:-1]
# As of 2017/11 LLD does not distinguish merged strings from other
# merged data. A feature request is filed under:
# https://bugs.llvm.org/show_bug.cgi?id=35248
if cur_obj == '<internal>':
if cur_section == '.rodata' and mangled_name == '':
# Treat all <internal> sections within .rodata as string
# literals. Some may hold numeric constants or other data, but
# there is currently no way to distinguish them.
syms[-1].full_name = '** lld merge strings'
else:
# e.g. <internal>:(.text.thunk)
syms[-1].full_name = '** ' + mangled_name
elif cur_obj == 'lto.tmp' or 'thinlto-cache' in cur_obj:
pass
else:
syms[-1].object_path = cur_obj
# Level 3 data match the "Symbol" column. They specify symbol names or
# special names such as '.L_MergedGlobals'. Annotations such as '$d' and
# '$t.42' also appear at Level 3, but they are consumed by |tokenizer|,
# so they don't appear here.
elif level == 3:
# Ignore anything with '.L_MergedGlobals' prefix. This seems to only
# happen for ARM (32-bit) builds.
if tok.startswith('.L_MergedGlobals'):
continue
# Multiple Level 3 entries may exist. Take the first with |size != 0|.
# TODO(huangs): Process all entries to fix http://crbug.com/892648.
if size and not syms[-1].full_name:
# Outlined functions have names like OUTLINED_FUNCTION_0, which can
# appear 1000+ times and can cause false aliasing. We treat these as
# special cases by designating them as placeholder symbols and
# renaming them to '** outlined function'.
if tok.startswith('OUTLINED_FUNCTION_'):
tok = '** outlined function'
stripped_tok = demangle.StripLlvmPromotedGlobalNames(tok)
if len(tok) != len(stripped_tok):
promoted_name_count += 1
tok = stripped_tok
syms[-1].full_name = tok
else:
logging.error('Problem line: %r', line)
if promoted_name_count:
logging.info('Found %d promoted global names', promoted_name_count)
return self._section_sizes, syms
def _DetectLto(lines):
"""Scans LLD linker map file and returns whether LTO was used."""
# It's assumed that the first line in |lines| was consumed to determine that
# LLD was used. Seek 'thinlto-cache' prefix within an "indicator section" as
# indicator for LTO.
found_indicator_section = False
# Potential names of the "indicator section". Only one gets used.
indicator_section_set = set(['.rodata', '.ARM.exidx'])
start_pos = -1
for line in lines:
# Shortcut to avoid regex: The first line seen (the second line in the
# file) should start a section whose name begins with '.', e.g.:
#   194 194 13 1 .interp
# Assign |start_pos| to the position of '.', and trim everything before it.
if start_pos < 0:
start_pos = line.index('.')
if len(line) < start_pos:
continue
line = line[start_pos:]
tok = line.lstrip()  # Left-strip only, so that indentation can be measured.
indent_size = len(line) - len(tok)
if indent_size == 0: # Section change.
if found_indicator_section: # Exit once the indicator section has been scanned.
break
if tok.strip() in indicator_section_set:
found_indicator_section = True
elif indent_size == 8:
if found_indicator_section:
if tok.startswith('thinlto-cache'):
return True
return False
def DetectLinkerNameFromMapFile(lines):
"""Heuristic linker detection from partial scan of the linker map.
Args:
lines: Iterable of lines from the linker map.
Returns:
A coded linker name.
"""
first_line = next(lines)
if first_line.startswith('Address'):
return 'lld-lto_v0' if _DetectLto(lines) else 'lld_v0'
if first_line.lstrip().startswith('VMA'):
return 'lld-lto_v1' if _DetectLto(lines) else 'lld_v1'
if first_line.startswith('Archive member'):
return 'gold'
raise Exception('Invalid map file: ' + first_line)
class MapFileParser(object):
"""Parses a linker map file generated from a specified linker."""
def Parse(self, linker_name, lines):
"""Parses a linker map file.
Args:
linker_name: Coded linker name to specify a linker.
lines: Iterable of lines from the linker map.
Returns:
A tuple of (section_sizes, symbols).
"""
next(lines) # Consume the first line of headers.
if linker_name.startswith('lld'):
inner_parser = MapFileParserLld(linker_name)
elif linker_name == 'gold':
inner_parser = MapFileParserGold()
else:
raise Exception('.map file is from an unsupported linker.')
section_sizes, syms = inner_parser.Parse(lines)
for sym in syms:
if sym.object_path and not sym.object_path.endswith(')'):
# Don't want '' to become '.'.
# Thin archives' paths will get fixed in |ar.CreateThinObjectPath|.
sym.object_path = os.path.normpath(sym.object_path)
return (section_sizes, syms)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('linker_file', type=os.path.realpath)
parser.add_argument(
'-v',
'--verbose',
default=0,
action='count',
help='Verbose level (multiple times for more)')
parser.add_argument('--dump', action='store_true')
args = parser.parse_args()
logging.basicConfig(
level=logging.WARNING - args.verbose * 10,
format='%(levelname).1s %(relativeCreated)6d %(message)s')
with open(args.linker_file, 'r') as map_file:
linker_name = DetectLinkerNameFromMapFile(map_file)
print('Linker type: %s' % linker_name)
with open(args.linker_file, 'r') as map_file:
section_sizes, syms = MapFileParser().Parse(linker_name, map_file)
if args.dump:
print(section_sizes)
for sym in syms:
print(sym)
else:
# Enter interactive shell.
readline.parse_and_bind('tab: complete')
variables = {'section_sizes': section_sizes, 'syms': syms}
banner_lines = [
'*' * 80,
'Variables:',
' section_sizes: Map from section to sizes.',
' syms: Raw symbols parsed from the linker map file.',
'*' * 80,
]
code.InteractiveConsole(variables).interact('\n'.join(banner_lines))
if __name__ == '__main__':
main()