| 1 | from __future__ import print_function
|
| 2 | """
|
| 3 | callgraph.py
|
| 4 | """
|
| 5 |
|
| 6 | import collections
|
| 7 | import os
|
| 8 | import sys
|
| 9 |
|
| 10 | import __builtin__ # For looking up names
|
| 11 | import types
|
| 12 |
|
| 13 | from .lib import dis
|
| 14 | from .lib import inspect
|
| 15 |
|
| 16 | from .util import log
|
| 17 |
|
| 18 |
|
| 19 | def Disassemble(co):
|
| 20 | """Given a code object, yield instructions of interest.
|
| 21 |
|
| 22 | Args:
|
| 23 | co: __code__ attribute
|
| 24 |
|
| 25 | Structure copied from opy/compiler2/dis_tool.py, which was copied from
|
| 26 | dis.disassemble().
|
| 27 | """
|
| 28 | code = co.co_code
|
| 29 | extended_arg = 0 # Not used
|
| 30 |
|
| 31 | i = 0
|
| 32 | n = len(code)
|
| 33 | #log('\tLENGTH OF CODE %s: %d', co.co_name, n)
|
| 34 |
|
| 35 | while i < n:
|
| 36 | #log('\ti = %d, n = %d', i, n)
|
| 37 | op = ord(code[i])
|
| 38 | i += 1
|
| 39 |
|
| 40 | op_name = dis.opname[op]
|
| 41 |
|
| 42 | # operation is 1 byte, argument is 2 bytes.
|
| 43 | if op >= dis.HAVE_ARGUMENT:
|
| 44 | oparg = ord(code[i]) + ord(code[i+1])*256 + extended_arg
|
| 45 | extended_arg = 0
|
| 46 |
|
| 47 | if op == dis.EXTENDED_ARG:
|
| 48 | # Hm not used?
|
| 49 | raise AssertionError
|
| 50 | extended_arg = oparg*65536L
|
| 51 |
|
| 52 | i += 2
|
| 53 |
|
| 54 | const_name = None
|
| 55 | var_name = None
|
| 56 |
|
| 57 | if op in dis.hasconst:
|
| 58 | const_name = co.co_consts[oparg]
|
| 59 |
|
| 60 | elif op in dis.hasname:
|
| 61 | try:
|
| 62 | var_name = co.co_names[oparg]
|
| 63 | except IndexError:
|
| 64 | log('Error: %r cannot index %s with %d', op_name, co.co_names,
|
| 65 | oparg)
|
| 66 | raise
|
| 67 |
|
| 68 | elif op in dis.hasjrel:
|
| 69 | #raise AssertionError(op_name)
|
| 70 | pass
|
| 71 |
|
| 72 | elif op in dis.haslocal:
|
| 73 | #raise AssertionError(op_name)
|
| 74 | pass
|
| 75 |
|
| 76 | elif op in dis.hascompare:
|
| 77 | #raise AssertionError(op_name)
|
| 78 | pass
|
| 79 |
|
| 80 | elif op in dis.hasfree:
|
| 81 | #raise AssertionError(op_name)
|
| 82 | pass
|
| 83 |
|
| 84 | yield op_name, const_name, var_name
|
| 85 | #log('\t==> i = %d, n = %d', i, n)
|
| 86 |
|
| 87 |
|
| 88 | import sre_compile
|
| 89 |
|
| 90 | def _GetAttr(module, name):
|
| 91 | # Hack for bug in _fixup_range() ! (No longer in Python 3.6 head.)
|
| 92 | if module is sre_compile and name == 'l':
|
| 93 | return None
|
| 94 | # traceback.py has a hasattr() test
|
| 95 | if module is sys and name == 'tracebacklimit':
|
| 96 | return None
|
| 97 |
|
| 98 | try:
|
| 99 | val = getattr(module, name)
|
| 100 | except AttributeError:
|
| 101 | #log('%r not on %r', name, module)
|
| 102 | # This could raise too
|
| 103 | val = getattr(__builtin__, name)
|
| 104 | return val
|
| 105 |
|
| 106 |
|
| 107 | def _Walk(obj, cls, ref, syms):
|
| 108 | """
|
| 109 | Discover statically what (globally-accessible) functions and classes are
|
| 110 | used.
|
| 111 |
|
| 112 | Args:
|
| 113 | obj: a callable object
|
| 114 | cls: an optional class that it was attached to, could be None
|
| 115 | ref: the way the object was referenced from another place in the code
|
| 116 | syms: output Symbols()
|
| 117 |
|
| 118 | Something like this is OK:
|
| 119 | # def Adder(x):
|
| 120 | # def f(y):
|
| 121 | # return x + y
|
| 122 | # return f
|
| 123 |
|
| 124 | Because we'll still have access to the inner code object. We probably won't
|
| 125 | compile it though.
|
| 126 | """
|
| 127 | if syms.Seen(obj):
|
| 128 | return
|
| 129 |
|
| 130 | module_name = getattr(obj, '__module__', None)
|
| 131 | # NOTE: We get duplicate objects like this, due to bound methods on different
|
| 132 | # objects. Not sure how to fix it.
|
| 133 | #
|
| 134 | # OBJ <function Parse at 0x7fcd270b4de8>
|
| 135 | # OBJ <function Parse at 0x7fcd2709e500>
|
| 136 | # OBJ <built-in method get of dict object at 0x7fcd28c53280>
|
| 137 | # OBJ <built-in method get of dict object at 0x7fcd28c53398>
|
| 138 |
|
| 139 | #if obj.__name__ in ('write', 'get', 'Parse'):
|
| 140 | # log('OBJ %s %d', obj, id(obj))
|
| 141 | # pass
|
| 142 |
|
| 143 | # Oh is the namedtuple_ crap because of the Block class byterun/pyobj?
|
| 144 |
|
| 145 | if module_name is None or module_name in (
|
| 146 | 'namedtuple_Arguments', 'namedtuple_ArgSpec',
|
| 147 | 'namedtuple_Block'):
|
| 148 | syms.Add(obj, None, ref, None, None, None)
|
| 149 | return # Can't walk anything
|
| 150 |
|
| 151 | #log('OBJ %s %d', obj, id(obj))
|
| 152 | module = sys.modules[obj.__module__]
|
| 153 |
|
| 154 | co = getattr(obj, '__code__', None)
|
| 155 | # For example, Builtins don't have bytecode.
|
| 156 | if isinstance(co, types.CodeType):
|
| 157 | co = obj.__code__
|
| 158 | #log('CO %s', co)
|
| 159 | #path = co.co_filename
|
| 160 |
|
| 161 | mod_name = None
|
| 162 | mod_path = co.co_filename
|
| 163 |
|
| 164 | syms.Add(obj, cls, ref, mod_name, mod_path, co.co_firstlineno)
|
| 165 | else:
|
| 166 | mod_name = module.__name__
|
| 167 | try:
|
| 168 | mod_path = module.__file__
|
| 169 | if mod_path.endswith('.pyc'):
|
| 170 | mod_path = mod_path[:-1]
|
| 171 | except AttributeError:
|
| 172 | mod_path = None
|
| 173 |
|
| 174 | #if obj.__name__ == 'lex_mode_e':
|
| 175 | # log('!!! %s name %s path %s', obj, mod.__name__, mod.__file__)
|
| 176 |
|
| 177 | #mod_name = None
|
| 178 | #mod_path = None
|
| 179 |
|
| 180 | syms.Add(obj, cls, ref, mod_name, mod_path, None)
|
| 181 | return
|
| 182 |
|
| 183 | #log('\tNAME %s', val.__code__.co_name)
|
| 184 | #log('\tNAMES %s', val.__code__.co_names)
|
| 185 |
|
| 186 | # Most functions and classes we call are globals!
|
| 187 | #log('\t_Walk %s %s', obj, module)
|
| 188 | #log('\t%s', sorted(dir(module)))
|
| 189 |
|
| 190 | # Have to account for foo.Bar(), which gives this sequence:
|
| 191 | # 2 0 LOAD_GLOBAL 0 (foo)
|
| 192 | # 3 LOAD_ATTR 1 (Bar)
|
| 193 | # 6 CALL_FUNCTION 0
|
| 194 | #
|
| 195 | # Also: os.path.join().
|
| 196 |
|
| 197 | try:
|
| 198 | last_val = None # value from previous LOAD_GLOBAL or LOAD_ATTR
|
| 199 | ref = []
|
| 200 | g = Disassemble(obj.__code__)
|
| 201 |
|
| 202 | while True:
|
| 203 | op, const, var = g.next()
|
| 204 |
|
| 205 | if op == 'LOAD_GLOBAL':
|
| 206 | val = _GetAttr(module, var)
|
| 207 | ref = [var] # reset it
|
| 208 |
|
| 209 | elif op == 'LOAD_ATTR':
|
| 210 | #if last_val is not None and isinstance(last_val, types.ModuleType):
|
| 211 | if last_val is not None:
|
| 212 | #log('%s %s', op, var)
|
| 213 | val = _GetAttr(last_val, var)
|
| 214 | ref.append(var)
|
| 215 |
|
| 216 | # Crawl the methods below. Otherwise we get duplicate bound/unbound
|
| 217 | # methods, which have unique addresses.
|
| 218 | # Examples: WAIT_SPEC.Parse, sys.stdout.write
|
| 219 |
|
| 220 | # BUG: os.fork and sys.stdout.write are the same?
|
| 221 | # I thought os.fork is types.BuiltinFunctionType, and
|
| 222 | # sys.stdout.write is types.BuiltinMethodType, but why not?
|
| 223 |
|
| 224 | if isinstance(val, (types.MethodType, types.BuiltinMethodType)):
|
| 225 | val = None
|
| 226 | ref = []
|
| 227 |
|
| 228 | else:
|
| 229 | val = None
|
| 230 | ref = []
|
| 231 |
|
| 232 | else: # Some other opcode
|
| 233 | val = None
|
| 234 | ref = []
|
| 235 |
|
| 236 | if callable(val):
|
| 237 | #log('VAL %s module %s', val, val.__module__)
|
| 238 | # Recursive call.
|
| 239 |
|
| 240 | # Check for old style:
|
| 241 | #if isinstance(val, types.ClassType):
|
| 242 | # print('OLD %s' % val)
|
| 243 | _Walk(val, None, ref, syms)
|
| 244 |
|
| 245 | # If the value is a class, walk its methods. Note that we assume ALL
|
| 246 | # methods are used. It's possible to narrow this down a bit and detect
|
| 247 | # unused methods.
|
| 248 | if isinstance(val, type):
|
| 249 | cls = val
|
| 250 | #log('type %s', val)
|
| 251 | for name in dir(val):
|
| 252 | # prevent error with __abstractmethods__ attribute
|
| 253 | #if name.startswith('__'):
|
| 254 | if name == '__abstractmethods__':
|
| 255 | continue
|
| 256 | field_val = getattr(val, name)
|
| 257 | #log('field_val %s', field_val)
|
| 258 | if isinstance(field_val, types.MethodType):
|
| 259 | func_obj = field_val.im_func
|
| 260 | _Walk(func_obj, cls, ref, syms)
|
| 261 |
|
| 262 | last_val = val # Used on next iteration
|
| 263 |
|
| 264 | except StopIteration:
|
| 265 | pass
|
| 266 |
|
| 267 | #log('\tDone _Walk %s %s', obj, module)
|
| 268 |
|
| 269 |
|
| 270 | def PrintSig(fmt, func):
|
| 271 | if os.getenv('CALLGRAPH_SIG') != '1':
|
| 272 | return
|
| 273 | try:
|
| 274 | argspec = inspect.getargspec(func)
|
| 275 | except TypeError:
|
| 276 | return
|
| 277 | parts = [':%s' % ' '.join(argspec.args)]
|
| 278 | # These are keyword-only args?
|
| 279 | if argspec.varargs:
|
| 280 | parts.append('varargs=%s' % (argspec.varargs,))
|
| 281 | if argspec.keywords:
|
| 282 | parts.append('kw=%s' % (argspec.keywords,))
|
| 283 | # Hm the default of 'None' is bad -- you can't distinguish a default of None,
|
| 284 | # which is very common!
|
| 285 | # It's better to get this by parsing the AST.
|
| 286 | if argspec.defaults and any(d is not None for d in argspec.defaults):
|
| 287 | parts.append('defaults=%s' % (argspec.defaults,))
|
| 288 |
|
| 289 | print(fmt % ' '.join(parts))
|
| 290 |
|
| 291 |
|
| 292 | class Class(object):
|
| 293 |
|
| 294 | def __init__(self, cls, mod_name, mod_path):
|
| 295 | self.cls = cls
|
| 296 | self.mod_name = mod_name
|
| 297 | self.mod_path = mod_path
|
| 298 | self.methods = []
|
| 299 |
|
| 300 | def Name(self):
|
| 301 | return self.cls.__name__
|
| 302 |
|
| 303 | def AddMethod(self, m, mod_name, mod_path, line_num):
|
| 304 | # Just assume the method is in the same file as the class itself.
|
| 305 | self.methods.append((m, mod_name, mod_path, line_num))
|
| 306 |
|
| 307 | def Print(self):
|
| 308 | base_names = ' '.join(c.__name__ for c in self.cls.__bases__)
|
| 309 | print(' %s(%s)' % (self.cls.__name__, base_names))
|
| 310 |
|
| 311 | methods = [(m.__name__, m) for (m, _, _, _) in self.methods]
|
| 312 | methods.sort()
|
| 313 | for name, m in methods:
|
| 314 | print(' %s' % name)
|
| 315 | PrintSig(' %s', m)
|
| 316 |
|
| 317 | def __repr__(self):
|
| 318 | return '<Class %s %s %s %s>' % (
|
| 319 | self.cls, self.mod_name, self.mod_path, self.methods)
|
| 320 |
|
| 321 |
|
| 322 | class Symbols(object):
|
| 323 | """A sink for discovered symbols."""
|
| 324 |
|
| 325 | def __init__(self):
|
| 326 | self.seen = set()
|
| 327 |
|
| 328 | self.classes = {} # integer id() -> Class
|
| 329 | self.functions = [] # list of callables
|
| 330 |
|
| 331 | self.paths = {} # path -> list of functions
|
| 332 |
|
| 333 | def Seen(self, c):
|
| 334 | """c: a callable."""
|
| 335 | id_ = id(c)
|
| 336 | if id_ in self.seen:
|
| 337 | return True
|
| 338 | self.seen.add(id_)
|
| 339 | return False
|
| 340 |
|
| 341 | def Add(self, obj, cls, ref, mod_name, mod_path, line_num):
|
| 342 | """Could be a function, class Constructor, or method.
|
| 343 | Can also be native (C) or interpreted (Python with __code__ attribute.)
|
| 344 |
|
| 345 | Returns:
|
| 346 | True if we haven't yet seen it.
|
| 347 | """
|
| 348 | if mod_path is not None:
|
| 349 | mod_path = os.path.normpath(mod_path)
|
| 350 |
|
| 351 | if isinstance(obj, type):
|
| 352 | id_ = id(obj)
|
| 353 |
|
| 354 | # NOTE: Python's classes don't have a __code__ object, which appears to
|
| 355 | # be an irregularity. So we have to get location information from the
|
| 356 | # METHODS.
|
| 357 |
|
| 358 | # Exception: this type has a __code__ object that is not types.CodeType
|
| 359 | if obj is not types.FunctionType:
|
| 360 | assert not hasattr(obj, '__code__'), obj
|
| 361 | #assert path is None
|
| 362 | assert line_num is None
|
| 363 |
|
| 364 | self.classes[id_] = Class(obj, mod_name, mod_path)
|
| 365 |
|
| 366 | elif cls is not None:
|
| 367 | id_ = id(cls)
|
| 368 | descriptor = self.classes[id_]
|
| 369 | descriptor.AddMethod(obj, mod_name, mod_path, line_num)
|
| 370 |
|
| 371 | else:
|
| 372 | self.functions.append((obj, ref, mod_name, mod_path, line_num))
|
| 373 |
|
| 374 | return True
|
| 375 |
|
| 376 | def Report(self, f=sys.stdout):
|
| 377 | # Now categorize into files. We couldn't do that earlier because classes
|
| 378 | # don't know where they are located!
|
| 379 |
|
| 380 | py_srcs = collections.defaultdict(SourceFile)
|
| 381 | c_srcs = collections.defaultdict(SourceFile)
|
| 382 |
|
| 383 | for func, ref, mod_name, mod_path, line_num in self.functions:
|
| 384 | if mod_path:
|
| 385 | py_srcs[mod_path].functions.append((func, ref, line_num))
|
| 386 | else:
|
| 387 | c_srcs[mod_name].functions.append((func, ref, line_num))
|
| 388 |
|
| 389 | for cls in self.classes.values():
|
| 390 | #if cls.cls.__name__ == 'lex_mode_e':
|
| 391 | # log('!!! %s', cls)
|
| 392 |
|
| 393 | if cls.mod_path:
|
| 394 | py_srcs[cls.mod_path].classes.append(cls)
|
| 395 | else:
|
| 396 | c_srcs[cls.mod_name].classes.append(cls)
|
| 397 |
|
| 398 | prefix = os.getcwd()
|
| 399 | n = len(prefix) + 1
|
| 400 |
|
| 401 | #for name in sorted(py_srcs):
|
| 402 | # print(name)
|
| 403 |
|
| 404 | #return
|
| 405 |
|
| 406 | # Still missing: non-enum ASDL types? Why? CompoundObj?
|
| 407 | # command_e is there, but command and SimpleCommand aren't.
|
| 408 | # it's because we do
|
| 409 | # ast.command_e vs. ast.SimpleCommand
|
| 410 | # in both cases ast is a core.meta _AsdlModule?
|
| 411 |
|
| 412 | print('PYTHON CODE')
|
| 413 | print()
|
| 414 |
|
| 415 | for path in sorted(py_srcs):
|
| 416 | src = py_srcs[path]
|
| 417 |
|
| 418 | if path is not None and path.startswith(prefix):
|
| 419 | path = path[n:]
|
| 420 |
|
| 421 | print('%s' % path)
|
| 422 |
|
| 423 | for func, ref, _ in src.functions:
|
| 424 | third = func
|
| 425 | #third = ''
|
| 426 | #print(' %s [%s] %s' % (func.__name__, '.'.join(ref), third))
|
| 427 | try:
|
| 428 | print(' %s' % func.__name__)
|
| 429 | except AttributeError:
|
| 430 | # In the CI, when we don't have fastlex, we get _MatchOshToken_Slow()
|
| 431 | # instances
|
| 432 | print(" NOTE: %s found at top level, but it's not a simple function" % func)
|
| 433 |
|
| 434 | PrintSig(' %s', func)
|
| 435 |
|
| 436 | classes = [(c.Name(), c) for c in src.classes]
|
| 437 | classes.sort()
|
| 438 | for c in src.classes:
|
| 439 | c.Print()
|
| 440 |
|
| 441 | print()
|
| 442 |
|
| 443 | print('NATIVE CODE')
|
| 444 | print()
|
| 445 |
|
| 446 | for mod_name in sorted(c_srcs):
|
| 447 | src = c_srcs[mod_name]
|
| 448 |
|
| 449 | print('%s' % mod_name)
|
| 450 |
|
| 451 | for func, ref, _ in src.functions:
|
| 452 | #third = func
|
| 453 | third = ''
|
| 454 | #print(' %s [%s] %s' % (func.__name__, '.'.join(ref), third))
|
| 455 | print(' %s' % func.__name__)
|
| 456 |
|
| 457 | classes = [(c.Name(), c) for c in src.classes]
|
| 458 | classes.sort()
|
| 459 | for c in src.classes:
|
| 460 | c.Print()
|
| 461 |
|
| 462 | print()
|
| 463 |
|
| 464 |
|
| 465 | class SourceFile(object):
|
| 466 |
|
| 467 | def __init__(self):
|
| 468 | self.classes = []
|
| 469 | self.functions = []
|
| 470 |
|
| 471 |
|
| 472 | def Walk(main, modules):
|
| 473 | """Given a function main, finds all functions it transitively calls.
|
| 474 |
|
| 475 | Uses heuristic bytecode analysis. Limitations:
|
| 476 |
|
| 477 | - functions that are local variables might not work? But this should work:
|
| 478 |
|
| 479 | if x > 0:
|
| 480 | f = GlobalFunc
|
| 481 | else:
|
| 482 | f = OtherGlobalFunc
|
| 483 | f() # The LOAD_GLOBAL will be found.
|
| 484 |
|
| 485 | Args:
|
| 486 | main: function
|
| 487 | modules: Dict[str, module]
|
| 488 | """
|
| 489 | syms = Symbols()
|
| 490 | _Walk(main, None, ['main'], syms)
|
| 491 |
|
| 492 | # TODO:
|
| 493 | # - co_consts should be unified? So we know how big the const pool is.
|
| 494 |
|
| 495 | syms.Report()
|
| 496 |
|