diff --git a/.gitignore b/.gitignore index dacbe65..e9377f2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ tags *.pyc *.swp build/ +doc/_build dist/ *.egg-info tests/priv +tests/.cache .tox/ diff --git a/README.rst b/README.rst index d1199cb..caa183b 100644 --- a/README.rst +++ b/README.rst @@ -13,11 +13,6 @@ Amoco | Version: | 2.4 | +-----------+-----------------------------------+ -.. contents:: **Table of Contents** - :local: - :depth: 3 - :backlinks: top - Description =========== @@ -31,40 +26,26 @@ It features: fits in less than 800 lines of Python. The full SPARCv8 RISC decoder (or the ARM THUMB-1 set as well) fits in less than 350 lines. The ARMv8 instruction set decoder is less than - 650 lines. See arch_ for details. + 650 lines. - a **symbolic** algebra module which allows to describe the semantics of every instructions and compute a functional representation of instruction - blocks. See cas_ for details. + blocks. - a generic execution model wich provides an abstract memory model to deal with concrete or symbolic values transparently, and other system-dependent - features. See system_ for details. + features. - various classes implementing usual disassembly techniques like linear sweep, recursive traversal, or more elaborated techniques like path-predicate which relies on SAT/SMT solvers to proceed with discovering the control flow graph or even to implement techniques like DARE (Directed Automated - Random Exploration). See main.py_ for details. + Random Exploration). - various generic "helpers" and arch-dependent pretty printers to allow custom look-and-feel configurations (think AT&T vs. Intel syntax, absolute vs. relative offsets, decimal or hex immediates, etc). - See arch_ for details. Amoco is still *work in progress*. See Todo_ for a list of features to be merged from develop branch or to be more thoroughly implemented. 
-History -======= - -Development started in late 2006 with a proof-of-concept for symbolic -interpretation of x86 ELF programs. At that time it used a modified -version of minisat_ for simplifying symbolic expressions. -In 2009, it was fully rewritten with support for various other architectures -(``z80, armv7/thumb``) and executable formats (``PE, Gameboy Cardridge``). -In 2013 the internal decoding system was redesigned, and the minisat solver -was replaced by z3_. The ``armv8`` and ``sparc`` architectures were added. - -Despite being (just) yet another tool for analysing binaries, -in 2014 a dedicated 'release' branch was created with most of the above -features to be open-sourced. +User documentation and API can be found at https://amoco.readthedocs.org Todo ==== @@ -73,1262 +54,14 @@ Some components of Amoco are still in the process of being pushed to the release branch or further developed. More precisely: -- x86 fpu and sse instructions semantics are not implemented, +- x86 fpu instructions semantics are not implemented, - arm SIMD, VFP, NEON, TrustZone, Jazelle instruction sets are not implemented, -- pretty printers based on pygments package are not merged, -- solver-based disassembling strategies are not merged yet. -- persistent database (session) and idb import/export features are planned (Q2 2015). -- sphinx documentation is planned. +- some solver-based disassembling strategies are not merged yet. +- idb import/export features are not implemented. - MIPS, 6502 and PPC archs are planned. Contributions to fulfill uncomplete/unimplemented parts are welcome. 
- -Install -======= - -Amoco is tested on python 2.7 and depends on the following python packages: - -- grandalf_ used for building CFG (and eventually rendering it) -- crysp_ used by the generic intruction decoder (``arch/core.py``) -- z3_ used to simplify expressions and solve constraints -- pygments_ used for pretty printing of assembly code and expressions -- pyparsing_ for parsing instruction decoder formats -- ply_ (optional), for parsing *GNU as* files -- zodb_ (optional), provides persistence of amoco objects in a database - - -Quickstart -========== - -Below is a very simple example where basic blocks are build with linear sweep: - -.. sourcecode:: python2 - - >>> import amoco - >>> p = amoco.system.loader.load_program('tests/samples/x86/flow.elf') - amoco.system.loader: INFO: Elf32 file detected - amoco.system.loader: INFO: linux_x86 program created - >>> p - - - -We are analysing file ``flow.elf``. Since we don't know nothing about it -we start by using a high level loader which will try to detect its format -and target platform and provide some feedback info. Here the loader -creates a ``linux_x86.ELF`` object which shall represent the program task. - - -.. sourcecode:: python2 - - >>> p.bin - - >>> print p.mmap - - - - - - - - - > - - >>> p.mmap.read(0x0804a004,4) - [] - >>> print _[0] - @malloc - >>> p.mmap.read(0x0804a00c,6) - [, '\x00\x00'] - - -The object gives access to the Elf32 object and its mapping in our abstract -memory model. We can note that in this model, imports location in .got segment -are modeled as abstract expressions of type ``ext``. Note also that fetching -compound data (symbolic+concrete) is possible. See MemoryZone_ for more details. -Lets proceed with getting some basic blocks... - -.. 
sourcecode:: python2 - - >>> z = amoco.lsweep(p) - >>> ib = z.iterblocks() - >>> next(ib) - - >>> b=_ - >>> print b - # --- block 0x8048380 --- - 0x8048380 31ed xor ebp,ebp - 0x8048382 5e pop esi - 0x8048383 89e1 mov ecx,esp - 0x8048385 83e4f0 and esp,0xfffffff0 - 0x8048388 50 push eax - 0x8048389 54 push esp - 0x804838a 52 push edx - 0x804838b 6810860408 push #__libc_csu_fini - 0x8048390 68a0850408 push #__libc_csu_init - 0x8048395 51 push ecx - 0x8048396 56 push esi - 0x8048397 68fd840408 push #main - 0x804839c e8cfffffff call *0x8048370 - >>> b.instr - [, , , , , , , , , , , , ] - >>> i = b.instr[-1] - >>> i - - >>> print i - 0x804839c e8cfffffff call *0x8048370 - >>> i.mnemonic - 'CALL' - >>> i.bytes - '\xe8\xcf\xff\xff\xff' - >>> i._uarch['i_CALL'] - - >>> str(i.operands[0]) - '-0x31' - >>> i.operands[0].value - -49L - >>> i.typename() - 'control_flow' - - -We use here the most basic **linear sweep** approach and spawn a basic -block iterator. The first block is well known. We can see that the default -x86 pretty printer uses Intel syntax and codehelpers that show PLT refs -as associated .got ``ext`` expression. Also, relative offsets are displayed -as absolute addresses (indicated by the \* prefix). - -Lets look at the symbolic execution of this block: - -.. 
sourcecode:: python2 - - >>> b.map - - >>> print b.map - ebp <- { | [0:32]->0x0 | } - esi <- { | [0:32]->M32(esp) | } - ecx <- { | [0:32]->(esp+0x4) | } - eflags <- { | [0:1]->0x0 | [1:2]->eflags[1:2] | [2:3]->(0x6996>>(((esp+0x4)&0xfffffff0)[0:8]^(((esp+0x4)&0xfffffff0)[0:8]>>0x4))[0:4])[0:1] | [3:6]->eflags[3:6] | [6:7]->(((esp+0x4)&0xfffffff0)==0x0) | [7:8]->(((esp+0x4)&0xfffffff0)<0x0) | [8:11]->eflags[8:11] | [11:12]->0x0 | [12:32]->eflags[12:32] | } - ((((esp+0x4)&0xfffffff0)-4)) <- eax - ((((esp+0x4)&0xfffffff0)-8)) <- (((esp+0x4)&0xfffffff0)-0x4) - ((((esp+0x4)&0xfffffff0)-12)) <- edx - ((((esp+0x4)&0xfffffff0)-16)) <- 0x8048610 - ((((esp+0x4)&0xfffffff0)-20)) <- 0x80485a0 - ((((esp+0x4)&0xfffffff0)-24)) <- (esp+0x4) - ((((esp+0x4)&0xfffffff0)-28)) <- M32(esp) - ((((esp+0x4)&0xfffffff0)-32)) <- 0x80484fd - esp <- { | [0:32]->(((esp+0x4)&0xfffffff0)-0x24) | } - ((((esp+0x4)&0xfffffff0)-36)) <- (eip+0x21) - eip <- { | [0:32]->(eip+-0x10) | } - >>> b.map[p.cpu.esi] - - >>> e=_ - >>> print e - M32(esp) - >>> e.length - 4 - >>> e.size - 32 - - -When a block is instanciated, a ``mapper`` object is automatically created. -This function can map any input state to an output state corresponding to the -interpretation of this block. - -A mapper object is now also equipped with a MemoryMap to mitigate aliasing issues -and ease updating the global mmap state. - -.. sourcecode:: python2 - - >>> print b.map.memory() - - - - - - - - - - > - >>> print b.map(p.cpu.mem(p.cpu.esp,64)) - { | [0:32]->(eip+0x21) | [32:64]->0x80484fd | } - >>> print b.map(p.cpu.mem(p.cpu.ebx,32)) - M32$9(ebx) - - -As shown above, reading memory in the mapper can return a compound expression. -Note also that unmapped areas are returned as symbolic mem objects. 
-Since aliasing between different MemoryZones is possible, the returned -symbolic expression of fetching memory at pointer ``ebx`` is special: -the ``M32$9(ebx)`` expression says "in input state, take 32 bits found at -pointer ebx *after* applying 9 possibly aliasing memory writes to the state. -More details in mapper_. - - ------ - -Lets try a (little) more elaborated analysis that will not only allow to -build a list of basic blocks but will also help us discover (parts of) -the control flow graph of the program: - -.. sourcecode:: python2 - - >>> ff = amoco.fforward(p) - >>> ff.policy - {'depth-first': True, 'branch-lazy': True} - >>> ff.policy['branch-lazy']=False - >>> ff.getcfg() - amoco.cas.expressions: INFO: stub __libc_start_main called - amoco.main: INFO: fforward analysis stopped at block 0x8048370 - - >>> G=_ - >>> G.C - [] - -Here we use the **fast-forward** analysis (see below) and set its "branch-lazy" policy -to ``False`` to avoid falling back to linear sweep when analysis of branch fails. -Interestingly, we can see that the PLT jump to ``__libc_start_main`` external function -has been followed thanks to a ``@stub`` defined for this external (see ``system/linux_x86.py``). - -Let's have a look at the graph instance: - -.. sourcecode:: python2 - - >>> print G.C[0].sV - 0.| - 1.| - >>> print G.C[0].sE - 0.| 0x8048370] at 0x8db742c> - >>> G.get_by_name('0x8048370') - - >>> n=_ - >>> print n.data - # --- block 0x8048370 --- - 0x8048370 'ff250ca00408' jmp [@__libc_start_main] - >>> print n.data.map - eip <- { | [0:32]->M32(esp+4) | } - esp <- { | [0:32]->(esp-0x4) | } - (esp-4) <- @exit - -Ok, so the program counter is correctly pointing to the ``#main`` address located -at offset +4 in the stack, but since the fast-forward method only look at one block, -it cannot know that this location holds this address. - -A little more elaborated analysis like **link-forward** would have started analysing -``#main``: - -.. 
sourcecode:: python2 - - >>> lf = amoco.lforward(p) - >>> lf.getcfg() - amoco.cas.expressions: INFO: stub __libc_start_main called - amoco.main: INFO: lforward analysis stopped at block 0x80484d4 - - >>> G=_ - >>> print G.C - [, - , - , - ] - >>> for g in G.C: - ... print g.sV - ... print '------' - ... - 0.| - 1.| - 2.| - ------ - 0.| - ------ - 0.| - 1.| - ------ - 0.| - ------ - >>> print G.get_by_name('0x8048434').data - # --- block 0x8048434 --- - 0x8048434 '55' push ebp - 0x8048435 '89e5' mov ebp,esp - 0x8048437 '83ec38' sub esp,0x38 - 0x804843a '8b4508' mov eax,[ebp+8] - 0x804843d '83c001' add eax,0x1 - 0x8048440 '8945f4' mov [ebp-12],eax - 0x8048443 '8b45f4' mov eax,[ebp-12] - 0x8048446 'a320a00408' mov [#global_var],eax - 0x804844b 'c744240403000000' mov [esp+4],0x3 - 0x8048453 '8b45f4' mov eax,[ebp-12] - 0x8048456 '890424' mov [esp],eax - 0x8048459 'e825000000' call *#fct_b - >>> print G.get_by_name('0x8048483').data - # --- block 0x8048483 --- - 0x8048483 '55' push ebp - 0x8048484 '89e5' mov ebp,esp - 0x8048486 '8b450c' mov eax,[ebp+12] - 0x8048489 '8b5508' mov edx,[ebp+8] - 0x804848c '01d0' add eax,edx - 0x804848e '5d' pop ebp - 0x804848f 'c3' ret - - -The **fast-backward** is another analysis that tries to evaluate the expression of -the program counter backwardly and thus reconstructs function frames in simple cases. - -.. 
sourcecode:: python2 - - >>> amoco.Log.loggers['amoco.main'].setLevel(15) - >>> z = amoco.fbackward(p) - >>> z.getcfg() - amoco.main: VERBOSE: root node 0x8048380 added - amoco.main: VERBOSE: block #PLT@__libc_start_main starts a new cfg component - amoco.cas.expressions: INFO: stub __libc_start_main called - amoco.main: VERBOSE: function f:#PLT@__libc_start_main{2} created - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: block 0x8048434 starts a new cfg component - amoco.main: VERBOSE: block 0x8048483 starts a new cfg component - amoco.main: VERBOSE: function fct_b:0x8048483{1} created - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: block 0x80484d4 starts a new cfg component - amoco.main: VERBOSE: function fct_e:0x80484d4{1} created - amoco.main: VERBOSE: pc is memory aliased in fct_e:0x80484d4{1} (assume_no_aliasing) - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: function fct_a:0x8048434{5} created - amoco.main: VERBOSE: pc is memory aliased in fct_a:0x8048434{5} (assume_no_aliasing) - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: function fct_b:0x8048483{1} called - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: block 0x8048490 starts a new cfg component - amoco.main: VERBOSE: block 0x80484ab starts a new cfg component - amoco.main: VERBOSE: block #PLT@malloc starts a new cfg component - amoco.cas.expressions: INFO: stub malloc called - amoco.main: VERBOSE: function f:#PLT@malloc{2} created - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: function fct_d:0x80484ab{3} created - amoco.main: VERBOSE: pc is memory aliased in fct_d:0x80484ab{3} (assume_no_aliasing) - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: function fct_c:0x8048490{3} created - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: edge -?-> added - amoco.main: VERBOSE: block #PLT@__stack_chk_fail starts a new cfg component - amoco.cas.expressions: INFO: stub __stack_chk_fail called - amoco.main: 
VERBOSE: function f:#PLT@__stack_chk_fail{2} created - amoco.main: VERBOSE: edge ---> added - amoco.main: VERBOSE: function f:0x8048380{12} created - amoco.main: VERBOSE: pc is memory aliased in f:0x8048380{12} (assume_no_aliasing) - amoco.main: INFO: fbackward analysis stopped at - amoco.main: VERBOSE: edge -?-> added - - >>> - -.. ** - -API Overview -============ - -Amoco is composed of 3 packages arch_, cas_ and system_, on top of which the -classes implemented in ``code.py``, ``cfg.py`` and ``main.py`` provide high-level -abstractions of basic blocks, functions, control flow graphs and -disassembling/analysis techniques. - -We will now describe this architecture starting from low-level layers (arch_, cas_) -up to system_ and finally to higher level classes. - -A *Sphinx* generated doc will be available soon. - - -arch ----- - -Supported CPU architectures are implemented in this package as subpackages and all -use the ``arch/core.py`` generic classes. The interface to a CPU used by -system_ classes is generally provided by a ``cpu_XXX.py`` module in the CPU subpackage. -This module shall: - -- provide the CPU *environment* (registers and other internals) -- provide an instance of ``core.disassembler`` class, which requires to: - - + define the ``@ispec`` of every instruction for the generic decoder, - + and define the *semantics* of every instruction with cas_ expressions. - -- optionnally define the output assembly format, and the *GNU as* (or any other) - assembly parser. - -A simple example is provided by the ``arch/arm/v8`` architecture which provides -a model of ARM AArch64: -The interface module is ``arch/arm/cpu_armv8.py``, which imports everything from -the v8 subpackage. - -instruction specifications -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``v8/spec_armv8.py`` module implements all decoding specifications thanks -to an original decorating mechanism. For example, the EXTR instruction encoding -is defined like this: - -.. 
sourcecode:: python2 - - @ispec("32[ sf 0 0 100111 N 0 Rm(5) imms(6) Rn(5) Rd(5) ]",mnemonic="EXTR") - def A64_EXTR(obj,sf,N,Rm,imms,Rn,Rd): - if sf!=N: raise InstructionError(obj) - if sf==0 and imms>31: raise InstructionError(obj) - obj.datasize = 64 if (sf==1) else 32 - regs = env.Xregs if sf==1 else env.Wregs - obj.d = sp2z(regs[Rd]) - obj.n = sp2z(regs[Rn]) - obj.m = sp2z(regs[Rm]) - obj.lsb = env.cst(imms,6) - obj.operands = [obj.d,obj.n,obj.m,obj.lsb] - obj.type = type_data_processing - - -The ``@ispec(...)`` decorator indicates that whenever the decoder buffer is filled -with 32 bits that matches a given pattern, the decorated function is called with -first argument being a ``arch.core.instruction`` instance with ``mnemonic`` attribute -set to EXTR, and other arguments being extracted from corresponding bitfields. -The function itself is responsible for filling the instruction instance with useful -other attributes like operands, type, etc. -If you look at page 480 of armv8_, you will likely feel at home... - -The same is true for ``x86/spec_ia32.py`` and the Intel manuals, for example -the CMOVcc instruction(s) specification is: - -.. sourcecode:: python2 - - # conditionals: - @ispec_ia32("*>[ {0f} cc(4) 0010 /r ]", mnemonic = "CMOVcc") # 0f 4x /r - def ia32_CMOVcc(obj,cc,Mod,RM,REG,data): - obj.cond = CONDITION_CODES[cc] - op2,data = getModRM(obj,Mod,RM,data) - op1 = env.getreg(REG,op2.size) - obj.operands = [op1, op2] - obj.type = type_data_processing - -.. ** - -A detailed description of the ispec decorator class pattern format is provided in -``arch/core.py``. Since implementing these specifications from CPUs docs -is always error-prone, Amoco will check several things for you: - -- the size of the ispec format (the "pattern" to match) is consistent with its declared length (if not \*). -- the prototype of the decorated function match the identifiers in the ispec format (count and names must match). 
-- the ispec format is unique: the fixed part of the pattern does not exist in any other ispec instance. - -Internally, the decoder will collect all ispec instances declared within the module. -The ``core.disassembler`` setup will later organize the list in a tree based on fixed patterns of each ispec. -Note that identifying *holes* of the architecture's encoding scheme becomes relatively simple once this tree -is built. -Architectures with multiple (disjoint) instructions sets (think armv7/thumb) is supported by instanciating -the core disassembler with respective specs modules and with the function that decides how to switch -from one set to the other. - -instruction semantics -~~~~~~~~~~~~~~~~~~~~~ - -The semantics of instructions are defined separately from their decoder specs, -generally in a ``asm.py`` module. An ``instruction`` instance with mnemonic *XXX* -will find its semantics definition by looking for a function ``i_XXX(i,fmap): ...``. - -For example (in ``arch/x86/asm.py``): - -.. sourcecode:: python2 - - def i_CMOVcc(i,fmap): - fmap[eip] = fmap(eip)+i.length - op1 = i.operands[0] - op2 = i.operands[1] - fmap[op1] = fmap(tst(i.cond[1],op2,op1)) - -The function takes as input the instruction instance *i* and a ``mapper`` -instance *fmap* (see cas_) and implements (an approximation of) the opcode semantics. - -instruction formats -~~~~~~~~~~~~~~~~~~~ - -How an instruction object is printed is also defined separately to allow various -outputs. A ``Formatter`` instance can be associated to the core instruction class -to handle "pretty printing", including aliases of instructions. - -Basically, a ``Formatter`` object is created from a dict associating a key with a list -of functions or format string. The key is either one of the mnemonics or possibly -the name of a ispec-decorated function (this allows to group formatting styles -rather than having to declare formats for every possible mnemonic.) 
-When the instruction is printed, the formatting list elements are "called" and -concatenated to produce the output string. - -An example follows from ``arch/x86/formats.py``: - -.. sourcecode:: python2 - - def mnemo(i): - mnemo = i.mnemonic.replace('cc','') - if hasattr(i,'cond'): mnemo += i.cond[0].split('/')[0] - return '{: <12}'.format(mnemo.lower()) - - def opsize(i): - s = [op.size for op in i.operands if op._is_mem] - if len(s)==0: return '' - m = max(s) - return {8:'byte ptr ',16:'word ptr ',32:''}[m] - - ... - format_intel_ptr = (mnemo,opsize,opers) - ... - IA32_Intel_formats = { - .... - 'ia32_mov_adr' : format_intel_ptr, - 'ia32_ptr_ib' : format_intel_ptr, - ... - } - -The formatter is also used to take care of aliasing instructions like for example -in the arm architectures where the *ANDS* instruction is replaced by *TST* when -the destination register is X0/W0 : - -.. sourcecode:: python2 - - def alias_AND(i): - m = mnemo(i) - r = regs(i) - if i.setflags and i.d==0: - m = 'tst' - r.pop(0) - return m.ljust(12) + ', '.join(r) - - -cas ---- - -The *computer algebra system* of Amoco is built with the following elements implemented -in ``cas/expressions.py``: - -- Constant ``cst``, which represents immediate (signed or unsigned) value of fixed size (bitvector), -- Symbol ``sym``, a Constant equipped with a reference string (non-external symbol), -- Register ``reg``, a fixed size CPU register **location**, -- External ``ext``, a reference to an external location (external symbol), -- Floats ``cfp``, constant (fixed size) floating-point values, -- Composite ``comp``, a bitvector composed of several elements, -- Pointer ``ptr``, a memory **location** in a segment, with possible displacement, -- Memory ``mem``, a Pointer to represent a value of fixed size in memory, -- Slice ``slc``, a bitvector slice of any element, -- Test ``tst``, a conditional expression, (see Tests_ below.) 
-- Operator ``uop``, an unary operator expression, -- Operator ``op``, a binary operator expression. The list of supported operations is - not fixed althrough several predefined operators allow to build expressions directly from - Python expressions: say, you don't need to write ``op('+',x,y)``, but can write ``x+y``. - Supported operators are: - - + ``+``, ``-``, ``*`` (multiply low), ``**`` (multiply extended), ``/`` - + ``&``, ``|``, ``^``, ``~`` - + ``==``, ``!=``, ``<=``, ``>=``, ``<``, ``>`` - + ``>>``, ``<<``, ``//`` (arithmetic shift right), ``>>>`` and ``<<<`` (rotations). - - See Operators_ for more details. - -All elements inherit from the ``exp`` class which defines all default methods/properties. -Common attributes and methods for all elements are: - -- ``size``, a Python integer representing the size in bits, -- ``sf``, the True/False *sign-flag*. -- ``length`` (size/8) -- ``mask`` (1<>> from amoco.cas.expressions import * - >>> c = cst(253,8) - >>> print c - 0xfd - >>> c.sf - False - >>> c.sf=True - >>> print c - -0x3 - >>> print c.value, type(c.value) - -3 - >>> print c.v, c.mask, c.size - 253 255 8 - >>> c.zeroextend(16) - - >>> c2 = _ - >>> print c2.sf, c2 - False 0xfd - >>> assert c2.bytes(1,2)==0 - >>> e = c2+c.signextend(16)+5 - >>> print e - 0xff - >>> c3 = e[0:8] - >>> print c3==cst(-1,8) - 0x1 - -Here, after declaring an 8-bit constant with value 253, we can see that by default the -associated ``cst`` object is unsigned. The internal storage is always the unsigned -representation of the value. If we set its ``sf`` *sign-flag* attribute to True, -the ``value`` property will return a signed Python integer. -If the constant is inited from a negative integer, the resulting object's *sign-flag* is set to True. -If a constant is *signextended* its *sign-flag* is set automatically, unset if *zeroextended*. -Basically, during interpretation, the flag is set or unset depending on how the expression is -used by the instructions. 
Logical operators tend to unset it, explicit sign-relevant instructions -need to set it. - -The ``cst`` class is special because it is the only class that can be used as a -Python boolean type: - -.. sourcecode:: python2 - - >>> e==0xff - - >>> t=_ - >>> print t - 0x1 - >>> if t==True: print 'OK' - ... - OK - >>> t.size - 1 - -In above examples, the ``==`` Python operator is used. The return value is not a Python -True/False value but as expected a new expression object. Since the operation here involves -only constants, the result need not be an ``op`` element but can be readily simplified to -a 1-bit constant with value 0 or 1. -In Amoco, the **only** expression that evaluates to True is ``cst(1,1)``. - -Expressions of type ``sym`` are constants equipped with a symbol string for printing purpose only: - -.. sourcecode:: python2 - - >>> s = sym('Hubble',42,8) - >>> print s - #Hubble - >>> s.value - 42 - >>> print s+1 - 0x2b - -(Note that as seen above, usage of a ``sym`` object in another expression will obviously -forget the symbol string in the resulting expression.) - -Registers -~~~~~~~~~ - -Expressions of class ``reg`` are pure symbolic values. -They are essentially used for representing the registers of a CPU, as "right-values" -or left-values (locations). More details on *locations* in mapper_. - -.. sourcecode:: python2 - - >>> a = reg('%a',32) - >>> print a - %a - >>> e = 2+a - >>> print e - (%a+0x2) - >>> x = e-2 - >>> print x - (%a-0x0) - >>> x.simplify() - - >>> print _ - %a - -As shown above, elementary simplification rules are applied such that ``(2+a)-2`` -leads to an ``op`` expression with operator ``-``, right member 0 and left member ``r1``, -which eventually also simplifies further to the r1 register. -Most real simplification rules should rely on SMT solvers like z3_ (see smt_). 
- -Externals -~~~~~~~~~ - -Class ``ext`` inherit from registers as pure symbolic values -but is used to represent external symbols that are equipped with a ``stub`` function. -When "called", these objects can invoke their stub function in two ways: - -- when the program counter is an ``ext`` expression, - the object invokes its __call__ method to modify the provided mapper by calling the - registered *stub* with the mapper and possibly other needed parameters. -- when used to simulate actions of *interruptions* like for example - in the semantics of ``IN/OUT`` or ``INT`` instructions which invoke the object's ``call`` - method to eventually return an expression. - -(More details on ``@stub`` decorated functions are provided in system_.) - -Pointers and Memory objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A ``ptr`` object is a memory **location**. These objects are generally not found -in expressions but only as mapper_ locations or addresses in ``mem`` objects. -They have a ``base`` expression, a ``disp`` integer offset, -and an optional ``seg`` attribute to be used by MemoryZone_ objects. - -As illustrated below, simplification of ``ptr`` objects tends to extract constant -offsets found in the base expression to adjust the ``disp`` field. - -.. sourcecode:: python2 - - >>> a = reg('a',32) - >>> p = ptr(a) - >>> q = ptr(a,disp=17) - >>> print p,q - (a) (a+17) - >>> assert p+17 == q - >>> assert p+2 == q-15 - >>> assert (p+3).base == (q-5).base - - -A ``mem`` object is a symbolic memory value equipped with a pointer, a size, and -a special ``.mods`` attribute that will be discussed in mapper_. - -.. sourcecode:: python2 - - >>> x = mem(p,64,disp=2) - >>> y = mem(q-5,48,disp=-10) - >>> print x,y - M64(a+2) M48(a+2) - >>> assert x.bytes(4,6) == y[32:48] - - -Note: the segment attribute is currently not used by the core memory classes. - - -Operators -~~~~~~~~~ - -Unary operators (``+``, ``-`` and ``~``) have elementary simplification rules: - -.. 
sourcecode:: python2 - - >>> a = reg('a',32) - >>> assert +a == -(-a) - >>> assert -a == 0-a - -Most operations in Amoco involve left and right members sub-expressions. The operation -will then usually proceed only if both member have the same size. If one member is not -an expression but a Python integer, it will be implicitly "casted" to a constant of size -required by the other expression member. Thus, it is possible to write ``r1+2`` and not -``r1+cst(2,32)``. - -Binary operations have elementary simplification rules that try to arrange symbols -in lexical order and move constants to the right side of the expression. - -.. sourcecode:: python2 - - >>> a = reg('a',32) - >>> b = reg('b',32) - >>> print a+0, a*1, a^a, a*0, a&0, a|0 - a a 0x0 0x0 0x0 a - >>> print (b-a)|0 - ((-a)+b) - >>> assert b-a == (-a)+b - >>> assert -(a+b) == (-a)-b - >>> assert -(a-b) == b-a - >>> assert -(b-a) == (a-b)*1 - >>> assert -(1-a) == a-1 - >>> assert (-a+(b-1)) == b-a-1 - >>> e = -((b-1)-a) - >>> assert e == 1+(a-b) - >>> print e - ((a-b)+0x1) - >>> extract_offset(e) - (, 1) - >>> print _[0] - (a-b) - -Internal attributes and methods of ``op`` instances are: - -- ``.op``, the operator symbol (``.op.symbol``) and function (``.op.impl``), -- ``.r``, the left member sub-expression, -- ``.l``, the right member sub-expression of binary ops. -- ``.prop``, an or-ed flag indicating the kind of operators involved: - - + 1 means only arithmetic, - + 2 means only logic, - + 4 means only conditional, - + 8 means only shifts and rotations, - -- ``depth()`` returns the expression tree depth, -- ``limit(value)`` is a class method used to set a threshold parameter involved - in simplifying the expression to ``top`` when the expression's complexity is too high. - -The ``symbols_of(e)`` function returns the list of registers expressions involved in ``e``. -The ``locations_of(e)`` function returns the list of *locations* used in ``e``. 
-The ``complexity(e)`` function computes an arbitrary complexity measure of expression ``e`` -which is linear in depth and number of symbols, and increases by a factor of ``prop``. - -Composer and Slicer -~~~~~~~~~~~~~~~~~~~ - -A ``comp`` object is a composite expression corresponding to a bit-vector made of -several expression parts. -A ``slc`` object is the expression obtained by extracting a bit-vector slice out -of an expression. - -The ``composer(parts)`` function, which takes as input the parts as a list of expressions in -least-to-most significant order, is the preferred method for instanciating composite objects. -Since ``comp`` is essentially a container class for other expressions, the resulting object -is possibly of another class if some simplification occured. - -.. sourcecode:: python2 - - >>> composer([cst(1,8),cst(2,8),cst(3,8)]) - - >>> c=_ - >>> assert c == 0x030201 - >>> a = reg('a',32) - >>> b = reg('b',32) - >>> c = comp(24) - >>> c[0:8] = (a+b)[24:32] - >>> c[8:24] = b[0:16] - >>> print c - { | [0:8]->(a+b)[24:32] | [8:24]->b[0:16] | } - >>> c[8:16] = cst(0xff,8) - >>> print c - { | [0:8]->(a+b)[24:32] | [8:16]->0xff | [16:24]->b[8:16] | } - >>> c[0:8] = cst(0x01,8) - >>> print c - { | [0:8]->0x1 | [8:16]->0xff | [16:24]->b[8:16] | } - >>> print c.simplify() - { | [0:16]->0xff01 | [16:24]->b[8:16] | } - -As shown above, a composite instance supports dynamic asignment of any parts defined by a python -slice object. Simplification of composite objects tends to merge contiguous constant parts. - -A ``slc`` expression is obtained by using a python slice object of the form [start:stop] -where start/stop are non-negative integers in the bit range of the sliced expression. -Simplification occurs when the sliced expression is itself of class ``slc`` or ``mem``: - -.. 
sourcecode:: python2 - - >>> a = reg('%a',32) - >>> ah = slc(a,24,8,ref='%ah') - >>> assert ah.x == a - >>> print ah.pos - 24 - >>> print ah - %ah - >>> ax = a[16:32] - >>> print ax - %a[16:32] - >>> print ax[0:8] - %a[16:24] - >>> print ax[8:16] - ah - >>> y = mem(a,64) - >>> print y[16:48] - M32(%a+2) - -Note that, as shown above, slices of registers can be instanciated with an optional -reference string that is used for printing whenever the matching register slice is involved. - -Note also that parts and slices [start:stop] bounds are limited to python integers only -(indices can't be symbolic!) - - -Conditionals -~~~~~~~~~~~~ - -The ``tst`` class is used for conditional expressions in the form ``tst(cond, eT, eF)`` -where ``cond`` is an expression, ``eT`` is the resulting expression whenever -``cond==1`` and ``eF`` is the resulting expression whenever ``cond==0``. - -.. sourcecode:: python2 - - >>> t = tst(a>0, c, cst(0xdeadbe,24)) - >>> print t - ((%a>0x0) ? { | [0:16]->0xff01 | [16:24]->b[8:16] | } : 0xdeadbe) - >>> t.l[16:24] = cst(0xab,8) - >>> print t.simplify() - ((%a>0x0) ? 0xabff01 : 0xdeadbe) - >>> t.tst.l = cst(-1,32) - >>> print t - ((-0x1>0x0) ? 0xabff01 : 0xdeadbe) - >>> print t.simplify() - 0xdeadbe - - -mapper -~~~~~~ - -A ``mapper`` object captures the symbolic operations of a sequence of instructions by -mapping input expressions to output *locations* which are registers or pointers. -It represents the transition function from an input state to an output state corresponding -to the execution of the captured instructions. -As shown in the ``i_MOVcc`` example above, the ``fmap`` argument of every instruction semantics -is a mapper on which the instruction currently operates (see asm_). - -.. 
sourcecode:: python2 - - >>> from amoco.arch.x86.env import * - >>> from amoco.cas.mapper import mapper - >>> m = mapper() - >>> m[eax] = cst(0xabff01,32) - >>> print m - eax <- { | [0:32]->0xabff01 | } - >>> print m(eax) - 0xabff01 - >>> print m(ah) - 0xff - >>> m[eax[16:32]] = bx - >>> print m - eax <- { | [0:16]->0xff01 | [16:32]->bx | } - >>> print m(ax+cx) - (cx+0xff01) - >>> print m(eax[16:32]^ecx[16:32]) - (bx^ecx[16:32]) - >>> print m(mem(ecx+2,8)) - M8(ecx+2) - >>> print m(mem(eax+2,8)) - M8({ | [0:16]->0xff01 | [16:32]->bx | }+2) - -The mapper class defines two essential methods to set and get expressions in and out. - -- ``__setitem__`` is used for mapping any expression to a location which can be a register - (or a register slice), a pointer or a memory expression. When the location is a pointer, - the base expression refers to input state values, whereas a memory expression refers to - the output state (see example below). -- ``__call__`` is used for evaluating any expression in the mapper, by replacing every - register and memory object of the expression by their mapped expressions. - -A *push* instruction could thus be implemented using: - -.. sourcecode:: python2 - - >>> def push(fmap,x): - ... fmap[esp] = fmap(esp)-x.length - ... fmap[mem(esp,x.size)] = x # put x at the current (updated) esp address - ... - >>> m.clear() - >>> push(m, cst(0x41414141,32)) - >>> print m - esp <- { | [0:32]->(esp-0x4) | } - (esp-4) <- 0x41414141 - >>> push(m, ebx) - >>> print m - (esp-4) <- 0x41414141 - esp <- { | [0:32]->(esp-0x8) | } - (esp-8) <- ebx - -Note that a ``__getitem__`` method is implemented as well in order to fetch items -that are locations of the mapper. So here, to get the value at the top of stack, we -can do: - -.. sourcecode:: python2 - - >>> print m[mem(esp-8,32)] # fetch the expression associated with ptr (esp-8) - ebx - >>> print m(mem(esp,32)) # evaluates mem(esp,32) => first evaluate ptr, then fetch. 
- ebx - >>> print m(mem(esp+4,32)) - 0x41414141 - >>> print m[mem(esp-4,32)] - 0x41414141 - -The internal memory model of a mapper is a MemoryMap_: symbolic memory locations are related -to individual separated MemoryZone_ objects that deal with all read/write to/from location's -``ptr.base`` expression. - -.. sourcecode:: python2 - - >>> print m.memory() - - - > - -This model allows to access offsets that have not been explicitly written to before. -For example, if we now execute *mov ecx, [esp+2]* we still fetch the correct expression: - -.. sourcecode:: python2 - - >>> m[ecx] = m(mem(esp+2,32)) - >>> print m(ecx) - { | [0:16]->ebx[16:32] | [16:32]->0x4141 | } - -However, aliasing between zones is possible a must be avoided: imagine that we now -execute *mov byte ptr [eax], 0x42*, we obtain: - -.. sourcecode:: python2 - - >>> m[mem(eax,8)] = cst(0x42,8) - >>> print m - (esp-4) <- 0x41414141 - esp <- { | [0:32]->(esp-0x8) | } - (esp-8) <- ebx - ecx <- { | [0:16]->ebx[16:32] | [16:32]->0x4141 | } - (eax) <- 0x42 - >>> print m.memory() - - > - - > - -If we now again fetch memory at ``esp+2`` the previous answer is not valid anymore due -to a possible aliasing (overlapping) of ``eax`` and ``esp`` zones. Think of what should -the memory look like if ``eax`` value was ``esp-4`` for example. Let's try: - -.. sourcecode:: python2 - - >>> print m(mem(esp+2,32)) - M32$3(esp-6) - >>> mprev = mapper() - >>> mprev[eax] = esp-4 - >>> print mprev( m(mem(esp+2,32)) ) - { | [0:16]->ebx[16:32] | [16:32]->0x4142 | } - -Indeed, the mapper returns a special memory expression that embeds modifications -(saved in ``.mods`` of the mem expression) that have been applied on its memory until now, -and that must be executed in order to return a correct answer. As demonstrated above, -these mods are taken into account whenever the expression is evaluated in another mapper. - -Note that it is possible to force the mapper class to *assume no aliasing* : - -.. 
sourcecode:: python2 - - >>> print mapper.assume_no_aliasing - False - >>> mapper.assume_no_aliasing = True - >>> print m(mem(esp+2,32)) - { | [0:16]->ebx[16:32] | [16:32]->0x4141 | } - -In Amoco, a mapper instance is created for every basic block. The right -and left shift operators allow for right of left composition so that symbolic -forward or backward execution of several basic blocks is easy: - -.. sourcecode:: python2 - - >>> m1 = mapper() - >>> m1[eax] = ebx - >>> push(m1,eax) - >>> m2 = mapper() - >>> m2[ebx] = cst(0x33,32) - >>> push(m2,ebx) - >>> m2[eax] = m2(mem(esp,32)) - >>> print m1 - eax <- { | [0:32]->ebx | } - esp <- { | [0:32]->(esp-0x4) | } - (esp-4) <- eax - >>> print m2 - ebx <- { | [0:32]->0x33 | } - esp <- { | [0:32]->(esp-0x4) | } - (esp-4) <- ebx - eax <- { | [0:32]->ebx | } - >>> print m1>>m2 # forward execute m1 -> m2 - (esp-4) <- eax - ebx <- { | [0:32]->0x33 | } - esp <- { | [0:32]->(esp-0x8) | } - (esp-8) <- ebx - eax <- { | [0:32]->ebx | } - >>> print m2<0x33 | } - esp <- { | [0:32]->(esp-0x8) | } - (esp-8) <- ebx - eax <- { | [0:32]->ebx | } - -TODO: mapper unions. - -smt -~~~ - -Amoco uses z3_ for constraint solving by translating its equation expressions -into z3_ equivalent objects. The interface with z3_ is implemented in ``cas/smt.py``. - -- ``cst`` expressions are translated as ``BitVecVal`` objects -- ``cfp`` expressions are translated as ``RealVal`` objects -- ``reg`` expressions are translated as ``BitVec`` objects -- ``comp`` expressions use the z3_ ``Concat`` function -- ``slc`` expressions use the z3_ ``Extract`` function -- ``mem`` expressions are converted as Concat of ``Array`` of ``BitVecSort(8)``, - with current endianess taken into account. -- ``tst`` expressions use the z3_ ``If`` function -- operators are translated by propagating translations to left & right sides. 
- -When the ``smt`` module is imported it replaces the ``.to_smtlib()`` method of -every expression class (which by default raises UnImplementedError). - -.. sourcecode:: python2 - - >>> from amoco.arch.x86.env import * - >>> from amoco.cas import smt - >>> z = (eax^cst(0xcafebabe,32))+(ebx+(eax>>2)) - >>> print z - ((eax^0xcafebabe)+(ebx+(eax>>0x2))) - >>> print z.to_smtlib() - (eax ^ 3405691582) + ebx + LShR(eax, 2) - >>> print z.to_smtlib().sexpr() - (bvadd (bvxor eax #xcafebabe) ebx (bvlshr eax #x00000002)) - >>> r = smt.solver([z==cst(0x0,32),al==0xa,ah==0x84]).get_model() - >>> print r - [eax = 33802, ebx = 889299018] - >>> x,y = [r[v].as_long() for v in r] - >>> ((x^0xcafebabe)+(y+(x>>2)))&0xffffffffL - 0L - >>> p = mem(esp,32) - >>> q = mem(esp+2,32) - >>> ql = q[0:16] - >>> ph = p[16:32] - >>> z = (p^cst(0xcafebabe,32))+(q+(p>>2)) - >>> m = smt.solver().get_mapper([z==cst(0,32),esp==0x0804abcd]) - >>> print m - (esp+2) <- 0x7ffc9151 - (esp) <- 0x9151babe - esp <- { | [0:32] -> 0x0804abcd | } - - -In the ``smt`` module, the ``solver`` class is typically used to verify that some -properties hold and find a set of input (concrete) values to be set for example in -an emulator or debugger to reach a chosen branch. A solver instance can be created with -a python list of expressions, or expressions can be added afterward. - -The ``.get_model()`` method will check added contraint equations and return a -z3_ ``ModelRef`` object if the z3_ solver has returned ``z3.sat`` or None otherwise. -A list of equations to be taken into account can be provided as well with ``.add()``. - -The ``.get_mapper()`` method calls ``get_model`` and returns a mapper object with -locations set to their ``cst`` values. A list of equations can be provided here too. - -main.py -------- - -This module contains *high-level* analysis techniques implemented as classes that -take a program abstraction provided by the system_ package. 
- -The first 3 basic techniques are: - -- *linear-sweep* (``lsweep`` class) disassembles instructions without taking - into account any branching instruction. - - Methods exposed by the ``lsweep`` class are: - - * ``sequence(loc=None)``: returns an iterator that will yield disassembled - instructions starting at virtual address *loc* (defaults to entrypoint). - * ``iterblocks(loc=None)``: which returns an iterator that will yield (basic) block_ - of instructions starting at virtual address *loc*. - -- *fast forward* (``fforward``) inherits from ``lsweep`` and adds an algorithm that - tries to build the control-flow graph of the program by following branching - instructions when the program counter is composed essentially of constant - expressions when evaluated within block scope only. - The default policy is to fallback to linear sweep otherwise. - -- *link forward* (``lforward``) inherits from ``fforward`` but uses a strict - follow branch policy to avoid linear sweep and evaluates the program counter - by taking into account the parent block semantics. - -Other more elaborated techniques are: - -- *fast backward* (``fbackward``) inherits from ``lforward`` but evaluates the - program counter backardly by taking *first-parent* block until either the - expression is a constant target or the root node of the graph component (entry of function) - is reached. The analysis proceeds then by evaluating the pc expression in every - caller blocks, assuming that no frame-aliasing occured (pointer arguments did not - mess up with the caller's stack.) A ``func`` instance is created but its mapper - contains by default only the computed pc expression. - -- *link-backward* (``lbackward``) inherits from ``fbackward`` but walks back *all* - parent-paths up to the entry node, composing and assembling all mappers to end up - with an approximated mapper of the entire function. 
- -code.py -------- - -The ``code`` module defines two main classes: - -- a ``block`` contains a list of instructions and computes the associated mapper object. - The arch-dependent CoreExec classes (see system_ below) can add ``tag`` indicators like - ``FUNC_START`` (if block looks like a function entry), ``FUNC_CALL`` if block makes a call, etc. -- a ``func`` contains the cfg graph component of a function once it has been fully - recovered by an analysis class. It inherits from ``block`` and contains a mapper that - captures an approximation of the entire function. - -blocks are created by the ``lsweep.iterblocks()`` iterator (or by ``.get_block()``) which -is inherited by all ``main`` analysis classes discussed above. Functions are created by -``fbackward`` and ``lbackward`` classes only. - -The ``xfunc`` class is used when an external expression is called. It contains a mapper -build by a ``stub`` function. Instances are present in graph nodes but have a zero length -and no address and thus do not exist in memory. - -cfg.py ------- - -Classes ``node``, ``link`` and ``graph`` use *grandalf* Vertex/Edge/Graph with additional -formatters or way to compare instances by name. A node's data is a block instance, and an -edge's data is possibly a set of conditional expressions. A graph connected component is -a function's control-flow graph (a *graph_core* object). -The ``graph.add_vertex`` extends Graph.add_vertex to detect that the node to be added *cuts* -an existing node and adjust the graph structure accordingly. -The ``graph.spool()`` method provides a list of the current leaves in the graph. -The ``graph.get_by_name(name)`` method allows to get a node object by its name. - -system ------- - -The system_ package is the main interface with the binary program. 
It contains executable -format parsers, the memory model, the execution engine, and some operating system -models responsible for mapping the binary in the memory model, setting up the environment -and taking care of system calls. - -The ``loader.py`` module is the frontend that will try to parse the input file and import the -targeted system_ and arch_ modules. If the executable format is unkown or if the input is a -bytecode python string, the binary is mapped at address 0 in a ``RawExec`` instance. - -The ``elf.py`` module implements the ``Elf32`` and ``Elf64`` classes. The ``pe.py`` module -implements the ``PE`` class which handles both PE32 and PE32+ (64-bits). - -The ``core.py`` module implements the memory model classes and the CoreExec_ generic -execution engine inherited by various system's classes like ``linux_x86.ELF``, -``linux_arm.ELF`` or ``win32.PE`` and ``win64.PE``. - -MemoryZone -~~~~~~~~~~ - -The memory model in Amoco is implemented by the MemoryMap class in ``system/core.py``. Instance -of MemoryMap are created by the system's CoreExec classes and by every block's mapper_ objects. -This model associates memory locations with raw bytes or symbolic expressions in separated *zones* -implemented by the MemoryZone_ class. -Each zone is associated with a symbolic location reference, the default ``None`` reference zone -being used for concrete (cst) locations. -In a MemoryZone_, an *address* is an integer offset to the reference location expression, and -the associated *value* is a ``mo`` memory object that stores bytes or an expression wrapped in -a ``datadiv`` object. - -CoreExec -~~~~~~~~ - -The execution engine core class is the users's frontend to the binary. It is responsible for -creating a MemoryMap with the binary image, reading data in memory, or reading instructions -at some address by calling ``cpu.disassemble()``. - -stubs -~~~~~ - -System calls and externals are emulated by implementing ``stubs`` that modify a mapper instance. 
A *stub* -is a Python function decorated with ``@stub``. For example, for example in -the *Linux* system (see ``linux_x86.py``), the *__libc_start_main* is approximated by: - -.. sourcecode:: python2 - - @stub - def __libc_start_main(m,**kargs): - m[cpu.eip] = m(cpu.mem(cpu.esp+4,32)) - cpu.push(m,cpu.ext('exit',size=32)) - -The default stub performs only a ``ret``-like instruction. - Licence ======= @@ -1338,6 +71,19 @@ Please see `LICENSE`_. Changelog ========= +- `v2.4.6`_ + + * add sphinx documentation (rst files and docstrings) + * add functions method for main classes + * improve ELF pretty printing + * changed db module to use sqlalchemy rather than zodb + * make all objects pickable (with highest protocol) + * add new x86 & x64 formatters + * fix many x64 specs and semantics + * some performance improvements + * improve simplify mem(vec) and slc(vec) + * fix slc.simplify for '**' operator + - `v2.4.5`_ * add x86/x64 internals 'mode' selector @@ -1464,8 +210,9 @@ Changelog .. _armv8: http://www.cs.utexas.edu/~peterson/arm/DDI0487A_a_armv8_arm_errata.pdf .. _pyparsing: http://pyparsing.wikispaces.com/ .. _ply: http://www.dabeaz.com/ply/ -.. _zodb: http://www.zodb.org +.. _sqlalchemy: http://www.sqlalchemy.org .. _LICENSE: https://github.com/bdcht/amoco/blob/release/LICENSE +.. _v2.4.6: https://github.com/bdcht/amoco/releases/tag/v2.4.6 .. _v2.4.5: https://github.com/bdcht/amoco/releases/tag/v2.4.5 .. _v2.4.4: https://github.com/bdcht/amoco/releases/tag/v2.4.4 .. _v2.4.3: https://github.com/bdcht/amoco/releases/tag/v2.4.3 diff --git a/amoco/arch/core.py b/amoco/arch/core.py index 21227b6..f89109f 100644 --- a/amoco/arch/core.py +++ b/amoco/arch/core.py @@ -1,5 +1,21 @@ #!/usr/bin/env python +""" +arch/core.py +============ + +The architecture's core module implements essential classes +for the definition of new cpu architectures: + +- the :class:`instruction` class models cpu instructions decoded by the disassembler. 
+- the :class:`disassembler` class implements the instruction decoding logic based \ + on provided specifications. +- the :class:`ispec` class is a function decorator used to define the \ + specification of an instruction. +- the :class:`Formatter` class is used for instruction pretty printing + +""" + # This code is part of Amoco # Copyright (C) 2006-2014 Axel Tillequin (bdcht3@gmail.com) # published under GPLv2 license @@ -44,7 +60,7 @@ def __init__(self,istr=''): self.operands = [] # we add a misc defaultdict container. # see x86 specs for example of misc usage. - self.misc = defaultdict(lambda: None) + self.misc = defaultdict(_core_misc_default) @classmethod def set_uarch(cls,uarch): @@ -76,6 +92,11 @@ def length(self): # ----------------- class instruction(icore): + """The generic instruction class allows defining instructions for any cpu + instruction set and provides a common API for all arch-independent methods. + It extends the :class:`icore` with an :attr:`address` attribute and formatter + methods.
+ """ def __init__(self,istr): icore.__init__(self,istr) @@ -108,27 +129,6 @@ def __str__(self): def toks(self): return self.formatter(i=self,toks=True) - def __getstate__(self): - return (self.bytes, - self.type, - self.spec, - self.address, - self.mnemonic, - self.operands, - self.formatter, - dict(self.misc)) - - def __setstate__(self,state): - b,t,s,a,m,o,f,D = state - self.bytes = b - self.type = t - self.spec = s - self.address = a - self.mnemonic = m - self.operands = o - self.formatter = f - self.misc = defaultdict(lambda: None) - self.misc.update(D.iteritems()) class InstructionError(Exception): def __init__(self,i): @@ -138,13 +138,31 @@ def __str__(self): class DecodeError(Exception): pass +def _core_misc_default(): + return None + # disassembler core class # ------------------------ + class disassembler(object): + """The generic disassembler class will decode a byte string based on provided + sets of instructions specifications and various parameters like endianess and + ways to select the appropriate instruction set. + + Arguments: + + specmodules: list of python modules containing ispec decorated funcs + iset: lambda used to select module (ispec list) + endian: instruction fetch endianess (1: little, -1: big) + + Attributes: + + maxlen: the length of the longest instruction found in provided specmodules. + iset: the lambda used to select the right specifications for decoding + endian: the lambda used to define endianess. + specs: the *tree* of :class:`ispec` objects that defines the cpu architecture. 
+ """ - # specmodules: list of python modules containing ispec decorated funcs - # iset: lambda used to select module (ispec list) - # endian: instruction fetch endianess (1: little, -1: big) def __init__(self,specmodules,iset=(lambda *args,**kargs:0),endian=(lambda *args, **kargs:1)): self.maxlen = max((s.mask.size/8 for s in sum((m.ISPECS for m in specmodules),[]))) self.iset = iset @@ -155,12 +173,13 @@ def __init__(self,specmodules,iset=(lambda *args,**kargs:0),endian=(lambda *args # so we keep an __i instruction for decoding until a non prefix ispec is used. self.__i = None - # setup will (recursively) organize the provided ispecs list into an optimal tree so that - # __call__ can efficiently find the matching ispec format for a given bytestring - # (we don't want to search until a match, so we need to separate formats as much - # as possible). The output tree is (f,l) where f is the submask to check at this level - # and l is a defaultdict such that l[x] is the subtree of formats for which submask is x. def setup(self,ispecs): + """setup will (recursively) organize the provided ispecs list into an optimal tree so that + __call__ can efficiently find the matching ispec format for a given bytestring + (we don't want to search until a match, so we need to separate formats as much + as possible). The output tree is (f,l) where f is the submask to check at this level + and l is a defaultdict such that l[x] is the subtree of formats for which submask is x. + """ # sort ispecs from high constrained to low constrained: ispecs.sort(lambda x,y: cmp(x.mask.hw(),y.mask.hw()), reverse=True) if len(ispecs)<2: return (0,ispecs) @@ -208,84 +227,110 @@ def __call__(self,bytestring,**kargs): self.__i = None return None -# ispec (parametrable) decorator -# ----------------------------------------- -# @ispec allows to easily define instruction decoders based on architectures specifications. 
-# The 'spec' argument is a human-friendly string that describes how the ispec object will -# (on request) decode a given bytestring and how it will expose various decoded entities to -# the decorated function in order to define an instruction instance. -# It uses the following syntax : -# -# 'LEN<[ FORMAT ]' : LEN indicates the bit length corresponding to the FORMAT. Here, -# FORMAT is interpreted as a list of directives ordered -# from MSB (bit index LEN-1) to LSB (bit index 0). This is the default -# direction if the '<' indicator is missing. LEN%8!=0 is unsupported. -# or -# 'LEN>[ FORMAT ]' : here FORMAT is ordered from LS bit to MS bit. -# if LEN is '*', the FORMAT is of variable length, which removes checks and allows to -# use a variable length directive at the end of the FORMAT. -# -# possibly terminated with an optional '+' char to indicate that the spec is a prefix. -# In this case, the bytestring prefix matching the ispec format is stacked temporarily -# until the rest of the bytestring matches a non prefix ispec. -# -# The directives composing the FORMAT string are used to associate symbols to bits -# located at dedicated offsets within the bitstring to be decoded. A directive has the -# following syntax: -# -# '-' (indicates that current bit position within FORMAT is not decoded) -# '0' (indicates that current bit position within FORMAT must be 0) -# '1' (indicates that current bit position within FORMAT must be 1) -# or -# 'type SYMBOL location' -# where: -# type is an optional modifier char with possible values: -# '.' indicates that the symbol will be an attribute of the instruction instance. -# '~' indicates that the decoded value will be returned as a Bits instance. -# '#' indicates that the decoded value will be returned as a string of 0/1 chars. -# '=' indicates that decoding should END at current position (overlapping) -# if not present, the symbol will be passed as keyword argument to the function with -# value decoded as an integer. 
-# -# SYMBOL: is a mandatory string matching regex [A-Za-z_][0-9A-Za-z_]* -# -# location: is an optional string matching the following expressions -# '( len )' : indicates that the value is decoded from the next len bits starting -# from the current position of the directive within the FORMAT string. -# '(*)' : indicates a 'variable length directive' for which the value is decoded -# from the current position with all remaining bits in the FORMAT. -# If the FORMAT LEN is also variable then all remaining bits from the -# instruction buffer input string are used. -# if ommitted, default location is '(1)'. -# -# The special directive {byte} is a shortcut for 8 fixed bits. For example -# 8>[{2f}] is equivalent to 8>[ 1111 0100 ], or 8<[ 0010 1111 ]. -# -# Example: -# -# @ispec(32[ .cond(4) 101 1 imm24(24) ]", mnemonic="BL", _flag=True) -# def f(obj,imm24,_flag): -# [...] -# -# This statement creates an ispec object with hook f, and registers this object automatically -# in a SPECS list object within the module where the statement is found. -# Upon calling the decode method of this ispec object with a provided bytestring: -# => will proceed with decoding ONLY if bits 27,26,25,24 are 1,0,1,1 or raise exception -# => will instanciate an instruction object (obj) -# => will decode 4 bits at position [28,29,30,31] and provide this value as an integer -# in 'obj.cond' instruction instance attribute. -# => will decode 24 bits at positions 23..0 and provide this value as an integer as -# argument 'imm24' of the decorated function f. -# => will set obj.mnemonic to 'BL' and pass argument _flag=True to f. -# => will call f(obj,...) -# => will return obj - -# additional arguments to ispec decorator **must** be provided with symbol=value form and -# are declared as attributes/values within the instruction instance *before* calling the -# decorated function. In the previous example, the instruction has attribute mnemonic -# with value 'BL' when the function is called. 
# ----------------------------------------- + class ispec(object): + """ispec (customizable) decorator + + @ispec allows to easily define instruction decoders based on architecture specifications. + + Arguments: + + spec (str): + a human-friendly *format* string that describes how the ispec object will + (on request) decode a given bytestring and how it will expose various + decoded entities to the decorated function in order to define an instruction. + **kargs: + additional arguments to ispec decorator **must** be provided with ``symbol=value`` + form and are declared as attributes/values within the instruction instance *before* + calling the decorated function. + + Attributes: + + format (str): the spec format passed as argument (see above). + hook (callable): the decorated python function to be called during decoding. + iattr (dict): the dictionary of instruction attributes to add before decoding. + fargs (dict): the dictionary of keyword arguments to pass to the hook. + size (int): the bit length of the format (``LEN`` value) + fix (Bits): the values of fixed bits within the format + mask (Bits): the mask of fixed bits within the format + + Examples: + + This statement creates an ispec object with hook ``f``, and registers this object + automatically in an ISPECS list object within the module where the statement is found:: + + @ispec("32[ .cond(4) 101 1 imm24(24) ]", mnemonic="BL", _flag=True) + def f(obj,imm24,_flag): + [...] + + When provided with a bytestring, the :meth:`decode` method of this ispec object will: + + - proceed with decoding ONLY if bits 27,26,25,24 are 1,0,1,1 or raise an exception + - instantiate an instruction object (obj) + - decode 4 bits at position [28,29,30,31] and provide this value as an integer \ + in 'obj.cond' instruction instance attribute. + - decode 24 bits at positions 23..0 and provide this value as an integer as \ + argument 'imm24' of the decorated function f. + - set obj.mnemonic to 'BL' and pass argument _flag=True to f.
+ - call f(obj,...) + - return obj + + Note: + + The ``spec`` argument uses the following patterns: + + - ``LEN<[ FORMAT ]`` : + ``LEN`` is an integer that defines the bit length of the ``FORMAT`` + (LEN%8!=0 is not supported.) + The ``FORMAT`` is a sequence of *directives* ordered + from MSB (bit index LEN-1) to LSB (bit index 0). + (This is the default direction if the '<' char is missing.) + - ``LEN>[ FORMAT ]`` : + same as above but ``FORMAT`` is ordered from LSB to MSB. + + If ``LEN`` is the special char ``*``, the ``FORMAT`` has a variable length, + which removes some verifications and allows to terminate the ``FORMAT`` with + a variable length directive. + + The spec string is possibly terminated with an optional ``+`` char to indicate that it + represents an instruction *prefix*. In this case, the bytestring prefix matching the + ispec format is stacked temporarily until the rest of the bytestring matches a non + prefix ispec. + + The directives defining the ``FORMAT`` string are used to associate symbols to bits + located at dedicated offsets within the bitstring to be decoded. A directive has the + following syntax: + + * ``-`` (indicates that current bit position is not decoded) + * ``0`` (indicates that current bit position must be 0) + * ``1`` (indicates that current bit position must be 1) + + or + + * ``type SYMBOL location`` where: + + * ``type`` is an *optional* modifier char with possible values: + + * ``.`` indicates that the ``SYMBOL`` will be an *attribute* of the :class:`instruction`. + * ``~`` indicates that the decoded value will be returned as a Bits instance. + * ``#`` indicates that the decoded value will be returned as a string of [01] chars. + * ``=`` indicates that decoding should *end* at current position (overlapping) + + if not present, the ``SYMBOL`` will be passed as a keyword argument to the function with + value decoded as an integer. 
+ + * ``SYMBOL``: is a mandatory string matching regex ``[A-Za-z_][0-9A-Za-z_]*`` + * ``location``: is an optional string matching the following expressions: + + * ``( len )`` : indicates that the value is decoded from the next len bits starting from the current position of the directive within the ``FORMAT`` string. + * ``(*)`` : indicates a *variable length directive* for which the value is decoded from the current position with all remaining bits in the ``FORMAT``.\ + If the ``LEN`` is also variable then all remaining bits from the instruction buffer input string are used. + default location value is ``(1)``. + + The special directive ``{byte}`` is a shortcut for 8 fixed bits. For example + ``8>[{2f}]`` is equivalent to ``8>[ 1111 0100 ]``, or ``8<[ 0010 1111 ]``. + """ __slots__ = ['format','iattr','fargs','ast','fix','mask','pfx','size','hook'] def __init__(self,format,**kargs): @@ -458,9 +503,18 @@ def __call__(self, handler): self.hook = handler return handler -# Formatter is used for instruction pretty printing # ------------------------------------------------- + class Formatter(object): + """Formatter is used for instruction pretty printing + + Basically, a ``Formatter`` object is created from a dict associating a key with a list + of functions or format string. The key is either one of the mnemonics or possibly + the name of a @ispec-decorated function (this allows to group formatting styles rather + than having to declare formats for every possible mnemonic.) + When the instruction is printed, the formatting list elements are "called" and + concatenated to produce the output string. 
+ """ def __init__(self,formats): self.formats = formats @@ -493,6 +547,7 @@ def __call__(self,i,toks=False): # ispec format parser: #--------------------- + integer = pp.Regex(r'[1-9][0-9]*') indxdir = pp.oneOf(['<','>']) fixbit = pp.oneOf(['0','1']) diff --git a/amoco/cas/expressions.py b/amoco/cas/expressions.py index 6b96bf1..b322e2a 100644 --- a/amoco/cas/expressions.py +++ b/amoco/cas/expressions.py @@ -670,7 +670,7 @@ def stub(cls,ref): return (lambda env,**kargs:None) def call(self,env,**kargs): - logger.info('stub %s called'%self.ref) + logger.info('stub %s explicit call'%self.ref) if not 'size' in kargs: kargs.update(size=self.size) res = self.stub(self.ref)(env,**kargs) if res is None: return top(self.size) @@ -678,7 +678,7 @@ def call(self,env,**kargs): # used when the expression is a target used to build a block def __call__(self,env): - logger.info('stub %s called'%self.ref) + logger.info('stub %s implicit call'%self.ref) self.stub(self.ref)(env,**self._subrefs) ## @@ -929,6 +929,10 @@ def eval(self,env): def simplify(self): self.a.simplify() + if self.a.base._is_vec: + seg,disp = self.a.seg,self.a.disp + v = vec([mem(a,self.size,seg,disp,mods=self.mods) for a in self.a.base.l]) + return v if self.a.base._is_def else vecw(v) return self def addr(self,env): @@ -1084,12 +1088,15 @@ def simplify(self): if rst==0: a = ptr(self.x.a.base,self.x.a.seg,self.x.a.disp+off) return mem(a,self.size) - if self.x._is_eqn and self.x.op.type==2: + if self.x._is_eqn and (self.x.op.type==2 or + (self.x.op.symbol in '+-' and self.pos==0)): r = self.x.r[self.pos:self.pos+self.size] if self.x.op.unary: return self.x.op(r) l = self.x.l[self.pos:self.pos+self.size] return self.x.op(l,r) + if self.x._is_vec: + return vec([x[self.pos:self.pos+self.size] for x in self.x.l]) else: return self diff --git a/amoco/cas/mapper.py b/amoco/cas/mapper.py index 194580f..b054d17 100644 --- a/amoco/cas/mapper.py +++ b/amoco/cas/mapper.py @@ -65,23 +65,24 @@ def pp(self,**kargs): if 
t.colsize[1]>58: t.colsize[1]=58 return str(t) - def __getstate__(self): - return (self.__map,self.csi) - def __setstate__(self,state): - self.__map,self.csi = state - self.__Mem = MemoryMap() - for loc,v in self: - if loc._is_ptr: self._Mem_write(loc,v) - # list antecedent locations (used in the mapping) def inputs(self): - return sum(map(locations_of,self.__map.itervalues()),[]) + r = [] + for l,v in self.__map.iteritems(): + for lv in locations_of(v): + if lv._is_reg and l._is_reg: + if (lv == l) or (lv.type==l.type==regtype.FLAGS): + continue + r.append(lv) + return r # list image locations (modified in the mapping) def outputs(self): L = [] for l in sum(map(locations_of,self.__map.iterkeys()),[]): + if l._is_reg and l.type in (regtype.PC,regtype.FLAGS): continue if l._is_ptr: l = mem(l,self.__map[l].size) + if self.__map[l]==l: continue L.append(l) return L @@ -103,6 +104,7 @@ def rw(self): def clear(self): self.__map.clear() self.__Mem = MemoryMap() + self.conds = [] def memory(self): return self.__Mem diff --git a/amoco/cfg.py b/amoco/cfg.py index 04302bb..548c5c8 100644 --- a/amoco/cfg.py +++ b/amoco/cfg.py @@ -4,7 +4,16 @@ # Copyright (C) 2006-2011 Axel Tillequin (bdcht3@gmail.com) # published under GPLv2 license -# we wrap the grandalf classes here +""" +cfg.py +====== + +This module provides elements to define *control flow graphs* (CFG). +It is based essentially on classes provided by the `grandalf`_ package. + +.. _grandalf: https://grandalf.readthedocs.io/ + +""" from amoco.logger import Log logger = Log(__name__) @@ -14,16 +23,58 @@ from amoco.system.core import MemoryZone #------------------------------------------------------------------------------ -# node class is a graph vertex that embeds a block instance and inherits its -# name (default to the address of the block). It extends the Vertex class by -# overloading the __hash__ method in order to test membership based on the data -# rather than on the Vertex instance. 
class node(Vertex): - # protect from None data node: + """A node is a graph vertex that embeds a :mod:`code` object. + It extends the :ref:`Vertex ` class in order to compare + nodes by their data blocks rather than their id. + + Args: + acode : an instance of :class:`block`, :class:`func` or :class:`xfunc`. + + Attributes: + data : the reference to the ``acode`` argument above. + e (list[link]): inherited from `grandalf`_, the list of edges with this + node. In amoco, edges and vertices are called links and nodes. + c (graph_core): reference to the connected component that contains this + node. + + Methods: + deg(): returns the *degree* of this node (number of its links). + + N(dir=0): provides a list of *neighbor* nodes, all if *dir* parameter is 0, + parent nodes if *dir<0*, children nodes if *dir>0*. + + e_dir(dir=0): provides a list of *links*, all if *dir* parameter is 0, + incoming links if *dir<0*, outgoing links if *dir>0*. + + e_in(): a shortcut for ``e_dir(-1)``. + + e_out(): a shortcut for ``e_dir(+1)``. + + e_with(v): provides a *link* to or from v. Should be used with caution: if there is + several links between current node and v this method gives the first one + listed only, independently of the direction. + + e_to(v): provides the *link* from current node to node v. + + e_from(v): provides the *link* to current node from node v. + + """ + def __init__(self,acode): Vertex.__init__(self,data=acode) - self.name = self.data.name - self.view = self.data.view + + @property + def name(self): + """name (str): name property of the node's code object. + """ + return self.data.name + + @property + def view(self): + """view : view property of the node's code object. 
+ """ + return self.data.view def __repr__(self): return '<%s [%s] at 0x%x>'%(self.__class__.__name__,self.name,id(self)) @@ -41,11 +92,43 @@ def __getitem__(self,i): res = node(self.data.__getitem__(i)) return res + def __getstate__(self): + return (self.index,self.data) + + def __setstate__(self,state): + self.__index,self.data = state + self.c = None + self.e = [] + #------------------------------------------------------------------------------ -# link is a direct graph edge between two nodes. It extends the Edge class by -# overloading the __hash__ method in order to test membership based on the data -# rather than on the Edge instance. class link(Edge): + """A directed edge between two nodes. It extends :ref:`Edge ` + class in order to compare edges based on their data rather than id. + + Args: + x (node) : the source node. + y (node) : the destination node. + w (int) : an optional weight value, default 1. + data : a list of conditional expressions associated with the link. + connect : a flag to indicate that a new node should be automatically + added to the connected component of its parent/child if it + is defined (default False). + + Attributes: + name : the name property returns the string composed of source and + destination node's *addresses*. + deg (int): 1 if source and destination are the same node, 0 otherwise. + v (tuple[node]): inherited from `grandalf`_, the 2-tuple (source,dest) + nodes of the link. + feedback: a flag indicating that this link is involved in a loop, + used internally by `grandalf`_ layout algorithm. + + Methods: + attach(): add current link to its :attr:`node.e` attribute list. + + detach(): remove current link from its :attr:`node.e` attribute list. 
+ + """ def __str__(self): n0 = self.v[0].name @@ -58,8 +141,8 @@ def __repr__(self): @property def name(self): - n0 = self.v[0].name - n1 = self.v[1].name + n0 = self.v[0].data.address + n1 = self.v[1].data.address return "%s -> %s"%(n0,n1) def __cmp__(self,e): @@ -68,21 +151,86 @@ def __cmp__(self,e): def __hash__(self): return hash(self.name) + def __getstate__(self): + xi,yi = (self.v[0].index,self.v[1].index) + return (xi,yi,self.w,self.data,self.feedback) + + def __setstate__(self,state): + xi,yi,self.w,self.data,self.feedback = state + self._v = [xi,yi] + self.deg = 0 if xi==yi else 1 + #------------------------------------------------------------------------------ -# graph is a Graph that represents a set of functions as individual components class graph(Graph): + """a :ref:`` that represents a set of functions as its + individual components. + + Args: + V (iterable[node]) : the set of (possibly detached) nodes. + E (iterable[link]) : the set of links of this graph. + + Attributes: + C : the list of :class:`graph_core ` connected + components of the graph. + support (:class:`~system.core.MemoryZone`): the abstract memory zone + holding all nodes contained in this graph. + overlay : defaults to None, another instance of MemoryZone + with nodes of the graph that overlap other nodes already mapped + in :attr:`support`. + + Methods: + get_by_name(name): get the node with the given name (as string). + + get_with_address(vaddr): get the node that contains the given *vaddr* + :class:`~cas.expressions.cst` expression. + + signature(): returns the full signature string of all connected + components. + + add_vertex(v,[support=None]): add node v to the graph and declare + node support in the default MemoryZone or the overlay zone if + provided as support argument. This method deals with a node v + that cuts or swallows a previously added node. + + remove_vertex(v): remove node v from the graph. + + add_edge(e): add link to the graph as well as possible new nodes. 
+ + remove_edge(e): remove the provided link. + + get_vertices_count(): a synonym for :meth:`order`. + + V(): generator of all nodes of the graph. + + E(): generator of all links of the graph. + + N(v,f_io=0): returns the neighbors of node v in direction f_io. + path(x,y,f_io=0,hook=None): + + order(): number of nodes in the graph. + + norm(): number of links in the graph. + + deg_min(): minimum degree of nodes. + + deg_max(): maximum degree of nodes. + + deg_avg(): average degree of nodes. + + eps(): ratio of links over nodes (norm/order). + + connected(): boolean flag indicating that the graph as + only one connected component. + + components(): synonym for attribute :attr:`C`. + + """ def __init__(self,*args,**kargs): self.support = MemoryZone() self.overlay = None super(graph,self).__init__(*args,**kargs) - def spool(self,n=None): - L = [] - for v in self.V(): - if len(v.e_out())==0: L.append(v) - return L - def __cut_add_vertex(self,v,mz,vaddr,mo): oldnode = mo.data.val if oldnode==v: return oldnode @@ -160,7 +308,15 @@ def get_with_address(self,vaddr): def signature(self): return ''.join([signature(g) for g in self.C]) +#------------------------------------------------------------------------------ + def signature(g): + """compute the signature of a :ref:`graph_core ` component + based on :meth:`block.sig` value of nodes in every partion of the graph. + + Returns: + str: the signature string. + """ P = g.partition() S = [] for p in P: @@ -168,4 +324,4 @@ def signature(g): for n in p: s.append(n.data.sig()) S.append(''.join(s)) - return '{[%s]}'%']['.join(S) + return '{[%s]}'%'] ['.join(S) diff --git a/amoco/code.py b/amoco/code.py index 6e9d7dc..2bdec99 100644 --- a/amoco/code.py +++ b/amoco/code.py @@ -5,8 +5,20 @@ # published under GPLv2 license """ -The code module defines classes that represent assembly blocks, functions, -and *external functions*. 
+code.py +======= + +This module defines classes that represent assembly instructions blocks, +functions, and calls to *external* functions. In amoco, such objects are +found as :attr:`node.data` in nodes of a :class:`cfg.graph`. As such,they +all provide a common API with: + * ``name`` to get/set a name, + * ``map`` to get the associated symbolic execution + * ``address`` to identify and locate the object in memory + * ``support`` to get the address range of the object + * ``misc`` to note discovered properties + * ``view`` to display the object + """ import pdb @@ -23,24 +35,43 @@ #------------------------------------------------------------------------------- class block(object): + """A block instance holds a sequence of instructions and allows to compute + the *map* corresponding to the symbolic continous execution of this sequence. + + Args: + instr (list[instruction]): the sequence of continuous instructions + name (Optional[str]): the name of the block + (defaults to address of the 1st instruction) + + Attributes: + _map (:class:`mapper`): works as a cache for the ``map`` property below. + instr (list): the list of instructions of the block. + name (str): the name of the block (defaults to address string). + misc (dict): placeholder for (any) information about the block semantics. + view (:class:`blockView`): holds the :mod:`ui.views` object used to display the block. + length (int): the byte length of the block instructions sequence. + support (tuple): the memory footprint of the block + _helper: a *callable* object used for adding `misc` :ref:`tag` items + based on plateform-dependent patterns. This object is usually set by the + system class when a block is instanciated. """ - A block instance is a 'continuous' sequence of instructions. 
- """ + __slots__=['_map','instr','_name','misc','_helper','view'] def __init__(self, instrlist, name=None): - """ - the init of a block takes a list of instructions and creates a `map` of it - """ self._map = None self.instr = instrlist self._name = name - self.misc = defaultdict(lambda :0) + self.misc = defaultdict(_code_misc_default) self._helper = None self.view = blockView(self) @property def map(self): + """the propery providing the symbolic map of the block, or if this + block is the entry of a :class:`func` object, the map of the + function it belongs to. + """ if self._map is None: self._map = mapper(self.instr) self.helper(self._map) @@ -56,7 +87,12 @@ def helper(self,m): @property def address(self): - return self.instr[0].address if len(self.instr)>0 else None + """address (:class:`cst`): the address of the first instruction in the block. + """ + try: + return self.instr[0].address + except IndexError: + return None @property def length(self): @@ -76,6 +112,16 @@ def setname(self,name): name = property(getname,setname) def __getitem__(self,i): + """block objects support slicing from given start/stop addresses + + Args: + i (slice): start and stop address *within* the block. The + values must match addresses of instructions otherwise + a :exc:`ValueError` exception is raised. + + Returns: + block: a new block with selected instructions. + """ sta,sto,stp = i.indices(self.length) assert stp==1 pos = [0] @@ -91,10 +137,17 @@ def __getitem__(self,i): if len(I)>0: return block(self.instr[ista:isto]) - # cut the block at given address will remove instructions after this address, - # which needs to be aligned with instructions boundaries. The effect is thus to - # reduce the block size. The returned value is the number of instruction removed. def cut(self,address): + """cutting the block at given address will remove instructions after this address, + (which needs to be aligned with instructions boundaries.) The effect is thus to + reduce the block size. 
+ + Args: + address (cst): the address where the cut occurs. + + Returns: + int: the number of instructions removed from the block. + """ I = [i.address for i in self.instr] try: pos = I.index(address) @@ -115,18 +168,22 @@ def __str__(self): return '\n'.join([r.show(raw=True,**T.rowparams) for r in T.rows]) def __repr__(self): - return '<%s object (name=%s) at 0x%08x>'%(self.__class__.__name__,self.name,id(self)) + return '<%s object (%s) at 0x%08x>'%(self.__class__.__name__,self.name,id(self)) def raw(self): - return ''.join([i.bytes for i in self.instr]) + """returns the *raw* bytestring of the block instructions. + """ + return ''.join([i.bytes for i in self.instr]) def __cmp__(self,b): return cmp(self.raw(),b.raw()) def __hash__(self): - return hash(self.name) + return hash(self.address) def sig(self): + """returns the :meth:`cfg.signature` of the block. + """ misc = defaultdict(lambda :None) misc.update(self.misc) if len(misc)==0: @@ -134,12 +191,18 @@ def sig(self): s = [tag.sig(k) for k in misc] return '(%s)'%(''.join(s)) -#------------------------------------------------------------------------------ -# func is a cfg connected component that generally represents a called function -# It appears in the other graphs whenever the function is called and provides a -# synthetic map that captures the semantics of the function. #------------------------------------------------------------------------------ class func(block): + """A graph of blocks that represents a function's CFG and allows + to compute a symbolic execution *map* of the function. Inherits from :class:`block`. + + Args: + g (graph_core): the connected graph component of block nodes. + name (Optional[str]): the optional name of the function (defaults to the name of its root node.) + + Attributes: + cfg (graph_core): the :grandalf:class:`graph_core` CFG of the function (see :mod:`cfg`.) 
+ """ __slots__ = ['cfg'] # the init of a func takes a core_graph and creates a map of it: @@ -149,16 +212,18 @@ def __init__(self, g=None, name=None): self.instr = [] # base/offset need to be defined before code (used in setcode) self._name = name - self.misc = defaultdict(lambda :0) + self.misc = defaultdict(_code_misc_default) self._helper = None self.view = funcView(self) @property def address(self): - return self.blocks[0].address + return self.cfg.sV[0].data.address @property def blocks(self): + """blocks (list): the list of blocks within the function. + """ return [n.data for n in self.cfg.sV] @property @@ -167,8 +232,21 @@ def support(self): smax = max((b.address+b.length for b in self.blocks)) return (smin,smax) + def __hash__(self): + return hash(self.address) + #(re)compute the map of the entire function cfg: def makemap(self,withmap=None,widening=True): + """compute the mapper of the entire function. + + Args: + withmap (Optional[mapper]): an input mapper instance that + can be used to represent the calling stack frame. + widening (Bool): indicates if loop widening should apply. + + Returns: + mapper: an approximated symbolic execution of the function. + """ # spawn a cfg layout to detect loops and allow to # walk the cfg by using the nodes rank. gr = self.view.layout @@ -249,15 +327,34 @@ def __str__(self): # defined in the ext expression. #------------------------------------------------------------------------------ class xfunc(object): - __slots__ = ['map','name','address','length','misc','view'] + """A class to represent an external symbol in the CFG and compute the *map* + associated with the call to this function. + + Args: + x (ext): the :class:`~cas.expressions.ext` expression associated with the + external symbol. + + Attributes: + map (mapper): the symbolic execution *map* of the external symbol. + name (str): the external symbol string. + length (int): set to zero. + instr (list): empty. 
+ address (exp): the :class:`~cas.expressions.ext` expression. + misc (dict): placeholder for (any) information about the xfunc. + view (:class:`xfuncView`): holds the :mod:`ui.views` object used to + display the object. + + """ + __slots__ = ['map','name','address','instr','length','misc','view'] def __init__(self, x): self.map = mapper() x(self.map) self.name = str(x) self.address = x + self.instr = [] self.length = 0 - self.misc = defaultdict(lambda :0) + self.misc = defaultdict(_code_misc_default) doc = x.stub(x.ref).func_doc if doc: for (k,v) in tag.list(): @@ -265,12 +362,23 @@ def __init__(self, x): self.misc[v] = 1 self.view = xfuncView(self) + def __hash__(self): + return hash(self.name) + @property def support(self): return (self.address,self.address) + def sig(self): + s = [tag.sig(k) for k in self.misc] + return '(x:%s)'%(''.join(s)) + #------------------------------------------------------------------------------ + class tag: + """defines keys as class attributes, used in :attr:`misc` attributes to + indicate various relevant properties of blocks within functions. + """ FUNC_START = 'func_start' FUNC_END = 'func_end' FUNC_STACK = 'func_stack' @@ -287,10 +395,14 @@ class tag: @classmethod def list(cls): + """get the list of all defined keys + """ return filter(lambda kv: kv[0].startswith('FUNC_'), cls.__dict__.items()) @classmethod def sig(cls,name): + """symbols for tag keys used to compute the block's signature + """ return { 'cond' : '?', 'func' : 'F', @@ -307,3 +419,5 @@ def sig(cls,name): cls.LOOP_START : 'l' }.get(name,'') +def _code_misc_default(): + return 0 diff --git a/amoco/config.py b/amoco/config.py index b4527c3..a9aea9a 100644 --- a/amoco/config.py +++ b/amoco/config.py @@ -1,4 +1,45 @@ # -*- coding: utf-8 -*- +""" +config.py +========= + +This module defines the default amoco configuration +and loads any user-defined configuration file. 
+ +Attributes: + conf (SafeConfigParser): holds in a standard ConfigParser object, + various parameters mostly related to how outputs should be formatted. + + The defined sections are: + + - 'block' which deals with how basic blocks are printed, with options: + + - 'header' will show a dashed header line including the address of the block if True (default) + - 'footer' will show a dashed footer line if True + - 'bytecode' will show the hex encoded bytecode string of every instruction if True (default) + - 'padding' will add the specified amount of blank chars to between address/bytecode/instruction (default 4). + + - 'cas' which deals with parameters of the algebra system: + + - 'complexity' threshold for expressions (default 100). See expressions_ for details. + + - 'db' which deals with database backend options: + + - 'url' allows to define the dialect and/or location of the database (default to sqlite) + - 'log' indicates that database logging should be redirected to the amoco logging handlers + + - 'log' which deals with logging options: + + - 'level' one of 'ERROR' (default), 'VERBOSE', 'INFO', 'WARNING' or 'DEBUG' from less to more verbose, + - 'tempfile' to also save DEBUG logs in a temporary file if True (default), + - 'filename' to also save DEBUG logs using this filename. 
+ + - 'ui' which deals with some user-interface pretty-printing options: + + - 'formatter' one of 'Null' (default), 'Terminal', "Terminal256', 'TerminalDark', 'TerminalLight', 'Html' + - 'graphics' one of 'term' (default), 'qt' or 'gtk' + +""" import ConfigParser as cp from collections import defaultdict @@ -18,6 +59,11 @@ conf.add_section('cas') conf.set('cas', 'complexity' , '100' ) +# db section +conf.add_section('db') +conf.set('db', 'url', 'sqlite:///') +conf.set('db', 'log', 'False') + # log section conf.add_section('log') conf.set('log', 'level', 'ERROR') @@ -33,7 +79,19 @@ conf.read([os.path.expanduser('~/.amocorc')]) #----------------------- + + def get_module_conf(module_name): + """utility function that will return the dict of options related to a section name. + + Args: + module_name (str): a section of the conf object, usually the name of the module (__name__). + + Returns: + dict: The options associated to the section module_name, with values casted to their + natural python types (lowercase strings, booleans, or integers). + + """ D = defaultdict(lambda:None) if conf.has_section(module_name): for k,v in conf.items(module_name): diff --git a/amoco/db.py b/amoco/db.py index dc46c86..ca7524d 100644 --- a/amoco/db.py +++ b/amoco/db.py @@ -1,163 +1,186 @@ +# -*- coding: utf-8 -*- + # This code is part of Amoco -# Copyright (C) 2014 Axel Tillequin (bdcht3@gmail.com) +# Copyright (C) 2016 Axel Tillequin (bdcht3@gmail.com) # published under GPLv2 license -import importlib +""" +db.py +===== + +This module implements all amoco's database facilities using the +`sqlalchemy`_ package, allowing to store many analysis results and +pickled objects. + +.. 
_sqlalchemy: http://www.sqlalchemy.org/ + +""" + +from amoco.config import conf -from amoco.logger import Log +from amoco.logger import Log,logging logger = Log(__name__) -from amoco.cas.expressions import exp -from amoco.arch.core import instruction -from amoco.system.core import CoreExec -from amoco.code import mapper,block,func,xfunc -from amoco.cfg import node,link,graph - -class db_core(object): - @staticmethod - def dump(self): - raise NotImplementedError - @staticmethod - def load(self): - raise NotImplementedError - -#------------------------------------------------------------------------------ -class db_instruction(db_core): - - def __init__(self,i): - self.address = i.address - self.misc = dict(i.misc) - self.bytes = i.bytes - self.view = str(i) - - def build(self,cpu): - i = cpu.disassemble(self.bytes) - i.misc.update(self.misc) - i.address = self.address - return i - -#------------------------------------------------------------------------------ -class db_mapper(db_core): - - def __init__(self,m): - self.map = [(db_exp(l),db_exp(x)) for l,x in m] - self.conds = [db_exp(c) for c in m.conds] - self.view = str(m) - - def build(self): - m = mapper() - for k,v in self.map: - m[k.build()] = v.build() - m.conds = [c.build() for c in self.conds] - return m - -#------------------------------------------------------------------------------ -class db_block(db_core): - - def __init__(self,b): - self.name = b.name - self.misc = dict(b.misc) - self.instr = [db_instruction(i) for i in b.instr] - self.map = db_mapper(b.map) - self.view = str(b) - - def build(self,cpu): - instr = [i.build(cpu) for i in self.instr] - b = block(instr) - b.map = self.map.build() - b.misc.update(self.misc) - return b - -#------------------------------------------------------------------------------ -class db_exp(db_core): - - def __init__(self,x): - self.view = x.dumps() - - def build(self): - return exp().loads(self.view) - 
-#------------------------------------------------------------------------------ -class db_graph(db_core): - - def __init__(self,g): - self.nodes = [db_block(n.data) for n in g.V()] - self.links = [(e.v[0].name,e.v[1].name) for e in g.E()] - - def build(self,cpu): - g = graph() - nodes = dict([(b.name,node(b.build(cpu))) for b in self.nodes]) - for l in [link(nodes[n1],nodes[n2]) for (n1,n2) in self.links]: - g.add_edge(l) - return g - -#------------------------------------------------------------------------------ -class db_exec(db_core): - - def __init__(self,p): - self.filename = p.bin.filename - self.format = p.bin.__class__ - self.cls = p.__class__ - self.cpu = p.cpu.__name__ - - def build(self): - f = self.format(self.filename) - p = self.cls(f) - p.cpu = importlib.import_module(self.cpu) - return p - - -#------------------------------------------------------------------------------ try: - import transaction - from ZODB import DB, FileStorage - from persistent import Persistent -except ImportError,e: - logger.warning(e.message) - from StringIO import StringIO - - # declare void Session class: - class Session(object): - _is_active = False - def __init__(self,filename=None): - logger.info('this session is not active') - self.db = StringIO() - self.conn = None - self.root = None - def add(self,key,obj): - pass - def commit(self): - pass - def restore(self): - pass -else: - - # Session database class: - class Session(object): - _is_active = True - - def __init__(self,filename): - storage = FileStorage.FileStorage(filename) - self.db = DB(storage) - self.conn = self.db.open() - self.root = self.conn.root() - - def add(self,key,obj): - self.root[key] = db_interface(obj) - - def commit(self): - transaction.commit() - - def restore(self): - pass - -def db_interface(obj): - if isinstance(obj,block): return db_block(obj) - if isinstance(obj,mapper): return db_mapper(obj) - if isinstance(obj,exp): return db_exp(obj) - elif isinstance(obj,graph): return db_graph(obj) - 
elif isinstance(obj,CoreExec): return db_exec(obj) + import sqlalchemy as sql + from sqlalchemy import orm + from sqlalchemy.ext.declarative import declarative_base + has_sql = True + Session = orm.scoped_session(orm.sessionmaker()) + Base = declarative_base() + logflag = conf.getboolean('db','log') + if logflag: + for l in ('sqlalchemy.engine','sqlalchemy.orm'): + alog = logging.getLogger(l) + for h in logger.handlers: alog.addHandler(h) + alog.setLevel(logger.level) +except ImportError: + logger.warning("package sqlalchemy not found.") + has_sql = False + +def create(filename=None): + """creates the database engine and bind it to the scoped Session class. + The database URL (see :mod:`config.py`) is opened and the + schema is created if necessary. The default URL uses *sqlite* dialect and + opens a temporary file for storage. + """ + import tempfile + url = conf.get('db','url') + if not url.endswith('.db'): + if conf.has_option('log','file'): + case = conf.get('log','file').rpartition('.')[0] + else: + case = tempfile.mktemp(prefix='amoco-') + url += case+'.db' + logflag = conf.getboolean('db','log') + if has_sql: + engine = sql.create_engine(url,echo=False,logging_name=__name__) + Session.configure(bind=engine) + Case.metadata.create_all(bind=engine,checkfirst=True) else: - logger.warning("no db interface defined for %s, using str..."%obj.__class__) - return str(obj) + logger.error('No Session defined') + engine = None + return engine + +if has_sql: + class Case(Base): + """A Case instance describes the analysis of some binary program. + It allows to query stored results by date, source, format or + architecture for example, and provides relations to associated + functions that have been discovered or saved traces. 
+ """ + __tablename__ = 'cases_info' + id = sql.Column(sql.Integer, primary_key=True) + date = sql.Column(sql.DateTime) + name = sql.Column(sql.String) + source = sql.Column(sql.String) + binfmt = sql.Column(sql.String) + arch = sql.Column(sql.String) + msize = sql.Column(sql.Integer) + score = sql.Column(sql.Integer,default=0) + method = sql.Column(sql.String) + funcs = orm.relationship('FuncData',back_populates='case') + other = orm.relationship('CfgData',back_populates='case') + traces = orm.relationship('Trace',back_populates='case') + + def __init__(self,z,name=None): + from datetime import datetime + from os.path import basename,splitext + self.date = datetime.now() + self.source = z.prog.bin.filename + self.name = name or splitext(basename(self.source))[0] + self.binfmt = z.prog.bin.__class__.__name__ + self.arch = z.prog.cpu.__name__ + self.msize = z.prog.bin.getsize() + self.method = z.__class__.__name__ + if z.G.order()>0: + self.score = z.score() + F = z.functions() + for f in F: self.funcs.append(FuncData(f)) + F = [f.cfg for f in F] + for g in z.G.C: + if g not in F: + self.other.append(CfgData(obj=g)) + + def __repr__(self): + s = (self.id, self.name, self.binfmt, self.arch, self.method) + return "".format(*s) + + class FuncData(Base): + """This class holds pickled :class:`~cas.mapper.mapper` and + :class:`code.func` instances related to a Case, and provides + relationship with gathered infos about the discovered function. 
+ """ + __tablename__ = 'funcs_data' + id = sql.Column(sql.Integer, primary_key=True) + fmap = orm.deferred(sql.Column(sql.PickleType)) + obj = orm.deferred(sql.Column(sql.PickleType)) + case_id= sql.Column(sql.Integer, sql.ForeignKey('cases_info.id')) + case = orm.relationship('Case',back_populates='funcs') + info = orm.relationship('FuncInfo',uselist=False,back_populates='data') + + def __init__(self,f): + self.fmap = f.map + self.obj = f + self.info = FuncInfo(f,self) + + class FuncInfo(Base): + """This class gathers useful informations about a function, allowing + to query by signature or various characteristics like number of blocks, + number of args, stack size, byte size, number of instructions, calls, + or cross-references. + """ + __tablename__ = 'funcs_info' + id = sql.Column(sql.Integer, sql.ForeignKey('funcs_data.id'),primary_key=True) + name = sql.Column(sql.String, nullable=False) + sig = sql.Column(sql.String) + blocks = sql.Column(sql.Integer) + argsin = sql.Column(sql.Integer) + argsout= sql.Column(sql.Integer) + stksz = sql.Column(sql.Integer) + vaddr = sql.Column(sql.Integer) + bsize = sql.Column(sql.Integer) + nbinst = sql.Column(sql.Integer) + calls = sql.Column(sql.String) + xrefs = sql.Column(sql.String) + data = orm.relationship('FuncData',uselist=False,back_populates='info') + notes = orm.deferred(sql.Column(sql.Text,default='')) + + def __init__(self,f,data): + from amoco.cfg import signature + self.name = f.name + self.sig = signature(f.cfg) + self.blocks = f.cfg.order() + self.argsin = f.misc['func_in'] + self.argsout = f.misc['func_out'] + self.stksz = min([x.a.disp for x in f.misc['func_var']],0) + self.vaddr = str(f.address) + self.bsize = sum([b.length for b in f.blocks],0) + self.nbinst = sum([len(b.instr) for b in f.blocks],0) + self.calls = ' '.join(filter(None,[x.name if hasattr(x,'cfg') else None for x in f.blocks])) + self.xrefs = ' '.join([str(x.data.support[1]) for x in f.cfg.sV[0].data.misc['callers']]) + + class 
CfgData(Base): + """The CfgData class is intented to pickle data that has not yet been + identified as a function but is part of the recovered :class:graph. + """ + __tablename__ = 'cfgs_data' + id = sql.Column(sql.Integer, primary_key=True) + obj = orm.deferred(sql.Column(sql.PickleType)) + case_id= sql.Column(sql.Integer, sql.ForeignKey('cases_info.id')) + case = orm.relationship('Case', back_populates='other') + + class Trace(Base): + """The Trace class allows to pickle abstract memory states (:class:`mapper` objects) + obtained from a given input map after executing the binary program from *start* address + to *stop* address. + """ + __tablename__ = 'traces_data' + id = sql.Column(sql.Integer, primary_key=True) + start = sql.Column(sql.Integer) + stop = sql.Column(sql.Integer) + mapin = orm.deferred(sql.Column(sql.PickleType)) + mapout = orm.deferred(sql.Column(sql.PickleType)) + case_id= sql.Column(sql.Integer, sql.ForeignKey('cases_info.id')) + case = orm.relationship('Case', back_populates='traces') diff --git a/amoco/logger.py b/amoco/logger.py index 5d3de90..ba5c224 100644 --- a/amoco/logger.py +++ b/amoco/logger.py @@ -4,6 +4,35 @@ # Copyright (C) 2006-2011 Axel Tillequin (bdcht3@gmail.com) # published under GPLv2 license +""" +logger.py +========= + +This module defines amoco logging facilities. The ``Log`` class inherits from a standard :py:class:`logging.Logger`, +with minor additional features like a ``'VERBOSE'`` level introduced between ``'INFO'`` and ``'DEBUG'`` +levels, and a progress method that can be useful for time consuming activities. See below for details. + +Most amoco modules start by creating their local ``logger`` object used to provide various feedback. +Users can thus focus on messages from selected amoco modules by adjusting their level independently, +or use the ``set_quiet()``, ``set_debug()`` or ``set_log_all(level)`` functions to adjust all loggers +at once. 
+ +Examples: + Setting the mapper module to ``'VERBOSE'`` level:: + + In [1]: import amoco + In [2]: amoco.cas.mapper.logger.setlevel('VERBOSE') + + + Setting all modules loggers to ``'ERROR'`` level:: + + In [2]: amoco.set_quiet() + +Note that amoco loggers are configured to log both to *stderr* with selected level +and to a temporary file with ``'DEBUG'`` level. + +""" + import logging @@ -15,11 +44,14 @@ try: from amoco import conf - try: - default_level = conf.getint('log','level') - if default_level is None: default_level = 0 - except ValueError: - default_level = logging._levelNames.get(conf.get('log','level'),0) + def get_log_level(): + try: + level = conf.getint('log','level') + if level is None: level = 0 + except ValueError: + level = logging._levelNames.get(conf.get('log','level'),0) + return level + default_level = get_log_level() if conf.has_option('log','file'): logfilename = conf.get('log','file') elif conf.getboolean('log','tempfile'): @@ -35,10 +67,25 @@ logfile = logging.FileHandler(logfilename,mode='w') logfile.setFormatter(default_format) logfile.setLevel(logging.DEBUG) + conf.set('log','file',logfilename) else: logfile = None class Log(logging.Logger): + """This class is intended to allow amoco activities to be logged + simultaneously to the *stderr* output with an adjusted level and to + a temporary file with full verbosity. + + All instanciated Log objects are tracked by the Log class attribute ``Log.loggers`` + which maps their names with associated instances. 
+ + The recommended way to create a Log object is to add, near the beginning + of amoco modules:: + + from amoco.logger import Log + logger = Log(__name__) + + """ def __init__(self,name,handler=logging.StreamHandler()): logging.Logger.__init__(self,name) handler.setFormatter(default_format) @@ -77,17 +124,32 @@ def register(cls,name,self): def set_quiet(): + """set all loggers to ``'ERROR'`` level + """ set_log_all(logging.ERROR) def set_debug(): + """set all loggers to ``'DEBUG'`` level + """ set_log_all(logging.DEBUG) def set_log_all(level): + """set all loggers to specified level + + Args: + level (int): level value as an integer. + """ default_level = level for l in Log.loggers.itervalues(): l.setLevel(level) def set_log_file(filename): + """set log file for all loggers + + Args: + filename (str): filename for the FileHandler added + to all amoco loggers + """ if logfile is not None: logfile.close() logfile = logging.FileHandler(logfilename,mode='w') diff --git a/amoco/main.py b/amoco/main.py index 3f581cd..46f9f40 100644 --- a/amoco/main.py +++ b/amoco/main.py @@ -1,5 +1,15 @@ # -*- coding: utf-8 -*- +""" +main.py +======= +The main module of amoco implements various strategies to perform CFG recovery. + +.. inheritance-diagram:: main + :parts: 1 + +""" + # This code is part of Amoco # Copyright (C) 2006-2014 Axel Tillequin (bdcht3@gmail.com) # published under GPLv2 license @@ -14,19 +24,35 @@ from amoco.arch.core import INSTRUCTION_TYPES # ----------------------------------------------------------------------------- -# linear sweep based analysis: -# fast & dumb way of disassembling prog, -# but provides iterblocks() for all parent classes. class lsweep(object): + """linear sweep based analysis: fast & dumb way of disassembling prog, + but provides :meth:`iterblocks` for all derived classes. + + Arguments: + prog: the :class:`system.core.CoreExec` inherited program's instance + to analyze. 
+ + Attributes: + prog: the :class:`system.core.CoreExec` inherited program's instance + to analyze. + G (graph): the placeholder for the recovered :class:`cfg.graph`. + """ __slots__ = ['prog','G'] def __init__(self,prog): self.prog = prog self.G = cfg.graph() - # iterator over linearly sweeped instructions - # starting at address loc (defaults to entrypoint). - # If not None, loc argument should be a cst object. def sequence(self,loc=None): + """iterator over linearly swept instructions. + + Arguments: + loc (Optional[cst]): the address to start disassembling + (defaults to the program's entrypoint). + + Yields: + instructions from given address, until a non-instruction + byte sequence is reached. + """ p = self.prog if loc is None: try: @@ -40,10 +66,20 @@ def sequence(self,loc=None): loc += i.length yield i - # iterator over basic blocks using the instruction.type attribute - # to detect the end of block (type_control_flow). The returned block - # object is enhanced with plateform-specific infos (see block.misc). def iterblocks(self,loc=None): + """iterator over basic blocks. The :attr:`instruction.type` + attribute is used to detect the end of a block (type_control_flow). + The returned :class:`block` object is enhanced with platform-specific + information (see :attr:`block.misc`). + + Arguments: + loc (Optional[cst]): the address of the first block + (defaults to the program's entrypoint). + + Yields: + linearly swept blocks of instructions from given address, + until :meth:`sequence` stops. + """ inblock = (lambda i: INSTRUCTION_TYPES[i.type]!='control_flow') l = [] seq = self.sequence(loc) @@ -72,9 +108,10 @@ def iterblocks(self,loc=None): b=self.prog.codehelper(block=b) yield b - # getblock is a handy wrapper of iterblocks to - # return the block located at address val provided as Python Int. def getblock(self,val): + """getblock is just a wrapper of iterblocks to + return the first block located at a *Python Integer* provided address. 
+ """ p = self.prog target = p.cpu.cst(val,p.cpu.PC().size) ib = self.iterblocks(target) @@ -82,10 +119,37 @@ def getblock(self,val): ib.close() return b - # poorman's cfg builder that assumes calls return to next block. - # and link blocks based on direct concrete targets without computing - # the block semantics (map). => Fast but possibly wrong... + def functions(self): + """provides the list of functions recovered so far. + """ + F = [] + for c in self.G.C: + f = c.sV[0].data.misc['func'] + if f: F.append(f) + return F + + def signature(self,func=None): + """provides the signature of a given function, + or the entire signature string. + """ + if func is not None: + return cfg.signature(func.cfg) + return self.G.signature() + + def score(self,func=None): + """a measure for the *complexity* of the program. + For the moment it is associated only with the + signature length. + """ + sig = self.signature(func) + return len(sig) + def getcfg(self,loc=None): + """the most basic cfg recovery method: it assumes that calls always + return to the following block, and links blocks based on direct + concrete targets without computing any symbolic map. + It's *fast* but probably very wrong... + """ from collections import OrderedDict,defaultdict D = OrderedDict() C = defaultdict(lambda: []) @@ -158,15 +222,17 @@ def getcfg(self,loc=None): return self.G # ----------------------------------------------------------------------------- + class _target(object): - ''' Candidate for extending a CFG. + ''' Candidate for extending a :class:`cfg.graph` under construction. - A _target is an internal object used during CFG reconstruction to point - to addresses that are candidates for extending the CFG with either new edge - or new basic block. + A :class:`_target` is an internal object used during cfg recovery to point + to addresses that are candidates for extending the cfg with a new link or + a new block. 
Attributes: - cst (exp): the targeted address expression + cst (exp): the targeted address expression, usually a constant but can + be an instance of any :mod:`cas.expressions` class. parent (node): the basic block that targets this address econd (exp): the conditional expression by which the execution would proceed from parent to the basic block at this address @@ -178,6 +244,9 @@ def __init__(self,cst,parent,econd=None): self.dirty = False def expand(self): + """Returns the list of constant (or external) expressions associated + with this target. + """ x=self.cst if x._is_ext: return [self] @@ -195,6 +264,9 @@ def expand(self): return [] def select(self,side): + """Returns the target of the selected ``True`` or ``False`` *side* of + the current conditional branch target expression. + """ x=self.cst assert x._is_tst v = x.l if side is True else x.r @@ -214,10 +286,26 @@ def __repr__(self): # ----------------------------------------------------------------------------- -# fast forward based analysis: -# follows PC expression evaluated within a single block only. -# exploration goes forward until expressions are not cst. + class fforward(lsweep): + """The fast forward based analysis follows the :meth:`PC` expression evaluated + within a single block only. Exploration goes forward until expressions + are not :class:`~cas.expressions.cst`. This class is a base for most of the + main analysis classes. + + Attributes: + policy (dict): holds various useful parameters for the analysis. + + * 'depth-first' : walk the graph with *depth-first* policy if True. + * 'branch-lazy' : proceed with linear sweep whenever the target \ + expression does not evaluate to a constant address. + * 'frame-aliasing' : assume no pointer aliasing if False. + * 'complexity' : limit expressions complexity. + + spool (list[_target]): the list of current targets to extend the + :class:`cfg.graph`. 
+ + """ policy = {'depth-first': True, 'branch-lazy': True} def init_spool(self,loc): @@ -234,8 +322,19 @@ def update_spool(self,vtx,parent): logger.info(err) vtx.data.misc['tbc'] = 1 - # compute expression of target address (PC) in node.data.map def get_targets(self,node,parent): + """Computes expression of target address in the given node, based + on its address and the architecture's program counter (PC). + + Arguments: + node: the current node, not yet added to the cfg. + parent: the parent node in the cfg that has targeted the + current node. (Unused by :class:`fforward` but required as + a generic API for parent classes). + + Returns: + :class:`_target`: the evaluated PC expression. + """ blk = node.data m = code.mapper() pc = self.prog.cpu.PC() @@ -244,12 +343,21 @@ def get_targets(self,node,parent): return _target(pc,node).expand() def add_root_node(self,vtx): + """The given vertex node (vtx) is added as a root node of a new connected + component in the cfg referenced by :attr:`self.G`. + """ vtx.data.misc[code.tag.FUNC_START]=1 vtx.data.misc['callers'] = [] self.G.add_vertex(vtx) logger.verbose('root node %s added'%vtx.name) def add_call_node(self,vtx,parent,econd): + """When a (parent) block performs a call, the (vtx) targeted block + will not be linked with its parent but rather will possibly start a + new connected component of the cfg. When the component is declared + as a function, the parent block is linked to a new node that embeds + the function instead. + """ b = vtx.data callers = b.misc['callers'] if callers: @@ -273,9 +381,19 @@ def add_call_node(self,vtx,parent,econd): return vtx def check_func(self,vtx): + """check if vtx node creates a function. In the fforward method + this method does nothing. + """ pass def check_ext_target(self,t): + """check if the target is the address of an external function. + If True, the :class:`code.xfunc` node is linked to the parent + and the spool is updated with this node. 
+ + Returns: + `True` if target is external, `False` otherwise. + """ if t.cst is None: return False if t.cst._is_ext: b = code.xfunc(t.cst) @@ -288,6 +406,16 @@ def check_ext_target(self,t): return False def getcfg(self,loc=None,debug=False): + """The getcfg method is the cfg recovery method of any analysis + class. + + Arguments: + loc (Optional[cst]): the address to start the cfg recovery + (defaults to the program's entrypoint). + debug (bool): A python debugger :func:`set_trace()` call is + emitted at every node added to the cfg. + (Defaults to False.) + """ if debug: import pdb try: for x in self.itercfg(loc): @@ -296,10 +424,18 @@ def getcfg(self,loc=None,debug=False): if debug: pdb.set_trace() return self.G - # generic 'forward' analysis explorer. - # default explore policy is depth-first search (use policy=0 for breadth-first search.) - # return instructions are not followed (see lbackward analysis). def itercfg(self,loc=None): + """A generic *forward* analysis explorer. The default policy + is *depth-first* search (use policy=0 for breadth-first search.) + The ret instructions are not followed (see lbackward analysis). + + Arguments: + loc (Optional[cst]): the address to start the cfg recovery + (defaults to the program's entrypoint). + + Yields: + :class:`cfg.node`: every node added to the graph. + """ G = self.G # spool is the list of (target,parent) addresses to be analysed self.init_spool(loc) @@ -340,13 +476,29 @@ def itercfg(self,loc=None): econd = None # ----------------------------------------------------------------------------- -# link forward based analysis: -# follows PC expression evaluated with parent block mapping. -# Exploration goes forward until expressions are not cst. + class lforward(fforward): + """link forward based analysis: + follows PC expression evaluated with parent block mapping. + Exploration goes forward until expressions are not cst. 
+ """ policy = {'depth-first': True, 'branch-lazy': False} def get_targets(self,node,parent): + """Computes expression of target address in the given node, based + on its parent address and symbolic map, using the architecture's + program counter (PC). + + Arguments: + node: the current node, not yet added to the cfg. + parent: the parent node in the cfg that has targeted the + current node. + + Returns: + :class:`_target`: + the PC expression evaluated from the parent + symbolic map and the current node's map. + """ blk = node.data pc = self.prog.cpu.PC() if parent is None: @@ -359,19 +511,37 @@ def get_targets(self,node,parent): # ----------------------------------------------------------------------------- -# fast backward based analysis: -# a generalisation of link forward where pc is evaluated backwardly by taking -# the first-parent-node path until no parent exists (entry of a function) -# fbackward is the first class to instanciate code.func objects. -# The 'frame_aliasing' policy indicates wether memory aliasing of pc expression -# outside of the function frame can occur or if the frame is assumed to be clean. -# Default frame-aliasing is set to False (assume no aliasing) otherwise any -# function that writes in memory results in potential aliasing (say for an arch -# that uses a memory stack for storing return addresses). class fbackward(lforward): + """fast backward based analysis: + a generalisation of *link forward* where pc is evaluated backward by taking + the *first-parent-node* path until no parent exists (entry of a function). + *fbackward* is the first class to instantiate :class:`code.func` objects. + + Note: + The 'frame-aliasing' policy indicates whether memory aliasing of pc expression + outside of the function frame can occur or if the frame is assumed to be clean. 
+ Default frame-aliasing is set to False (assume no aliasing) otherwise any + function that writes in memory results in potential aliasing (say for an arch + that uses a memory stack for storing return addresses). + """ policy = {'depth-first': True, 'branch-lazy': False, 'frame-aliasing':False} def get_targets(self,node,parent): + """Computes expression of target address in the given node, based + on backward evaluation of all *first-parent* symbolic maps, until the + program counter (PC) expression is a constant or the function entry block + is reached. + + Arguments: + node: the current node, not yet added to the cfg. + parent: the parent node in the cfg that has targeted the + current node. + + Returns: + :class:`_target`: + the PC expression evaluated from composition of + *first-parent-path* symbolic maps. + """ pc = self.prog.cpu.PC() n = node mpc = pc @@ -412,14 +582,22 @@ def get_targets(self,node,parent): # ----------------------------------------------------------------------------- -# link backward based analysis: -# a generalisation of link forward where pc is evaluated by considering all paths -# that link to the current node. class lbackward(fforward): + """link backward based analysis: + a generalisation of *fast forward* where pc is evaluated by considering + **all** paths that link to the current node. + + Note: + This is currently the most advanced strategy for performing cfg recovery + in amoco. + """ policy = {'depth-first': False, 'branch-lazy': False, 'frame-aliasing':False, 'complexity': 30} def check_func(self,node): + """check if the given node creates a function. This overrides the + :meth:`fforward.check_func` method, which does nothing. 
+ """ if node is None: return for t in self.spool: if t.parent in node.c: @@ -443,11 +621,9 @@ def check_func(self,node): else: logger.info('lbackward: function %s done'%f) f.map = m - self.prog.codehelper(func=f) + #self.prog.codehelper(func=f) mpc = f.map(pc) roots = f.view.layout.layers[0] - if len(roots)>1: - logger.verbose('lbackward: multiple entries into function %s ?!'%f) assert len(roots)>0 nroot = roots[0] nroot.data.misc['func'] = f @@ -456,6 +632,7 @@ def check_func(self,node): except (IndexError,TypeError,AttributeError): fsym = 'f' f.name = "%s:%s"%(fsym,nroot.name) + self.prog.codehelper(func=f) for cn in nroot.data.misc['callers']: cnpc = cn.data.map(mpc) fn = cfg.node(f) @@ -466,6 +643,19 @@ def check_func(self,node): self.spool.extend(T) def get_targets(self,node,parent): + """Computes expression of target address in the given node, based + on fast-forward evaluation taking into account the expressions + complexity and frame-aliasing parameters. + + Arguments: + node: the current node, not yet added to the cfg. + parent: the parent node in the cfg that has targeted the + current node. + + Returns: + :class:`_target`: + the PC expression evaluated from current node map. 
+ """ pc = self.prog.cpu.PC() alf = code.mapper.assume_no_aliasing cxl = code.op.threshold diff --git a/amoco/system/core.py b/amoco/system/core.py index fdf5378..ce7e9cc 100644 --- a/amoco/system/core.py +++ b/amoco/system/core.py @@ -373,7 +373,7 @@ def grep(self,pattern): #------------------------------------------------------------------------------ class CoreExec(object): - __slots__ = ['bin','cpu','mmap'] + __slots__ = ['bin','cpu','mmap','symbols'] def __init__(self,p,cpu=None): self.bin = p @@ -382,6 +382,7 @@ def __init__(self,p,cpu=None): self.load_binary() if cpu is not None: cpu.ext.stubs = stubs + self.symbols = {} def initenv(self): return None @@ -528,7 +529,7 @@ def name(self): try: return self.f.name except AttributeError: - return self.f.getvalue() + return '(sc-%s...)'%(self.f.getvalue().encode('hex')[:8]) filename = name diff --git a/amoco/system/elf.py b/amoco/system/elf.py index 4013f1b..28b4e18 100644 --- a/amoco/system/elf.py +++ b/amoco/system/elf.py @@ -5,7 +5,10 @@ # published under GPLv2 license import struct +import pdb +from collections import defaultdict from amoco.logger import * +from amoco.ui.render import Token,highlight logger = Log(__name__) @@ -22,17 +25,80 @@ def __init__(self,message): def __str__(self): return str(self.message) +#------------------------------------------------------------------------------ +# formatting facilities: + +# init of reverse dict to get constant name from value. +# This dict is updated by using 'with' statement of Consts. 
+ELF_CONSTS = defaultdict(dict) + +class Consts(object): + def __init__(self,name): + self.name = name + def __enter__(self): + ELF_CONSTS[self.name] = {} + self.globnames = set(globals().keys()) + def __exit__(self,exc_type,exc_value,traceback): + G = globals() + for k in set(G.keys())-self.globnames: + ELF_CONSTS[self.name][G[k]] = k + +def default_formatter(): + return token_default_fmt + +def token_default_fmt(k,x): + if 'addr' in k: return token_address_fmt(k,x) + if 'flags' in k: return token_flag_fmt(k,x) + return highlight([(Token.Literal,str(x))]) + +def token_address_fmt(k,x): + return highlight([(Token.Address,hex(x))]) + +def token_constant_fmt(k,x): + return highlight([(Token.Constant,str(x))]) + +def token_name_fmt(k,x): + try: + return highlight([(Token.Name,ELF_CONSTS[k][x])]) + except KeyError: + return token_constant_fmt(k,x) + +def token_flag_fmt(k,x): + s = [] + for v,name in ELF_CONSTS[k].items(): + if (x&v): s.append(highlight([(Token.Name,name)])) + return ','.join(s) + +#------------------------------------------------------------------------------ class Elfcore(object): order = '=' #native order pfx = '' + ksz = 12 + fkeys = defaultdict(default_formatter) def set(self,data): S = struct.unpack(self.order+self.fmt,data) self.__dict__.update(zip(self.keys,S)) def pack(self): return struct.pack(self.order+self.fmt,*(getattr(self,k) for k in self.keys)) + @classmethod + def func_formatter(cls,**kargs): + for key,func in kargs.items(): + cls.fkeys[key] = func + @classmethod + def name_formatter(cls,*keys): + for key in keys: + cls.fkeys[key] = token_name_fmt + @classmethod + def flag_formatter(cls,*keys): + for key in keys: + cls.fkeys[key] = token_flag_fmt + def strkey(self,k): + fmt = '%%s%%-%ds:%%s'%self.ksz + return fmt%(self.pfx,k,self.fkeys[k](k,getattr(self,k))) def __str__(self): - return '\n'.join( - ("%s%-12s:%s"%(self.pfx,k,str(getattr(self,k))) for k in self.keys)) + fmt = '%%s%%-%ds:%%s'%self.ksz + s = '\n'.join(self.strkey(k) for k 
in self.keys) + return "[%s]\n%s"%(self.__class__.__name__,s) # The ELF file header. #------------------------------------------------------------------------------ @@ -69,15 +135,21 @@ def __init__(self, data): logger.info("Not a 32-bit ELF file") raise ElfError(self) self.set(data[16:52]) + self.name_formatter('e_type','e_machine','e_version') + self.func_formatter(e_entry=token_address_fmt) + self.func_formatter(e_flags=token_address_fmt) def pack(self): e_ident_s = struct.pack('B3sBBBBBxxxxxxx',*[self.e_ident[k] for k in EI_KEYS]) return e_ident_s+Elfcore.pack(self) + # patched Elfcore str to have entrypoint in hex: def __str__(self): - c = ["%s%-12s:%s"%(self.pfx,k,str(getattr(self,k))) for k in self.keys] - c[3] = "%s%-12s:%s"%(self.pfx,'e_entry',hex(self.e_entry)) - return "%s\n"%self.e_ident + '\n'.join(c) + s = list(Elfcore.__str__(self).partition('\n')) + x = '; '.join([token_name_fmt(k,v) for (k,v) in self.e_ident.iteritems()]) + fmt = '\n%%s%%-%ds:%%s'%self.ksz + s.insert(1,fmt%(self.pfx,'e_ident',x)) + return ''.join(s) # keys of the e_ident field: EI_KEYS = ( 'ELFMAG0', @@ -91,131 +163,137 @@ def __str__(self): # legal values for e_indent: #EI_CLASS values: -ELFCLASSNONE=0 -ELFCLASS32=1 -ELFCLASS64=2 -ELFCLASSNUM=3 +with Consts('EI_CLASS'): + ELFCLASSNONE=0 + ELFCLASS32=1 + ELFCLASS64=2 + ELFCLASSNUM=3 #EI_DATA values: -ELFDATANONE=0 -ELFDATA2LSB=1 -ELFDATA2MSB=2 -ELFDATANUM=3 +with Consts('EI_DATA'): + ELFDATANONE=0 + ELFDATA2LSB=1 + ELFDATA2MSB=2 + ELFDATANUM=3 #EI_OSABI values: -ELFOSABI_NONE=0 -ELFOSABI_SYSV=0 -ELFOSABI_HPUX=1 -ELFOSABI_NETBSD=2 -ELFOSABI_LINUX=3 -ELFOSABI_SOLARIS=6 -ELFOSABI_AIX=7 -ELFOSABI_IRIX=8 -ELFOSABI_FREEBSD=9 -ELFOSABI_TRU64=10 -ELFOSABI_MODESTO=11 -ELFOSABI_OPENBSD=12 -ELFOSABI_ARM=97 -ELFOSABI_STANDALONE=255 +with Consts('EI_OSABI'): + ELFOSABI_NONE=0 + ELFOSABI_SYSV=0 + ELFOSABI_HPUX=1 + ELFOSABI_NETBSD=2 + ELFOSABI_LINUX=3 + ELFOSABI_SOLARIS=6 + ELFOSABI_AIX=7 + ELFOSABI_IRIX=8 + ELFOSABI_FREEBSD=9 + ELFOSABI_TRU64=10 
+ ELFOSABI_MODESTO=11 + ELFOSABI_OPENBSD=12 + ELFOSABI_ARM=97 + ELFOSABI_STANDALONE=255 # legal values for e_type (object file type): -ET_NONE=0 -ET_REL=1 -ET_EXEC=2 -ET_DYN=3 -ET_CORE=4 -ET_NUM=5 -ET_LOOS=0xfe00 -ET_HIOS=0xfeff -ET_LOPROC=0xff00 -ET_HIPROC=0xffff +with Consts('e_type'): + ET_NONE=0 + ET_REL=1 + ET_EXEC=2 + ET_DYN=3 + ET_CORE=4 + ET_NUM=5 + ET_LOOS=0xfe00 + ET_HIOS=0xfeff + ET_LOPROC=0xff00 + ET_HIPROC=0xffff # legal values for e_machine (architecture): -EM_NONE=0 -EM_M32=1 -EM_SPARC=2 -EM_386=3 -EM_68K=4 -EM_88K=5 -EM_860=7 -EM_MIPS=8 -EM_S370=9 -EM_MIPS_RS3_LE=10 - -EM_PARISC=15 -EM_VPP500=17 -EM_SPARC32PLUS=18 -EM_960=19 -EM_PPC=20 -EM_PPC64=21 -EM_S390=22 - -EM_V800=36 -EM_FR20=37 -EM_RH32=38 -EM_RCE=39 -EM_ARM=40 -EM_FAKE_ALPHA=41 -EM_SH=42 -EM_SPARCV9=43 -EM_TRICORE=44 -EM_ARC=45 -EM_H8_300=46 -EM_H8_300H=47 -EM_H8S=48 -EM_H8_500=49 -EM_IA_64=50 -EM_MIPS_X=51 -EM_COLDFIRE=52 -EM_68HC12=53 -EM_MMA=54 -EM_PCP=55 -EM_NCPU=56 -EM_NDR1=57 -EM_STARCORE=58 -EM_ME16=59 -EM_ST100=60 -EM_TINYJ=61 -EM_X86_64=62 -EM_PDSP=63 - -EM_FX66=66 -EM_ST9PLUS=67 -EM_ST7=68 -EM_68HC16=69 -EM_68HC11=70 -EM_68HC08=71 -EM_68HC05=72 -EM_SVX=73 -EM_ST19=74 -EM_VAX=75 -EM_CRIS=76 -EM_JAVELIN=77 -EM_FIREPATH=78 -EM_ZSP=79 -EM_MMIX=80 -EM_HUANY=81 -EM_PRISM=82 -EM_AVR=83 -EM_FR30=84 -EM_D10V=85 -EM_D30V=86 -EM_V850=87 -EM_M32R=88 -EM_MN10300=89 -EM_MN10200=90 -EM_PJ=91 -EM_OPENRISC=92 -EM_ARC_A5=93 -EM_XTENSA=94 -EM_NUM=95 -# unofficial values should pick large index: -EM_ALPHA=0x9026 +with Consts('e_machine'): + EM_NONE=0 + EM_M32=1 + EM_SPARC=2 + EM_386=3 + EM_68K=4 + EM_88K=5 + EM_860=7 + EM_MIPS=8 + EM_S370=9 + EM_MIPS_RS3_LE=10 + + EM_PARISC=15 + EM_VPP500=17 + EM_SPARC32PLUS=18 + EM_960=19 + EM_PPC=20 + EM_PPC64=21 + EM_S390=22 + + EM_V800=36 + EM_FR20=37 + EM_RH32=38 + EM_RCE=39 + EM_ARM=40 + EM_FAKE_ALPHA=41 + EM_SH=42 + EM_SPARCV9=43 + EM_TRICORE=44 + EM_ARC=45 + EM_H8_300=46 + EM_H8_300H=47 + EM_H8S=48 + EM_H8_500=49 + EM_IA_64=50 + EM_MIPS_X=51 + EM_COLDFIRE=52 
+ EM_68HC12=53 + EM_MMA=54 + EM_PCP=55 + EM_NCPU=56 + EM_NDR1=57 + EM_STARCORE=58 + EM_ME16=59 + EM_ST100=60 + EM_TINYJ=61 + EM_X86_64=62 + EM_PDSP=63 + + EM_FX66=66 + EM_ST9PLUS=67 + EM_ST7=68 + EM_68HC16=69 + EM_68HC11=70 + EM_68HC08=71 + EM_68HC05=72 + EM_SVX=73 + EM_ST19=74 + EM_VAX=75 + EM_CRIS=76 + EM_JAVELIN=77 + EM_FIREPATH=78 + EM_ZSP=79 + EM_MMIX=80 + EM_HUANY=81 + EM_PRISM=82 + EM_AVR=83 + EM_FR30=84 + EM_D10V=85 + EM_D30V=86 + EM_V850=87 + EM_M32R=88 + EM_MN10300=89 + EM_MN10200=90 + EM_PJ=91 + EM_OPENRISC=92 + EM_ARC_A5=93 + EM_XTENSA=94 + EM_NUM=95 + # unofficial values should pick large index: + EM_ALPHA=0x9026 # legal values for e_version (version): -EV_NONE=0 -EV_CURRENT=1 -EV_NUM=2 +with Consts('e_version'): + EV_NONE=0 + EV_CURRENT=1 + EV_NUM=2 # Section header: #------------------------------------------------------------------------------ @@ -234,77 +312,81 @@ class Elf32_Shdr(Elfcore): 'sh_entsize') def __init__(self,data): self.set(data[:40]) + self.name_formatter('sh_name','sh_type') + self.func_formatter(sh_addralign=token_constant_fmt) def __str__(self): if hasattr(self,'name'): self.pfx = '%-20s| '% ('<%s>'%self.name) return Elfcore.__str__(self) -SHN_UNDEF=0 -SHN_LORESERVE=0xff00 -SHN_LOPROC=0xff00 -SHN_BEFORE=0xff00 -SHN_AFTER=0xff01 -SHN_HIPROC=0xff1f -SHN_LOOS=0xff20 -SHN_HIOS=0xff3f -SHN_ABS=0xfff1 -SHN_COMMON=0xfff2 -SHN_XINDEX=0xffff -SHN_HIRESERVE=0xffff +with Consts('sh_name'): + SHN_UNDEF=0 + SHN_LORESERVE=0xff00 + SHN_LOPROC=0xff00 + SHN_BEFORE=0xff00 + SHN_AFTER=0xff01 + SHN_HIPROC=0xff1f + SHN_LOOS=0xff20 + SHN_HIOS=0xff3f + SHN_ABS=0xfff1 + SHN_COMMON=0xfff2 + SHN_XINDEX=0xffff + SHN_HIRESERVE=0xffff # legal values for sh_type (section type): -SHT_NULL=0 -SHT_PROGBITS=1 -SHT_SYMTAB=2 -SHT_STRTAB=3 -SHT_RELA=4 -SHT_HASH=5 -SHT_DYNAMIC=6 -SHT_NOTE=7 -SHT_NOBITS=8 -SHT_REL=9 -SHT_SHLIB=10 -SHT_DYNSYM=11 -SHT_INIT_ARRAY=14 -SHT_FINI_ARRAY=15 -SHT_PREINIT_ARRAY=16 -SHT_GROUP=17 -SHT_SYMTAB_SHNDX=18 -SHT_NUM=19 
-SHT_LOOS=0x60000000 -SHT_GNU_HASH=0x6ffffff6 -SHT_GNU_LIBLIST=0x6ffffff7 -SHT_CHECKSUM=0x6ffffff8 -SHT_LOSUNW=0x6ffffffa -SHT_SUNW_move=0x6ffffffa -SHT_SUNW_COMDAT=0x6ffffffb -SHT_SUNW_syminfo=0x6ffffffc -SHT_GNU_verdef=0x6ffffffd -SHT_GNU_verneed=0x6ffffffe -SHT_GNU_versym=0x6fffffff -SHT_HISUNW=0x6fffffff -SHT_HIOS=0x6fffffff -SHT_LOPROC=0x70000000 -SHT_HIPROC=0x7fffffff -SHT_LOUSER=0x80000000 -SHT_HIUSER=0x8fffffff -SHT_legal=[eval(v) for v in filter( (lambda s: s.startswith('SHT_')), dir())] +with Consts('sh_type'): + SHT_NULL=0 + SHT_PROGBITS=1 + SHT_SYMTAB=2 + SHT_STRTAB=3 + SHT_RELA=4 + SHT_HASH=5 + SHT_DYNAMIC=6 + SHT_NOTE=7 + SHT_NOBITS=8 + SHT_REL=9 + SHT_SHLIB=10 + SHT_DYNSYM=11 + SHT_INIT_ARRAY=14 + SHT_FINI_ARRAY=15 + SHT_PREINIT_ARRAY=16 + SHT_GROUP=17 + SHT_SYMTAB_SHNDX=18 + SHT_NUM=19 + SHT_LOOS=0x60000000 + SHT_GNU_HASH=0x6ffffff6 + SHT_GNU_LIBLIST=0x6ffffff7 + SHT_CHECKSUM=0x6ffffff8 + SHT_LOSUNW=0x6ffffffa + SHT_SUNW_move=0x6ffffffa + SHT_SUNW_COMDAT=0x6ffffffb + SHT_SUNW_syminfo=0x6ffffffc + SHT_GNU_verdef=0x6ffffffd + SHT_GNU_verneed=0x6ffffffe + SHT_GNU_versym=0x6fffffff + SHT_HISUNW=0x6fffffff + SHT_HIOS=0x6fffffff + SHT_LOPROC=0x70000000 + SHT_HIPROC=0x7fffffff + SHT_LOUSER=0x80000000 + SHT_HIUSER=0x8fffffff # legal values for sh_flags (section flags): -SHF_WRITE=(1<<0) -SHF_ALLOC=(1<<1) -SHF_EXECINSTR=(1<<2) -SHF_MERGE=(1<<4) -SHF_STRINGS=(1<<5) -SHF_INFO_LINK=(1<<6) -SHF_LINK_ORDER=(1<<7) -SHF_OS_NONCONFORMING=(1<<8) -SHF_GROUP=(1<<9) -SHF_TLS=(1<<10) -SHF_MASKOS=0x0ff00000 -SHF_MASKPROC=0xf0000000 -SHF_ORDERED=(1<<30) -SHF_EXCLUDE=(1<<31) +with Consts('sh_flags'): + SHF_WRITE=(1<<0) + SHF_ALLOC=(1<<1) + SHF_EXECINSTR=(1<<2) + SHF_MERGE=(1<<4) + SHF_STRINGS=(1<<5) + SHF_INFO_LINK=(1<<6) + SHF_LINK_ORDER=(1<<7) + SHF_OS_NONCONFORMING=(1<<8) + SHF_GROUP=(1<<9) + SHF_TLS=(1<<10) + SHF_MASKOS=0x0ff00000 + SHF_MASKPROC=0xf0000000 + SHF_ORDERED=(1<<30) + SHF_EXCLUDE=(1<<31) # section group handling: GRP_COMDAT=0x1 @@ -322,44 +404,59 @@ class 
Elf32_Sym(Elfcore): 'st_shndx') def __init__(self,data): self.set(data[:16]) + self.name_formatter('st_name','st_bind','st_type','st_visibility') def ELF32_ST_BIND(self): return self.st_info>>4 + st_bind = property(ELF32_ST_BIND) def ELF32_ST_TYPE(self): return self.st_info&0xf + st_type = property(ELF32_ST_TYPE) def ELF32_ST_INFO(self,bind,type): self.st_info = bind<<4 + (type&0xf) def ELF32_ST_VISIBILITY(self): return self.st_other&0x03 + st_visibility = property(ELF32_ST_VISIBILITY) + def __str__(self): + s = Elfcore.__str__(self)+'\n' + s += self.strkey('st_bind') + s += self.strkey('st_type') + s += self.strkey('st_visibility') + return s # legal values for elf32_st_bind: -STB_LOCAL=0 -STB_GLOBAL=1 -STB_WEAK=2 -STB_NUM=3 -STB_LOOS=10 -STB_HIOS=12 -STB_LOPROC=13 -STB_HIPROC=15 +with Consts('st_bind'): + STB_LOCAL=0 + STB_GLOBAL=1 + STB_WEAK=2 + STB_NUM=3 + STB_LOOS=10 + STB_HIOS=12 + STB_LOPROC=13 + STB_HIPROC=15 + # legal values for elf32_st_type: -STT_NOTYPE=0 -STT_OBJECT=1 -STT_FUNC=2 -STT_SECTION=3 -STT_FILE=4 -STT_COMMON=5 -STT_TLS=6 -STT_NUM=7 -STT_LOOS=10 -STT_HIOS=12 -STT_LOPROC=13 -STT_HIPROC=15 +with Consts('st_type'): + STT_NOTYPE=0 + STT_OBJECT=1 + STT_FUNC=2 + STT_SECTION=3 + STT_FILE=4 + STT_COMMON=5 + STT_TLS=6 + STT_NUM=7 + STT_LOOS=10 + STT_HIOS=12 + STT_LOPROC=13 + STT_HIPROC=15 + # special index indicating the end end of a chain: STN_UNDEF=0 -STV_DEFAULT=0 -STV_INTERNAL=1 -STV_HIDDEN=2 -STV_PROTECTED=3 +with Consts('st_visibility'): + STV_DEFAULT=0 + STV_INTERNAL=1 + STV_HIDDEN=2 + STV_PROTECTED=3 # Relocations: #------------------------------------------------------------------------------ @@ -368,12 +465,20 @@ class Elf32_Rel(Elfcore): keys = ('r_offset','r_info') def __init__(self,data): self.set(data[:8]) + self.name_formatter('r_type') + self.func_formatter(r_sym=token_address_fmt) def ELF32_R_SYM(self): return self.r_info>>8 + r_sym = property(ELF32_R_SYM) def ELF32_R_TYPE(self): return self.r_info&0xff + r_type = property(ELF32_R_TYPE) 
def ELF32_R_INFO(self,sym,type): self.r_info = sym<<8 + (type&0xff) + def __str__(self): + s = Elfcore.__str__(self)+'\n' + s += self.strkey('r_type') + return s class Elf32_Rela(Elf32_Rel): fmt = 'III' @@ -382,44 +487,44 @@ def __init__(self,data): self.set(data[:12]) #Intel 80386 specific definitions. #i386 relocs. - -R_386_NONE=0 -R_386_32=1 -R_386_PC32=2 -R_386_GOT32=3 -R_386_PLT32=4 -R_386_COPY=5 -R_386_GLOB_DAT=6 -R_386_JMP_SLOT=7 -R_386_RELATIVE=8 -R_386_GOTOFF=9 -R_386_GOTPC=10 -R_386_32PLT=11 -R_386_TLS_TPOFF=14 -R_386_TLS_IE=15 -R_386_TLS_GOTIE=16 -R_386_TLS_LE=17 -R_386_TLS_GD=18 -R_386_TLS_LDM=19 -R_386_16=20 -R_386_PC16=21 -R_386_8=22 -R_386_PC8=23 -R_386_TLS_GD_32=24 -R_386_TLS_GD_PUSH=25 -R_386_TLS_GD_CALL=26 -R_386_TLS_GD_POP=27 -R_386_TLS_LDM_32=28 -R_386_TLS_LDM_PUSH=29 -R_386_TLS_LDM_CALL=30 -R_386_TLS_LDM_POP=31 -R_386_TLS_LDO_32=32 -R_386_TLS_IE_32=33 -R_386_TLS_LE_32=34 -R_386_TLS_DTPMOD32=35 -R_386_TLS_DTPOFF32=36 -R_386_TLS_TPOFF32=37 -R_386_NUM=38 +with Consts('r_type'): + R_386_NONE=0 + R_386_32=1 + R_386_PC32=2 + R_386_GOT32=3 + R_386_PLT32=4 + R_386_COPY=5 + R_386_GLOB_DAT=6 + R_386_JMP_SLOT=7 + R_386_RELATIVE=8 + R_386_GOTOFF=9 + R_386_GOTPC=10 + R_386_32PLT=11 + R_386_TLS_TPOFF=14 + R_386_TLS_IE=15 + R_386_TLS_GOTIE=16 + R_386_TLS_LE=17 + R_386_TLS_GD=18 + R_386_TLS_LDM=19 + R_386_16=20 + R_386_PC16=21 + R_386_8=22 + R_386_PC8=23 + R_386_TLS_GD_32=24 + R_386_TLS_GD_PUSH=25 + R_386_TLS_GD_CALL=26 + R_386_TLS_GD_POP=27 + R_386_TLS_LDM_32=28 + R_386_TLS_LDM_PUSH=29 + R_386_TLS_LDM_CALL=30 + R_386_TLS_LDM_POP=31 + R_386_TLS_LDO_32=32 + R_386_TLS_IE_32=33 + R_386_TLS_LE_32=34 + R_386_TLS_DTPMOD32=35 + R_386_TLS_DTPOFF32=36 + R_386_TLS_TPOFF32=37 + R_386_NUM=38 # Program Segment header: #------------------------------------------------------------------------------ @@ -437,41 +542,44 @@ class Elf32_Phdr(Elfcore): def __init__(self, data): self.set(data[:32]) + self.name_formatter('p_type') # legal values for p_type (segment type): -PT_NULL=0 
-PT_LOAD=1 -PT_DYNAMIC=2 -PT_INTERP=3 -PT_NOTE=4 -PT_SHLIB=5 -PT_PHDR=6 -PT_TLS=7 -PT_NUM=8 -PT_LOOS=0x60000000 -PT_GNU_EH_FRAME=0x6474e550 -PT_GNU_STACK=0x6474e551 -PT_GNU_RELRO=0x6474e552 -PT_LOSUNW=0x6ffffffa -PT_SUNWBSS=0x6ffffffa -PT_SUNWSTACK=0x6ffffffb -PT_HISUNW=0x6fffffff -PT_HIOS=0x6fffffff -PT_LOPROC=0x70000000 -PT_HIPROC=0x7fffffff -PT_legal=[eval(v) for v in filter( (lambda s: s.startswith('PT_')), dir())] +with Consts('p_type'): + PT_NULL=0 + PT_LOAD=1 + PT_DYNAMIC=2 + PT_INTERP=3 + PT_NOTE=4 + PT_SHLIB=5 + PT_PHDR=6 + PT_TLS=7 + PT_NUM=8 + PT_LOOS=0x60000000 + PT_GNU_EH_FRAME=0x6474e550 + PT_GNU_STACK=0x6474e551 + PT_GNU_RELRO=0x6474e552 + PT_LOSUNW=0x6ffffffa + PT_SUNWBSS=0x6ffffffa + PT_SUNWSTACK=0x6ffffffb + PT_HISUNW=0x6fffffff + PT_HIOS=0x6fffffff + PT_LOPROC=0x70000000 + PT_HIPROC=0x7fffffff + # legal values for p_flags (segment flags): -PF_X=(1<<0) -PF_W=(1<<1) -PF_R=(1<<2) -PF_MASKOS=0x0ff00000 -PF_MASKPROC=0xf0000000 +with Consts('p_flags'): + PF_X=(1<<0) + PF_W=(1<<1) + PF_R=(1<<2) + PF_MASKOS=0x0ff00000 + PF_MASKPROC=0xf0000000 # Note Sections : #------------------------------------------------------------------------------ class Elf32_Note(Elfcore): fmt = 'III' - keys = ('namesz','descsz','type') + keys = ('namesz','descsz','n_type') def __init__(self, data): self.set(data[:12]) p = 12+self.namesz @@ -480,24 +588,25 @@ def __init__(self, data): self.desc = data[p:p+self.descsz] # legal values for note segment descriptor types for core files: -NT_PRSTATUS=1 -NT_FPREGSET=2 -NT_PRPSINFO=3 -NT_PRXREG=4 -NT_TASKSTRUCT=4 -NT_PLATFORM=5 -NT_AUXV=6 -NT_GWINDOWS=7 -NT_ASRS=8 -NT_PSTATUS=10 -NT_PSINFO=13 -NT_PRCRED=14 -NT_UTSNAME=15 -NT_LWPSTATUS=16 -NT_LWPSINFO=17 -NT_PRFPXREG=20 - -NT_VERSION=1 +with Consts('n_type'): + NT_PRSTATUS=1 + NT_FPREGSET=2 + NT_PRPSINFO=3 + NT_PRXREG=4 + NT_TASKSTRUCT=4 + NT_PLATFORM=5 + NT_AUXV=6 + NT_GWINDOWS=7 + NT_ASRS=8 + NT_PSTATUS=10 + NT_PSINFO=13 + NT_PRCRED=14 + NT_UTSNAME=15 + NT_LWPSTATUS=16 + NT_LWPSINFO=17 
+ NT_PRFPXREG=20 + + NT_VERSION=1 # Dynamic Section: #------------------------------------------------------------------------------ @@ -506,82 +615,84 @@ class Elf32_Dyn(Elfcore): keys = ('d_tag','d_un') def __init__(self,data): self.set(data[:8]) + self.name_formatter('d_tag','d_un') def DT_VALTAGIDX(self,tag) : self.d_un = DT_VALRNGHI - tag def DT_ADDRTAGIDX(self,tag): self.d_un = DT_ADDRRNGHI - tag # legal values for d_tag (dynamic entry type): -DT_NULL=0 -DT_NEEDED=1 -DT_PLTRELSZ=2 -DT_PLTGOT=3 -DT_HASH=4 -DT_STRTAB=5 -DT_SYMTAB=6 -DT_RELA=7 -DT_RELASZ=8 -DT_RELAENT=9 -DT_STRSZ=10 -DT_SYMENT=11 -DT_INIT=12 -DT_FINI=13 -DT_SONAME=14 -DT_RPATH=15 -DT_SYMBOLIC=16 -DT_REL=17 -DT_RELSZ=18 -DT_RELENT=19 -DT_PLTREL=20 -DT_DEBUG=21 -DT_TEXTREL=22 -DT_JMPREL=23 -DT_BIND_NOW=24 -DT_INIT_ARRAY=25 -DT_FINI_ARRAY=26 -DT_INIT_ARRAYSZ=27 -DT_FINI_ARRAYSZ=28 -DT_RUNPATH=29 -DT_FLAGS=30 -DT_ENCODING=32 -DT_PREINIT_ARRAY=32 -DT_PREINIT_ARRAYSZ=33 -DT_NUM=34 -DT_LOOS=0x6000000d -DT_HIOS=0x6ffff000 -DT_LOPROC=0x70000000 -DT_HIPROC=0x7fffffff +with Consts('d_tag'): + DT_NULL=0 + DT_NEEDED=1 + DT_PLTRELSZ=2 + DT_PLTGOT=3 + DT_HASH=4 + DT_STRTAB=5 + DT_SYMTAB=6 + DT_RELA=7 + DT_RELASZ=8 + DT_RELAENT=9 + DT_STRSZ=10 + DT_SYMENT=11 + DT_INIT=12 + DT_FINI=13 + DT_SONAME=14 + DT_RPATH=15 + DT_SYMBOLIC=16 + DT_REL=17 + DT_RELSZ=18 + DT_RELENT=19 + DT_PLTREL=20 + DT_DEBUG=21 + DT_TEXTREL=22 + DT_JMPREL=23 + DT_BIND_NOW=24 + DT_INIT_ARRAY=25 + DT_FINI_ARRAY=26 + DT_INIT_ARRAYSZ=27 + DT_FINI_ARRAYSZ=28 + DT_RUNPATH=29 + DT_FLAGS=30 + DT_ENCODING=32 + DT_PREINIT_ARRAY=32 + DT_PREINIT_ARRAYSZ=33 + DT_NUM=34 + DT_LOOS=0x6000000d + DT_HIOS=0x6ffff000 + DT_LOPROC=0x70000000 + DT_HIPROC=0x7fffffff # legal values for d_un (union type use here value): -DT_VALRNGLO=0x6ffffd00 -DT_GNU_PRELINKED=0x6ffffdf5 -DT_GNU_CONFLICTSZ=0x6ffffdf6 -DT_GNU_LIBLISTSZ=0x6ffffdf7 -DT_CHECKSUM=0x6ffffdf8 -DT_PLTPADSZ=0x6ffffdf9 -DT_MOVEENT=0x6ffffdfa -DT_MOVESZ=0x6ffffdfb -DT_FEATURE_1=0x6ffffdfc 
-DT_POSFLAG_1=0x6ffffdfd -DT_SYMINSZ=0x6ffffdfe -DT_SYMINENT=0x6ffffdff -DT_VALRNGHI=0x6ffffdff -DT_VALNUM=12 - -# legal values for d_un (union type use here address): -DT_ADDRRNGLO=0x6ffffe00 -DT_GNU_CONFLICT=0x6ffffef8 -DT_GNU_LIBLIST=0x6ffffef9 -DT_CONFIG=0x6ffffefa -DT_DEPAUDIT=0x6ffffefb -DT_AUDIT=0x6ffffefc -DT_PLTPAD=0x6ffffefd -DT_MOVETAB=0x6ffffefe -DT_SYMINFO=0x6ffffeff -DT_ADDRRNGHI=0x6ffffeff -DT_ADDRNUM=10 - -DT_legal=[eval(v) for v in filter( (lambda s: s.startswith('DT_')), dir())] +with Consts('d_un'): + DT_VALRNGLO=0x6ffffd00 + DT_GNU_PRELINKED=0x6ffffdf5 + DT_GNU_CONFLICTSZ=0x6ffffdf6 + DT_GNU_LIBLISTSZ=0x6ffffdf7 + DT_CHECKSUM=0x6ffffdf8 + DT_PLTPADSZ=0x6ffffdf9 + DT_MOVEENT=0x6ffffdfa + DT_MOVESZ=0x6ffffdfb + DT_FEATURE_1=0x6ffffdfc + DT_POSFLAG_1=0x6ffffdfd + DT_SYMINSZ=0x6ffffdfe + DT_SYMINENT=0x6ffffdff + DT_VALRNGHI=0x6ffffdff + DT_VALNUM=12 + + # legal values for d_un (union type use here address): + DT_ADDRRNGLO=0x6ffffe00 + DT_GNU_CONFLICT=0x6ffffef8 + DT_GNU_LIBLIST=0x6ffffef9 + DT_CONFIG=0x6ffffefa + DT_DEPAUDIT=0x6ffffefb + DT_AUDIT=0x6ffffefc + DT_PLTPAD=0x6ffffefd + DT_MOVETAB=0x6ffffefe + DT_SYMINFO=0x6ffffeff + DT_ADDRRNGHI=0x6ffffeff + DT_ADDRNUM=10 + # Version definition sections are not supported. 
#------------------------------------------------------------------------------ @@ -632,7 +743,7 @@ def __init__(self,filename): if not self.basemap: self.basemap = self.Phdr[-1].p_vaddr elif self.Phdr[-1].p_type == PT_DYNAMIC: self.dynamic = True - elif not self.Phdr[-1].p_type in PT_legal: + elif not self.Phdr[-1].p_type in ELF_CONSTS['p_type'].keys(): logger.verbose('invalid segment detected (removed)') self.Phdr.pop() @@ -646,7 +757,7 @@ def __init__(self,filename): for sht in range(n): logger.progress(sht,n,'parsing Shdrs ') S = Elf32_Shdr(data[sht*l:]) - if S.sh_type in SHT_legal: + if S.sh_type in ELF_CONSTS['sh_type'].keys(): self.Shdr.append(S) else: raise StandardError @@ -765,12 +876,14 @@ def readsection(self,sect): else: S = sect if S: - if S.sh_type==SHT_SYMTAB \ - or S.sh_type==SHT_DYNSYM : return self.__read_symtab(S) - elif S.sh_type==SHT_STRTAB : return self.__read_strtab(S) - elif S.sh_type==SHT_REL \ - or S.sh_type==SHT_RELA : return self.__read_relocs(S) - elif S.sh_type==SHT_DYNAMIC : return self.__read_dynamic(S) + if S.sh_type in (SHT_SYMTAB,SHT_DYNSYM): + return self.__read_symtab(S) + elif S.sh_type==SHT_STRTAB : + return self.__read_strtab(S) + elif S.sh_type in (SHT_REL,SHT_RELA): + return self.__read_relocs(S) + elif S.sh_type==SHT_DYNAMIC : + return self.__read_dynamic(S) elif S.sh_type==SHT_PROGBITS: self.__file.seek(S.sh_offset) return self.__file.read(S.sh_size) @@ -778,7 +891,7 @@ def readsection(self,sect): ## def __read_symtab(self,section): - if section.sh_type!=SHT_SYMTAB and section.sh_type!=SHT_DYNSYM : + if section.sh_type not in (SHT_SYMTAB,SHT_DYNSYM) : logger.warning('not a symbol table section') return None # read the section: @@ -808,8 +921,7 @@ def __read_strtab(self,section): ## def __read_relocs(self,section): - if section.sh_type!=SHT_REL \ - and section.sh_type!=SHT_RELA : + if section.sh_type not in (SHT_REL,SHT_RELA) : logger.warning('not a relocation table section') return None 
self.__file.seek(section.sh_offset) @@ -878,12 +990,12 @@ def __variables(self,fltr=None): if self.Shdr[v[2]].name != fltr: D.pop(k) return D - def __symbols(self,type): + def __symbols(self,t): if not self.readsection('.symtab') : return {} D = {} if self.readsection('.strtab'): for sym in self.symtab: - if sym.ELF32_ST_TYPE()==type and sym.st_value: + if sym.st_type==t and sym.st_value: D[sym.st_value] = (self.strtab[sym.st_name], sym.st_size, sym.st_info, @@ -898,12 +1010,11 @@ def __dynamic(self,type=STT_FUNC): D = {} if self.readsection('.dynstr'): for i,s in enumerate(self.Shdr): - if s.sh_type == SHT_REL \ - or s.sh_type == SHT_RELA : + if s.sh_type in (SHT_REL,SHT_RELA): if self.readsection(i): for r in self.reltab: if r.r_offset: - sym = self.symtab[ r.ELF32_R_SYM() ] + sym = self.symtab[ r.r_sym ] D[r.r_offset] = self.strtab[sym.st_name] else: # need to build a fake strtab with our own symbol names: @@ -998,15 +1109,21 @@ def __init__(self, data): logger.warning("not a 64-bit ELF, tried Elf32 ?") raise ElfError(self.e_ident['EI_CLASS']) self.set(data[16:16+struct.calcsize(self.fmt)]) + self.name_formatter('e_type','e_machine','e_version') + self.func_formatter(e_entry=token_address_fmt) + self.func_formatter(e_flags=token_address_fmt) def pack(self): e_ident_s = struct.pack('B3sBBBBBxxxxxxx',*[self.e_ident[k] for k in EI_KEYS]) return e_ident_s+Elfcore.pack(self) # patched Elfcore str to have entrypoint in hex: def __str__(self): - c = ["%s%-12s:%s"%(self.pfx,k,str(getattr(self,k))) for k in self.keys] - c[3] = "%s%-12s:%s"%(self.pfx,'e_entry',hex(self.e_entry)) - return "%s\n"%self.e_ident + '\n'.join(c) + s = list(Elfcore.__str__(self).partition('\n')) + x = '; '.join([token_name_fmt(k,v) for (k,v) in self.e_ident.iteritems()]) + fmt = '\n%%s%%-%ds:%%s'%self.ksz + s.insert(1,fmt%(self.pfx,'e_ident',x)) + return ''.join(s) + # Section header: #------------------------------------------------------------------------------ @@ -1025,6 +1142,7 @@ class 
Elf64_Shdr(Elfcore): 'sh_entsize') def __init__(self,data): self.set(data[:struct.calcsize(self.fmt)]) + self.name_formatter('sh_name','sh_type') def __str__(self): if hasattr(self,'name'): self.pfx = '%-20s| '% ('<%s>'%self.name) @@ -1043,14 +1161,24 @@ class Elf64_Sym(Elfcore): 'st_size') def __init__(self,data): self.set(data[:struct.calcsize(self.fmt)]) + self.name_formatter('st_name','st_bind','st_type','st_visibility') def ELF64_ST_BIND(self): return self.st_info>>4 + st_bind = property(ELF64_ST_BIND) def ELF64_ST_TYPE(self): return self.st_info&0xf + st_type = property(ELF64_ST_TYPE) def ELF64_ST_INFO(self,bind,type): self.st_info = bind<<4 + (type&0xf) def ELF64_ST_VISIBILITY(self): return self.st_other&0x03 + st_visibility = property(ELF64_ST_VISIBILITY) + def __str__(self): + s = Elfcore.__str__(self)+'\n' + s += self.strkey('st_bind') + s += self.strkey('st_type') + s += self.strkey('st_visibility') + return s # Relocations: #------------------------------------------------------------------------------ @@ -1061,10 +1189,16 @@ def __init__(self,data): self.set(data[:struct.calcsize(self.fmt)]) def ELF64_R_SYM(self): return self.r_info>>32 + r_sym = property(ELF64_R_SYM) def ELF64_R_TYPE(self): return self.r_info&0xffffffffL + r_type = property(ELF64_R_TYPE) def ELF64_R_INFO(self,sym,type): self.r_info = sym<<32 + (type&0xffffffffL) + def __str__(self): + s = Elfcore.__str__(self)+'\n' + s += self.strkey('r_type') + return s class Elf64_Rela(Elf64_Rel): fmt = 'QQQ' @@ -1088,12 +1222,13 @@ class Elf64_Phdr(Elfcore): def __init__(self, data): self.set(data[:struct.calcsize(self.fmt)]) + self.name_formatter('p_type') # Note Sections : #------------------------------------------------------------------------------ class Elf64_Note(Elfcore): fmt = 'QQQ' - keys = ('namesz','descsz','type') + keys = ('namesz','descsz','n_type') def __init__(self, data): l = struct.calcsize(self.fmt) self.set(data[:l]) @@ -1162,7 +1297,7 @@ def __init__(self,filename): if not 
self.basemap: self.basemap = self.Phdr[-1].p_vaddr elif self.Phdr[-1].p_type == PT_DYNAMIC: self.dynamic = True - elif not self.Phdr[-1].p_type in PT_legal: + elif not self.Phdr[-1].p_type in ELF_CONSTS['p_type'].keys(): logger.verbose('invalid segment detected (removed)') self.Phdr.pop() @@ -1176,7 +1311,7 @@ def __init__(self,filename): for sht in range(n): logger.progress(sht,n,'parsing Shdrs ') S = Elf64_Shdr(data[sht*l:]) - if S.sh_type in SHT_legal: + if S.sh_type in ELF_CONSTS['sh_type'].keys(): self.Shdr.append(S) else: raise StandardError diff --git a/amoco/system/linux_x64.py b/amoco/system/linux_x64.py index 113474e..3e9bce0 100644 --- a/amoco/system/linux_x64.py +++ b/amoco/system/linux_x64.py @@ -5,7 +5,7 @@ # published under GPLv2 license from amoco.system.core import * -from amoco.code import tag +from amoco.code import tag,xfunc import amoco.arch.x64.cpu_x64 as cpu @@ -15,6 +15,8 @@ class ELF(CoreExec): def __init__(self,p): CoreExec.__init__(self,p,cpu) + self.symbols.update(self.bin.functions) + self.symbols.update(self.bin.variables) # load the program into virtual memory (populate the mmap dict) def load_binary(self): @@ -40,10 +42,12 @@ def load_shlib(self): # lookup in bin if v is associated with a function or variable name: def check_sym(self,v): if v._is_cst: - x = self.bin.functions.get(v.value,None) or self.bin.variables.get(v.value,None) + x = self.symbols.get(v.value,None) if x is not None: - if isinstance(x,str): x=cpu.ext(x,size=64) - else: x=cpu.sym(x[0],v.value,v.size) + if isinstance(x,str): + x=cpu.ext(x,size=64) + else: + x=cpu.sym(x[0],v.value,v.size) return x return None @@ -106,8 +110,8 @@ def seqhelper(self,seq): for op in i.operands: if op._is_mem: if op.a.base is cpu.rbp: - if op.a.disp<0: i.misc[tag.FUNC_ARG]=1 - elif op.a.disp>4: i.misc[tag.FUNC_VAR]=1 + if op.a.disp<0: i.misc[tag.FUNC_VAR]=True + elif op.a.disp>=16: i.misc[tag.FUNC_ARG]=True elif op.a.base._is_cst or (op.a.base is cpu.rip): b = op.a.base if b is cpu.rip: 
b=i.address+i.length @@ -116,7 +120,6 @@ def seqhelper(self,seq): op.a.base=x op.a.disp=0 if i.mnemonic == 'JMP': # PLT jumps: - #i.address = i.address.to_sym('PLT%s'%x) i.misc[tag.FUNC_START]=1 i.misc[tag.FUNC_END]=1 elif op._is_cst: @@ -127,19 +130,11 @@ def seqhelper(self,seq): def blockhelper(self,block): for i in self.seqhelper(block.instr): block.misc.update(i.misc) - def _helper(block,m): - # annotations based on block semantics: - sta,sto = block.support - if m[cpu.mem(cpu.rbp-4,64)] == cpu.rbp: - block.misc[tag.FUNC_START]=1 - if m[cpu.rip]==cpu.mem(cpu.rsp-4,64): - block.misc[tag.FUNC_END]=1 - if m[cpu.mem(cpu.rsp,64)]==sto: - block.misc[tag.FUNC_CALL]=1 - block._helper = _helper + block._helper = block_helper_ return block def funchelper(self,f): + # check single root node: roots = f.cfg.roots() if len(roots)==0: roots = filter(lambda n:n.data.misc[tag.FUNC_START],f.cfg.sV) @@ -147,14 +142,48 @@ def funchelper(self,f): logger.warning("no entry to function %s found"%f) if len(roots)>1: logger.verbose('multiple entries into function %s ?!'%f) + # check start symbol: + elif roots[0].data.address == self.bin.entrypoints[0]: + f.name = '_start' + # get section symbol if any: + f.misc['section'] = section = self.bin.getinfo(f.address.value)[0] rets = f.cfg.leaves() if len(rets)==0: logger.warning("no exit to function %s found"%f) if len(rets)>1: logger.verbose('multiple exits in function %s'%f) for r in rets: - if r.data.misc[tag.FUNC_CALL]: f.misc[tag.FUNC_CALL] += 1 + # export PLT external symbol name: + if section and section.name=='.plt': + if isinstance(r.data,xfunc): f.name = section.name+r.name + if r.data.misc[tag.FUNC_CALL]: + f.misc[tag.FUNC_CALL] += 1 + if f.map: + # check vars & args: should reflect x64 register calling convention + f.misc[tag.FUNC_VAR] = [] + f.misc[tag.FUNC_ARG] = [] + for x in set(f.map.inputs()): + f.misc[tag.FUNC_IN] += 1 + if x._is_mem and x.a.base==cpu.rsp: + if x.a.disp>=8: + f.misc[tag.FUNC_ARG].append(x) + for x in 
set(f.map.outputs()): + if x in (cpu.rsp, cpu.rbp): continue + f.misc[tag.FUNC_OUT] += 1 + if x._is_mem and x.a.base==cpu.rsp: + if x.a.disp<0: + f.misc[tag.FUNC_VAR].append(x) + +def block_helper_(block,m): + # annotations based on block semantics: + sta,sto = block.support + if m[cpu.mem(cpu.rbp-8,64)] == cpu.rbp: + block.misc[tag.FUNC_START]=1 + if m[cpu.rip]==cpu.mem(cpu.rsp-8,64): + block.misc[tag.FUNC_END]=1 + if m[cpu.mem(cpu.rsp,64)]==sto: + block.misc[tag.FUNC_CALL]=1 # HOOKS DEFINED HERE : #---------------------------------------------------------------------------- diff --git a/amoco/system/linux_x86.py b/amoco/system/linux_x86.py index c3ac9fd..2fa89a1 100644 --- a/amoco/system/linux_x86.py +++ b/amoco/system/linux_x86.py @@ -5,7 +5,7 @@ # published under GPLv2 license from amoco.system.core import * -from amoco.code import tag +from amoco.code import tag,xfunc import amoco.arch.x86.cpu_x86 as cpu @@ -15,6 +15,8 @@ class ELF(CoreExec): def __init__(self,p): CoreExec.__init__(self,p,cpu) + self.symbols.update(self.bin.functions) + self.symbols.update(self.bin.variables) # load the program into virtual memory (populate the mmap dict) def load_binary(self): @@ -40,10 +42,12 @@ def load_shlib(self): # lookup in bin if v is associated with a function or variable name: def check_sym(self,v): if v._is_cst: - x = self.bin.functions.get(v.value,None) or self.bin.variables.get(v.value,None) + x = self.symbols.get(v.value,None) if x is not None: - if isinstance(x,str): x=cpu.ext(x,size=32) - else: x=cpu.sym(x[0],v.value,v.size) + if isinstance(x,str): + x=cpu.ext(x,size=32) + else: + x=cpu.sym(x[0],v.value,v.size) return x return None @@ -106,15 +110,14 @@ def seqhelper(self,seq): for op in i.operands: if op._is_mem: if op.a.base is cpu.ebp: - if op.a.disp<0: i.misc[tag.FUNC_ARG]=1 - elif op.a.disp>4: i.misc[tag.FUNC_VAR]=1 + if op.a.disp<0: i.misc[tag.FUNC_VAR]=True + elif op.a.disp>=8: i.misc[tag.FUNC_ARG]=True elif op.a.base._is_cst: x = 
self.check_sym(op.a.base+op.a.disp) if x is not None: op.a.base=x op.a.disp=0 if i.mnemonic == 'JMP': # PLT jumps: - #i.address = i.address.to_sym('PLT%s'%x) i.misc[tag.FUNC_START]=1 i.misc[tag.FUNC_END]=1 elif op._is_cst: @@ -126,21 +129,11 @@ def blockhelper(self,block): for i in self.seqhelper(block.instr): block.misc.update(i.misc) # delayed computation of block.map: - def _helper(block,m): - # update block.misc based on semantics: - sta,sto = block.support - if m[cpu.mem(cpu.ebp-4,32)] == cpu.ebp: - block.misc[tag.FUNC_START]=1 - if m[cpu.eip]==cpu.mem(cpu.esp-4,32): - block.misc[tag.FUNC_END]=1 - if m[cpu.mem(cpu.esp,32)]==sto: - block.misc[tag.FUNC_CALL]=1 - # register the block helper that will be called - # only when the map is computed. - block._helper = _helper + block._helper = block_helper_ return block def funchelper(self,f): + # check single root node: roots = f.cfg.roots() if len(roots)==0: roots = filter(lambda n:n.data.misc[tag.FUNC_START],f.cfg.sV) @@ -148,15 +141,54 @@ def funchelper(self,f): logger.warning("no entry to function %s found"%f) if len(roots)>1: logger.verbose('multiple entries into function %s ?!'%f) + # check _start symbol: + elif roots[0].data.address == self.bin.entrypoints[0]: + f.name = '_start' + # get section symbol if any: + f.misc['section'] = section = self.bin.getinfo(f.address.value)[0] + # check leaves: rets = f.cfg.leaves() if len(rets)==0: logger.warning("no exit to function %s found"%f) if len(rets)>1: logger.verbose('multiple exits in function %s'%f) for r in rets: - if r.data.misc[tag.FUNC_CALL]: f.misc[tag.FUNC_CALL] += 1 + # export PLT external symbol name: + if section and section.name=='.plt': + if isinstance(r.data,xfunc): f.name = section.name+r.name + if r.data.misc[tag.FUNC_CALL]: + f.misc[tag.FUNC_CALL] += 1 + if f.map: + # check vars & args: + f.misc[tag.FUNC_VAR] = [] + f.misc[tag.FUNC_ARG] = [] + for x in set(f.map.inputs()): + f.misc[tag.FUNC_IN] += 1 + if x._is_mem and x.a.base==cpu.esp: + if 
x.a.disp>=4: + f.misc[tag.FUNC_ARG].append(x) + for x in set(f.map.outputs()): + if x in (cpu.esp, cpu.ebp): continue + f.misc[tag.FUNC_OUT] += 1 + if x._is_mem and x.a.base==cpu.esp: + if x.a.disp<0: + f.misc[tag.FUNC_VAR].append(x) + +#---------------------------------------------------------------------------- +# the block helper that will be called +# only when the map is computed. +def block_helper_(block,m): + # update block.misc based on semantics: + sta,sto = block.support + if m[cpu.mem(cpu.ebp-4,32)] == cpu.ebp: + block.misc[tag.FUNC_START]=1 + if m[cpu.eip]==cpu.mem(cpu.esp-4,32): + block.misc[tag.FUNC_END]=1 + if m[cpu.mem(cpu.esp,32)]==sto: + block.misc[tag.FUNC_CALL]=1 + # HOOKS DEFINED HERE : #---------------------------------------------------------------------------- diff --git a/doc/advanced.rst b/doc/advanced.rst new file mode 100644 index 0000000..09df60f --- /dev/null +++ b/doc/advanced.rst @@ -0,0 +1,2 @@ +Advanced features +================= diff --git a/doc/arch.rst b/doc/arch.rst new file mode 100644 index 0000000..0f247e7 --- /dev/null +++ b/doc/arch.rst @@ -0,0 +1,37 @@ +.. _arch: + +The architecture package +======================== + +Supported CPU architectures are implemented in this package as subpackages and all +use the :mod:`arch.core` generic classes. The interface to a CPU used by +:ref:`system <system>` classes is generally provided by a ``cpu_XXX.py`` +module in the CPU subpackage. + +This module shall: + +- provide the CPU *environment* (registers and other internals) +- provide an instance of :class:`core.disassembler` class, which requires to: + + - define the :class:`@ispec` of every instruction for the generic decoder, + - and define the *semantics* of every instruction with :mod:`cas.expressions`. + +- optionally define the output assembly format, and the *GNU as* (or any other) + assembly parser. 
+ +A simple example is provided by the ``arch/arm/v8`` architecture which implements +a model of ARM AArch64: +The interface module is :mod:`arch.arm.cpu_armv8`, which imports everything from +the v8 subpackage. + + +.. automodule:: arch.core + :members: + +.. automodule:: arch.x86.cpu_x86 + :members: + +.. automodule:: arch.arm.cpu_armv8 + :members: + +.. _arch: `The architecture package`_ diff --git a/doc/cas.rst b/doc/cas.rst new file mode 100644 index 0000000..f2229ff --- /dev/null +++ b/doc/cas.rst @@ -0,0 +1,52 @@ +The computer algebra system package +=================================== + +The *computer algebra system* of Amoco is built with the following elements implemented +in module :mod:`cas.expressions`: + +- Constant :class:`cst`, which represents immediate (signed or unsigned) value of fixed size (bitvector), +- Symbol :class:`sym`, a Constant equipped with a reference string (non-external symbol), +- Register :class:`reg`, a fixed size CPU register *location*, +- External :class:`ext`, a reference to an external location (external symbol), +- Floats :class:`cfp`, constant (fixed size) floating-point values, +- Composite :class:`comp`, a bitvector composed of several elements, +- Pointer :class:`ptr`, a memory *location* in a segment, with possible displacement, +- Memory :class:`mem`, a Pointer to represent a value of fixed size in memory, +- Slice :class:`slc`, a bitvector slice of any element, +- Test :class:`tst`, a conditional expression, (see Tests_ below.) +- Operator :class:`uop`, a unary operator expression, +- Operator :class:`op`, a binary operator expression. The list of supported operations is + not fixed although several predefined operators allow to build expressions directly from + Python expressions: say, you don't need to write ``op('+',x,y)``, but can write ``x+y``. 
+ Supported operators are: + + + ``+``, ``-``, ``*`` (multiply low), ``**`` (multiply extended), ``/`` + + ``&``, ``|``, ``^``, ``~`` + + ``==``, ``!=``, ``<=``, ``>=``, ``<``, ``>`` + + ``>>``, ``<<``, ``//`` (arithmetic shift right), ``>>>`` and ``<<<`` (rotations). + + See Operators_ for more details. + +All elements inherit from the :class:`exp` class which defines all default methods/properties. +Common attributes and methods for all elements are: + +- ``size``, a Python integer representing the size in bits, +- ``sf``, the True/False *sign-flag*. +- ``length`` (size/8) +- ``mask`` (1< + In [4]: print p.bin.Ehdr + ELF header: + [Elf32_Ehdr] + e_ident :ELF; ELFOSABI_SYSV; 1; ELFCLASS32; ELFDATA2LSB; 0; 127 + e_type :ET_EXEC + e_machine :EM_386 + e_version :EV_CURRENT + e_entry :0x8048380 + e_phoff :52 + e_shoff :4416 + e_flags :0x0 + e_ehsize :52 + e_phentsize :32 + e_phnum :9 + e_shentsize :40 + e_shnum :30 + e_shstrndx :27 + +If the file uses a supported executable format (currently ``PE`` or ``ELF``) and +targets a supported platform (see :ref:`system` and :ref:`arch` packages), +the returned object is an *abstraction* of the memory mapped program:: + + In [5]: print p.mmap + + + + + + + + + > + + +Note that it is also possible to provide a *raw* bytes +string as input and then manually load the suitable architecture:: + + In [1]: import amoco + In [2]: shellcode = ("\xeb\x16\x5e\x31\xd2\x52\x56\x89\xe1\x89\xf3\x31\xc0\xb0\x0b\xcd" + "\x80\x31\xdb\x31\xc0\x40\xcd\x80\xe8\xe5\xff\xff\xff\x2f\x62\x69" + "\x6e\x2f\x73\x68") + In [3]: p = amoco.system.loader.load_program(shellcode) + amoco.system.loader: WARNING: unknown format + amoco.system.raw: WARNING: a cpu module must be imported + In [4]: from amoco.arch.x86 import cpu_x86 + In [5]: p.cpu = cpu_x86 + In [6]: print p + + In [7]: print p.mmap + > + +The shellcode is loaded at address 0 by default, but can be relocated with:: + + In [8]: p.relocate(0x4000) + In [9]: print p.mmap + > + + +Decoding blocks of 
instructions +=============================== + +Decoding a bytes stream as instruction needs only to load the desired cpu module, for +example:: + + In [10]: cpu_x86.disassemble('\xeb\x16') + Out[10]: + In [11]: print _ + jmp .+22 + +But when a mapped binary program is available, we can start disassembling instructions +or *data* located at virtual addresses:: + + In [12]: print p.read_instruction(p.cpu.cst(0x4000,32)) + jmp *0x4018 + In [13]: p.read_data(p.cpu.cst(0x4000,32),2) + Out[13]: ['\xeb\x16'] + +However, rather than manually adjusting the address to fetch the next instruction, we +can use any of the code analysis strategies implemented in amoco to disassemble +*basic blocks* directly:: + + In [1]: import amoco + In [2]: p = amoco.system.loader.load_program('samples/x86/flow.elf') + In [3]: z = amoco.lsweep(p) + In [4]: z.getblock(0x8048380) + Out[4]: + In [5]: b=_ + In [6]: print b + 0x8048380 '31ed' xor ebp, ebp + 0x8048382 '5e' pop esi + 0x8048383 '89e1' mov ecx, esp + 0x8048385 '83e4f0' and esp, 0xfffffff0 + 0x8048388 '50' push eax + 0x8048389 '54' push esp + 0x804838a '52' push edx + 0x804838b '6810860408' push #__libc_csu_fini + 0x8048390 '68a0850408' push #__libc_csu_init + 0x8048395 '51' push ecx + 0x8048396 '56' push esi + 0x8048397 '68fd840408' push #main + 0x804839c 'e8cfffffff' call *0x8048370 + + + +Symbolic representations of blocks +================================== + +Starting some analysis +====================== + + diff --git a/doc/system.rst b/doc/system.rst new file mode 100644 index 0000000..98762b7 --- /dev/null +++ b/doc/system.rst @@ -0,0 +1,33 @@ +.. _system: + +The system package +================== + + +.. automodule:: system.loader + :members: + +.. automodule:: system.elf + :members: + +.. automodule:: system.pe + :members: + +.. automodule:: system.core + :members: + +.. automodule:: system.raw + :members: + +.. automodule:: system.linux_x86 + :members: + +.. automodule:: system.linux_x64 + :members: + +.. 
automodule:: system.win32 + :members: + +.. automodule:: system.win64 + :members: + diff --git a/doc/ui.rst b/doc/ui.rst new file mode 100644 index 0000000..e24f8ee --- /dev/null +++ b/doc/ui.rst @@ -0,0 +1,2 @@ +The user interface package +========================== diff --git a/tests/samples/x86/test.s b/tests/samples/x86/test.s new file mode 100644 index 0000000..0d3ee65 --- /dev/null +++ b/tests/samples/x86/test.s @@ -0,0 +1,44 @@ +.global main +.intel_syntax noprefix + +.extern getchar +.extern printf + +.section .data +jmpTable: + .long _stub0 + .long _stub1 + .long _stub2 +fmt: .asciz "%x\n" + +.section .text + +main: + call getchar + mov dl, 4 + imul dl + add eax, offset jmpTable + jmp [eax] + .long 3851 + +_stub0: + mov eax, 0 + jmp _end + .long 3851 + +_stub1: + mov eax, 1 + jmp _end + .long 3851 + +_stub2: + mov eax, 2 + jmp _end + .long 3851 + +_end: + push eax + push offset fmt + call printf + add esp, 8 + ret diff --git a/tests/test_arch_x86.py b/tests/test_arch_x86.py index b42d94d..53724e5 100644 --- a/tests/test_arch_x86.py +++ b/tests/test_arch_x86.py @@ -229,6 +229,16 @@ def test_decoder_028(): op1,op2 = i.operands assert op1.size==op2.size==128 +def test_pickle_instruction(): + import pickle + pickler = lambda x: pickle.dumps(x,2) + c = '\xff\x9c\xc3\x88\x67\xeb\x01' + i = cpu.disassemble(c) + i.address = cst(0x1000,32) + p = pickler(i) + j = pickle.loads(p) + assert str(j)==str(i) + #------------------------------------------------------------------------------ diff --git a/tests/test_cas_exp.py b/tests/test_cas_exp.py index 3aa3824..f63c9ce 100644 --- a/tests/test_cas_exp.py +++ b/tests/test_cas_exp.py @@ -1,5 +1,5 @@ import pytest - +import pickle from amoco.cas.expressions import * def test_cst(): @@ -86,7 +86,7 @@ def test_op(a,b): assert e.r.v == 0xffffffffL assert e.r.sf == True -def test_op_slc(a,b): +def test_op1_slc(a,b): e = a^b assert e[8:16] == a[8:16]^b[8:16] e = composer([a[0:8],b[0:8]]) @@ -95,6 +95,14 @@ def test_op_slc(a,b): x 
= x.simplify() assert x._is_reg and x==a[0:8] +def test_op2_slc(a,b): + x = (a**b) + assert x.size == 64 + y = x[0:32] + assert y.size == 32 + z = y.simplify() + assert z._is_slc and z==y + def test_ptr(a): p = ptr(a) q = ptr(a,disp=17) @@ -138,8 +146,81 @@ def test_vecw(): assert v4.depth()==float('inf') assert v3[8:16].l == v4[8:16].l +def test_mem_vec(): + x = [cst(n) for n in range(5)] + v1 = vec(x) + z = mem(v1,8)[0:4] + s = z.simplify() + assert s._is_vec + assert s.l[0] == mem(x[0],8)[0:4] + def test_top(r): t = top(8) assert t+3 == t assert t^r[0:8] == t assert (t==3) == top(1) + +def pickler(obj): + return pickle.dumps(obj,pickle.HIGHEST_PROTOCOL) + +def test_pickle_cst(): + x = cst(0x1,32) + p = pickler(x) + y = pickle.loads(p) + assert x==y + +def test_pickle_sym(): + x = sym('one',0x1,32) + p = pickler(x) + y = pickle.loads(p) + assert y.ref=='one' + assert y.v==1 + +def test_pickle_reg(a): + p = pickler(a) + y = pickle.loads(p) + assert a==y + +def test_pickle_ext(): + x = ext('a',size=32) + p = pickler(x) + y = pickle.loads(p) + assert x==y + +def test_pickle_cmp(a): + b = cst(0x1,16) + p = pickler(composer([a[0:16],b])) + y = pickle.loads(p) + assert y[16:32]==b + +def test_pickle_mem(a): + p = pickler(mem(a,8)) + y = pickle.loads(p) + assert y._is_mem + assert y.a.base==a + +def test_pickle_slc(a): + p = pickler(a[8:16]) + y = pickle.loads(p) + assert a[8:16]==y + +def test_pickle_tst(a,b): + p = pickler(tst(a==b,a+b,a^b)) + y = pickle.loads(p) + assert y._is_tst + assert y.tst == (a==b) + assert y.l == a+b + assert y.r == a^b + +def test_pickle_uop(a): + p = pickler(-a) + y = pickle.loads(p) + assert a==-y + +def test_pickle_vec(a,b): + p = pickler(vec([a,-b])) + y = pickle.loads(p) + assert y._is_vec + assert y.l[0] == a + assert y.l[1] == -b + diff --git a/tests/test_cas_mapper.py b/tests/test_cas_mapper.py index a87af48..cb4291e 100644 --- a/tests/test_cas_mapper.py +++ b/tests/test_cas_mapper.py @@ -147,3 +147,20 @@ def 
test_merge(m,r,w,x,y,a,b): assert mm4w._is_vec assert w in mm4w assert 0x1000 in mm4w + +def test_pickle_mapper(a,m): + from pickle import dumps,loads,HIGHEST_PROTOCOL + pickler = lambda x: dumps(x,HIGHEST_PROTOCOL) + x = cst(0x1,32) + m[a] = a+3 + m[mem(a,8)] = x[0:8] + m.conds.append(a==0) + p = pickler(m) + w = loads(p) + assert w.conds[0]==(a==0) + assert w(a)==(a+3) + M = w.memory() + parts = M.read(ptr(w(a)),1) + assert len(parts)==1 + assert parts[0]==x[0:8] + diff --git a/tests/test_code.py b/tests/test_code.py new file mode 100644 index 0000000..bc96a4a --- /dev/null +++ b/tests/test_code.py @@ -0,0 +1,34 @@ +import pytest +from amoco.main import * + +from pickle import dumps,loads,HIGHEST_PROTOCOL +pickler = lambda x: dumps(x,HIGHEST_PROTOCOL) + +def test_block(sc1): + p = system.loader.load_program(sc1) + p.use_x86() + z = lsweep(p) + ib = z.iterblocks() + b0 = next(ib) + b1 = next(ib) + b1.map + # test pickle block: + x = pickler(b0) + y = pickler(b1) + X = loads(x) + assert len(X.instr)==1 + Y = loads(y) + assert Y.map.inputs()[0]==code.mem(code.reg('esp',32),32) + +def test_func(ploop): + p = system.loader.load_program(ploop) + z = lbackward(p) + z.getcfg(code.cst(0x804849d,32)) + f = z.functions()[0] + s = cfg.signature(f.cfg) + sig = '{[(c+veFj)] [(cvjl?Fa)(cvj?Fa)(-crlF)]}' + assert s == sig + # test pickle func: + x = pickler(f) + y = loads(x) + assert cfg.signature(y.cfg) == sig diff --git a/tests/test_main_db.py b/tests/test_main_db.py index 5e8341d..1f7af9b 100644 --- a/tests/test_main_db.py +++ b/tests/test_main_db.py @@ -17,46 +17,3 @@ def blocks(prog): b1 = next(ib) return (b0,b1) -def test_001_checkmaps(prog,blocks,m): - p = prog - b0,b1 = blocks - m[p.cpu.eip] = p.cpu.cst(0,32) - m01 = (b0.map>>b1.map) - m10 = (b1.map<>= b0.map - m >>= b1.map - assert m[p.cpu.edi] == 4 - assert m[p.cpu.eax] == 0 - assert m[p.cpu.ebx] == 0 - t = m(p.cpu.eip).simplify() - x = p.cpu.mem(p.cpu.esp,32) - assert t == x - -def 
test_002_session(tmpdir,prog,blocks): - p = prog - b0,b1 = blocks - sfile = str(tmpdir.dirpath('amoco-session')) - # create and commit: - S = Session(sfile) - S.add('m0',b0.map) - S.commit() - S.add('b1',b1) - S.add('p',p) - S.commit() - S.db.close() - # open existing and get: - s = Session(sfile) - if s.root: - pcopy = s.root.get('p').build() - b1copy = s.root.get('b1').build(pcopy.cpu) - assert b1 == b1copy - -def test_003_pickle(blocks): - import pickle - b0,b1 = blocks - i = b0.instr[0] - si = pickle.dumps(i,2) - ii = pickle.loads(si) - assert ii.mnemonic == 'PUSH' - assert ii.operands[0]==0x0 and ii.length==2 diff --git a/tests/test_system_core.py b/tests/test_system_core.py index cb7dc33..f0f4bbf 100644 --- a/tests/test_system_core.py +++ b/tests/test_system_core.py @@ -67,3 +67,15 @@ def test_memory_004(M,sc1,p,y): c.setendian(+1) exp.setendian(+1) +def test_pickle_memorymap(a,m): + from pickle import dumps,loads,HIGHEST_PROTOCOL + pickler = lambda x: dumps(x,HIGHEST_PROTOCOL) + m[mem(a,32)] = cst(0xcafebabe,32) + p = pickler(m.memory()) + M = loads(p) + parts = M.read(ptr(a+1),2) + assert len(parts)==1 + assert parts[0]==cst(0xfeba,16) + + + diff --git a/tests/test_system_loader.py b/tests/test_system_loader.py index 29391f8..d1d255a 100644 --- a/tests/test_system_loader.py +++ b/tests/test_system_loader.py @@ -6,3 +6,6 @@ def test_loader_001(samples): for f in samples: p = amoco.system.loader.load_program(f) +def test_loader_002(sc1): + p = amoco.system.loader.load_program(sc1) + assert p.bin.filename == '(sc-eb165e31...)' diff --git a/tests/test_system_raw.py b/tests/test_system_raw.py index ffaffaa..6a6ae1a 100644 --- a/tests/test_system_raw.py +++ b/tests/test_system_raw.py @@ -8,3 +8,6 @@ def test_raw_001(samples): if f[-4:]=='.raw': p = RawExec(DataIO(file(f,'rb'))) +def test_raw_002(sc1): + p = RawExec(DataIO(sc1)) +