Memory Management · 2015-12-01

【Linux Memory Management】SLUB Allocation Algorithm (5)

The previous articles analyzed cache creation and object allocation in the SLUB allocator; this one continues with how the allocator reclaims (frees) objects.

The interface for freeing an object in the SLUB allocator is kmem_cache_free():

【file:/mm/slub.c】
void kmem_cache_free(struct kmem_cache *s, void *x)
{
    s = cache_from_obj(s, x);
    if (!s)
        return;
    slab_free(s, virt_to_head_page(x), x, _RET_IP_);
    trace_kmem_cache_free(_RET_IP_, x);
}

 

In this function, cache_from_obj() is used to obtain the kmem_cache that the object being freed belongs to, slab_free() performs the actual release of the object, and trace_kmem_cache_free() records a trace event for the free.

Let's look at the implementation of cache_from_obj():

【file:/mm/slab.h】
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
    struct kmem_cache *cachep;
    struct page *page;

    /*
     * When kmemcg is not being used, both assignments should return the
     * same value. but we don't want to pay the assignment price in that
     * case. If it is not compiled in, the compiler should be smart enough
     * to not do even the assignment. In that case, slab_equal_or_root
     * will also be a constant.
     */
    if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
        return s;

    page = virt_to_head_page(x); 
    cachep = page->slab_cache;
    if (slab_equal_or_root(cachep, s)) 
        return cachep;

    pr_err("%s: Wrong slab cache. %s but object is from %s\n",
        __FUNCTION__, cachep->name, s->name);
    WARN_ON_ONCE(1);
    return s;
}

 

Although the kmem_cache was already passed in as a parameter to kmem_cache_free(), it is re-derived here. The reason is that the page structure obtained from the object's address via virt_to_head_page() is far more trustworthy than the value supplied by the caller. So the function first tests if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)), i.e. whether memcg kmem accounting is disabled and the cache does not have SLAB_DEBUG_FREE set; if both hold, the caller's kmem_cache is trusted and returned directly. Otherwise, virt_to_head_page() is used to obtain the page management structure from the object address, and slab_equal_or_root() checks whether the kmem_cache passed by the caller matches the cache the freed object actually belongs to. If they match, the kmem_cache derived from the object is returned; otherwise a warning is printed and, as a fallback, the caller's kmem_cache is returned.
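
To make this concrete, below is a minimal userspace analogy (not kernel code; fake_cache, slab_header, SLAB_BYTES and slab_alloc are invented for the sketch). A per-slab header standing in for page->slab_cache records the owning cache, and the owner is recovered from any object address by rounding down to the slab boundary, mirroring what virt_to_head_page() plus page->slab_cache achieve.

/* Userspace analogy of cache_from_obj(): the owning cache is recovered
 * from the object address itself, so it can be cross-checked against
 * the cache the caller claims the object belongs to. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define SLAB_BYTES 4096UL                         /* one "page" per slab */

struct fake_cache { const char *name; };

struct slab_header { struct fake_cache *owner; }; /* plays the role of page->slab_cache */

static void *slab_alloc(struct fake_cache *c, void **slab_out)
{
    void *slab = aligned_alloc(SLAB_BYTES, SLAB_BYTES);
    ((struct slab_header *)slab)->owner = c;
    *slab_out = slab;
    return (char *)slab + sizeof(struct slab_header); /* first object follows the header */
}

static struct fake_cache *cache_from_obj(struct fake_cache *claimed, void *x)
{
    /* round the object address down to the slab base: the virt_to_head_page() analogue */
    struct slab_header *h = (struct slab_header *)((uintptr_t)x & ~(SLAB_BYTES - 1));
    if (h->owner == claimed)
        return h->owner;                          /* normal case: caller and slab agree */
    fprintf(stderr, "wrong slab cache: %s but object is from %s\n",
            claimed->name, h->owner->name);
    return claimed;                               /* like the kernel, fall back to the caller's cache */
}

int main(void)
{
    struct fake_cache a = { "cache-a" }, b = { "cache-b" };
    void *slab;
    void *obj = slab_alloc(&a, &slab);
    printf("claimed cache-a -> %s\n", cache_from_obj(&a, obj)->name); /* cache-a */
    printf("claimed cache-b -> %s\n", cache_from_obj(&b, obj)->name); /* warns, returns cache-b */
    free(slab);
    return 0;
}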

Let's analyze slab_free(), the function that implements SLUB object freeing, in detail:

【file:/mm/slub.c】
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 */
static __always_inline void slab_free(struct kmem_cache *s,
            struct page *page, void *x, unsigned long addr)
{
    void **object = (void *)x;
    struct kmem_cache_cpu *c;
    unsigned long tid;

    slab_free_hook(s, x);

redo:
    /*
     * Determine the currently cpus per cpu slab.
     * The cpu may change afterward. However that does not matter since
     * data is retrieved via this pointer. If we are on the same cpu
     * during the cmpxchg then the free will succedd.
     */
    preempt_disable();
    c = __this_cpu_ptr(s->cpu_slab);

    tid = c->tid;
    preempt_enable();

    if (likely(page == c->page)) {
        set_freepointer(s, object, c->freelist);

        if (unlikely(!this_cpu_cmpxchg_double(
                s->cpu_slab->freelist, s->cpu_slab->tid,
                c->freelist, tid,
                object, next_tid(tid)))) {

            note_cmpxchg_failure("slab_free", s, tid);
            goto redo;
        }
        stat(s, FREE_FASTPATH);
    } else
        __slab_free(s, page, x, addr);

}

 

The function starts with the release hook slab_free_hook(), whose main job is to remove the object from kmemleak tracking. The redo label marks the point to restart from if the task is preempted and migrates to another CPU in the middle of the operation. Inside redo, preempt_disable() first disables preemption, __this_cpu_ptr() then fetches the local CPU's kmem_cache_cpu management structure together with its transaction id (tid), and preempt_enable() re-enables preemption. If likely(page == c->page), i.e. the object being freed belongs to the local CPU's current slab, set_freepointer() writes the current freelist head into the free-pointer field that trails the object, and then, just as on the allocation side, this_cpu_cmpxchg_double() atomically installs the object as the new freelist head; if that fails because the freelist or tid changed underneath us, note_cmpxchg_failure() is recorded and the code jumps back to redo to retry. If the object does not belong to the local CPU's slab, the fast path cannot be used and the object is handed to the slow path, __slab_free().
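
A rough userspace sketch of the fastpath idea follows (hypothetical names, not the kernel code): the freelist head and a transaction id are updated together by a single atomic compare-and-exchange, and the operation is retried when a concurrent update is detected, which is what the redo loop around this_cpu_cmpxchg_double() achieves. Here the two values are packed into one 64-bit word, whereas the real code uses a double-word cmpxchg on the per-CPU (freelist, tid) pair.

/* Userspace sketch of the slab_free() fastpath: a lock-free LIFO whose
 * head is guarded by a transaction id so a racing update is detected
 * and the free is retried. Hypothetical names; the tid and the object
 * index are packed into one 64-bit word here. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define NOBJ 8                           /* NOBJ also serves as the "empty" marker */
static uint32_t next_of[NOBJ];           /* per-object free pointer, kept in a side array */
static _Atomic uint64_t head;            /* high 32 bits: tid, low 32 bits: first free object */

static void fast_free(uint32_t idx)
{
    uint64_t old, newval;
    do {
        old = atomic_load(&head);
        uint32_t tid   = (uint32_t)(old >> 32);
        uint32_t first = (uint32_t)old;
        next_of[idx] = first;            /* set_freepointer(): chain the object in front of the old head */
        newval = ((uint64_t)(tid + 1) << 32) | idx;  /* next_tid(): bump the transaction id */
    } while (!atomic_compare_exchange_weak(&head, &old, newval)); /* on a race, retry like the redo: label */
}

int main(void)
{
    atomic_store(&head, (uint64_t)NOBJ); /* empty freelist, tid 0 */
    fast_free(3);
    fast_free(5);
    uint64_t h = atomic_load(&head);
    printf("freelist head: obj %u, tid %u\n", (uint32_t)h, (uint32_t)(h >> 32)); /* obj 5, tid 2 */
    return 0;
}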

Next, let's analyze the implementation of __slab_free():

【file:/mm/slub.c】
/*
 * Slow patch handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial page
 * handling required then we can return immediately.
 */
static void __slab_free(struct kmem_cache *s, struct page *page,
            void *x, unsigned long addr)
{
    void *prior;
    void **object = (void *)x;
    int was_frozen;
    struct page new;
    unsigned long counters;
    struct kmem_cache_node *n = NULL;
    unsigned long uninitialized_var(flags);

    stat(s, FREE_SLOWPATH);

    if (kmem_cache_debug(s) &&
        !(n = free_debug_processing(s, page, x, addr, &flags)))
        return;

    do {
        if (unlikely(n)) {
            spin_unlock_irqrestore(&n->list_lock, flags);
            n = NULL;
        }
        prior = page->freelist;
        counters = page->counters;
        set_freepointer(s, object, prior);
        new.counters = counters;
        was_frozen = new.frozen;
        new.inuse--;
        if ((!new.inuse || !prior) && !was_frozen) {

            if (kmem_cache_has_cpu_partial(s) && !prior) {

                /*
                 * Slab was on no list before and will be
                 * partially empty
                 * We can defer the list move and instead
                 * freeze it.
                 */
                new.frozen = 1;

            } else { /* Needs to be taken off a list */

                            n = get_node(s, page_to_nid(page));
                /*
                 * Speculatively acquire the list_lock.
                 * If the cmpxchg does not succeed then we may
                 * drop the list_lock without any processing.
                 *
                 * Otherwise the list_lock will synchronize with
                 * other processors updating the list of slabs.
                 */
                spin_lock_irqsave(&n->list_lock, flags);

            }
        }

    } while (!cmpxchg_double_slab(s, page,
        prior, counters,
        object, new.counters,
        "__slab_free"));

    if (likely(!n)) {

        /*
         * If we just froze the page then put it onto the
         * per cpu partial list.
         */
        if (new.frozen && !was_frozen) {
            put_cpu_partial(s, page, 1);
            stat(s, CPU_PARTIAL_FREE);
        }
        /*
         * The list lock was not taken therefore no list
         * activity can be necessary.
         */
                if (was_frozen)
                        stat(s, FREE_FROZEN);
                return;
        }

    if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
        goto slab_empty;

    /*
     * Objects left in the slab. If it was not on the partial list before
     * then add it.
     */
    if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
        if (kmem_cache_debug(s))
            remove_full(s, n, page);
        add_partial(n, page, DEACTIVATE_TO_TAIL);
        stat(s, FREE_ADD_PARTIAL);
    }
    spin_unlock_irqrestore(&n->list_lock, flags);
    return;

slab_empty:
    if (prior) {
        /*
         * Slab on the partial list.
         */
        remove_partial(n, page);
        stat(s, FREE_REMOVE_PARTIAL);
    } else {
        /* Slab must be on the full list */
        remove_full(s, n, page);
    }

    spin_unlock_irqrestore(&n->list_lock, flags);
    stat(s, FREE_SLAB);
    discard_slab(s, page);
}

 

The function begins with if (kmem_cache_debug(s) && !(n = free_debug_processing(s, page, x, addr, &flags))): kmem_cache_debug() tests whether debugging is enabled for this cache, and if so free_debug_processing() runs the debug checks and returns the validated kmem_cache_node that manages this slab; if the checks fail it returns NULL and the free is aborted. Then the do-while loop is entered. At the top of each iteration, if n is not NULL (either because free_debug_processing() returned with the node's list_lock held, or because a previous iteration took the lock speculatively before the cmpxchg failed), the lock is released and n is cleared. The loop then snapshots the slab's freelist and counters, set_freepointer() writes the old freelist head into the freed object's free-pointer field, and the copy of the counters has its inuse count decremented.
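
The "free pointer trailing the object" can be illustrated with a tiny userspace sketch (simplified, with made-up names): the pointer to the next free object is stored inside the freed object's own memory at a fixed offset, which is essentially what set_freepointer()/get_freepointer() do with s->offset in the real code.

/* Userspace sketch of SLUB's free-object chaining: the "next free"
 * pointer lives inside the freed object itself at a fixed offset.
 * Names and layout are simplified. */
#include <stdio.h>
#include <string.h>

#define OBJ_SIZE  64
#define FP_OFFSET 0                      /* where the free pointer sits inside a free object */

static void set_freepointer(void *object, void *next)
{
    memcpy((char *)object + FP_OFFSET, &next, sizeof(next));
}

static void *get_freepointer(void *object)
{
    void *next;
    memcpy(&next, (char *)object + FP_OFFSET, sizeof(next));
    return next;
}

int main(void)
{
    static char slab[4 * OBJ_SIZE];      /* a slab holding four objects */
    void *freelist = NULL;

    /* free objects 0 and 2: each freed object points at the previous freelist head */
    for (int i = 0; i < 4; i += 2) {
        void *obj = slab + i * OBJ_SIZE;
        set_freepointer(obj, freelist);
        freelist = obj;
    }

    /* walk the freelist the way on_freelist() would */
    for (void *p = freelist; p; p = get_freepointer(p))
        printf("free object at offset %ld\n", (long)((char *)p - slab));
    return 0;
}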

Next comes if ((!new.inuse || !prior) && !was_frozen): if after this free the slab has no objects in use, or its freelist was previously empty (prior == NULL, meaning the slab was full), and the slab is not frozen (i.e. it is not held in any CPU's per-CPU slab cache), then list handling will be needed: an empty slab may be given back to the buddy allocator, and a previously-full slab has to be put on a partial list. Within that, if (kmem_cache_has_cpu_partial(s) && !prior) checks whether per-CPU partial lists are available and the slab was previously full; if so, new.frozen is set so the slab can later be placed on the per-CPU partial list. Otherwise the slab will have to be moved between node lists, so get_node() fetches the node's kmem_cache_node and spin_lock_irqsave() speculatively takes its list_lock. Finally cmpxchg_double_slab() atomically installs the new freelist and counters for the slab; if it fails, the loop retries from the top.

Next, if (likely(!n)) handles the case where the node's list_lock was never taken: if (new.frozen && !was_frozen), this free just froze the slab, so put_cpu_partial() places it on the per-CPU partial list and stat() updates the CPU_PARTIAL_FREE statistic; if (was_frozen), the slab was already frozen and only the FREE_FROZEN statistic needs updating. In either case the function returns here.

Then if (unlikely(!new.inuse && n->nr_partial > s->min_partial)): if the slab now has no objects in use and the node already holds more partial slabs than the s->min_partial threshold, the slab should be released, so control jumps to the slab_empty label to free it.

Otherwise, if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)): the slab was full and, because of this free, is now only partially used (some objects are still allocated), so remove_full() takes it off the full list (when debugging is enabled) and add_partial() adds it to the tail of the node's partial list.

Finally, spin_unlock_irqrestore() releases the list_lock and restores the saved interrupt state.

As for the release flow under the slab_empty label: depending on whether the slab still had free objects before this free (prior), the page is removed from the node's partial or full list as appropriate, spin_unlock_irqrestore() drops the lock, and the slab is finally released through discard_slab().
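
The slow-path decisions described in the last few paragraphs can be condensed into a small decision function. The sketch below paraphrases the branch outcomes in userspace C; it is not the kernel logic verbatim and all names are made up.

/* Decision-function paraphrase of the __slab_free() slow path.
 * Hypothetical names; logic restated from the branches above. */
#include <stdio.h>
#include <stdbool.h>

struct slab_state {
    int  inuse;            /* objects still allocated after this free */
    bool had_free;         /* prior != NULL: the slab already had free objects */
    bool was_frozen;       /* slab is owned by some CPU's slab cache */
    bool has_cpu_partial;  /* kmem_cache_has_cpu_partial(s) */
    int  nr_partial;       /* node's current count of partial slabs */
    int  min_partial;      /* s->min_partial */
};

static const char *slow_free_outcome(struct slab_state st)
{
    if (st.was_frozen)
        return "slab is frozen (owned by a CPU): only statistics are updated";
    if (st.has_cpu_partial && !st.had_free)
        return "slab was full: freeze it and put it on the per-CPU partial list";
    if (st.inuse == 0 && st.nr_partial > st.min_partial)
        return "slab is now empty and the node has enough partial slabs: discard it";
    if (!st.has_cpu_partial && !st.had_free)
        return "slab was full: move it from the full list to the node partial list";
    return "slab stays on whatever list it is already on";
}

int main(void)
{
    struct slab_state st = { .inuse = 9, .had_free = false, .was_frozen = false,
                             .has_cpu_partial = true, .nr_partial = 3, .min_partial = 5 };
    printf("%s\n", slow_free_outcome(st));  /* freeze it and put it on the per-CPU partial list */
    return 0;
}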

With that function covered, let's now focus on the handling in free_debug_processing() and the implementation of discard_slab().

【file:/mm/slub.c】
static noinline struct kmem_cache_node *free_debug_processing(
    struct kmem_cache *s, struct page *page, void *object,
    unsigned long addr, unsigned long *flags)
{
    struct kmem_cache_node *n = get_node(s, page_to_nid(page));

    spin_lock_irqsave(&n->list_lock, *flags);
    slab_lock(page);

    if (!check_slab(s, page))
        goto fail;

    if (!check_valid_pointer(s, page, object)) {
        slab_err(s, page, "Invalid object pointer 0x%p", object);
        goto fail;
    }

    if (on_freelist(s, page, object)) {
        object_err(s, page, object, "Object already free");
        goto fail;
    }

    if (!check_object(s, page, object, SLUB_RED_ACTIVE))
        goto out;

    if (unlikely(s != page->slab_cache)) {
        if (!PageSlab(page)) {
            slab_err(s, page, "Attempt to free object(0x%p) "
                "outside of slab", object);
        } else if (!page->slab_cache) {
            printk(KERN_ERR
                "SLUB <none>: no slab for object 0x%p.\n",
                        object);
            dump_stack();
        } else
            object_err(s, page, object,
                    "page slab pointer corrupt.");
        goto fail;
    }

    if (s->flags & SLAB_STORE_USER)
        set_track(s, object, TRACK_FREE, addr);
    trace(s, page, object, 0);
    init_object(s, object, SLUB_RED_INACTIVE);
out:
    slab_unlock(page);
    /*
     * Keep node_lock to preserve integrity
     * until the object is actually freed
     */
    return n;

fail:
    slab_unlock(page);
    spin_unlock_irqrestore(&n->list_lock, *flags);
    slab_fix(s, "Object at 0x%p not freed", object);
    return NULL;
}

 

The main checks performed by this debug routine are: check_slab() verifies that the kmem_cache and the slab information recorded in the page are consistent, a mismatch suggesting corruption or stale data; check_valid_pointer() verifies that the object address is legal, i.e. that it is exactly the start of some object rather than a location in the middle of one; on_freelist() checks whether the object is already on the freelist, to catch double frees; check_object() performs integrity checks on the object's memory according to the SLAB_RED_ZONE and SLAB_POISON flags. The if (unlikely(s != page->slab_cache)) test makes sure the kmem_cache passed by the caller matches the cache that owns the page, and logs an error otherwise. In addition, if the SLAB_STORE_USER flag is set, set_track() records tracking information for this free. Finally, trace() records the object's trace information and init_object() re-initializes (re-poisons) the object. The out and fail labels at the end handle the aftermath of the checks succeeding or failing, respectively.
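
As an illustration of the poisoning that check_object() relies on when SLAB_POISON is enabled, here is a toy userspace version (a simplified sketch, not the kernel routine): a freed object's payload is filled with the poison byte 0x6b and terminated with an end marker, and a later check reports any byte modified after the free.

/* Toy version of the poison check behind check_object() with
 * SLAB_POISON: a freed object's payload is filled with 0x6b and
 * terminated with 0xa5; any later modification is reported.
 * Simplified sketch, not the kernel routine. */
#include <stdio.h>
#include <string.h>

#define OBJ_SIZE    32
#define POISON_FREE 0x6b                 /* value used for freed payload */
#define POISON_END  0xa5                 /* end marker */

static void poison_object(unsigned char *obj)
{
    memset(obj, POISON_FREE, OBJ_SIZE - 1);
    obj[OBJ_SIZE - 1] = POISON_END;
}

static int object_intact(const unsigned char *obj)
{
    for (int i = 0; i < OBJ_SIZE - 1; i++)
        if (obj[i] != POISON_FREE) {
            printf("poison overwritten at byte %d (0x%02x)\n", i, obj[i]);
            return 0;
        }
    return obj[OBJ_SIZE - 1] == POISON_END;
}

int main(void)
{
    unsigned char obj[OBJ_SIZE];
    poison_object(obj);
    obj[4] = 0x42;                       /* simulate a use-after-free write */
    printf("object intact: %s\n", object_intact(obj) ? "yes" : "no");
    return 0;
}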

As for the implementation of discard_slab():

【file:/mm/slub.c】
static void discard_slab(struct kmem_cache *s, struct page *page)
{
    dec_slabs_node(s, page_to_nid(page), page->objects); 
    free_slab(s, page);
}

 

When discard_slab() releases a slab, it first updates the statistics via dec_slabs_node() and then hands the page off to free_slab().

【file:/mm/slub.c】
static void free_slab(struct kmem_cache *s, struct page *page)
{
    if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
        struct rcu_head *head;

        if (need_reserve_slab_rcu) {
            int order = compound_order(page);
            int offset = (PAGE_SIZE << order) - s->reserved;

            VM_BUG_ON(s->reserved != sizeof(*head));
            head = page_address(page) + offset;
        } else {
            /*
             * RCU free overloads the RCU head over the LRU
             */
            head = (void *)&page->lru;
        }

        call_rcu(head, rcu_free_slab);
    } else
        __free_slab(s, page);
}

 

If the SLAB_DESTROY_BY_RCU flag is set, the memory pages are freed through an RCU callback; otherwise they are freed directly via __free_slab().
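
The placement of the rcu_head in the need_reserve_slab_rcu case is simple arithmetic: s->reserved bytes are kept free at the very end of the compound page and the head is parked there, at offset (PAGE_SIZE << order) - s->reserved. A quick check, assuming a 4 KiB page and a 16-byte rcu_head (two pointers on a 64-bit build):

/* Worked example of the rcu_head offset computed in free_slab(). */
#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096, reserved = 16;   /* assumed sizeof(struct rcu_head) */
    for (int order = 0; order <= 3; order++) {
        unsigned long slab_bytes = page_size << order;
        printf("order %d: slab %5lu bytes, rcu_head at offset %lu\n",
               order, slab_bytes, slab_bytes - reserved);
    }
    return 0;
}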

And the implementation of __free_slab():

【file:/mm/slub.c】
static void __free_slab(struct kmem_cache *s, struct page *page)
{
    int order = compound_order(page);
    int pages = 1 << order;

    if (kmem_cache_debug(s)) {
        void *p;

        slab_pad_check(s, page);
        for_each_object(p, s, page_address(page),
                        page->objects)
            check_object(s, page, p, SLUB_RED_INACTIVE);
    }

    kmemcheck_free_shadow(page, compound_order(page));

    mod_zone_page_state(page_zone(page),
        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
        -pages);

    __ClearPageSlabPfmemalloc(page);
    __ClearPageSlab(page);

    memcg_release_pages(s, order);
    page_mapcount_reset(page);
    if (current->reclaim_state)
        current->reclaim_state->reclaimed_slab += pages;
    __free_memcg_kmem_pages(page, order);
}

 

It first obtains the compound page order via compound_order() and from it the number of pages being freed. kmem_cache_debug() then checks whether debugging is enabled for this cache; if so, the slab is checked one more time, mainly to detect memory corruption and record the relevant information. Next, kmemcheck_free_shadow() releases the kmemcheck shadow memory; mod_zone_page_state() updates the zone's page-state counters (NR_SLAB_RECLAIMABLE or NR_SLAB_UNRECLAIMABLE, depending on SLAB_RECLAIM_ACCOUNT), and __ClearPageSlabPfmemalloc() together with __ClearPageSlab() clear the page's slab-related flags. Finally, memcg_release_pages() drops the memcg references to the pages, page_mapcount_reset() resets the page map count, the current task's reclaim statistics are updated if it is in the middle of reclaim, and __free_memcg_kmem_pages() frees the pages.

As for __free_memcg_kmem_pages(), it uncharges the pages from the memcg and then returns them to the buddy allocator via __free_pages().

【file:/mm/page_alloc.c】
/*
 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
 * pages allocated with __GFP_KMEMCG.
 *
 * Those pages are accounted to a particular memcg, embedded in the
 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
 * for that information only to find out that it is NULL for users who have no
 * interest in that whatsoever, we provide these functions.
 *
 * The caller knows better which flags it relies on.
 */
void __free_memcg_kmem_pages(struct page *page, unsigned int order)
{
    memcg_kmem_uncharge_pages(page, order);
    __free_pages(page, order);
}

 

This concludes the analysis of object freeing.