名词
Slab
用于表示存储的最大size数据,仅仅只是用于定义(通俗的讲就是表示可以存储数据大小的范围)。默认情况下,前后两个slab表示存储的size以1.25倍进行增长。例如slab1为96字节,slab2为120字节
Page
分配给Slab的内存空间,默认为1MB。分给Slab后将会根据slab的大小切割成chunk
Chunk
用于缓存记录的内存空间
Slab calss
特定大小的Chunk集合
增长因子
增长因子就是相邻两个chunk之间的增长倍数。 memcache默认是1.25。
2倍 chunk size增长。
struct item:
#define ITEM_LINKED 1 //该item插入到LRU队列了 #define ITEM_CAS 2 //该item使用CAS #define ITEM_SLABBED 4 //该item还在slab的空闲队列里面,没有分配出去 #define ITEM_FETCHED 8 //该item插入到LRU队列后,被worker线程访问过 typedef struct _stritem { struct _stritem *next;//next指针,用于LRU链表 struct _stritem *prev;//prev指针,用于LRU链表 struct _stritem *h_next;//h_next指针,用于哈希表的冲突链 rel_time_t time;//最后一次访问时间。绝对时间 rel_time_t exptime;//过期失效时间,绝对时间 int nbytes;//本item存放的数据的长度 unsigned short refcount;//本item的引用数 uint8_t nsuffix;//后缀长度 /* length of flags-and-length string */ uint8_t it_flags;//item的属性 /* ITEM_* above */ uint8_t slabs_clsid;/* which slab class we're in */ uint8_t nkey;//键值的长度 /* key length, w/terminating null and padding */ /* this odd type prevents type-punning issues when we do * the little shuffle to save space when not using CAS. */ union { uint64_t cas; char end; } data[]; /* if it_flags & ITEM_CAS we have 8 bytes CAS */ /* then null-terminated key */ /* then " flags length\r\n" (no terminating null) */ /* then data with terminating \r\n (no terminating null; it's binary!) */ } item;
struct slabclass:
typedef struct { unsigned int size; /* sizes of items */ unsigned int perslab; /* how many items per slab */ void *slots; /* list of item ptrs */ unsigned int sl_curr; /* total free items in list */ void *end_page_ptr; /* pointer to next free item at end of page, or 0 */ unsigned int end_page_free; /* number of items remaining at end of last alloced page */ unsigned int slabs; /* how many slabs were allocated for this class */ void **slab_list; /* array of slab pointers */ unsigned int list_size; /* size of prev array */ unsigned int killing; /* index+1 of dying slab, or zero if none */ size_t requested; /* The number of requested bytes */ } slabclass_t;
item_link_q:
将item插入到LRU队列的头部
//将item插入到LRU队列的头部 static void item_link_q(item *it) { /* item is the new head */ item **head, **tail; assert(it->slabs_clsid < LARGEST_ID); assert((it->it_flags & ITEM_SLABBED) == 0); head = &heads[it->slabs_clsid]; tail = &tails[it->slabs_clsid]; assert(it != *head); assert((*head && *tail) || (*head == 0 && *tail == 0)); //头插法插入该item it->prev = 0; it->next = *head; if (it->next) it->next->prev = it; *head = it;//该item作为对应链表的第一个节点 //如果该item是对应id上的第一个item,那么还会被认为是该id链上的最后一个item //因为在head那里使用头插法,所以第一个插入的item,到了后面确实成了最后一个item if (*tail == 0) *tail = it; sizes[it->slabs_clsid]++;//个数加一 return; }
item_unlink_q:
将item从对应的LRU队列中删除
static void item_unlink_q(item *it) { item **head, **tail; assert(it->slabs_clsid < LARGEST_ID); head = &heads[it->slabs_clsid]; tail = &tails[it->slabs_clsid]; if (*head == it) {//链表上的第一个节点 assert(it->prev == 0); *head = it->next; } if (*tail == it) {//链表上的最后一个节点 assert(it->next == 0); *tail = it->prev; } assert(it->next != it); assert(it->prev != it); //把item的前驱节点和后驱节点连接起来 if (it->next) it->next->prev = it->prev; if (it->prev) it->prev->next = it->next; sizes[it->slabs_clsid]--;//个数减一 return; }
do_item_update:
更新item
#define ITEM_UPDATE_INTERVAL 60 //更新频率为60秒 void do_item_update(item *it) { //下面的代码可以看到update操作是耗时的。如果这个item频繁被访问, //那么会导致过多的update,过多的一系列费时操作。此时更新间隔就应运而生 //了。如果上一次的访问时间(也可以说是update时间)距离现在(current_time) //还在更新间隔内的,就不更新。超出了才更新。 if (it->time < current_time - ITEM_UPDATE_INTERVAL) { mutex_lock(&cache_lock); if ((it->it_flags & ITEM_LINKED) != 0) { item_unlink_q(it);//从LRU队列中删除 it->time = current_time;//更新访问时间 item_link_q(it);//插入到LRU队列的头部 } mutex_unlock(&cache_lock); } }
do_slabs_newslab:
分配一个新的 slab
static int do_slabs_newslab(const unsigned int id) { slabclass_t *p = &slabclass[id]; int len = settings.slab_reassign ? settings.item_size_max : p->size * p->perslab; char *ptr; if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) || (grow_slab_list(id) == 0) || ((ptr = memory_allocate((size_t)len)) == 0)) { MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id); return 0; } memset(ptr, 0, (size_t)len); p->end_page_ptr = ptr; p->end_page_free = p->perslab; p->slab_list[p->slabs++] = ptr; mem_malloced += len; MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id); return 1; }
do_slabs_alloc:
从指定的 slabclass, 即 slabclass[id], 分配大小为 size 的内存块供申请者使用.
分配的原则是, 优先从 slots 指向的空闲链表中分配, 空闲链表没有, 才从 slab 中分配一个空闲的 chunk.
static void *do_slabs_alloc(const size_t size, unsigned int id) { slabclass_t *p; void *ret = NULL; item *it = NULL; if (id < POWER_SMALLEST || id > power_largest) { MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0); return NULL; } p = &slabclass[id]; assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0); /* 如果不使用 slab 机制, 则直接从操作系统分配 */ #ifdef USE_SYSTEM_MALLOC if (mem_limit && mem_malloced + size > mem_limit) { MEMCACHED_SLABS_ALLOCATE_FAILED(size, id); return 0; } mem_malloced += size; ret = malloc(size); MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret); return ret; #endif /* fail unless we have space at the end of a recently allocated page, we have something on our freelist, or we could allocate a new page */ /* 确保最后一个slab 有空闲的chunk */ if (! (p->end_page_ptr != 0 || p->sl_curr != 0 || do_slabs_newslab(id) != 0)) { /* We don't have more memory available */ ret = NULL; } else if (p->sl_curr != 0) { /* 从空闲list分配一个item */ /* return off our freelist */ it = (item *)p->slots; p->slots = it->next; if (it->next) it->next->prev = 0; p->sl_curr--; ret = (void *)it; } else { /* 从最后一个slab中分配一个空闲chunk */ /* if we recently allocated a whole page, return from that */ assert(p->end_page_ptr != NULL); ret = p->end_page_ptr; if (--p->end_page_free != 0) { p->end_page_ptr = ((caddr_t)p->end_page_ptr) + p->size; } else { p->end_page_ptr = 0; } } if (ret) { p->requested += size; MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret); } else { MEMCACHED_SLABS_ALLOCATE_FAILED(size, id); } return ret; }
do_item_alloc:
从 slab 系统分配一个空闲 item
item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) { uint8_t nsuffix; item *it = NULL; char suffix[40]; size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); if (settings.use_cas) { ntotal += sizeof(uint64_t); } unsigned int id = slabs_clsid(ntotal); if (id == 0) return 0; mutex_lock(&cache_lock); /* do a quick check if we have any expired items in the tail.. */ item *search; rel_time_t oldest_live = settings.oldest_live; search = tails[id]; if (search != NULL && (refcount_incr(&search->refcount) == 2)) { /* 先检查 LRU 队列最后一个 item 是否超时, 超时的话就把这个 item 分配给用户 */ if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { // dead by flush STATS_LOCK(); stats.reclaimed++; STATS_UNLOCK(); itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) { STATS_LOCK(); stats.expired_unfetched++; STATS_UNLOCK(); itemstats[id].expired_unfetched++; } it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); /* 把这个 item 从 LRU 队列和哈希表中移除 */ do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ it->slabs_clsid = 0; /* 没有超时的 item, 那就尝试从 slabclass 分配, 运气不好的话, 分配失败, 那就把 LRU 队列最后一个 item 剔除, 然后分配给用户 */ } else if ((it = slabs_alloc(ntotal, id)) == NULL) { if (settings.evict_to_free == 0) { itemstats[id].outofmemory++; pthread_mutex_unlock(&cache_lock); return NULL; } itemstats[id].evicted++; itemstats[id].evicted_time = current_time - search->time; if (search->exptime != 0) itemstats[id].evicted_nonzero++; if ((search->it_flags & ITEM_FETCHED) == 0) { STATS_LOCK(); stats.evicted_unfetched++; STATS_UNLOCK(); itemstats[id].evicted_unfetched++; } STATS_LOCK(); stats.evictions++; STATS_UNLOCK(); it = search; slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); /* 把这个 item 从 LRU 队列和哈希表中移除 */ do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0)); /* Initialize the item block: */ it->slabs_clsid = 0; } else { refcount_decr(&search->refcount); } /* LRU 队列是空的, 或者锁住了, 那就只能从 slabclass 分配内存 */ } else { /* If the LRU is empty or locked, attempt to allocate memory */ it = slabs_alloc(ntotal, id); if (search != NULL) refcount_decr(&search->refcount); } if (it == NULL) { itemstats[id].outofmemory++; /* Last ditch effort. There was a very rare bug which caused * refcount leaks. We leave this just in case they ever happen again. * We can reasonably assume no item can stay locked for more than * three hours, so if we find one in the tail which is that old, * free it anyway. */ if (search != NULL && search->refcount != 2 && search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1; do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0)); } pthread_mutex_unlock(&cache_lock); return NULL; } assert(it->slabs_clsid == 0); assert(it != heads[id]); /* 顺便对 item 做一些初始化 */ /* Item initialization can happen outside of the lock; the item's already * been removed from the slab LRU. */ it->refcount = 1; /* the caller will have a reference */ pthread_mutex_unlock(&cache_lock); it->next = it->prev = it->h_next = 0; it->slabs_clsid = id; DEBUG_REFCNT(it, '*'); it->it_flags = settings.use_cas ? ITEM_CAS : 0; it->nkey = nkey; it->nbytes = nbytes; memcpy(ITEM_key(it), key, nkey); it->exptime = exptime; memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix); it->nsuffix = nsuffix; return it; }
do_item_link
形成了一个完成的 item 后, 就要把它放入两个数据结构中, 一是 memcached 的哈希表,
memcached 运行过程中只有一个哈希表, 二是 item 所在的 slabclass 的 LRU 队列.
每个 slabclass 都有一个 LRU 队列
memcached 运行过程中只有一个哈希表, 二是 item 所在的 slabclass 的 LRU 队列.
每个 slabclass 都有一个 LRU 队列
int do_item_link(item *it, const uint32_t hv) { MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes); assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0); mutex_lock(&cache_lock); it->it_flags |= ITEM_LINKED; it->time = current_time; STATS_LOCK(); stats.curr_bytes += ITEM_ntotal(it); stats.curr_items += 1; stats.total_items += 1; STATS_UNLOCK(); /* Allocate a new CAS ID on link. */ ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0); /* 把 item 放入哈希表 */ assoc_insert(it, hv); /* 把 item 放入 LRU 队列*/ item_link_q(it); refcount_incr(&it->refcount); pthread_mutex_unlock(&cache_lock); return 1; }
do_item_unlink
do_item_unlink 与 do_item_unlink 做的工作相反, 把 item 从哈希表和 LRU 队列中删除,
而且还释放掉 item 所占的内存 (其实只是把 item 放到空闲链表中).
而且还释放掉 item 所占的内存 (其实只是把 item 放到空闲链表中).
void do_item_unlink(item *it, const uint32_t hv) { MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes); mutex_lock(&cache_lock); if ((it->it_flags & ITEM_LINKED) != 0) { it->it_flags &= ~ITEM_LINKED; STATS_LOCK(); stats.curr_bytes -= ITEM_ntotal(it); stats.curr_items -= 1; STATS_UNLOCK(); /* 从哈希表中删除 item */ assoc_delete(ITEM_key(it), it->nkey, hv); /* 从 LRU 队列中删除 item */ item_unlink_q(it); /* 释放 item 所占的内存 */ do_item_remove(it); } pthread_mutex_unlock(&cache_lock); }
item_get
根据 key 找对应的 item
/** wrapper around assoc_find which does the lazy expiration logic */ item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) { mutex_lock(&cache_lock); /* 从哈希表中找 item */ item *it = assoc_find(key, nkey, hv); if (it != NULL) { refcount_incr(&it->refcount); /* Optimization for slab reassignment. prevents popular items from * jamming in busy wait. Can only do this here to satisfy lock order * of item_lock, cache_lock, slabs_lock. */ if (slab_rebalance_signal && ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) { do_item_unlink_nolock(it, hv); do_item_remove(it); it = NULL; } } pthread_mutex_unlock(&cache_lock); int was_found = 0; if (settings.verbose > 2) { if (it == NULL) { fprintf(stderr, "> NOT FOUND %s", key); } else { fprintf(stderr, "> FOUND KEY %s", ITEM_key(it)); was_found++; } } /* 找到了, 然后检查是否超时 */ if (it != NULL) { if (settings.oldest_live != 0 && settings.oldest_live <= current_time && it->time <= settings.oldest_live) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by flush"); } } else if (it->exptime != 0 && it->exptime <= current_time) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by expire"); } } else { it->it_flags |= ITEM_FETCHED; DEBUG_REFCNT(it, '+'); } } if (settings.verbose > 2) fprintf(stderr, "\n"); return it; }
slabs_clsid:
根据大小判定slab class
unsigned int slabs_clsid(const size_t size) {//返回slabclass索引下标值 int res = POWER_SMALLEST;//res的初始值为1 //返回0表示查找失败,因为slabclass数组中,第一个元素是没有使用的 if (size == 0) return 0; //因为slabclass数组中各个元素能分配的item大小是升序的 //所以从小到大直接判断即可在数组找到最小但又能满足的元素 while (size > slabclass[res].size) if (res++ == power_largest) /* won't fit in the biggest slab */ return 0; return res; }
Links:
http://blog.csdn.net/luotuo44/article/details/42963793