ziplist
ziplist 作为redis常用的数据结构主要用于小数据量的压缩存储。在数据量较小的情况下,将数据紧凑排列可以有效利用CPU cache,有助于提高缓存命中,让数据读写更加迅速。
ziplist的数据定义很简单,如下:
1/* Each entry in the ziplist is either a string or an integer. */
2typedef struct {
3 /* When string is used, it is provided with the length (slen). */
4 unsigned char *sval;
5 unsigned int slen;
6 /* When integer is used, 'sval' is NULL, and lval holds the value. */
7 long long lval;
8} ziplistEntry;
查看ziplist的创建函数,可以发现其真正的数据结构都保存在ziplistEntry::sval所指的内存中。
1/* Create a new empty ziplist. */
2unsigned char *ziplistNew(void) {
3 /// 计算创建一个空ziplist所需的内存,由此可见一个ziplist又header和end两部分描述
4 unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
5 unsigned char *zl = zmalloc(bytes);
6 /// 设置ziplist分配的内存字节数
7 ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
8 /// 设置ziplist尾节点的偏移量
9 ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
10 /// 设置ziplist的节点数量为0
11 ZIPLIST_LENGTH(zl) = 0;
12 /// 设置ziplist的END部分
13 zl[bytes-1] = ZIP_END;
14 /// 返回ziplist有效内存地址,注意这里不是返回ziplistEntry
15 return zl;
16}
一个空的ziplist的内存布局大概是这样的:
其中ziplist Item 中的prevlen记录了到达上一个节点地址需要减少的字节数,datalen记录到达下一个节点地址需要增加的字节数。所以当我们对ziplist进行遍历时只需要取出这两个值,并对当前节点的地址进行加减即可。不过这两个值并非我们通常意义上的变量,而是被redis编码过的,用以节省内存。
对prevlen,第一个字节有特别的含义,当:
- 取值小于254,prevlen=p[0], prevlensize=1
- 取值等于254,prevlen=((ptr)[4] « 24)|((ptr)[3] « 16)|((ptr)[2] « 8)|((ptr)[1]); redis使用小端存储数据,所以此处还做了一次大端到小端的转换。
对datalen, 则有更复杂的编码方式,第一个字节表示datalen的编码方式,
- 当enc < 11000000b 时 enc = enc & 11000000b。当
- enc == 00000000b时,lensize = 1, len = enc & 00111111b
- enc == 01000000b时,lensize = 2, len = enc[0] « 8 | enc[1]
- enc == 10000000b时,lensize = 5, len = enc[1] « 24| enc[1] « 16| enc[1] « 8 | enc[1]
- enc == 11111110b时,lensize = 1,len = 1,用于存储8位整型
- enc == 11000000b时,lensize = 1,len = 2, 用于存储16位整型
- enc == 11010000b时,lensize = 1,len = 3, 用于存储24位整型
- enc == 11100000b时,lensize = 1,len = 4, 用于存储32位整型
- enc == 11110000b时,lensize = 1,len = 8, 用于存储64位整型
所以ziplist在存储数据的时候实际上对字符串和数值类型做了区分,并对数值类型进行了优化。怎么优化的等我们讲ziplist插入的时候会进行说明。在这之前让我们看看ziplist的遍历和查找是怎么做的。
1/* Return pointer to next entry in ziplist.
2 *
3 * zl is the pointer to the ziplist
4 * p is the pointer to the current element
5 *
6 * The element after 'p' is returned, otherwise NULL if we are at the end. */
7unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) {
8 ((void) zl);
9 size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
10
11 /* "p" could be equal to ZIP_END, caused by ziplistDelete,
12 * and we should return NULL. Otherwise, we should return NULL
13 * when the *next* element is ZIP_END (there is no next entry). */
14 if (p[0] == ZIP_END) {
15 return NULL;
16 }
17
18 p += zipRawEntryLength(p);
19 if (p[0] == ZIP_END) {
20 return NULL;
21 }
22
23 zipAssertValidEntry(zl, zlbytes, p);
24 return p;
25}
26
27/* Return pointer to previous entry in ziplist. */
28unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) {
29 unsigned int prevlensize, prevlen = 0;
30
31 /* Iterating backwards from ZIP_END should return the tail. When "p" is
32 * equal to the first element of the list, we're already at the head,
33 * and should return NULL. */
34 if (p[0] == ZIP_END) {
35 p = ZIPLIST_ENTRY_TAIL(zl);
36 return (p[0] == ZIP_END) ? NULL : p;
37 } else if (p == ZIPLIST_ENTRY_HEAD(zl)) {
38 return NULL;
39 } else {
40 ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
41 assert(prevlen > 0);
42 p-=prevlen;
43 size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
44 zipAssertValidEntry(zl, zlbytes, p);
45 return p;
46 }
47}
可以看到,后序遍历实际上只是对指针p做了一个偏移,偏移的量就是prevlensize + datalensize + datalen;而前序遍历则是直接减掉prevlen。
让我们再来看ziplist插入的代码:
1/* Insert item at "p". */
2unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
3 size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen, newlen;
4 unsigned int prevlensize, prevlen = 0;
5 size_t offset;
6 int nextdiff = 0;
7 unsigned char encoding = 0;
8 long long value = 123456789; /* initialized to avoid warning. Using a value
9 that is easy to see if for some reason
10 we use it uninitialized. */
11 zlentry tail;
12
13 /* Find out prevlen for the entry that is inserted. */
14 if (p[0] != ZIP_END) {
15 ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
16 } else {
17 unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
18 if (ptail[0] != ZIP_END) {
19 prevlen = zipRawEntryLengthSafe(zl, curlen, ptail);
20 }
21 }
22
23 /* See if the entry can be encoded */
24 if (zipTryEncoding(s,slen,&value,&encoding)) {
25 /* 'encoding' is set to the appropriate integer encoding */
26 reqlen = zipIntSize(encoding);
27 } else {
28 /* 'encoding' is untouched, however zipStoreEntryEncoding will use the
29 * string length to figure out how to encode it. */
30 reqlen = slen;
31 }
32 /* We need space for both the length of the previous entry and
33 * the length of the payload. */
34 reqlen += zipStorePrevEntryLength(NULL,prevlen);
35 reqlen += zipStoreEntryEncoding(NULL,encoding,slen);
36
37 /* When the insert position is not equal to the tail, we need to
38 * make sure that the next entry can hold this entry's length in
39 * its prevlen field. */
40 int forcelarge = 0;
41 nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
42 if (nextdiff == -4 && reqlen < 4) {
43 nextdiff = 0;
44 forcelarge = 1;
45 }
46
47 /* Store offset because a realloc may change the address of zl. */
48 offset = p-zl;
49 newlen = curlen+reqlen+nextdiff;
50 zl = ziplistResize(zl,newlen);
51 p = zl+offset;
52
53 /* Apply memory move when necessary and update tail offset. */
54 if (p[0] != ZIP_END) {
55 /* Subtract one because of the ZIP_END bytes */
56 memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);
57
58 /* Encode this entry's raw length in the next entry. */
59 if (forcelarge)
60 zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
61 else
62 zipStorePrevEntryLength(p+reqlen,reqlen);
63
64 /* Update offset for tail */
65 ZIPLIST_TAIL_OFFSET(zl) =
66 intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);
67
68 /* When the tail contains more than one entry, we need to take
69 * "nextdiff" in account as well. Otherwise, a change in the
70 * size of prevlen doesn't have an effect on the *tail* offset. */
71 assert(zipEntrySafe(zl, newlen, p+reqlen, &tail, 1));
72 if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
73 ZIPLIST_TAIL_OFFSET(zl) =
74 intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
75 }
76 } else {
77 /* This element will be the new tail. */
78 ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
79 }
80
81 /* When nextdiff != 0, the raw length of the next entry has changed, so
82 * we need to cascade the update throughout the ziplist */
83 if (nextdiff != 0) {
84 offset = p-zl;
85 zl = __ziplistCascadeUpdate(zl,p+reqlen);
86 p = zl+offset;
87 }
88
89 /* Write the entry */
90 p += zipStorePrevEntryLength(p,prevlen);
91 p += zipStoreEntryEncoding(p,encoding,slen);
92 if (ZIP_IS_STR(encoding)) {
93 memcpy(p,s,slen);
94 } else {
95 zipSaveInteger(p,value,encoding);
96 }
97 ZIPLIST_INCR_LENGTH(zl,1);
98 return zl;
99}
插入时的流程如下:
flowchart TD; INIT --> A{{"是否在最后插入"}}; A --> |Y|B("计算上一个元素的长度"); A --> |N|C("如果END不为ZIP_END则计算上一个元素的长度"); B & C --> D{{"是否可以转为整型"}}; D --> |Y|E("根据值获取存储所需的字节数"); D --> |N|F("作为二进制数据存储,存储使用传入的长度"); E & F --> G("计算存储上一个节点的长度所需字节数"); G --> H("计算存储节点长度所需的字节数"); H --> I("检查下一个节点中的prevlen字段是否可以保存当前节点的长度信息"); I --> J("计算下一个节点需要调整的字节数nextdiff"); J --> K("重新计算节点在ziplist中的偏移量offset,以及新内存缓冲的大小newlen"); K --> L("重新分配ziplist内存,并计算新元素插入的起始地址p"); L --> M{{"是否在尾部插入"}}; M --> |Y| N("调整ziplist尾节点偏移量"); M --> |N| O("移动插入位置的节点(老节点)到p + reqlen。空出新节点的空间。"); O --> P("老节点保存新节点的节点长度(相对偏移量)"); P --> Q("调整ziplist尾节点偏移量"); Q --> R("调用zipEntrySafe,使用老节点数据填充zlentry数据。"); N & R --> S{{"nextdiff != 0"}}; S --> |Y|T("重新分配ziplist内存,调用__ziplistCascadeUpdate"); S --> |N|U("保存prevlen"); T --> U; U --> V("保存datalen"); V --> W{{"是否是字符串?"}}; W --> |Y|X("复制值内容到ziplist节点数据位置"); W --> |N|Y("调用zipSaveInteger作为数值存储"); X & Y --> END;
在ziplist插入的时候,因为插入节点的大小可能会影响插入节点的后续节点prevlensize发生改变,而prevlensize的改变又可能继续影响其后续节点。所以需要对所有后续节点进行遍历,以确定需要扩充的内存数量。这些内存大小变更的计算和新ziplist内存的分配则是在__ziplistCascadeUpdate这个函数中完成的。
1/* When an entry is inserted, we need to set the prevlen field of the next
2 * entry to equal the length of the inserted entry. It can occur that this
3 * length cannot be encoded in 1 byte and the next entry needs to be grow
4 * a bit larger to hold the 5-byte encoded prevlen. This can be done for free,
5 * because this only happens when an entry is already being inserted (which
6 * causes a realloc and memmove). However, encoding the prevlen may require
7 * that this entry is grown as well. This effect may cascade throughout
8 * the ziplist when there are consecutive entries with a size close to
9 * ZIP_BIG_PREVLEN, so we need to check that the prevlen can be encoded in
10 * every consecutive entry.
11 *
12 * Note that this effect can also happen in reverse, where the bytes required
13 * to encode the prevlen field can shrink. This effect is deliberately ignored,
14 * because it can cause a "flapping" effect where a chain prevlen fields is
15 * first grown and then shrunk again after consecutive inserts. Rather, the
16 * field is allowed to stay larger than necessary, because a large prevlen
17 * field implies the ziplist is holding large entries anyway.
18 *
19 * The pointer "p" points to the first entry that does NOT need to be
20 * updated, i.e. consecutive fields MAY need an update. */
21unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {
22 zlentry cur;
23 size_t prevlen, prevlensize, prevoffset; /* Informat of the last changed entry. */
24 size_t firstentrylen; /* Used to handle insert at head. */
25 size_t rawlen, curlen = intrev32ifbe(ZIPLIST_BYTES(zl));
26 size_t extra = 0, cnt = 0, offset;
27 size_t delta = 4; /* Extra bytes needed to update a entry's prevlen (5-1). */
28 unsigned char *tail = zl + intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl));
29
30 /* Empty ziplist */
31 if (p[0] == ZIP_END) return zl;
32
33 zipEntry(p, &cur); /* no need for "safe" variant since the input pointer was validated by the function that returned it. */
34 firstentrylen = prevlen = cur.headersize + cur.len;
35 prevlensize = zipStorePrevEntryLength(NULL, prevlen);
36 prevoffset = p - zl;
37 p += prevlen;
38
39 /* Iterate ziplist to find out how many extra bytes do we need to update it. */
40 while (p[0] != ZIP_END) {
41 assert(zipEntrySafe(zl, curlen, p, &cur, 0));
42
43 /* Abort when "prevlen" has not changed. */
44 if (cur.prevrawlen == prevlen) break;
45
46 /* Abort when entry's "prevlensize" is big enough. */
47 if (cur.prevrawlensize >= prevlensize) {
48 if (cur.prevrawlensize == prevlensize) {
49 zipStorePrevEntryLength(p, prevlen);
50 } else {
51 /* This would result in shrinking, which we want to avoid.
52 * So, set "prevlen" in the available bytes. */
53 zipStorePrevEntryLengthLarge(p, prevlen);
54 }
55 break;
56 }
57
58 /* cur.prevrawlen means cur is the former head entry. */
59 assert(cur.prevrawlen == 0 || cur.prevrawlen + delta == prevlen);
60
61 /* Update prev entry's info and advance the cursor. */
62 rawlen = cur.headersize + cur.len;
63 prevlen = rawlen + delta;
64 prevlensize = zipStorePrevEntryLength(NULL, prevlen);
65 prevoffset = p - zl;
66 p += rawlen;
67 extra += delta;
68 cnt++;
69 }
70
71 /* Extra bytes is zero all update has been done(or no need to update). */
72 if (extra == 0) return zl;
73
74 /* Update tail offset after loop. */
75 if (tail == zl + prevoffset) {
76 /* When the the last entry we need to update is also the tail, update tail offset
77 * unless this is the only entry that was updated (so the tail offset didn't change). */
78 if (extra - delta != 0) {
79 ZIPLIST_TAIL_OFFSET(zl) =
80 intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra-delta);
81 }
82 } else {
83 /* Update the tail offset in cases where the last entry we updated is not the tail. */
84 ZIPLIST_TAIL_OFFSET(zl) =
85 intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra);
86 }
87
88 /* Now "p" points at the first unchanged byte in original ziplist,
89 * move data after that to new ziplist. */
90 offset = p - zl;
91 zl = ziplistResize(zl, curlen + extra);
92 p = zl + offset;
93 memmove(p + extra, p, curlen - offset - 1);
94 p += extra;
95
96 /* Iterate all entries that need to be updated tail to head. */
97 while (cnt) {
98 zipEntry(zl + prevoffset, &cur); /* no need for "safe" variant since we already iterated on all these entries above. */
99 rawlen = cur.headersize + cur.len;
100 /* Move entry to tail and reset prevlen. */
101 memmove(p - (rawlen - cur.prevrawlensize),
102 zl + prevoffset + cur.prevrawlensize,
103 rawlen - cur.prevrawlensize);
104 p -= (rawlen + delta);
105 if (cur.prevrawlen == 0) {
106 /* "cur" is the previous head entry, update its prevlen with firstentrylen. */
107 zipStorePrevEntryLength(p, firstentrylen);
108 } else {
109 /* An entry's prevlen can only increment 4 bytes. */
110 zipStorePrevEntryLength(p, cur.prevrawlen+delta);
111 }
112 /* Foward to previous entry. */
113 prevoffset -= cur.prevrawlen;
114 cnt--;
115 }
116 return zl;
117}
ziplist本身很容易理解,比较复杂的是其节点存储时对prevlen和datalen的优化处理。因为变长所以解析需要花费更多的精力去理解,不过一旦理解以后ziplist就没有那么神秘了。ziplist在多个外部数据类型中都会被用到,例如hash和zset在小量数据的情况下。这么做的原因是为了减少内存消耗同时增加数据访问速度,这利用了CPU cache的相关特性,但是在数据量较大或者元素较多的情况下就不适用了,所以相应的数据类型会转为链表或者哈希表。
评论