Redis 源码分析(三)ziplist

ziplist

ziplist 作为redis常用的数据结构主要用于小数据量的压缩存储。在数据量较小的情况下,将数据紧凑排列可以有效利用CPU cache,有助于提高缓存命中,让数据读写更加迅速。

ziplist的数据定义很简单,如下:

1/* Each entry in the ziplist is either a string or an integer. */
2typedef struct {
3    /* When string is used, it is provided with the length (slen). */
4    unsigned char *sval;
5    unsigned int slen;
6    /* When integer is used, 'sval' is NULL, and lval holds the value. */
7    long long lval;
8} ziplistEntry;

查看ziplist的创建函数,可以发现其真正的数据结构都保存在ziplistEntry::sval所指的内存中。

 1/* Create a new empty ziplist. */
 2unsigned char *ziplistNew(void) {
 3    /// 计算创建一个空ziplist所需的内存,由此可见一个ziplist又header和end两部分描述
 4    unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
 5    unsigned char *zl = zmalloc(bytes);
 6    /// 设置ziplist分配的内存字节数
 7    ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
 8    /// 设置ziplist尾节点的偏移量
 9    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
10    /// 设置ziplist的节点数量为0
11    ZIPLIST_LENGTH(zl) = 0;
12    /// 设置ziplist的END部分
13    zl[bytes-1] = ZIP_END;
14    /// 返回ziplist有效内存地址,注意这里不是返回ziplistEntry
15    return zl;
16}

一个空的ziplist的内存布局大概是这样的:
empty ziplist

其中ziplist Item 中的prevlen记录了到达上一个节点地址需要减少的字节数,datalen记录到达下一个节点地址需要增加的字节数。所以当我们对ziplist进行遍历时只需要取出这两个值,并对当前节点的地址进行加减即可。不过这两个值并非我们通常意义上的变量,而是被redis编码过的,用以节省内存。

对prevlen,第一个字节有特别的含义,当:

  • 取值小于254,prevlen=p[0], prevlensize=1
  • 取值等于254,prevlen=((ptr)[4] « 24)|((ptr)[3] « 16)|((ptr)[2] « 8)|((ptr)[1]); redis使用小端存储数据,所以此处还做了一次大端到小端的转换。

对datalen, 则有更复杂的编码方式,第一个字节表示datalen的编码方式,

  • 当enc < 11000000b 时 enc = enc & 11000000b。当
    • enc == 00000000b时,lensize = 1, len = enc & 00111111b
    • enc == 01000000b时,lensize = 2, len = enc[0] « 8 | enc[1]
    • enc == 10000000b时,lensize = 5, len = enc[1] « 24| enc[1] « 16| enc[1] « 8 | enc[1]
  • enc == 11111110b时,lensize = 1,len = 1,用于存储8位整型
  • enc == 11000000b时,lensize = 1,len = 2, 用于存储16位整型
  • enc == 11010000b时,lensize = 1,len = 3, 用于存储24位整型
  • enc == 11100000b时,lensize = 1,len = 4, 用于存储32位整型
  • enc == 11110000b时,lensize = 1,len = 8, 用于存储64位整型

所以ziplist在存储数据的时候实际上对字符串和数值类型做了区分,并对数值类型进行了优化。怎么优化的等我们讲ziplist插入的时候会进行说明。在这之前让我们看看ziplist的遍历和查找是怎么做的。

 1/* Return pointer to next entry in ziplist.
 2 *
 3 * zl is the pointer to the ziplist
 4 * p is the pointer to the current element
 5 *
 6 * The element after 'p' is returned, otherwise NULL if we are at the end. */
 7unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) {
 8    ((void) zl);
 9    size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
10
11    /* "p" could be equal to ZIP_END, caused by ziplistDelete,
12     * and we should return NULL. Otherwise, we should return NULL
13     * when the *next* element is ZIP_END (there is no next entry). */
14    if (p[0] == ZIP_END) {
15        return NULL;
16    }
17
18    p += zipRawEntryLength(p);
19    if (p[0] == ZIP_END) {
20        return NULL;
21    }
22
23    zipAssertValidEntry(zl, zlbytes, p);
24    return p;
25}
26
27/* Return pointer to previous entry in ziplist. */
28unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) {
29    unsigned int prevlensize, prevlen = 0;
30
31    /* Iterating backwards from ZIP_END should return the tail. When "p" is
32     * equal to the first element of the list, we're already at the head,
33     * and should return NULL. */
34    if (p[0] == ZIP_END) {
35        p = ZIPLIST_ENTRY_TAIL(zl);
36        return (p[0] == ZIP_END) ? NULL : p;
37    } else if (p == ZIPLIST_ENTRY_HEAD(zl)) {
38        return NULL;
39    } else {
40        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
41        assert(prevlen > 0);
42        p-=prevlen;
43        size_t zlbytes = intrev32ifbe(ZIPLIST_BYTES(zl));
44        zipAssertValidEntry(zl, zlbytes, p);
45        return p;
46    }
47}

可以看到,后序遍历实际上只是对指针p做了一个偏移,偏移的量就是prevlensize + datalensize + datalen;而前序遍历则是直接减掉prevlen。

让我们再来看ziplist插入的代码:

 1/* Insert item at "p". */
 2unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
 3    size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen, newlen;
 4    unsigned int prevlensize, prevlen = 0;
 5    size_t offset;
 6    int nextdiff = 0;
 7    unsigned char encoding = 0;
 8    long long value = 123456789; /* initialized to avoid warning. Using a value
 9                                    that is easy to see if for some reason
10                                    we use it uninitialized. */
11    zlentry tail;
12
13    /* Find out prevlen for the entry that is inserted. */
14    if (p[0] != ZIP_END) {
15        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
16    } else {
17        unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
18        if (ptail[0] != ZIP_END) {
19            prevlen = zipRawEntryLengthSafe(zl, curlen, ptail);
20        }
21    }
22
23    /* See if the entry can be encoded */
24    if (zipTryEncoding(s,slen,&value,&encoding)) {
25        /* 'encoding' is set to the appropriate integer encoding */
26        reqlen = zipIntSize(encoding);
27    } else {
28        /* 'encoding' is untouched, however zipStoreEntryEncoding will use the
29         * string length to figure out how to encode it. */
30        reqlen = slen;
31    }
32    /* We need space for both the length of the previous entry and
33     * the length of the payload. */
34    reqlen += zipStorePrevEntryLength(NULL,prevlen);
35    reqlen += zipStoreEntryEncoding(NULL,encoding,slen);
36
37    /* When the insert position is not equal to the tail, we need to
38     * make sure that the next entry can hold this entry's length in
39     * its prevlen field. */
40    int forcelarge = 0;
41    nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
42    if (nextdiff == -4 && reqlen < 4) {
43        nextdiff = 0;
44        forcelarge = 1;
45    }
46
47    /* Store offset because a realloc may change the address of zl. */
48    offset = p-zl;
49    newlen = curlen+reqlen+nextdiff;
50    zl = ziplistResize(zl,newlen);
51    p = zl+offset;
52
53    /* Apply memory move when necessary and update tail offset. */
54    if (p[0] != ZIP_END) {
55        /* Subtract one because of the ZIP_END bytes */
56        memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);
57
58        /* Encode this entry's raw length in the next entry. */
59        if (forcelarge)
60            zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
61        else
62            zipStorePrevEntryLength(p+reqlen,reqlen);
63
64        /* Update offset for tail */
65        ZIPLIST_TAIL_OFFSET(zl) =
66            intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);
67
68        /* When the tail contains more than one entry, we need to take
69         * "nextdiff" in account as well. Otherwise, a change in the
70         * size of prevlen doesn't have an effect on the *tail* offset. */
71        assert(zipEntrySafe(zl, newlen, p+reqlen, &tail, 1));
72        if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
73            ZIPLIST_TAIL_OFFSET(zl) =
74                intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
75        }
76    } else {
77        /* This element will be the new tail. */
78        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
79    }
80
81    /* When nextdiff != 0, the raw length of the next entry has changed, so
82     * we need to cascade the update throughout the ziplist */
83    if (nextdiff != 0) {
84        offset = p-zl;
85        zl = __ziplistCascadeUpdate(zl,p+reqlen);
86        p = zl+offset;
87    }
88
89    /* Write the entry */
90    p += zipStorePrevEntryLength(p,prevlen);
91    p += zipStoreEntryEncoding(p,encoding,slen);
92    if (ZIP_IS_STR(encoding)) {
93        memcpy(p,s,slen);
94    } else {
95        zipSaveInteger(p,value,encoding);
96    }
97    ZIPLIST_INCR_LENGTH(zl,1);
98    return zl;
99}

插入时的流程如下:

flowchart TD;
  INIT --> A{{"是否在最后插入"}};
  A --> |Y|B("计算上一个元素的长度");
  A --> |N|C("如果END不为ZIP_END则计算上一个元素的长度");
  B & C --> D{{"是否可以转为整型"}};
  D --> |Y|E("根据值获取存储所需的字节数");
  D --> |N|F("作为二进制数据存储,存储使用传入的长度");
  E & F --> G("计算存储上一个节点的长度所需字节数");
  G --> H("计算存储节点长度所需的字节数");
  H --> I("检查下一个节点中的prevlen字段是否可以保存当前节点的长度信息");
  I --> J("计算下一个节点需要调整的字节数nextdiff");
  J --> K("重新计算节点在ziplist中的偏移量offset,以及新内存缓冲的大小newlen");
  K --> L("重新分配ziplist内存,并计算新元素插入的起始地址p");
  L --> M{{"是否在尾部插入"}};
  M --> |Y| N("调整ziplist尾节点偏移量");
  M --> |N| O("移动插入位置的节点(老节点)到p + reqlen。空出新节点的空间。");
  O --> P("老节点保存新节点的节点长度(相对偏移量)");
  P --> Q("调整ziplist尾节点偏移量");
  Q --> R("调用zipEntrySafe,使用老节点数据填充zlentry数据。");
  N & R --> S{{"nextdiff != 0"}};
  S --> |Y|T("重新分配ziplist内存,调用__ziplistCascadeUpdate");
  S --> |N|U("保存prevlen");
  T --> U;
  U --> V("保存datalen");
  V --> W{{"是否是字符串?"}};
  W --> |Y|X("复制值内容到ziplist节点数据位置");
  W --> |N|Y("调用zipSaveInteger作为数值存储");
  X & Y --> END;

在ziplist插入的时候,因为插入节点的大小可能会影响插入节点的后续节点prevlensize发生改变,而prevlensize的改变又可能继续影响其后续节点。所以需要对所有后续节点进行遍历,以确定需要扩充的内存数量。这些内存大小变更的计算和新ziplist内存的分配则是在__ziplistCascadeUpdate这个函数中完成的。

  1/* When an entry is inserted, we need to set the prevlen field of the next
  2 * entry to equal the length of the inserted entry. It can occur that this
  3 * length cannot be encoded in 1 byte and the next entry needs to be grow
  4 * a bit larger to hold the 5-byte encoded prevlen. This can be done for free,
  5 * because this only happens when an entry is already being inserted (which
  6 * causes a realloc and memmove). However, encoding the prevlen may require
  7 * that this entry is grown as well. This effect may cascade throughout
  8 * the ziplist when there are consecutive entries with a size close to
  9 * ZIP_BIG_PREVLEN, so we need to check that the prevlen can be encoded in
 10 * every consecutive entry.
 11 *
 12 * Note that this effect can also happen in reverse, where the bytes required
 13 * to encode the prevlen field can shrink. This effect is deliberately ignored,
 14 * because it can cause a "flapping" effect where a chain prevlen fields is
 15 * first grown and then shrunk again after consecutive inserts. Rather, the
 16 * field is allowed to stay larger than necessary, because a large prevlen
 17 * field implies the ziplist is holding large entries anyway.
 18 *
 19 * The pointer "p" points to the first entry that does NOT need to be
 20 * updated, i.e. consecutive fields MAY need an update. */
 21unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {
 22    zlentry cur;
 23    size_t prevlen, prevlensize, prevoffset; /* Informat of the last changed entry. */
 24    size_t firstentrylen; /* Used to handle insert at head. */
 25    size_t rawlen, curlen = intrev32ifbe(ZIPLIST_BYTES(zl));
 26    size_t extra = 0, cnt = 0, offset;
 27    size_t delta = 4; /* Extra bytes needed to update a entry's prevlen (5-1). */
 28    unsigned char *tail = zl + intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl));
 29
 30    /* Empty ziplist */
 31    if (p[0] == ZIP_END) return zl;
 32
 33    zipEntry(p, &cur); /* no need for "safe" variant since the input pointer was validated by the function that returned it. */
 34    firstentrylen = prevlen = cur.headersize + cur.len;
 35    prevlensize = zipStorePrevEntryLength(NULL, prevlen);
 36    prevoffset = p - zl;
 37    p += prevlen;
 38
 39    /* Iterate ziplist to find out how many extra bytes do we need to update it. */
 40    while (p[0] != ZIP_END) {
 41        assert(zipEntrySafe(zl, curlen, p, &cur, 0));
 42
 43        /* Abort when "prevlen" has not changed. */
 44        if (cur.prevrawlen == prevlen) break;
 45
 46        /* Abort when entry's "prevlensize" is big enough. */
 47        if (cur.prevrawlensize >= prevlensize) {
 48            if (cur.prevrawlensize == prevlensize) {
 49                zipStorePrevEntryLength(p, prevlen);
 50            } else {
 51                /* This would result in shrinking, which we want to avoid.
 52                 * So, set "prevlen" in the available bytes. */
 53                zipStorePrevEntryLengthLarge(p, prevlen);
 54            }
 55            break;
 56        }
 57
 58        /* cur.prevrawlen means cur is the former head entry. */
 59        assert(cur.prevrawlen == 0 || cur.prevrawlen + delta == prevlen);
 60
 61        /* Update prev entry's info and advance the cursor. */
 62        rawlen = cur.headersize + cur.len;
 63        prevlen = rawlen + delta; 
 64        prevlensize = zipStorePrevEntryLength(NULL, prevlen);
 65        prevoffset = p - zl;
 66        p += rawlen;
 67        extra += delta;
 68        cnt++;
 69    }
 70
 71    /* Extra bytes is zero all update has been done(or no need to update). */
 72    if (extra == 0) return zl;
 73
 74    /* Update tail offset after loop. */
 75    if (tail == zl + prevoffset) {
 76        /* When the the last entry we need to update is also the tail, update tail offset
 77         * unless this is the only entry that was updated (so the tail offset didn't change). */
 78        if (extra - delta != 0) {
 79            ZIPLIST_TAIL_OFFSET(zl) =
 80                intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra-delta);
 81        }
 82    } else {
 83        /* Update the tail offset in cases where the last entry we updated is not the tail. */
 84        ZIPLIST_TAIL_OFFSET(zl) =
 85            intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra);
 86    }
 87
 88    /* Now "p" points at the first unchanged byte in original ziplist,
 89     * move data after that to new ziplist. */
 90    offset = p - zl;
 91    zl = ziplistResize(zl, curlen + extra);
 92    p = zl + offset;
 93    memmove(p + extra, p, curlen - offset - 1);
 94    p += extra;
 95
 96    /* Iterate all entries that need to be updated tail to head. */
 97    while (cnt) {
 98        zipEntry(zl + prevoffset, &cur); /* no need for "safe" variant since we already iterated on all these entries above. */
 99        rawlen = cur.headersize + cur.len;
100        /* Move entry to tail and reset prevlen. */
101        memmove(p - (rawlen - cur.prevrawlensize), 
102                zl + prevoffset + cur.prevrawlensize, 
103                rawlen - cur.prevrawlensize);
104        p -= (rawlen + delta);
105        if (cur.prevrawlen == 0) {
106            /* "cur" is the previous head entry, update its prevlen with firstentrylen. */
107            zipStorePrevEntryLength(p, firstentrylen);
108        } else {
109            /* An entry's prevlen can only increment 4 bytes. */
110            zipStorePrevEntryLength(p, cur.prevrawlen+delta);
111        }
112        /* Foward to previous entry. */
113        prevoffset -= cur.prevrawlen;
114        cnt--;
115    }
116    return zl;
117}

ziplist本身很容易理解,比较复杂的是其节点存储时对prevlen和datalen的优化处理。因为变长所以解析需要花费更多的精力去理解,不过一旦理解以后ziplist就没有那么神秘了。ziplist在多个外部数据类型中都会被用到,例如hash和zset在小量数据的情况下。这么做的原因是为了减少内存消耗同时增加数据访问速度,这利用了CPU cache的相关特性,但是在数据量较大或者元素较多的情况下就不适用了,所以相应的数据类型会转为链表或者哈希表。

评论