PostgreSQL源码解读（248）-HTAB动态扩展图解#2

本节简单介绍了PostgreSQL中的HTAB如何动态扩展,这是第2部分,结合代码进行解析.

十载的平山网站建设经验，针对设计、前端、开发、售后、文案、推广等六对一服务，响应快，48小时及时工作处理。成都全网营销推广的优势是能够根据用户设备显示端的尺寸不同，自动调整平山建站的显示方式，使网站能够适用不同显示终端，在浏览器中调整网站的宽度，无论在任何一种浏览器上浏览网站，都能展现优雅布局与设计，从而大程度地提升浏览体验。创新互联从事“平山网站设计”,“平山网站推广”以来，每个客户项目都认真落实执行。

一、数据结构

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 * 哈希表的顶层控制结构.
 * 在这个共享哈希表中,每一个后台进程都有自己的拷贝
 * (之所以没有问题是因为fork出来后,在运行期没有字段会变化)
 */
struct HTAB
{
    //指向共享的控制信息
    HASHHDR    *hctl;           /* => shared control information */
    //段目录
    HASHSEGMENT *dir;           /* directory of segment starts */
    //哈希函数
    HashValueFunc hash;         /* hash function */
    //哈希键比较函数
    HashCompareFunc match;      /* key comparison function */
    //哈希键拷贝函数
    HashCopyFunc keycopy;       /* key copying function */
    //内存分配器
    HashAllocFunc alloc;        /* memory allocator */
    //内存上下文
    MemoryContext hcxt;         /* memory context if default allocator used */
    //表名(用于错误信息)
    char       *tabname;        /* table name (for error messages) */
    //如在共享内存中,则为T
    bool        isshared;       /* true if table is in shared memory */
    //如为T,则固定大小不能扩展
    bool        isfixed;        /* if true, don't enlarge */
    /* freezing a shared table isn't allowed, so we can keep state here */
    //不允许冻结共享表,因此这里会保存相关状态
    bool        frozen;         /* true = no more inserts allowed */
    /* We keep local copies of these fixed values to reduce contention */
    //保存这些固定值的本地拷贝,以减少冲突
    //哈希键长度(以字节为单位)
    Size        keysize;        /* hash key length in bytes */
    //段大小,必须为2的幂
    long        ssize;          /* segment size --- must be power of 2 */
    //段偏移,ssize的对数
    int         sshift;         /* segment shift = log2(ssize) */
};
/*
 * Header structure for a hash table --- contains all changeable info
 * 哈希表的头部结构 -- 存储所有可变信息
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 * 在共享内存哈希表中,HASHHDR位于共享内存中,每一个后台进程都有一个本地HTAB结构.
 * 对于非共享哈希表,HASHHDR和HTAB没有任何功能性的不同,
 * 但无论如何,我们还是把它们区分为共享和非共享表.
 */
struct HASHHDR
{
    /*
     * The freelist can become a point of contention in high-concurrency hash
     * tables, so we use an array of freelists, each with its own mutex and
     * nentries count, instead of just a single one.  Although the freelists
     * normally operate independently, we will scavenge entries from freelists
     * other than a hashcode's default freelist when necessary.
     * 在高并发的哈希表中,空闲链表会成为竞争热点,因此我们使用空闲链表数组,
     *   数组中的每一个元素都有自己的mutex和条目统计,而不是使用一个.
     *
     * If the hash table is not partitioned, only freeList[0] is used and its
     * spinlock is not used at all; callers' locking is assumed sufficient.
     * 如果哈希表没有分区,那么只有freelist[0]元素是有用的,自旋锁没有任何用处;
     * 调用者锁定被认为已足够OK.
     */
    /* Number of freelists to be used for a partitioned hash table. */
    //#define NUM_FREELISTS           32
    FreeListData freeList[NUM_FREELISTS];
    /* These fields can change, but not in a partitioned table */
    //这些域字段可以改变,但不适用于分区表
    /* Also, dsize can't change in a shared table, even if unpartitioned */
    //同时,就算是非分区表,共享表的dsize也不能改变
    //目录大小
    long        dsize;          /* directory size */
    //已分配的段大小(<= dsize)
    long        nsegs;          /* number of allocated segments (<= dsize) */
    //正在使用的最大桶ID
    uint32      max_bucket;     /* ID of maximum bucket in use */
    //进入整个哈希表的模掩码
    uint32      high_mask;      /* mask to modulo into entire table */
    //进入低位哈希表的模掩码
    uint32      low_mask;       /* mask to modulo into lower half of table */
    /* These fields are fixed at hashtable creation */
    //下面这些字段在哈希表创建时已固定
    //哈希键大小(以字节为单位)
    Size        keysize;        /* hash key length in bytes */
    //所有用户元素大小(以字节为单位)
    Size        entrysize;      /* total user element size in bytes */
    //分区个数(2的幂),或者为0
    long        num_partitions; /* # partitions (must be power of 2), or 0 */
    //目标的填充因子
    long        ffactor;        /* target fill factor */
    //如目录是固定大小,则该值为dsize的上限值
    long        max_dsize;      /* 'dsize' limit if directory is fixed size */
    //段大小,必须是2的幂
    long        ssize;          /* segment size --- must be power of 2 */
    //段偏移,ssize的对数
    int         sshift;         /* segment shift = log2(ssize) */
    //一次性分配的条目个数
    int         nelem_alloc;    /* number of entries to allocate at once */
#ifdef HASH_STATISTICS
    /*
     * Count statistics here.  NB: stats code doesn't bother with mutex, so
     * counts could be corrupted a bit in a partitioned table.
     * 统计信息.
     * 注意:统计相关的代码不会影响mutex,因此对于分区表,统计可能有一点点问题
     */
    long        accesses;
    long        collisions;
#endif
};
/*
 * Per-freelist data.
 * 空闲链表数据.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 * 在一个分区哈希表中,每一个空闲链表与特定的hashcodes集合相关,通过下面的FREELIST_IDX()宏进行定义.
 * nentries跟踪有这些hashcodes的仍存活的hashtable条目个数.
 * (注意不要搞错,不是空闲的条目个数)
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 * 空闲链表的覆盖范围可能比一个分区多或少,因此需要自己的锁而不能仅仅依赖调用者的锁.
 * 依赖调用者锁在覆盖面一样的情况下也不会起效,因为偶尔需要从另一个自由列表“借用”条目,详细参见get_hash_entry()
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 * 使用FreeListData数组而不是一个独立的mutexes,nentries和freelists数组有助于减少不同mutexes之间的缓存线共享.
 */
typedef struct
{
    //该空闲链表的自旋锁
    slock_t     mutex;          /* spinlock for this freelist */
    //相关桶中的条目个数
    long        nentries;       /* number of entries in associated buckets */
    //空闲元素链
    HASHELEMENT *freeList;      /* chain of free elements */
} FreeListData;
/*
 * HASHELEMENT is the private part of a hashtable entry.  The caller's data
 * follows the HASHELEMENT structure (on a MAXALIGN'd boundary).  The hash key
 * is expected to be at the start of the caller's hash entry data structure.
 * HASHELEMENT是哈希表条目的私有部分.
 * 调用者的数据按照HASHELEMENT结构组织(位于MAXALIGN的边界).
 * 哈希键应位于调用者hash条目数据结构的开始位置.
 */
typedef struct HASHELEMENT
{
    //链接到相同桶中的下一个条目
    struct HASHELEMENT *link;   /* link to next entry in same bucket */
    //该条目的哈希函数结果
    uint32      hashvalue;      /* hash function result for this entry */
} HASHELEMENT;
/* Hash table header struct is an opaque type known only within dynahash.c */
//哈希表头部结构,非透明类型,用于dynahash.c
typedef struct HASHHDR HASHHDR;
/* Hash table control struct is an opaque type known only within dynahash.c */
//哈希表控制结构,非透明类型,用于dynahash.c
typedef struct HTAB HTAB;
/* Parameter data structure for hash_create */
//hash_create使用的参数数据结构
/* Only those fields indicated by hash_flags need be set */
//根据hash_flags标记设置相应的字段
typedef struct HASHCTL
{
    //分区个数(必须是2的幂)
    long        num_partitions; /* # partitions (must be power of 2) */
    //段大小
    long        ssize;          /* segment size */
    //初始化目录大小
    long        dsize;          /* (initial) directory size */
    //dsize上限
    long        max_dsize;      /* limit to dsize if dir size is limited */
    //填充因子
    long        ffactor;        /* fill factor */
    //哈希键大小(字节为单位)
    Size        keysize;        /* hash key length in bytes */
    //参见上述数据结构注释
    Size        entrysize;      /* total user element size in bytes */
    //
    HashValueFunc hash;         /* hash function */
    HashCompareFunc match;      /* key comparison function */
    HashCopyFunc keycopy;       /* key copying function */
    HashAllocFunc alloc;        /* memory allocator */
    MemoryContext hcxt;         /* memory context to use for allocations */
    //共享内存中的哈希头部结构地址
    HASHHDR    *hctl;           /* location of header in shared mem */
} HASHCTL;
/* A hash bucket is a linked list of HASHELEMENTs */
//哈希桶是HASHELEMENTs链表
typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
//hash segment是桶数组
typedef HASHBUCKET *HASHSEGMENT;
/*
 * Hash functions must have this signature.
 * Hash函数必须有它自己的标识
 */
typedef uint32 (*HashValueFunc) (const void *key, Size keysize);
 /*
 * Key comparison functions must have this signature.  Comparison functions
 * return zero for match, nonzero for no match.  (The comparison function
 * definition is designed to allow memcmp() and strncmp() to be used directly
 * as key comparison functions.)
 * 哈希键对比函数必须有自己的标识.
 * 如匹配则对比函数返回0,不匹配返回非0.
 * (对比函数定义被设计为允许在对比键值时可直接使用memcmp()和strncmp())
 */
typedef int (*HashCompareFunc) (const void *key1, const void *key2,
 Size keysize);
 /*
 * Key copying functions must have this signature.  The return value is not
 * used.  (The definition is set up to allow memcpy() and strlcpy() to be
 * used directly.)
 * 键拷贝函数必须有自己的标识.
 * 返回值无用.
 */
typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize);
/*
 * Space allocation function for a hashtable --- designed to match malloc().
 * Note: there is no free function API; can't destroy a hashtable unless you
 * use the default allocator.
 * 哈希表的恐惧分配函数 -- 被设计为与malloc()函数匹配.
 * 注意:这里没有释放函数API;不能销毁哈希表,除非使用默认的分配器.
 */
typedef void *(*HashAllocFunc) (Size request);

其结构如下图所示：
PostgreSQL 源码解读（248）- HTAB动态扩展图解#2

扩展后的结构如下图所示:
PostgreSQL 源码解读（248）- HTAB动态扩展图解#2

二、源码解读

主要的函数是expand_table
1.分配新桶,HTAB的最大桶数max_bucket+1
2.根据新桶号计算段号和段内编号
3.如需扩展段,则扩展(*2)
4.获取新桶号对应的原桶号,目的是为了把原桶号中的数据迁移到新桶中.新桶号和原桶号相差low_mask
5.扫描旧桶,重建旧桶元素链表,构造新桶元素链表


/*
 * Expand the table by adding one more hash bucket.
 * 通过增加一个或者多个hash bucket扩展hash表
 */
static bool
expand_table(HTAB *hashp)
{
    HASHHDR    *hctl = hashp->hctl;//hash控制结构
    HASHSEGMENT old_seg,//原seg
                new_seg;//新seg
    long        old_bucket,//原bucket
                new_bucket;//新bucket
    long        new_segnum,//新seg号
                new_segndx;//新seg索引(segment中的编号)
    long        old_segnum,//新seg号
                old_segndx;//原seg索引
    HASHBUCKET *oldlink,//原桶
               *newlink;//新桶
    HASHBUCKET  currElement,//当前元素
                nextElement;//下一元素
    //#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
    Assert(!IS_PARTITIONED(hctl));
#ifdef HASH_STATISTICS
    hash_expansions++;
#endif
    new_bucket = hctl->max_bucket + 1;//新增加一个bucket
    new_segnum = new_bucket >> hashp->sshift;//取商数
    new_segndx = MOD(new_bucket, hashp->ssize);//取余数
    if (new_segnum >= hctl->nsegs)
    {
        //扩展segment,每次扩展一倍
        /* Allocate new segment if necessary -- could fail if dir full */
        if (new_segnum >= hctl->dsize)
            if (!dir_realloc(hashp))
                return false;
        if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))//为新的seg对应的bucket分配空间
            return false;
        hctl->nsegs++;
    }
    /* OK, we created a new bucket */
    //已完成创建
    hctl->max_bucket++;
    /*
     * *Before* changing masks, find old bucket corresponding to same hash
     * values; values in that bucket may need to be relocated to new bucket.
     * Note that new_bucket is certainly larger than low_mask at this point,
     * so we can skip the first step of the regular hash mask calc.
     * 在修改掩码前,为新的bucket找到对应的原bucket,原bucket中的元素keneng需要迁移到新的bucket上.
     * 注意new_bucket肯定会比low_mask要大,可以跳过常规的hash掩码计算的第一个步骤.
     */
    old_bucket = (new_bucket & hctl->low_mask);
    /*
     * If we crossed a power of 2, readjust masks.
     * 如果new_bucket是2的n次方,调整掩码
     */
    if ((uint32) new_bucket > hctl->high_mask)
    {
        hctl->low_mask = hctl->high_mask;//如15->31
        hctl->high_mask = (uint32) new_bucket | hctl->low_mask;//如31->63
    }
    /*
     * Relocate records to the new bucket.  NOTE: because of the way the hash
     * masking is done in calc_bucket, only one old bucket can need to be
     * split at this point.  With a different way of reducing the hash value,
     * that might not be true!
     * 重定位记录到新的bucket上.
     * 注意:由于通过方法calc_bucket计算hash掩码,这时只需要拆分一个bucket.
     * 
     */
    old_segnum = old_bucket >> hashp->sshift;//计算原seg号
    old_segndx = MOD(old_bucket, hashp->ssize);//计算原seg中的索引号
    old_seg = hashp->dir[old_segnum];//旧seg
    new_seg = hashp->dir[new_segnum];//新seg
    oldlink = &old_seg[old_segndx];//原bucket指针
    newlink = &new_seg[new_segndx];//新bucket指针
    for (currElement = *oldlink;
         currElement != NULL;
         currElement = nextElement)//循环遍历
    {
        nextElement = currElement->link;
        if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
        {
            *oldlink = currElement;
            oldlink = &currElement->link;//重新构造原bucket
        }
        else
        {
            *newlink = currElement;//构造新bucket
            newlink = &currElement->link;
        }
    }
    /* don't forget to terminate the rebuilt hash chains... */
    //不要忘了终止重建后的hash链
    *oldlink = NULL;
    *newlink = NULL;
    return true;
}
static bool
dir_realloc(HTAB *hashp)
{
    HASHSEGMENT *p;
    HASHSEGMENT *old_p;
    long        new_dsize;
    long        old_dirsize;
    long        new_dirsize;
    if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
        return false;
    /* Reallocate directory */
    new_dsize = hashp->hctl->dsize << 1;
    old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    new_dirsize = new_dsize * sizeof(HASHSEGMENT);
    old_p = hashp->dir;
    CurrentDynaHashCxt = hashp->hcxt;
    p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
    if (p != NULL)
    {
        memcpy(p, old_p, old_dirsize);
        MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
        hashp->dir = p;
        hashp->hctl->dsize = new_dsize;
        /* XXX assume the allocator is palloc, so we know how to free */
        Assert(hashp->alloc == DynaHashAlloc);
        pfree(old_p);
        return true;
    }
    return false;
}
static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
    HASHSEGMENT segp;
    CurrentDynaHashCxt = hashp->hcxt;
    segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
    if (!segp)
        return NULL;
    MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
    return segp;
}

三、跟踪分析

测试脚本

[local:/data/run/pg12]:5120 pg12@testdb=# \d t_expand;
              Table "public.t_expand"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 id     | integer |           |          | 
[local:/data/run/pg12]:5120 pg12@testdb=# select count(*) from t_expand;
  count  
---------
 2000000
(1 row)
[local:/data/run/pg12]:5120 pg12@testdb=# select * from t_expand;
...

启动gdb跟踪

(gdb) b hash_search_with_hash_value
Breakpoint 2 at 0xa790f2: file dynahash.c, line 925.
(gdb) c
Continuing.
Breakpoint 2, hash_search_with_hash_value (hashp=0x224eac8, keyPtr=0x7fffed717700, hashvalue=2252448879, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:925
925     HASHHDR    *hctl = hashp->hctl; --> hash控制结构体
(gdb) n
926     int         freelist_idx = FREELIST_IDX(hctl, hashvalue);--> 空闲链表
(gdb) p *hctl
$1 = {freeList = {{mutex = 0 '\000', nentries = 0, freeList = 0x22504d0}, {mutex = 0 '\000', nentries = 0, freeList = 0x0} }, dsize = 256, nsegs = 1, max_bucket = 15, 
  high_mask = 31, low_mask = 15, keysize = 20, entrysize = 72, num_partitions = 0, ffactor = 1, max_dsize = -1, ssize = 256, sshift = 8, nelem_alloc = 46}
(gdb) n
949     if (action == HASH_ENTER || action == HASH_ENTER_NULL) 
(gdb) 
956         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
(gdb) 
957             hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && --> 判断是否需要扩展
(gdb) 
956         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
(gdb) 
965     bucket = calc_bucket(hctl, hashvalue);-->计算hash桶
(gdb) step
calc_bucket (hctl=0x224eb60, hash_val=2252448879) at dynahash.c:871
871     bucket = hash_val & hctl->high_mask;-->先行与high_mask(31)进行掩码运算
(gdb) n
872     if (bucket > hctl->max_bucket)-->得到的结果如何比max_bucket还大,那要跟low_mask(15)进行掩码运算
(gdb) p bucket
$2 = 15
(gdb) n
875     return bucket;
(gdb) l
870 
871     bucket = hash_val & hctl->high_mask;
872     if (bucket > hctl->max_bucket)
873         bucket = bucket & hctl->low_mask;
874 
875     return bucket;
876 }
877 
878 /*
879  * hash_search -- look up key in table and perform action
(gdb) n
876 }
(gdb) 
hash_search_with_hash_value (hashp=0x224eac8, keyPtr=0x7fffed717700, hashvalue=2252448879, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:967
967     segment_num = bucket >> hashp->sshift;-->seg号,相当于15/256,结果为0
(gdb) 
968     segment_ndx = MOD(bucket, hashp->ssize);-->seg内编号,相当于15/256取模,结果为15
(gdb) 
970     segp = hashp->dir[segment_num];
(gdb) 
972     if (segp == NULL)
(gdb) p segment_num
$3 = 0
(gdb) p segment_ndx
$4 = 15
(gdb) n
975     prevBucketPtr = &segp[segment_ndx];
(gdb) 
976     currBucket = *prevBucketPtr;
(gdb) 
981     match = hashp->match;       /* save one fetch in inner loop */
(gdb) 
982     keysize = hashp->keysize;   /* ditto */
(gdb) 
984     while (currBucket != NULL)
(gdb) 
997     if (foundPtr)
(gdb) 
998         *foundPtr = (bool) (currBucket != NULL);
(gdb) 
1003        switch (action)
(gdb) 
1047                if (currBucket != NULL)
(gdb) 
1051                if (hashp->frozen)
(gdb) 
1055                currBucket = get_hash_entry(hashp, freelist_idx);
(gdb) 
1056                if (currBucket == NULL)
(gdb) 
1073                *prevBucketPtr = currBucket;
(gdb) 
1074                currBucket->link = NULL;
(gdb) 
1077                currBucket->hashvalue = hashvalue;
(gdb) 
1078                hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
(gdb) 
1087                return (void *) ELEMENTKEY(currBucket);
(gdb) 
1093    }
(gdb) 
hash_search (hashp=0x224eac8, keyPtr=0x7fffed717700, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:916
916 }
(gdb)

四、参考资料

N/A

网页名称：PostgreSQL源码解读（248）-HTAB动态扩展图解#2
当前地址：http://pwwzsj.com/article/isjcoj.html

平武建站

PostgreSQL源码解读（248）-HTAB动态扩展图解#2

一、数据结构

二、源码解读

三、跟踪分析

四、参考资料

其他资讯