记一次扒 nginx ingress 的过程

背景

最近因为 nginx 网关的问题，导致服务有诸多不稳定的风险，因此需要更进一步去做网关的工作。

之前预判过将来会因无法自定义负载均衡而达不到目标，于是探索过 nginx-ingress 增加自定义负载均衡的方案，详情参见记一次有状态服务的负载均衡方案探索，但由于当时业务还没有真正遇到问题，所以也没继续去推这个事儿。

不过当时的探索比较粗浅，是抱着 能简单解决问题 的目的做的，现在，需要有更多的梳理，以降低大家对这件事的认知复杂度。

nginx ingress

在 k8s 中，我们有多种提供对外访问的方式，其中业务中用的最多的，就是 ingress controller。市面上提供的各类 ingress controller 非常多，例如 envoy、traefik、apisix、openresty 等等，我们选用的，是运维同学都比较熟悉的 nginx-ingress-controller。

nginx 的官网参考 nginx.org
nginx ingress controller 的官网参考 ingress-nginx , github ingress-nginx

先看一下 nginx ingress controller 的工作原理

在这个过程中，ingress-nginx 的职责是 接收 api-server 中的资源变化，并转化成 nginx 需要的格式，通过 http 发送给 nginx

nginx

nginx 依然是我们熟悉的那个 nginx，但是和传统我们使用的方式不同，所有资源变化的处理，全都是交给 xxx_by_lua ，包括我们本次最需要被扒的 balancer 。

balancer 的入口文件是 lua/balancer.lua ，可以看到有这样的内容：

local ngx_balancer = require("ngx.balancer")
local round_robin = require("balancer.round_robin")
local chash = require("balancer.chash")
local chashsubset = require("balancer.chashsubset")
local sticky_balanced = require("balancer.sticky_balanced")
local sticky_persistent = require("balancer.sticky_persistent")
local ewma = require("balancer.ewma")

-- Algorithm name used when a backend does not specify "load-balance", and the
-- fallback when the requested name has no entry in IMPLEMENTATIONS.
local DEFAULT_LB_ALG = "round_robin"
-- Registry mapping a load-balancing algorithm name to the module (required
-- above) that implements it.
local IMPLEMENTATIONS = {
  round_robin = round_robin,
  chash = chash,
  chashsubset = chashsubset,
  sticky_balanced = sticky_balanced,
  sticky_persistent = sticky_persistent,
  ewma = ewma,
}

local _M = {}
local balancers = {}


-- Pick the balancer implementation module for a backend.
--
-- Selection order:
--   1. cookie session affinity  -> "sticky_persistent" or "sticky_balanced"
--   2. "upstream-hash-by" set   -> "chashsubset" or "chash"
--   3. backend["load-balance"], or DEFAULT_LB_ALG when it is absent
-- A name with no entry in IMPLEMENTATIONS is logged and replaced by the
-- default implementation.
--
-- @param backend  backend configuration table
-- @return the implementation module taken from IMPLEMENTATIONS
local function get_implementation(backend)
  local requested = backend["load-balance"]
  local name = requested or DEFAULT_LB_ALG

  local affinity = backend["sessionAffinityConfig"]
  local hash_cfg = backend["upstreamHashByConfig"]

  if affinity and affinity["name"] == "cookie" then
    -- both alternatives are non-empty strings, so and/or is a safe ternary
    name = (affinity["mode"] == "persistent") and "sticky_persistent"
           or "sticky_balanced"

  elseif hash_cfg and hash_cfg["upstream-hash-by"] then
    name = hash_cfg["upstream-hash-by-subset"] and "chashsubset" or "chash"
  end

  local implementation = IMPLEMENTATIONS[name]
  if implementation then
    return implementation
  end

  ngx.log(ngx.WARN, requested, " is not supported, ",
          "falling back to ", DEFAULT_LB_ALG)
  return IMPLEMENTATIONS[DEFAULT_LB_ALG]
end


-- Expose the module's local functions through the metatable's __index table:
-- a key that is missing from _M itself is looked up in this table.
-- NOTE(review): sync_backend, route_to_alternative_balancer, get_balancer and
-- get_balancer_by_upstream_name are not defined in the excerpt shown here;
-- presumably they live in the omitted part of lua/balancer.lua.
setmetatable(_M, {__index = {
  get_implementation = get_implementation,
  sync_backend = sync_backend,
  route_to_alternative_balancer = route_to_alternative_balancer,
  get_balancer = get_balancer,
  get_balancer_by_upstream_name = get_balancer_by_upstream_name,
}})

return _M

其核心作用，就是做 balancer 的注册及获取。

我们用的是 chash 的方式，对应的代码在 lua/balancer/chash.lua ，内容如下：

local balancer_resty = require("balancer.resty")
local resty_chash = require("resty.chash")
local util = require("util")
local ngx_log = ngx.log
local ngx_ERR = ngx.ERR
local setmetatable = setmetatable

local _M = balancer_resty:new({ factory = resty_chash, name = "chash" })

-- Build a consistent-hash balancer object for one backend.
--
-- The hash ring is created by self.factory from the nodes derived from
-- backend.endpoints; the "upstream-hash-by" expression is parsed once here
-- and stored in hash_by. A parse failure is only logged: construction
-- continues with whatever util.parse_complex_value returned.
--
-- @param self     the balancer class table, used as the metatable
-- @param backend  backend configuration table
-- @return the new balancer object
function _M.new(self, backend)
  local nodes = util.get_nodes(backend.endpoints)

  local hash_config = backend["upstreamHashByConfig"]
  local hash_by, parse_err =
    util.parse_complex_value(hash_config["upstream-hash-by"])
  if parse_err ~= nil then
    ngx_log(ngx_ERR, "could not parse the value of the upstream-hash-by: ", parse_err)
  end

  local balancer = {}
  balancer.instance = self.factory:new(nodes)
  balancer.hash_by = hash_by
  balancer.traffic_shaping_policy = backend.trafficShapingPolicy
  balancer.alternative_backends = backend.alternativeBackends

  -- method lookups on the object fall through to the class table
  self.__index = self
  return setmetatable(balancer, self)
end

-- Choose an endpoint for the current request: evaluate the stored hash_by
-- expression and look the resulting key up on the hash ring.
--
-- @return whatever self.instance:find returns for that key
function _M.balance(self)
  local hash_key = util.generate_var_value(self.hash_by)
  local ring = self.instance
  return ring:find(hash_key)
end

return _M

其实这里面什么都没做，就是封装了一下 resty.chash ，继续扒一下这个库的源码 openresty/lua-resty-balancer/lib/resty/chash.lua，内容如下：

-- C declarations for the point type and the ring-maintenance functions that
-- this module calls through `clib` below.
ffi.cdef[[
typedef unsigned int uint32_t;

typedef struct {
    uint32_t hash;
    uint32_t id;
} chash_point_t;

void chash_point_init(chash_point_t *points, uint32_t base_hash, uint32_t start,
    uint32_t num, uint32_t id);
int chash_point_sort(chash_point_t *points, uint32_t size);

int chash_point_add(chash_point_t *old_points, uint32_t old_length,
    uint32_t base_hash, uint32_t from, uint32_t num, uint32_t id,
    chash_point_t *new_points);
int chash_point_reduce(chash_point_t *old_points, uint32_t old_length,
    uint32_t base_hash, uint32_t from, uint32_t num, uint32_t id);
void chash_point_delete(chash_point_t *old_points, uint32_t old_length,
    uint32_t id);
]]

local _M = {}

-- The module cannot work without the native library, so a load failure is
-- raised immediately. (load_shared_lib is a helper not shown in this excerpt.)
local clib = load_shared_lib("librestychash")
if not clib then
    error("can not load librestychash")
end

-- Multiplied by a node's weight to get its number of ring points.
local CONSISTENT_POINTS = 160   -- points per server
-- 2^32: size of the 32-bit hash space, used to estimate a point index
-- from a hash value.
local pow32 = math.pow(2, 32)

-- FFI type for a variable-length array of chash_point_t.
local chash_point_t = ffi.typeof("chash_point_t[?]")


local function _precompute(nodes)
	// 省略
    for id, weight in pairs(nodes) do
		// 省略
        clib.chash_point_init(points, base_hash, start, num, index)
    end

    if clib.chash_point_sort(points, npoints) ~= CHASH_OK then
        error("no memory")
    end

    return ids, points, npoints, newnodes
end

function _M.new(_, nodes)
	// 省略
end

function _M.reinit(self, nodes)
	// 省略
end

local function _delete(self, id)
	// 省略
    clib.chash_point_delete(self.points, self.npoints, index)
end

local function _incr(self, id, weight)
	// 省略
    local base_hash = bxor(crc32(tostring(id)), 0xffffffff)
    local rc = clib.chash_point_add(self.points, self.npoints, base_hash,old_weight * CONSISTENT_POINTS, weight * CONSISTENT_POINTS,index, new_points)
	// 省略
end

local function _decr(self, id, weight)
	// 省略
    local rc = clib.chash_point_reduce(self.points, self.npoints, base_hash,from, num, index)
end



-- Locate the ring point responsible for `hash`.
--
-- `points` is indexed from 0 to npoints - 1 and is expected to be ordered by
-- ascending hash. The search starts from an estimated position (hash scaled
-- into the index range) and walks left or right until it reaches the first
-- point whose hash is >= `hash`; when no such point exists it wraps around to
-- point 0.
--
-- @return the id stored in the chosen point, and the point's index
local function _find_id(points, npoints, hash)
    local last = npoints - 1

    -- estimated position, clamped to the last valid index
    local guess = floor(hash / (pow32 / npoints))
    if guess > last then
        guess = last
    end

    if points[guess].hash < hash then
        -- the estimate is too far left: scan forward
        for pos = guess + 1, last do
            if points[pos].hash >= hash then
                return points[pos].id, pos
            end
        end

        -- nothing at or above hash: wrap around to the start of the ring
        return points[0].id, 0
    end

    -- the estimate is at or past the target: scan backward for the first
    -- point whose predecessor is still below hash
    for pos = guess, 1, -1 do
        if points[pos - 1].hash < hash then
            return points[pos].id, pos
        end
    end

    return points[0].id, 0
end


-- Map a key to a node: hash the key's string form with crc32, find the
-- matching ring point, and translate the point's id through self.ids.
--
-- @param key  any value; it is converted with tostring before hashing
-- @return self.ids[<point id>], and the index of the chosen point
function _M.find(self, key)
    local key_hash = crc32(tostring(key))
    local point_id, point_index = _find_id(self.points, self.npoints, key_hash)

    return self.ids[point_id], point_index
end

return _M

可以看到，这个文件实现了 find 的方法，而底层 hash 环维护的事交给了 librestychash 这个 clib，继续看一下 openresty/lua-resty-balancer/chash.c，代码如下：

static uint32_t crc32_table256[] = {
    0x00000000, 0x77073096, 0xee0e612c, 0x990951ba
    // 省略……
};


/*
 * Feed `len` bytes starting at `p` into the running CRC stored in *crc,
 * one byte at a time through the crc32_table256 lookup table.
 * The updated value is written back to *crc.
 */
static inline void
crc32_update(uint32_t *crc, u_char *p, size_t len)
{
    uint32_t  acc;
    size_t    i;

    acc = *crc;

    for (i = 0; i < len; i++) {
        acc = crc32_table256[(acc ^ p[i]) & 0xff] ^ (acc >> 8);
    }

    *crc = acc;
}


/*
 * Generate ring points for one node and store them in arr[start ...].
 *
 * The hashes form a chain: every round starts again from base_hash and mixes
 * in the 4 bytes of the previous round's hash (all zero for the first round).
 * Rounds 0 .. from-1 are computed only to advance the chain; the following
 * `num` rounds are written out, each tagged with `id`.
 */
static inline void
chash_point_init_crc(chash_point_t *arr, uint32_t start, uint32_t base_hash,
    uint32_t from, uint32_t num, uint32_t id)
{
    chash_point_t  *out;
    uint32_t        round, total, hash;
    u_char          prev[4] = { 0, 0, 0, 0 };

    out = arr + start;
    total = from + num;

    for (round = 0; round < total; round++) {
        hash = base_hash;
        crc32_update(&hash, prev, 4);
        crc32_final(hash);

        if (round >= from) {
            out->hash = hash;
            out->id = id;
            out++;
        }

        /*
         * Store the hash least-significant byte first, explicitly, so the
         * chain does not depend on the host byte order.
         */
        prev[0] = (u_char) (hash & 0xff);
        prev[1] = (u_char) ((hash >> 8) & 0xff);
        prev[2] = (u_char) ((hash >> 16) & 0xff);
        prev[3] = (u_char) ((hash >> 24) & 0xff);
    }
}


/*
 * Public entry point: write the first `num` ring points of node `id` into
 * arr[start ...], derived from base_hash.
 *
 * Thin wrapper around chash_point_init_crc with from = 0. Note the argument
 * order differs between the two functions: (base_hash, start) here versus
 * (start, base_hash) in the helper.
 */
void
chash_point_init(chash_point_t *arr, uint32_t base_hash, uint32_t start,
    uint32_t num, uint32_t id)
{
    chash_point_init_crc(arr, start, base_hash, 0, num, id);
}


/*
 * Sort the n points of arr by hash, in place.
 *
 * Instead of a comparison sort, the points are distributed into a zeroed
 * scratch table of m slots, where m is the smallest power of two greater
 * than n * 1.6. Each point is aimed at slot hash / step and inserted near
 * it, shifting neighbouring entries left or right to keep the table ordered.
 * The occupied slots are then copied back to arr in order.
 *
 * Returns CHASH_OK on success, CHASH_ERR when the scratch table cannot be
 * allocated.
 *
 * NOTE(review): a slot with id == 0 is treated as empty, so this appears to
 * require every real point to carry a non-zero id - confirm against callers.
 */
int
chash_point_sort(chash_point_t arr[], uint32_t n)
{
    chash_point_t *points;
    chash_point_t *node;
    int i, j, index, start, end;
    uint32_t min_sz, m, step;

    /* not sure 1.6 is the best */
    min_sz = n * 1.6;
    m = 2;

    /* grow m to the first power of two strictly above min_sz */
    while (m <= min_sz) {
        m *= 2;
    }

    /* width of the hash range covered by one scratch slot */
    step = pow(2, 32) / m;

    /* calloc zero-fills the table, so every slot starts out empty (id == 0) */
    points = (chash_point_t *) calloc(m, sizeof(chash_point_t));
    if (points == NULL) {
        return CHASH_ERR;
    }

    for (i = 0; i < n; i++) {
        node = &arr[i];
        /* ideal slot for this hash */
        index = node->hash / step;

        assert(index < m); // index must less than m

        /*
         * walk left from the ideal slot: stop at an empty slot (insert
         * there) or at the first entry that is not greater than the node
         */
        for (end = index; end >= 0; end--) {
            if (points[end].id == 0) {
                goto insert;
            }

            if (node->hash >= points[end].hash) {
                break;
            }
        }

        /* look for an empty slot further left to make room by shifting */
        for (start = end - 1; start >= 0; start--) {
            if (points[start].id == 0) {
                /* left shift before end */
                for (j = start; j < end; j++) {
                    points[j].hash = points[j + 1].hash;
                    points[j].id = points[j + 1].id;
                }

                /* points[end] is empty now */

                /* left shift after end when node->hash is bigger than them */
                /* only end == index can match this */
                while (end + 1 < m
                       && points[end + 1].id != 0
                       && points[end + 1].hash < node->hash)
                {
                    points[end].hash = points[end + 1].hash;
                    points[end].id = points[end + 1].id;
                    end += 1;
                }

                goto insert;
            }
        }

        /* full before index, try to append */

        /*
         * walk right: stop at an empty slot (insert there) or at the first
         * entry greater than the node
         */
        for (end = end + 1; end < m; end++) {
            if (points[end].id == 0) {
                goto insert;
            }

            if (node->hash < points[end].hash) {
                break;
            }
        }

        /* find the next empty slot on the right to shift into */
        for (start = end + 1; start < m; start++) {
            if (points[start].id == 0) {
                break;
            }
        }

        /* right shift */
        for (j = start; j > end; j--) {
            points[j].hash = points[j - 1].hash;
            points[j].id = points[j - 1].id;
        }

insert:
        assert(end < m && end >= 0);

        points[end].id = node->id;
        points[end].hash = node->hash;
    }

    /* compact the occupied slots back into arr, preserving their order */
    j = 0;
    for (i = 0; i < m; i++) {
        if (points[i].id != 0) {
            arr[j].id = points[i].id;
            arr[j].hash = points[i].hash;
            j++;
        }
    }

    free(points);

    return CHASH_OK;
}


/*
 * Add `num` points for node `id` to an existing ring.
 *
 * The new points (rounds from .. from+num-1 of the node's hash chain) are
 * generated and sorted in a temporary buffer, then merged with old_points
 * into new_points. The merge runs from the highest index downwards, so
 * new_points must have room for old_length + num entries.
 *
 * Returns CHASH_OK on success, CHASH_ERR if the temporary buffer cannot be
 * allocated or sorted.
 */
int
chash_point_add(chash_point_t *old_points, uint32_t old_length,
    uint32_t base_hash, uint32_t from, uint32_t num, uint32_t id,
    chash_point_t *new_points)
{
    int old_idx, add_idx, out_idx;
    chash_point_t *added;

    added = (chash_point_t *) calloc(num, sizeof(chash_point_t));
    if (added == NULL) {
        return CHASH_ERR;
    }

    chash_point_init_crc(added, 0, base_hash, from, num, id);

    if (chash_point_sort(added, num) != CHASH_OK) {
        free(added);
        return CHASH_ERR;
    }

    add_idx = num - 1;
    out_idx = old_length + num - 1;

    for (old_idx = old_length - 1; old_idx >= 0; old_idx--) {
        /* emit every added point that sorts after the current old point */
        while (add_idx >= 0
               && added[add_idx].hash > old_points[old_idx].hash)
        {
            new_points[out_idx--] = added[add_idx--];
        }

        new_points[out_idx--] = old_points[old_idx];
    }

    /* added points that sort before every old point */
    while (add_idx >= 0) {
        new_points[out_idx--] = added[add_idx--];
    }

    free(added);

    return CHASH_OK;
}


/*
 * Remove `num` points of node `id` from the ring, compacting old_points in
 * place.
 *
 * The points to drop (rounds from .. from+num-1 of the node's hash chain)
 * are regenerated and sorted in a temporary buffer; old_points is then
 * scanned once and every entry that matches the next pending hash and
 * belongs to `id` is skipped, while all other entries are moved down.
 *
 * The function does not report the new length; the caller is expected to
 * track it.
 *
 * Returns CHASH_OK on success, CHASH_ERR if the temporary buffer cannot be
 * allocated or sorted.
 */
int
chash_point_reduce(chash_point_t *old_points, uint32_t old_length,
    uint32_t base_hash, uint32_t from, uint32_t num, uint32_t id)
{
    /* ascending counters compared against uint32_t lengths */
    uint32_t i, j, k;
    chash_point_t *tmp_points;

    /*
     * Nothing to remove. Returning early also keeps a zero-size calloc,
     * which may legitimately return NULL, from being reported as an error.
     */
    if (num == 0) {
        return CHASH_OK;
    }

    tmp_points = (chash_point_t *) calloc(num, sizeof(chash_point_t));
    if (tmp_points == NULL) {
        /* same handling as chash_point_add: never write through NULL */
        return CHASH_ERR;
    }

    chash_point_init_crc(tmp_points, 0, base_hash, from, num, id);

    if (chash_point_sort(tmp_points, num) != CHASH_OK) {
        free(tmp_points);
        return CHASH_ERR;
    }

    for (i = 0, j = 0, k = 0; i < old_length; i++) {
        if (j < num
            && old_points[i].hash == tmp_points[j].hash
            && old_points[i].id == id)
        {
            /* this entry is one of the points being removed */
            j++;
            continue;
        }

        /* keep the entry, moving it down over any removed slots */
        if (i != k) {
            old_points[k].hash = old_points[i].hash;
            old_points[k].id = old_points[i].id;
        }
        k++;
    }

    free(tmp_points);

    return CHASH_OK;
}


/*
 * Remove every point owned by node `id`, compacting old_points in place.
 * Surviving entries keep their relative order. The new length is not
 * returned, and entries past the compacted prefix are left as they were.
 */
void
chash_point_delete(chash_point_t *old_points, uint32_t old_length, uint32_t id)
{
    int read_pos, write_pos;

    write_pos = 0;

    for (read_pos = 0; read_pos < old_length; read_pos++) {
        if (old_points[read_pos].id != id) {
            if (read_pos != write_pos) {
                old_points[write_pos] = old_points[read_pos];
            }

            write_pos++;
        }
    }
}

这就是 chash 本身的实现了，比较简单，主要的结构如下

/* One point on the consistent-hash ring; the ring itself is an array of these. */
typedef struct {
    uint32_t hash;  /* position of the point on the ring */
    uint32_t id;    /* id of the node that owns this point */
} chash_point_t;

这是一个 point，理解成是 hash 环上的每一个桩就行了，整个 hash 环使用数组来实现。

我们遇到的问题

背景场景

使用的 ingress-nginx 做应用网关，应用中有一个 websocket 的服务，会有大量的连接保持着。同时，为了让缓存发挥作用，我们使用了 nginx consistent hash 的方式让同一个 room 下的连接在同一个 pod 上。

存在隐患的场景有如下 2 个:

当 ingress 发生变化，ingress-nginx会在一段时间后强制停止 worker 进程。这会导致短时间内大量 websocket 重新连接，无异于一次攻击 🐶 ……
当服务进行更新部署时，若采用大批量更新，则会导致短时间大量重连，和场景 1 有异曲同工之效；若采用小批量滚动更新，则会导致部分用户会进行多次重连。

可能的思路

能否让 hash 环不发生变化？

从 ingress-nginx 的 lua/balancer/chash.lua 中可以看到，hash 环生成的方式是以 endpoints 为基准的，如果 endpoints 不发生改变，则 hash 环不会发生改变。

function _M.new(self, backend)
  local nodes = util.get_nodes(backend.endpoints)
  // 省略……
  return o
end

ep 是每一个 pod 的 ip，会随着 pod 的更新而更新，所以答案几乎是否定的。(当然还是有一些奇怪的操作可以达到目标，但不常规操作还是少搞的好，不然就是大坑一个)

能否让负载均衡可控？

嗯，这个自然是可以的，基本 demo 可以查看记一次有状态服务的负载均衡方案探索

当然，为了让整套体系能够起作用，还需要做大量的体系性工作，之前有一个简单的 demo，可以查看用 operator 做点有趣的事

这里可以更细致地做一些原型出来

能不能让网关重启不会导致连接断掉？

按照调研，目前市面上能看到 mosn 是做了连接平滑迁移的，这里去看一下，它具体是怎么做到的，有什么优劣？

mosn 的代码地址： https://github.com/mosn/mosn
一篇关于 mosn 是怎么做的迁移： nginx vs envoy vs mosn 平滑升级原理

基本结论是： mosn 和 envoy 都使用了 UDS (unix domain sockets) 的技术，这是在 linux 内核 3.5+ 支持的一种文件描述符传递的方案。

追了一下 mosn 的代码，核心逻辑如下：


// transferHandler is called on recv transfer request.
//
// It serves a single request on c, which must be a *net.UnixConn, and closes
// c when done. The request is one of two kinds, told apart by whether
// transferRecvType yields a connection:
//   - transfer read:  a connection was received; read its pending data and
//     TLS buffers, rebuild the connection via transferNewConn, and reply
//     with the new connection id (or transferErr on failure).
//   - transfer write: no connection was received; read an id and a buffer,
//     look the connection up in transferMap, and write the buffer to it.
//
// Every failure is logged and ends the handler; a panic is recovered and
// logged as well.
func transferHandler(c net.Conn, handler types.ConnectionHandler, transferMap *sync.Map) {
	defer func() {
		if r := recover(); r != nil {
			log.DefaultLogger.Errorf("[network] [transfer] [handler] transferHandler panic %v", r)
		}
	}()

	defer c.Close()

	uc, ok := c.(*net.UnixConn)
	if !ok {
		log.DefaultLogger.Errorf("[network] [transfer] [handler] unexpected FileConn type; expected UnixConn, got %T", c)
		return
	}
	// recv type
	conn, err := transferRecvType(uc)
	if err != nil {
		log.DefaultLogger.Errorf("[network] [transfer] [handler] transferRecvType error :%v", err)
		return
	}

	if conn != nil {
		// transfer read
		// recv header + buffer
		dataBuf, tlsBuf, err := transferReadRecvData(uc)
		if err != nil {
			log.DefaultLogger.Errorf("[network] [transfer] [handler] transferRecvData error :%v", err)
			return
		}
		connection := transferNewConn(conn, dataBuf, tlsBuf, handler, transferMap)
		if connection != nil {
			transferSendID(uc, connection.id)
		} else {
			transferSendID(uc, transferErr)
		}
	} else {
		// transfer write
		// recv header + buffer
		id, buf, err := transferWriteRecvData(uc)
		if err != nil {
			log.DefaultLogger.Errorf("[network] [transfer] [handler] transferRecvData error :%v", err)
			// id and buf are not valid after a failed receive: stop here
			// instead of looking up and writing to a connection with them.
			return
		}
		connection := transferFindConnection(transferMap, uint64(id))
		if connection == nil {
			log.DefaultLogger.Errorf("[network] [transfer] [handler] transferFindConnection failed, id = %d", id)
			return
		}
		err = transferWriteBuffer(connection, buf)
		if err != nil {
			log.DefaultLogger.Errorf("[network] [transfer] [handler] transferWriteBuffer error :%v", err)
			return
		}
	}
}

关于具体细节的问题，可以再细致看一下代码，或者调试一下。

关于转移连接描述符的操作，代码如下：

// Old process side: send the fd.
//
// transferSendFD passes the descriptor behind file to the peer of uc as
// ancillary data (UnixRights), together with a one-byte payload of 0 that
// marks the message as a "transfer read". file is closed before returning.
// An error is returned when file is nil, when the write fails, or when fewer
// bytes than expected were written.
func transferSendFD(uc *net.UnixConn, file *os.File) error {
	if file == nil {
		return errors.New("transferSendFD conn is nil")
	}
	defer file.Close()

	// transfer read
	payload := []byte{0}
	oob := syscall.UnixRights(int(file.Fd()))

	n, oobn, err := uc.WriteMsgUnix(payload, oob, nil)
	switch {
	case err != nil:
		return fmt.Errorf("WriteMsgUnix: %v", err)
	case n != len(payload) || oobn != len(oob):
		return fmt.Errorf("WriteMsgUnix = %d, %d; want 1, %d", n, oobn, len(oob))
	}
	return nil
}

// New process side: receive the fd.
//
// transferRecvFD extracts exactly one file descriptor from the out-of-band
// data oob and wraps it in a net.Conn. It fails when oob does not parse, or
// when it does not contain exactly one control message carrying exactly one
// descriptor.
func transferRecvFD(oob []byte) (net.Conn, error) {
	scms, err := unix.ParseSocketControlMessage(oob)
	if err != nil {
		return nil, fmt.Errorf("ParseSocketControlMessage: %v", err)
	}
	if len(scms) != 1 {
		return nil, fmt.Errorf("expected 1 SocketControlMessage; got scms = %#v", scms)
	}
	scm := scms[0]
	gotFds, err := unix.ParseUnixRights(&scm)
	if err != nil {
		return nil, fmt.Errorf("unix.ParseUnixRights: %v", err)
	}
	if len(gotFds) != 1 {
		return nil, fmt.Errorf("wanted 1 fd; got %#v", gotFds)
	}
	f := os.NewFile(uintptr(gotFds[0]), "fd-from-old")
	// net.FileConn works on its own copy of the descriptor, so f is closed
	// here whether or not the conversion succeeds.
	defer f.Close()
	conn, err := net.FileConn(f)
	if err != nil {
		// report the actual failure, not the descriptor list
		return nil, fmt.Errorf("FileConn error :%v", err)
	}
	return conn, nil
}

这里可以画一下流程图，可能更好理解一些。

能否使用 UDS 解决业务升级问题？

实现上当然是 ok 的，但不建议这么搞，主要有以下原因：

业务不是网关，代码的稳定性是很不好的，需要经常变更，为了满足隐藏的一些需求，还是在业务层做特定机制的建设更好。
我们的服务都运行在容器中，对 k8s 而言，镜像是不可变的，平滑升级导致我们在不可变的镜像中使用了可变的进程，容易出问题。

我们究竟要解决什么问题？

让大规模重连尽量不要发生
- 网关独立，不被其他对网关的改动而影响
- 使用能平滑迁移的网关，如 mosn
- 业务应用平滑启动
  - 少量 pod 滚动更新
    - 坏处: 部分连接会多次重连(这里涉及 ws 服务目前均衡的具体实现，需要再细看代码) ；
    - 好处: 对数据库的压力较小；
    - 要解决的问题: ws 服务本身会不会被突然重连的压力搞出问题 => 加多一些 vnode
  - 蓝绿发布、连接迁移
让大规模重连也不会引发重大问题
- session 机制 (这个我觉得还是很有效的)
- 考虑用 redis、甚至内存的方式做连接信息管理
- 限流降级熔断机制
  - 网关层限流等
  - 业务层限流等
  - 前端自身限流等 (至少不要疯狂重连吧 😂 )
其他可能的问题
- 再细致地查一查，经常断掉是怎么发生的，会不会是心跳机制的问题
- 内存泄漏和协程泄漏往往是同时发生的，是不是可以把协程池弄上，然后监控起来
- 前端的策略还是有很多可讨论的，比如长期在后台的页面，是不是把连接断了不要连了
- 多个连接的价值需不需要再探讨一下？ (不一定，紧要性不高)
- 每天晚上的几千个连接还是有点问题的，查一查心里稳妥些

实际上，我们可能主要的问题还是 mongodb 索引少建了一个，在此基础上或许一切的问题都是可以堆一点资源就解决的 🐶 ……
业务有时候就是这样，解决问题或许比牛逼的技术有更大业务价值……

其他

openresty 库中还有很多实用的工具，之后有机会可以详细扒一下，例如:

├── aes.lua
├── chash.lua
├── cookie.lua
├── core
│   ├── base.lua
│   ├── base64.lua
│   ├── ctx.lua
│   ├── exit.lua
│   ├── hash.lua
│   ├── misc.lua
│   ├── ndk.lua
│   ├── phase.lua
│   ├── regex.lua
│   ├── request.lua
│   ├── response.lua
│   ├── shdict.lua
│   ├── time.lua
│   ├── uri.lua
│   ├── utils.lua
│   ├── var.lua
│   └── worker.lua
├── core.lua
├── limit
│   ├── conn.lua
│   ├── count.lua
│   ├── req.lua
│   └── traffic.lua
├── md5.lua
├── random.lua
├── roundrobin.lua
├── sha.lua
├── sha1.lua
├── sha224.lua
├── sha256.lua
├── sha384.lua
├── sha512.lua
├── string.lua
└── upload.lua

另一个常用的负载方式是 chashsubset ，实现在 chashsubset.lua ，后续可以继续扒一下
mosn 的实现中还有很多可以参考的地方，比如 wasm 的扩展方式、xprotocol 的扩展方式，看看还是有价值的

Anticipate the difficult by managing the easy.
— Laozi

code reading nginx nginx ingress ingress

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

关于文件存储的一些调研上一篇

记录一次在团队内的k8s分享下一篇