本文走读内核网络Netfilter子系统相关的源码。源码基于kernel 4.14版本。
Netfilter子系统包含数据包选择、过滤、修改,连接跟踪,网络地址转换(NAT)等内容。
Netfilter挂载点
在上篇《linux内核源码走读之IPv4及IPv6》文章中,我们在IPv4和IPv6的接收和发送路径中,看到过这些挂载点。
- NF_INET_PRE_ROUTING: 在IPv4中,这个挂载点位于方法ip_rcv()中。这是所有入站数据包遇到的第一个挂载点,它处在路由选择之前。
- NF_INET_LOCAL_IN: 在IPv4中,这个挂载点位于方法ip_local_deliver中。对于所有发给当前主机的入站数据包,经过挂载点NF_INET_PRE_ROUTING和路由选择子系统之后,都将到达这个挂载点。
- NF_INET_FORWARD: 在IPv4中,这个挂载点位于方法ip_forward()中。对于所有要转发的数据包,经过挂载点NF_INET_PRE_ROUTING和路由选择子系统之后,都将到达这个挂载点。
- NF_INET_POST_ROUTING: 在IPv4中,这个挂载点位于方法ip_output()中。所有要转发的数据包,都在经过挂载点NF_INET_FORWARD后到达这个挂载点。另外,当前主机生成的数据包经过挂载点NF_INET_LOCAL_OUT后将到达这个挂载点。
- NF_INET_LOCAL_OUT: 在IPv4中,这个挂载点位于方法__ip_local_out中。当前主机生成的所有出站数据包都在经过路由查找和此挂载点之后,到达挂载点NF_INET_POST_ROUTING。
内核网络代码中,一般通过NF_HOOK(在4.14中它是一个static inline函数,而不是宏)来调用在挂载点中注册的钩子函数。
static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)){ int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret;}//nf_hook并不调用okfn回调函数,NF_HOOK判断nf_hook返回值=1(表示允许包通过)调用okfnstatic inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) switch (pf) case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); return retint nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int s) for (; s < e->num_hook_entries; s++) //依次执行注册的hook函数,如果返回值是NF_ACCEPT,则表示调用者可进一步执行okfn verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); return entry->hook(entry->priv, skb, state);
Netfilter钩子回调函数返回值必须是下述五个值之一,这些值被称为netfilter verdicts(netfilter判决)
- NF_DROP: 默默丢弃数据包
- NF_ACCEPT: 数据包继续在内核协议栈中传输
- NF_STOLEN: 数据包不继续传输,由钩子方法进行处理
- NF_QUEUE: 将数据包排队,交由用户空间处理
- NF_REPEAT: 再次调用钩子函数
注册Netfilter钩子回调函数
注册Netfilter钩子回调函数的方法有两个nf_register_net_hook和nf_register_net_hooks。 4.13之前的内核版本还有两个注册接口nf_register_hook和nf_register_hooks, 从4.13版本开始内核删除了这两个接口,这两个接口最终也是调用nf_register_net_hook,下面看下nf_register_net_hook:
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) __nf_register_net_hook(net, reg->pf, reg) struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev) return net->nf.hooks_ipv4 + hooknum; //以pf==NFPROTO_IPV4为例。钩子挂载点保存在struct net对象中 p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); //将新的nf_hook_ops按照优先级插入到hook entries中
我们看到nf_register_net_hook一个入参是结构体struct nf_hook_ops,看下这个结构体:
typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state);struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; //要注册的钩子回调函数 struct net_device *dev; void *priv; u_int8_t pf; //协议簇,对于IPv4来说,它为NFPROTO_IPV4; IPV6, NFPROTO_IPV6 bool nat_hook; unsigned int hooknum; //netfilter的5个挂载点之一 /* Hooks are ordered in ascending priority. */ int priority; //按优先级升序排列回调函数,priority值越小回调函数越先被调用};
连接跟踪
现代网络中,仅根据L4和L3报头来过滤流量还不够,还应考虑基于会话对包进行处理。 连接跟踪能够让内核跟踪会话,连接跟踪的主要目标是为NAT打下基础。
连接跟踪初始化
先看下连接跟踪模块定义的netfilter挂载点对象数组,即结构体struct nf_hook_ops数组,其中定义了netfilter各挂载点上的处理函数。
static const struct nf_hook_ops ipv4_conntrack_ops[] = { { .hook = ipv4_conntrack_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_helper, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_helper, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, },};
注册的最重要的连接跟踪回调函数是,NF_INET_PRE_ROUTING钩子回调函数ipv4_conntrack_in和NF_INET_LOCAL_OUT钩子回调函数ipv4_conntrack_local。 这两个钩子函数的优先级为NF_IP_PRI_CONNTRACK(-200),优先级较高。 ipv4_conntrack_in和ipv4_conntrack_local都会调用到nf_conntrack_in,下一小节走读nf_conntrack_in。
继续看下注册这个ipv4_conntrack_ops的地方。在内核版本4.9及以前,直接在函数
nf_conntrack_l3proto_ipv4_init中调用nf_register_hooks来注册。 4.10及以后内核,不在nf_conntrack_l3proto_ipv4_init中直接注册ipv4_conntrack_ops,看下相关代码:
//nf_conntrack_l3proto_ipv4.c//nf_conntrack_l3proto_ipv4_init为nf_conntrack_ipv4.ko的初始化函数module_init(nf_conntrack_l3proto_ipv4_init);static int __init nf_conntrack_l3proto_ipv4_init(void) ... ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); //注册到全局变量nf_ct_l3protos中struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto,#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */#endif .net_ns_get = ipv4_hooks_register, //这里注册的函数用于注册连接跟踪的netfilter钩子 .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE,};//先看下ipv4_hooks_registerstatic int ipv4_hooks_register(struct net *net) struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); cnet->users++; if (cnet->users > 1) goto out_unlock; //只在第一次调用的时候往下走,之后的调用只是users计数+1 //注册连接跟踪的netfilter钩子 nf_register_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));//再看下调用nf_conntrack_l3proto_ipv4.net_ns_get的地方int nf_ct_netns_get(struct net *net, u8 nfproto) if (nfproto == NFPROTO_INET) nf_ct_netns_do_get(net, NFPROTO_IPV4) nf_ct_netns_do_get(net, NFPROTO_IPV6)static int nf_ct_netns_do_get(struct net *net, u8 nfproto) const struct nf_conntrack_l3proto *l3proto; l3proto = __nf_ct_l3proto_find(nfproto); //对于NFPROTO_IPV4,这里返回的是nf_conntrack_l3proto_ipv4 l3proto->net_ns_get(net); //调用net_ns_get//调用nf_ct_netns_get地方有很多,主要应该是通过nft_ct_get_init和nft_nat_init
下图展示了IPv4连接跟踪钩子函数在IPv4收发流程中的位置,其中绿色方块是netfilter的5个钩子挂载点,蓝色方块是连接跟踪模块注册的钩子函数:
连接跟踪netfilter挂载点
用来区分特定方向上的流的结构体是struct nf_conntrack_tuple:
struct nf_conntrack_tuple { struct nf_conntrack_man src; //tuple的可操作部分 /* 以下是tuple的固定部分 */ struct { union nf_inet_addr u3; union { /* Add other protocols here. */ __be16 all; struct { __be16 port; } tcp; struct { __be16 port; } udp; struct { u_int8_t type, code; } icmp; struct { __be16 port; } dccp; struct { __be16 port; } sctp; struct { __be16 key; } gre; } u; u_int8_t protonum; //protocol u_int8_t dir; } dst;};
连接跟踪条目
struct nf_conn表示连接跟踪条目,即保存到连接跟踪hash表里的节点。
struct nf_conn { struct nf_conntrack ct_general; spinlock_t lock; u16 cpu; struct nf_conntrack_zone zone; struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; //hashlist节点 unsigned long status; u32 timeout; possible_net_t ct_net; struct hlist_node nat_bysource; /* all members below initialized via memset */ struct { } __nfct_init_offset; struct nf_conn *master; u_int32_t mark; u_int32_t secmark; struct nf_ct_ext *ext; union nf_conntrack_proto proto;};
接下来看一下方法nf_conntrack_in():
unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) l3proto = __nf_ct_l3proto_find(pf); //对于pf=PF_INET,返回的是全局变量nf_conntrack_l3proto_ipv4 l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum); //.get_l4proto=ipv4_get_l4proto //对于IPv4 ->get_l4proto=ipv4_get_l4proto *dataoff = nhoff + (iph->ihl << 2); *protonum = iph->protocol; //protonum即四层协议 l4proto = __nf_ct_l4proto_find(pf, protonum); //以IPPROTO_TCP为例,返回的是全局变量nf_conntrack_l4proto_tcp4 resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l3proto, l4proto); struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; nf_ct_get_tuple() //填充tuple hash = hash_conntrack_raw(&tuple, net); //对tuple进行hash散列运算,调用的内核提供的jhash2() h = __nf_conntrack_find_get(net, zone, &tuple, hash); //在全局变量nf_conntrack_hash hash表下查找连接是否存在 if (!h) //如果连接不存在,则新建一个连接,保存到unconfirmed list h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,skb, dataoff, hash); ct = nf_ct_tuplehash_to_ctrack(h); //利用container_of得到真正的连接对象 ...//一系列ctinfo赋值逻辑,对于新建的连接ctinfo = IP_CT_NEW nf_ct_set(skb, ct, ctinfo); //将连接对象和连接状态值,保存到skb中 skb->_nfct = (unsigned long)ct | info; //借助指针低3位一定为0的逻辑(NFCT_INFOMASK为7),低3位存整数值 timeouts = nf_ct_timeout_lookup(net, ct, l4proto); l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); //以TCP为例,->packet==tcp_packet()
再看下ipv4_confirm()的代码:
ipv4_confirm nf_conntrack_confirmstatic inline int nf_conntrack_confirm(struct sk_buff *skb) ct = nf_ct_get(skb, &ctinfo); ... nf_ct_del_from_dying_or_unconfirmed_list(ct); //从unconfirmed或dying表中删除连接 ... __nf_conntrack_hash_insert(ct, hash, reply_hash); //插入到nf_conntrack_hash ...
iptables
iptables由内核部分和用户空间部分组成,核心是内核部分。
iptables的字面意思就是ip表项,每个表由struct xt_table表示。IPv4中,注册和注销表的接口是ipt_register_table()和ipt_unregister_table()。
struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; // struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* called when table is needed in the given netns */ int (*table_init)(struct net *net); const char name[XT_TABLE_MAXNAMELEN];};int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res) xt_register_table(net, table, &bootstrap, newinfo); list_add(&table->list, &net->xt.tables[table->af]); //注册到net->xt.tables上 nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)) //注册netfilter钩子
struct net对象包含IPv4和IPv6专用对象netns_ipv4和netns_ipv6,netns_ipv4和netns_ipv6又包含指向xt_table对象的指针。 例如netns_ipv4包含iptable_filter、iptable_mangle、iptable_raw、arptable_filter、nat_table。
我们以iptable_filter过滤表为例,来进一步看下iptables的工作原理。
//filter表的定义#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT))static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, //按照FILTER_VALID_HOOKS定义,在netfilter的3个挂载点挂载钩子 .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, .table_init = iptable_filter_table_init,};//初始化static int __init iptable_filter_init(void) //这一步主要是初始化netfilter钩子挂载对象,3个挂载点的回调函数都是iptable_filter_hook filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); register_pernet_subsys(&iptable_filter_net_ops) iptable_filter_net_init iptable_filter_table_init(net) //注册filter表 ipt_register_table(net, &packet_filter, repl, filter_ops, &net->ipv4.iptable_filter);
总结下,内核提供了一些表,表里的条目由用户空间程序设置。
看一个用户空间iptables命令例子:
iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1
这条规则的意思是,向filter表中添加一条规则,将目标端口为5001的UDP入站数据包转储到系统日志中。 使用iptables命令时,应使用修饰符-t来指定要使用的表,如果没指定,默认使用过滤表。
再看一个规则:
iptables -A INPUT -p tcp -m conntrack --ctstate ESTABLISHED -j LOG --log-level 1
这个规则是根据连接跟踪状态来过滤数据包,将连接状态为ESTABLISHED的数据包转储到系统日志中。
本文主要聚焦内核源码,关于用户空间的iptables命令,后面另起文章学习
NAT
NAT(Network Address Translation)网络地址转换,主要用于IP地址转换或端口转换。 NAT最常见的用途之一是,让局域网中一组使用私有IP地址的主机能够通过网关的公网IP访问Internet。
NAT初始化
与上节介绍的过滤表一样,NAT表也是一个xt_table对象。
static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, .table_init = iptable_nat_table_init,};
nat表的netfilter钩子函数:
static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_in, .pf = NFPROTO_IPV4, .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_out, .pf = NFPROTO_IPV4, .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_fn, .pf = NFPROTO_IPV4, .nat_hook = true, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, },};
nat表的初始化:
static int __init iptable_nat_init(void) iptable_nat_table_init(&init_net) struct ipt_replace *repl; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); //调用ipt_register_table注册nat表 ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, nf_nat_ipv4_ops, &net->ipv4.nat_table);
NAT钩子回调函数
NAT的核心实现位于
net/netfilter/nf_nat_core.c。NAT实现的基本元素为结构nf_nat_l4proto和nf_nat_l3proto。 (在3.7之前的内核中,使用的是结构nf_nat_protocol)。这两个结构都包含函数指针manip_pkt(),它会修改数据报头。 下面看下这两个结构。
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, .in_range = nf_nat_ipv4_in_range, .secure_port = nf_nat_ipv4_secure_port, .manip_pkt = nf_nat_ipv4_manip_pkt, //修改ip包 .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc,#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,#endif#ifdef CONFIG_XFRM .decode_session = nf_nat_ipv4_decode_session,#endif};//专门看下这个修改ip包的函数nf_nat_ipv4_manip_pktstatic bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) ... if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; //修改源IP } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; //修改目标IP }//TCPconst struct nf_nat_l4proto nf_nat_l4proto_tcp = { .l4proto = IPPROTO_TCP, .manip_pkt = tcp_manip_pkt, //修改IP包 .in_range = nf_nat_l4proto_in_range, .unique_tuple = tcp_unique_tuple,#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,#endif};//看下tcp_manip_pkt, udp的类似static bool tcp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l3proto *l3proto, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) ... if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; //修改端口号
继续看下NAT模块注册的netfilter钩子函数。IPv4 NAT模块在4个挂载点注册了钩子函数, 这4个函数最终都调用到nf_nat_ipv4_fn()。
unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; //没有连接跟踪就直接返回 switch (ctinfo) case IP_CT_NEW: if (!nf_nat_initialized(ct, maniptype)) //do_chain最终调用ipt_do_table,在nat表中查找指定条目,找到则调用target的回调函数 do_chain(priv, skb, state, ct); //执行报文修改操作 nf_nat_packet(ct, ctinfo, state->hook, skb); //这里的l3proto对应前面讲的nf_nat_l3proto_ipv4 l3proto = __nf_nat_l3proto_find(target.src.l3num); //如果是TCP的话,l4proto是nf_nat_l4proto_tcp l4proto = __nf_nat_l4proto_find(target.src.l3num,target.dst.protonum) l3proto->manip_pkt(skb, 0, l4proto, &target, mtype) //调用manip_pkt函数