Skip to content

docker与iptables和网桥

Posted on:November 26, 2019 at 11:41 PM

如何创建网桥

可以通过bridge-utils包提供的brctl命令来创建一个网桥

$sudo brctl addbr br0

然后通过brctl show可以看到列出的网桥

$brctl  show
bridge name	bridge id		STP enabled	interfaces
br0		8000.000000000000	no		

通过strace查看系统调用

$sudo strace  brctl addbr br1

输出

ubuntu@VM-0-3-ubuntu:~/libnlbuild/bin$ sudo strace  brctl addbr br1
...
socket(AF_UNIX, SOCK_STREAM, 0)         = 3
ioctl(3, SIOCBRADDBR, "br1")            = 0
+++ exited with 0 +++

看到调用

ioctl(3, SIOCBRADDBR, "br1") 

3 指的是socket()返回的文件描述符。0、1、2分别是标准输入、标准输出、标准错误的文件描述符,所以下一个打开的文件描述符就是3

我写的一个创建网桥的小例子

//  bradd.c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <linux/sockios.h>
/*
 * Create a bridge named "hello" by issuing the SIOCBRADDBR ioctl on a
 * local socket -- the same call `brctl addbr` makes according to strace.
 * Needs to be run with sudo.
 *
 * Returns 0 on success, 1 if the socket or the ioctl fails.
 */
int main(void){
        int br_socket_fd;

        /*
         * The assignment must be parenthesized: `<` binds tighter than `=`,
         * so without the parentheses br_socket_fd would receive the result
         * of the comparison (0 or 1) instead of the descriptor, and the
         * ioctl below would be issued on the wrong file descriptor.
         */
        if ((br_socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0)) < 0) {
                perror("Error: ");
                return 1;       /* no usable socket, nothing else to do */
        }

        /* SIOCBRADDBR is provided by <linux/sockios.h> */
        if (ioctl(br_socket_fd, SIOCBRADDBR, "hello") < 0) {
                perror("ioctl error");
                close(br_socket_fd);
                return 1;
        }

        close(br_socket_fd);
        return 0;
}
$gcc bradd.c -o bradd
## 需要使用sudo添加网桥
$sudo ./bradd     

然后用brctl show 输出,创建了一个叫hello的网桥:

$ brctl show
bridge name	bridge id		STP enabled	interfaces		
docker0		8000.024273119fd1	no		vethe6cf6a0
hello		8000.000000000000	no		

然后我们发现docker0和hello两个网桥的区别在于interfaces一列:docker0挂了一个veth设备,而hello没有。我们如何给网桥添加veth呢?

/*
 * Abridged excerpt ("..." marks code omitted by the article).
 *
 * Attach the interface named `dev` to the bridge named `bridge` by issuing
 * the SIOCBRADDIF ioctl on br_socket_fd.
 */
int br_add_interface(const char *bridge, const char *dev)
{
	struct ifreq ifr;
	...
	/* Translate the interface name into its numeric index. */
	int ifindex = if_nametoindex(dev);
	...
	/*
	 * The request names the bridge in ifr_name and carries the index of
	 * the interface to add in ifr_ifindex.
	 * NOTE(review): strncpy() leaves ifr_name unterminated when `bridge`
	 * is IFNAMSIZ characters or longer.
	 */
	strncpy(ifr.ifr_name, bridge, IFNAMSIZ);
	ifr.ifr_ifindex = ifindex;
	err = ioctl(br_socket_fd, SIOCBRADDIF, &ifr);
	...
}

最后调用linux 的net/bridge/br_if.c:

// dev 是我们要添加的设备 // br 是我们的网桥

/*
 * Abridged excerpt of br_add_if() ("..." marks code omitted by the article,
 * including the err1..err7 cleanup labels targeted by the gotos below).
 *
 * Attaches net_device `dev` as a port of bridge `br`. Returns 0 on success;
 * on the visible failure paths it either returns PTR_ERR(p) or jumps to one
 * of the elided error labels with `err` set.
 *
 * called with RTNL
 */
int br_add_if(struct net_bridge *br, struct net_device *dev,
	      struct netlink_ext_ack *extack)
{
	struct net_bridge_port *p;
	int err = 0;
	unsigned br_hr, dev_hr;
	bool changed_addr;

	...
	/* Create the port object for dev; failure is reported as an ERR_PTR. */
	p = new_nbp(br, dev);
	if (IS_ERR(p))
		return PTR_ERR(p);

	call_netdevice_notifiers(NETDEV_JOIN, dev);

	err = dev_set_allmulti(dev, 1);
	if (err) {
		kfree(p);	/* kobject not yet init'd, manually free */
		goto err1;
	}

	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
				   SYSFS_BRIDGE_PORT_ATTR);
	if (err)
		goto err2;

	err = br_sysfs_addif(p);
	if (err)
		goto err2;

	err = br_netpoll_enable(p);
	if (err)
		goto err3;

	/* Register br_handle_frame as dev's rx handler, with the port as its data. */
	err = netdev_rx_handler_register(dev, br_handle_frame, p);
	if (err)
		goto err4;

	/* Flag the device as a bridge port. */
	dev->priv_flags |= IFF_BRIDGE_PORT;

	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
	if (err)
		goto err5;

	err = nbp_switchdev_mark_set(p);
	if (err)
		goto err6;

	dev_disable_lro(dev);

	/* Link the new port into the bridge's port list. */
	list_add_rcu(&p->list, &br->port_list);

	nbp_update_port_count(br);

	netdev_update_features(br->dev);

	/*
	 * Compare the bridge's needed headroom with the device's: if the
	 * device's value is larger it is passed to update_headroom() for the
	 * bridge, otherwise the bridge's value is set on the device.
	 */
	br_hr = br->dev->needed_headroom;
	dev_hr = netdev_get_fwd_headroom(dev);
	if (br_hr < dev_hr)
		update_headroom(br, dev_hr);
	else
		netdev_set_rx_headroom(dev, br_hr);

	/* A failed FDB insert of the device's own address is only logged. */
	if (br_fdb_insert(br, p, dev->dev_addr, 0))
		netdev_err(dev, "failed insert local address bridge forwarding table\n");

	if (br->dev->addr_assign_type != NET_ADDR_SET) {
		/* Ask for permission to use this MAC address now, even if we
		 * don't end up choosing it below.
		 */
		err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
		if (err)
			goto err7;
	}

	err = nbp_vlan_init(p, extack);
	if (err) {
		netdev_err(dev, "failed to initialize vlan filtering on this port\n");
		goto err7;
	}

	/* Bridge id recalculation and STP port enabling run under br->lock. */
	spin_lock_bh(&br->lock);
	changed_addr = br_stp_recalculate_bridge_id(br);

	if (netif_running(dev) && netif_oper_up(dev) &&
	    (br->dev->flags & IFF_UP))
		br_stp_enable_port(p);
	spin_unlock_bh(&br->lock);

	br_ifinfo_notify(RTM_NEWLINK, NULL, p);

	/* Only raise NETDEV_CHANGEADDR when the recalculation reported a change. */
	if (changed_addr)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

	br_mtu_auto_adjust(br);
	br_set_gso_limits(br);

	kobject_uevent(&p->kobj, KOBJ_ADD);

	return 0;
	...
}

添加虚拟设备:

# strace  ip link add vethaaa type veth peer name vethbbb
execve("/sbin/ip", ["ip", "link", "add", "vethaaa", "type", "veth", "peer", "name", "vethbbb"], 0x7ffed8af30f0 /* 23 vars */) 
...
socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_ROUTE) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
setsockopt(3, SOL_NETLINK, NETLINK_EXT_ACK, [1], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=26226, nl_groups=00000000}, [12]) = 0
sendto(3, {{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=52, type=NLMSG_ERROR, flags=0, seq=0, pid=26226}, {error=-ENODEV, msg={{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 52
access("/proc/net", R_OK)               = 0
access("/proc/net/unix", R_OK)          = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vethaaa"}) = -1 ENODEV (No such device)
close(4)                                = 0
brk(NULL)                               = 0x560e12455000
brk(0x560e12476000)                     = 0x560e12476000
openat(AT_FDCWD, "/usr/lib/ip/link_veth.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=92, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576836139, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=12, nla_type=IFLA_IFNAME}, "vethaaa"}, {{nla_len=48, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=36, nla_type=IFLA_INFO_DATA}, "\x20\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x03\x00\x76\x65\x74\x68\x62\x62\x62\x00"}]}]}, iov_len=92}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 92
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 36
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=36, type=NLMSG_ERROR, flags=NLM_F_CAPPED, seq=1576836139, pid=26226}, {error=0, msg={len=92, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576836139, pid=0}}}, iov_len=36}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_ROUTE) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
setsockopt(3, SOL_NETLINK, NETLINK_EXT_ACK, [1], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=18263, nl_groups=00000000}, [12]) = 0
sendto(3, {{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=52, type=NLMSG_ERROR, flags=0, seq=0, pid=18263}, {error=-EPERM, msg={{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 52
access("/proc/net", R_OK)               = 0
access("/proc/net/unix", R_OK)          = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="p1"}) = -1 ENODEV (No such device)
close(4)                                = 0
brk(NULL)                               = 0x5595d01bb000
brk(0x5595d01dc000)                     = 0x5595d01dc000
openat(AT_FDCWD, "/usr/lib/ip/link_veth.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=84, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576748752, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=7, nla_type=IFLA_IFNAME}, "p1"}, {{nla_len=44, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=32, nla_type=IFLA_INFO_DATA}, "\x1c\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x03\x00\x70\x32\x00\x00"}]}]}, iov_len=84}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 84
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 104
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=104, type=NLMSG_ERROR, flags=0, seq=1576748752, pid=18263}, {error=-EPERM, msg={{len=84, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576748752, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=7, nla_type=IFLA_IFNAME}, "p1"}, {{nla_len=44, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=32, nla_type=IFLA_INFO_DATA}, "\x1c\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x03\x00\x70\x32\x00\x00"}]}]}}}, iov_len=104}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 104
write(2, "RTNETLINK answers: Operation not"..., 43RTNETLINK answers: Operation not permitted
) = 43
exit_group(2)                           = ?
+++ exited with 2 +++

linux 相关的netlink veth内容:

// drivers\net\veth.c
/*
 * rtnl_link_ops table of the veth driver (quoted verbatim from the kernel).
 * It names the driver's callbacks for setup, validation, link creation
 * (.newlink) and link deletion (.dellink), plus the attribute policy.
 * NOTE(review): DRV_NAME is defined outside this excerpt -- presumably
 * "veth", matching the IFLA_INFO_KIND value in the netlink request; confirm.
 */
static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};
// net/netlink/af_netlink.c
/*
 * Socket operations table for the PF_NETLINK family (quoted verbatim from
 * the kernel). bind/connect/getname/ioctl/setsockopt/getsockopt/sendmsg/
 * recvmsg and release point at netlink_* handlers; socketpair, accept,
 * listen, shutdown, mmap and sendpage point at the generic sock_no_*
 * helpers (presumably "operation not supported" stubs -- their bodies are
 * not part of this excerpt).
 */
static const struct proto_ops netlink_ops = {
	.family =	PF_NETLINK,
	.owner =	THIS_MODULE,
	.release =	netlink_release,
	.bind =		netlink_bind,
	.connect =	netlink_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	netlink_getname,
	.poll =		datagram_poll,
	.ioctl =	netlink_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	netlink_setsockopt,
	.getsockopt =	netlink_getsockopt,
	.sendmsg =	netlink_sendmsg,	/* handler behind the sendmsg(3, ...) calls in the strace output */
	.recvmsg =	netlink_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

添加veth 设备

首先是添加socket

(gdb) bt
#0  socket () at ../sysdeps/unix/syscall-template.S:78
#1  0x00005555555b60c7 in rtnl_open_byproto (rth=0x5555557d8020 <rth>, subscriptions=0, protocol=<optimized out>) at libnetlink.c:194
#2  0x000055555555f956 in main (argc=9, argv=0x7fffffffe548) at ip.c:308
Breakpoint 6, __libc_sendmsg (fd=3, msg=msg@entry=0x7fffffffdd70, flags=flags@entry=0) at ../sysdeps/unix/sysv/linux/sendmsg.c:28
28	../sysdeps/unix/sysv/linux/sendmsg.c: No such file or directory.
(gdb) bt
#0  __libc_sendmsg (fd=3, msg=msg@entry=0x7fffffffdd70, flags=flags@entry=0) at ../sysdeps/unix/sysv/linux/sendmsg.c:28
#1  0x00005555555b5c8f in __rtnl_talk_iov (rtnl=0x5555557d8020 <rth>, iov=iov@entry=0x7fffffffddf0, iovlen=iovlen@entry=1, answer=answer@entry=0x0, show_rtnl_err=show_rtnl_err@entry=true, 
    errfn=0x0) at libnetlink.c:887
#2  0x00005555555b7225 in __rtnl_talk (errfn=0x0, show_rtnl_err=true, answer=<optimized out>, n=0x7fffffffde40, rtnl=<optimized out>) at libnetlink.c:1000
#3  rtnl_talk (rtnl=<optimized out>, n=n@entry=0x7fffffffde40, answer=answer@entry=0x0) at libnetlink.c:1006
#4  0x000055555557bc6e in iplink_modify (cmd=cmd@entry=16, flags=flags@entry=1536, argc=3, argc@entry=6, argv=<optimized out>, argv@entry=0x7fffffffe560) at iplink.c:1084
#5  0x000055555557c0c6 in do_iplink (argc=7, argv=0x7fffffffe558) at iplink.c:1641
#6  0x000055555555ff0c in do_cmd (argv0=0x7fffffffe7d8 "link", argc=8, argv=0x7fffffffe550) at ip.c:113
#7  0x000055555555f9a0 in main (argc=9, argv=0x7fffffffe548) at ip.c:317

比如命令ip link add veth_0 type veth peer name veth_0_peer 初始化的时候req.n 的长度是32

 p req.n.nlmsg_len 
$1 = 32

经过ret = iplink_parse(argc, argv, &req, &type); 后变成44。用gdb查看消息从偏移32开始的内容:

(gdb) p ((char *)n)[32]@64
$50 = "\v\000\003\000veth_0\000\000\064\000\022\000\b\000\001\000veth(\000\002\000$\000\001", '\000' <repeats 17 times>, "\020\000\003\000veth_0_peer"

iptables是什么?

# type iptables
iptables is hashed (/sbin/iptables)

iptables命令为什么可以处理那些问题呢?

iptables原理

iptables是用户态工具,它通过socket与内核通信(传统的iptables-legacy使用getsockopt/setsockopt,iptables-nft使用netlink),从而修改netfilter子系统各个hook上的规则

源码 相关阅读

相关阅读