网络相关

网络相关

Posted by nomadli on July 5, 2018

减少用户态与内核态的内存拷贝

  • read换mmap,读文件共享内核内存
  • pipe、unix socket、socketpair
风险:当mmap的文件被多进程/线程使用时,后续访问可能因其他进程/线程截断(truncate)或修改该文件而触发SIGBUS导致crash
处理:1.捕捉SIGBUS信号并直接返回,后续操作也被终止
    2.fcntl(fd, F_SETSIG, RT_SIGNAL_LEASE)
      fcntl(fd, F_SETLEASE, l_type) 设置文件中断信号,捕捉信号后终止后续操作
  • sendfile 替换read/mmap+write的组合,直接内核操作
风险:1.mmap一样,最好设置RT_SIGNAL_LEASE信号,不要抛出SIGBUS信号
    2.非POSIX标准,移植问题
  • splice 数据直接在内核中的两个文件描述符之间移动(零拷贝),不经过用户态缓冲区
```C
#define _GNU_SOURCE
#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
// fd_in read文件句柄, 如果是不可以随机读文件(如管道、socket)off_in=NULL从当前偏移位置读入
// fd_out write文件句柄,如果是不可以随机写文件(如管道、socket)off_out=NULL从当前偏移写入
// len 数据长度
// flags
// SPLICE_F_NONBLOCK splice调用不阻塞,但fd_in或fd_out本身没有设置O_NONBLOCK时仍可能阻塞
// SPLICE_F_MORE 提示后续还有splice系统调用会传输更多数据
// SPLICE_F_MOVE 如果可以则整页内存移动
// SPLICE_F_GIFT 对splice无效,仅用于vmsplice
// fd_in和fd_out中必须至少有一个是管道文件描述符。

//使用splice实现的回显服务器 #include #include #include #include <sys/socket.h> #include <netinet/in.h> #include <arpa/inet.h> #include #include #include #include int main(int argc, char **argv) { if (argc <= 2) { printf("usage: %s ip port\n", basename(argv[0])); return 1; }

struct sockaddr_in address;
bzero(&address, sizeof(address));
address.sin_family = AF_INET;
address.sin_port = htons(atoi(argv[2]));
inet_pton(AF_INET, argv[1], &address.sin_addr);
int ssock = socket(PF_INET, SOCK_STREAM, 0);
int reuse = 1;
setsockopt(ssock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
int ret = bind(ssock, (struct sockaddr*)&address, sizeof(address));
ret = listen(ssock, 5);
struct sockaddr_in client;
socklen_t client_addrlength = sizeof(client);
int csock = accept(ssock, (struct sockaddr*)&client, &client_addrlength);
if (csock < 0) {
    printf("errno is: %s\n", strerror(errno));
} else {
    int p2[2];  
    ret = pipe(p2);
    ret = splice(csock, NULL, p2[1], NULL, 32768, SPLICE_F_MORE|SPLICE_F_MOVE); 
    ret = splice(p2[0], NULL, csock, NULL, 32768, SPLICE_F_MORE| SPLICE_F_MOVE);
    close(p2);
}
close(csock);
return 0; } ``` - tee() 在两个管道间0拷贝传输数据,并且不消费数据,即调用后两个管道都能读到相同数据

nginx

  • ip 别名解决反向代理端口耗尽问题
    ifconfig eth0:0 192.168.1.223 broadcast 192.168.1.255 netmask 255.255.255.0 up
    route add -host 192.168.1.223 dev eth0:0
    nginx中使用 split_clients proxy_bind
    

ECMP 等价多路径路由

Linux Virtual Server(director服务)

glb-director

windows

  • select
  • WSAAsyncSelect 64个
  • WSAEventSelect
  • Overlapped I/O 通知
  • Overlapped I/O 完成
  • IOCP ```C #include #include

#pragma comment(lib,”Ws2_32.lib”)

typedef struct { SOCKET fd; } NMLISockeCTX;

WORD version = MAKEWORD(2, 2); WSADATA data; int err = WSAStartup(version, &data); if (err != 0) { printf(“WSAStartup failed with error: %d\n”, err); return; }

HANDLE hIOCP = CreateIoCompletionPort( INVALID_HANDLE_VALUE, //HANDLE FileHandle 要设置的文件句柄 0, //HANDLE ExistingCompletionPort 当前的完成端口句柄 0, //ULONG_PTR CompletionKey 用户上下文数据 0 //DWORD NumberOfConcurrentThreads 同时执行的线程数,0==cpu数量 ); if (hIOCP == NULL) { printf(“%d ==\n”, WSAGetLastError()); }

SOCKET srvfd = WSASocket(AF_INET, SOCK_STREAM, 0, NULL, 0, WSA_FLAG_OVERLAPPED); CreateIoCompletionPort((HANDLE)srvfd, iocp, (ULONG_PTR)(&srvctx), 0); sockaddr_in addr{0}; addr.sin_family = AF_INET; addr.sin_port = htons(1234); addr.sin_addr.S_un.S_addr = INADDR_ANY; ::bind(srvfd, (sockaddr*)&addr, sizeof(addr)); ::listen(srvfd, SOMAXCONN);

//WSAAcceptEx 函数没有头文件,需要获取函数指针 LPFN_ACCEPTEX accept; GUID guid = WSAID_ACCEPTEX; DWORD dwBytes = 0; WSAIoctl(srvctx.fd, SIO_GET_EXTENSION_FUNCTION_POINTER, &guid, sizeof(guid), &accept, sizeof(accept), &dwBytes, NULL, NULL); if (accept == NULL) { printf(“%d ==\n”, WSAGetLastError()); }

// 新建一个socket句柄, 用来接收accept的新socket NMLISockeCTX newfd = { .fd = WSASocket(AF_INET, SOCK_STREAM, IPPROTO_TCP, NULL, 0, WSA_FLAG_OVERLAPPED) }; iocp->sokAccept = newfd.fd;

DWORD len = 1400; //设置数据缓存,accept后不会触发回调,收到消息并读取到缓存后触发回调 perIOCtx->len = len; perIOCtx->len += sizeof(sockaddr_in) + 16; //远程地址缓存,16是系统要求的附加长度 perIOCtx->len += sizeof(sockaddr_in) + 16; //本地地址缓存,16是系统要求的附加长度 iocp->buf = malloc(perIOCtx->len); //buf = NULL, len = 0, accept后立刻触发回调 ZeroMemory(&(iocp->ol), sizeof(OVERLAPPED)); accept(srvfd, newfd.fd, iocp->buf, len, sizeof(sockaddr_in) + 16, sizeof(sockaddr_in) + 16, &len, &(ocp->ol));

DWORD ret = 0; struct ctx pc = NULL; OVERLAPPED *ov = NULL; for (; ;) { if (GetQueuedCompletionStatus( iocp, //HANDLE CompletionPort 监听的完成端口 &ret, //LPDWORD lpNumberOfBytes 返回数据长度 (PULONG_PTR)&pc, //PULONG_PTR lpCompletionKey 上下文数据 &ov, //LPOVERLAPPED lpOverlapped == iocp->ol 完成的是什么操作 INFINITE //DWORD dwMilliseconds INFINITE永远不超时 )){

} }

SOCKET cfd = WSASocket(AF_INET, SOCK_STREAM, IPPROTO_TCP, NULL, 0, WSA_FLAG_OVERLAPPED);

## bond 模式 多网卡绑定为一张网卡
- mode=0 (balance-rr) 轮循策略(支持容错) 需要交换机配置聚合口, 防止包乱序到达
- mode=1 (active-backup) 主备策略,只有一张网卡是活动的, 交换机不能配置捆绑, 否则会有包发到备份网卡上
- mode=2 (balance-xor) 根据配置的hash策略传输, 默认策略src_mac ^ des_mac, 交换机需配置port channel, xmit_hash_policy
- mode=3 (broadcast) 广播策略, 所有网卡都发送相同的包, 交换机需要聚合强制不协商
- mode=4 (802.3ad) IEEE 动态链接聚合 使用802.3ad协议与交换机聚合LACP方式配合(xmit_hash_policy),所有设备在相同的速率和双工模式
- mode=5 (balance-tlb) 传输负载均衡, 根据网卡出口负载选择网卡,不需要交换机支持,每个网卡使用不同的mac
- mode=6 (balance-alb) 适应性负载均衡, 根据网卡进出口负载选择网卡,不需要交换机支持,使用ARP协商入口流量,每个网卡使用不同的mac
```bash
#bond
modinfo bonding #查看内核是否支持bond模式

#
systemctl stop NetworkManager.service
systemctl disable NetworkManager.service

cat /etc/sysconfig/network-scripts/ifcfg-xxx #多张网卡信息
DEVICE=xxx
HWADDR=EC:F4:BB:DC:4C:0C
TYPE=Ethernet
UUID=669f0694-9c52-4792-bd67-22c9d2c17acb
ONBOOT=yes
NM_CONTROLLED=no
BOOTPROTO=none
MASTER=bond0
SLAVE=yes

cat /etc/sysconfig/network-scripts/ifcfg-bond0 #bond信息
DEVICE=bond0
NAME='System bond0'
TYPE=Ethernet
BOOTPROTO=static
BONDING_OPTS='mode=802.3ad miimon=100 lacp_rate=fast xmit_hash_policy=layer2+3'
#上面是配置vlan时 bond0的,下面是配置vlan的+上面 如ifcfg-bond0.950 VLAN=yes VLAN_ID=950 PHYSDEV=bond0
#交换机做trunk防止每个vlan创建一个链路
NM_CONTROLLED=no
USERCTL=no
ONBOOT=yes
IPADDR=192.168.1.100
NETMASK=255.255.255.0
GATEWAY=192.168.1.1
DNS1=8.8.8.8

IPV6INIT=no

systemctl restart network

#开机自动加载
#/etc/modprobe.d/bonding.conf
alias bond0 bonding
options bond0 miimon=100 mode=4
install bond1 /sbin/modprobe bonding -o bond1 miimon=100 mode=0
install bond2 /sbin/modprobe bonding -o bond2 miimon=100 mode=1
install bond3 /sbin/modprobe bonding -o bond3 miimon=100 mode=0

#/etc/rc.local
nmcli con up eth1
nmcli con up bond0
ifenslave bond0 eth0 eth1
ifenslave bond1 eth2 eth3
ifenslave bond2 eth4 eth5
ifenslave bond3 eth6 eth7

cat /proc/net/bonding/bond0 #查看配置信息
```

team

  • broadcast 广播模式
  • activebackup 主备模式
  • roundrobin 轮循模式
  • loadbalance 负载均衡模式
  • lacp 802.3ad 链路聚合协议
```bash
nmcli connection show #查看所有网络
nmcli connection add type team con-name team0 ifname team0 config '{"runner":{"name":"loadbalance"}}' #添加team负载均衡模式
nmcli connection modify team0 ipv4.addresses "192.168.16.2/24"
nmcli connection modify team0 ipv4.gateway "192.168.16.1"
nmcli connection modify team0 ipv4.dns "192.168.16.1"
nmcli connection add type team-slave con-name team0-port1 ifname eth0 master team0
nmcli connection add type team-slave con-name team0-port2 ifname eth1 master team0
nmcli connection up team0
nmcli connection up team0-port1
nmcli connection up team0-port2
teamdctl team0 state
#vi /etc/sysconfig/network-scripts/ifcfg-team0 修改配置
#nmcli connection reload 加载配置
#systemctl restart network

nmcli con show
nmcli con delete team0
nmcli con delete team0-port1
nmcli con delete team0-port2
systemctl restart network
```


## centos
```bash
ip route  | grep default    #获取网关
ip n show | grep 10.0.2.2   #获取网关mac
ip link #查看网络设备
ethtool -i eth0 #查看eth0设备信息、驱动 bus-info(PCI序号)
ethtool -l eth0 #查看eth0的队列
ls /sys/devices/pci0000:00/0000:00:03.0/msi_irqs #根据PCI序号查看中断号(IRQ)列表
cat /proc/interrupts #查看中断统计
systemctl stop irqbalance   #停用中断系统自动分配
systemctl disable irqbalance
echo kernel.numa_balancing=0 >> /etc/sysctl.conf #禁用内核NUMA平衡
sysctl -p
vi /etc/default/grub GRUB_CMDLINE_LINUX_DEFAULT="xxx isolcpus=2-7" #将2-7cpu从内核调度中去除, 其它用户进程将不会调度到2-7cpu上, 专职于某个特定应用并且2-7上的线程不会在2-7切换。但内核线程不受该参数限制.
grub2-mkconfig -o /etc/grub2.cfg

#每次重启系统需要重新查看中断号, 重新设置亲和的cpu
cd /proc/irq/19  #19是示例IRQ号
echo 40 > smp_affinity  #将bitmap写入IRQ19的smp_affinity(该文件按十六进制解析), 中断只交给cpu6: 1 << 6 = 0x40
cat /sys/devices/system/cpu/cpu6/topology/physical_package_id #查看cpu6所在的物理CPU(socket)编号, 通常对应NUMA节点

numactl -a -N netdev:eth0 grep allowed /proc/self/status #查看eth0与那个NUMA+cpu亲和
numactl -a -N netdev:eth0 -C 6 grep allowed /proc/self/status #测试将eth0绑定到cpu6上
#systemd启动时可以用CPUAffinity=6 也可以用 ExecStart=/bin/numactl -aN netdev:em1 -C 6 /x/exe