减少用户态与内核态的内存拷贝
- read换mmap,读文件共享内核内存
- pipe、unix socket、socketpair
风险:当mmap多进程/线程使用时,后续操作可能因其他进程/线程修改mmap导致crash
处理:1.捕捉SIGBUS信号并直接返回,后续操作也被终止
2.fcntl(fd, F_SETSIG, RT_SIGNAL_LEASE)
fcntl(fd, F_SETLEASE, l_type) 设置文件中断信号,捕捉信号后终止后续操作
- sendfile 替换read/mmap+write的组合,直接内核操作
风险:1.与mmap一样,最好设置RT_SIGNAL_LEASE信号,不要抛出SIGBUS信号
2.非POSIX标准,存在移植问题
- splice 在两个文件描述符之间直接在内核中移动数据,不经过用户态缓冲区(零拷贝)
```C
#define _GNU_SOURCE
#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
// fd_in   读端文件描述符;如果它不支持随机读(如管道、socket),off_in 必须为 NULL,从当前偏移位置读入
// fd_out  写端文件描述符;如果它不支持随机写,off_out 必须为 NULL,从当前偏移位置写入
// len     要移动的数据长度
// flags
//   SPLICE_F_NONBLOCK  splice 调用不阻塞;但如果 fd 本身没有设置 O_NONBLOCK,调用仍可能阻塞
//   SPLICE_F_MORE      提示内核后续的 splice 系统调用还会有更多数据
//   SPLICE_F_MOVE      如果可以,按整页移动内存而不是拷贝
//   SPLICE_F_GIFT      splice 中未使用,仅对 vmsplice 有意义
// fd_in 和 fd_out 中必须至少有一个是管道文件描述符。
//使用splice实现的回显服务器
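#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>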
/*
 * One-shot echo server built on splice(): the bytes received from the client
 * are moved socket -> pipe -> socket inside the kernel, so they are never
 * copied into a user-space buffer.
 * argv[1] is the IPv4 address to listen on, argv[2] the TCP port.
 */
struct sockaddr_in address;
bzero(&address, sizeof(address));
address.sin_family = AF_INET;
address.sin_port = htons(atoi(argv[2]));
inet_pton(AF_INET, argv[1], &address.sin_addr);
int ssock = socket(PF_INET, SOCK_STREAM, 0);
int reuse = 1;
setsockopt(ssock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
int ret = bind(ssock, (struct sockaddr*)&address, sizeof(address));
if (ret == 0) {
    ret = listen(ssock, 5);
}
if (ret < 0) {
    /* Without a bound, listening socket there is nothing to accept. */
    printf("errno is: %s\n", strerror(errno));
    close(ssock);
    return 1;
}
struct sockaddr_in client;
socklen_t client_addrlength = sizeof(client);
int csock = accept(ssock, (struct sockaddr*)&client, &client_addrlength);
if (csock < 0) {
    printf("errno is: %s\n", strerror(errno));
} else {
    int p2[2];
    ret = pipe(p2);
    if (ret < 0) {
        printf("errno is: %s\n", strerror(errno));
    } else {
        /* Step 1: client socket -> write end of the pipe. */
        ssize_t moved = splice(csock, NULL, p2[1], NULL, 32768, SPLICE_F_MORE | SPLICE_F_MOVE);
        if (moved > 0) {
            /* Step 2: read end of the pipe -> back to the same client socket. */
            moved = splice(p2[0], NULL, csock, NULL, (size_t)moved, SPLICE_F_MORE | SPLICE_F_MOVE);
        }
        if (moved < 0) {
            printf("errno is: %s\n", strerror(errno));
        }
        /* close() takes a single descriptor: release both pipe ends explicitly. */
        close(p2[0]);
        close(p2[1]);
    }
    /* Only close the client socket when accept() actually returned one. */
    close(csock);
}
close(ssock);
return 0; }
```
- tee() 在两个管道间0拷贝传输数据,并且不消费数据,即调用后两个管道都能读到相同数据
nginx
- ip 别名解决反向代理端口耗尽问题
  ifconfig eth0:0 192.168.1.223 broadcast 192.168.1.255 netmask 255.255.255.0 up
  route add -host 192.168.1.223 dev eth0:0
  nginx 中使用 split_clients + proxy_bind 将出站连接分散到各个别名 IP
ECMP 等价多路径路由
Linux Virtual Server(director服务)
glb-director
Windows
- select
- WSAAsyncSelect 64个
- WSAEventSelect
- Overlapped I/O 通知
- Overlapped I/O 完成
- IOCP
```C
#include <winsock2.h>
#include <mswsock.h>
#pragma comment(lib, "Ws2_32.lib")
typedef struct { SOCKET fd; } NMLISockeCTX;
WORD version = MAKEWORD(2, 2); WSADATA data; int err = WSAStartup(version, &data); if (err != 0) { printf(“WSAStartup failed with error: %d\n”, err); return; }
HANDLE hIOCP = CreateIoCompletionPort( INVALID_HANDLE_VALUE, //HANDLE FileHandle 要设置的文件句柄 0, //HANDLE ExistingCompletionPort 当前的完成端口句柄 0, //ULONG_PTR CompletionKey 用户上下文数据 0 //DWORD NumberOfConcurrentThreads 同时执行的线程数,0==cpu数量 ); if (hIOCP == NULL) { printf(“%d ==\n”, WSAGetLastError()); }
SOCKET srvfd = WSASocket(AF_INET, SOCK_STREAM, 0, NULL, 0, WSA_FLAG_OVERLAPPED); CreateIoCompletionPort((HANDLE)srvfd, iocp, (ULONG_PTR)(&srvctx), 0); sockaddr_in addr{0}; addr.sin_family = AF_INET; addr.sin_port = htons(1234); addr.sin_addr.S_un.S_addr = INADDR_ANY; ::bind(srvfd, (sockaddr*)&addr, sizeof(addr)); ::listen(srvfd, SOMAXCONN);
//WSAAcceptEx 函数没有头文件,需要获取函数指针 LPFN_ACCEPTEX accept; GUID guid = WSAID_ACCEPTEX; DWORD dwBytes = 0; WSAIoctl(srvctx.fd, SIO_GET_EXTENSION_FUNCTION_POINTER, &guid, sizeof(guid), &accept, sizeof(accept), &dwBytes, NULL, NULL); if (accept == NULL) { printf(“%d ==\n”, WSAGetLastError()); }
// 新建一个socket句柄, 用来接收accept的新socket NMLISockeCTX newfd = { .fd = WSASocket(AF_INET, SOCK_STREAM, IPPROTO_TCP, NULL, 0, WSA_FLAG_OVERLAPPED) }; iocp->sokAccept = newfd.fd;
DWORD len = 1400; //设置数据缓存,accept后不会触发回调,收到消息并读取到缓存后触发回调 perIOCtx->len = len; perIOCtx->len += sizeof(sockaddr_in) + 16; //远程地址缓存,16是系统要求的附加长度 perIOCtx->len += sizeof(sockaddr_in) + 16; //本地地址缓存,16是系统要求的附加长度 iocp->buf = malloc(perIOCtx->len); //buf = NULL, len = 0, accept后立刻触发回调 ZeroMemory(&(iocp->ol), sizeof(OVERLAPPED)); accept(srvfd, newfd.fd, iocp->buf, len, sizeof(sockaddr_in) + 16, sizeof(sockaddr_in) + 16, &len, &(ocp->ol));
DWORD ret = 0; struct ctx pc = NULL; OVERLAPPED *ov = NULL; for (; ;) { if (GetQueuedCompletionStatus( iocp, //HANDLE CompletionPort 监听的完成端口 &ret, //LPDWORD lpNumberOfBytes 返回数据长度 (PULONG_PTR)&pc, //PULONG_PTR lpCompletionKey 上下文数据 &ov, //LPOVERLAPPED lpOverlapped == iocp->ol 完成的是什么操作 INFINITE //DWORD dwMilliseconds INFINITE永远不超时 )){
} }
SOCKET cfd = WSASocket(AF_INET, SOCK_STREAM, IPPROTO_TCP, NULL, 0, WSA_FLAG_OVERLAPPED);
```
## bond 模式 多网卡绑定为一张网卡
- mode=0 (balance-rr) 轮循策略(支持容错) 需要交换机配置聚合口, 防止包乱序到达
- mode=1 (active-backup) 主备策略,只有一张网卡是活动的, 交换机不能配置捆绑, 否则会有包发到备份网卡上
- mode=2 (balance-xor) 根据配置的hash策略传输, 默认策略src_mac ^ des_mac, 交换机需配置port channel, xmit_hash_policy
- mode=3 (broadcast) 广播策略, 所有网卡都发送相同的包, 交换机需要聚合强制不协商
- mode=4 (802.3ad) IEEE 动态链接聚合 使用802.3ad协议与交换机聚合LACP方式配合(xmit_hash_policy),所有设备在相同的速率和双工模式
- mode=5 (balance-tlb) 传输负载均衡, 根据网卡出口负载选择网卡,不需要交换机支持,每个网卡使用不同的mac
- mode=6 (balance-alb) 适应性负载均衡, 根据网卡进出口负载选择网卡,不需要交换机支持,使用ARP协商入口流量,每个网卡使用不同的mac
```bash
#bond
modinfo bonding #查看内核是否支持bond模式
#
systemctl stop NetworkManager.service
systemctl disable NetworkManager.service
cat /etc/sysconfig/network-scripts/ifcfg-xxx #多张网卡信息
DEVICE=xxx
HWADDR=EC:F4:BB:DC:4C:0C
TYPE=Ethernet
UUID=669f0694-9c52-4792-bd67-22c9d2c17acb
ONBOOT=yes
NM_CONTROLLED=no
BOOTPROTO=none
MASTER=bond0
SLAVE=yes
cat /etc/sysconfig/network-scripts/ifcfg-bond0 #bond信息
DEVICE=bond0
NAME='System bond0'
TYPE=Ethernet
BOOTPROTO=static
BONDING_OPTS='mode=802.3ad miimon=100 lacp_rate=fast xmit_hash_policy=layer2+3'
#上面是配置vlan时 bond0的,下面是配置vlan的+上面 如ifcfg-bond0.950 VLAN=yes VLAN_ID=950 PHYSDEV=bond0
#交换机做trunk防止每个vlan创建一个链路
NM_CONTROLLED=no
USERCTL=no
ONBOOT=yes
IPADDR=192.168.1.100
NETMASK=255.255.255.0
GATEWAY=192.168.1.1
DNS1=8.8.8.8
IPV6INIT=no
systemctl restart network
#开机自动加载
#/etc/modprobe.d/bonding.conf
alias bond0 bonding
options bond0 miimon=100 mode=4
install bond1 /sbin/modprobe bonding -o bond1 miimon=100 mode=0
install bond2 /sbin/modprobe bonding -o bond2 miimon=100 mode=1
install bond3 /sbin/modprobe bonding -o bond3 miimon=100 mode=0
#/etc/rc.local
nmcli con up eth1
nmcli con up bond0
ifenslave bond0 eth0 eth1
ifenslave bond1 eth2 eth3
ifenslave bond2 eth4 eth5
ifenslave bond3 eth6 eth7
cat /proc/net/bonding/bond0 #查看配置信息
```
## team
- broadcast 广播模式
- activebackup 主备模式
- roundrobin 轮循模式
- loadbalance 负载均衡模式
- lacp 802.3ad 链路聚合协议
```bash
nmcli connection show #查看所有网络
nmcli connection add type team con-name team0 ifname team0 config '{"runner":{"name":"loadbalance"}}' #添加team负载均衡模式
nmcli connection modify team0 ipv4.addresses "192.168.16.2/24"
nmcli connection modify team0 ipv4.gateway "192.168.16.1"
nmcli connection modify team0 ipv4.dns "192.168.16.1"
nmcli connection add type team-slave con-name team0-port1 ifname eth0 master team0
nmcli connection add type team-slave con-name team0-port2 ifname eth1 master team0
nmcli connection up team0
nmcli connection up team0-port1
nmcli connection up team0-port2
teamdctl team0 state
#vi /etc/sysconfig/network-scripts/ifcfg-team0 修改配置
#nmcli connection reload 加载配置
#systemctl restart network
nmcli con show
nmcli con delete team0
nmcli con delete team0-port1
nmcli con delete team0-port2
systemctl restart network
```
## centos
```bash
ip route | grep default #获取网关
ip n show | grep 10.0.2.2 #获取网关mac
ip link #查看网络设备
ethtool -i eth0 #查看eth0设备信息、驱动 bus-info(PCI序号)
ethtool -l eth0 #查看eth0的队列
ls /sys/devices/pci0000:00/0000:00:03.0/msi_irqs #根据PCI序号查看中断号(IRQ)列表
cat /proc/interrupts #查看中断统计
systemctl stop irqbalance #停用中断系统自动分配
systemctl disable irqbalance
echo kernel.numa_balancing=0 >> /etc/sysctl.conf #禁用内核NUMA平衡
sysctl -p
vi /etc/default/grub
GRUB_CMDLINE_LINUX_DEFAULT="xxx isolcpus=2-7" #将2-7号cpu从内核调度中去除, 其它用户进程不会被调度到2-7号cpu上, 可专用于某个特定应用, 并且2-7上的线程不会在这些cpu之间被自动迁移。但内核线程不受该参数限制.
grub2-mkconfig -o /etc/grub2.cfg
#每次重启系统需要重新查看中断号, 重新设置亲和的cpu
cd /proc/irq/19 #19是示例IRQ号
echo 64 > smp_affinity #将bitmap写入IRQ19的smp_affinity, 中断只使用cpu6: 1 << 6 = 64
cat /sys/devices/system/cpu/cpu6/topology/physical_package_id #查看cpu6所在的物理CPU(socket)编号
numactl -a -N netdev:eth0 grep allowed /proc/self/status #查看eth0与哪个NUMA节点/cpu亲和
numactl -a -N netdev:eth0 -C 6 grep allowed /proc/self/status #测试将eth0绑定到cpu6上
#systemd启动时可以用CPUAffinity=6 也可以用 ExecStart=/bin/numactl -aN netdev:em1 -C 6 /x/exe