It is now possible to create a link in L2_MACNAT mode. This mode is intended for desktop virtual machines that bridge to wireless interfaces. The mode must be specified when creating the first child interface; it cannot be changed afterwards. In contrast to L2 mode, MACNAT mode learns MAC and IP addresses from the outgoing traffic of child interfaces. The MAC address is translated for both TX and RX traffic. The number of learned addresses per child interface is limited: there can be at most IPVLAN_MAX_MACNAT_ADDRS addresses of each type (IPv4/IPv6). So far, patching is implemented for the Ethernet header and ARP. In addition, a dev_add_pack() protocol handler is attached to the main port to support communication from the main to the child interfaces. ToDo: support IPv6 Neighbour Discovery. Signed-off-by: Dmitry Skorodumov --- Documentation/networking/ipvlan.rst | 20 ++ drivers/net/ipvlan/ipvlan.h | 28 +++ drivers/net/ipvlan/ipvlan_core.c | 300 ++++++++++++++++++++++++++-- drivers/net/ipvlan/ipvlan_main.c | 148 ++++++++++++-- include/uapi/linux/if_link.h | 1 + 5 files changed, 457 insertions(+), 40 deletions(-) diff --git a/Documentation/networking/ipvlan.rst b/Documentation/networking/ipvlan.rst index 895d0ccfd596..c6fb2e6068b0 100644 --- a/Documentation/networking/ipvlan.rst +++ b/Documentation/networking/ipvlan.rst @@ -90,6 +90,26 @@ works in this mode and hence it is L3-symmetric (L3s). This will have slightly l performance but that shouldn't matter since you are choosing this mode over plain-L3 mode to make conn-tracking work. +4.4 L2_MACNAT mode: +------------------- + +This mode extends the L2 mode and is primarily designed for desktop virtual +machines that need to bridge to wireless interfaces. In standard L2 mode, +you must configure IP addresses on slave interfaces to enable frame +multiplexing between slaves and the master. + +In L2_MACNAT mode, IPVLAN automatically learns IPv4/IPv6 and MAC addresses +from outgoing packets. 
For transmitted packets, the source MAC address +is replaced with the MAC address of the main interface. Received packets +are routed to the interface that previously used the destination address, +and the destination MAC is replaced with the learned MAC address. + +This enables slave interfaces to automatically obtain IP addresses +via DHCP and IPv6 autoconfiguration. + +Additionally, dev_add_pack() is configured on the master interface to capture +outgoing frames and multiplex them to slave interfaces when necessary. + 5. Mode flags: ============== diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 50de3ee204db..c690e313ef6b 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -39,6 +39,8 @@ #define IPVLAN_QBACKLOG_LIMIT 1000 +#define IPVLAN_MAX_MACNAT_ADDRS 4 + typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, @@ -78,11 +80,13 @@ struct ipvl_addr { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; + u8 hwaddr[ETH_ALEN]; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ struct list_head anode; /* logical-interface linkage */ ipvl_hdr_type atype; + u64 tstamp; struct rcu_head rcu; }; @@ -91,6 +95,7 @@ struct ipvl_port { possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; + struct packet_type ipvl_ptype; u16 mode; u16 flags; u16 dev_id_start; @@ -103,6 +108,7 @@ struct ipvl_port { struct ipvl_skb_cb { bool tx_pkt; + void *mark; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) @@ -151,12 +157,34 @@ static inline void ipvlan_clear_vepa(struct ipvl_port *port) port->flags &= ~IPVLAN_F_VEPA; } +static inline bool ipvlan_is_macnat(struct ipvl_port *port) +{ + return port->mode == IPVLAN_MODE_L2_MACNAT; +} + +static inline void ipvlan_mark_skb(struct sk_buff *skb, struct net_device *dev) +{ + IPVL_SKB_CB(skb)->mark = dev; +} + +static inline 
bool ipvlan_is_skb_marked(struct sk_buff *skb, + struct net_device *dev) +{ + return (IPVL_SKB_CB(skb)->mark == dev); +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); +void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev); void ipvlan_process_multicast(struct work_struct *work); +void ipvlan_multicast_enqueue(struct ipvl_port *port, + struct sk_buff *skb, bool tx_pkt); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); +int ipvlan_add_addr(struct ipvl_dev *ipvlan, + void *iaddr, bool is_v6, const u8 *hwaddr); +void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index d7e3ddbcab6f..ba67cd8663e2 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -225,6 +225,42 @@ unsigned int ipvlan_mac_hash(const unsigned char *addr) return hash & IPVLAN_MAC_FILTER_MASK; } +static int ipvlan_macnat_xmit_phydev(struct ipvl_port *port, + struct sk_buff *skb, + bool lyr3h_valid, + void *lyr3h, int addr_type) +{ + struct sk_buff *orig_skb = skb; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + /* Use eth-addr of main as source. 
*/ + skb_reset_mac_header(skb); + ether_addr_copy(skb_eth_hdr(skb)->h_source, port->dev->dev_addr); + + if (!lyr3h_valid) { + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + orig_skb = skb; /* no need to reparse */ + } + + /* ToDo: Handle ICMPv6 for neighbours discovery.*/ + if (lyr3h && addr_type == IPVL_ARP) { + if (skb != orig_skb) + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + + if (lyr3h) { + struct arphdr *arph = (struct arphdr *)lyr3h; + + ether_addr_copy((u8 *)(arph + 1), port->dev->dev_addr); + } + } + + skb->dev = port->dev; + return dev_queue_xmit(skb); +} + void ipvlan_process_multicast(struct work_struct *work) { struct ipvl_port *port = container_of(work, struct ipvl_port, wq); @@ -285,9 +321,25 @@ void ipvlan_process_multicast(struct work_struct *work) if (tx_pkt) { /* If the packet originated here, send it out. */ - skb->dev = port->dev; - skb->pkt_type = pkt_type; - dev_queue_xmit(skb); + if (ipvlan_is_macnat(port)) { + /* Inject as rx-packet to main dev. */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + consumed = true; + local_bh_disable(); + nskb->pkt_type = pkt_type; + nskb->dev = port->dev; + dev_forward_skb(port->dev, nskb); + local_bh_enable(); + } + /* Send out */ + ipvlan_macnat_xmit_phydev(port, skb, false, + NULL, -1); + } else { + skb->dev = port->dev; + skb->pkt_type = pkt_type; + dev_queue_xmit(skb); + } } else { if (consumed) consume_skb(skb); @@ -299,7 +351,7 @@ void ipvlan_process_multicast(struct work_struct *work) } } -static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) +void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; @@ -311,8 +363,36 @@ static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; } -static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, - bool local) +static int ipvlan_macnat_rx_skb(struct ipvl_addr *addr, int addr_type, + struct sk_buff *skb) +{ + /* Here we have 
non-shared skb and free to modify it. */ + struct ethhdr *eth = eth_hdr(skb); + + if (addr_type == IPVL_ARP) { + struct arphdr *arph = arp_hdr(skb); + u8 *arp_ptr = (u8 *)(arph + 1); + u8 *dsthw = arp_ptr + addr->master->dev->addr_len + sizeof(u32); + const u8 *phy_addr = addr->master->phy_dev->dev_addr; + + /* Some access points may do ARP-proxy and answers us back. + * Client may treat this as address-conflict. + */ + if (ether_addr_equal(eth->h_source, phy_addr) && + ether_addr_equal(eth->h_dest, phy_addr) && + is_zero_ether_addr(dsthw)) { + return NET_RX_DROP; + } + if (ether_addr_equal(dsthw, phy_addr)) + ether_addr_copy(dsthw, addr->hwaddr); + } + + ether_addr_copy(eth->h_dest, addr->hwaddr); + return NET_RX_SUCCESS; +} + +static int ipvlan_rcv_frame(struct ipvl_addr *addr, int addr_type, + struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; @@ -322,10 +402,8 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; - /* Only packets exchanged between two local slaves need to have - * device-up check as well as skb-share check. 
- */ - if (local) { + + if (local || ipvlan_is_macnat(ipvlan->port)) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; @@ -336,6 +414,13 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, goto out; *pskb = skb; + if (ipvlan_is_macnat(ipvlan->port) && !local) { + if (ipvlan_macnat_rx_skb(addr, addr_type, skb) != + NET_RX_SUCCESS) { + kfree_skb(skb); + goto out; + } + } } if (local) { @@ -414,6 +499,120 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, return addr; } +static bool is_ipv4_usable(__be32 addr) +{ + return !ipv4_is_lbcast(addr) && !ipv4_is_multicast(addr) && + !ipv4_is_zeronet(addr); +} + +#if IS_ENABLED(CONFIG_IPV6) +static bool is_ipv6_usable(const struct in6_addr *addr) +{ + return !ipv6_addr_is_multicast(addr) && !ipv6_addr_loopback(addr) && + !ipv6_addr_any(addr); +} +#endif + +static void __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, + void *addr, bool is_v6, + const u8 *hwaddr) +{ + const ipvl_hdr_type atype = is_v6 ? 
IPVL_IPV6 : IPVL_IPV4; + struct ipvl_addr *ipvladdr, *oldest = NULL; + unsigned int naddrs = 0; + + spin_lock_bh(&ipvlan->addrs_lock); + + if (ipvlan_addr_busy(ipvlan->port, addr, is_v6)) + goto out_unlock; + + list_for_each_entry_rcu(ipvladdr, &ipvlan->addrs, anode) { + if (ipvladdr->atype != atype) + continue; + naddrs++; + if (!oldest || time_before64(ipvladdr->tstamp, oldest->tstamp)) + oldest = ipvladdr; + } + + if (naddrs < IPVLAN_MAX_MACNAT_ADDRS) { + oldest = NULL; + } else { + ipvlan_ht_addr_del(oldest); + list_del_rcu(&oldest->anode); + } + + ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr); + +out_unlock: + spin_unlock_bh(&ipvlan->addrs_lock); + if (oldest) + kfree_rcu(oldest, rcu); +} + +static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, + int addr_type, const u8 *hwaddr) +{ + struct ipvl_addr *ipvladdr; + void *addr = NULL; + bool is_v6; + + switch (addr_type) { +#if IS_ENABLED(CONFIG_IPV6) + /* No need to handle IPVL_ICMPV6, it never has valid src-address. */ + case IPVL_IPV6: { + struct ipv6hdr *ip6h; + + ip6h = (struct ipv6hdr *)lyr3h; + if (!is_ipv6_usable(&ip6h->saddr)) + return; + is_v6 = true; + addr = &ip6h->saddr; + break; + } +#endif + case IPVL_IPV4: { + struct iphdr *ip4h; + __be32 *i4addr; + + ip4h = (struct iphdr *)lyr3h; + i4addr = &ip4h->saddr; + if (!is_ipv4_usable(*i4addr)) + return; + is_v6 = false; + addr = i4addr; + break; + } + case IPVL_ARP: { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 *i4addr; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + arp_ptr += ipvlan->port->dev->addr_len; + i4addr = (__be32 *)arp_ptr; + if (!is_ipv4_usable(*i4addr)) + return; + is_v6 = false; + addr = i4addr; + break; + } + default: + return; + } + + /* handle situation when MAC changed, but IP is the same. 
*/ + ipvladdr = ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6); + if (ipvladdr && !ether_addr_equal(ipvladdr->hwaddr, hwaddr)) { + /* del_addr is safe to call, because we are inside xmit. */ + ipvlan_del_addr(ipvladdr->master, addr, is_v6); + ipvladdr = NULL; + } + + if (!ipvladdr) + __ipvlan_macnat_addr_learn(ipvlan, addr, is_v6, hwaddr); +} + static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -561,8 +760,8 @@ static int ipvlan_process_outbound(struct sk_buff *skb) return ret; } -static void ipvlan_multicast_enqueue(struct ipvl_port *port, - struct sk_buff *skb, bool tx_pkt) +void ipvlan_multicast_enqueue(struct ipvl_port *port, + struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); @@ -607,7 +806,7 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } - ipvlan_rcv_frame(addr, &skb, true); + ipvlan_rcv_frame(addr, addr_type, &skb, true); return NET_XMIT_SUCCESS; } } @@ -634,7 +833,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } - ipvlan_rcv_frame(addr, &skb, true); + ipvlan_rcv_frame(addr, -1, &skb, true); return NET_XMIT_SUCCESS; } } @@ -661,6 +860,61 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) return dev_queue_xmit(skb); } +static int ipvlan_xmit_mode_macnat(struct sk_buff *skb, struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = skb_eth_hdr(skb); + struct ipvl_addr *addr; + int addr_type; + void *lyr3h; + + /* Ignore tx-packets from host and don't allow to use main addr. 
*/ + if (ether_addr_equal(eth->h_source, dev->dev_addr) || + ether_addr_equal(eth->h_source, ipvlan->phy_dev->dev_addr)) + goto out_drop; + + /* Mark SKB in advance */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + ipvlan_mark_skb(skb, ipvlan->phy_dev); + + lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); + if (lyr3h) + ipvlan_macnat_addr_learn(ipvlan, lyr3h, addr_type, + eth->h_source); + + if (is_multicast_ether_addr(eth->h_dest)) { + skb_reset_mac_header(skb); + ipvlan_skb_crossing_ns(skb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, skb, true); + return NET_XMIT_SUCCESS; + } else if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->dev_addr)) { + /* It is a packet from child with destination to main port. + * Pass it to main. + */ + skb->pkt_type = PACKET_HOST; + skb->dev = ipvlan->phy_dev; + dev_forward_skb(ipvlan->phy_dev, skb); + return NET_XMIT_SUCCESS; + } else if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) + goto out_drop; + + ipvlan_rcv_frame(addr, addr_type, &skb, true); + return NET_XMIT_SUCCESS; + } + } + + return ipvlan_macnat_xmit_phydev(ipvlan->port, skb, true, lyr3h, + addr_type); +out_drop: + consume_skb(skb); + return NET_XMIT_DROP; +} + int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); @@ -675,6 +929,8 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) switch(port->mode) { case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); + case IPVLAN_MODE_L2_MACNAT: + return ipvlan_xmit_mode_macnat(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: @@ -724,8 +980,7 @@ static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) - ret = ipvlan_rcv_frame(addr, pskb, false); - + ret = ipvlan_rcv_frame(addr, addr_type, pskb, false); out: 
return ret; } @@ -737,17 +992,23 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; - if (is_multicast_ether_addr(eth->h_dest)) { - if (ipvlan_external_frame(skb, port)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + /* Ignore already seen packets. */ + if (ipvlan_is_skb_marked(skb, port->dev)) + return RX_HANDLER_PASS; + if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_external_frame(skb, port) || + ipvlan_is_macnat(port)) { /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. */ + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + ipvlan_mark_skb(skb, port->dev); ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } @@ -770,6 +1031,7 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) switch (port->mode) { case IPVLAN_MODE_L2: + case IPVLAN_MODE_L2_MACNAT: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 660f3db11766..f27af7709a5b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -16,6 +16,15 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ASSERT_RTNL(); if (port->mode != nval) { + /* Don't allow switch off the learnable bridge mode. + * Flags also must be set from the first port-link setup. 
+ */ + if (port->mode == IPVLAN_MODE_L2_MACNAT || + (nval == IPVLAN_MODE_L2_MACNAT && port->count > 1)) { + netdev_err(port->dev, "MACNAT mode cannot be changed.\n"); + return -EINVAL; + } + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { @@ -40,7 +49,10 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ipvlan_l3s_unregister(port); } port->mode = nval; + if (port->mode == IPVLAN_MODE_L2_MACNAT) + dev_add_pack(&port->ipvl_ptype); } + return 0; fail: @@ -59,6 +71,67 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, return err; } +static int ipvlan_macnat_port_rcv(struct sk_buff *skb, struct net_device *wdev, + struct packet_type *pt, + struct net_device *orig_wdev) +{ + struct ipvl_port *port; + struct ipvl_addr *addr; + struct ethhdr *eth; + int addr_type; + void *lyr3h; + + port = container_of(pt, struct ipvl_port, ipvl_ptype); + /* We are interested only in outgoing packets. + * rx-path is handled in rx_handler(). 
+ */ + if (skb->pkt_type != PACKET_OUTGOING || + ipvlan_is_skb_marked(skb, port->dev)) + goto out; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto no_mem; + + /* data should point to eth-header */ + skb_push(skb, skb->data - skb_mac_header(skb)); + skb->dev = port->dev; + eth = eth_hdr(skb); + + if (is_multicast_ether_addr(eth->h_dest)) { + ipvlan_skb_crossing_ns(skb, NULL); + skb->protocol = eth_type_trans(skb, skb->dev); + skb->pkt_type = PACKET_HOST; + ipvlan_mark_skb(skb, port->dev); + ipvlan_multicast_enqueue(port, skb, false); + return NET_RX_SUCCESS; + } + + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) { + struct ipvl_dev *ipvlan = addr->master; + int ret, len; + + ipvlan_skb_crossing_ns(skb, ipvlan->dev); + skb->protocol = eth_type_trans(skb, skb->dev); + skb->pkt_type = PACKET_HOST; + ipvlan_mark_skb(skb, port->dev); + len = skb->len + ETH_HLEN; + ret = netif_rx(skb); + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, false); + return NET_RX_SUCCESS; + } + +out: + dev_kfree_skb(skb); +no_mem: + return NET_RX_DROP; +} + static int ipvlan_port_create(struct net_device *dev) { struct ipvl_port *port; @@ -84,6 +157,11 @@ static int ipvlan_port_create(struct net_device *dev) if (err) goto err; + port->ipvl_ptype.func = ipvlan_macnat_port_rcv; + port->ipvl_ptype.type = htons(ETH_P_ALL); + port->ipvl_ptype.dev = dev; + port->ipvl_ptype.list.prev = LIST_POISON2; + netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); return 0; @@ -100,6 +178,8 @@ static void ipvlan_port_destroy(struct net_device *dev) netdev_put(dev, &port->dev_tracker); if (port->mode == IPVLAN_MODE_L3S) ipvlan_l3s_unregister(port); + if (port->ipvl_ptype.list.prev != LIST_POISON2) + dev_remove_pack(&port->ipvl_ptype); netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); while ((skb = __skb_dequeue(&port->backlog)) != NULL) { @@ -189,10 +269,13 @@ static int 
ipvlan_open(struct net_device *dev) else dev->flags &= ~IFF_NOARP; - rcu_read_lock(); - list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) - ipvlan_ht_addr_add(ipvlan, addr); - rcu_read_unlock(); + /* for learnable, addresses will be obtained from tx-packets. */ + if (!ipvlan_is_macnat(ipvlan->port)) { + rcu_read_lock(); + list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_add(ipvlan, addr); + rcu_read_unlock(); + } return 0; } @@ -581,11 +664,21 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, INIT_LIST_HEAD(&ipvlan->addrs); spin_lock_init(&ipvlan->addrs_lock); - /* TODO Probably put random address here to be presented to the - * world but keep using the physical-dev address for the outgoing - * packets. + /* Flags are per port and latest update overrides. User has + * to be consistent in setting it just like the mode attribute. */ - eth_hw_addr_set(dev, phy_dev->dev_addr); + if (data && data[IFLA_IPVLAN_MODE]) + mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + if (mode != IPVLAN_MODE_L2_MACNAT) { + /* TODO Probably put random address here to be presented to the + * world but keep using the physical-dev addr for the outgoing + * packets. + */ + eth_hw_addr_set(dev, phy_dev->dev_addr); + } else { + eth_hw_addr_random(dev); + } dev->priv_flags |= IFF_NO_RX_HANDLER; @@ -597,6 +690,9 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, port = ipvlan_port_get_rtnl(phy_dev); ipvlan->port = port; + if (data && data[IFLA_IPVLAN_FLAGS]) + port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); + /* If the port-id base is at the MAX value, then wrap it around and * begin from 0x1 again. This may be due to a busy system where lots * of slaves are getting created and deleted. @@ -625,19 +721,13 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, if (err) goto remove_ida; - /* Flags are per port and latest update overrides. 
User has - * to be consistent in setting it just like the mode attribute. - */ - if (data && data[IFLA_IPVLAN_FLAGS]) - port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); - - if (data && data[IFLA_IPVLAN_MODE]) - mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); - err = ipvlan_set_port_mode(port, mode, extack); if (err) goto unlink_netdev; + if (ipvlan_is_macnat(port)) + dev_set_allmulti(dev, 1); + list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); netif_stacked_transfer_operstate(phy_dev, dev); return 0; @@ -657,6 +747,9 @@ void ipvlan_link_delete(struct net_device *dev, struct list_head *head) struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; + if (ipvlan_is_macnat(ipvlan->port)) + dev_set_allmulti(dev, -1); + spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); @@ -793,6 +886,9 @@ static int ipvlan_device_event(struct notifier_block *unused, break; case NETDEV_CHANGEADDR: + if (ipvlan_is_macnat(port)) + break; + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { eth_hw_addr_set(ipvlan->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); @@ -813,7 +909,8 @@ static int ipvlan_device_event(struct notifier_block *unused, } /* the caller must held the addrs lock */ -static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, + const u8 *hwaddr) { struct ipvl_addr *addr; @@ -822,6 +919,7 @@ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) return -ENOMEM; addr->master = ipvlan; + addr->tstamp = get_jiffies_64(); if (!is_v6) { memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); addr->atype = IPVL_IPV4; @@ -831,6 +929,8 @@ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) addr->atype = IPVL_IPV6; #endif } + if (hwaddr) + ether_addr_copy(addr->hwaddr, hwaddr); list_add_tail_rcu(&addr->anode, &ipvlan->addrs); @@ 
-843,7 +943,7 @@ static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) return 0; } -static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; @@ -884,7 +984,7 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip6_addr, true); + ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } @@ -928,6 +1028,9 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; + if (ipvlan_is_macnat(ipvlan->port)) + return notifier_from_errno(-EADDRNOTAVAIL); + switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { @@ -952,7 +1055,7 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip4_addr, false); + ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } @@ -999,6 +1102,9 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; + if (ipvlan_is_macnat(ipvlan->port)) + return notifier_from_errno(-EADDRNOTAVAIL); + switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..64ecb1d739d0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1269,6 +1269,7 @@ enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, IPVLAN_MODE_L3S, + IPVLAN_MODE_L2_MACNAT, IPVLAN_MODE_MAX }; -- 2.25.1 Some WiFi environments sometimes send mcast packets with unicast 
eth_dst. Forcibly replace eth_dst with the broadcast address in this case if the port is in L2_MACNAT mode. Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 62 ++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ba67cd8663e2..aa79368b4559 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -985,18 +985,69 @@ static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, return ret; } +static bool ipvlan_is_mcast(struct ipvl_port *port, void *lyr3h, int addr_type) +{ + switch (addr_type) { +#if IS_ENABLED(CONFIG_IPV6) + /* No need to handle ICMPv6. This type is used for DAD only. */ + case IPVL_IPV6: + return !is_ipv6_usable(&((struct ipv6hdr *)lyr3h)->daddr); +#endif + case IPVL_IPV4: { + /* Treat mcast, bcast and zero as multicast. */ + __be32 i4addr = ((struct iphdr *)lyr3h)->daddr; + + return !is_ipv4_usable(i4addr); + } + case IPVL_ARP: { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 i4addr; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + arp_ptr += (2 * port->dev->addr_len) + 4; + i4addr = *(__be32 *)arp_ptr; + return !is_ipv4_usable(i4addr); + } + } + return false; +} + +static bool ipvlan_is_l2_mcast(struct ipvl_port *port, struct sk_buff *skb, + bool *need_eth_fix) +{ + int addr_type; + void *lyr3h; + + /* In some wifi environments unicast dest address means nothing. + * IP still can be a mcast and frame should be treated as mcast. 
+ */ + *need_eth_fix = false; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) + return true; + + if (!ipvlan_is_macnat(port)) + return false; + + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + *need_eth_fix = lyr3h && ipvlan_is_mcast(port, lyr3h, addr_type); + + return *need_eth_fix; +} + static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { - struct sk_buff *skb = *pskb; - struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; + struct sk_buff *skb = *pskb; + bool need_eth_fix; /* Ignore already seen packets. */ if (ipvlan_is_skb_marked(skb, port->dev)) return RX_HANDLER_PASS; - if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_is_l2_mcast(port, skb, &need_eth_fix)) { if (ipvlan_external_frame(skb, port) || ipvlan_is_macnat(port)) { /* External frames are queued for device local @@ -1008,6 +1059,11 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { + if (need_eth_fix) { + struct ethhdr *eth = eth_hdr(nskb); + + eth_broadcast_addr(eth->h_dest); + } ipvlan_mark_skb(skb, port->dev); ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); -- 2.25.1 When an ipvlan interface goes down, forget all learned addresses. This is a way to clean up addresses when the master dev switches to another network. 
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 49 ++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f27af7709a5b..288b4d103b97 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -742,14 +742,10 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, } EXPORT_SYMBOL_GPL(ipvlan_link_new); -void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +static void ipvlan_addrs_forget_all(struct ipvl_dev *ipvlan) { - struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; - if (ipvlan_is_macnat(ipvlan->port)) - dev_set_allmulti(dev, -1); - spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); @@ -757,6 +753,16 @@ void ipvlan_link_delete(struct net_device *dev, struct list_head *head) kfree_rcu(addr, rcu); } spin_unlock_bh(&ipvlan->addrs_lock); +} + +void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (ipvlan_is_macnat(ipvlan->port)) + dev_set_allmulti(dev, -1); + + ipvlan_addrs_forget_all(ipvlan); ida_free(&ipvlan->port->ida, dev->dev_id); list_del_rcu(&ipvlan->pnode); @@ -814,6 +820,19 @@ int ipvlan_link_register(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(ipvlan_link_register); +static bool ipvlan_is_valid_dev(const struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (!netif_is_ipvlan(dev)) + return false; + + if (!ipvlan || !ipvlan->port) + return false; + + return true; +} + static int ipvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -825,6 +844,13 @@ static int ipvlan_device_event(struct notifier_block *unused, LIST_HEAD(lst_kill); int err; + if (event == NETDEV_DOWN && ipvlan_is_valid_dev(dev)) { + struct ipvl_dev 
*ipvlan = netdev_priv(dev); + + ipvlan_addrs_forget_all(ipvlan); + return NOTIFY_DONE; + } + if (!netif_is_ipvlan_port(dev)) return NOTIFY_DONE; @@ -960,19 +986,6 @@ void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) kfree_rcu(addr, rcu); } -static bool ipvlan_is_valid_dev(const struct net_device *dev) -{ - struct ipvl_dev *ipvlan = netdev_priv(dev); - - if (!netif_is_ipvlan(dev)) - return false; - - if (!ipvlan || !ipvlan->port) - return false; - - return true; -} - #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { -- 2.25.1 To make IPv6 work with macnat mode, need to process the TX-path: * Replace Source-ll-addr in Solicitation ndisc, * Replace Target-ll-addr in Advertisement ndisc No need to do anything in RX-path Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 128 ++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index aa79368b4559..97107d9ce20c 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -4,6 +4,7 @@ #include #include +#include #include "ipvlan.h" @@ -225,6 +226,115 @@ unsigned int ipvlan_mac_hash(const unsigned char *addr) return hash & IPVLAN_MAC_FILTER_MASK; } +static void ipvlan_macnat_patch_tx_arp(struct ipvl_port *port, + struct sk_buff *skb) +{ + struct arphdr *arph; + int addr_type; + + arph = (struct arphdr *)ipvlan_get_L3_hdr(port, skb, + &addr_type); + ether_addr_copy((u8 *)(arph + 1), port->dev->dev_addr); +} + +#if IS_ENABLED(CONFIG_IPV6) + +static u8 *ipvlan_search_icmp6_ll_addr(struct sk_buff *skb, u8 icmp_option) +{ + /* skb is ensured to pullable for all ipv6 payload_len by caller */ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct icmp6hdr *icmph; + int ndsize, curr_off; + + icmph = (struct icmp6hdr *)(ip6h + 1); + ndsize = (int)ntohs(ip6h->payload_len); + curr_off = sizeof(*icmph); + 
+ if (icmph->icmp6_type != NDISC_ROUTER_SOLICITATION) + curr_off += sizeof(struct in6_addr); + + while ((curr_off + 2) < ndsize) { + u8 *data = (u8 *)icmph + curr_off; + u32 opt_len = data[1] << 3; + + if (unlikely(opt_len == 0)) + return NULL; + + if (data[0] != icmp_option) { + curr_off += opt_len; + continue; + } + + if (unlikely(opt_len < ETH_ALEN + 2)) + return NULL; + + if (unlikely(curr_off + opt_len > ndsize)) + return NULL; + + return data + 2; + } + + return NULL; +} + +static void ipvlan_macnat_patch_tx_ipv6(struct ipvl_port *port, + struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + struct icmp6hdr *icmph; + u8 icmp_option; + u8 *lladdr; + u16 ndsize; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) + return; + + if (ipv6_hdr(skb)->nexthdr != NEXTHDR_ICMP) + return; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)))) + return; + + ip6h = ipv6_hdr(skb); + icmph = (struct icmp6hdr *)(ip6h + 1); + + /* Patch Source-LL for solicitation, Target-LL for advertisement */ + if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmph->icmp6_type == NDISC_ROUTER_SOLICITATION) + icmp_option = ND_OPT_SOURCE_LL_ADDR; + else if (icmph->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) + icmp_option = ND_OPT_TARGET_LL_ADDR; + else + return; + + ndsize = (int)ntohs(ip6h->payload_len); + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + ndsize))) + return; + + lladdr = ipvlan_search_icmp6_ll_addr(skb, icmp_option); + if (!lladdr) + return; + + ether_addr_copy(lladdr, port->dev->dev_addr); + + ip6h = ipv6_hdr(skb); + icmph = (struct icmp6hdr *)(ip6h + 1); + icmph->icmp6_cksum = 0; + icmph->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ndsize, + IPPROTO_ICMPV6, + csum_partial(icmph, + ndsize, + 0)); + skb->ip_summed = CHECKSUM_COMPLETE; +} +#else +static void ipvlan_macnat_patch_tx_ipv6(struct ipvl_port *port, + struct sk_buff *skb) +{ +} +#endif + static int ipvlan_macnat_xmit_phydev(struct ipvl_port *port, struct sk_buff *skb, bool 
lyr3h_valid, @@ -244,18 +354,12 @@ static int ipvlan_macnat_xmit_phydev(struct ipvl_port *port, lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); orig_skb = skb; /* no need to reparse */ } - - /* ToDo: Handle ICMPv6 for neighbours discovery.*/ - if (lyr3h && addr_type == IPVL_ARP) { - if (skb != orig_skb) - lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); - - if (lyr3h) { - struct arphdr *arph = (struct arphdr *)lyr3h; - - ether_addr_copy((u8 *)(arph + 1), port->dev->dev_addr); - } - } + if (!lyr3h) + addr_type = -1; + else if (addr_type == IPVL_ARP) + ipvlan_macnat_patch_tx_arp(port, skb); + else if (addr_type == IPVL_ICMPV6 || addr_type == IPVL_IPV6) + ipvlan_macnat_patch_tx_ipv6(port, skb); skb->dev = port->dev; return dev_queue_xmit(skb); -- 2.25.1 Fixed a compilation warning: ipvlan_core.c:56: warning: incorrect type in argument 1 (different base types) expected unsigned int [usertype] a got restricted __be32 const [usertype] s_addr Force cast the s_addr to u32 Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 97107d9ce20c..f3f34581339c 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -53,8 +53,8 @@ static u8 ipvlan_get_v4_hash(const void *iaddr) { const struct in_addr *ip4_addr = iaddr; - return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & - IPVLAN_HASH_MASK; + return jhash_1word((__force u32)ip4_addr->s_addr, ipvlan_jhash_secret) & + IPVLAN_HASH_MASK; } static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr) -- 2.25.1 Make the addrs_lock be per port, not per ipvlan dev. This appears to be a very minor problem though. Since it's highly unlikely that ipvlan_add_addr() will be called on 2 CPU simultaneously. But nevertheless, this may cause: 1. 
False-negative of ipvlan_addr_busy(): one interface iterated through all port->ipvlans + ipvlan->addrs under some ipvlan spinlock, and another added IP under its own lock. Though this is only possible for IPv6, since looks like only ipvlan_addr6_event() can be called without rtnl_lock. 2. Race since ipvlan_ht_addr_add(port) is called under different ipvlan->addrs_lock locks This should not affect performance, since add/remove IP is a rare situation and spinlock is not locked on fast paths. Also, it's quite convenient to have addrs_lock on ipvl_port, to dynamically prevent conflict of IPs with addresses on main port. CC: Paolo Abeni Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan.h | 2 +- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- drivers/net/ipvlan/ipvlan_main.c | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index c690e313ef6b..0ab1797c6128 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -71,7 +71,6 @@ struct ipvl_dev { DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); netdev_features_t sfeatures; u32 msg_enable; - spinlock_t addrs_lock; }; struct ipvl_addr { @@ -94,6 +93,7 @@ struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; + spinlock_t addrs_lock; /* guards hash-table and addrs */ struct list_head ipvlans; struct packet_type ipvl_ptype; u16 mode; diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index f3f34581339c..8a39b1c170b3 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -625,7 +625,7 @@ static void __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, struct ipvl_addr *ipvladdr, *oldest = NULL; unsigned int naddrs = 0; - spin_lock_bh(&ipvlan->addrs_lock); + spin_lock_bh(&ipvlan->port->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, addr, is_v6)) goto out_unlock; @@ -648,7 +648,7 @@ 
static void __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr); out_unlock: - spin_unlock_bh(&ipvlan->addrs_lock); + spin_unlock_bh(&ipvlan->port->addrs_lock); if (oldest) kfree_rcu(oldest, rcu); } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 288b4d103b97..c1df97a88a40 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -148,6 +148,7 @@ static int ipvlan_port_create(struct net_device *dev) for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) INIT_HLIST_HEAD(&port->hlhead[idx]); + spin_lock_init(&port->addrs_lock); skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); ida_init(&port->ida); @@ -662,7 +663,6 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, if (!tb[IFLA_MTU]) ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); - spin_lock_init(&ipvlan->addrs_lock); /* Flags are per port and latest update overrides. User has * to be consistent in setting it just like the mode attribute. 
@@ -746,13 +746,13 @@ static void ipvlan_addrs_forget_all(struct ipvl_dev *ipvlan) { struct ipvl_addr *addr, *next; - spin_lock_bh(&ipvlan->addrs_lock); + spin_lock_bh(&ipvlan->port->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); kfree_rcu(addr, rcu); } - spin_unlock_bh(&ipvlan->addrs_lock); + spin_unlock_bh(&ipvlan->port->addrs_lock); } void ipvlan_link_delete(struct net_device *dev, struct list_head *head) @@ -973,16 +973,16 @@ void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; - spin_lock_bh(&ipvlan->addrs_lock); + spin_lock_bh(&ipvlan->port->addrs_lock); addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); if (!addr) { - spin_unlock_bh(&ipvlan->addrs_lock); + spin_unlock_bh(&ipvlan->port->addrs_lock); return; } ipvlan_ht_addr_del(addr); list_del_rcu(&addr->anode); - spin_unlock_bh(&ipvlan->addrs_lock); + spin_unlock_bh(&ipvlan->port->addrs_lock); kfree_rcu(addr, rcu); } @@ -991,14 +991,14 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { int ret = -EINVAL; - spin_lock_bh(&ipvlan->addrs_lock); + spin_lock_bh(&ipvlan->port->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL); - spin_unlock_bh(&ipvlan->addrs_lock); + spin_unlock_bh(&ipvlan->port->addrs_lock); return ret; } @@ -1062,14 +1062,14 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) { int ret = -EINVAL; - spin_lock_bh(&ipvlan->addrs_lock); + spin_lock_bh(&ipvlan->port->addrs_lock); if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) netif_err(ipvlan, ifup, ipvlan->dev, "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL); - spin_unlock_bh(&ipvlan->addrs_lock); + 
spin_unlock_bh(&ipvlan->port->addrs_lock); return ret; } -- 2.25.1 ipvlan_open() walks the address list without taking the address lock. The code appears to have been written on the assumption that every address change happens under rtnl_lock(), but that is not true for the IPv6 case. So take addrs_lock in ipvlan_open(). Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c1df97a88a40..27d289aadef1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -262,20 +262,20 @@ static void ipvlan_uninit(struct net_device *dev) static int ipvlan_open(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan->port; struct ipvl_addr *addr; - if (ipvlan->port->mode == IPVLAN_MODE_L3 || - ipvlan->port->mode == IPVLAN_MODE_L3S) + if (port->mode == IPVLAN_MODE_L3 || port->mode == IPVLAN_MODE_L3S) dev->flags |= IFF_NOARP; else dev->flags &= ~IFF_NOARP; /* for learnable, addresses will be obtained from tx-packets. */ - if (!ipvlan_is_macnat(ipvlan->port)) { - rcu_read_lock(); + if (!ipvlan_is_macnat(port)) { + spin_lock_bh(&port->addrs_lock); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_add(ipvlan, addr); - rcu_read_unlock(); + spin_unlock_bh(&port->addrs_lock); } return 0; -- 2.25.1 Remember all ip-addresses on main iface and check in ipvlan_addr_busy() that addr is not used on main. Store IPs in separate list. Remember IP address at port create and listen for addr-change events. Don't allow to configure addresses on children with addresses of main. In learning mode, child may not learn the address if it is used on main.
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan.h | 13 ++ drivers/net/ipvlan/ipvlan_core.c | 41 ++++--- drivers/net/ipvlan/ipvlan_main.c | 205 +++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+), 14 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 0ab1797c6128..faba1308c135 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -89,10 +89,21 @@ struct ipvl_addr { struct rcu_head rcu; }; +struct ipvl_port_addr { + union { + struct in6_addr ip6; + struct in_addr ip4; + } ipu; + ipvl_hdr_type atype; + struct list_head anode; + struct rcu_head rcu; +}; + struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; + struct list_head port_addrs; /* addresses of main iface.*/ spinlock_t addrs_lock; /* guards hash-table and addrs */ struct list_head ipvlans; struct packet_type ipvl_ptype; @@ -199,6 +210,8 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); int ipvlan_link_register(struct rtnl_link_ops *ops); +struct ipvl_port_addr *ipvlan_port_find_addr(struct ipvl_port *port, + const void *iaddr, bool is_v6); #ifdef CONFIG_IPVLAN_L3S int ipvlan_l3s_register(struct ipvl_port *port); void ipvlan_l3s_unregister(struct ipvl_port *port); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 8a39b1c170b3..f08edaaae61d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -133,6 +133,8 @@ bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) break; } } + if (!ret) + ret = !!ipvlan_port_find_addr(port, iaddr, is_v6); rcu_read_unlock(); return ret; } @@ -617,18 +619,22 @@ static bool is_ipv6_usable(const struct in6_addr *addr) } #endif -static void __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, - void 
*addr, bool is_v6, - const u8 *hwaddr) +static int __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, + void *addr, bool is_v6, + const u8 *hwaddr) { const ipvl_hdr_type atype = is_v6 ? IPVL_IPV6 : IPVL_IPV4; struct ipvl_addr *ipvladdr, *oldest = NULL; unsigned int naddrs = 0; + int ret = -1; spin_lock_bh(&ipvlan->port->addrs_lock); + if (ipvlan_port_find_addr(ipvlan->port, addr, is_v6)) + goto out_unlock; /* used by main. */ + if (ipvlan_addr_busy(ipvlan->port, addr, is_v6)) - goto out_unlock; + goto out_unlock; /* used by other ipvlan. */ list_for_each_entry_rcu(ipvladdr, &ipvlan->addrs, anode) { if (ipvladdr->atype != atype) @@ -646,15 +652,19 @@ static void __ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, } ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr); + ret = 0; out_unlock: spin_unlock_bh(&ipvlan->port->addrs_lock); if (oldest) kfree_rcu(oldest, rcu); + + return ret; } -static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, - int addr_type, const u8 *hwaddr) +/* return -1 if frame should be dropped. 
*/ +static int ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, + int addr_type, const u8 *hwaddr) { struct ipvl_addr *ipvladdr; void *addr = NULL; @@ -668,7 +678,7 @@ static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, ip6h = (struct ipv6hdr *)lyr3h; if (!is_ipv6_usable(&ip6h->saddr)) - return; + return 0; is_v6 = true; addr = &ip6h->saddr; break; @@ -681,7 +691,7 @@ static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, ip4h = (struct iphdr *)lyr3h; i4addr = &ip4h->saddr; if (!is_ipv4_usable(*i4addr)) - return; + return 0; is_v6 = false; addr = i4addr; break; @@ -696,17 +706,18 @@ static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, arp_ptr += ipvlan->port->dev->addr_len; i4addr = (__be32 *)arp_ptr; if (!is_ipv4_usable(*i4addr)) - return; + return 0; is_v6 = false; addr = i4addr; break; } default: - return; + return 0; } /* handle situation when MAC changed, but IP is the same. */ ipvladdr = ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6); + if (ipvladdr && !ether_addr_equal(ipvladdr->hwaddr, hwaddr)) { /* del_addr is safe to call, because we are inside xmit. 
*/ ipvlan_del_addr(ipvladdr->master, addr, is_v6); @@ -714,7 +725,9 @@ static void ipvlan_macnat_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, } if (!ipvladdr) - __ipvlan_macnat_addr_learn(ipvlan, addr, is_v6, hwaddr); + return __ipvlan_macnat_addr_learn(ipvlan, addr, is_v6, hwaddr); + + return 0; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) @@ -984,9 +997,9 @@ static int ipvlan_xmit_mode_macnat(struct sk_buff *skb, struct net_device *dev) ipvlan_mark_skb(skb, ipvlan->phy_dev); lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); - if (lyr3h) - ipvlan_macnat_addr_learn(ipvlan, lyr3h, addr_type, - eth->h_source); + if (lyr3h && ipvlan_macnat_addr_learn(ipvlan, lyr3h, addr_type, + eth->h_source) < 0) + goto out_drop; if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 27d289aadef1..c8cf3b85fce1 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -132,6 +132,124 @@ static int ipvlan_macnat_port_rcv(struct sk_buff *skb, struct net_device *wdev, return NET_RX_DROP; } +static int ipvlan_port_add_addr(struct ipvl_port *port, const void *iaddr, + bool is_v6, bool can_block) +{ + gfp_t gfp_flags = can_block ? 
GFP_KERNEL : GFP_ATOMIC; + struct ipvl_port_addr *addr; + + addr = kzalloc(sizeof(*addr), gfp_flags); + if (!addr) + return -ENOMEM; + if (!is_v6) { + memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); + addr->atype = IPVL_IPV4; + } else { + memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr)); + addr->atype = IPVL_IPV6; + } + + spin_lock_bh(&port->addrs_lock); + list_add_tail_rcu(&addr->anode, &port->port_addrs); + spin_unlock_bh(&port->addrs_lock); + + return 0; +} + +static bool portaddr_equal(bool is_v6, const struct ipvl_port_addr *addr, + const void *iaddr) +{ + if (!is_v6 && addr->atype == IPVL_IPV4) { + struct in_addr *i4addr = (struct in_addr *)iaddr; + + return addr->ip4addr.s_addr == i4addr->s_addr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (is_v6 && addr->atype == IPVL_IPV6) { + struct in6_addr *i6addr = (struct in6_addr *)iaddr; + + return ipv6_addr_equal(&addr->ip6addr, i6addr); +#endif + } + + return false; +} + +struct ipvl_port_addr *ipvlan_port_find_addr(struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + struct ipvl_port_addr *addr; + + list_for_each_entry_rcu(addr, &port->port_addrs, anode) + if (portaddr_equal(is_v6, addr, iaddr)) + return addr; + return NULL; +} + +static void ipvlan_port_del_addr(struct ipvl_port *port, const void *iaddr, + bool is_v6) +{ + struct ipvl_port_addr *addr; + + spin_lock_bh(&port->addrs_lock); + addr = ipvlan_port_find_addr(port, iaddr, is_v6); + if (addr) + list_del_rcu(&addr->anode); + spin_unlock_bh(&port->addrs_lock); + + if (addr) + kfree_rcu(addr, rcu); +} + +static int ipvlan_port_enum_addrs(struct ipvl_port *port) +{ + struct inet6_dev *in6_dev __maybe_unused; + const struct in_device *in_dev; + int r = 0; + + ASSERT_RTNL(); + + in_dev = __in_dev_get_rtnl(port->dev); + if (in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + r = ipvlan_port_add_addr(port, &ifa->ifa_local, false, + true); + if (r < 0) + return r; + } + } + +#if IS_ENABLED(CONFIG_IPV6) + 
in6_dev = __in6_dev_get(port->dev); + if (in6_dev) { + const struct inet6_ifaddr *ifa6; + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifa6, &in6_dev->addr_list, if_list) { + r = ipvlan_port_add_addr(port, &ifa6->addr, true, + false); + if (r < 0) + break; + } + read_unlock_bh(&in6_dev->lock); + } +#endif + return r; +} + +static void ipvlan_port_free_port_addrs(struct ipvl_port *port) +{ + struct ipvl_port_addr *addr, *next; + + ASSERT_RTNL(); + + list_for_each_entry_safe(addr, next, &port->port_addrs, anode) { + list_del_rcu(&addr->anode); + kfree_rcu(addr, rcu); + } +} + static int ipvlan_port_create(struct net_device *dev) { struct ipvl_port *port; @@ -148,12 +266,15 @@ static int ipvlan_port_create(struct net_device *dev) for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) INIT_HLIST_HEAD(&port->hlhead[idx]); + INIT_LIST_HEAD(&port->port_addrs); spin_lock_init(&port->addrs_lock); skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); ida_init(&port->ida); port->dev_id_start = 1; + ipvlan_port_enum_addrs(port); + err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) goto err; @@ -167,6 +288,7 @@ static int ipvlan_port_create(struct net_device *dev) return 0; err: + ipvlan_port_free_port_addrs(port); kfree(port); return err; } @@ -188,6 +310,7 @@ static void ipvlan_port_destroy(struct net_device *dev) kfree_skb(skb); } ida_destroy(&port->ida); + ipvlan_port_free_port_addrs(port); kfree(port); } @@ -986,6 +1109,50 @@ void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) kfree_rcu(addr, rcu); } +static void ipvlan_port_del_addr_ipvlans(struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + struct ipvl_addr *addr = NULL; + struct ipvl_dev *ipvlan; + + list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { + spin_lock_bh(&port->addrs_lock); + addr = ipvlan_find_addr(ipvlan, iaddr, is_v6); + if (addr) { + ipvlan_ht_addr_del(addr); + list_del_rcu(&addr->anode); + 
spin_unlock_bh(&port->addrs_lock); + break; + } + spin_unlock_bh(&port->addrs_lock); + } + + if (addr) + kfree_rcu(addr, rcu); +} + +static int ipvlan_port_add_addr_event(struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + int r; + + r = ipvlan_port_add_addr(port, iaddr, is_v6, true); + if (r < 0) + return r; + + ipvlan_port_del_addr_ipvlans(port, iaddr, is_v6); + + return NOTIFY_OK; +} + +static int ipvlan_port_del_addr_event(struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + ipvlan_port_del_addr(port, iaddr, is_v6); + + return NOTIFY_OK; +} + #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { @@ -1014,6 +1181,24 @@ static int ipvlan_addr6_event(struct notifier_block *unused, struct net_device *dev = (struct net_device *)if6->idev->dev; struct ipvl_dev *ipvlan = netdev_priv(dev); + if (netif_is_ipvlan_port(dev)) { + struct ipvl_port *port = ipvlan_port_get_rcu(dev); + + if (!ipvlan_is_macnat(port)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + return ipvlan_port_add_addr_event(port, &if6->addr, + true); + case NETDEV_DOWN: + return ipvlan_port_del_addr_event(port, &if6->addr, + true); + default: + return NOTIFY_OK; + } + } + if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; @@ -1086,6 +1271,26 @@ static int ipvlan_addr4_event(struct notifier_block *unused, struct ipvl_dev *ipvlan = netdev_priv(dev); struct in_addr ip4_addr; + if (netif_is_ipvlan_port(dev)) { + struct ipvl_port *port = ipvlan_port_get_rcu(dev); + + if (!ipvlan_is_macnat(port)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + return ipvlan_port_add_addr_event(port, + &if4->ifa_address, + false); + case NETDEV_DOWN: + return ipvlan_port_del_addr_event(port, + &if4->ifa_address, + false); + default: + return NOTIFY_OK; + } + } + if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; -- 2.25.1 Fix functions that accept "void *iaddr" as param to have const-specifier. 
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan.h | 6 +++--- drivers/net/ipvlan/ipvlan_core.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index faba1308c135..be2bc2d33ddb 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -194,11 +194,11 @@ void ipvlan_multicast_enqueue(struct ipvl_port *port, int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); int ipvlan_add_addr(struct ipvl_dev *ipvlan, - void *iaddr, bool is_v6, const u8 *hwaddr); -void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); + const void *iaddr, bool is_v6, const u8 *hwaddr); +void ipvlan_del_addr(struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); -bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); +bool ipvlan_addr_busy(struct ipvl_port *port, const void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index f08edaaae61d..425d2a12a4ee 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -121,7 +121,7 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, return ret; } -bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) +bool ipvlan_addr_busy(struct ipvl_port *port, const void *iaddr, bool is_v6) { struct ipvl_dev *ipvlan; bool ret = false; diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c8cf3b85fce1..e99285bca1cd 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ 
-1058,7 +1058,7 @@ static int ipvlan_device_event(struct notifier_block *unused, } /* the caller must held the addrs lock */ -int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, +int ipvlan_add_addr(struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6, const u8 *hwaddr) { struct ipvl_addr *addr; @@ -1092,7 +1092,7 @@ int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, return 0; } -void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +void ipvlan_del_addr(struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6) { struct ipvl_addr *addr; -- 2.25.1 Extract the common code of ipvlan_addr4_validator_event()/ ipvlan_addr6_validator_event() into its own function. Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 67 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index e99285bca1cd..e50dd9022557 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -1153,6 +1153,33 @@ static int ipvlan_port_del_addr_event(struct ipvl_port *port, return NOTIFY_OK; } +static int ipvlan_addr_validator_event(struct net_device *dev, + unsigned long event, + struct netlink_ext_ack *extack, + const void *iaddr, + bool is_v6) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (!ipvlan_is_valid_dev(dev)) + return NOTIFY_DONE; + + if (ipvlan_is_macnat(ipvlan->port)) + return notifier_from_errno(-EADDRNOTAVAIL); + + switch (event) { + case NETDEV_UP: + if (ipvlan_addr_busy(ipvlan->port, iaddr, is_v6)) { + NL_SET_ERR_MSG(extack, + "Address already assigned to an ipvlan device"); + return notifier_from_errno(-EADDRINUSE); + } + break; + } + + return NOTIFY_OK; +} + #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { @@ -1221,25 +1248,9 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, { struct
in6_validator_info *i6vi = (struct in6_validator_info *)ptr; struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev; - struct ipvl_dev *ipvlan = netdev_priv(dev); - - if (!ipvlan_is_valid_dev(dev)) - return NOTIFY_DONE; - - if (ipvlan_is_macnat(ipvlan->port)) - return notifier_from_errno(-EADDRNOTAVAIL); - switch (event) { - case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { - NL_SET_ERR_MSG(i6vi->extack, - "Address already assigned to an ipvlan device"); - return notifier_from_errno(-EADDRINUSE); - } - break; - } - - return NOTIFY_OK; + return ipvlan_addr_validator_event(dev, event, i6vi->extack, + &i6vi->i6vi_addr, true); } #endif @@ -1315,25 +1326,9 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, { struct in_validator_info *ivi = (struct in_validator_info *)ptr; struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev; - struct ipvl_dev *ipvlan = netdev_priv(dev); - - if (!ipvlan_is_valid_dev(dev)) - return NOTIFY_DONE; - - if (ipvlan_is_macnat(ipvlan->port)) - return notifier_from_errno(-EADDRNOTAVAIL); - switch (event) { - case NETDEV_UP: - if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { - NL_SET_ERR_MSG(ivi->extack, - "Address already assigned to an ipvlan device"); - return notifier_from_errno(-EADDRINUSE); - } - break; - } - - return NOTIFY_OK; + return ipvlan_addr_validator_event(dev, event, ivi->extack, + &ivi->ivi_addr, false); } static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { -- 2.25.1 Both IPv4 and IPv6 addr-event functions are very similar. Refactor to use common functions.
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 117 ++++++++++--------------------- 1 file changed, 37 insertions(+), 80 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index e50dd9022557..33da1d45eb24 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -1180,33 +1180,39 @@ static int ipvlan_addr_validator_event(struct net_device *dev, return NOTIFY_OK; } -#if IS_ENABLED(CONFIG_IPV6) -static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +static int ipvlan_add_addr_event(struct ipvl_dev *ipvlan, const void *iaddr, + bool is_v6) { int ret = -EINVAL; spin_lock_bh(&ipvlan->port->addrs_lock); - if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) - netif_err(ipvlan, ifup, ipvlan->dev, - "Failed to add IPv6=%pI6c addr for %s intf\n", - ip6_addr, ipvlan->dev->name); - else - ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL); + if (ipvlan_addr_busy(ipvlan->port, iaddr, is_v6)) { + if (is_v6) { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv6=%pI6c on %s intf.\n", + iaddr, ipvlan->dev->name); + } else { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv4=%pI4 on %s intf.\n", + iaddr, ipvlan->dev->name); + } + } else { + ret = ipvlan_add_addr(ipvlan, iaddr, is_v6, NULL); + } spin_unlock_bh(&ipvlan->port->addrs_lock); return ret; } -static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +static void ipvlan_del_addr_event(struct ipvl_dev *ipvlan, const void *iaddr, + bool is_v6) { - return ipvlan_del_addr(ipvlan, ip6_addr, true); + return ipvlan_del_addr(ipvlan, iaddr, is_v6); } -static int ipvlan_addr6_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int ipvlan_addr_event(struct net_device *dev, unsigned long event, + const void *iaddr, bool is_v6) { - struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; - struct net_device *dev = (struct net_device *)if6->idev->dev; 
- struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_dev *ipvlan; if (netif_is_ipvlan_port(dev)) { struct ipvl_port *port = ipvlan_port_get_rcu(dev); @@ -1216,11 +1222,9 @@ static int ipvlan_addr6_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - return ipvlan_port_add_addr_event(port, &if6->addr, - true); + return ipvlan_port_add_addr_event(port, iaddr, is_v6); case NETDEV_DOWN: - return ipvlan_port_del_addr_event(port, &if6->addr, - true); + return ipvlan_port_del_addr_event(port, iaddr, is_v6); default: return NOTIFY_OK; } @@ -1229,20 +1233,31 @@ static int ipvlan_addr6_event(struct notifier_block *unused, if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; + ipvlan = netdev_priv(dev); switch (event) { case NETDEV_UP: - if (ipvlan_add_addr6(ipvlan, &if6->addr)) + if (ipvlan_add_addr_event(ipvlan, iaddr, is_v6)) return NOTIFY_BAD; break; case NETDEV_DOWN: - ipvlan_del_addr6(ipvlan, &if6->addr); + ipvlan_del_addr_event(ipvlan, iaddr, is_v6); break; } return NOTIFY_OK; } +#if IS_ENABLED(CONFIG_IPV6) +static int ipvlan_addr6_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; + struct net_device *dev = (struct net_device *)if6->idev->dev; + + return ipvlan_addr_event(dev, event, &if6->addr, true); +} + static int ipvlan_addr6_validator_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -1254,71 +1269,13 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, } #endif -static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) -{ - int ret = -EINVAL; - - spin_lock_bh(&ipvlan->port->addrs_lock); - if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) - netif_err(ipvlan, ifup, ipvlan->dev, - "Failed to add IPv4=%pI4 on %s intf.\n", - ip4_addr, ipvlan->dev->name); - else - ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL); - spin_unlock_bh(&ipvlan->port->addrs_lock); - return ret; -} - -static void 
ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) -{ - return ipvlan_del_addr(ipvlan, ip4_addr, false); -} - static int ipvlan_addr4_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; - struct ipvl_dev *ipvlan = netdev_priv(dev); - struct in_addr ip4_addr; - - if (netif_is_ipvlan_port(dev)) { - struct ipvl_port *port = ipvlan_port_get_rcu(dev); - - if (!ipvlan_is_macnat(port)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_UP: - return ipvlan_port_add_addr_event(port, - &if4->ifa_address, - false); - case NETDEV_DOWN: - return ipvlan_port_del_addr_event(port, - &if4->ifa_address, - false); - default: - return NOTIFY_OK; - } - } - - if (!ipvlan_is_valid_dev(dev)) - return NOTIFY_DONE; - switch (event) { - case NETDEV_UP: - ip4_addr.s_addr = if4->ifa_address; - if (ipvlan_add_addr4(ipvlan, &ip4_addr)) - return NOTIFY_BAD; - break; - - case NETDEV_DOWN: - ip4_addr.s_addr = if4->ifa_address; - ipvlan_del_addr4(ipvlan, &ip4_addr); - break; - } - - return NOTIFY_OK; + return ipvlan_addr_event(dev, event, &if4->ifa_address, false); } static int ipvlan_addr4_validator_event(struct notifier_block *unused, -- 2.25.1 Packets with pkt_type == PACKET_LOOPBACK are captured by handle_frame() function, but they don't have L2 header. We should not process them in handle_mode_l2(). This doesn't affect old L2 functionality, since handling was anyway incorrect. Handle them the same way as in br_handle_frame(): just pass the skb. To observe invalid behaviour, just start "ping -b" on bcast address of port-interface. 
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 425d2a12a4ee..d6975d5e2198 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -1160,6 +1160,9 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct sk_buff *skb = *pskb; bool need_eth_fix; + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + /* Ignore already seen packets. */ if (ipvlan_is_skb_marked(skb, port->dev)) return RX_HANDLER_PASS; -- 2.25.1 Implemented a self-test for ipvlan in l2macnat mode. The test verifies: 1) It's not possible to configure an IP address in l2macnat mode on ipvtap 2) It creates several net namespaces - Default namespace emulates host, - ipvlan-tst-phy emulates some host in remote network - ipvlan-tst-0/1 emulate VMs on host. The test verifies that MAC addresses are as expected in ARP/NEIGH tables: all MACs in 'tst-phy' point to the "host" MAC address; all MACs in Default and tst are the real ones. 3) The l2macnat mode remembers a limited number of addresses per port. The test verifies that this limit really works.
Signed-off-by: Dmitry Skorodumov --- tools/testing/selftests/net/Makefile | 3 + .../selftests/net/ipvtap_macnat_bridge.py | 174 +++++++++ .../selftests/net/ipvtap_macnat_test.sh | 332 ++++++++++++++++++ 3 files changed, 509 insertions(+) create mode 100755 tools/testing/selftests/net/ipvtap_macnat_bridge.py create mode 100755 tools/testing/selftests/net/ipvtap_macnat_test.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b5127e968108..ff28012d34db 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -203,6 +203,9 @@ YNL_GEN_PROGS := netlink-dumps TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_PROGS += $(YNL_GEN_PROGS) +TEST_PROGS += ipvtap_macnat_test.sh +TEST_FILES += ipvtap_macnat_bridge.py + TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) TEST_INCLUDES := forwarding/lib.sh diff --git a/tools/testing/selftests/net/ipvtap_macnat_bridge.py b/tools/testing/selftests/net/ipvtap_macnat_bridge.py new file mode 100755 index 000000000000..6fc4762b03cd --- /dev/null +++ b/tools/testing/selftests/net/ipvtap_macnat_bridge.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Script to bridge ipvtap and tap, +needed to simulate behaviour of virtual machine using ipvtap. + +ipvtap in macnat mode cannot have IP address. +Due to limitations of ipvtap, it also cannot be plugged +into bridge. +Use this script to connect ipvtap and tap and assign IP to tap.
+""" + +import socket +import os +import select +import sys +import signal +import fcntl +import struct +import subprocess + +# Linux TUN/TAP constants +TUNSETIFF = 0x400454ca +IFF_TUN = 0x0001 +IFF_TAP = 0x0002 +IFF_NO_PI = 0x1000 + +ns_name = "non-initialized" + +class TapBridge: + def __init__(self, tap, ipvtap, buffer_size=65536): + self.tap_name = tap + self.ipvtap_name = ipvtap + self.buffer_size = buffer_size + self.running = False + + def open_tap_file(self, path): + """Open TAP interface as a file""" + try: + return os.open(path, os.O_RDWR) + except Exception as e: + print(f"Error opening {path}: {e}") + return None + + def open_ipvtap_sock(self, tap_name): + """Open a TAP interface using raw socket""" + try: + sock = socket.socket(socket.AF_PACKET, + socket.SOCK_RAW, + socket.ntohs(0x0003)) + sock.bind((tap_name, 0)) + sock.setblocking(False) + print(f"Connected to IPVTAP interface: {tap_name}") + return sock + + except Exception as e: + print(f"Error opening IPVTAP interface {tap_name}: {e}") + return None + + def create_tap_interface(self, tap_name): + """Create and configure a TAP interface using /dev/net/tun""" + try: + # Open the tun device + tun_fd = os.open('/dev/net/tun', os.O_RDWR) + if tun_fd < 0: + raise Exception("Failed to open /dev/net/tun") + + # Prepare the ifr structure + tap_name_bytes = tap_name.encode('utf-8') + ifr = struct.pack('16sH', tap_name_bytes, IFF_TAP | IFF_NO_PI) + + # Set the interface name and flags + result = fcntl.ioctl(tun_fd, TUNSETIFF, ifr) + + # Get the actual interface name that was set + unpacked = struct.unpack('16sH', result) + actual_name = unpacked[0].split(b'\x00')[0].decode() + print(f"Created TAP interface: {actual_name}") + + return tun_fd + + except Exception as e: + print(f"Error creating TAP interface {tap_name}: {e}") + return None + + def forward_data(self, from_fd, to_fd, description): + """Forward data from one file descriptor to another""" + try: + data = os.read(from_fd, self.buffer_size) + if 
data: + os.write(to_fd, data) + return True + return False + + except BlockingIOError: + return True + except Exception as e: + print(f"Error forwarding data {description}: {e}") + return False + + def run(self): + """Main bridge loop""" + # Create TAP interfaces + tap1_fd = self.create_tap_interface(self.tap_name) + + sock = self.open_ipvtap_sock(self.ipvtap_name) + tap2_fd = sock.fileno() + + if tap1_fd is None or tap2_fd is None: + print("Failed to create TAP interfaces") + return + + print("Press Ctrl+C to stop\n") + + self.running = True + stats = {'tap1_to_tap2': 0, 'tap2_to_tap1': 0} + while self.running: + try: + # Use select to monitor both file descriptors + readable, _, _ = select.select([tap1_fd, tap2_fd], [], [], 1.0) + + for fd in readable: + if fd == tap1_fd: + descr = f"from {self.tap_name} to {self.ipvtap_name}" + if self.forward_data(tap1_fd, tap2_fd, descr): + stats['tap1_to_tap2'] += 1 + else: + self.running = False + elif fd == tap2_fd: + descr = f"from {self.ipvtap_name} to {self.tap_name}" + if self.forward_data(tap2_fd, tap1_fd, descr): + stats['tap2_to_tap1'] += 1 + else: + self.running = False + + except KeyboardInterrupt: + print("\nShutting down...") + self.running = False + except Exception as e: + print(f"Error in main loop: {e}") + self.running = False + + # Cleanup + os.close(tap1_fd) + os.close(tap2_fd) + print(f"Bridge stopped in {ns_name}. 
Stats: {stats}") + + +def signal_handler(_sig, _frame): + print(f'\nReceived interrupt signal, shutting down bridge in {ns_name}') + sys.exit(0) + + +if __name__ == "__main__": + ns_name = subprocess.getoutput("ip netns identify") or "default" + + signal.signal(signal.SIGINT, signal_handler) + + # Check if running as root + if os.geteuid() != 0: + print("ERROR: This script must be run as root!") + sys.exit(1) + + if len(sys.argv) != 3: + print("Usage: tap_bridge.py tap_name ipvtap_name") + sys.exit(1) + + TAP = sys.argv[1] + IPVTAP = sys.argv[2] + + print(f"Starting TAP bridge between {TAP} and {IPVTAP} in {ns_name}") + bridge = TapBridge(TAP, IPVTAP) + bridge.run() diff --git a/tools/testing/selftests/net/ipvtap_macnat_test.sh b/tools/testing/selftests/net/ipvtap_macnat_test.sh new file mode 100755 index 000000000000..5f684a6d7603 --- /dev/null +++ b/tools/testing/selftests/net/ipvtap_macnat_test.sh @@ -0,0 +1,332 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Tests for ipvtap in macnat mode + +NS_TST0=ipvlan-tst-0 +NS_TST1=ipvlan-tst-1 +NS_PHY=ipvlan-tst-phy + +IP_HOST=172.25.0.1 +IP_PHY=172.25.0.2 +IP_TST0=172.25.0.10 +IP_TST1=172.25.0.30 + +IP_OK0=("172.25.0.10" "172.25.0.11" "172.25.0.12" "172.25.0.13") +IP6_OK0=("fc00::10" "fc00::11" "fc00::12" "fc00::13" ) + +IP_OVFL0="172.25.0.14" +IP6_OVFL0="fc00::14" + +IP6_HOST=fc00::1 +IP6_PHY=fc00::2 +IP6_TST0=fc00::10 +IP6_TST1=fc00::30 + +MAC_HOST="92:3a:00:00:00:01" +MAC_PHY="92:3a:00:00:00:02" +MAC_TST0="92:3a:00:00:00:10" +MAC_TST1="92:3a:00:00:00:30" + +VETH_HOST=vethtst +VETH_PHY=vethtst.p + +# +# The testing environment looks this way: +# +# |------HOST------| |------PHY-------| +# | veth<----------------->veth | +# |------|--|------| |----------------| +# | | +# | | |-----TST0-------| +# | |------------|----ipvtap | +# | |----------------| +# | +# | |-----TST1-------| +# |---------------|----ivtap | +# |----------------| +# +# The macnat mode is for virtual machines, so ipvtap-interface is supposed 
+# to be used only for traffic monitoring and doesn't have ip-address. +# +# To simulate a virtual machine on ipvtap, we create TAP-interfaces +# in TST environments and assing IP-addresses to them. +# TAP and IPVTAP are connected with simple python script. +# + +ns_run() { + ns=$1 + shift + if [[ "$ns" == "default" ]]; then + "$@" >/dev/null + else + ip netns exec "$ns" "$@" >/dev/null + fi +} + +configure_ns() { + local ns=$1 + local n=$2 + local ip=$3 + local ip6=$4 + local mac=$5 + + ns_run $ns ip link set lo up + + if ! ip link add netns $ns name ipvtap0.$n link $VETH_HOST \ + type ipvtap mode l2macnat bridge; then + exit_error "FAIL: Failed to configure ipvtap link." + fi + ns_run $ns ip link set ipvtap0.$n up + + ns_run $ns ip tuntap add mode tap tap0.$n + ns_run $ns ip link set dev tap0.$n address $mac + # disable dad + ns_run $ns sysctl -w net/ipv6/conf/tap0.$n/accept_dad=0 + ns_run $ns ip link set tap0.$n up + ns_run $ns ip a a $ip/24 dev tap0.$n + ns_run $ns ip a a $ip6/64 dev tap0.$n +} + +start_macnat_bridge() { + local ns=$1 + local n=$2 + ip netns exec $ns python3 ipvtap_macnat_bridge.py tap0.$n ipvtap0.$n & +} + +configure_veth() { + local ns=$1 + local veth=$2 + local ip=$3 + local ip6=$4 + local mac=$5 + + ns_run $ns ip link set lo up + ns_run $ns ethtool -K $veth tx off rx off + ns_run $ns ip link set dev $veth address $mac + ns_run $ns ip link set $veth up + ns_run $ns ip a a $ip/24 dev $veth + ns_run $ns ip a a $ip6/64 dev $veth +} + +setup_env() { + ip netns add $NS_TST0 + ip netns add $NS_TST1 + ip netns add $NS_PHY + + # setup simulated other-host (phy) and host itself + ip link add $VETH_HOST type veth peer name $VETH_PHY \ + netns $NS_PHY >/dev/null + + # host config + configure_veth default $VETH_HOST $IP_HOST $IP6_HOST $MAC_HOST + configure_veth $NS_PHY $VETH_PHY $IP_PHY $IP6_PHY $MAC_PHY + + # TST namespaces config + configure_ns $NS_TST0 0 $IP_TST0 $IP6_TST0 $MAC_TST0 + configure_ns $NS_TST1 1 $IP_TST1 $IP6_TST1 $MAC_TST1 +} + 
+ping_all() { + # This will learn MAC/IP addresses on ipvtap + local ns=$1 + + ns_run $ns ping -c 1 $IP_TST0 + ns_run $ns ping -c 1 $IP6_TST0 + + ns_run $ns ping -c 1 $IP_TST1 + ns_run $ns ping -c 1 $IP6_TST1 + + ns_run $ns ping -c 1 $IP_HOST + ns_run $ns ping -c 1 $IP6_HOST + + ns_run $ns ping -c 1 $IP_PHY + ns_run $ns ping -c 1 $IP6_PHY +} + +check_mac_eq() { + # Ensure IP corresponds to MAC. + local ns=$1 + local ip=$2 + local mac=$3 + local dev=$4 + + if [[ "$ns" == "default" ]]; then + out=$( + ip neigh show $ip dev $dev \ + | grep "$ip" \ + | grep "$mac" + ) + else + out=$( + ip netns exec $ns \ + ip neigh show $ip dev $dev \ + | grep "$ip" \ + | grep "$mac" + ) + fi + + if [[ 'X'$out'X' == "XX" ]]; then + exit_error "FAIL: '$ip' is not '$mac'" + fi +} + +cleanup_env() { + ip link del $VETH_HOST + ip netns del $NS_TST0 + ip netns del $NS_TST1 + ip netns del $NS_PHY +} + +exit_error() { + echo $1 + exit 1 +} + +test_check_mac() { + # All IPs in NS_PHY should have MAC of the host + check_mac_eq $NS_PHY $IP_TST0 $MAC_HOST $VETH_PHY + check_mac_eq $NS_PHY $IP6_TST0 $MAC_HOST $VETH_PHY + check_mac_eq $NS_PHY $IP_TST1 $MAC_HOST $VETH_PHY + check_mac_eq $NS_PHY $IP6_TST1 $MAC_HOST $VETH_PHY + check_mac_eq $NS_PHY $IP_HOST $MAC_HOST $VETH_PHY + check_mac_eq $NS_PHY $IP6_HOST $MAC_HOST $VETH_PHY + + # All IPs in TST0 should have corresponding MAC + check_mac_eq $NS_TST0 $IP_HOST $MAC_HOST tap0.0 + check_mac_eq $NS_TST0 $IP6_HOST $MAC_HOST tap0.0 + check_mac_eq $NS_TST0 $IP_TST1 $MAC_TST1 tap0.0 + check_mac_eq $NS_TST0 $IP6_TST1 $MAC_TST1 tap0.0 + check_mac_eq $NS_TST0 $IP_PHY $MAC_PHY tap0.0 + check_mac_eq $NS_TST0 $IP6_PHY $MAC_PHY tap0.0 + + # All IPs in host should have corresponding MAC + check_mac_eq default $IP_TST0 $MAC_TST0 $VETH_HOST + check_mac_eq default $IP6_TST0 $MAC_TST0 $VETH_HOST + check_mac_eq default $IP_TST1 $MAC_TST1 $VETH_HOST + check_mac_eq default $IP6_TST1 $MAC_TST1 $VETH_HOST + check_mac_eq default $IP_PHY $MAC_PHY $VETH_HOST + check_mac_eq 
default $IP6_PHY $MAC_PHY $VETH_HOST +} + +test_ip_add() { + # adding IPs to ipvtap should be forbidden and should fail + if ns_run $NS_TST0 ip a a 172.26.0.1/24 dev ipvtap0.0; then + exit_error "FAIL: Module allowed to add ip to ipvtap." + fi + + if ns_run $NS_TST0 ip a a fc01::1/64 dev ipvtap0.0; then + exit_error "FAIL: Module allowed to add ip6 to ipvtap." + fi +} + +test_ip_overflow() { + # The ipvtap remembers limited number of addresses on interface. + # Let's overflow it and check that oldest one doesn't work. + + ns_run $NS_TST0 ip addr flush dev tap0.0 + + # Add exactly 4 ip addresses + for ip in "${IP_OK0[@]}"; do + ns_run $NS_TST0 ip a a $ip/24 dev tap0.0 + ns_run $NS_TST0 ping -c 1 $IP_HOST -I $ip + done + + # Initial check that ping works + if ! ping -c 2 $IP_TST0; then + exit_error "FAIL: Failed to ping tst0" + fi + + # Add 1 more ip addresses + ns_run $NS_TST0 ip a a $IP_OVFL0/24 dev tap0.0 + ns_run $NS_TST0 ping -c 1 $IP_HOST -I $IP_OVFL0 + # check that ping to oldest one from host fails. + echo "the next ping should fail:" + if ping -c 2 $IP_TST0; then + exit_error "FAIL: IP-0 still exists on interface" + fi + + # ping host using address-0 and force relearn of IP0. + # Host should be able ping after that + ns_run $NS_TST0 ping -c 1 $IP_HOST -I $IP_TST0 + + if ! ping -c 2 $IP_TST0; then + exit_error "FAIL: Failed to ping tst0 at stage 3" + fi +} + +test_ip6_overflow() { + # The ipvtap stores limited number of addresses on interface. + # Let's overflow it and check that oldest one doesn't work. + + ns_run $NS_TST0 ip addr flush dev tap0.0 + + # Add exactly 4 ip addresses + for ip6 in "${IP6_OK0[@]}"; do + ns_run $NS_TST0 ip a a $ip6/64 dev tap0.0 + ns_run $NS_TST0 ping -c 1 $IP6_HOST -I $ip6 + done + + # Initial check that ping6 works + if ! 
ping -c 2 $IP6_TST0; then + exit_error "FAIL: Failed to ping6 tst0" + fi + + # Add 1 more ip6 addresses + ns_run $NS_TST0 ip a a $IP6_OVFL0/64 dev tap0.0 + ns_run $NS_TST0 ping -c 1 $IP6_HOST -I $IP6_OVFL0 + # check that ping to oldest one from host fails. + echo "the next ping should fail:" + if ping -c 2 $IP6_TST0; then + exit_error "FAIL: IP6-0 still exists on interface" + fi + + # ping host using address-0 and force relearn of IP0. + # Host should be able ping after that + ns_run $NS_TST0 ping -c 1 $IP6_HOST -I $IP6_TST0 + if ! ping -c 2 $IP6_TST0; then + exit_error "FAIL: Failed to ping6 tst0 at stage 3" + fi +} + +exec_test() { + echo "TEST: "$2 + $1 + echo "PASSED: "$2 +} + +trap cleanup_env EXIT + +echo "ipvlan macnat tests" +echo "===================" + +modprobe -q tap +modprobe -q ipvlan +modprobe -q ipvtap + +setup_env + +exec_test test_ip_add "ip add not allowed" + +start_macnat_bridge $NS_TST0 0 +mb_pid1=$! +start_macnat_bridge $NS_TST1 1 +mb_pid2=$! + +echo "<<< Preparation: pinging all...." +ping_all default +ping_all $NS_TST0 +ping_all $NS_TST1 +ping_all $NS_PHY +echo "Finished preparational pinging all. >>>" + +exec_test test_check_mac "mac correctness" +exec_test test_ip_overflow "ip learn capacity overflow" +exec_test test_ip6_overflow "ip6 learn capacity overflow" + +kill -INT $mb_pid1 +kill -INT $mb_pid2 +wait $mb_pid1 +wait $mb_pid2 + +echo "All tests passed" -- 2.25.1