It is now possible to create a link in L2E mode: a learnable bridge. IP addresses are learned from the TX packets of the child interfaces. In addition, a dev_add_pack() protocol handler is attached to the main port to support communication from the main interface to the child interfaces. This mode is intended for desktop virtual machines that bridge to wireless interfaces. The mode must be specified when the first child interface is created; it cannot be changed afterwards. Signed-off-by: Dmitry Skorodumov --- Documentation/networking/ipvlan.rst | 11 ++ drivers/net/ipvlan/ipvlan.h | 21 ++++ drivers/net/ipvlan/ipvlan_core.c | 163 +++++++++++++++++++++++++--- drivers/net/ipvlan/ipvlan_main.c | 140 +++++++++++++++++++++--- include/uapi/linux/if_link.h | 1 + 5 files changed, 301 insertions(+), 35 deletions(-) diff --git a/Documentation/networking/ipvlan.rst b/Documentation/networking/ipvlan.rst index 895d0ccfd596..9539e8ac99f4 100644 --- a/Documentation/networking/ipvlan.rst +++ b/Documentation/networking/ipvlan.rst @@ -90,6 +90,17 @@ works in this mode and hence it is L3-symmetric (L3s). This will have slightly l performance but that shouldn't matter since you are choosing this mode over plain-L3 mode to make conn-tracking work. +4.4 L2E mode: +------------- + +This mode is an extension of the L2 mode. It is primarily intended for +desktop virtual machines that bridge to wireless interfaces. In plain L2 +mode you have to configure IP addresses on the slave interfaces to make it +possible to mux frames between the slaves and the master. In L2E mode, +ipvlan learns the IPv4/IPv6 addresses itself from outgoing packets. +Moreover, dev_add_pack() is configured on the master interface to capture +outgoing frames and mux them to the slave interfaces when needed. + 5. Mode flags: ============== diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 50de3ee204db..020e80df1e38 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -91,6 +91,7 @@ struct ipvl_port { possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; + struct packet_type ipvl_ptype; u16 mode; u16 flags; u16 dev_id_start; @@ -103,6 +104,7 @@ struct ipvl_port { struct ipvl_skb_cb { bool tx_pkt; + void *mark; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) @@ -151,12 +153,31 @@ static inline void ipvlan_clear_vepa(struct ipvl_port *port) port->flags &= ~IPVLAN_F_VEPA; } +static inline bool ipvlan_is_learnable(struct ipvl_port *port) +{ + return port->mode == IPVLAN_MODE_L2E; +} + +static inline void ipvlan_mark_skb(struct sk_buff *skb, struct net_device *dev) +{ + IPVL_SKB_CB(skb)->mark = dev; +} + +static inline bool ipvlan_is_skb_marked(struct sk_buff *skb, struct net_device *dev) +{ + return (IPVL_SKB_CB(skb)->mark == dev); +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); +void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev); void ipvlan_process_multicast(struct work_struct *work); +void ipvlan_multicast_enqueue(struct ipvl_port *port, + struct sk_buff *skb, bool tx_pkt); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); +int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); diff --git
a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index d7e3ddbcab6f..ffe8efd2f1aa 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -284,6 +284,18 @@ void ipvlan_process_multicast(struct work_struct *work) rcu_read_unlock(); if (tx_pkt) { + if (ipvlan_is_learnable(port)) { + /* Inject packet to main dev */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + local_bh_disable(); + nskb->pkt_type = pkt_type; + nskb->dev = port->dev; + dev_forward_skb(port->dev, nskb); + local_bh_enable(); + } + } + /* If the packet originated here, send it out. */ skb->dev = port->dev; skb->pkt_type = pkt_type; @@ -299,7 +311,7 @@ void ipvlan_process_multicast(struct work_struct *work) } } -static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) +void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; @@ -414,6 +426,77 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, return addr; } +static inline bool is_ipv4_usable(__be32 addr) +{ + return !ipv4_is_lbcast(addr) && !ipv4_is_multicast(addr) && + !ipv4_is_zeronet(addr); +} + +static inline bool is_ipv6_usable(const struct in6_addr *addr) +{ + return !ipv6_addr_is_multicast(addr) && !ipv6_addr_loopback(addr) && + !ipv6_addr_any(addr); +} + +static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, + int addr_type) +{ + void *addr = NULL; + bool is_v6; + + switch (addr_type) { +#if IS_ENABLED(CONFIG_IPV6) + /* No need to handle IPVL_ICMPV6, since it never has valid src-address */ + case IPVL_IPV6: { + struct ipv6hdr *ip6h; + + ip6h = (struct ipv6hdr *)lyr3h; + if (!is_ipv6_usable(&ip6h->saddr)) + return; + is_v6 = true; + addr = &ip6h->saddr; + break; + } +#endif + case IPVL_IPV4: { + struct iphdr *ip4h; + __be32 *i4addr; + + ip4h = (struct iphdr *)lyr3h; + i4addr = &ip4h->saddr; + if (!is_ipv4_usable(*i4addr)) + return; + is_v6 = false; + addr = i4addr; + break; + } + case IPVL_ARP: { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 *i4addr; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + arp_ptr += ipvlan->port->dev->addr_len; + i4addr = (__be32 *)arp_ptr; + if (!is_ipv4_usable(*i4addr)) + return; + is_v6 = false; + addr = i4addr; + break; + } + default: + return; + } + + if (!ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6)) { + spin_lock_bh(&ipvlan->addrs_lock); + if (!ipvlan_addr_busy(ipvlan->port, addr, is_v6)) + ipvlan_add_addr(ipvlan, addr, is_v6); + spin_unlock_bh(&ipvlan->addrs_lock); + } +} + static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -561,8 +644,8 @@ static int ipvlan_process_outbound(struct sk_buff *skb) return ret; } -static void ipvlan_multicast_enqueue(struct ipvl_port *port, - struct sk_buff *skb, bool tx_pkt) +void ipvlan_multicast_enqueue(struct ipvl_port *port, + struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); @@ -618,15 +701,56 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { - const struct ipvl_dev *ipvlan = netdev_priv(dev); - struct ethhdr *eth = skb_eth_hdr(skb); - struct ipvl_addr *addr; void *lyr3h; + struct ipvl_addr *addr; int addr_type; + bool same_mac_addr; + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = skb_eth_hdr(skb); + + if (ipvlan_is_learnable(ipvlan->port) && + 
ether_addr_equal(eth->h_source, dev->dev_addr)) { + /* ignore tx-packets from host */ + goto out_drop; + } + + same_mac_addr = ether_addr_equal(eth->h_dest, eth->h_source); + + lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); - if (!ipvlan_is_vepa(ipvlan->port) && - ether_addr_equal(eth->h_dest, eth->h_source)) { - lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); + if (ipvlan_is_learnable(ipvlan->port)) { + if (lyr3h) + ipvlan_addr_learn(ipvlan, lyr3h, addr_type); + /* Mark SKB in advance */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + ipvlan_mark_skb(skb, ipvlan->phy_dev); + } + + if (is_multicast_ether_addr(eth->h_dest)) { + skb_reset_mac_header(skb); + ipvlan_skb_crossing_ns(skb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, skb, true); + return NET_XMIT_SUCCESS; + } + + if (ipvlan_is_vepa(ipvlan->port)) + goto tx_phy_dev; + + if (!same_mac_addr && + ether_addr_equal(eth->h_dest, ipvlan->phy_dev->dev_addr)) { + /* It is a packet from child with destination to main port. + * Pass it to main. + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + skb->pkt_type = PACKET_HOST; + skb->dev = ipvlan->phy_dev; + dev_forward_skb(ipvlan->phy_dev, skb); + return NET_XMIT_SUCCESS; + } else if (same_mac_addr) { if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { @@ -649,16 +773,14 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) */ dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; - - } else if (is_multicast_ether_addr(eth->h_dest)) { - skb_reset_mac_header(skb); - ipvlan_skb_crossing_ns(skb, NULL); - ipvlan_multicast_enqueue(ipvlan->port, skb, true); - return NET_XMIT_SUCCESS; } +tx_phy_dev: skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); +out_drop: + consume_skb(skb); + return NET_XMIT_DROP; } int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) @@ -674,6 +796,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) switch(port->mode) { case IPVLAN_MODE_L2: + case IPVLAN_MODE_L2E: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S @@ -737,17 +860,22 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; + /* Ignore already seen packets. */ + if (ipvlan_is_skb_marked(skb, port->dev)) + return RX_HANDLER_PASS; + if (is_multicast_ether_addr(eth->h_dest)) { if (ipvlan_external_frame(skb, port)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. 
*/ + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + ipvlan_mark_skb(skb, port->dev); ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } @@ -770,6 +898,7 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) switch (port->mode) { case IPVLAN_MODE_L2: + case IPVLAN_MODE_L2E: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 660f3db11766..df5275bc30fc 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -7,6 +7,11 @@ #include "ipvlan.h" +static void ipvlan_set_learnable(struct ipvl_port *port) +{ + dev_add_pack(&port->ipvl_ptype); +} + static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { @@ -16,6 +21,15 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ASSERT_RTNL(); if (port->mode != nval) { + /* Don't allow switch off the learnable bridge mode. + * Flags also must be set from the first port-link setup. + */ + if (port->mode == IPVLAN_MODE_L2E || + (nval == IPVLAN_MODE_L2E && port->count > 1)) { + netdev_err(port->dev, "L2E mode cannot be changed.\n"); + return -EINVAL; + } + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { flags = ipvlan->dev->flags; if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S) { @@ -40,7 +54,10 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, ipvlan_l3s_unregister(port); } port->mode = nval; + if (port->mode == IPVLAN_MODE_L2E) + ipvlan_set_learnable(port); } + return 0; fail: @@ -59,6 +76,64 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, return err; } +static int ipvlan_port_receive(struct sk_buff *skb, struct net_device *wdev, + struct packet_type *pt, struct net_device *orig_wdev) +{ + struct ipvl_port *port; + struct ipvl_addr *addr; + struct ethhdr *eth; + void *lyr3h; + int addr_type; + + port = container_of(pt, struct ipvl_port, ipvl_ptype); + /* We are interested only in outgoing packets. + * rx-path is handled in rx_handler(). 
+ */ + if (skb->pkt_type != PACKET_OUTGOING || ipvlan_is_skb_marked(skb, port->dev)) + goto out; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto no_mem; + + /* data should point to eth-header */ + skb_push(skb, skb->data - skb_mac_header(skb)); + skb->dev = port->dev; + eth = eth_hdr(skb); + + if (is_multicast_ether_addr(eth->h_dest)) { + ipvlan_skb_crossing_ns(skb, NULL); + skb->protocol = eth_type_trans(skb, skb->dev); + skb->pkt_type = PACKET_HOST; + ipvlan_mark_skb(skb, port->dev); + ipvlan_multicast_enqueue(port, skb, false); + return 0; + } + + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) { + int ret, len; + + ipvlan_skb_crossing_ns(skb, addr->master->dev); + skb->protocol = eth_type_trans(skb, skb->dev); + skb->pkt_type = PACKET_HOST; + ipvlan_mark_skb(skb, port->dev); + len = skb->len + ETH_HLEN; + ret = netif_rx(skb); + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, false); + return 0; + } + +out: + dev_kfree_skb(skb); +no_mem: + return 0; // actually, ret value is ignored +} + static int ipvlan_port_create(struct net_device *dev) { struct ipvl_port *port; @@ -84,6 +159,11 @@ static int ipvlan_port_create(struct net_device *dev) if (err) goto err; + port->ipvl_ptype.func = ipvlan_port_receive; + port->ipvl_ptype.type = htons(ETH_P_ALL); + port->ipvl_ptype.dev = dev; + port->ipvl_ptype.list.prev = LIST_POISON2; + netdev_hold(dev, &port->dev_tracker, GFP_KERNEL); return 0; @@ -100,6 +180,8 @@ static void ipvlan_port_destroy(struct net_device *dev) netdev_put(dev, &port->dev_tracker); if (port->mode == IPVLAN_MODE_L3S) ipvlan_l3s_unregister(port); + if (port->ipvl_ptype.list.prev != LIST_POISON2) + dev_remove_pack(&port->ipvl_ptype); netdev_rx_handler_unregister(dev); cancel_work_sync(&port->wq); while ((skb = __skb_dequeue(&port->backlog)) != NULL) { @@ -189,10 +271,13 @@ static int ipvlan_open(struct net_device *dev) else dev->flags &= ~IFF_NOARP; - rcu_read_lock(); - list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) - ipvlan_ht_addr_add(ipvlan, addr); - rcu_read_unlock(); + /* for learnable, addresses will be obtained from tx-packets. */ + if (!ipvlan_is_learnable(ipvlan->port)) { + rcu_read_lock(); + list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_add(ipvlan, addr); + rcu_read_unlock(); + } return 0; } @@ -581,11 +666,21 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, INIT_LIST_HEAD(&ipvlan->addrs); spin_lock_init(&ipvlan->addrs_lock); - /* TODO Probably put random address here to be presented to the - * world but keep using the physical-dev address for the outgoing - * packets. + /* Flags are per port and latest update overrides. User has + * to be consistent in setting it just like the mode attribute. */ - eth_hw_addr_set(dev, phy_dev->dev_addr); + if (data && data[IFLA_IPVLAN_MODE]) + mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + if (mode != IPVLAN_MODE_L2E) { + /* TODO Probably put random address here to be presented to the + * world but keep using the physical-dev address for the outgoing + * packets. 
+ */ + eth_hw_addr_set(dev, phy_dev->dev_addr); + } else { + eth_hw_addr_random(dev); + } dev->priv_flags |= IFF_NO_RX_HANDLER; @@ -597,6 +692,9 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, port = ipvlan_port_get_rtnl(phy_dev); ipvlan->port = port; + if (data && data[IFLA_IPVLAN_FLAGS]) + port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); + /* If the port-id base is at the MAX value, then wrap it around and * begin from 0x1 again. This may be due to a busy system where lots * of slaves are getting created and deleted. */ @@ -625,19 +723,13 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, if (err) goto remove_ida; - /* Flags are per port and latest update overrides. User has - * to be consistent in setting it just like the mode attribute. - */ - if (data && data[IFLA_IPVLAN_FLAGS]) - port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]); - - if (data && data[IFLA_IPVLAN_MODE]) - mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); - err = ipvlan_set_port_mode(port, mode, extack); if (err) goto unlink_netdev; + if (ipvlan_is_learnable(port)) + dev_set_allmulti(dev, 1); + list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); netif_stacked_transfer_operstate(phy_dev, dev); return 0; @@ -657,6 +749,9 @@ void ipvlan_link_delete(struct net_device *dev, struct list_head *head) struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; + if (ipvlan_is_learnable(ipvlan->port)) + dev_set_allmulti(dev, -1); + spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); @@ -793,6 +888,9 @@ static int ipvlan_device_event(struct notifier_block *unused, break; case NETDEV_CHANGEADDR: + if (ipvlan_is_learnable(ipvlan->port)) + break; + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { eth_hw_addr_set(ipvlan->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev); @@ -813,7 +911,7 @@ static int ipvlan_device_event(struct notifier_block *unused, } /* the caller must held the addrs lock */ -static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; @@ -928,6 +1026,9 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused, if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; + if (ipvlan_is_learnable(ipvlan->port)) + return notifier_from_errno(-EADDRNOTAVAIL); + switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) { @@ -999,6 +1100,9 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused, if (!ipvlan_is_valid_dev(dev)) return NOTIFY_DONE; + if (ipvlan_is_learnable(ipvlan->port)) + return notifier_from_errno(-EADDRNOTAVAIL); + switch (event) { case NETDEV_UP: if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..6b543c05392d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1269,6 +1269,7 @@ enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, IPVLAN_MODE_L3S, + IPVLAN_MODE_L2E, IPVLAN_MODE_MAX }; -- 2.25.1 Multicasts are now sent to the external network directly in ipvlan_xmit_mode_l2(); for tx-packets, ipvlan_process_multicast() only distributes them to the local interfaces. This makes life a bit easier for the later patches, where outgoing multicasts have to be patched with the proper MAC address.
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ffe8efd2f1aa..9af0dcc307da 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -285,9 +285,10 @@ void ipvlan_process_multicast(struct work_struct *work) if (tx_pkt) { if (ipvlan_is_learnable(port)) { - /* Inject packet to main dev */ + /* Inject as rx-packet to main dev */ nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { + consumed = true; local_bh_disable(); nskb->pkt_type = pkt_type; nskb->dev = port->dev; @@ -295,17 +296,13 @@ void ipvlan_process_multicast(struct work_struct *work) local_bh_enable(); } } - - /* If the packet originated here, send it out. */ - skb->dev = port->dev; - skb->pkt_type = pkt_type; - dev_queue_xmit(skb); - } else { - if (consumed) - consume_skb(skb); - else - kfree_skb(skb); + /* Packet was already tx out in ipvlan_xmit_mode_l2(). */ } + if (consumed) + consume_skb(skb); + else + kfree_skb(skb); + dev_put(dev); cond_resched(); } @@ -729,10 +726,15 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) } if (is_multicast_ether_addr(eth->h_dest)) { - skb_reset_mac_header(skb); - ipvlan_skb_crossing_ns(skb, NULL); - ipvlan_multicast_enqueue(ipvlan->port, skb, true); - return NET_XMIT_SUCCESS; + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + skb_reset_mac_header(nskb); + ipvlan_skb_crossing_ns(nskb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, nskb, true); + } + + goto tx_phy_dev; } if (ipvlan_is_vepa(ipvlan->port)) -- 2.25.1 Some WiFi enfironments sometimes send mcast packets with unicast eth_dst. Forcibly replace eth_dst to be bcast in this case if bridge is in L2E mode. Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 57 ++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 9af0dcc307da..41059639f307 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -855,18 +855,69 @@ static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, return ret; } +static bool ipvlan_is_mcast(struct ipvl_port *port, void *lyr3h, int addr_type) +{ + switch (addr_type) { +#if IS_ENABLED(CONFIG_IPV6) + /* ToDo: handle ICMPV6*/ + case IPVL_IPV6: + return !is_ipv6_usable(&((struct ipv6hdr *)lyr3h)->daddr); +#endif + case IPVL_IPV4: { + /* Treat mcast, bcast and zero as multicast. */ + __be32 i4addr = ((struct iphdr *)lyr3h)->daddr; + + return !is_ipv4_usable(i4addr); + } + case IPVL_ARP: { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 i4addr; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + arp_ptr += (2 * port->dev->addr_len) + 4; + i4addr = *(__be32 *)arp_ptr; + return !is_ipv4_usable(i4addr); + } + } + return false; +} + +static bool ipvlan_is_l2_mcast(struct ipvl_port *port, struct sk_buff *skb, + bool *need_eth_fix) +{ + void *lyr3h; + int addr_type; + + /* In some wifi environments unicast dest address means nothing. + * IP still can be a mcast and frame should be treated as mcast. 
+ */ + *need_eth_fix = false; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) + return true; + + if (!ipvlan_is_learnable(port)) + return false; + + lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); + *need_eth_fix = lyr3h && ipvlan_is_mcast(port, lyr3h, addr_type); + + return *need_eth_fix; +} + static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { struct sk_buff *skb = *pskb; - struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; + bool need_eth_fix; /* Ignore already seen packets. */ if (ipvlan_is_skb_marked(skb, port->dev)) return RX_HANDLER_PASS; - if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_is_l2_mcast(port, skb, &need_eth_fix)) { if (ipvlan_external_frame(skb, port)) { /* External frames are queued for device local * distribution, but a copy is given to master @@ -877,6 +928,8 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { + if (need_eth_fix) + memset(eth_hdr(nskb)->h_dest, 0xff, ETH_ALEN); ipvlan_mark_skb(skb, port->dev); ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); -- 2.25.1 We remember the SRC MAC address of outgoing packets together with IP addresses. While RX, we patch MAC address with remembered MAC. We do patching for both eth_dst and ARPs. ToDo: support IPv6 Neighbours Discovery. Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan.h | 5 +- drivers/net/ipvlan/ipvlan_core.c | 144 +++++++++++++++++++++++-------- drivers/net/ipvlan/ipvlan_main.c | 11 ++- 3 files changed, 118 insertions(+), 42 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 020e80df1e38..02a705bf9d42 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -78,6 +78,7 @@ struct ipvl_addr { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; + u8 hwaddr[ETH_ALEN]; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ @@ -177,7 +178,9 @@ void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); -int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); +int ipvlan_add_addr(struct ipvl_dev *ipvlan, + void *iaddr, bool is_v6, const u8 *hwaddr); +void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 41059639f307..fe8e59066c46 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -320,8 +320,36 @@ void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; } -static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, - bool local) +static int ipvlan_snat_rx_skb(struct ipvl_addr *addr, int addr_type, + struct sk_buff *skb) +{ + /* Here we have non-shared skb and free to modify it. 
*/ + struct ethhdr *eth = eth_hdr(skb); + + if (addr_type == IPVL_ARP) { + struct arphdr *arph = arp_hdr(skb); + u8 *arp_ptr = (u8 *)(arph + 1); + u8 *dsthw = arp_ptr + addr->master->dev->addr_len + sizeof(u32); + const u8 *phy_addr = addr->master->phy_dev->dev_addr; + + /* Some access points may do ARP-proxy and answers us back. + * Client may treat this as address-conflict. + */ + if (ether_addr_equal(eth->h_source, phy_addr) && + ether_addr_equal(eth->h_dest, phy_addr) && + is_zero_ether_addr(dsthw)) { + return NET_RX_DROP; + } + if (ether_addr_equal(dsthw, phy_addr)) + ether_addr_copy(dsthw, addr->hwaddr); + } + + ether_addr_copy(eth->h_dest, addr->hwaddr); + return NET_RX_SUCCESS; +} + +static int ipvlan_rcv_frame(struct ipvl_addr *addr, int addr_type, + struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; @@ -331,10 +359,8 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; - /* Only packets exchanged between two local slaves need to have - * device-up check as well as skb-share check. - */ - if (local) { + + if (local || ipvlan_is_learnable(ipvlan->port)) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; @@ -345,6 +371,13 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, goto out; *pskb = skb; + if (!local && ipvlan_is_learnable(ipvlan->port)) { + if (ipvlan_snat_rx_skb(addr, addr_type, skb) != + NET_RX_SUCCESS) { + kfree_skb(skb); + goto out; + } + } } if (local) { @@ -436,8 +469,9 @@ static inline bool is_ipv6_usable(const struct in6_addr *addr) } static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, - int addr_type) + int addr_type, const u8 *hwaddr) { + struct ipvl_addr *ipvladdr; void *addr = NULL; bool is_v6; @@ -486,10 +520,18 @@ static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, return; } - if (!ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6)) { + /* handle situation when MAC changed, but IP is the same. 
*/ + ipvladdr = ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6); + if (ipvladdr && !ether_addr_equal(ipvladdr->hwaddr, hwaddr)) { + /* del_addr is safe to call, because we are inside xmit*/ + ipvlan_del_addr(ipvladdr->master, addr, is_v6); + ipvladdr = NULL; + } + + if (!ipvladdr) { spin_lock_bh(&ipvlan->addrs_lock); if (!ipvlan_addr_busy(ipvlan->port, addr, is_v6)) - ipvlan_add_addr(ipvlan, addr, is_v6); + ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr); spin_unlock_bh(&ipvlan->addrs_lock); } } @@ -687,7 +729,7 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); return NET_XMIT_DROP; } - ipvlan_rcv_frame(addr, &skb, true); + ipvlan_rcv_frame(addr, addr_type, &skb, true); return NET_XMIT_SUCCESS; } } @@ -712,12 +754,14 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) } same_mac_addr = ether_addr_equal(eth->h_dest, eth->h_source); + if (same_mac_addr && ipvlan_is_learnable(ipvlan->port)) + goto out_drop; lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (ipvlan_is_learnable(ipvlan->port)) { if (lyr3h) - ipvlan_addr_learn(ipvlan, lyr3h, addr_type); + ipvlan_addr_learn(ipvlan, lyr3h, addr_type, eth->h_source); /* Mark SKB in advance */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -734,47 +778,74 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) ipvlan_multicast_enqueue(ipvlan->port, nskb, true); } - goto tx_phy_dev; + goto tx_frame_out; } if (ipvlan_is_vepa(ipvlan->port)) goto tx_phy_dev; - if (!same_mac_addr && + if (ipvlan_is_learnable(ipvlan->port) && ether_addr_equal(eth->h_dest, ipvlan->phy_dev->dev_addr)) { /* It is a packet from child with destination to main port. * Pass it to main. */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_XMIT_DROP; skb->pkt_type = PACKET_HOST; skb->dev = ipvlan->phy_dev; dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; - } else if (same_mac_addr) { - if (lyr3h) { - addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) { - if (ipvlan_is_private(ipvlan->port)) { - consume_skb(skb); - return NET_XMIT_DROP; - } - ipvlan_rcv_frame(addr, &skb, true); - return NET_XMIT_SUCCESS; - } + } + + if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) + goto out_drop; + + ipvlan_rcv_frame(addr, addr_type, &skb, true); + return NET_XMIT_SUCCESS; } + } + +tx_frame_out: + /* We don't know destination. Now we have to handle case for + * non-learnable bridge and learnable case. + */ + if (!ipvlan_is_learnable(ipvlan->port)) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; + if (same_mac_addr) { + /* Packet definitely does not belong to any of the + * virtual devices, but the dest is local. So forward + * the skb for the main-dev. At the RX side we just return + * RX_PASS for it to be processed further on the stack. + */ + dev_forward_skb(ipvlan->phy_dev, skb); + return NET_XMIT_SUCCESS; + } + } else { + /* Ok. It is a packet to outside on learnable. Fix source eth-address. */ + struct sk_buff *orig_skb = skb; - /* Packet definitely does not belong to any of the - * virtual devices, but the dest is local. So forward - * the skb for the main-dev. At the RX side we just return - * RX_PASS for it to be processed further on the stack. 
- */ - dev_forward_skb(ipvlan->phy_dev, skb); - return NET_XMIT_SUCCESS; + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + skb_reset_mac_header(skb); + ether_addr_copy(skb_eth_hdr(skb)->h_source, + ipvlan->phy_dev->dev_addr); + + /* ToDo: Handle ICMPv6 for neighbours discovery.*/ + if (lyr3h && addr_type == IPVL_ARP) { + struct arphdr *arph; + /* must reparse new skb */ + if (skb != orig_skb && lyr3h && addr_type == IPVL_ARP) + lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, + &addr_type); + arph = (struct arphdr *)lyr3h; + ether_addr_copy((u8 *)(arph + 1), + ipvlan->phy_dev->dev_addr); + } } tx_phy_dev: @@ -849,8 +920,7 @@ static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) - ret = ipvlan_rcv_frame(addr, pskb, false); - + ret = ipvlan_rcv_frame(addr, addr_type, pskb, false); out: return ret; } @@ -918,7 +988,7 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, return RX_HANDLER_PASS; if (ipvlan_is_l2_mcast(port, skb, &need_eth_fix)) { - if (ipvlan_external_frame(skb, port)) { + if (ipvlan_is_learnable(port) || ipvlan_external_frame(skb, port)) { /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index df5275bc30fc..6fdfeca6081d 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -911,7 +911,8 @@ static int ipvlan_device_event(struct notifier_block *unused, } /* the caller must held the addrs lock */ -int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, + const u8 *hwaddr) { struct ipvl_addr *addr; @@ -929,6 +930,8 @@ int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) addr->atype = IPVL_IPV6; #endif } + if (hwaddr) + ether_addr_copy(addr->hwaddr, hwaddr); list_add_tail_rcu(&addr->anode, &ipvlan->addrs); @@ -941,7 +944,7 @@ int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) return 0; } -static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) { struct ipvl_addr *addr; @@ -982,7 +985,7 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip6_addr, true); + ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } @@ -1053,7 +1056,7 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip4_addr, false); + ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } -- 2.25.1 When ipvlan interface goes down, forget all learned addresses. This is a way to cleanup addresses when master dev switches to another network. 
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 49 ++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 6fdfeca6081d..28ce36669d39 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -744,14 +744,10 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, } EXPORT_SYMBOL_GPL(ipvlan_link_new); -void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +static void ipvlan_addrs_forget_all(struct ipvl_dev *ipvlan) { - struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_addr *addr, *next; - if (ipvlan_is_learnable(ipvlan->port)) - dev_set_allmulti(dev, -1); - spin_lock_bh(&ipvlan->addrs_lock); list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { ipvlan_ht_addr_del(addr); @@ -759,6 +755,16 @@ void ipvlan_link_delete(struct net_device *dev, struct list_head *head) kfree_rcu(addr, rcu); } spin_unlock_bh(&ipvlan->addrs_lock); +} + +void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (ipvlan_is_learnable(ipvlan->port)) + dev_set_allmulti(dev, -1); + + ipvlan_addrs_forget_all(ipvlan); ida_free(&ipvlan->port->ida, dev->dev_id); list_del_rcu(&ipvlan->pnode); @@ -816,6 +822,19 @@ int ipvlan_link_register(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(ipvlan_link_register); +static bool ipvlan_is_valid_dev(const struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (!netif_is_ipvlan(dev)) + return false; + + if (!ipvlan || !ipvlan->port) + return false; + + return true; +} + static int ipvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -827,6 +846,13 @@ static int ipvlan_device_event(struct notifier_block *unused, LIST_HEAD(lst_kill); int err; + if (event == NETDEV_DOWN && ipvlan_is_valid_dev(dev)) { + struct ipvl_dev *ipvlan = netdev_priv(dev); + + ipvlan_addrs_forget_all(ipvlan); + return NOTIFY_DONE; + } + if (!netif_is_ipvlan_port(dev)) return NOTIFY_DONE; @@ -961,19 +987,6 @@ void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) kfree_rcu(addr, rcu); } -static bool ipvlan_is_valid_dev(const struct net_device *dev) -{ - struct ipvl_dev *ipvlan = netdev_priv(dev); - - if (!netif_is_ipvlan(dev)) - return false; - - if (!ipvlan || !ipvlan->port) - return false; - - return true; -} - #if IS_ENABLED(CONFIG_IPV6) static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) { -- 2.25.1 If the main port interface supports GSO, we need to segment the skb manually before forwarding it to an ipvlan interface.
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_main.c | 50 ++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 28ce36669d39..f1b1f91f94c0 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -4,6 +4,7 @@ #include #include +#include #include "ipvlan.h" @@ -76,6 +77,41 @@ static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, return err; } +static int ipvlan_receive(struct ipvl_dev *ipvlan, struct sk_buff *skb) +{ + struct sk_buff *segs; + struct sk_buff *nskb; + ssize_t mac_hdr_size; + int ret, len; + + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + ipvlan_skb_crossing_ns(skb, ipvlan->dev); + ipvlan_mark_skb(skb, ipvlan->phy_dev); + if (skb_shinfo(skb)->gso_size == 0) { + len = skb->len + ETH_HLEN; + ret = netif_rx(skb); + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, false); + return ret; + } + + mac_hdr_size = skb->network_header - skb->mac_header; + __skb_push(skb, mac_hdr_size); + segs = skb_gso_segment(skb, 0); + dev_kfree_skb(skb); + if (IS_ERR(segs)) + return 0; + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + __skb_pull(segs, mac_hdr_size); + len = segs->len + ETH_HLEN; + ret = netif_rx(segs); + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, false); + } + return ret; +} + static int ipvlan_port_receive(struct sk_buff *skb, struct net_device *wdev, struct packet_type *pt, struct net_device *orig_wdev) { @@ -115,18 +151,8 @@ static int ipvlan_port_receive(struct sk_buff *skb, struct net_device *wdev, goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); - if (addr) { - int ret, len; - - ipvlan_skb_crossing_ns(skb, addr->master->dev); - skb->protocol = eth_type_trans(skb, skb->dev); - skb->pkt_type = PACKET_HOST; - ipvlan_mark_skb(skb, port->dev); - len = skb->len + ETH_HLEN; - ret = netif_rx(skb); - ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, false); - return 0; - } + if (addr) + return ipvlan_receive(addr->master, skb); out: dev_kfree_skb(skb); -- 2.25.1 To make IPv6 work with learnable l2-bridge, need to process the TX-path: * Replace Source-ll-addr in Solicitation ndisc, * Replace Target-ll-addr in Advertisement ndisc No need to do anything in RX-path Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan_core.c | 125 +++++++++++++++++++++++++++---- 1 file changed, 111 insertions(+), 14 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index fe8e59066c46..ce06a06d8a28 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -738,11 +738,117 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) return ipvlan_process_outbound(skb); } +static void ipvlan_snat_patch_tx_arp(struct ipvl_dev *ipvlan, + struct sk_buff *skb) +{ + int addr_type; + struct arphdr *arph; + + arph = (struct arphdr *)ipvlan_get_L3_hdr(ipvlan->port, skb, + &addr_type); + ether_addr_copy((u8 *)(arph + 1), ipvlan->phy_dev->dev_addr); +} + +#if IS_ENABLED(CONFIG_IPV6) + +static u8 *ipvlan_search_icmp6_ll_addr(struct sk_buff *skb, u8 icmp_option) +{ + /* skb is ensured to pullable for all ipv6 payload_len by caller */ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct icmp6hdr *icmph = (struct icmp6hdr *)(ip6h + 1); + int curr_off = sizeof(*icmph); + int ndsize = htons(ip6h->payload_len); + + if (icmph->icmp6_type != NDISC_ROUTER_SOLICITATION) + 
curr_off += sizeof(struct in6_addr); + + while ((curr_off + 2) < ndsize) { + u8 *data = (u8 *)icmph + curr_off; + u32 opt_len = data[1] << 3; + + if (unlikely(opt_len == 0)) + return NULL; + + if (data[0] != icmp_option) { + curr_off += opt_len; + continue; + } + + if (unlikely(opt_len < ETH_ALEN + 2)) + return NULL; + + if (unlikely(curr_off + opt_len > ndsize)) + return NULL; + + return data + 2; + } + + return NULL; +} + +static void ipvlan_snat_patch_tx_ipv6(struct ipvl_dev *ipvlan, + struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + struct icmp6hdr *icmph; + u8 icmp_option; + u8 *lladdr; + u16 ndsize; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) + return; + + if (ipv6_hdr(skb)->nexthdr != NEXTHDR_ICMP) + return; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)))) + return; + + ip6h = ipv6_hdr(skb); + icmph = (struct icmp6hdr *)(ip6h + 1); + + /* Patch Source-LL for solicitation, Target-LL for advertisement */ + if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmph->icmp6_type == NDISC_ROUTER_SOLICITATION) + icmp_option = ND_OPT_SOURCE_LL_ADDR; + else if (icmph->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) + icmp_option = ND_OPT_TARGET_LL_ADDR; + else + return; + + ndsize = htons(ip6h->payload_len); + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + ndsize))) + return; + + lladdr = ipvlan_search_icmp6_ll_addr(skb, icmp_option); + if (!lladdr) + return; + + ether_addr_copy(lladdr, ipvlan->phy_dev->dev_addr); + + ip6h = ipv6_hdr(skb); + icmph = (struct icmp6hdr *)(ip6h + 1); + icmph->icmp6_cksum = 0; + icmph->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ndsize, + IPPROTO_ICMPV6, + csum_partial(icmph, + ndsize, + 0)); + skb->ip_summed = CHECKSUM_COMPLETE; +} +#else +static void ipvlan_snat_patch_tx_ipv6(struct ipvl_dev *ipvlan, + struct sk_buff *skb) +{ +} +#endif + static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { void *lyr3h; struct ipvl_addr *addr; - int addr_type; + int addr_type = -1; bool same_mac_addr; struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = skb_eth_hdr(skb); @@ -825,8 +931,6 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) } } else { /* Ok. It is a packet to outside on learnable. Fix source eth-address. */ - struct sk_buff *orig_skb = skb; - skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; @@ -835,17 +939,10 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) ether_addr_copy(skb_eth_hdr(skb)->h_source, ipvlan->phy_dev->dev_addr); - /* ToDo: Handle ICMPv6 for neighbours discovery.*/ - if (lyr3h && addr_type == IPVL_ARP) { - struct arphdr *arph; - /* must reparse new skb */ - if (skb != orig_skb && lyr3h && addr_type == IPVL_ARP) - lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, - &addr_type); - arph = (struct arphdr *)lyr3h; - ether_addr_copy((u8 *)(arph + 1), - ipvlan->phy_dev->dev_addr); - } + if (addr_type == IPVL_ARP) + ipvlan_snat_patch_tx_arp(ipvlan, skb); + else if (addr_type == IPVL_ICMPV6 || addr_type == IPVL_IPV6) + ipvlan_snat_patch_tx_ipv6(ipvlan, skb); } tx_phy_dev: -- 2.25.1 When some child attempts to send a packet with host ip, remember host IP in the list of ipvlan-addrs with mark "blocked". Don't send anything if child tries to send a packet with IP of main. ToDo: track addresses on main port and mark them as blocked if bridge already learned some of them from some of the children. 
Signed-off-by: Dmitry Skorodumov --- drivers/net/ipvlan/ipvlan.h | 4 ++- drivers/net/ipvlan/ipvlan_core.c | 61 +++++++++++++++++++++++++------- drivers/net/ipvlan/ipvlan_main.c | 9 ++--- 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 02a705bf9d42..7de9794efbda 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -74,6 +74,7 @@ struct ipvl_dev { struct ipvl_addr { struct ipvl_dev *master; /* Back pointer to master */ + bool is_blocked; /* Blocked. Addr from main iface */ union { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ @@ -179,7 +180,8 @@ void ipvlan_multicast_enqueue(struct ipvl_port *port, int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); int ipvlan_add_addr(struct ipvl_dev *ipvlan, - void *iaddr, bool is_v6, const u8 *hwaddr); + void *iaddr, bool is_v6, const u8 *hwaddr, + bool is_blocked); void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ce06a06d8a28..8b2c2d455ea5 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -468,8 +468,30 @@ static inline bool is_ipv6_usable(const struct in6_addr *addr) !ipv6_addr_any(addr); } -static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, - int addr_type, const u8 *hwaddr) +static bool ipvlan_is_portaddr_busy(struct ipvl_dev *ipvlan, + void *addr, bool is_v6) +{ + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + if (is_v6) + return ipv6_chk_addr(dev_net(ipvlan->phy_dev), addr, + ipvlan->phy_dev, 1); + + in_dev = __in_dev_get_rcu(ipvlan->phy_dev); + if (!in_dev) + return false; + + in_dev_for_each_ifa_rcu(ifa, in_dev) + if (ifa->ifa_local == *(__be32 *)addr) + return true; + + return false; +} + +/* return -1 if frame should be dropped. */ +static int ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, + int addr_type, const u8 *hwaddr) { struct ipvl_addr *ipvladdr; void *addr = NULL; @@ -483,7 +505,7 @@ static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, ip6h = (struct ipv6hdr *)lyr3h; if (!is_ipv6_usable(&ip6h->saddr)) - return; + return 0; is_v6 = true; addr = &ip6h->saddr; break; @@ -496,7 +518,7 @@ static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, ip4h = (struct iphdr *)lyr3h; i4addr = &ip4h->saddr; if (!is_ipv4_usable(*i4addr)) - return; + return 0; is_v6 = false; addr = i4addr; break; @@ -511,17 +533,20 @@ static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, arp_ptr += ipvlan->port->dev->addr_len; i4addr = (__be32 *)arp_ptr; if (!is_ipv4_usable(*i4addr)) - return; + return 0; is_v6 = false; addr = i4addr; break; } default: - return; + return 0; } /* handle situation when MAC changed, but IP is the same. 
*/ ipvladdr = ipvlan_ht_addr_lookup(ipvlan->port, addr, is_v6); + if (ipvladdr && ipvladdr->is_blocked) + return -1; + if (ipvladdr && !ether_addr_equal(ipvladdr->hwaddr, hwaddr)) { /* del_addr is safe to call, because we are inside xmit*/ ipvlan_del_addr(ipvladdr->master, addr, is_v6); @@ -529,11 +554,17 @@ static void ipvlan_addr_learn(struct ipvl_dev *ipvlan, void *lyr3h, } if (!ipvladdr) { + bool is_port_ip = ipvlan_is_portaddr_busy(ipvlan, addr, is_v6); + spin_lock_bh(&ipvlan->addrs_lock); if (!ipvlan_addr_busy(ipvlan->port, addr, is_v6)) - ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr); + ipvlan_add_addr(ipvlan, addr, is_v6, hwaddr, is_port_ip); spin_unlock_bh(&ipvlan->addrs_lock); + + return is_port_ip ? -1 : 0; } + + return 0; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) @@ -724,11 +755,12 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) if (!ipvlan_is_vepa(ipvlan->port)) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) { + if (addr && !addr->is_blocked) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } + ipvlan_rcv_frame(addr, addr_type, &skb, true); return NET_XMIT_SUCCESS; } @@ -866,8 +898,12 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (ipvlan_is_learnable(ipvlan->port)) { - if (lyr3h) - ipvlan_addr_learn(ipvlan, lyr3h, addr_type, eth->h_source); + if (lyr3h) { + if (ipvlan_addr_learn(ipvlan, lyr3h, addr_type, + eth->h_source) < 0) + goto out_drop; + } + /* Mark SKB in advance */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -903,7 +939,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) { + if (addr && !addr->is_blocked) { if (ipvlan_is_private(ipvlan->port)) goto out_drop; @@ -1016,8 +1052,9 @@ static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); - if (addr) + if (addr && !addr->is_blocked) ret = ipvlan_rcv_frame(addr, addr_type, pskb, false); + out: return ret; } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f1b1f91f94c0..5df6bdeadef5 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -151,7 +151,7 @@ static int ipvlan_port_receive(struct sk_buff *skb, struct net_device *wdev, goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); - if (addr) + if (addr && !addr->is_blocked) return ipvlan_receive(addr->master, skb); out: @@ -964,7 +964,7 @@ static int ipvlan_device_event(struct notifier_block *unused, /* the caller must held the addrs lock */ int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, - const u8 *hwaddr) + const u8 *hwaddr, bool is_blocked) { struct ipvl_addr *addr; @@ -973,6 +973,7 @@ int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6, return -ENOMEM; addr->master = ipvlan; + addr->is_blocked = is_blocked; if (!is_v6) { memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr)); addr->atype = IPVL_IPV4; @@ -1024,7 +1025,7 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) "Failed to add IPv6=%pI6c addr for %s intf\n", ip6_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL); + ret = ipvlan_add_addr(ipvlan, ip6_addr, true, NULL, false); 
spin_unlock_bh(&ipvlan->addrs_lock); return ret; } @@ -1095,7 +1096,7 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) "Failed to add IPv4=%pI4 on %s intf.\n", ip4_addr, ipvlan->dev->name); else - ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL); + ret = ipvlan_add_addr(ipvlan, ip4_addr, false, NULL, false); spin_unlock_bh(&ipvlan->addrs_lock); return ret; } -- 2.25.1
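For reference, below is a minimal userspace sketch of how a management tool could request the new mode over rtnetlink, since iproute2 support is not part of this series. This is hypothetical example code, not part of the patches: it assumes the IPVLAN_MODE_L2E value from the uapi change in the first patch is available in the installed headers (falling back to its numeric value otherwise), the interface names "wlan0" and "ipvl0" are placeholders, and error handling is intentionally minimal (the netlink ACK is not read back).

/* build: cc -o ipvlan_l2e ipvlan_l2e.c ; run: ./ipvlan_l2e <master> <slave> */
#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPVLAN_MODE_L2E
#define IPVLAN_MODE_L2E 3	/* value introduced by this series */
#endif

struct req {
	struct nlmsghdr nh;
	struct ifinfomsg ifi;
	char buf[512];
};

/* Append one rtnetlink attribute at the current tail of the message. */
static struct rtattr *add_attr(struct nlmsghdr *nh, unsigned short type,
			       const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	if (data)
		memcpy(RTA_DATA(rta), data, len);
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
	return rta;
}

/* Close a nested attribute by fixing up its length. */
static void end_nest(struct nlmsghdr *nh, struct rtattr *nest)
{
	nest->rta_len = (char *)nh + NLMSG_ALIGN(nh->nlmsg_len) - (char *)nest;
}

int main(int argc, char **argv)
{
	const char *master = argc > 1 ? argv[1] : "wlan0";	/* placeholder names */
	const char *slave = argc > 2 ? argv[2] : "ipvl0";
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	unsigned int link = if_nametoindex(master);
	__u16 mode = IPVLAN_MODE_L2E;
	struct rtattr *linkinfo, *data;
	struct req req = { 0 };
	int fd;

	if (!link) {
		perror("if_nametoindex");
		return 1;
	}

	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_type = RTM_NEWLINK;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.ifi.ifi_family = AF_UNSPEC;

	add_attr(&req.nh, IFLA_LINK, &link, sizeof(link));
	add_attr(&req.nh, IFLA_IFNAME, slave, strlen(slave) + 1);

	/* IFLA_LINKINFO { IFLA_INFO_KIND "ipvlan",
	 *		   IFLA_INFO_DATA { IFLA_IPVLAN_MODE = L2E } }
	 */
	linkinfo = add_attr(&req.nh, IFLA_LINKINFO, NULL, 0);
	add_attr(&req.nh, IFLA_INFO_KIND, "ipvlan", strlen("ipvlan"));
	data = add_attr(&req.nh, IFLA_INFO_DATA, NULL, 0);
	add_attr(&req.nh, IFLA_IPVLAN_MODE, &mode, sizeof(mode));
	end_nest(&req.nh, data);
	end_nest(&req.nh, linkinfo);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || sendto(fd, &req, req.nh.nlmsg_len, 0,
			     (struct sockaddr *)&kernel, sizeof(kernel)) < 0) {
		perror("rtnetlink");
		return 1;
	}
	close(fd);
	return 0;
}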